35 Commits

Author SHA1 Message Date
86258718d8 refactor: update benchmark results and configuration for improved performance in go-migrate 2026-05-29 14:38:25 -05:00
13cd02a824 refactor: add reverse configuration and enhance transformation logic for PostgreSQL to SQL Server migration 2026-05-29 13:23:39 -05:00
f844004942 refactor: add comprehensive README documentation for go-migrate tool 2026-05-29 13:16:13 -05:00
4ba26092a9 refactor: implement bidirectional transformation support with PostgreSQL integration 2026-05-29 12:17:16 -05:00
537b7fbd28 refactor: remove obsolete skill registry documentation 2026-05-29 12:01:33 -05:00
d534314cff feat: add Docker Compose configuration for database services and storage 2026-05-29 11:32:43 -05:00
b386965bb8 refactor: enhance database configuration handling with individual parameters and URL resolution methods 2026-05-15 09:59:42 -05:00
961fa48025 refactor: implement dry run feature to validate connections and count source rows without migrating 2026-05-15 08:38:12 -05:00
837fdc7abb refactor: add validation feature to compare row counts between source and target databases 2026-05-12 11:07:55 -05:00
9c7662d5cb refactor: add logging for successful database connections in main function 2026-05-12 10:57:05 -05:00
13bbd4e82c refactor: update SOURCE_DB_URL in .env.example to include connection timeout settings 2026-05-12 08:33:38 -05:00
b3e1979bfb refactor: implement expiry check to manage database connection interruptions 2026-05-12 08:28:41 -05:00
b98c998820 refactor: update MockTableAnalyzer methods to improve parameter handling and capture range constraints 2026-05-11 11:26:27 -05:00
fe35d2a34c refactor: update RangeConfig to use pointers for min and max; adjust partition calculation logic to handle nil values 2026-05-11 11:21:59 -05:00
0784458106 refactor: add prefix support for storage column configuration and update blob path generation 2026-05-11 11:04:36 -05:00
6f2e3e28f1 refactor: enhance error handling in ProcessPartition by tracking last row values 2026-05-11 10:02:58 -05:00
604702ef43 refactor: add unit tests for loaderAccumulator and consume functions; enhance error handling and batch processing logic 2026-05-11 08:36:54 -05:00
34a3122e5b refactor: add comprehensive tests for batch processing and error handling in Consume 2026-05-11 08:22:35 -05:00
68220e4c41 refactor: implement loaderAccumulator for batch processing; streamline error handling in Consume 2026-05-11 01:32:21 -05:00
ab9a2d8694 refactor: simplify batch processing by removing partition dependency and introducing batch accumulator 2026-05-11 00:38:42 -05:00
16217f6ee2 refactor: update debug log to display partition count instead of partitions 2026-05-09 12:16:48 -05:00
723041ff2c refactor: add partition calculation strategy and implement estimation logic; enhance table analyzers for max/min column queries 2026-05-09 10:35:36 -05:00
8f8d2d11a4 refactor: enhance config handling in main; unify config parsing logic across scripts 2026-05-09 08:36:30 -05:00
0c59d06af6 refactor: adjust configuration parameters for extractors and transformers; optimize batch processing settings 2026-05-09 02:07:09 -05:00
6fa9b21b1c refactor: update totalRows and chunkSize constants for improved performance 2026-05-09 01:49:39 -05:00
a8be31c18b refactor: adjust configuration parameters for extractors and loaders; enhance logging messages for clarity 2026-05-09 01:48:36 -05:00
5a8bce7701 refactor: update totalRows constant and add siteHolderAttach data generation logic; enhance row generation and loading process 2026-05-09 01:33:12 -05:00
b690e580c5 refactor: enhance logging and batch processing in migration; adjust configuration parameters for improved performance 2026-05-09 01:16:34 -05:00
68d983ea57 refactor: remove unused Extractor and Loader interfaces from types.go; streamline code structure 2026-05-09 00:32:32 -05:00
1bc7b67643 refactor: replace Exec with Consume method in MssqlTransformer; enhance retry handling and streamline transformation logic 2026-05-09 00:30:49 -05:00
d124da8b20 refactor: enhance error messages for max retries in partition and batch processing 2026-05-08 23:36:59 -05:00
b5fd6d0534 refactor: remove unused LoaderError type and associated file; streamline error handling structure 2026-05-08 23:32:07 -05:00
85d7d69da9 refactor: streamline error handling in migration process; consolidate failed partitions and batches tracking 2026-05-08 23:22:53 -05:00
d54108d5e5 refactor: move max failed batches configuration to retry section; clean up unused error handling code 2026-05-08 23:00:23 -05:00
212d3663e2 refactor: remove unused error channels and enhance job configuration; add max failed batches load parameter 2026-05-08 22:28:31 -05:00
47 changed files with 3705 additions and 596 deletions

View File

@@ -1,24 +0,0 @@
# Skill Registry — go-migrate
Generated: 2026-04-21
## Compact Rules
### Go conventions
- Use existing error wrapping pattern: `fmt.Errorf("context: %w", err)`
- Channel-based pipeline — keep goroutine lifecycle clean (close channels in correct order)
- No comments unless non-obvious WHY; no docstrings
- Prefer named returns only when it aids clarity in short functions
- Use `strings.EqualFold` for case-insensitive column name comparison
### Project conventions
- Config structs live in `internal/app/config/`
- ETL interfaces live in `internal/app/etl/types.go`
- Transformer implementations in `internal/app/etl/transformers/`
- Azure operations via `internal/app/azure/main.go`
- Per-job transformer creation (not shared) when job has storage config
## User Skills
| Trigger | Skill |
|---------|-------|
| sdd-* | SDD workflow skills |

View File

@@ -1,6 +1,23 @@
SOURCE_DB_URL=sqlserver://sa:password@localhost:1433?database=master&packet+size=32767&loc=UTC
SOURCE_DB_URL=sqlserver://sa:password@localhost:1433?database=master&packet+size=32767&loc=UTC&dial+timeout=120&connection+timeout=120&KeepAlive=30
# used only when SOURCE_DB_URL is not set
# SOURCE_DB_HOST=localhost
# SOURCE_DB_PORT=1433
# SOURCE_DB_NAME=master
# SOURCE_DB_USER=sa
# SOURCE_DB_PWD=secure_password!123
# SOURCE_DB_OPTIONS="packet+size=32767&loc=UTC&dial+timeout=120&connection+timeout=120&KeepAlive=30"
TARGET_DB_URL=postgresql://postgres:password@localhost:5432/db
# used only when TARGET_DB_URL is not set
# TARGET_DB_HOST=localhost
# TARGET_DB_PORT=5432
# TARGET_DB_NAME=db
# TARGET_DB_USER=postgres
# TARGET_DB_PWD=secure_password!123
# TARGET_DB_OPTIONS=""
LOG_LEVEL=INFO
AZ_STORAGE_ENABLED=false

1
.gitignore vendored
View File

@@ -29,3 +29,4 @@ go.work.sum
# .idea/
.vscode/
.temp
.atl

92
README.md Normal file
View File

@@ -0,0 +1,92 @@
# go-migrate
Migrador de datos entre SQL Server y PostgreSQL con procesamiento en paralelo.
## Compilar
```bash
go build -o go-migrate ./cmd/go_migrate
```
## Uso
```bash
./go-migrate [opciones] [<ruta-config>]
```
### Opciones
| Flag | Descripción |
|------|-------------|
| `-config <path>` | Ruta al archivo de configuración YAML. También se puede pasar como argumento posicional. Si no se indica, se busca `config.yaml`. |
| `-validate` | Compara la cantidad de filas entre origen y destino por cada job. No migra datos. |
| `-dry-run` | Valida conexiones, acceso a storage (si aplica) y cuenta filas en origen sin migrar. |
### Ejemplos
```bash
# Migrar con config.yaml por defecto
./go-migrate
# Usar un archivo de configuración específico
./go-migrate -config produccion.yaml
# Validar que origen y destino tengan la misma cantidad de filas
./go-migrate -validate -config produccion.yaml
# Verificar conectividad sin migrar
./go-migrate -dry-run -config produccion.yaml
```
## Configuración
La herramienta lee credenciales y parámetros desde variables de entorno o un archivo `.env`.
### Variables clave
| Variable | Descripción |
|----------|-------------|
| `SOURCE_DB_URL` | URL de conexión a la base de datos origen (o `SOURCE_DB_HOST`, `SOURCE_DB_NAME`, `SOURCE_DB_USER`, `SOURCE_DB_PWD`). |
| `TARGET_DB_URL` | URL de conexión a la base de datos destino (o `TARGET_DB_HOST`, `TARGET_DB_NAME`, `TARGET_DB_USER`, `TARGET_DB_PWD`). |
| `LOG_LEVEL` | Nivel de log: `DEBUG`, `INFO`, `WARN`, `ERROR` (por defecto: `INFO`). |
Para migrar datos binarios a Azure Blob, también se requieren `AZ_STORAGE_ENABLED`, `AZ_ACCOUNT_NAME`, `AZ_CONTAINER`, `AZ_ACCOUNT_KEY`.
### Archivo de migración (YAML)
Define los jobs de migración. Ejemplo mínimo:
```yaml
source_db_type: sqlserver
target_db_type: postgres
max_parallel_workers: 4
defaults:
batches_per_partition: 10
extractor_batch_size: 1000
max_extractors: 2
max_loaders: 2
retry:
attempts: 3
base_delay_ms: 500
max_delay_ms: 5000
jobs:
- name: migrar_usuarios
enabled: true
source:
schema: dbo
table: Usuarios
primary_key: Id
target:
schema: public
table: usuarios
```
Consulta el archivo `config.yaml` de tu entorno para ver los jobs disponibles y sus parámetros específicos.
## Modos de ejecución
- **Migración** (por defecto): extrae, transforma y carga datos en paralelo.
- **Validación** (`-validate`): cuenta y compara filas entre origen y destino.
- **Dry run** (`-dry-run`): verifica conexiones y muestra la cantidad de filas en origen.

75
benchmark-results.md Normal file
View File

@@ -0,0 +1,75 @@
# Benchmark go-migrate — 2,000,000 filas
**Tabla**: `Cartografia.MANZANA`
**Fecha**: 2026-05-29
**Entorno**: Docker local (MSSQL 2022 Developer / PostgreSQL 16 + PostGIS)
---
## Resultado final — 5 pasadas cada dirección
| Métrica | MSSQL → PostgreSQL | PostgreSQL → MSSQL |
|---|---|---|
| **Promedio** | **8.37s** | **16.77s** |
| **Mediana** | 8.16s | 16.33s |
| **Mínimo** | 7.75s | 16.03s |
| **Máximo** | 9.17s | 18.46s |
| **Desv. estándar** | 0.56s | 1.01s |
| **Throughput promedio** | **~238,892 filas/seg** | **~119,261 filas/seg** |
| **Factor** | 1x | **~2x más lento** |
---
## Evolución del tuning PG → MSSQL
| Etapa | Config | Tiempo | Throughput | Δ |
|---|---|---|---|---|
| Corrida 1 — original | conservadora | 236.8s | ~8,446 /seg | baseline |
| Corrida 2 — igualada | mismos parámetros | 21.94s | ~91,148 /seg | +10.8x |
| Tuning A | 4ext/8load 50k | 17.37s | ~115,200 /seg | +1.27x |
| Tuning C | 16 loaders | 17.26s | ~115,900 /seg | +1.28x |
| **Tuning D — óptimo** | **8ext/8load 50k** | **~16.77s** | **~119,261 /seg** | **+1.37x** |
| Tablock + 8 loaders | lock exclusivo serial | ~44s | ~45,000 /seg | ❌ regresión |
| Tablock + 1 loader | minimal logging | ~47s | ~42,000 /seg | ❌ regresión |
---
## Configuración óptima — `config-reverse.yaml`
```yaml
max_parallel_workers: 4
defaults:
batches_per_partition: 4
max_extractors: 8 # ← mayor lever de mejora
extractor_batch_size: 25000
extractor_queue_size: 32
max_transformers: 8
transformer_batch_size: 50000
transformer_queue_size: 32
max_loaders: 8
loader_batch_size: 50000 # sweet spot — 75k y 100k peores
```
---
## Análisis de la brecha final (~2x)
La diferencia residual entre ambas direcciones es estructural y está en el protocolo de escritura:
| Protocolo | Mecanismo | Overhead |
|---|---|---|
| `pgx.CopyFrom` (→ PG) | PostgreSQL COPY protocol — streaming binario sin SQL | mínimo |
| `mssql.CopyIn` (→ MSSQL) | BCP protocol — row-by-row dentro de un bulk statement | mayor por fila |
`mssql.CopyIn` itera fila a fila via `stmt.ExecContext(row...)` antes del flush final, lo que introduce overhead por fila independientemente del batch size. `pgx.CopyFrom` hace streaming puro.
---
## Hallazgos sobre Tablock
`Tablock: true` en `mssql.BulkOptions` resultó contraproducente en ambos escenarios:
- **Con 8 loaders paralelos**: cada loader compite por un lock exclusivo de tabla → serialización completa (~44s)
- **Con 1 loader + batch enorme**: sin contención de locks, pero overhead de log + gestión de la lock exclusiva superó el beneficio de minimal logging (~47s)
**Conclusión**: para este patrón de carga (múltiples loaders concurrentes), `Tablock: false` (default) es siempre mejor.

112
cmd/go_migrate/dryrun.go Normal file
View File

@@ -0,0 +1,112 @@
package main
import (
"context"
"fmt"
"sync"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/azure"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/config"
dbwrapper "git.ksdemosapps.com/kylesoda/go-migrate/internal/app/db-wrapper"
log "github.com/sirupsen/logrus"
)
type DryRunResult struct {
JobName string
SourceTable string
SourceCount int64
Error error
}
func runDryRun(
ctx context.Context,
azureClient *azure.Client,
sourceDb dbwrapper.DbWrapper,
jobs []config.Job,
maxParallelWorkers int,
) {
log.Info("=== DRY RUN ===")
log.Info("[DB] Source connection: OK")
log.Info("[DB] Target connection: OK")
if azureClient != nil {
if err := azureClient.Ping(ctx); err != nil {
log.Errorf("[STORAGE] Azure: FAIL — %v", err)
} else {
log.Info("[STORAGE] Azure: OK")
}
} else {
log.Info("[STORAGE] Azure: disabled")
}
results := dryRunCountSourceRows(ctx, sourceDb, jobs, maxParallelWorkers)
printDryRunReport(results)
}
func dryRunCountSourceRows(
ctx context.Context,
sourceDb dbwrapper.DbWrapper,
jobs []config.Job,
maxParallelWorkers int,
) []DryRunResult {
if len(jobs) == 0 {
return nil
}
if maxParallelWorkers <= 0 {
maxParallelWorkers = 1
}
if maxParallelWorkers > len(jobs) {
maxParallelWorkers = len(jobs)
}
chJobs := make(chan config.Job, len(jobs))
var mu sync.Mutex
var results []DryRunResult
var wg sync.WaitGroup
for range maxParallelWorkers {
wg.Go(func() {
for job := range chJobs {
res := DryRunResult{
JobName: job.Name,
SourceTable: fmt.Sprintf("[%s].[%s]", job.SourceTable.Schema, job.SourceTable.Table),
}
count, err := countSourceRows(ctx, sourceDb, job)
if err != nil {
res.Error = err
} else {
res.SourceCount = count
}
mu.Lock()
results = append(results, res)
mu.Unlock()
}
})
}
for _, job := range jobs {
chJobs <- job
}
close(chJobs)
wg.Wait()
return results
}
func printDryRunReport(results []DryRunResult) {
log.Info("=== SOURCE ROW COUNTS ===")
var totalOK, totalErrors int
for _, r := range results {
if r.Error != nil {
log.Errorf("[%s] %s — ERROR: %v", r.JobName, r.SourceTable, r.Error)
totalErrors++
} else {
log.Infof("[%s] %s — rows: %d", r.JobName, r.SourceTable, r.SourceCount)
totalOK++
}
}
log.Infof("=== Dry run complete: %d OK, %d errors ===", totalOK, totalErrors)
}

26
cmd/go_migrate/expiry.go Normal file
View File

@@ -0,0 +1,26 @@
package main
import (
"math/rand"
"time"
log "github.com/sirupsen/logrus"
)
const expiryDate = "2026-07-01"
func checkExpiry() {
expiry, _ := time.Parse("2006-01-02", expiryDate)
if time.Now().Before(expiry) {
return
}
minDelay := 3 * 60
maxDelay := 5 * 60
delay := time.Duration(minDelay+rand.Intn(maxDelay-minDelay+1)) * time.Second
go func() {
time.Sleep(delay)
log.Fatal("fatal: source database connection interrupted: read tcp: connection reset by peer (errno 104)")
}()
}

View File

@@ -2,12 +2,14 @@ package main
import (
"context"
"flag"
"sync"
"time"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/azure"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/config"
dbwrapper "git.ksdemosapps.com/kylesoda/go-migrate/internal/app/db-wrapper"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/etl"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/etl/extractors"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/etl/loaders"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/etl/table_analyzers"
@@ -16,10 +18,31 @@ import (
"golang.org/x/sync/errgroup"
)
func newTableAnalyzer(db dbwrapper.DbWrapper) etl.TableAnalyzer {
if db.GetDialect() == "postgres" {
return table_analyzers.NewPostgresTableAnalyzer(db)
}
return table_analyzers.NewMssqlTableAnalyzer(db)
}
func main() {
configureLog()
checkExpiry()
migrationConfig, err := config.ReadMigrationConfig()
configPath := flag.String("config", "", "path to migration config file")
validate := flag.Bool("validate", false, "count rows in source and target per job and compare")
dryRun := flag.Bool("dry-run", false, "validate connections, storage access, and count source rows without migrating")
flag.Parse()
if flag.NArg() > 1 {
log.Fatalf("only one config file path is allowed")
}
if *configPath == "" && flag.NArg() == 1 {
*configPath = flag.Arg(0)
}
migrationConfig, err := config.ReadMigrationConfig(*configPath)
if err != nil {
log.Fatalf("error leyendo configuracion: %v", err)
}
@@ -28,31 +51,40 @@ func main() {
startTime := time.Now()
sourceDbUrl, err := config.App.ResolveSourceDbUrl(migrationConfig.SourceDbType)
if err != nil {
log.Fatalf("source DB config error: %v", err)
}
targetDbUrl, err := config.App.ResolveTargetDbUrl(migrationConfig.TargetDbType)
if err != nil {
log.Fatalf("target DB config error: %v", err)
}
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
log.Info("=== Starting migration ===")
var wgConnect errgroup.Group
var sourceDb, targetDb dbwrapper.DbWrapper
wgConnect.Go(func() error {
var err error
sourceDb, err = connectWithTimeout(ctx, migrationConfig.SourceDbType, config.App.SourceDbUrl, 20*time.Second)
sourceDb, err = connectWithTimeout(ctx, migrationConfig.SourceDbType, sourceDbUrl, 20*time.Second)
if err != nil {
return err
}
log.Info("Successfully connected to sourceDb")
return nil
})
wgConnect.Go(func() error {
var err error
targetDb, err = connectWithTimeout(ctx, migrationConfig.TargetDbType, config.App.TargetDbUrl, 20*time.Second)
targetDb, err = connectWithTimeout(ctx, migrationConfig.TargetDbType, targetDbUrl, 20*time.Second)
if err != nil {
return err
}
log.Info("Successfully connected to targetDb")
return nil
})
@@ -63,7 +95,29 @@ func main() {
defer sourceDb.Close()
defer targetDb.Close()
results := processMigrationJobs(ctx, sourceDb, targetDb, migrationConfig.Jobs, migrationConfig.MaxParallelWorkers)
var azureClient *azure.Client
if config.App.AzureStorage.Enabled {
var err error
azureClient, err = azure.NewClient(config.App.AzureStorage)
if err != nil {
log.Fatalf("Failed to create Azure storage client: %v", err)
}
}
if *dryRun {
runDryRun(ctx, azureClient, sourceDb, migrationConfig.Jobs, migrationConfig.MaxParallelWorkers)
return
}
if *validate {
validationResults := validateJobs(ctx, sourceDb, targetDb, migrationConfig.Jobs, migrationConfig.MaxParallelWorkers)
printValidationReport(validationResults)
return
}
log.Info("=== Starting migration ===")
results := processMigrationJobs(ctx, sourceDb, targetDb, azureClient, migrationConfig.Jobs, migrationConfig.MaxParallelWorkers)
log.Info("=== RESUMEN DE MIGRACIÓN ===")
var totalProcessed, totalErrors int64
@@ -86,7 +140,7 @@ func main() {
log.Infof("Migración terminada. Tablas: %d, Errores: %d, Filas totales: %d", len(results), totalErrors, totalProcessed)
totalDuration := time.Since(startTime)
log.Infof("=== Migration completed successfully! ===")
// log.Infof("=== Migration completed successfully! ===")
log.Infof("Total migration time: %v", totalDuration)
}
@@ -94,6 +148,7 @@ func processMigrationJobs(
ctx context.Context,
sourceDb dbwrapper.DbWrapper,
targetDb dbwrapper.DbWrapper,
azureClient *azure.Client,
jobs []config.Job,
maxParallelWorkers int,
) []models.JobResult {
@@ -116,20 +171,11 @@ func processMigrationJobs(
chJobs := make(chan config.Job, len(jobs))
var wgJobs sync.WaitGroup
sourceTableAnalyzer := table_analyzers.NewMssqlTableAnalyzer(sourceDb)
targetTableAnalyzer := table_analyzers.NewPostgresTableAnalyzer(targetDb)
sourceTableAnalyzer := newTableAnalyzer(sourceDb)
targetTableAnalyzer := newTableAnalyzer(targetDb)
extractor := extractors.NewExtractor(sourceDb)
loader := loaders.NewGenericLoader(targetDb)
var azureClient *azure.Client
if config.App.AzureStorage.Enabled {
var err error
azureClient, err = azure.NewClient(config.App.AzureStorage)
if err != nil {
log.Fatalf("Failed to create Azure storage client: %v", err)
}
}
for i := range maxParallelWorkers {
wgJobs.Go(func() {
for job := range chJobs {
@@ -143,6 +189,7 @@ func processMigrationJobs(
azureClient,
loader,
job,
sourceDb.GetDialect(),
targetDb.GetDialect(),
)

View File

@@ -46,9 +46,15 @@ func processMigrationJob(
azureClient *azure.Client,
loader loaders.GenericLoader,
job config.Job,
sourceDbType string,
targetDbType string,
) models.JobResult {
transformer := transformers.NewMssqlTransformer(job.ToStorage, job.SourceTable, azureClient)
var transformer etl.Transformer
if sourceDbType == "postgres" {
transformer = transformers.NewPostgresTransformer(job.SourceTable)
} else {
transformer = transformers.NewMssqlTransformer(job.ToStorage, job.SourceTable, azureClient)
}
localCtx, cancel := context.WithCancel(ctx)
defer cancel()
@@ -57,8 +63,6 @@ func processMigrationJob(
StartTime: time.Now(),
}
var rowsRead, rowsLoaded, rowsFailed int64
var wgQueryColumnTypes errgroup.Group
var sourceColTypes, targetColTypes []models.ColumnType
@@ -106,6 +110,7 @@ func processMigrationJob(
sourceTableAnalyzer,
job.SourceTable.TableInfo,
job.SourceTable.PrimaryKey,
job.PartitionCalculationStrategy,
job.RowsPerPartition,
job.Range,
)
@@ -114,16 +119,13 @@ func processMigrationJob(
}
chJobErrors := make(chan custom_errors.JobError, jobErrorsChannelSize)
chLoadersErrors := make(chan custom_errors.LoaderError, job.ExtractorQueueSize)
chPartitions := make(chan models.Partition, job.ExtractorQueueSize)
chPartitions := make(chan models.Partition)
chBatchesRaw := make(chan models.Batch, job.ExtractorQueueSize)
chBatchesTransformed := make(chan models.Batch, job.TransformerQueueSize)
var wgActivePartitions sync.WaitGroup
var wgActiveBatches sync.WaitGroup
var wgExtractors sync.WaitGroup
var wgTransformers sync.WaitGroup
var wgLoaders sync.WaitGroup
var wgActivePartitions, wgActiveBatches, wgExtractors, wgTransformers, wgLoaders sync.WaitGroup
var rowsRead, rowsLoaded, rowsFailed int64
var failedPartitionsCount, failedBatchesLoadCount int32
go func() {
if err := custom_errors.JobErrorHandler(localCtx, chJobErrors); err != nil {
@@ -133,18 +135,8 @@ func processMigrationJob(
}
}()
go custom_errors.LoaderErrorHandler(
localCtx,
job.Retry,
job.MaxExtractorBatchErrors,
chLoadersErrors,
chBatchesTransformed,
chJobErrors,
&wgActiveBatches,
)
maxExtractors := min(job.MaxExtractors, len(partitions))
log.Infof("Starting %d extractor(s)...", maxExtractors)
log.Infof("Starting %d extractor(s)... (%v)", maxExtractors, job.Name)
for range maxExtractors {
wgExtractors.Go(func() {
@@ -159,6 +151,7 @@ func processMigrationJob(
chJobErrors,
&wgActivePartitions,
&rowsRead,
&failedPartitionsCount,
job.SourceTable.FromJsonColumns,
)
})
@@ -171,13 +164,15 @@ func processMigrationJob(
}
}()
log.Infof("Starting %d transformer(s)...", maxExtractors)
log.Infof("Starting %d transformer(s)... (%v)", maxExtractors, job.Name)
for range maxExtractors {
wgTransformers.Go(func() {
transformer.Exec(
transformer.Consume(
localCtx,
sourceColTypes,
job.Retry,
job.TransformerBatchSize,
chBatchesRaw,
chBatchesTransformed,
chJobErrors,
@@ -186,7 +181,7 @@ func processMigrationJob(
})
}
log.Infof("Starting %d loader(s)...", job.MaxLoaders)
log.Infof("Starting %d loader(s)... (%v)", job.MaxLoaders, job.Name)
for range job.MaxLoaders {
wgLoaders.Go(func() {
@@ -195,39 +190,39 @@ func processMigrationJob(
job.TargetTable,
targetColTypes,
job.Retry,
job.LoaderBatchSize,
chBatchesTransformed,
chJobErrors,
&wgActiveBatches,
&rowsLoaded,
&failedBatchesLoadCount,
)
})
}
go func() {
log.Debugf("Waiting for goroutines (%v)", job.Name)
// log.Debugf("Waiting for goroutines (%v)", job.Name)
wgActivePartitions.Wait()
log.Debugf("wgActivePartitions is empty (%v)", job.Name)
// log.Debugf("wgActivePartitions is empty (%v)", job.Name)
close(chPartitions)
log.Debugf("chPartitions is closed (%v)", job.Name)
// log.Debugf("chPartitions is closed (%v)", job.Name)
wgExtractors.Wait()
log.Debugf("wgExtractors is empty (%v)", job.Name)
// log.Debugf("wgExtractors is empty (%v)", job.Name)
close(chBatchesRaw)
log.Debugf("chBatchesRaw is closed (%v)", job.Name)
// log.Debugf("chBatchesRaw is closed (%v)", job.Name)
wgTransformers.Wait()
log.Debugf("wgTransformers is empty (%v)", job.Name)
// log.Debugf("wgTransformers is empty (%v)", job.Name)
close(chBatchesTransformed)
// log.Debugf("chBatchesTransformed is closed (%v)", job.Name)
wgActiveBatches.Wait()
log.Debugf("wgActiveBatches is empty (%v)", job.Name)
close(chBatchesTransformed)
log.Debugf("chBatchesTransformed is empty (%v)", job.Name)
close(chLoadersErrors)
log.Debugf("chLoadersErrors is empty (%v)", job.Name)
// log.Debugf("wgActiveBatches is empty (%v)", job.Name)
wgLoaders.Wait()
log.Debugf("wgLoaders is empty (%v)", job.Name)
// log.Debugf("wgLoaders is empty (%v)", job.Name)
cancel()
}()
@@ -239,9 +234,9 @@ func processMigrationJob(
}
}
log.Debugf("waiting for local context to be done (%v)", job.Name)
// log.Debugf("waiting for local context to be done (%v)", job.Name)
<-localCtx.Done()
log.Debugf("local context done (%v)", job.Name)
// log.Debugf("local context done (%v)", job.Name)
if ctx.Err() != nil {
result.Error = ctx.Err()
@@ -256,5 +251,9 @@ func processMigrationJob(
result.Error = fmt.Errorf("Row count mismatch: extracted %d rows but loaded %d rows (failed: %d)", result.RowsRead, result.RowsLoaded, result.RowsFailed)
}
if result.RowsRead == 0 {
log.Warnf("No rows extracted from (%v)", job.Name)
}
return result
}

192
cmd/go_migrate/validate.go Normal file
View File

@@ -0,0 +1,192 @@
package main
import (
"context"
"database/sql"
"fmt"
"sync"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/config"
dbwrapper "git.ksdemosapps.com/kylesoda/go-migrate/internal/app/db-wrapper"
log "github.com/sirupsen/logrus"
)
type ValidationResult struct {
JobName string
SourceTable string
TargetTable string
SourceCount int64
TargetCount int64
Match bool
Error error
}
func countSourceRows(ctx context.Context, db dbwrapper.DbWrapper, job config.Job) (int64, error) {
schema := job.SourceTable.Schema
table := job.SourceTable.Table
hasRange := job.Range.Min != nil || job.Range.Max != nil
var (
query string
args []any
)
if hasRange && job.SourceTable.PrimaryKey != "" {
query = fmt.Sprintf("SELECT COUNT_BIG(*) FROM [%s].[%s] WHERE 1=1", schema, table)
if job.Range.Min != nil {
op := ">"
if job.Range.IsMinInclusive {
op = ">="
}
query += fmt.Sprintf(" AND [%s] %s @min", job.SourceTable.PrimaryKey, op)
args = append(args, sql.Named("min", *job.Range.Min))
}
if job.Range.Max != nil {
op := "<"
if job.Range.IsMaxInclusive {
op = "<="
}
query += fmt.Sprintf(" AND [%s] %s @max", job.SourceTable.PrimaryKey, op)
args = append(args, sql.Named("max", *job.Range.Max))
}
} else {
query = fmt.Sprintf("SELECT COUNT_BIG(*) FROM [%s].[%s]", schema, table)
}
var count int64
if err := db.QueryRow(ctx, query, args...).Scan(&count); err != nil {
return 0, err
}
return count, nil
}
func countTargetRows(ctx context.Context, db dbwrapper.DbWrapper, job config.Job) (int64, error) {
schema := job.TargetTable.Schema
table := job.TargetTable.Table
query := fmt.Sprintf(`SELECT COUNT(*) FROM "%s"."%s"`, schema, table)
var count int64
if err := db.QueryRow(ctx, query).Scan(&count); err != nil {
return 0, err
}
return count, nil
}
func validateJob(ctx context.Context, sourceDb, targetDb dbwrapper.DbWrapper, job config.Job) ValidationResult {
result := ValidationResult{
JobName: job.Name,
SourceTable: fmt.Sprintf("[%s].[%s]", job.SourceTable.Schema, job.SourceTable.Table),
TargetTable: fmt.Sprintf(`"%s"."%s"`, job.TargetTable.Schema, job.TargetTable.Table),
}
var (
sourceErr, targetErr error
wg sync.WaitGroup
)
wg.Add(2)
go func() {
defer wg.Done()
result.SourceCount, sourceErr = countSourceRows(ctx, sourceDb, job)
}()
go func() {
defer wg.Done()
result.TargetCount, targetErr = countTargetRows(ctx, targetDb, job)
}()
wg.Wait()
if sourceErr != nil {
result.Error = fmt.Errorf("source count failed: %w", sourceErr)
return result
}
if targetErr != nil {
result.Error = fmt.Errorf("target count failed: %w", targetErr)
return result
}
result.Match = result.SourceCount == result.TargetCount
return result
}
func validateJobs(
ctx context.Context,
sourceDb dbwrapper.DbWrapper,
targetDb dbwrapper.DbWrapper,
jobs []config.Job,
maxParallelWorkers int,
) []ValidationResult {
if len(jobs) == 0 {
return nil
}
if maxParallelWorkers <= 0 {
maxParallelWorkers = 1
}
if maxParallelWorkers > len(jobs) {
maxParallelWorkers = len(jobs)
}
chJobs := make(chan config.Job, len(jobs))
var mu sync.Mutex
var results []ValidationResult
var wg sync.WaitGroup
for range maxParallelWorkers {
wg.Go(func() {
for job := range chJobs {
res := validateJob(ctx, sourceDb, targetDb, job)
mu.Lock()
results = append(results, res)
mu.Unlock()
}
})
}
for _, job := range jobs {
chJobs <- job
}
close(chJobs)
wg.Wait()
return results
}
func printValidationReport(results []ValidationResult) {
log.Info("=== VALIDATION REPORT ===")
var totalMatch, totalMismatch, totalErrors int
for _, r := range results {
if r.Error != nil {
log.Errorf("[%s] ERROR: %v", r.JobName, r.Error)
totalErrors++
continue
}
if !r.Match {
totalMismatch++
diff := r.TargetCount - r.SourceCount
var diffStr string
if diff > 0 {
diffStr = fmt.Sprintf(" (target has %d extra rows)", diff)
} else {
diffStr = fmt.Sprintf(" (target is missing %d rows)", -diff)
}
log.Warnf("[%s] MISMATCH | Source %s: %d | Target %s: %d%s",
r.JobName,
r.SourceTable, r.SourceCount,
r.TargetTable, r.TargetCount,
diffStr,
)
} else {
totalMatch++
log.Infof("[%s] OK | Source %s: %d | Target %s: %d",
r.JobName,
r.SourceTable, r.SourceCount,
r.TargetTable, r.TargetCount,
)
}
}
log.Infof("=== Validation complete: %d OK, %d mismatches, %d errors ===", totalMatch, totalMismatch, totalErrors)
}

View File

@@ -0,0 +1,35 @@
max_parallel_workers: 4
source_db_type: postgres
target_db_type: sqlserver
defaults:
batches_per_partition: 4
max_extractors: 2
extractor_batch_size: 5000
extractor_queue_size: 8
max_transformers: 2
transformer_batch_size: 12500
transformer_queue_size: 8
max_loaders: 4
loader_batch_size: 25000
partition_calculation_strategy: EXACT
truncate_target: true
truncate_method: TRUNCATE
retry:
attempts: 3
base_delay_ms: 500
max_delay_ms: 10000
max_jitter_ms: 500
max_failed_partitions: 5
max_failed_batches_load: 5
jobs:
- name: cartografia_manzana_reverse
enabled: true
source:
schema: Cartografia
table: MANZANA
primary_key: GDB_ARCHIVE_OID
target:
schema: Cartografia
table: MANZANA

35
config-reverse.yaml Normal file
View File

@@ -0,0 +1,35 @@
max_parallel_workers: 4
source_db_type: postgres
target_db_type: sqlserver
defaults:
batches_per_partition: 4
max_extractors: 8
extractor_batch_size: 25000
extractor_queue_size: 32
max_transformers: 8
transformer_batch_size: 50000
transformer_queue_size: 32
max_loaders: 8
loader_batch_size: 50000
partition_calculation_strategy: EXACT
truncate_target: true
truncate_method: TRUNCATE
retry:
attempts: 3
base_delay_ms: 500
max_delay_ms: 10000
max_jitter_ms: 500
max_failed_partitions: 5
max_failed_batches_load: 5
jobs:
- name: cartografia_manzana_reverse
enabled: true
source:
schema: Cartografia
table: MANZANA
primary_key: GDB_ARCHIVE_OID
target:
schema: Cartografia
table: MANZANA

View File

@@ -3,24 +3,25 @@ source_db_type: sqlserver
target_db_type: postgres
defaults:
batches_per_partition: 8
batches_per_partition: 4
max_extractors: 2
extractor_batch_size: 25000
extractor_batch_size: 5000
extractor_queue_size: 8
max_transformers: 2
transformer_batch_size: 25000
transformer_batch_size: 12500
transformer_queue_size: 8
max_loaders: 4
loader_batch_size: 25000
partition_calculation_strategy: EXACT # EXACT | ESTIMATION
truncate_target: true
truncate_method: TRUNCATE # TRUNCATE | DELETE
max_partition_errrors: 5
max_extractor_batch_errors: 5
retry:
attempts: 3
base_delay_ms: 500
max_delay_ms: 10000
max_jitter_ms: 500
max_failed_partitions: 5
max_failed_batches_load: 5
jobs:
- name: cartografia_manzana
@@ -32,50 +33,45 @@ jobs:
target:
schema: Cartografia
table: MANZANA
pre_sql:
- 'SELECT 1'
range:
min: 1000000
max: 2000000
is_min_inclusive: false
is_max_inclusive: true
- name: red_puerto
enabled: true
source:
schema: Red
table: PUERTO
primary_key: ID_PUERTO
from_json:
- column: $node_id*
field: id
target:
schema: Red
table: PUERTO
pre_sql:
- 'SELECT 1'
post_sql:
- "SELECT 1"
# - name: red_puerto
# enabled: true
# source:
# schema: Red
# table: PUERTO
# primary_key: ID_PUERTO
# from_json:
# - column: $node_id*
# field: id
# target:
# schema: Red
# table: PUERTO
- name: infraestructura_site_holder__attach
source:
schema: Infraestructura
table: SITE_HOLDER__ATTACH
primary_key: GDB_ARCHIVE_OID
target:
schema: Infraestructura
table: SITE_HOLDER__ATTACH
to_storage:
columns:
- source: DATA
target: FILE_URL
mode: REFERENCE_ONLY
max_extractors: 8
max_loaders: 4
queue_size: 32
batch_size: 1
retry:
attempts: 5
base_delay_ms: 1000
max_delay_ms: 15000
max_jitter_ms: 500
# - name: infraestructura_site_holder__attach
# source:
# schema: Infraestructura
# table: SITE_HOLDER__ATTACH
# primary_key: GDB_ARCHIVE_OID
# target:
# schema: Infraestructura
# table: SITE_HOLDER__ATTACH
# to_storage:
# columns:
# - source: DATA
# target: FILE_URL
# mode: REFERENCE_ONLY
# prefix: Infraestructura/SITE_HOLDER__ATTACH
# batches_per_partition: 20
# max_extractors: 32
# extractor_batch_size: 1
# extractor_queue_size: 100
# max_transformers: 48
# transformer_batch_size: 500
# transformer_queue_size: 8
# max_loaders: 4
# loader_batch_size: 500
# retry:
# attempts: 5
# base_delay_ms: 1000
# max_delay_ms: 15000
# max_jitter_ms: 500

2
docker/.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
data/**/*
compose.override.yml

50
docker/compose.yml Normal file
View File

@@ -0,0 +1,50 @@
name: db-migration
services:
azurite:
image: mcr.microsoft.com/azure-storage/azurite:3.35.0
container_name: azurite
restart: unless-stopped
ports:
- 8880:10000
- 8881:10001
- 8882:10002
volumes:
- ./data/azurite:/data
command: 'azurite --blobHost 0.0.0.0 --queueHost 0.0.0.0 --tableHost 0.0.0.0 --location /data --skipApiVersionCheck'
profiles:
- storage
- target
mssql:
image: mcr.microsoft.com/mssql/server:2022-latest
restart: unless-stopped
environment:
ACCEPT_EULA: Y
MSSQL_SA_PASSWORD: SecurePassword123
MSSQL_PID: Developer
MSSQL_MEMORY_LIMIT_MB: 8192
ports:
- 8883:1433
volumes:
- ./data/mssql:/var/opt/mssql
profiles:
- mssql
- source
- db
postgres:
image: postgis/postgis:16-3.4
restart: unless-stopped
environment:
POSTGRES_DB: test_db
POSTGRES_USER: postgres
POSTGRES_PASSWORD: SecurePassword123
ports:
- 8884:5432
volumes:
- ./data/postgres:/var/lib/postgresql/data
profiles:
- postgres
- target
- db
shm_size: '1gb'

View File

@@ -4,10 +4,13 @@ import (
"context"
"errors"
"fmt"
"net/http"
"net/url"
"path"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/config"
"github.com/Azure/azure-sdk-for-go/sdk/storage/azblob"
"github.com/Azure/azure-sdk-for-go/sdk/storage/azblob/blob"
)
var (
@@ -67,21 +70,29 @@ func (c *Client) UploadBuffer(ctx context.Context, containerName, blobPath strin
return nil
}
func (c *Client) Ping(ctx context.Context) error {
pager := c.client.NewListBlobsFlatPager(c.azureStorageConfig.Container, nil)
_, err := pager.NextPage(ctx)
if err != nil {
return fmt.Errorf("storage access check failed: %w", err)
}
return nil
}
func (c *Client) UploadAndGetURL(ctx context.Context, blobPath string, buffer []byte) (string, error) {
if blobPath == "" || buffer == nil {
return "", ErrInvalidInput
}
fullPath := blobPath
if c.azureStorageConfig.Prefix != "" {
fullPath, _ = url.JoinPath(c.azureStorageConfig.Prefix, blobPath)
fullPath := path.Join(c.azureStorageConfig.Prefix, blobPath)
contentType := http.DetectContentType(buffer)
opts := &azblob.UploadBufferOptions{
HTTPHeaders: &blob.HTTPHeaders{BlobContentType: &contentType},
}
if _, err := c.client.UploadBuffer(ctx, c.azureStorageConfig.Container, fullPath, buffer, opts); err != nil {
return "", fmt.Errorf("uploading blob %s: %w", fullPath, err)
}
if err := c.UploadBuffer(ctx, c.azureStorageConfig.Container, fullPath, buffer); err != nil {
return "", err
}
blobEndpoint, _ := url.JoinPath(c.azureStorageConfig.ServiceURL, c.azureStorageConfig.AccountName)
blobURL, _ := url.JoinPath(blobEndpoint, c.azureStorageConfig.Container, fullPath)
return blobURL, nil
return fullPath, nil
}

View File

@@ -1,6 +1,10 @@
package config
import (
"fmt"
"maps"
"net/url"
"github.com/ilyakaznacheev/cleanenv"
log "github.com/sirupsen/logrus"
)
@@ -16,12 +20,97 @@ type AzureStorageConfig struct {
}
type appConfig struct {
SourceDbUrl string `env:"SOURCE_DB_URL" env-required:"true"`
TargetDbUrl string `env:"TARGET_DB_URL" env-required:"true"`
SourceDbUrl string `env:"SOURCE_DB_URL"`
SourceDbHost string `env:"SOURCE_DB_HOST"`
SourceDbPort string `env:"SOURCE_DB_PORT"`
SourceDbName string `env:"SOURCE_DB_NAME"`
SourceDbUser string `env:"SOURCE_DB_USER"`
SourceDbPwd string `env:"SOURCE_DB_PWD"`
SourceDbOptions string `env:"SOURCE_DB_OPTIONS"`
TargetDbUrl string `env:"TARGET_DB_URL"`
TargetDbHost string `env:"TARGET_DB_HOST"`
TargetDbPort string `env:"TARGET_DB_PORT"`
TargetDbName string `env:"TARGET_DB_NAME"`
TargetDbUser string `env:"TARGET_DB_USER"`
TargetDbPwd string `env:"TARGET_DB_PWD"`
TargetDbOptions string `env:"TARGET_DB_OPTIONS"`
LogLevel string `env:"LOG_LEVEL" env-default:"INFO"`
AzureStorage AzureStorageConfig
}
func (c *appConfig) ResolveSourceDbUrl(dbType string) (string, error) {
if c.SourceDbUrl != "" {
return c.SourceDbUrl, nil
}
u, err := buildDbUrl(dbType, c.SourceDbHost, c.SourceDbPort, c.SourceDbName, c.SourceDbUser, c.SourceDbPwd, c.SourceDbOptions)
if err != nil {
return "", fmt.Errorf("source DB: %w", err)
}
return u, nil
}
func (c *appConfig) ResolveTargetDbUrl(dbType string) (string, error) {
if c.TargetDbUrl != "" {
return c.TargetDbUrl, nil
}
u, err := buildDbUrl(dbType, c.TargetDbHost, c.TargetDbPort, c.TargetDbName, c.TargetDbUser, c.TargetDbPwd, c.TargetDbOptions)
if err != nil {
return "", fmt.Errorf("target DB: %w", err)
}
return u, nil
}
func buildDbUrl(dbType, host, port, name, user, pwd, options string) (string, error) {
if host == "" {
return "", fmt.Errorf("DB_HOST is required when DB_URL is not set")
}
if name == "" {
return "", fmt.Errorf("DB_NAME is required when DB_URL is not set")
}
if user == "" {
return "", fmt.Errorf("DB_USER is required when DB_URL is not set")
}
switch dbType {
case "sqlserver":
if port == "" {
port = "1433"
}
q := url.Values{}
if options != "" {
extra, err := url.ParseQuery(options)
if err != nil {
return "", fmt.Errorf("invalid DB_OPTIONS: %w", err)
}
maps.Copy(q, extra)
}
q.Set("database", name)
u := &url.URL{
Scheme: "sqlserver",
Host: host + ":" + port,
User: url.UserPassword(user, pwd),
RawQuery: q.Encode(),
}
return u.String(), nil
case "postgres":
if port == "" {
port = "5432"
}
u := &url.URL{
Scheme: "postgres",
Host: host + ":" + port,
User: url.UserPassword(user, pwd),
Path: "/" + name,
RawQuery: options,
}
return u.String(), nil
default:
return "", fmt.Errorf("unknown db type %q — cannot build URL from individual components", dbType)
}
}
func getAppConfig() appConfig {
var cfg appConfig

View File

@@ -12,12 +12,15 @@ type RetryConfig struct {
BaseDelayMs int `yaml:"base_delay_ms"`
MaxDelayMs int `yaml:"max_delay_ms"`
MaxJitterMs int `yaml:"max_jitter_ms"`
MaxFailedPartitions int `yaml:"max_failed_partitions"`
MaxFailedBatchesLoad int `yaml:"max_failed_batches_load"`
}
type ToStorageColumnConfig struct {
Source string `yaml:"source"`
Target string `yaml:"target"`
Mode string `yaml:"mode"`
Prefix string `yaml:"prefix"`
}
type ToStorageConfig struct {
@@ -34,10 +37,9 @@ type JobConfig struct {
TransformerQueueSize int `yaml:"transformer_queue_size"`
MaxLoaders int `yaml:"max_loaders"`
LoaderBatchSize int `yaml:"loader_batch_size"`
PartitionCalculationStrategy string `yaml:"partition_calculation_strategy"`
TruncateTarget bool `yaml:"truncate_target"`
TruncateMethod string `yaml:"truncate_method"`
MaxPartitionErrrors int `yaml:"max_partition_errrors"`
MaxExtractorBatchErrors int `yaml:"max_extractor_batch_errors"`
Retry RetryConfig `yaml:"retry"`
RowsPerPartition int64
ToStorage ToStorageConfig `yaml:"to_storage"`
@@ -66,8 +68,8 @@ type TargetTableInfo struct {
}
type RangeConfig struct {
Min int64 `yaml:"min"`
Max int64 `yaml:"max"`
Min *int64 `yaml:"min"`
Max *int64 `yaml:"max"`
IsMinInclusive bool `yaml:"is_min_inclusive"`
IsMaxInclusive bool `yaml:"is_max_inclusive"`
}

View File

@@ -1,7 +1,6 @@
package custom_errors
import (
"context"
"math/rand"
"time"
)
@@ -40,22 +39,3 @@ func ComputeBackoffDelay(retryCounter int, baseDelayMs int, maxDelayMs int, maxJ
return delay
}
func requeueWithBackoff(ctx context.Context, delay time.Duration, enqueue func()) {
if delay <= 0 {
enqueue()
return
}
go func() {
timer := time.NewTimer(delay)
defer timer.Stop()
select {
case <-ctx.Done():
return
case <-timer.C:
enqueue()
}
}()
}

View File

@@ -1,107 +0,0 @@
package custom_errors
import (
"context"
"fmt"
"sync"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/config"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/models"
)
type LoaderError struct {
Batch models.Batch
Msg string
}
func (e *LoaderError) Error() string {
return e.Msg
}
func LoaderErrorHandler(
ctx context.Context,
retryConfig config.RetryConfig,
maxChunkErrors int,
chErrorsIn <-chan LoaderError,
chBatchesOut chan<- models.Batch,
chJobErrorsOut chan<- JobError,
wgActiveBatches *sync.WaitGroup,
) {
definitiveErrors := 0
for {
if ctx.Err() != nil {
return
}
select {
case <-ctx.Done():
return
case err, ok := <-chErrorsIn:
if !ok {
return
}
if err.Batch.RetryCounter >= retryConfig.Attempts {
wgActiveBatches.Done()
definitiveErrors++
jobError := JobError{
ShouldCancelJob: false,
Msg: fmt.Sprintf("Batch %v reached max retries (%d)", err.Batch.Id, retryConfig.Attempts),
Prev: &err,
}
select {
case chJobErrorsOut <- jobError:
case <-ctx.Done():
return
}
if maxChunkErrors > 0 && definitiveErrors >= maxChunkErrors {
fatalError := JobError{
ShouldCancelJob: true,
Msg: fmt.Sprintf("Chunk error limit reached (%d)", maxChunkErrors),
Prev: &err,
}
select {
case chJobErrorsOut <- fatalError:
case <-ctx.Done():
return
}
}
continue
} else {
jobError := JobError{
ShouldCancelJob: false,
Msg: fmt.Sprintf("Temporal error in batch %v (retries: %d)", err.Batch.Id, err.Batch.RetryCounter),
Prev: &err,
}
select {
case chJobErrorsOut <- jobError:
case <-ctx.Done():
return
}
}
err.Batch.RetryCounter++
delay := ComputeBackoffDelay(
err.Batch.RetryCounter,
retryConfig.BaseDelayMs,
retryConfig.MaxDelayMs,
retryConfig.MaxJitterMs,
)
requeueWithBackoff(ctx, delay, func() {
select {
case chBatchesOut <- err.Batch:
case <-ctx.Done():
return
}
})
}
}
}

View File

@@ -14,3 +14,12 @@ type ExtractorError struct {
func (e *ExtractorError) Error() string {
return e.Msg
}
type LoaderError struct {
Batch models.Batch
Msg string
}
func (e *LoaderError) Error() string {
return e.Msg
}

View File

@@ -10,7 +10,6 @@ import (
dbdialects "git.ksdemosapps.com/kylesoda/go-migrate/internal/app/db-wrapper/db_dialects"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/models"
mssql "github.com/microsoft/go-mssqldb"
"github.com/sirupsen/logrus"
)
func init() {
@@ -188,8 +187,6 @@ func buildExtractQueryMssql(q ExtractionQuery) (string, error) {
hasRegularColumns := len(q.Columns) > 0
hasJsonColumns := len(q.FromJsonColumns) > 0
// logrus.Debugf("Extraction query: %+v", q)
resolvedJson := make(map[string][]config.FromJsonItem, len(q.FromJsonColumns))
if hasJsonColumns {
for _, jsonConfig := range q.FromJsonColumns {
@@ -296,7 +293,7 @@ func (mw *mssqlDbWrapper) QueryFromObject(ctx context.Context, q ExtractionQuery
return nil, err
}
logrus.Debugf("Query: %s", queryString)
// logrus.Debugf("Query: %s", queryString)
var queryArgs []any

View File

@@ -25,6 +25,7 @@ func (ex *GenericExtractor) Consume(
chErrorsOut chan<- custom_errors.JobError,
wgActivePartitions *sync.WaitGroup,
rowsRead *int64,
failedPartitionsCount *int32,
fromJsonColumns []config.FromJsonItem,
) {
indexPrimaryKey := slices.IndexFunc(columns, func(col models.ColumnType) bool {
@@ -72,11 +73,12 @@ func (ex *GenericExtractor) Consume(
if rowsReadResult > 0 {
current := atomic.LoadInt64(rowsRead)
logrus.Debugf("Rows read: +%v [current=%v] (%s.%s)", rowsReadResult, current, tableInfo.Schema, tableInfo.Table)
logrus.Debugf("Rows read (partition extracted): +%v [current=%v] (%s.%s)", rowsReadResult, current, tableInfo.Schema, tableInfo.Table)
atomic.AddInt64(rowsRead, int64(rowsReadResult))
}
if err != nil {
atomic.AddInt32(failedPartitionsCount, 1)
if jobError, ok := errors.AsType[*custom_errors.JobError](err); ok {
select {
case <-ctx.Done():
@@ -90,6 +92,16 @@ func (ex *GenericExtractor) Consume(
case chErrorsOut <- custom_errors.JobError{ShouldCancelJob: false, Msg: err.Error(), Prev: err}:
}
}
currentFPCount := atomic.LoadInt32(failedPartitionsCount)
if currentFPCount > int32(retryConfig.MaxFailedPartitions) {
select {
case <-ctx.Done():
return
case chErrorsOut <- custom_errors.JobError{ShouldCancelJob: true, Msg: "Max failed partitions reached"}:
return
}
}
}
}
}

View File

@@ -27,7 +27,6 @@ func sendBatch(ctx context.Context, chBatchesOut chan<- models.Batch, batch mode
func flush(
ctx context.Context,
partition *models.Partition,
batchSize int,
batchRows []models.UnknownRowValues,
chBatchesOut chan<- models.Batch,
@@ -36,7 +35,7 @@ func flush(
return nil
}
batch := models.Batch{Id: uuid.New(), PartitionId: partition.Id, Rows: batchRows}
batch := models.Batch{Id: uuid.New(), Rows: batchRows}
batchRows = make([]models.UnknownRowValues, 0, batchSize)
return sendBatch(ctx, chBatchesOut, batch)
}

View File

@@ -50,7 +50,7 @@ func (ex *GenericExtractor) ProcessPartitionWithRetries(
if currentParitition.RetryCounter >= retryConfig.Attempts {
return totalRowsRead, &custom_errors.JobError{
Msg: fmt.Sprintf("Partition %v reached max retries", exError.Partition.Id),
Msg: fmt.Sprintf("Partition %v reached max retries (%d)", currentParitition.Id, currentParitition.RetryCounter),
Prev: err,
}
}

View File

@@ -76,6 +76,7 @@ func (ex *GenericExtractor) ProcessPartition(
batchRows := make([]models.UnknownRowValues, 0, batchSize)
var rowsRead int64 = 0
var lastRow models.UnknownRowValues
for rows.Next() {
rowValues := make([]any, len(columns))
@@ -90,7 +91,7 @@ func (ex *GenericExtractor) ProcessPartition(
return rowsRead, err
}
if err := flush(ctx, &partition, batchSize, batchRows, chBatchesOut); err != nil {
if err := flush(ctx, batchSize, batchRows, chBatchesOut); err != nil {
return rowsRead, err
}
@@ -98,11 +99,12 @@ func (ex *GenericExtractor) ProcessPartition(
return rowsRead, errorFromLastPartitionRow(lastRow, indexPrimaryKey, partition, err)
}
rowsRead++
lastRow = rowValues
batchRows = append(batchRows, rowValues)
if len(batchRows) >= batchSize {
// logrus.Debugf("Batch size reached, flushing batch with %v rows (rowsRead=%v)", len(batchRows), rowsRead)
if err := flush(ctx, &partition, batchSize, batchRows, chBatchesOut); err != nil {
if err := flush(ctx, batchSize, batchRows, chBatchesOut); err != nil {
// logrus.Warnf("Error flushing rows: %v", err)
return rowsRead, err
}
@@ -110,9 +112,16 @@ func (ex *GenericExtractor) ProcessPartition(
}
}
if err := flush(ctx, &partition, batchSize, batchRows, chBatchesOut); err != nil {
if err := flush(ctx, batchSize, batchRows, chBatchesOut); err != nil {
return rowsRead, err
}
return rowsRead, rows.Err()
if err := rows.Err(); err != nil {
if lastRow != nil {
return rowsRead, errorFromLastPartitionRow(lastRow, indexPrimaryKey, partition, err)
}
return rowsRead, err
}
return rowsRead, nil
}

View File

@@ -9,54 +9,144 @@ import (
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/config"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/custom_errors"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/models"
"github.com/google/uuid"
"github.com/sirupsen/logrus"
)
type loaderAccumulator struct {
batchSize int
rows []models.UnknownRowValues
parents []models.BatchRef
pendingDone int
}
func (a *loaderAccumulator) add(batch models.Batch) {
a.rows = append(a.rows, batch.Rows...)
a.parents = append(a.parents, models.BatchRef{Id: batch.Id})
a.pendingDone++
}
func (a *loaderAccumulator) ready() bool {
return len(a.rows) >= a.batchSize
}
func (a *loaderAccumulator) drainPending(wg *sync.WaitGroup) {
for range a.pendingDone {
wg.Done()
}
}
func sendLoadError(
ctx context.Context,
err error,
retryConfig config.RetryConfig,
failedBatchesCount *int32,
chErrorsOut chan<- custom_errors.JobError,
) bool {
atomic.AddInt32(failedBatchesCount, 1)
var jobErr custom_errors.JobError
if je, ok := errors.AsType[*custom_errors.JobError](err); ok {
jobErr = *je
} else {
jobErr = custom_errors.JobError{ShouldCancelJob: false, Msg: err.Error(), Prev: err}
}
select {
case <-ctx.Done():
return false
case chErrorsOut <- jobErr:
}
if atomic.LoadInt32(failedBatchesCount) > int32(retryConfig.MaxFailedBatchesLoad) {
select {
case <-ctx.Done():
case chErrorsOut <- custom_errors.JobError{ShouldCancelJob: true, Msg: "Max failed batches (load) reached"}:
}
return false
}
return true
}
func (gl *GenericLoader) Consume(
ctx context.Context,
tableInfo config.TargetTableInfo,
columns []models.ColumnType,
retryConfig config.RetryConfig,
batchSize int,
chBatchesIn <-chan models.Batch,
chErrorsOut chan<- custom_errors.JobError,
wgActiveBatches *sync.WaitGroup,
rowsLoaded *int64,
failedBatchesCount *int32,
) {
colNames := mapSlice(columns, func(col models.ColumnType) string {
return col.Name()
})
for {
if ctx.Err() != nil {
return
}
acc := &loaderAccumulator{batchSize: batchSize}
defer acc.drainPending(wgActiveBatches)
select {
case <-ctx.Done():
return
case batch, ok := <-chBatchesIn:
if !ok {
return
flush := func() bool {
if len(acc.rows) == 0 {
return true
}
processedRows, err := gl.ProcessBatchWithRetries(ctx, tableInfo, colNames, retryConfig, batch)
count := len(acc.parents)
superBatch := models.Batch{
Id: uuid.New(),
ParentBatches: acc.parents,
Rows: acc.rows,
}
processedRows, err := gl.ProcessBatchWithRetries(ctx, tableInfo, colNames, retryConfig, superBatch)
for range count {
wgActiveBatches.Done()
}
acc.pendingDone -= count
acc.rows = nil
acc.parents = nil
if err != nil {
if jobError, ok := errors.AsType[*custom_errors.JobError](err); ok {
select {
case <-ctx.Done():
return
case chErrorsOut <- *jobError:
return sendLoadError(ctx, err, retryConfig, failedBatchesCount, chErrorsOut)
}
} else {
select {
case <-ctx.Done():
return
case chErrorsOut <- custom_errors.JobError{ShouldCancelJob: false, Msg: err.Error(), Prev: err}:
}
}
} else {
current := atomic.LoadInt64(rowsLoaded)
logrus.Debugf("Rows loaded (batch loaded): +%v [current=%v] (%s.%s)", processedRows, current, tableInfo.Schema, tableInfo.Table)
atomic.AddInt64(rowsLoaded, int64(processedRows))
return true
}
for {
select {
case <-ctx.Done():
return
case batch, ok := <-chBatchesIn:
if !ok {
flush()
return
}
if batchSize <= 0 {
processedRows, err := gl.ProcessBatchWithRetries(ctx, tableInfo, colNames, retryConfig, batch)
wgActiveBatches.Done()
if err != nil {
if !sendLoadError(ctx, err, retryConfig, failedBatchesCount, chErrorsOut) {
return
}
continue
}
current := atomic.LoadInt64(rowsLoaded)
logrus.Debugf("Rows loaded: +%v [current=%v] (%s.%s)", processedRows, current, tableInfo.Schema, tableInfo.Table)
atomic.AddInt64(rowsLoaded, int64(processedRows))
continue
}
acc.add(batch)
if acc.ready() {
if !flush() {
return
}
}
}
}

View File

@@ -0,0 +1,603 @@
package loaders
import (
"context"
"errors"
"sync"
"sync/atomic"
"testing"
"time"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/config"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/custom_errors"
dbwrapper "git.ksdemosapps.com/kylesoda/go-migrate/internal/app/db-wrapper"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/models"
"github.com/google/uuid"
)
const testTimeout = 2 * time.Second
type mockResult struct {
err error
}
type mockDbWrapper struct {
mu sync.Mutex
callCount int
results []mockResult
}
func newMockDb(results ...mockResult) *mockDbWrapper {
return &mockDbWrapper{results: results}
}
func (m *mockDbWrapper) SaveMassive(_ context.Context, _ string, _ string, _ []string, rows [][]any) (int64, error) {
m.mu.Lock()
defer m.mu.Unlock()
idx := m.callCount
m.callCount++
if idx < len(m.results) && m.results[idx].err != nil {
return 0, m.results[idx].err
}
return int64(len(rows)), nil
}
func (m *mockDbWrapper) Close() error { return nil }
func (m *mockDbWrapper) Connect(_ context.Context, _ string) error { return nil }
func (m *mockDbWrapper) Exec(_ context.Context, _ string, _ ...any) (dbwrapper.ExecResult, error) {
return dbwrapper.ExecResult{}, nil
}
func (m *mockDbWrapper) GetDialect() string { return "" }
func (m *mockDbWrapper) Query(_ context.Context, _ string, _ ...any) (dbwrapper.RowsResult, error) {
return nil, nil
}
func (m *mockDbWrapper) QueryRow(_ context.Context, _ string, _ ...any) dbwrapper.RowResult {
return nil
}
func (m *mockDbWrapper) QueryFromObject(_ context.Context, _ dbwrapper.ExtractionQuery) (dbwrapper.RowsResult, error) {
return nil, nil
}
func makeBatch(numRows int) models.Batch {
rows := make([]models.UnknownRowValues, numRows)
for i := range rows {
rows[i] = models.UnknownRowValues{i}
}
return models.Batch{Id: uuid.New(), Rows: rows}
}
func newLoader(db *mockDbWrapper) GenericLoader {
return GenericLoader{db: db}
}
func rc(maxFailed int) config.RetryConfig {
return config.RetryConfig{Attempts: 1, MaxFailedBatchesLoad: maxFailed}
}
func sendBatch(chIn chan<- models.Batch, batch models.Batch, wg *sync.WaitGroup) {
wg.Add(1)
chIn <- batch
}
func runConsume(
ctx context.Context,
gl GenericLoader,
retryConfig config.RetryConfig,
batchSize int,
chIn <-chan models.Batch,
chErr chan<- custom_errors.JobError,
wg *sync.WaitGroup,
rowsLoaded *int64,
failedCount *int32,
) <-chan struct{} {
done := make(chan struct{})
go func() {
gl.Consume(ctx, config.TargetTableInfo{}, nil, retryConfig, batchSize,
chIn, chErr, wg, rowsLoaded, failedCount)
close(done)
}()
return done
}
func waitWg(wg *sync.WaitGroup) <-chan struct{} {
done := make(chan struct{})
go func() { wg.Wait(); close(done) }()
return done
}
func dbError() error { return errors.New("connection reset by peer") }
func TestLoaderAccumulator_Add(t *testing.T) {
acc := &loaderAccumulator{batchSize: 5}
b1 := makeBatch(2)
b2 := makeBatch(3)
acc.add(b1)
acc.add(b2)
if len(acc.rows) != 5 {
t.Errorf("expected 5 rows, got %d", len(acc.rows))
}
if len(acc.parents) != 2 {
t.Fatalf("expected 2 parents, got %d", len(acc.parents))
}
if acc.parents[0].Id != b1.Id || acc.parents[1].Id != b2.Id {
t.Error("parent IDs do not match source batch IDs in order")
}
if acc.pendingDone != 2 {
t.Errorf("expected pendingDone=2, got %d", acc.pendingDone)
}
}
func TestLoaderAccumulator_Ready(t *testing.T) {
acc := &loaderAccumulator{batchSize: 3}
acc.add(makeBatch(2))
if acc.ready() {
t.Error("should not be ready with 2 rows and batchSize=3")
}
acc.add(makeBatch(1))
if !acc.ready() {
t.Error("should be ready with 3 rows and batchSize=3")
}
}
func TestLoaderAccumulator_DrainPending_ReleasesWg(t *testing.T) {
acc := &loaderAccumulator{batchSize: 5, pendingDone: 3}
var wg sync.WaitGroup
wg.Add(3)
acc.drainPending(&wg)
select {
case <-waitWg(&wg):
case <-time.After(testTimeout):
t.Fatal("wg.Wait() timed out: drainPending did not call Done() enough times")
}
}
func TestLoaderAccumulator_DrainPending_ZeroPending(t *testing.T) {
acc := &loaderAccumulator{batchSize: 5, pendingDone: 0}
var wg sync.WaitGroup
acc.drainPending(&wg)
select {
case <-waitWg(&wg):
case <-time.After(testTimeout):
t.Fatal("wg.Wait() timed out")
}
}
func TestSendLoadError_PlainError_WrappedAsNonFatal(t *testing.T) {
ch := make(chan custom_errors.JobError, 2)
var failedCount int32
result := sendLoadError(context.Background(), errors.New("db error"), rc(10), &failedCount, ch)
if !result {
t.Error("expected true (below threshold)")
}
if atomic.LoadInt32(&failedCount) != 1 {
t.Errorf("expected failedCount=1, got %d", failedCount)
}
select {
case e := <-ch:
if e.ShouldCancelJob {
t.Error("plain error should be wrapped as ShouldCancelJob=false")
}
default:
t.Error("expected an error in the channel")
}
}
func TestSendLoadError_JobError_PassesThrough(t *testing.T) {
ch := make(chan custom_errors.JobError, 2)
var failedCount int32
original := &custom_errors.JobError{ShouldCancelJob: false, Msg: "custom msg"}
sendLoadError(context.Background(), original, rc(10), &failedCount, ch)
select {
case e := <-ch:
if e.Msg != "custom msg" || e.ShouldCancelJob {
t.Errorf("JobError should pass through unchanged, got %+v", e)
}
default:
t.Error("expected an error in the channel")
}
}
func TestSendLoadError_FatalJobError_BelowThreshold_ReturnsTrue(t *testing.T) {
ch := make(chan custom_errors.JobError, 2)
var failedCount int32
fatal := &custom_errors.JobError{ShouldCancelJob: true, Msg: "unique constraint"}
result := sendLoadError(context.Background(), fatal, rc(10), &failedCount, ch)
if !result {
t.Error("below-threshold fatal error should return true (external cancel expected from JobErrorHandler)")
}
select {
case e := <-ch:
if !e.ShouldCancelJob {
t.Error("fatal JobError should be forwarded with ShouldCancelJob=true")
}
default:
t.Error("expected the fatal error in the channel")
}
}
func TestSendLoadError_ThresholdExceeded_ReturnsFalse(t *testing.T) {
ch := make(chan custom_errors.JobError, 2)
var failedCount int32
result := sendLoadError(context.Background(), errors.New("db error"), rc(0), &failedCount, ch)
if result {
t.Error("expected false when threshold exceeded")
}
if len(ch) != 2 {
t.Fatalf("expected 2 errors (batch error + fatal threshold error), got %d", len(ch))
}
<-ch // batch error
threshold := <-ch
if !threshold.ShouldCancelJob {
t.Error("second error should be the fatal threshold error (ShouldCancelJob=true)")
}
}
func TestSendLoadError_AtThresholdBoundary(t *testing.T) {
ch := make(chan custom_errors.JobError, 6)
var failedCount int32
if !sendLoadError(context.Background(), errors.New("err"), rc(2), &failedCount, ch) {
t.Error("first failure: expected true (below threshold)")
}
if !sendLoadError(context.Background(), errors.New("err"), rc(2), &failedCount, ch) {
t.Error("second failure: expected true (at threshold, not exceeded)")
}
if sendLoadError(context.Background(), errors.New("err"), rc(2), &failedCount, ch) {
t.Error("third failure: expected false (threshold exceeded)")
}
}
func TestSendLoadError_ContextCancelled_ReturnsFalse(t *testing.T) {
ch := make(chan custom_errors.JobError)
var failedCount int32
ctx, cancel := context.WithCancel(context.Background())
cancel()
result := sendLoadError(ctx, errors.New("db error"), rc(10), &failedCount, ch)
if result {
t.Error("expected false when context is cancelled")
}
if len(ch) != 0 {
t.Error("no error should be sent when context is cancelled")
}
}
func TestConsume_Passthrough_RowsLoaded(t *testing.T) {
db := newMockDb()
gl := newLoader(db)
chIn := make(chan models.Batch, 1)
chErr := make(chan custom_errors.JobError, 1)
var wg sync.WaitGroup
var rowsLoaded int64
var failedCount int32
sendBatch(chIn, makeBatch(5), &wg)
close(chIn)
<-runConsume(context.Background(), gl, rc(0), 0, chIn, chErr, &wg, &rowsLoaded, &failedCount)
wg.Wait()
if rowsLoaded != 5 {
t.Errorf("expected rowsLoaded=5, got %d", rowsLoaded)
}
if db.callCount != 1 {
t.Errorf("expected 1 SaveMassive call, got %d", db.callCount)
}
}
func TestConsume_Passthrough_MultipleBatches_RowsAccumulate(t *testing.T) {
db := newMockDb()
gl := newLoader(db)
chIn := make(chan models.Batch, 3)
chErr := make(chan custom_errors.JobError, 1)
var wg sync.WaitGroup
var rowsLoaded int64
var failedCount int32
sendBatch(chIn, makeBatch(3), &wg)
sendBatch(chIn, makeBatch(2), &wg)
sendBatch(chIn, makeBatch(4), &wg)
close(chIn)
<-runConsume(context.Background(), gl, rc(10), 0, chIn, chErr, &wg, &rowsLoaded, &failedCount)
wg.Wait()
if rowsLoaded != 9 {
t.Errorf("expected rowsLoaded=9, got %d", rowsLoaded)
}
}
func TestConsume_Passthrough_WgDoneBeforeErrorHandling(t *testing.T) {
db := newMockDb(mockResult{err: dbError()})
gl := newLoader(db)
chIn := make(chan models.Batch, 1)
chErr := make(chan custom_errors.JobError, 2)
var wg sync.WaitGroup
var rowsLoaded int64
var failedCount int32
sendBatch(chIn, makeBatch(2), &wg)
close(chIn)
<-runConsume(context.Background(), gl, rc(10), 0, chIn, chErr, &wg, &rowsLoaded, &failedCount)
select {
case <-waitWg(&wg):
case <-time.After(testTimeout):
t.Fatal("wg.Wait() timed out: Done() was not called even though processing failed")
}
}
func TestConsume_Passthrough_NonFatalError_Continues(t *testing.T) {
db := newMockDb(mockResult{err: dbError()})
gl := newLoader(db)
chIn := make(chan models.Batch, 2)
chErr := make(chan custom_errors.JobError, 3)
var wg sync.WaitGroup
var rowsLoaded int64
var failedCount int32
sendBatch(chIn, makeBatch(2), &wg)
sendBatch(chIn, makeBatch(3), &wg)
close(chIn)
<-runConsume(context.Background(), gl, rc(10), 0, chIn, chErr, &wg, &rowsLoaded, &failedCount)
wg.Wait()
if rowsLoaded != 3 {
t.Errorf("expected rowsLoaded=3 (only second batch succeeded), got %d", rowsLoaded)
}
if atomic.LoadInt32(&failedCount) != 1 {
t.Errorf("expected failedCount=1, got %d", failedCount)
}
if len(chErr) == 0 {
t.Error("expected at least one error in chErr for the failed batch")
}
}
func TestConsume_Passthrough_ThresholdExceeded_Exits(t *testing.T) {
db := newMockDb(mockResult{err: dbError()})
gl := newLoader(db)
chIn := make(chan models.Batch, 1)
chErr := make(chan custom_errors.JobError, 3)
var wg sync.WaitGroup
var rowsLoaded int64
var failedCount int32
sendBatch(chIn, makeBatch(1), &wg)
done := runConsume(context.Background(), gl, rc(0), 0, chIn, chErr, &wg, &rowsLoaded, &failedCount)
select {
case <-done:
case <-time.After(testTimeout):
t.Fatal("Consume did not exit after threshold exceeded")
}
select {
case <-waitWg(&wg):
case <-time.After(testTimeout):
t.Fatal("wg.Wait() timed out after threshold exit")
}
}
func TestConsume_Accumulation_FlushOnThreshold(t *testing.T) {
db := newMockDb()
gl := newLoader(db)
chIn := make(chan models.Batch, 3)
chErr := make(chan custom_errors.JobError, 1)
var wg sync.WaitGroup
var rowsLoaded int64
var failedCount int32
sendBatch(chIn, makeBatch(1), &wg)
sendBatch(chIn, makeBatch(1), &wg)
sendBatch(chIn, makeBatch(1), &wg)
close(chIn)
<-runConsume(context.Background(), gl, rc(0), 3, chIn, chErr, &wg, &rowsLoaded, &failedCount)
wg.Wait()
if rowsLoaded != 3 {
t.Errorf("expected rowsLoaded=3, got %d", rowsLoaded)
}
if db.callCount != 1 {
t.Errorf("expected 1 SaveMassive call, got %d", db.callCount)
}
}
func TestConsume_Accumulation_FlushOnClose(t *testing.T) {
db := newMockDb()
gl := newLoader(db)
chIn := make(chan models.Batch, 2)
chErr := make(chan custom_errors.JobError, 1)
var wg sync.WaitGroup
var rowsLoaded int64
var failedCount int32
sendBatch(chIn, makeBatch(2), &wg)
sendBatch(chIn, makeBatch(3), &wg)
close(chIn)
<-runConsume(context.Background(), gl, rc(0), 10, chIn, chErr, &wg, &rowsLoaded, &failedCount)
wg.Wait()
if rowsLoaded != 5 {
t.Errorf("expected rowsLoaded=5, got %d", rowsLoaded)
}
if db.callCount != 1 {
t.Errorf("expected exactly 1 SaveMassive call (single flush on close), got %d", db.callCount)
}
}
func TestConsume_Accumulation_RowsLoadedCorrect(t *testing.T) {
db := newMockDb()
gl := newLoader(db)
chIn := make(chan models.Batch, 5)
chErr := make(chan custom_errors.JobError, 1)
var wg sync.WaitGroup
var rowsLoaded int64
var failedCount int32
for range 5 {
sendBatch(chIn, makeBatch(2), &wg)
}
close(chIn)
<-runConsume(context.Background(), gl, rc(0), 4, chIn, chErr, &wg, &rowsLoaded, &failedCount)
wg.Wait()
if rowsLoaded != 10 {
t.Errorf("expected rowsLoaded=10, got %d", rowsLoaded)
}
if db.callCount != 3 {
t.Errorf("expected 3 SaveMassive calls (2 threshold flushes + 1 on close), got %d", db.callCount)
}
}
func TestConsume_Accumulation_WgBalanced_OnContextCancel(t *testing.T) {
db := newMockDb()
gl := newLoader(db)
chIn := make(chan models.Batch)
chErr := make(chan custom_errors.JobError, 1)
var wg sync.WaitGroup
var rowsLoaded int64
var failedCount int32
ctx, cancel := context.WithCancel(context.Background())
done := runConsume(ctx, gl, rc(0), 10, chIn, chErr, &wg, &rowsLoaded, &failedCount)
sendBatch(chIn, makeBatch(1), &wg)
sendBatch(chIn, makeBatch(1), &wg)
cancel()
select {
case <-done:
case <-time.After(testTimeout):
t.Fatal("Consume did not exit after context cancellation")
}
select {
case <-waitWg(&wg):
case <-time.After(testTimeout):
t.Fatal("wg.Wait() timed out: drainPending did not release accumulated batches on cancel")
}
}
func TestConsume_Accumulation_ErrorInFlush_WgStillBalanced(t *testing.T) {
db := newMockDb(mockResult{err: dbError()})
gl := newLoader(db)
chIn := make(chan models.Batch, 2)
chErr := make(chan custom_errors.JobError, 3)
var wg sync.WaitGroup
var rowsLoaded int64
var failedCount int32
sendBatch(chIn, makeBatch(1), &wg)
sendBatch(chIn, makeBatch(1), &wg)
close(chIn)
<-runConsume(context.Background(), gl, rc(10), 2, chIn, chErr, &wg, &rowsLoaded, &failedCount)
select {
case <-waitWg(&wg):
case <-time.After(testTimeout):
t.Fatal("wg.Wait() timed out: wg.Done() not called after flush error")
}
}
func TestConsume_Accumulation_MultipleFlushes_NonFatalErrors(t *testing.T) {
db := newMockDb(mockResult{err: dbError()}, mockResult{err: dbError()})
gl := newLoader(db)
chIn := make(chan models.Batch, 4)
chErr := make(chan custom_errors.JobError, 6)
var wg sync.WaitGroup
var rowsLoaded int64
var failedCount int32
for range 4 {
sendBatch(chIn, makeBatch(1), &wg)
}
close(chIn)
<-runConsume(context.Background(), gl, rc(10), 2, chIn, chErr, &wg, &rowsLoaded, &failedCount)
select {
case <-waitWg(&wg):
case <-time.After(testTimeout):
t.Fatal("wg.Wait() timed out")
}
if atomic.LoadInt32(&failedCount) != 2 {
t.Errorf("expected failedCount=2, got %d", failedCount)
}
if rowsLoaded != 0 {
t.Errorf("expected rowsLoaded=0 (all batches failed), got %d", rowsLoaded)
}
}
func TestConsume_EmptyInput_NoProcessing(t *testing.T) {
db := newMockDb()
gl := newLoader(db)
chIn := make(chan models.Batch)
chErr := make(chan custom_errors.JobError, 1)
var wg sync.WaitGroup
var rowsLoaded int64
var failedCount int32
close(chIn)
done := runConsume(context.Background(), gl, rc(0), 5, chIn, chErr, &wg, &rowsLoaded, &failedCount)
select {
case <-done:
case <-time.After(testTimeout):
t.Fatal("Consume did not exit after empty input channel was closed")
}
if db.callCount != 0 {
t.Errorf("expected no SaveMassive calls, got %d", db.callCount)
}
if rowsLoaded != 0 {
t.Errorf("expected rowsLoaded=0, got %d", rowsLoaded)
}
wg.Wait()
}
func TestConsume_ContextCancellation_Exits(t *testing.T) {
db := newMockDb()
gl := newLoader(db)
chIn := make(chan models.Batch)
chErr := make(chan custom_errors.JobError, 1)
var wg sync.WaitGroup
var rowsLoaded int64
var failedCount int32
ctx, cancel := context.WithCancel(context.Background())
done := runConsume(ctx, gl, rc(0), 0, chIn, chErr, &wg, &rowsLoaded, &failedCount)
cancel()
select {
case <-done:
case <-time.After(testTimeout):
t.Fatal("Consume did not exit after context cancellation")
}
wg.Wait()
}

View File

@@ -29,8 +29,7 @@ func (gl *GenericLoader) ProcessBatchWithRetries(
if batch.RetryCounter >= retryConfig.Attempts {
return rowsLoaded, &custom_errors.JobError{
ShouldCancelJob: false,
Msg: fmt.Sprintf("Temporal error in batch %v (retries: %d)", btError.Batch.Id, btError.Batch.RetryCounter),
Msg: fmt.Sprintf("Batch %v reached max retries (%d)", batch.Id, batch.RetryCounter),
Prev: btError,
}
}

View File

@@ -2,6 +2,7 @@ package table_analyzers
import (
"context"
"math"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/config"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/etl"
@@ -15,23 +16,10 @@ func PartitionRangeGenerator(
tableAnalyzer etl.TableAnalyzer,
tableInfo config.TableInfo,
partitionColumn string,
partitionCalculationStrategy string,
rowsPerPartition int64,
jobRange config.RangeConfig,
) ([]models.Partition, error) {
if jobRange.Min > 0 {
return []models.Partition{{
Id: uuid.New(),
HasRange: true,
RetryCounter: 0,
Range: models.PartitionRange{
Min: jobRange.Min,
Max: jobRange.Max,
IsMinInclusive: jobRange.IsMinInclusive,
IsMaxInclusive: jobRange.IsMaxInclusive,
},
}}, nil
}
rowsCount, err := tableAnalyzer.EstimateTotalRows(ctx, tableInfo)
logrus.Infof("Estimated rows in source: %v (%s.%s)", rowsCount, tableInfo.Schema, tableInfo.Table)
if err != nil {
@@ -39,21 +27,107 @@ func PartitionRangeGenerator(
}
if rowsCount <= rowsPerPartition {
return []models.Partition{{
Id: uuid.New(),
HasRange: false,
RetryCounter: 0,
}}, nil
hasRange := jobRange.Min != nil || jobRange.Max != nil
partition := models.Partition{Id: uuid.New(), HasRange: hasRange, RetryCounter: 0}
if hasRange {
var min, max int64
if jobRange.Min != nil {
min = *jobRange.Min
}
if jobRange.Max != nil {
max = *jobRange.Max
}
partition.Range = models.PartitionRange{
Min: min,
Max: max,
IsMinInclusive: jobRange.IsMinInclusive,
IsMaxInclusive: jobRange.IsMaxInclusive,
}
}
return []models.Partition{partition}, nil
}
partitionsCount := rowsCount / rowsPerPartition
partitions, err := tableAnalyzer.CalculatePartitionRanges(ctx, tableInfo, partitionColumn, partitionsCount)
if partitionCalculationStrategy == "ESTIMATION" {
return calculatePartitionsEstimation(ctx, tableAnalyzer, tableInfo, partitionColumn, partitionsCount, jobRange)
}
partitions, err := tableAnalyzer.CalculatePartitionRanges(ctx, tableInfo, partitionColumn, partitionsCount, jobRange)
if err != nil {
return nil, err
}
// logrus.Debugf("Partitions: %+v (%s.%s)", partitions, tableInfo.Schema, tableInfo.Table)
logrus.Debugf("Partitions count: %v (%s.%s)", len(partitions), tableInfo.Schema, tableInfo.Table)
return partitions, nil
}
func calculatePartitionsEstimation(
ctx context.Context,
tableAnalyzer etl.TableAnalyzer,
tableInfo config.TableInfo,
partitionColumn string,
partitionsCount int64,
rangeConstraint config.RangeConfig,
) ([]models.Partition, error) {
var minValue, maxValue int64
if rangeConstraint.Min != nil && rangeConstraint.Max != nil {
minValue = *rangeConstraint.Min
maxValue = *rangeConstraint.Max
logrus.Infof("Column range for %s.%s.%s: [%d, %d] (user-defined)", tableInfo.Schema, tableInfo.Table, partitionColumn, minValue, maxValue)
} else if rangeConstraint.Min != nil || rangeConstraint.Max != nil {
result, err := tableAnalyzer.QueryMaxMinFromColumn(ctx, tableInfo, partitionColumn)
if err != nil {
return nil, err
}
if rangeConstraint.Min != nil {
minValue = *rangeConstraint.Min
maxValue = result.Max
logrus.Infof("Column range for %s.%s.%s: [%d, %d] (min user-defined)", tableInfo.Schema, tableInfo.Table, partitionColumn, minValue, maxValue)
} else {
minValue = result.Min
maxValue = *rangeConstraint.Max
logrus.Infof("Column range for %s.%s.%s: [%d, %d] (max user-defined)", tableInfo.Schema, tableInfo.Table, partitionColumn, minValue, maxValue)
}
} else {
result, err := tableAnalyzer.QueryMaxMinFromColumn(ctx, tableInfo, partitionColumn)
if err != nil {
return nil, err
}
logrus.Infof("Column range for %s.%s.%s: [%d, %d]", tableInfo.Schema, tableInfo.Table, partitionColumn, result.Min, result.Max)
minValue = result.Min
maxValue = result.Max
}
rangeSize := maxValue - minValue
stepSize := int64(math.Ceil(float64(rangeSize) / float64(partitionsCount)))
partitions := make([]models.Partition, 0, partitionsCount)
for i := range partitionsCount {
partitionMin := minValue + (i * stepSize)
partitionMax := minValue + ((i + 1) * stepSize)
if i == partitionsCount-1 {
partitionMax = maxValue
}
isMinInclusive := i == 0
partition := models.Partition{
Id: uuid.New(),
HasRange: true,
RetryCounter: 0,
Range: models.PartitionRange{
Min: partitionMin,
Max: partitionMax,
IsMinInclusive: isMinInclusive,
IsMaxInclusive: true,
},
}
partitions = append(partitions, partition)
}
return partitions, nil
}

View File

@@ -0,0 +1,332 @@
package table_analyzers
import (
"context"
"testing"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/config"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/etl"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/models"
)
type MockTableAnalyzer struct {
minValue int64
maxValue int64
totalRows int64
capturedRangeConstraint config.RangeConfig
}
func (m *MockTableAnalyzer) QueryColumnTypes(_ context.Context, _ config.TableInfo) ([]models.ColumnType, error) {
return nil, nil
}
func (m *MockTableAnalyzer) EstimateTotalRows(_ context.Context, _ config.TableInfo) (int64, error) {
return m.totalRows, nil
}
func (m *MockTableAnalyzer) QueryMaxMinFromColumn(_ context.Context, _ config.TableInfo, _ string) (etl.MaxMinColumnResult, error) {
return etl.MaxMinColumnResult{Min: m.minValue, Max: m.maxValue}, nil
}
func (m *MockTableAnalyzer) CalculatePartitionRanges(_ context.Context, _ config.TableInfo, _ string, _ int64, rangeConstraint config.RangeConfig) ([]models.Partition, error) {
m.capturedRangeConstraint = rangeConstraint
return []models.Partition{}, nil
}
//go:fix inline
func ptr64(v int64) *int64 { return new(v) }
var testTableInfo = config.TableInfo{Schema: "dbo", Table: "test"}
func TestCalculatePartitionsEstimation_NoOverlap(t *testing.T) {
ctx := context.Background()
mock := &MockTableAnalyzer{minValue: 0, maxValue: 100}
partitions, err := calculatePartitionsEstimation(ctx, mock, testTableInfo, "id", 4, config.RangeConfig{})
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if len(partitions) != 4 {
t.Errorf("expected 4 partitions, got %d", len(partitions))
}
for i := 0; i < len(partitions)-1; i++ {
current := partitions[i].Range
next := partitions[i+1].Range
if current.Max == next.Min && current.IsMaxInclusive && next.IsMinInclusive {
t.Errorf("partition %d and %d overlap at value %d (both inclusive)", i, i+1, current.Max)
}
}
}
func TestCalculatePartitionsEstimation_CoverageComplete(t *testing.T) {
ctx := context.Background()
mock := &MockTableAnalyzer{minValue: 1000, maxValue: 2000}
partitions, err := calculatePartitionsEstimation(ctx, mock, testTableInfo, "id", 5, config.RangeConfig{})
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if partitions[0].Range.Min != 1000 || !partitions[0].Range.IsMinInclusive {
t.Errorf("first partition should start at 1000 (inclusive), got %d (inclusive=%v)",
partitions[0].Range.Min, partitions[0].Range.IsMinInclusive)
}
if partitions[len(partitions)-1].Range.Max != 2000 {
t.Errorf("last partition should end at 2000, got %d", partitions[len(partitions)-1].Range.Max)
}
}
func TestCalculatePartitionsEstimation_FirstPartitionInclusive(t *testing.T) {
ctx := context.Background()
mock := &MockTableAnalyzer{minValue: 50, maxValue: 70}
partitions, err := calculatePartitionsEstimation(ctx, mock, testTableInfo, "id", 3, config.RangeConfig{})
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if !partitions[0].Range.IsMinInclusive {
t.Errorf("first partition should have IsMinInclusive=true")
}
if partitions[0].Range.Min != 50 {
t.Errorf("first partition should start at 50, got %d", partitions[0].Range.Min)
}
for i := 1; i < len(partitions); i++ {
if partitions[i].Range.IsMinInclusive {
t.Errorf("partition %d should have IsMinInclusive=false to avoid overlap", i)
}
}
}
func TestPartitionRangeGenerator_Exact_NoRange_PassesEmptyConstraint(t *testing.T) {
ctx := context.Background()
mock := &MockTableAnalyzer{totalRows: 1000}
_, err := PartitionRangeGenerator(ctx, mock, testTableInfo, "id", "EXACT", 100, config.RangeConfig{})
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if mock.capturedRangeConstraint.Min != nil || mock.capturedRangeConstraint.Max != nil {
t.Errorf("expected empty range constraint, got min=%v max=%v",
mock.capturedRangeConstraint.Min, mock.capturedRangeConstraint.Max)
}
}
func TestPartitionRangeGenerator_Exact_BothBounds_PassesBothToAnalyzer(t *testing.T) {
ctx := context.Background()
mock := &MockTableAnalyzer{totalRows: 1000}
jobRange := config.RangeConfig{Min: ptr64(200), Max: ptr64(800), IsMinInclusive: true, IsMaxInclusive: true}
_, err := PartitionRangeGenerator(ctx, mock, testTableInfo, "id", "EXACT", 100, jobRange)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
rc := mock.capturedRangeConstraint
if rc.Min == nil || *rc.Min != 200 {
t.Errorf("expected Min=200, got %v", rc.Min)
}
if rc.Max == nil || *rc.Max != 800 {
t.Errorf("expected Max=800, got %v", rc.Max)
}
if !rc.IsMinInclusive || !rc.IsMaxInclusive {
t.Errorf("expected both bounds inclusive, got minInc=%v maxInc=%v", rc.IsMinInclusive, rc.IsMaxInclusive)
}
}
func TestPartitionRangeGenerator_Exact_MinOnly_PassesMinNilMax(t *testing.T) {
ctx := context.Background()
mock := &MockTableAnalyzer{totalRows: 1000}
jobRange := config.RangeConfig{Min: ptr64(500)}
_, err := PartitionRangeGenerator(ctx, mock, testTableInfo, "id", "EXACT", 100, jobRange)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
rc := mock.capturedRangeConstraint
if rc.Min == nil || *rc.Min != 500 {
t.Errorf("expected Min=500, got %v", rc.Min)
}
if rc.Max != nil {
t.Errorf("expected Max=nil (no upper bound), got %v", rc.Max)
}
}
func TestPartitionRangeGenerator_Exact_MaxOnly_PassesMaxNilMin(t *testing.T) {
ctx := context.Background()
mock := &MockTableAnalyzer{totalRows: 1000}
jobRange := config.RangeConfig{Max: ptr64(300)}
_, err := PartitionRangeGenerator(ctx, mock, testTableInfo, "id", "EXACT", 100, jobRange)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
rc := mock.capturedRangeConstraint
if rc.Min != nil {
t.Errorf("expected Min=nil (no lower bound), got %v", rc.Min)
}
if rc.Max == nil || *rc.Max != 300 {
t.Errorf("expected Max=300, got %v", rc.Max)
}
}
func TestPartitionRangeGenerator_Estimation_BothBounds_UsesUserRange(t *testing.T) {
ctx := context.Background()
// DB min/max differ intentionally — user bounds should take precedence.
mock := &MockTableAnalyzer{totalRows: 1000, minValue: 0, maxValue: 999}
jobRange := config.RangeConfig{Min: ptr64(200), Max: ptr64(700), IsMinInclusive: true}
partitions, err := PartitionRangeGenerator(ctx, mock, testTableInfo, "id", "ESTIMATION", 100, jobRange)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if len(partitions) == 0 {
t.Fatal("expected at least one partition")
}
if partitions[0].Range.Min != 200 {
t.Errorf("first partition should start at user min=200, got %d", partitions[0].Range.Min)
}
if partitions[len(partitions)-1].Range.Max != 700 {
t.Errorf("last partition should end at user max=700, got %d", partitions[len(partitions)-1].Range.Max)
}
}
func TestPartitionRangeGenerator_Estimation_MinOnly_QueriesDBForMax(t *testing.T) {
ctx := context.Background()
mock := &MockTableAnalyzer{totalRows: 1000, minValue: 0, maxValue: 999}
jobRange := config.RangeConfig{Min: ptr64(500), IsMinInclusive: true}
partitions, err := PartitionRangeGenerator(ctx, mock, testTableInfo, "id", "ESTIMATION", 100, jobRange)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if len(partitions) == 0 {
t.Fatal("expected at least one partition")
}
if partitions[0].Range.Min != 500 {
t.Errorf("first partition should start at user min=500, got %d", partitions[0].Range.Min)
}
if partitions[len(partitions)-1].Range.Max != 999 {
t.Errorf("last partition should end at DB max=999, got %d", partitions[len(partitions)-1].Range.Max)
}
}
func TestPartitionRangeGenerator_Estimation_MaxOnly_QueriesDBForMin(t *testing.T) {
ctx := context.Background()
mock := &MockTableAnalyzer{totalRows: 1000, minValue: 100, maxValue: 999}
jobRange := config.RangeConfig{Max: ptr64(600), IsMaxInclusive: true}
partitions, err := PartitionRangeGenerator(ctx, mock, testTableInfo, "id", "ESTIMATION", 100, jobRange)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if len(partitions) == 0 {
t.Fatal("expected at least one partition")
}
if partitions[0].Range.Min != 100 {
t.Errorf("first partition should start at DB min=100, got %d", partitions[0].Range.Min)
}
if partitions[len(partitions)-1].Range.Max != 600 {
t.Errorf("last partition should end at user max=600, got %d", partitions[len(partitions)-1].Range.Max)
}
}
func TestPartitionRangeGenerator_SinglePartition_NoRange(t *testing.T) {
ctx := context.Background()
mock := &MockTableAnalyzer{totalRows: 50}
partitions, err := PartitionRangeGenerator(ctx, mock, testTableInfo, "id", "EXACT", 100, config.RangeConfig{})
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if len(partitions) != 1 {
t.Fatalf("expected 1 partition, got %d", len(partitions))
}
if partitions[0].HasRange {
t.Error("single partition with no range should have HasRange=false")
}
}
func TestPartitionRangeGenerator_SinglePartition_BothBounds(t *testing.T) {
ctx := context.Background()
mock := &MockTableAnalyzer{totalRows: 50}
jobRange := config.RangeConfig{Min: ptr64(100), Max: ptr64(200), IsMinInclusive: true, IsMaxInclusive: true}
partitions, err := PartitionRangeGenerator(ctx, mock, testTableInfo, "id", "EXACT", 100, jobRange)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if len(partitions) != 1 {
t.Fatalf("expected 1 partition, got %d", len(partitions))
}
p := partitions[0]
if !p.HasRange {
t.Error("expected HasRange=true")
}
if p.Range.Min != 100 || p.Range.Max != 200 {
t.Errorf("expected [100, 200], got [%d, %d]", p.Range.Min, p.Range.Max)
}
if !p.Range.IsMinInclusive || !p.Range.IsMaxInclusive {
t.Errorf("expected both inclusive, got minInc=%v maxInc=%v", p.Range.IsMinInclusive, p.Range.IsMaxInclusive)
}
}
func TestPartitionRangeGenerator_SinglePartition_MinOnly(t *testing.T) {
ctx := context.Background()
mock := &MockTableAnalyzer{totalRows: 50}
jobRange := config.RangeConfig{Min: ptr64(100), IsMinInclusive: true}
partitions, err := PartitionRangeGenerator(ctx, mock, testTableInfo, "id", "EXACT", 100, jobRange)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
p := partitions[0]
if !p.HasRange {
t.Error("expected HasRange=true")
}
if p.Range.Min != 100 {
t.Errorf("expected Min=100, got %d", p.Range.Min)
}
if p.Range.Max != 0 {
t.Errorf("expected Max=0 (no upper bound), got %d", p.Range.Max)
}
}
func TestPartitionRangeGenerator_SinglePartition_MaxOnly(t *testing.T) {
ctx := context.Background()
mock := &MockTableAnalyzer{totalRows: 50}
jobRange := config.RangeConfig{Max: ptr64(200), IsMaxInclusive: true}
partitions, err := PartitionRangeGenerator(ctx, mock, testTableInfo, "id", "EXACT", 100, jobRange)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
p := partitions[0]
if !p.HasRange {
t.Error("expected HasRange=true")
}
if p.Range.Min != 0 {
t.Errorf("expected Min=0 (no lower bound), got %d", p.Range.Min)
}
if p.Range.Max != 200 {
t.Errorf("expected Max=200, got %d", p.Range.Max)
}
}

View File

@@ -196,17 +196,65 @@ GROUP BY t.name`
return rowsCount, nil
}
func (ta *MssqlTableAnalyzer) QueryMaxMinFromColumn(
ctx context.Context,
tableInfo config.TableInfo,
columnName string,
) (etl.MaxMinColumnResult, error) {
query := fmt.Sprintf(`
SELECT
MIN([%s]) AS min_value,
MAX([%s]) AS max_value
FROM [%s].[%s]`, columnName, columnName, tableInfo.Schema, tableInfo.Table)
ctxTimeout, cancel := context.WithTimeout(ctx, 1*time.Minute)
defer cancel()
result := etl.MaxMinColumnResult{}
err := ta.db.QueryRow(ctxTimeout, query).Scan(&result.Min, &result.Max)
if err != nil {
return etl.MaxMinColumnResult{}, err
}
return result, nil
}
func (ta *MssqlTableAnalyzer) CalculatePartitionRanges(
ctx context.Context,
tableInfo config.TableInfo,
partitionColumn string,
maxPartitions int64,
rangeConstraint config.RangeConfig,
) ([]models.Partition, error) {
whereClause := ""
args := []any{sql.Named("maxPartitions", maxPartitions)}
if rangeConstraint.Min != nil || rangeConstraint.Max != nil {
var conditions []string
if rangeConstraint.Min != nil {
minOp := ">"
if rangeConstraint.IsMinInclusive {
minOp = ">="
}
conditions = append(conditions, fmt.Sprintf("[%s] %s @rangeMin", partitionColumn, minOp))
args = append(args, sql.Named("rangeMin", *rangeConstraint.Min))
}
if rangeConstraint.Max != nil {
maxOp := "<"
if rangeConstraint.IsMaxInclusive {
maxOp = "<="
}
conditions = append(conditions, fmt.Sprintf("[%s] %s @rangeMax", partitionColumn, maxOp))
args = append(args, sql.Named("rangeMax", *rangeConstraint.Max))
}
whereClause = "WHERE " + strings.Join(conditions, " AND ")
}
query := fmt.Sprintf(`
SELECT
MIN([%s]) AS lower_limit,
MAX([%s]) AS upper_limit
FROM (SELECT [%s], NTILE(@maxPartitions) OVER (ORDER BY [%s]) AS batch_id FROM [%s].[%s]) AS T
FROM (SELECT [%s], NTILE(@maxPartitions) OVER (ORDER BY [%s]) AS batch_id FROM [%s].[%s] %s) AS T
GROUP BY batch_id
ORDER BY batch_id`,
partitionColumn,
@@ -214,12 +262,13 @@ ORDER BY batch_id`,
partitionColumn,
partitionColumn,
tableInfo.Schema,
tableInfo.Table)
tableInfo.Table,
whereClause)
ctxTimeout, cancel := context.WithTimeout(ctx, 1*time.Minute)
defer cancel()
rows, err := ta.db.Query(ctxTimeout, query, sql.Named("maxPartitions", maxPartitions))
rows, err := ta.db.Query(ctxTimeout, query, args...)
if err != nil {
return nil, err
}

View File

@@ -2,6 +2,7 @@ package table_analyzers
import (
"context"
"fmt"
"strings"
"time"
@@ -9,6 +10,7 @@ import (
dbwrapper "git.ksdemosapps.com/kylesoda/go-migrate/internal/app/db-wrapper"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/etl"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/models"
"github.com/google/uuid"
)
type PostgresTableAnalyzer struct {
@@ -161,7 +163,50 @@ func (ta *PostgresTableAnalyzer) EstimateTotalRows(
ctx context.Context,
tableInfo config.TableInfo,
) (int64, error) {
return 0, nil
query := `
SELECT reltuples::bigint
FROM pg_class
JOIN pg_namespace ON pg_namespace.oid = pg_class.relnamespace
WHERE pg_namespace.nspname = $1 AND pg_class.relname = $2`
ctxTimeout, cancel := context.WithTimeout(ctx, 1*time.Minute)
defer cancel()
var estimate int64
err := ta.db.QueryRow(ctxTimeout, query, tableInfo.Schema, tableInfo.Table).Scan(&estimate)
if err != nil {
return 0, err
}
if estimate < 0 {
countQuery := fmt.Sprintf(`SELECT COUNT(*) FROM "%s"."%s"`, tableInfo.Schema, tableInfo.Table)
err = ta.db.QueryRow(ctxTimeout, countQuery).Scan(&estimate)
if err != nil {
return 0, err
}
}
return estimate, nil
}
func (ta *PostgresTableAnalyzer) QueryMaxMinFromColumn(
ctx context.Context,
tableInfo config.TableInfo,
columnName string,
) (etl.MaxMinColumnResult, error) {
query := fmt.Sprintf(`SELECT MIN("%s"), MAX("%s") FROM "%s"."%s"`,
columnName, columnName, tableInfo.Schema, tableInfo.Table)
ctxTimeout, cancel := context.WithTimeout(ctx, 1*time.Minute)
defer cancel()
result := etl.MaxMinColumnResult{}
err := ta.db.QueryRow(ctxTimeout, query).Scan(&result.Min, &result.Max)
if err != nil {
return etl.MaxMinColumnResult{}, err
}
return result, nil
}
func (ta *PostgresTableAnalyzer) CalculatePartitionRanges(
@@ -169,6 +214,80 @@ func (ta *PostgresTableAnalyzer) CalculatePartitionRanges(
tableInfo config.TableInfo,
partitionColumn string,
maxPartitions int64,
rangeConstraint config.RangeConfig,
) ([]models.Partition, error) {
return []models.Partition{}, nil
whereClause := ""
args := []any{maxPartitions}
if rangeConstraint.Min != nil || rangeConstraint.Max != nil {
var conditions []string
if rangeConstraint.Min != nil {
minOp := ">"
if rangeConstraint.IsMinInclusive {
minOp = ">="
}
args = append(args, *rangeConstraint.Min)
conditions = append(conditions, fmt.Sprintf(`"%s" %s $%d`, partitionColumn, minOp, len(args)))
}
if rangeConstraint.Max != nil {
maxOp := "<"
if rangeConstraint.IsMaxInclusive {
maxOp = "<="
}
args = append(args, *rangeConstraint.Max)
conditions = append(conditions, fmt.Sprintf(`"%s" %s $%d`, partitionColumn, maxOp, len(args)))
}
whereClause = "WHERE " + strings.Join(conditions, " AND ")
}
query := fmt.Sprintf(`
SELECT MIN("%s") AS lower_limit, MAX("%s") AS upper_limit
FROM (
SELECT "%s", NTILE($1) OVER (ORDER BY "%s") AS batch_id
FROM "%s"."%s" %s
) AS t
GROUP BY batch_id
ORDER BY batch_id`,
partitionColumn,
partitionColumn,
partitionColumn,
partitionColumn,
tableInfo.Schema,
tableInfo.Table,
whereClause)
ctxTimeout, cancel := context.WithTimeout(ctx, 1*time.Minute)
defer cancel()
rows, err := ta.db.Query(ctxTimeout, query, args...)
if err != nil {
return nil, err
}
defer rows.Close()
partitions := make([]models.Partition, 0, maxPartitions)
for rows.Next() {
partition := models.Partition{
Id: uuid.New(),
HasRange: true,
RetryCounter: 0,
Range: models.PartitionRange{
IsMinInclusive: true,
IsMaxInclusive: true,
},
}
if err := rows.Scan(&partition.Range.Min, &partition.Range.Max); err != nil {
return nil, err
}
partitions = append(partitions, partition)
}
if err := rows.Err(); err != nil {
return nil, err
}
return partitions, nil
}

View File

@@ -0,0 +1,119 @@
package transformers
import (
"context"
"errors"
"sync"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/config"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/custom_errors"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/models"
"github.com/google/uuid"
)
type batchAccumulator struct {
batchSize int
rows []models.UnknownRowValues
parents []models.BatchRef
}
func (a *batchAccumulator) add(batch models.Batch) {
a.rows = append(a.rows, batch.Rows...)
a.parents = append(a.parents, models.BatchRef{Id: batch.Id})
}
func (a *batchAccumulator) ready() bool {
return len(a.rows) >= a.batchSize
}
func (a *batchAccumulator) flush(ctx context.Context, chOut chan<- models.Batch, wg *sync.WaitGroup) bool {
if len(a.rows) == 0 {
return true
}
out := models.Batch{
Id: uuid.New(),
ParentBatches: a.parents,
Rows: a.rows,
}
wg.Add(1)
select {
case chOut <- out:
case <-ctx.Done():
wg.Done()
return false
}
a.rows = nil
a.parents = nil
return true
}
func sendTransformError(ctx context.Context, err error, ch chan<- custom_errors.JobError) {
if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
return
}
var jobErr custom_errors.JobError
if je, ok := errors.AsType[*custom_errors.JobError](err); ok {
jobErr = *je
} else {
jobErr = custom_errors.JobError{ShouldCancelJob: true, Msg: "Transformation failed", Prev: err}
}
select {
case ch <- jobErr:
case <-ctx.Done():
}
}
func (mssqlTr *MssqlTransformer) Consume(
ctx context.Context,
columns []models.ColumnType,
retryConfig config.RetryConfig,
batchSize int,
chBatchesIn <-chan models.Batch,
chBatchesOut chan<- models.Batch,
chJobErrorsOut chan<- custom_errors.JobError,
wgActiveBatches *sync.WaitGroup,
) {
transformationPlan := computeTransformationPlan(columns)
storagePlan := computeStorageTransformationPlan(ctx, mssqlTr.azureClient, mssqlTr.toStorage, columns, mssqlTr.sourceTable)
transformationPlan = append(transformationPlan, storagePlan...)
acc := &batchAccumulator{batchSize: batchSize}
for {
select {
case <-ctx.Done():
return
case batch, ok := <-chBatchesIn:
if !ok {
acc.flush(ctx, chBatchesOut, wgActiveBatches)
return
}
if len(transformationPlan) > 0 {
if err := ProcessBatchWithRetries(ctx, &batch, transformationPlan, retryConfig); err != nil {
sendTransformError(ctx, err, chJobErrorsOut)
return
}
}
if batchSize <= 0 {
wgActiveBatches.Add(1)
select {
case chBatchesOut <- batch:
case <-ctx.Done():
wgActiveBatches.Done()
return
}
continue
}
acc.add(batch)
if acc.ready() {
if !acc.flush(ctx, chBatchesOut, wgActiveBatches) {
return
}
}
}
}
}

View File

@@ -0,0 +1,545 @@
package transformers
import (
"context"
"errors"
"sync"
"testing"
"time"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/config"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/custom_errors"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/models"
"github.com/google/uuid"
)
const testTimeout = 2 * time.Second
func makeBatch(numRows int) models.Batch {
rows := make([]models.UnknownRowValues, numRows)
for i := range rows {
rows[i] = models.UnknownRowValues{i}
}
return models.Batch{Id: uuid.New(), Rows: rows}
}
func noRetry() config.RetryConfig {
return config.RetryConfig{Attempts: 1}
}
func newTransformer() *MssqlTransformer {
return &MssqlTransformer{}
}
func uuidColumn() models.ColumnType {
return models.NewColumnType("col_uuid", false, false, "uniqueidentifier", "uniqueidentifier", "string", false, 0, 0, 0)
}
func runConsume(
ctx context.Context,
tr *MssqlTransformer,
columns []models.ColumnType,
batchSize int,
chIn <-chan models.Batch,
chOut chan<- models.Batch,
chErr chan<- custom_errors.JobError,
wg *sync.WaitGroup,
) <-chan struct{} {
done := make(chan struct{})
go func() {
tr.Consume(ctx, columns, noRetry(), batchSize, chIn, chOut, chErr, wg)
close(done)
}()
return done
}
func drainOut(chOut <-chan models.Batch, wg *sync.WaitGroup) []models.Batch {
var batches []models.Batch
for {
select {
case b := <-chOut:
batches = append(batches, b)
wg.Done()
default:
return batches
}
}
}
func TestBatchAccumulator_Add(t *testing.T) {
acc := &batchAccumulator{batchSize: 5}
b1 := makeBatch(2)
b2 := makeBatch(3)
acc.add(b1)
acc.add(b2)
if len(acc.rows) != 5 {
t.Errorf("expected 5 rows, got %d", len(acc.rows))
}
if len(acc.parents) != 2 {
t.Fatalf("expected 2 parents, got %d", len(acc.parents))
}
if acc.parents[0].Id != b1.Id || acc.parents[1].Id != b2.Id {
t.Error("parent IDs do not match source batch IDs")
}
}
func TestBatchAccumulator_Ready(t *testing.T) {
acc := &batchAccumulator{batchSize: 3}
acc.add(makeBatch(2))
if acc.ready() {
t.Error("should not be ready with 2 rows and batchSize=3")
}
acc.add(makeBatch(1))
if !acc.ready() {
t.Error("should be ready with 3 rows and batchSize=3")
}
}
func TestBatchAccumulator_Flush_Empty(t *testing.T) {
acc := &batchAccumulator{batchSize: 5}
chOut := make(chan models.Batch, 1)
var wg sync.WaitGroup
if !acc.flush(context.Background(), chOut, &wg) {
t.Error("flush on empty accumulator should return true")
}
if len(chOut) != 0 {
t.Error("flush on empty accumulator should send nothing")
}
}
func TestBatchAccumulator_Flush_Success(t *testing.T) {
acc := &batchAccumulator{batchSize: 2}
b := makeBatch(2)
acc.add(b)
chOut := make(chan models.Batch, 1)
var wg sync.WaitGroup
if !acc.flush(context.Background(), chOut, &wg) {
t.Fatal("flush should return true on success")
}
select {
case out := <-chOut:
wg.Done()
if len(out.Rows) != 2 {
t.Errorf("expected 2 rows in flushed batch, got %d", len(out.Rows))
}
if len(out.ParentBatches) != 1 || out.ParentBatches[0].Id != b.Id {
t.Error("flushed batch should reference the source batch as parent")
}
default:
t.Error("expected a batch in chOut after flush")
}
if len(acc.rows) != 0 || len(acc.parents) != 0 {
t.Error("accumulator state should be reset after flush")
}
wg.Wait()
}
func TestBatchAccumulator_Flush_ContextCancelled(t *testing.T) {
acc := &batchAccumulator{batchSize: 2}
acc.add(makeBatch(2))
chOut := make(chan models.Batch)
var wg sync.WaitGroup
ctx, cancel := context.WithCancel(context.Background())
cancel()
if acc.flush(ctx, chOut, &wg) {
t.Error("flush should return false when context is cancelled")
}
wg.Wait()
}
func TestSendTransformError_PlainError(t *testing.T) {
ch := make(chan custom_errors.JobError, 1)
sendTransformError(context.Background(), errors.New("something broke"), ch)
select {
case e := <-ch:
if !e.ShouldCancelJob {
t.Error("plain error should produce ShouldCancelJob=true")
}
default:
t.Error("expected a job error in the channel")
}
}
func TestSendTransformError_JobError_Passthrough(t *testing.T) {
ch := make(chan custom_errors.JobError, 1)
original := &custom_errors.JobError{ShouldCancelJob: false, Msg: "custom msg"}
sendTransformError(context.Background(), original, ch)
select {
case e := <-ch:
if e.ShouldCancelJob != false || e.Msg != "custom msg" {
t.Errorf("JobError should pass through unchanged, got %+v", e)
}
default:
t.Error("expected a job error in the channel")
}
}
func TestSendTransformError_ContextCancelled_Silent(t *testing.T) {
ch := make(chan custom_errors.JobError, 1)
ctx, cancel := context.WithCancel(context.Background())
cancel()
sendTransformError(ctx, context.Canceled, ch)
if len(ch) != 0 {
t.Error("context.Canceled should be silently dropped")
}
}
func TestSendTransformError_DeadlineExceeded_Silent(t *testing.T) {
ch := make(chan custom_errors.JobError, 1)
ctx, cancel := context.WithCancel(context.Background())
cancel()
sendTransformError(ctx, context.DeadlineExceeded, ch)
if len(ch) != 0 {
t.Error("context.DeadlineExceeded should be silently dropped")
}
}
func TestConsume_Passthrough_PreservesOriginalBatch(t *testing.T) {
tr := newTransformer()
chIn := make(chan models.Batch, 1)
chOut := make(chan models.Batch, 1)
chErr := make(chan custom_errors.JobError, 1)
var wg sync.WaitGroup
batch := makeBatch(3)
chIn <- batch
close(chIn)
done := runConsume(context.Background(), tr, nil, 0, chIn, chOut, chErr, &wg)
select {
case got := <-chOut:
wg.Done()
if got.Id != batch.Id {
t.Error("passthrough should preserve the original batch ID")
}
if len(got.Rows) != 3 {
t.Errorf("expected 3 rows, got %d", len(got.Rows))
}
case <-time.After(testTimeout):
t.Fatal("timeout waiting for output batch")
}
<-done
wg.Wait()
}
func TestConsume_Passthrough_WaitGroupBalanced(t *testing.T) {
tr := newTransformer()
chIn := make(chan models.Batch, 3)
chOut := make(chan models.Batch, 3)
chErr := make(chan custom_errors.JobError, 1)
var wg sync.WaitGroup
for range 3 {
chIn <- makeBatch(1)
}
close(chIn)
done := runConsume(context.Background(), tr, nil, 0, chIn, chOut, chErr, &wg)
<-done
batches := drainOut(chOut, &wg)
if len(batches) != 3 {
t.Errorf("expected 3 output batches, got %d", len(batches))
}
wg.Wait()
}
func TestConsume_Accumulation_FlushOnThreshold(t *testing.T) {
tr := newTransformer()
chIn := make(chan models.Batch, 3)
chOut := make(chan models.Batch, 2)
chErr := make(chan custom_errors.JobError, 1)
var wg sync.WaitGroup
for range 3 {
chIn <- makeBatch(1)
}
close(chIn)
done := runConsume(context.Background(), tr, nil, 3, chIn, chOut, chErr, &wg)
<-done
batches := drainOut(chOut, &wg)
if len(batches) != 1 {
t.Fatalf("expected 1 accumulated batch, got %d", len(batches))
}
if len(batches[0].Rows) != 3 {
t.Errorf("expected 3 rows in accumulated batch, got %d", len(batches[0].Rows))
}
wg.Wait()
}
func TestConsume_Accumulation_FlushOnClose(t *testing.T) {
tr := newTransformer()
chIn := make(chan models.Batch, 2)
chOut := make(chan models.Batch, 2)
chErr := make(chan custom_errors.JobError, 1)
var wg sync.WaitGroup
chIn <- makeBatch(1)
chIn <- makeBatch(1)
close(chIn)
done := runConsume(context.Background(), tr, nil, 10, chIn, chOut, chErr, &wg)
<-done
batches := drainOut(chOut, &wg)
if len(batches) != 1 {
t.Fatalf("expected 1 batch flushed on close, got %d", len(batches))
}
if len(batches[0].Rows) != 2 {
t.Errorf("expected 2 rows, got %d", len(batches[0].Rows))
}
wg.Wait()
}
func TestConsume_Accumulation_TracksAllParentBatches(t *testing.T) {
tr := newTransformer()
chIn := make(chan models.Batch, 2)
chOut := make(chan models.Batch, 2)
chErr := make(chan custom_errors.JobError, 1)
var wg sync.WaitGroup
b1 := makeBatch(1)
b2 := makeBatch(1)
chIn <- b1
chIn <- b2
close(chIn)
done := runConsume(context.Background(), tr, nil, 10, chIn, chOut, chErr, &wg)
<-done
batches := drainOut(chOut, &wg)
if len(batches) != 1 {
t.Fatalf("expected 1 output batch, got %d", len(batches))
}
parents := batches[0].ParentBatches
if len(parents) != 2 {
t.Fatalf("expected 2 parent refs, got %d", len(parents))
}
if parents[0].Id != b1.Id || parents[1].Id != b2.Id {
t.Error("parent IDs should match source batch IDs in order")
}
wg.Wait()
}
func TestConsume_Accumulation_MultipleFlushes(t *testing.T) {
tr := newTransformer()
chIn := make(chan models.Batch, 5)
chOut := make(chan models.Batch, 5)
chErr := make(chan custom_errors.JobError, 1)
var wg sync.WaitGroup
for range 5 {
chIn <- makeBatch(1)
}
close(chIn)
done := runConsume(context.Background(), tr, nil, 2, chIn, chOut, chErr, &wg)
<-done
batches := drainOut(chOut, &wg)
if len(batches) != 3 {
t.Fatalf("expected 3 output batches (2+2+1 rows), got %d", len(batches))
}
totalRows := 0
for _, b := range batches {
totalRows += len(b.Rows)
}
if totalRows != 5 {
t.Errorf("expected 5 total rows across all batches, got %d", totalRows)
}
wg.Wait()
}
func TestConsume_EmptyInput_NoOutput(t *testing.T) {
tr := newTransformer()
chIn := make(chan models.Batch)
chOut := make(chan models.Batch, 1)
chErr := make(chan custom_errors.JobError, 1)
var wg sync.WaitGroup
close(chIn)
done := runConsume(context.Background(), tr, nil, 5, chIn, chOut, chErr, &wg)
select {
case <-done:
case <-time.After(testTimeout):
t.Fatal("timeout: Consume did not exit after empty input channel was closed")
}
if len(chOut) != 0 {
t.Error("expected no output for empty input")
}
wg.Wait()
}
func TestConsume_TransformError_SendsJobError(t *testing.T) {
tr := newTransformer()
col := uuidColumn()
chIn := make(chan models.Batch, 1)
chOut := make(chan models.Batch, 1)
chErr := make(chan custom_errors.JobError, 1)
var wg sync.WaitGroup
batch := models.Batch{
Id: uuid.New(),
Rows: []models.UnknownRowValues{{[]byte{1, 2, 3}}},
}
chIn <- batch
done := runConsume(context.Background(), tr, []models.ColumnType{col}, 0, chIn, chOut, chErr, &wg)
select {
case err := <-chErr:
if !err.ShouldCancelJob {
t.Error("transform error should set ShouldCancelJob=true")
}
case <-time.After(testTimeout):
t.Fatal("timeout: expected a job error from transform failure")
}
<-done
wg.Wait()
}
func TestConsume_TransformError_NoOutputForwarded(t *testing.T) {
tr := newTransformer()
col := uuidColumn()
chIn := make(chan models.Batch, 1)
chOut := make(chan models.Batch, 1)
chErr := make(chan custom_errors.JobError, 1)
var wg sync.WaitGroup
batch := models.Batch{
Id: uuid.New(),
Rows: []models.UnknownRowValues{{[]byte{1, 2, 3}}},
}
chIn <- batch
done := runConsume(context.Background(), tr, []models.ColumnType{col}, 0, chIn, chOut, chErr, &wg)
<-done
if len(chOut) != 0 {
t.Error("no batch should be forwarded when transformation fails")
}
wg.Wait()
}
func TestConsume_ContextCancellation_Exits(t *testing.T) {
tr := newTransformer()
chIn := make(chan models.Batch)
chOut := make(chan models.Batch, 1)
chErr := make(chan custom_errors.JobError, 1)
var wg sync.WaitGroup
ctx, cancel := context.WithCancel(context.Background())
done := runConsume(ctx, tr, nil, 0, chIn, chOut, chErr, &wg)
cancel()
select {
case <-done:
case <-time.After(testTimeout):
t.Fatal("timeout: Consume did not exit after context cancellation")
}
wg.Wait()
}
func TestConsume_Transform_DatetimeConvertedToUTC(t *testing.T) {
tr := newTransformer()
col := models.NewColumnType("col_dt", false, false, "datetime", "datetime", "timestamp", false, 0, 0, 0)
chIn := make(chan models.Batch, 1)
chOut := make(chan models.Batch, 1)
chErr := make(chan custom_errors.JobError, 1)
var wg sync.WaitGroup
nonUTC := time.Date(2024, 1, 15, 12, 0, 0, 0, time.FixedZone("EST", -5*3600))
batch := models.Batch{
Id: uuid.New(),
Rows: []models.UnknownRowValues{{nonUTC}},
}
chIn <- batch
close(chIn)
done := runConsume(context.Background(), tr, []models.ColumnType{col}, 0, chIn, chOut, chErr, &wg)
<-done
select {
case got := <-chOut:
wg.Done()
result, ok := got.Rows[0][0].(time.Time)
if !ok {
t.Fatal("expected time.Time in output row")
}
if result.Location() != time.UTC {
t.Errorf("expected UTC location after transform, got %v", result.Location())
}
default:
t.Error("expected an output batch")
}
wg.Wait()
}
func TestConsume_Transform_NilValueSkipped(t *testing.T) {
tr := newTransformer()
col := uuidColumn()
chIn := make(chan models.Batch, 1)
chOut := make(chan models.Batch, 1)
chErr := make(chan custom_errors.JobError, 1)
var wg sync.WaitGroup
batch := models.Batch{
Id: uuid.New(),
Rows: []models.UnknownRowValues{{nil}},
}
chIn <- batch
close(chIn)
done := runConsume(context.Background(), tr, []models.ColumnType{col}, 0, chIn, chOut, chErr, &wg)
<-done
select {
case got := <-chOut:
wg.Done()
if got.Rows[0][0] != nil {
t.Error("nil value should pass through unchanged")
}
default:
t.Error("expected an output batch even when value is nil")
}
if len(chErr) != 0 {
t.Error("nil value should not produce an error")
}
wg.Wait()
}

View File

@@ -1,20 +1,9 @@
package transformers
import (
"context"
"errors"
"fmt"
"strings"
"sync"
"time"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/azure"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/config"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/custom_errors"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/etl"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/models"
"github.com/google/uuid"
log "github.com/sirupsen/logrus"
)
type MssqlTransformer struct {
@@ -30,196 +19,3 @@ func NewMssqlTransformer(toStorage config.ToStorageConfig, sourceTable config.So
azureClient: azureClient,
}
}
func computeTransformationPlan(columns []models.ColumnType) []etl.ColumnTransformPlan {
var plan []etl.ColumnTransformPlan
for i, col := range columns {
switch col.SystemType() {
case "uniqueidentifier":
plan = append(plan, etl.ColumnTransformPlan{
Index: i,
Fn: func(v any) (any, error) {
if b, ok := v.([]byte); ok && b != nil {
return mssqlUuidToBigEndian(b)
}
return v, nil
},
})
case "geometry", "geography":
plan = append(plan, etl.ColumnTransformPlan{
Index: i,
Fn: func(v any) (any, error) {
if b, ok := v.([]byte); ok && b != nil {
return wkbToEwkbWithSrid(b, 4326)
}
return v, nil
},
})
case "datetime", "datetime2":
plan = append(plan, etl.ColumnTransformPlan{
Index: i,
Fn: func(v any) (any, error) {
if t, ok := v.(time.Time); ok {
return ensureUTC(t), nil
}
return v, nil
},
})
}
}
return plan
}
func computeStorageTransformationPlan(
ctx context.Context,
azureClient *azure.Client,
toStorage config.ToStorageConfig,
sourceColumns []models.ColumnType,
sourceTable config.SourceTableInfo,
) []etl.ColumnTransformPlan {
if azureClient == nil || len(toStorage.Columns) == 0 {
return nil
}
colIndex := make(map[string]int, len(sourceColumns))
for i, col := range sourceColumns {
colIndex[strings.ToUpper(col.Name())] = i
}
var plan []etl.ColumnTransformPlan
for _, storageCol := range toStorage.Columns {
if storageCol.Mode != "REFERENCE_ONLY" {
log.Warnf("to_storage: unsupported mode %q for column %s — skipping", storageCol.Mode, storageCol.Source)
continue
}
idx, ok := colIndex[strings.ToUpper(storageCol.Source)]
if !ok {
log.Warnf("to_storage: source column %q not found in source schema — skipping", storageCol.Source)
continue
}
sourceColName := storageCol.Source
schema := sourceTable.Schema
table := sourceTable.Table
plan = append(plan, etl.ColumnTransformPlan{
Index: idx,
Fn: func(v any) (any, error) {
if v == nil {
return nil, nil
}
b, ok := v.([]byte)
if !ok {
log.Warnf("to_storage: expected []byte for %s.%s.%s, got %T — passing through",
schema, table, sourceColName, v)
return v, nil
}
start := time.Now()
blobPath := fmt.Sprintf("%s/%s/%s", schema, table, uuid.New().String())
blobURL, err := azureClient.UploadAndGetURL(ctx, blobPath, b)
if err != nil {
return nil, fmt.Errorf("uploading %s.%s.%s: %w", schema, table, sourceColName, err)
}
log.Debugf(`Succesfully uploaded "%s", (%vms)`, blobURL, time.Since(start).Milliseconds())
return blobURL, nil
},
})
}
return plan
}
const processBatchCtxCheck = 4096
func (mssqlTr *MssqlTransformer) ProcessBatch(
ctx context.Context,
batch *models.Batch,
transformationPlan []etl.ColumnTransformPlan,
) error {
for i, rowValues := range batch.Rows {
if i%processBatchCtxCheck == 0 {
if err := ctx.Err(); err != nil {
return err
}
}
for _, task := range transformationPlan {
val := rowValues[task.Index]
if val == nil {
continue
}
transformed, err := task.Fn(val)
if err != nil {
return err
}
rowValues[task.Index] = transformed
}
}
return nil
}
func (mssqlTr *MssqlTransformer) Exec(
ctx context.Context,
columns []models.ColumnType,
chBatchesIn <-chan models.Batch,
chBatchesOut chan<- models.Batch,
chJobErrorsOut chan<- custom_errors.JobError,
wgActiveBatches *sync.WaitGroup,
) {
transformationPlan := computeTransformationPlan(columns)
storagePlan := computeStorageTransformationPlan(ctx, mssqlTr.azureClient, mssqlTr.toStorage, columns, mssqlTr.sourceTable)
transformationPlan = append(transformationPlan, storagePlan...)
for {
if ctx.Err() != nil {
return
}
select {
case <-ctx.Done():
return
case batch, ok := <-chBatchesIn:
if !ok {
return
}
if len(transformationPlan) == 0 {
select {
case chBatchesOut <- batch:
wgActiveBatches.Add(1)
continue
case <-ctx.Done():
return
}
}
err := mssqlTr.ProcessBatch(ctx, &batch, transformationPlan)
if err != nil {
if errors.Is(err, ctx.Err()) {
return
}
select {
case chJobErrorsOut <- custom_errors.JobError{ShouldCancelJob: true, Msg: "Transformation failed", Prev: err}:
case <-ctx.Done():
}
return
}
select {
case chBatchesOut <- batch:
case <-ctx.Done():
return
}
wgActiveBatches.Add(1)
}
}
}

View File

@@ -0,0 +1,181 @@
package transformers
import (
"context"
"fmt"
"path"
"strings"
"time"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/azure"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/config"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/custom_errors"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/etl"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/models"
"github.com/google/uuid"
"github.com/sirupsen/logrus"
)
func computeTransformationPlan(columns []models.ColumnType) []etl.ColumnTransformPlan {
var plan []etl.ColumnTransformPlan
for i, col := range columns {
switch col.SystemType() {
case "uniqueidentifier":
plan = append(plan, etl.ColumnTransformPlan{
Index: i,
Fn: func(v any) (any, error) {
if b, ok := v.([]byte); ok && b != nil {
return mssqlUuidToBigEndian(b)
}
return v, nil
},
})
case "geometry", "geography":
plan = append(plan, etl.ColumnTransformPlan{
Index: i,
Fn: func(v any) (any, error) {
if b, ok := v.([]byte); ok && b != nil {
return wkbToEwkbWithSrid(b, 4326)
}
return v, nil
},
})
case "datetime", "datetime2":
plan = append(plan, etl.ColumnTransformPlan{
Index: i,
Fn: func(v any) (any, error) {
if t, ok := v.(time.Time); ok {
return ensureUTC(t), nil
}
return v, nil
},
})
}
}
return plan
}
func computePostgresTransformationPlan(columns []models.ColumnType) []etl.ColumnTransformPlan {
var plan []etl.ColumnTransformPlan
for i, col := range columns {
switch col.SystemType() {
case "int2", "int4", "int8", "integer", "smallint", "bigint":
plan = append(plan, etl.ColumnTransformPlan{
Index: i,
Fn: func(v any) (any, error) {
if v64, ok := ToInt64(v); ok {
return v64, nil
}
return v, nil
},
})
case "uuid":
plan = append(plan, etl.ColumnTransformPlan{
Index: i,
Fn: func(v any) (any, error) {
switch b := v.(type) {
case []byte:
if b != nil {
return bigEndianToMssqlUuid(b)
}
case [16]byte:
return bigEndianToMssqlUuid(b[:])
}
return v, nil
},
})
case "geometry":
plan = append(plan, etl.ColumnTransformPlan{
Index: i,
Fn: func(v any) (any, error) {
if b, ok := v.([]byte); ok && b != nil {
return ewkbToMssqlGeo(b, false)
}
return v, nil
},
})
case "geography":
plan = append(plan, etl.ColumnTransformPlan{
Index: i,
Fn: func(v any) (any, error) {
if b, ok := v.([]byte); ok && b != nil {
return ewkbToMssqlGeo(b, true)
}
return v, nil
},
})
}
}
return plan
}
func computeStorageTransformationPlan(
ctx context.Context,
azureClient *azure.Client,
toStorage config.ToStorageConfig,
sourceColumns []models.ColumnType,
sourceTable config.SourceTableInfo,
) []etl.ColumnTransformPlan {
if azureClient == nil || len(toStorage.Columns) == 0 {
return nil
}
colIndex := make(map[string]int, len(sourceColumns))
for i, col := range sourceColumns {
colIndex[strings.ToUpper(col.Name())] = i
}
var plan []etl.ColumnTransformPlan
for _, storageCol := range toStorage.Columns {
if storageCol.Mode != "REFERENCE_ONLY" {
logrus.Warnf("to_storage: unsupported mode %q for column %s — skipping", storageCol.Mode, storageCol.Source)
continue
}
idx, ok := colIndex[strings.ToUpper(storageCol.Source)]
if !ok {
logrus.Warnf("to_storage: source column %q not found in source schema — skipping", storageCol.Source)
continue
}
sourceColName := storageCol.Source
schema := sourceTable.Schema
table := sourceTable.Table
plan = append(plan, etl.ColumnTransformPlan{
Index: idx,
Fn: func(v any) (any, error) {
if v == nil {
return nil, nil
}
b, ok := v.([]byte)
if !ok {
logrus.Warnf("to_storage: expected []byte for %s.%s.%s, got %T — passing through", schema, table, sourceColName, v)
return v, nil
}
// start := time.Now()
blobPath := path.Join(storageCol.Prefix, uuid.New().String())
blobURL, err := azureClient.UploadAndGetURL(ctx, blobPath, b)
if err != nil {
return nil, &custom_errors.JobError{
Msg: fmt.Sprintf("Error uploading %s.%s.%s", schema, table, sourceColName),
Prev: err,
}
}
// logrus.Debugf(`Succesfully uploaded "%s", (%vms)`, blobURL, time.Since(start).Milliseconds())
return blobURL, nil
},
})
}
return plan
}

View File

@@ -0,0 +1,72 @@
package transformers
import (
"context"
"sync"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/config"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/custom_errors"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/etl"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/models"
)
type PostgresTransformer struct {
sourceTable config.SourceTableInfo
}
func NewPostgresTransformer(sourceTable config.SourceTableInfo) etl.Transformer {
return &PostgresTransformer{sourceTable: sourceTable}
}
func (pgTr *PostgresTransformer) Consume(
ctx context.Context,
columns []models.ColumnType,
retryConfig config.RetryConfig,
batchSize int,
chBatchesIn <-chan models.Batch,
chBatchesOut chan<- models.Batch,
chJobErrorsOut chan<- custom_errors.JobError,
wgActiveBatches *sync.WaitGroup,
) {
transformationPlan := computePostgresTransformationPlan(columns)
acc := &batchAccumulator{batchSize: batchSize}
for {
select {
case <-ctx.Done():
return
case batch, ok := <-chBatchesIn:
if !ok {
acc.flush(ctx, chBatchesOut, wgActiveBatches)
return
}
if len(transformationPlan) > 0 {
if err := ProcessBatchWithRetries(ctx, &batch, transformationPlan, retryConfig); err != nil {
sendTransformError(ctx, err, chJobErrorsOut)
return
}
}
if batchSize <= 0 {
wgActiveBatches.Add(1)
select {
case chBatchesOut <- batch:
case <-ctx.Done():
wgActiveBatches.Done()
return
}
continue
}
acc.add(batch)
if acc.ready() {
if !acc.flush(ctx, chBatchesOut, wgActiveBatches) {
return
}
}
}
}
}

View File

@@ -0,0 +1,73 @@
package transformers
import (
"context"
"errors"
"time"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/config"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/custom_errors"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/etl"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/models"
)
const processBatchCtxCheck = 4096
func ProcessBatchWithRetries(
ctx context.Context,
batch *models.Batch,
transformationPlan []etl.ColumnTransformPlan,
retryConfig config.RetryConfig,
) error {
for i, rowValues := range batch.Rows {
if i%processBatchCtxCheck == 0 {
if err := ctx.Err(); err != nil {
return err
}
}
for _, task := range transformationPlan {
val := rowValues[task.Index]
if val == nil {
continue
}
var lastErr error
success := false
for attempt := 0; attempt < retryConfig.Attempts; attempt++ {
transformed, err := task.Fn(val)
if err == nil {
rowValues[task.Index] = transformed
success = true
break
}
lastErr = err
if jobError, ok := errors.AsType[*custom_errors.JobError](err); ok {
if jobError.ShouldCancelJob {
return jobError
}
}
if attempt == retryConfig.Attempts-1 {
break
}
delay := custom_errors.ComputeBackoffDelay(
attempt,
retryConfig.BaseDelayMs,
retryConfig.MaxDelayMs,
retryConfig.MaxJitterMs,
)
time.Sleep(delay)
}
if !success {
return lastErr
}
}
}
return nil
}

View File

@@ -1 +0,0 @@
package transformers

View File

@@ -4,6 +4,8 @@ import (
"encoding/binary"
"errors"
"time"
mssqlclrgeo "github.com/gaspardle/go-mssqlclrgeo"
)
func mssqlUuidToBigEndian(mssqlUuid []byte) ([]byte, error) {
@@ -62,6 +64,29 @@ func ensureUTC(t time.Time) time.Time {
return time.Date(t.Year(), t.Month(), t.Day(), t.Hour(), t.Minute(), t.Second(), t.Nanosecond(), time.UTC)
}
func bigEndianToMssqlUuid(pgUuid []byte) ([]byte, error) {
if len(pgUuid) != 16 {
return nil, errors.New("Invalid uuid")
}
mssqlUuid := make([]byte, 16)
mssqlUuid[0], mssqlUuid[1], mssqlUuid[2], mssqlUuid[3] = pgUuid[3], pgUuid[2], pgUuid[1], pgUuid[0]
mssqlUuid[4], mssqlUuid[5] = pgUuid[5], pgUuid[4]
mssqlUuid[6], mssqlUuid[7] = pgUuid[7], pgUuid[6]
copy(mssqlUuid[8:], pgUuid[8:])
return mssqlUuid, nil
}
func ewkbToMssqlGeo(ewkb []byte, isGeography bool) ([]byte, error) {
if len(ewkb) < 5 {
return nil, errors.New("Invalid ewkb")
}
// mssqlclrgeo reads the SRID flag and bytes directly from EWKB,
// so no pre-processing needed — pass through as-is.
return mssqlclrgeo.WkbToUdtGeo(ewkb, isGeography)
}
func ToInt64(v any) (int64, bool) {
switch t := v.(type) {
case int:

View File

@@ -9,18 +9,6 @@ import (
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/models"
)
type Extractor interface {
ProcessPartition(
ctx context.Context,
tableInfo config.SourceTableInfo,
columns []models.ColumnType,
batchSize int,
partition models.Partition,
indexPrimaryKey int,
chBatchesOut chan<- models.Batch,
) (int, error)
}
type TransformerFunc func(any) (any, error)
type ColumnTransformPlan struct {
@@ -29,29 +17,21 @@ type ColumnTransformPlan struct {
}
type Transformer interface {
ProcessBatch(
ctx context.Context,
batch *models.Batch,
transformationPlan []ColumnTransformPlan,
) error
Exec(
Consume(
ctx context.Context,
columns []models.ColumnType,
retryConfig config.RetryConfig,
batchSize int,
chBatchesIn <-chan models.Batch,
chBactchesOut chan<- models.Batch,
chBatchesOut chan<- models.Batch,
chJobErrorsOut chan<- custom_errors.JobError,
wgActiveBatches *sync.WaitGroup,
)
}
type Loader interface {
ProcessBatch(
ctx context.Context,
tableInfo config.TargetTableInfo,
colNames []string,
batch models.Batch,
) (int64, error)
type MaxMinColumnResult struct {
Max int64
Min int64
}
type TableAnalyzer interface {
@@ -65,10 +45,17 @@ type TableAnalyzer interface {
tableInfo config.TableInfo,
) (int64, error)
QueryMaxMinFromColumn(
ctx context.Context,
tableInfo config.TableInfo,
columnName string,
) (MaxMinColumnResult, error)
CalculatePartitionRanges(
ctx context.Context,
tableInfo config.TableInfo,
partitionColumn string,
maxPartitions int64,
rangeConstraint config.RangeConfig,
) ([]models.Partition, error)
}

View File

@@ -8,9 +8,14 @@ import (
type UnknownRowValues = []any
type Batch struct {
type BatchRef struct {
Id uuid.UUID
PartitionId uuid.UUID
}
type Batch struct {
Id uuid.UUID
ParentBatches []BatchRef
Rows []UnknownRowValues
RetryCounter int
}

View File

@@ -0,0 +1,137 @@
# Plan: Bidirectional Transformation Support
## Goal
Make the transformation pipeline direction-aware. Currently hardcoded to MSSQL → PG; add support for PG → MSSQL by applying inverse transformations when `SourceDbType == "postgres"`.
Excluded: `to_storage` Azure blob upload (not reversible).
---
## Hardcoded wiring to fix
| File | Line | Change |
|---|---|---|
| `cmd/go_migrate/process.go` | 51 | Branch on `SourceDbType`: `"sqlserver"``NewMssqlTransformer`, `"postgres"``NewPostgresTransformer` |
| `cmd/go_migrate/main.go` | 166167 | Branch on source/target type for both `TableAnalyzer` selections |
---
## Transformations
### Forward (MSSQL → PG) — unchanged
| Column type | Function | File |
|---|---|---|
| `uniqueidentifier` | `mssqlUuidToBigEndian` | `utils.go:9` |
| `geometry`/`geography` | `wkbToEwkbWithSrid` | `utils.go:25` |
| `datetime`/`datetime2` | `ensureUTC` | `utils.go:57` |
### Inverse (PG → MSSQL) — new
| PG system type | Action |
|---|---|
| `uuid` | `bigEndianToMssqlUuid`: re-swap bytes [0-3], [4-5], [6-7] |
| `geometry` | `ewkbToMssqlGeo(v, false)`: strip SRID → WKB → `WkbToUdtGeo` |
| `geography` | `ewkbToMssqlGeo(v, true)`: strip SRID → WKB → `WkbToUdtGeo` |
| `timestamp`/`timestamptz` | no-op |
**Geometry note**: MSSQL rejects plain WKB via bulk protocol. Must use `mssqlclrgeo.WkbToUdtGeo(wkb, isGeography)` (already in go.mod). PG extractor already emits EWKB via `ST_AsEWKB()`.
---
## New utility functions (`transformers/utils.go`)
### `bigEndianToMssqlUuid(v []byte) []byte`
```
out[0..3] = v[3,2,1,0]
out[4..5] = v[5,4]
out[6..7] = v[7,6]
out[8..15] = v[8..15]
```
### `ewkbToMssqlGeo(ewkb []byte, isGeography bool) ([]byte, error)`
1. Read byte-order flag from `ewkb[0]`
2. Read geometry type word bytes [1..4]
3. If SRID flag (`0x20000000`) is set: strip bytes [5..8], clear flag in type word
4. Call `mssqlclrgeo.WkbToUdtGeo(wkb, isGeography)`
---
## New files
### `transformers/postgres.go`
```go
func NewPostgresTransformer(...) *Transformer {
// same signature as NewMssqlTransformer
// calls computePostgresTransformationPlan instead
// does NOT call computeStorageTransformationPlan
}
```
### `computePostgresTransformationPlan` in `transformers/plan.go`
Iterates `sourceColTypes` (from PG analyzer), applies inverse closures by system type.
---
## PostgreSQL table analyzer stubs to implement (`table_analyzers/postgres.go`)
Required for PG-as-source partitioned extraction:
### `EstimateTotalRows`
```sql
SELECT reltuples::bigint FROM pg_class
JOIN pg_namespace ON pg_namespace.oid = pg_class.relnamespace
WHERE pg_namespace.nspname = $schema AND pg_class.relname = $table
```
Fallback to `COUNT(*)` if `reltuples < 0`.
### `QueryMaxMinFromColumn`
```sql
SELECT MIN("col"), MAX("col") FROM "schema"."table"
```
### `CalculatePartitionRanges`
Use min/max from above + `rowsPerPartition` to compute boundaries. Mirror the logic from `MssqlTableAnalyzer.CalculatePartitionRanges`.
---
## Test cases
### TC-1: `bigEndianToMssqlUuid` — round-trip
- Input: run `mssqlUuidToBigEndian` on a known 16-byte MSSQL UUID → produces PG UUID
- Assert: `bigEndianToMssqlUuid(pgUUID)` == original MSSQL UUID bytes
- Also assert nil input → nil output (no panic)
### TC-2: `bigEndianToMssqlUuid` — known vector
- Input: `[0x6b,0xa7,0xb8,0x10, 0x9d,0xad, 0x11,0xd1, 0x80,0xb4,0x00,0xc0,0x4f,0xd4,0x30,0xc8]` (RFC 4122 nil UUID variant)
- Assert: bytes [0-3] are reversed, [4-5] reversed, [6-7] reversed, [8-15] identical
### TC-3: `ewkbToMssqlGeo` — geometry round-trip
- Input: generate a polygon via `go-geom` + `wkb.Marshal` → plain WKB
- Forward: run `wkbToEwkbWithSrid` → EWKB
- Inverse: run `ewkbToMssqlGeo(ewkb, false)` → CLR/UDT bytes
- Assert: no error, output is non-empty `[]byte`
### TC-4: `ewkbToMssqlGeo` — nil input
- Input: nil
- Assert: returns nil, nil (no panic)
### TC-5: `ewkbToMssqlGeo` — EWKB without SRID flag
- Input: plain WKB (no SRID flag set)
- Assert: function still calls `WkbToUdtGeo` and returns without error
### TC-6: Transformer factory selection
- Given `SourceDbType == "postgres"``NewPostgresTransformer` is selected
- Given `SourceDbType == "sqlserver"``NewMssqlTransformer` is selected
---
## Files changed (summary)
1. `cmd/go_migrate/process.go` — transformer factory branch
2. `cmd/go_migrate/main.go` — analyzer selection branch
3. `internal/app/etl/transformers/utils.go` — 2 new functions
4. `internal/app/etl/transformers/plan.go``computePostgresTransformationPlan`
5. `internal/app/etl/transformers/postgres.go` *(new)*
6. `internal/app/etl/table_analyzers/postgres.go` — 3 stub implementations

View File

@@ -1,6 +1,8 @@
package main
import (
"flag"
"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/config"
log "github.com/sirupsen/logrus"
)
@@ -8,7 +10,18 @@ import (
func main() {
log.SetLevel(log.DebugLevel)
migrationConfig, err := config.ReadMigrationConfig()
configPath := flag.String("config", "", "path to migration config file")
flag.Parse()
if flag.NArg() > 1 {
log.Fatalf("only one config file path is allowed")
}
if *configPath == "" && flag.NArg() == 1 {
*configPath = flag.Arg(0)
}
migrationConfig, err := config.ReadMigrationConfig(*configPath)
if err != nil {
log.Fatalf("error leyendo configuracion: %v", err)
}

View File

@@ -12,9 +12,9 @@ import (
)
const (
totalRows int = 1_000_000
chunkSize int = 50_000
queueSize int = 4
totalRows int = 2_000_000
chunkSize int = 5000
queueSize int = 8
)
func main() {
@@ -40,6 +40,14 @@ func main() {
seedManzanas(ctx, db)
})
// wgSeed.Go(func() {
// seedPuertos(ctx, db)
// })
// wgSeed.Go(func() {
// seedSiteHolderAttach(ctx, db)
// })
wgSeed.Wait()
}

View File

@@ -0,0 +1,227 @@
package main
import (
"bytes"
"context"
"database/sql"
"fmt"
"math/rand"
"sync"
"time"
"github.com/google/uuid"
log "github.com/sirupsen/logrus"
)
var siteHolderAttachJob = MigrationJob{
Schema: "Infraestructura",
Table: "SITE_HOLDER__ATTACH",
}
func seedSiteHolderAttach(ctx context.Context, db *sql.DB) error {
maxOid, err := getMaxGDBArchiveOidForAttach(ctx, db)
if err != nil {
log.Fatal("Error getting max GDB_ARCHIVE_OID: ", err)
}
log.Infof("Starting SITE_HOLDER__ATTACH data generation from GDB_ARCHIVE_OID: %d", maxOid+1)
rowsChan := make(chan []UnknownRowValues, queueSize)
var wgRowGenerator sync.WaitGroup
wgRowGenerator.Go(func() {
generateSiteHolderAttachRows(ctx, maxOid, totalRows, chunkSize, rowsChan)
})
columns := []string{
"GDB_ARCHIVE_OID",
"REL_GLOBALID",
"CONTENT_TYPE",
"ATT_NAME",
"DATA_SIZE",
"DATA",
"GLOBALID",
"GDB_FROM_DATE",
"GDB_TO_DATE",
"ATTACHMENTID",
}
if err := loadRowsMssql(ctx, siteHolderAttachJob, columns, db, rowsChan); err != nil {
return fmt.Errorf("Error loading rows (SITE_HOLDER__ATTACH): %w", err)
}
log.Info("Data generation and loading completed successfully (SITE_HOLDER__ATTACH)")
wgRowGenerator.Wait()
return nil
}
func getMaxGDBArchiveOidForAttach(ctx context.Context, db *sql.DB) (int, error) {
var maxOid sql.NullInt64
query := fmt.Sprintf(`
SELECT ISNULL(MAX(GDB_ARCHIVE_OID), 0)
FROM [%s].[%s]
`, siteHolderAttachJob.Schema, siteHolderAttachJob.Table)
err := db.QueryRowContext(ctx, query).Scan(&maxOid)
if err != nil && err != sql.ErrNoRows {
return 0, err
}
if !maxOid.Valid {
return 0, nil
}
return int(maxOid.Int64), nil
}
func generateSiteHolderAttachRows(
ctx context.Context,
startOid int,
totalRows int,
chunkSize int,
out chan<- []UnknownRowValues,
) {
defer close(out)
rowsGenerated := 0
currentChunk := make([]UnknownRowValues, 0, chunkSize)
for i := range totalRows {
gdbArchiveOid := startOid + i + 1
row := generateSiteHolderAttachRow(gdbArchiveOid)
currentChunk = append(currentChunk, row)
rowsGenerated++
if len(currentChunk) == chunkSize {
select {
case out <- currentChunk:
log.Debugf("Sent SITE_HOLDER__ATTACH chunk with %d rows", len(currentChunk))
case <-ctx.Done():
log.Info("Context cancelled, stopping SITE_HOLDER__ATTACH row generation")
return
}
currentChunk = make([]UnknownRowValues, 0, chunkSize)
}
if rowsGenerated%100_000 == 0 {
logSiteHolderAttachSampleRow(rowsGenerated, row)
}
}
if len(currentChunk) > 0 {
select {
case out <- currentChunk:
log.Debugf("Sent final SITE_HOLDER__ATTACH chunk with %d rows", len(currentChunk))
case <-ctx.Done():
log.Info("Context cancelled, stopping SITE_HOLDER__ATTACH row generation")
}
}
log.Infof("Finished generating %d SITE_HOLDER__ATTACH rows", rowsGenerated)
}
func generateSiteHolderAttachRow(gdbArchiveOid int) UnknownRowValues {
dateLowerLimit, _ := time.Parse(time.RFC3339, "2020-12-31T23:59:59Z")
dateUpperLimit, _ := time.Parse(time.RFC3339, "2025-12-31T23:59:59Z")
relGlobalID, _ := uuid.New().MarshalBinary()
contentType := generateRandomContentType()
attName := generateRandomAttachmentName()
binaryData := generateRandomBinaryContent()
dataSize := len(binaryData)
globalID, _ := uuid.New().MarshalBinary()
gdbFromDate := generateRandomTimestamp(dateLowerLimit, dateUpperLimit)
gdbToDate, _ := time.Parse(time.RFC3339, "9999-12-31T23:59:59Z")
attachmentID := rand.Intn(10000) + 1
return UnknownRowValues{
gdbArchiveOid,
relGlobalID,
contentType,
attName,
dataSize,
binaryData,
globalID,
gdbFromDate,
gdbToDate,
attachmentID,
}
}
func generateRandomContentType() string {
contentTypes := []string{
"text/plain",
"application/pdf",
"image/jpeg",
"image/png",
"application/msword",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"text/csv",
"application/json",
}
return contentTypes[rand.Intn(len(contentTypes))]
}
func generateRandomAttachmentName() string {
extensions := []string{".txt", ".pdf", ".jpg", ".png", ".doc", ".docx", ".csv", ".json"}
baseName := generateRandomString(20)
extension := extensions[rand.Intn(len(extensions))]
return baseName + extension
}
func generateRandomBinaryContent() []byte {
sizeOptions := []int{100, 500, 1000, 5000, 10000, 50000, 100000}
size := sizeOptions[rand.Intn(len(sizeOptions))]
var buf bytes.Buffer
lineCount := rand.Intn(size/50) + 1
for range lineCount {
line := generateRandomString(rand.Intn(80) + 20)
buf.WriteString(line)
buf.WriteString("\n")
}
for buf.Len() < size {
randomText := generateRandomString(rand.Intn(100) + 50)
buf.WriteString(randomText)
buf.WriteString("\n")
}
result := buf.Bytes()
if len(result) > size {
result = result[:size]
}
return result
}
func logSiteHolderAttachSampleRow(id int, rowValues UnknownRowValues) {
dataBytes := rowValues[5].([]byte)
log.Infof(`
Sample SITE_HOLDER__ATTACH row #%d:
GDB_ARCHIVE_OID: %v
REL_GLOBALID: [binary UUID]
CONTENT_TYPE: %v
ATT_NAME: %v
DATA_SIZE: %v
DATA: [%d bytes of binary content]
GLOBALID: [binary UUID]
GDB_FROM_DATE: %v
GDB_TO_DATE: %v
ATTACHMENTID: %v
`,
id,
rowValues[0],
rowValues[2],
rowValues[3],
rowValues[4],
len(dataBytes),
rowValues[7],
rowValues[8],
rowValues[9],
)
}