From 86258718d89fa2900da44b211fd2825c3079916a Mon Sep 17 00:00:00 2001 From: Kylesoda <249518290+kylesoda@users.noreply.github.com> Date: Fri, 29 May 2026 14:38:25 -0500 Subject: [PATCH] refactor: update benchmark results and configuration for improved performance in go-migrate --- benchmark-results.md | 75 +++++++++++++++++++++++++++++++++++ config-reverse-original.yaml | 35 ++++++++++++++++ config-reverse.yaml | 20 +++++----- scripts/mssql-copy-in/main.go | 7 ++-- 4 files changed, 123 insertions(+), 14 deletions(-) create mode 100644 benchmark-results.md create mode 100644 config-reverse-original.yaml diff --git a/benchmark-results.md b/benchmark-results.md new file mode 100644 index 0000000..7c282e1 --- /dev/null +++ b/benchmark-results.md @@ -0,0 +1,75 @@ +# Benchmark go-migrate — 2,000,000 filas + +**Tabla**: `Cartografia.MANZANA` +**Fecha**: 2026-05-29 +**Entorno**: Docker local (MSSQL 2022 Developer / PostgreSQL 16 + PostGIS) + +--- + +## Resultado final — 5 pasadas cada dirección + +| Métrica | MSSQL → PostgreSQL | PostgreSQL → MSSQL | +|---|---|---| +| **Promedio** | **8.37s** | **16.77s** | +| **Mediana** | 8.16s | 16.33s | +| **Mínimo** | 7.75s | 16.03s | +| **Máximo** | 9.17s | 18.46s | +| **Desv. 
estándar** | 0.56s | 1.01s | +| **Throughput promedio** | **~238,892 filas/seg** | **~119,261 filas/seg** | +| **Factor** | 1x | **~2x más lento** | + +--- + +## Evolución del tuning PG → MSSQL + +| Etapa | Config | Tiempo | Throughput | Δ | +|---|---|---|---|---| +| Corrida 1 — original | conservadora | 236.8s | ~8,446 /seg | baseline | +| Corrida 2 — igualada | mismos parámetros | 21.94s | ~91,148 /seg | +10.8x | +| Tuning A | 4ext/8load 50k | 17.37s | ~115,200 /seg | +1.27x | +| Tuning C | 16 loaders | 17.26s | ~115,900 /seg | +1.28x | +| **Tuning D — óptimo** | **8ext/8load 50k** | **~16.77s** | **~119,261 /seg** | **+1.37x** | +| Tablock + 8 loaders | lock exclusivo serial | ~44s | ~45,000 /seg | ❌ regresión | +| Tablock + 1 loader | minimal logging | ~47s | ~42,000 /seg | ❌ regresión | + +--- + +## Configuración óptima — `config-reverse.yaml` + +```yaml +max_parallel_workers: 4 +defaults: + batches_per_partition: 4 + max_extractors: 8 # ← mayor palanca de mejora + extractor_batch_size: 25000 + extractor_queue_size: 32 + max_transformers: 8 + transformer_batch_size: 50000 + transformer_queue_size: 32 + max_loaders: 8 + loader_batch_size: 50000 # sweet spot — 75k y 100k peores +``` + +--- + +## Análisis de la brecha final (~2x) + +La diferencia residual entre ambas direcciones es estructural y está en el protocolo de escritura: + +| Protocolo | Mecanismo | Overhead | +|---|---|---| +| `pgx.CopyFrom` (→ PG) | PostgreSQL COPY protocol — streaming binario sin SQL | mínimo | +| `mssql.CopyIn` (→ MSSQL) | BCP protocol — row-by-row dentro de un bulk statement | mayor por fila | + +`mssql.CopyIn` itera fila a fila vía `stmt.ExecContext(row...)` antes del flush final, lo que introduce overhead por fila independientemente del batch size. `pgx.CopyFrom` hace streaming puro. 
+ +--- + +## Hallazgos sobre Tablock + +`Tablock: true` en `mssql.BulkOptions` resultó contraproducente en ambos escenarios: + +- **Con 8 loaders paralelos**: cada loader compite por un lock exclusivo de tabla → serialización completa (~44s) +- **Con 1 loader + batch enorme**: sin contención de locks, pero el overhead de log + gestión del lock exclusivo superó el beneficio de minimal logging (~47s) + +**Conclusión**: para este patrón de carga (múltiples loaders concurrentes), `Tablock: false` (default) es siempre mejor. diff --git a/config-reverse-original.yaml b/config-reverse-original.yaml new file mode 100644 index 0000000..bcb73a2 --- /dev/null +++ b/config-reverse-original.yaml @@ -0,0 +1,35 @@ +max_parallel_workers: 4 +source_db_type: postgres +target_db_type: sqlserver + +defaults: + batches_per_partition: 4 + max_extractors: 2 + extractor_batch_size: 5000 + extractor_queue_size: 8 + max_transformers: 2 + transformer_batch_size: 12500 + transformer_queue_size: 8 + max_loaders: 4 + loader_batch_size: 25000 + partition_calculation_strategy: EXACT + truncate_target: true + truncate_method: TRUNCATE + retry: + attempts: 3 + base_delay_ms: 500 + max_delay_ms: 10000 + max_jitter_ms: 500 + max_failed_partitions: 5 + max_failed_batches_load: 5 + +jobs: + - name: cartografia_manzana_reverse + enabled: true + source: + schema: Cartografia + table: MANZANA + primary_key: GDB_ARCHIVE_OID + target: + schema: Cartografia + table: MANZANA diff --git a/config-reverse.yaml b/config-reverse.yaml index c1e88e9..82d5fb5 100644 --- a/config-reverse.yaml +++ b/config-reverse.yaml @@ -1,17 +1,17 @@ -max_parallel_workers: 2 +max_parallel_workers: 4 source_db_type: postgres target_db_type: sqlserver defaults: - batches_per_partition: 2 - max_extractors: 2 - extractor_batch_size: 500 - extractor_queue_size: 4 - max_transformers: 2 - transformer_batch_size: 2500 - transformer_queue_size: 4 - max_loaders: 2 - loader_batch_size: 5000 + batches_per_partition: 4 + max_extractors: 8 + 
extractor_batch_size: 25000 + extractor_queue_size: 32 + max_transformers: 8 + transformer_batch_size: 50000 + transformer_queue_size: 32 + max_loaders: 8 + loader_batch_size: 50000 partition_calculation_strategy: EXACT truncate_target: true truncate_method: TRUNCATE diff --git a/scripts/mssql-copy-in/main.go b/scripts/mssql-copy-in/main.go index c25535c..ec88755 100644 --- a/scripts/mssql-copy-in/main.go +++ b/scripts/mssql-copy-in/main.go @@ -12,10 +12,9 @@ import ( ) const ( - // totalRows int = 1_000_000 - totalRows int = 1000 - chunkSize int = 200 - queueSize int = 4 + totalRows int = 2_000_000 + chunkSize int = 5000 + queueSize int = 8 ) func main() {