feat: update chunk size for MSSQL data loading and add utility functions for database operations
@@ -1,101 +1,312 @@
package main

import (
	"context"
	"database/sql"
	"fmt"
	"math/rand"
	"sync"
	"time"

	"github.com/gaspardle/go-mssqlclrgeo"
	"github.com/google/uuid"
	mssql "github.com/microsoft/go-mssqldb"
	log "github.com/sirupsen/logrus"
	"github.com/twpayne/go-geom"
	"github.com/twpayne/go-geom/encoding/wkb"
)

const (
	// 5_000_000 rows in 50_000-row chunks means 100 bulk-copy
	// transactions; queueSize bounds how many generated chunks may sit
	// in the channel before the producer blocks.
	totalRows int    = 5_000_000
	chunkSize int    = 50_000
	schema    string = "Cartografia"
	table     string = "MANZANA"
	queueSize int    = 4
)
// This example shows how to perform bulk imports into SQL Server: one
// goroutine generates synthetic MANZANA rows in chunks while the main
// goroutine bulk-loads them with the go-mssqldb CopyIn API.
func main() {
	log.SetFormatter(&log.TextFormatter{
		FullTimestamp:   true,
		TimestampFormat: time.StampMilli,
		DisableSorting:  false,
		PadLevelText:    true,
	})
	log.SetLevel(log.DebugLevel)

	db, connError := connectToSqlServer()
	if connError != nil {
		log.Fatal("Connection error: ", connError)
	}
	defer db.Close()

	ctx := context.Background()

	maxOid, err := getMaxGDBArchiveOid(ctx, db)
	if err != nil {
		log.Fatal("Error getting max GDB_ARCHIVE_OID: ", err)
	}

	log.Infof("Starting data generation from GDB_ARCHIVE_OID: %d", maxOid+1)

	rowsChan := make(chan []UnknownRowValues, queueSize)

	var wgRowGenerator sync.WaitGroup
	wgRowGenerator.Go(func() {
		generateManzanaRows(ctx, maxOid, totalRows, chunkSize, rowsChan)
	})

	columns := []string{
		"GDB_ARCHIVE_OID",
		"ID_MANZANA",
		"ID_DISTRITO",
		"NOMBRE",
		"CODIGO",
		"CANTIDAD_TOTAL",
		"OCUPACION_RESIDENCIAL",
		"OCUPACION_NEGOCIO",
		"OCUPACION_DEPARTAMENTO",
		"INDICADOR",
		"FECHA_ALTA",
		"FECHA_ACT",
		"Shape",
		"GDB_GEOMATTR_DATA",
		"GlobalID",
		"GDB_FROM_DATE",
		"GDB_TO_DATE",
		"OBJECTID",
	}

	job := MigrationJob{
		Schema: schema,
		Table:  table,
	}

	if err := loadRowsMssql(ctx, job, columns, db, rowsChan); err != nil {
		log.Fatal("Error loading rows: ", err)
	}

	log.Info("Data generation and loading completed successfully")
	wgRowGenerator.Wait()
}

func loadRowsMssql(ctx context.Context, job MigrationJob, colNames []string, db *sql.DB, in <-chan []UnknownRowValues) error {
	chunkCount := 0
	totalRowsLoaded := 0

	for rows := range in {
		chunkStartTime := time.Now()

		tx, err := db.BeginTx(ctx, nil)
		if err != nil {
			return fmt.Errorf("error starting transaction: %w", err)
		}

		fullTableName := fmt.Sprintf("[%s].[%s]", job.Schema, job.Table)

		// mssql.CopyIn returns a sentinel query string that this driver's
		// Prepare recognizes as the start of a bulk-copy operation.
		stmt, err := tx.PrepareContext(ctx, mssql.CopyIn(fullTableName, mssql.BulkOptions{}, colNames...))
		if err != nil {
			tx.Rollback()
			return fmt.Errorf("error preparing bulk copy statement: %w", err)
		}

		copyStartTime := time.Now()

		for _, row := range rows {
			// Exec with arguments buffers one row client-side.
			_, err = stmt.ExecContext(ctx, row...)
			if err != nil {
				stmt.Close()
				tx.Rollback()
				return fmt.Errorf("error executing row insert: %w", err)
			}
		}

		// A final Exec with no arguments flushes the buffered rows to the server.
		result, err := stmt.ExecContext(ctx)
		if err != nil {
			stmt.Close()
			tx.Rollback()
			return fmt.Errorf("error flushing bulk data: %w", err)
		}

		err = stmt.Close()
		if err != nil {
			tx.Rollback()
			return fmt.Errorf("error closing statement: %w", err)
		}

		if err := tx.Commit(); err != nil {
			return fmt.Errorf("error committing transaction: %w", err)
		}

		rowsAffected, _ := result.RowsAffected()
		chunkCount++
		totalRowsLoaded += int(rowsAffected)

		copyDuration := time.Since(copyStartTime)
		chunkDuration := time.Since(chunkStartTime)
		rowsPerSec := float64(len(rows)) / chunkDuration.Seconds()

		log.Infof("Loaded chunk #%d (MSSQL): %d rows in %v (copy: %v, %.0f rows/sec) - Total: %d rows",
			chunkCount, len(rows), chunkDuration, copyDuration, rowsPerSec, totalRowsLoaded)
	}

	return nil
}
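
For reference, a minimal sketch of the same CopyIn handshake in isolation; it is not part of this commit, and "demo_table" with columns "a" and "b" is a placeholder:

// Illustration only: the CopyIn protocol boils down to Prepare on the
// sentinel string, one Exec per row, and an argument-less Exec to flush.
func copyInSketch(db *sql.DB) error {
	tx, err := db.Begin()
	if err != nil {
		return err
	}
	stmt, err := tx.Prepare(mssql.CopyIn("demo_table", mssql.BulkOptions{}, "a", "b"))
	if err != nil {
		tx.Rollback()
		return err
	}
	for i := 0; i < 3; i++ {
		// Each Exec with arguments buffers one row.
		if _, err := stmt.Exec("row", i); err != nil {
			stmt.Close()
			tx.Rollback()
			return err
		}
	}
	// Exec with no arguments sends all buffered rows to the server.
	result, err := stmt.Exec()
	if err != nil {
		stmt.Close()
		tx.Rollback()
		return err
	}
	n, _ := result.RowsAffected()
	log.Infof("copied %d rows", n)
	stmt.Close()
	return tx.Commit()
}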

func generateRandomPolygonWKB() []byte {
	// Random origin; drawing both axes from [-90, 90) is good enough
	// for synthetic test geometry.
	minX := rand.Float64()*180 - 90
	minY := rand.Float64()*180 - 90

	// Axis-aligned 0.01-degree square, closed ring (first coord == last).
	size := 0.01

	coords := []geom.Coord{
		{minX, minY},
		{minX + size, minY},
		{minX + size, minY + size},
		{minX, minY + size},
		{minX, minY},
	}

	polygon := geom.NewPolygon(geom.XY).MustSetCoords([][]geom.Coord{coords})

	polygonWkb, _ := wkb.Marshal(polygon, wkb.NDR)

	return polygonWkb
}

func getMaxGDBArchiveOid(ctx context.Context, db *sql.DB) (int, error) {
	var maxOid sql.NullInt64

	query := fmt.Sprintf(`
		SELECT ISNULL(MAX(GDB_ARCHIVE_OID), 0)
		FROM [%s].[%s]
	`, schema, table)

	err := db.QueryRowContext(ctx, query).Scan(&maxOid)
	if err != nil && err != sql.ErrNoRows {
		return 0, err
	}

	if !maxOid.Valid {
		return 0, nil
	}

	return int(maxOid.Int64), nil
}

func generateManzanaRows(
	ctx context.Context,
	startOid int,
	totalRows int,
	chunkSize int,
	out chan<- []UnknownRowValues,
) {
	defer close(out)

	rowsGenerated := 0
	currentChunk := make([]UnknownRowValues, 0, chunkSize)

	for i := range totalRows {
		gdbArchiveOid := startOid + i + 1
		row := generateManzanaRow(gdbArchiveOid)
		currentChunk = append(currentChunk, row)
		rowsGenerated++

		if len(currentChunk) == chunkSize {
			select {
			case out <- currentChunk:
				log.Debugf("Sent chunk with %d rows", len(currentChunk))
			case <-ctx.Done():
				log.Info("Context cancelled, stopping row generation")
				return
			}
			currentChunk = make([]UnknownRowValues, 0, chunkSize)
		}

		if rowsGenerated%100_000 == 0 {
			logManzanaSampleRow(rowsGenerated, row)
		}
	}

	// Flush whatever is left over after the last full chunk.
	if len(currentChunk) > 0 {
		select {
		case out <- currentChunk:
			log.Debugf("Sent final chunk with %d rows", len(currentChunk))
		case <-ctx.Done():
			log.Info("Context cancelled, stopping row generation")
		}
	}

	log.Infof("Finished generating %d rows", rowsGenerated)
}
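
Note that with the constants above the producer can run at most queueSize chunks ahead of the loader, so roughly 4 * 50_000 = 200_000 generated rows (plus the chunk currently being filled) are held in memory at any one time.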

func generateManzanaRow(gdbArchiveOid int) UnknownRowValues {
	dateLowerLimit, _ := time.Parse(time.RFC3339, "2020-12-31T23:59:59Z")
	dateUpperLimit, _ := time.Parse(time.RFC3339, "2025-12-31T23:59:59Z")

	rowID := gdbArchiveOid
	distrito := fmt.Sprintf("D%d", rand.Intn(100))
	nombre := generateRandomString(15)
	codigo := generateRandomString(15)
	cantidadTotal := rand.Intn(1000)
	ocupacionResidencial := rand.Intn(1000)
	ocupacionNegocio := rand.Intn(1000)
	ocupacionDepartamento := rand.Intn(1000)
	indicador := rand.Intn(10000)
	fechaAlta := generateRandomTimestamp(dateLowerLimit, dateUpperLimit)
	fechaAct := generateRandomTimestamp(dateLowerLimit, dateUpperLimit)
	shapeWKB := generateRandomPolygonWKB()
	geoData := []byte{}
	id := uuid.New()
	// GlobalID as the raw 16-byte UUID.
	globalID := id[:]
	gdbFromDate := fechaAct
	gdbToDate, _ := time.Parse(time.RFC3339, "9999-12-31T23:59:59Z")
	objectID := gdbArchiveOid

	// Convert standard WKB into the SQL Server CLR geometry (UDT) wire format.
	shapeMssql, err := mssqlclrgeo.WkbToUdtGeo(shapeWKB, false)
	if err != nil {
		log.Errorf("Error converting WKB to MSSQL format: %v", err)
		shapeMssql = []byte{}
	}

	return UnknownRowValues{
		gdbArchiveOid,
		rowID,
		distrito,
		nombre,
		codigo,
		cantidadTotal,
		ocupacionResidencial,
		ocupacionNegocio,
		ocupacionDepartamento,
		indicador,
		fechaAlta,
		fechaAct,
		shapeMssql,
		geoData,
		globalID,
		gdbFromDate,
		gdbToDate,
		objectID,
	}
}

func generateRandomTimestamp(min, max time.Time) time.Time {
	minUnix := min.Unix()
	maxUnix := max.Unix()

	// rand.Int63n panics when delta <= 0; the fixed limits used in this
	// script guarantee max is after min.
	delta := maxUnix - minUnix
	secAleatorios := rand.Int63n(delta)

	return time.Unix(minUnix+secAleatorios, 0)
}

func generateRandomString(maxLength int) string {
	const charset = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
	// Random length in [1, maxLength].
	length := min(rand.Intn(maxLength)+1, maxLength)

	b := make([]byte, length)
	for i := range b {
		b[i] = charset[rand.Intn(len(charset))]
	}
	return string(b)
}
scripts/mssql-copy-in/types.go (new file, 52 lines)
@@ -0,0 +1,52 @@
package main

// ColumnType mirrors the accessors of database/sql.ColumnType so column
// metadata can be carried around independently of a live *sql.Rows.
type ColumnType struct {
	name string

	hasMaxLength      bool
	hasPrecisionScale bool

	userType    string
	systemType  string
	unifiedType string
	nullable    bool
	maxLength   int64
	precision   int64
	scale       int64
}

func (c *ColumnType) Name() string {
	return c.name
}

func (c *ColumnType) UserType() string {
	return c.userType
}

func (c *ColumnType) SystemType() string {
	return c.systemType
}

func (c *ColumnType) Length() (length int64, ok bool) {
	return c.maxLength, c.hasMaxLength
}

func (c *ColumnType) DecimalSize() (precision, scale int64, ok bool) {
	return c.precision, c.scale, c.hasPrecisionScale
}

func (c *ColumnType) Nullable() bool {
	return c.nullable
}

func (c *ColumnType) Type() string {
	return c.unifiedType
}

type MigrationJob struct {
	Schema     string
	Table      string
	PrimaryKey string
}

// UnknownRowValues is one row of arbitrary column values, ordered to
// match the column list handed to the bulk loader.
type UnknownRowValues = []any
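
As an illustration of how such metadata might be populated, here is a hedged sketch that reads SQL Server's sys.columns catalog view; the fetchColumns helper and its query are assumptions for the sketch, not part of this commit, and it would need the context, database/sql, and fmt imports:

// Illustration only: fill ColumnType values for one table from sys.columns.
func fetchColumns(ctx context.Context, db *sql.DB, schema, table string) ([]*ColumnType, error) {
	const q = `
		SELECT c.name, t.name AS type_name, c.is_nullable,
		       c.max_length, c.precision, c.scale
		FROM sys.columns c
		JOIN sys.types t ON c.user_type_id = t.user_type_id
		WHERE c.object_id = OBJECT_ID(@p1)
		ORDER BY c.column_id`

	rows, err := db.QueryContext(ctx, q, fmt.Sprintf("[%s].[%s]", schema, table))
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	var cols []*ColumnType
	for rows.Next() {
		c := &ColumnType{}
		if err := rows.Scan(&c.name, &c.userType, &c.nullable,
			&c.maxLength, &c.precision, &c.scale); err != nil {
			return nil, err
		}
		c.hasMaxLength = c.maxLength > 0 // sys.columns reports -1 for (max) types
		c.hasPrecisionScale = c.precision > 0
		cols = append(cols, c)
	}
	return cols, rows.Err()
}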
scripts/mssql-copy-in/utils.go (new file, 81 lines)
@@ -0,0 +1,81 @@
package main

import (
	"context"
	"database/sql"
	"fmt"
	"time"

	"git.ksdemosapps.com/kylesoda/go-migrate/internal/app/config"
	log "github.com/sirupsen/logrus"
)

func connectToSqlServer() (*sql.DB, error) {
	db, err := sql.Open("sqlserver", config.App.SourceDbUrl)
	if err != nil {
		return nil, fmt.Errorf("unable to connect to sqlserver: %w", err)
	}

	// Verify the connection is actually usable before returning it.
	ctx, cancel := context.WithTimeout(context.Background(), 20*time.Second)
	defer cancel()

	if err := db.PingContext(ctx); err != nil {
		return nil, fmt.Errorf("unable to ping sqlserver: %w", err)
	}

	return db, nil
}

// Map applies mapper to every element of input and returns the results.
func Map[T any, V any](input []T, mapper func(T) V) []V {
	result := make([]V, len(input))

	for i, v := range input {
		result[i] = mapper(v)
	}

	return result
}
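
For example, Map can project column metadata into the plain name list that mssql.CopyIn expects (illustrative values, not part of this commit):

cols := []*ColumnType{{name: "GDB_ARCHIVE_OID"}, {name: "NOMBRE"}}
names := Map(cols, func(c *ColumnType) string { return c.Name() })
// names == []string{"GDB_ARCHIVE_OID", "NOMBRE"}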

func logManzanaSampleRow(id int, rowValues UnknownRowValues) {
	// Each column is passed twice so the format string can print both
	// its Go type (%T) and its value (%v).
	log.Infof(`
	Sample row #%d:
	GDB_ARCHIVE_OID (%T): %v
	ID_MANZANA (%T): %v
	ID_DISTRITO (%T): %v
	NOMBRE (%T): %v
	CODIGO (%T): %v
	CANTIDAD_TOTAL (%T): %v
	OCUPACION_RESIDENCIAL (%T): %v
	OCUPACION_NEGOCIO (%T): %v
	OCUPACION_DEPARTAMENTO (%T): %v
	INDICADOR (%T): %v
	FECHA_ALTA (%T): %v
	FECHA_ACT (%T): %v
	Shape (%T): %v
	GDB_GEOMATTR_DATA (%T): %v
	GlobalID (%T): %v
	GDB_FROM_DATE (%T): %v
	GDB_TO_DATE (%T): %v
	OBJECTID (%T): %v
	`,
		id,
		rowValues[0], rowValues[0],
		rowValues[1], rowValues[1],
		rowValues[2], rowValues[2],
		rowValues[3], rowValues[3],
		rowValues[4], rowValues[4],
		rowValues[5], rowValues[5],
		rowValues[6], rowValues[6],
		rowValues[7], rowValues[7],
		rowValues[8], rowValues[8],
		rowValues[9], rowValues[9],
		rowValues[10], rowValues[10],
		rowValues[11], rowValues[11],
		rowValues[12], rowValues[12],
		rowValues[13], rowValues[13],
		rowValues[14], rowValues[14],
		rowValues[15], rowValues[15],
		rowValues[16], rowValues[16],
		rowValues[17], rowValues[17],
	)
}