feat: implement data generation and loading for MANZANA with improved structure and logging

This commit is contained in:
2026-04-08 09:57:11 -05:00
parent 1e2a37e59f
commit 4434054b21
7 changed files with 287 additions and 259 deletions

View File

@@ -4,16 +4,11 @@ import (
"context"
"database/sql"
"errors"
"fmt"
"math/rand"
"sync"
"time"

"github.com/gaspardle/go-mssqlclrgeo"
"github.com/google/uuid"
mssql "github.com/microsoft/go-mssqldb"
log "github.com/sirupsen/logrus"
"github.com/twpayne/go-geom"
"github.com/twpayne/go-geom/encoding/wkb"
)
const (
@@ -41,53 +36,13 @@ func main() {
ctx := context.Background()
maxOid, err := getMaxGDBArchiveOid(ctx, db)
if err != nil {
log.Fatal("Error getting max GDB_ARCHIVE_OID: ", err)
}
var wgSeed sync.WaitGroup
log.Infof("Starting data generation from GDB_ARCHIVE_OID: %d", maxOid+1)
rowsChan := make(chan []UnknownRowValues, queueSize)
var wgRowGenerator sync.WaitGroup
wgRowGenerator.Go(func() {
generateManzanaRows(ctx, maxOid, totalRows, chunkSize, rowsChan)
wgSeed.Go(func() {
seedManzanas(ctx, db)
})
columns := []string{
"GDB_ARCHIVE_OID",
"ID_MANZANA",
"ID_DISTRITO",
"NOMBRE",
"CODIGO",
"CANTIDAD_TOTAL",
"OCUPACION_RESIDENCIAL",
"OCUPACION_NEGOCIO",
"OCUPACION_DEPARTAMENTO",
"INDICADOR",
"FECHA_ALTA",
"FECHA_ACT",
"Shape",
"GDB_GEOMATTR_DATA",
"GlobalID",
"GDB_FROM_DATE",
"GDB_TO_DATE",
"OBJECTID",
}
job := MigrationJob{
Schema: schema,
Table: table,
}
if err := loadRowsMssql(ctx, job, columns, db, rowsChan); err != nil {
log.Fatal("Error loading rows: ", err)
}
log.Info("Data generation and loading completed successfully")
wgRowGenerator.Wait()
wgSeed.Wait()
}
func loadRowsMssql(ctx context.Context, job MigrationJob, colNames []string, db *sql.DB, in <-chan []UnknownRowValues) error {
@@ -151,162 +106,3 @@ func loadRowsMssql(ctx context.Context, job MigrationJob, colNames []string, db
return nil
}
// generateRandomPolygonWKB builds a small axis-aligned square at a random
// location and returns it encoded as little-endian (NDR) WKB. The square is
// 0.01 degrees on a side and its origin is drawn uniformly from [-90, 90)
// on both axes; the ring is closed by repeating the first vertex.
func generateRandomPolygonWKB() []byte {
	const side = 0.01
	originX := rand.Float64()*180 - 90
	originY := rand.Float64()*180 - 90
	ring := []geom.Coord{
		{originX, originY},
		{originX + side, originY},
		{originX + side, originY + side},
		{originX, originY + side},
		{originX, originY}, // close the ring
	}
	square := geom.NewPolygon(geom.XY).MustSetCoords([][]geom.Coord{ring})
	// Marshalling a well-formed XY polygon cannot fail, so the error is ignored.
	encoded, _ := wkb.Marshal(square, wkb.NDR)
	return encoded
}
// getMaxGDBArchiveOid returns the current maximum GDB_ARCHIVE_OID in the
// target table, or 0 when the table is empty (MAX over zero rows is NULL,
// which ISNULL folds to 0 server-side).
//
// schema and table are package-level constants, so building the query with
// Sprintf does not create a SQL-injection surface.
func getMaxGDBArchiveOid(ctx context.Context, db *sql.DB) (int, error) {
	query := fmt.Sprintf(`
	SELECT ISNULL(MAX(GDB_ARCHIVE_OID), 0)
	FROM [%s].[%s]
	`, schema, table)

	var maxOid sql.NullInt64
	err := db.QueryRowContext(ctx, query).Scan(&maxOid)
	// errors.Is (rather than ==) also matches wrapped errors; ErrNoRows is
	// treated the same as a NULL result: an empty table.
	if err != nil && !errors.Is(err, sql.ErrNoRows) {
		return 0, fmt.Errorf("querying max GDB_ARCHIVE_OID from [%s].[%s]: %w", schema, table, err)
	}
	if !maxOid.Valid {
		// No usable value means the table has no data yet.
		return 0, nil
	}
	return int(maxOid.Int64), nil
}
// generateManzanaRows produces totalRows synthetic MANZANA rows, batching
// them into slices of chunkSize and pushing each batch onto out. Archive
// OIDs are assigned sequentially starting at startOid+1. The output channel
// is always closed on return, and a sample row is logged every 100,000 rows
// for progress visibility. Cancellation via ctx stops generation at the next
// channel send.
func generateManzanaRows(
	ctx context.Context,
	startOid int,
	totalRows int,
	chunkSize int,
	out chan<- []UnknownRowValues,
) {
	defer close(out)

	produced := 0
	batch := make([]UnknownRowValues, 0, chunkSize)
	for i := range totalRows {
		row := generateManzanaRow(startOid + i + 1)
		batch = append(batch, row)
		produced++

		if len(batch) == chunkSize {
			select {
			case out <- batch:
				log.Debugf("Sent chunk with %d rows", len(batch))
			case <-ctx.Done():
				log.Info("Context cancelled, stopping row generation")
				return
			}
			// Start a fresh batch; the sent one is now owned by the consumer.
			batch = make([]UnknownRowValues, 0, chunkSize)
		}

		if produced%100_000 == 0 {
			logManzanaSampleRow(produced, row)
		}
	}

	// Flush any partially filled batch left over after the loop.
	if len(batch) > 0 {
		select {
		case out <- batch:
			log.Debugf("Sent final chunk with %d rows", len(batch))
		case <-ctx.Done():
			log.Info("Context cancelled, stopping row generation")
		}
	}
	log.Infof("Finished generating %d rows", produced)
}
// generateManzanaRow builds one synthetic MANZANA row for the given archive
// OID. The OID doubles as ID_MANZANA and OBJECTID; the remaining columns are
// filled with random identifiers, counters, and timestamps, and the geometry
// is a random square polygon converted to SQL Server's UDT wire format.
func generateManzanaRow(gdbArchiveOid int) UnknownRowValues {
	// Fixed window for the random FECHA_ALTA / FECHA_ACT values. time.Date
	// replaces the original per-call time.Parse on constant strings, which
	// silently discarded its error and re-parsed on every row.
	dateLowerLimit := time.Date(2020, time.December, 31, 23, 59, 59, 0, time.UTC)
	dateUpperLimit := time.Date(2025, time.December, 31, 23, 59, 59, 0, time.UTC)

	rowID := gdbArchiveOid
	distrito := fmt.Sprintf("D%d", rand.Intn(100))
	nombre := generateRandomString(15)
	codigo := generateRandomString(15)
	cantidadTotal := rand.Intn(1000)
	ocupacionResidencial := rand.Intn(1000)
	ocupacionNegocio := rand.Intn(1000)
	ocupacionDepartamento := rand.Intn(1000)
	indicador := rand.Intn(10000)
	fechaAlta := generateRandomTimestamp(dateLowerLimit, dateUpperLimit)
	fechaAct := generateRandomTimestamp(dateLowerLimit, dateUpperLimit)
	shapeWKB := generateRandomPolygonWKB()
	geoData := []byte{}
	id := uuid.New()
	globalID := id[:]
	gdbFromDate := fechaAct
	// Sentinel "end of time" used by geodatabase archiving for current rows.
	gdbToDate := time.Date(9999, time.December, 31, 23, 59, 59, 0, time.UTC)
	objectID := gdbArchiveOid

	// Convert the WKB polygon to SQL Server's geometry UDT format; fall back
	// to an empty payload so one bad geometry doesn't abort the whole run.
	shapeMssql, err := mssqlclrgeo.WkbToUdtGeo(shapeWKB, false)
	if err != nil {
		log.Errorf("Error convirtiendo WKB a formato MSSQL: %v", err)
		shapeMssql = []byte{}
	}

	// Field order must match the columns slice used by the bulk loader.
	return UnknownRowValues{
		gdbArchiveOid,
		rowID,
		distrito,
		nombre,
		codigo,
		cantidadTotal,
		ocupacionResidencial,
		ocupacionNegocio,
		ocupacionDepartamento,
		indicador,
		fechaAlta,
		fechaAct,
		shapeMssql,
		geoData,
		globalID,
		gdbFromDate,
		gdbToDate,
		objectID,
	}
}
func generateRandomTimestamp(min, max time.Time) time.Time {
minUnix := min.Unix()
maxUnix := max.Unix()
delta := maxUnix - minUnix
secAleatorios := rand.Int63n(delta)
return time.Unix(minUnix+secAleatorios, 0)
}
// generateRandomString returns a random string of uppercase letters and
// digits whose length is uniformly distributed in [1, maxLength].
//
// A maxLength <= 0 yields the empty string: rand.Intn panics for
// non-positive arguments, so the unguarded original crashed on that input.
func generateRandomString(maxLength int) string {
	const charset = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
	if maxLength <= 0 {
		return ""
	}
	// rand.Intn(maxLength) is in [0, maxLength), so +1 is already capped at
	// maxLength; the original's extra min(...) clamp was redundant.
	length := rand.Intn(maxLength) + 1
	b := make([]byte, length)
	for i := range b {
		b[i] = charset[rand.Intn(len(charset))]
	}
	return string(b)
}