refactor: simplify batch processing by removing partition dependency and introducing batch accumulator

2026-05-11 00:38:42 -05:00
parent 16217f6ee2
commit ab9a2d8694
5 changed files with 81 additions and 76 deletions
--- a/internal/app/etl/transformers/consume.go
+++ b/internal/app/etl/transformers/consume.go
@@ -11,6 +11,58 @@ import (
 	"github.com/google/uuid"
 )

+// batchAccumulator collects rows from incoming batches until enough have been
+// gathered to emit them together as one larger batch.
+type batchAccumulator struct {
+	// batchSize is the pending-row count at which ready starts reporting true.
+	batchSize int
+
+	// rows holds every row appended by add since the last successful flush.
+	rows []models.UnknownRowValues
+
+	// parents holds one BatchRef per batch passed to add since the last
+	// successful flush.
+	parents []models.BatchRef
+}
+
+// add folds one incoming batch into the accumulator: a BatchRef carrying the
+// batch's Id is recorded as a parent and the batch's rows are appended to the
+// pending rows.
+func (a *batchAccumulator) add(batch models.Batch) {
+	ref := models.BatchRef{Id: batch.Id}
+	a.parents = append(a.parents, ref)
+	a.rows = append(a.rows, batch.Rows...)
+}
+
+// ready reports whether the number of pending rows has reached batchSize.
+func (a *batchAccumulator) ready() bool {
+	pending := len(a.rows)
+	return pending >= a.batchSize
+}
+
+// flush emits everything accumulated so far as a single new batch on chOut.
+//
+// It returns true when there were no pending rows or the batch was delivered,
+// and false when ctx was done before chOut accepted the batch. wg is
+// incremented before the send is attempted and decremented again if the send
+// is abandoned. After a successful send the accumulator is emptied; when false
+// is returned its contents are left untouched.
+func (a *batchAccumulator) flush(ctx context.Context, chOut chan<- models.Batch, wg *sync.WaitGroup) bool {
+	if len(a.rows) == 0 {
+		// No pending rows: no batch is produced and wg is not touched.
+		return true
+	}
+	merged := models.Batch{Id: uuid.New(), ParentBatches: a.parents, Rows: a.rows}
+	wg.Add(1)
+	select {
+	case <-ctx.Done():
+		// Undo the Add above: the batch never reached chOut.
+		wg.Done()
+		return false
+	case chOut <- merged:
+		a.rows, a.parents = nil, nil
+		return true
+	}
+}
+
+func sendTransformError(ctx context.Context, err error, ch chan<- custom_errors.JobError) {
+	if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
+		return
+	}
+	var jobErr custom_errors.JobError
+	if je, ok := errors.AsType[*custom_errors.JobError](err); ok {
+		jobErr = *je
+	} else {
+		jobErr = custom_errors.JobError{ShouldCancelJob: true, Msg: "Transformation failed", Prev: err}
+	}
+	select {
+	case ch <- jobErr:
+	case <-ctx.Done():
+	}
+}
+
 func (mssqlTr *MssqlTransformer) Consume(
 	ctx context.Context,
 	columns []models.ColumnType,
@@ -25,90 +77,40 @@ func (mssqlTr *MssqlTransformer) Consume(
 	storagePlan := computeStorageTransformationPlan(ctx, mssqlTr.azureClient, mssqlTr.toStorage, columns, mssqlTr.sourceTable)
 	transformationPlan = append(transformationPlan, storagePlan...)

-	var accRows []models.UnknownRowValues
-	var parentBatchesId []uuid.UUID
-	var firstPartitionId uuid.UUID
-
-	flush := func() bool {
-		if len(accRows) == 0 {
-			return true
-		}
-		out := models.Batch{
-			Id:              uuid.New(),
-			PartitionId:     firstPartitionId,
-			ParentBatchesId: parentBatchesId,
-			Rows:            accRows,
-		}
-		select {
-		case chBatchesOut <- out:
-			wgActiveBatches.Add(1)
-		case <-ctx.Done():
-			return false
-		}
-		accRows = nil
-		parentBatchesId = nil
-		firstPartitionId = uuid.Nil
-		return true
-	}
+	acc := &batchAccumulator{batchSize: batchSize}

 	for {
-		if ctx.Err() != nil {
-			return
-		}
-
 		select {
 		case <-ctx.Done():
 			return

 		case batch, ok := <-chBatchesIn:
 			if !ok {
-				flush()
+				acc.flush(ctx, chBatchesOut, wgActiveBatches)
 				return
 			}

 			if len(transformationPlan) > 0 {
-				err := ProcessBatchWithRetries(ctx, &batch, transformationPlan, retryConfig)
-				if err != nil {
-					if errors.Is(err, ctx.Err()) {
-						return
-					}
-
-					if jobError, ok := errors.AsType[*custom_errors.JobError](err); ok {
-						select {
-						case chJobErrorsOut <- *jobError:
-						case <-ctx.Done():
-							return
-						}
-					} else {
-						select {
-						case chJobErrorsOut <- custom_errors.JobError{ShouldCancelJob: true, Msg: "Transformation failed", Prev: err}:
-						case <-ctx.Done():
-							return
-						}
-					}
-
+				if err := ProcessBatchWithRetries(ctx, &batch, transformationPlan, retryConfig); err != nil {
+					sendTransformError(ctx, err, chJobErrorsOut)
 					return
 				}
 			}

 			if batchSize <= 0 {
+				wgActiveBatches.Add(1)
 				select {
 				case chBatchesOut <- batch:
-					wgActiveBatches.Add(1)
 				case <-ctx.Done():
+					wgActiveBatches.Done()
 					return
 				}
 				continue
 			}

-			if len(parentBatchesId) == 0 {
-				firstPartitionId = batch.PartitionId
-			}
-			accRows = append(accRows, batch.Rows...)
-			parentBatchesId = append(parentBatchesId, batch.Id)
-
-			if len(accRows) >= batchSize {
-				if !flush() {
+			acc.add(batch)
+			if acc.ready() {
+				if !acc.flush(ctx, chBatchesOut, wgActiveBatches) {
 					return
 				}
 			}