refactor: simplify batch processing by removing partition dependency and introducing batch accumulator
This commit is contained in:
@@ -11,6 +11,58 @@ import (
|
||||
"github.com/google/uuid"
|
||||
)
|
||||
|
||||
type batchAccumulator struct {
|
||||
batchSize int
|
||||
rows []models.UnknownRowValues
|
||||
parents []models.BatchRef
|
||||
}
|
||||
|
||||
func (a *batchAccumulator) add(batch models.Batch) {
|
||||
a.rows = append(a.rows, batch.Rows...)
|
||||
a.parents = append(a.parents, models.BatchRef{Id: batch.Id})
|
||||
}
|
||||
|
||||
func (a *batchAccumulator) ready() bool {
|
||||
return len(a.rows) >= a.batchSize
|
||||
}
|
||||
|
||||
func (a *batchAccumulator) flush(ctx context.Context, chOut chan<- models.Batch, wg *sync.WaitGroup) bool {
|
||||
if len(a.rows) == 0 {
|
||||
return true
|
||||
}
|
||||
out := models.Batch{
|
||||
Id: uuid.New(),
|
||||
ParentBatches: a.parents,
|
||||
Rows: a.rows,
|
||||
}
|
||||
wg.Add(1)
|
||||
select {
|
||||
case chOut <- out:
|
||||
case <-ctx.Done():
|
||||
wg.Done()
|
||||
return false
|
||||
}
|
||||
a.rows = nil
|
||||
a.parents = nil
|
||||
return true
|
||||
}
|
||||
|
||||
func sendTransformError(ctx context.Context, err error, ch chan<- custom_errors.JobError) {
|
||||
if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
|
||||
return
|
||||
}
|
||||
var jobErr custom_errors.JobError
|
||||
if je, ok := errors.AsType[*custom_errors.JobError](err); ok {
|
||||
jobErr = *je
|
||||
} else {
|
||||
jobErr = custom_errors.JobError{ShouldCancelJob: true, Msg: "Transformation failed", Prev: err}
|
||||
}
|
||||
select {
|
||||
case ch <- jobErr:
|
||||
case <-ctx.Done():
|
||||
}
|
||||
}
|
||||
|
||||
func (mssqlTr *MssqlTransformer) Consume(
|
||||
ctx context.Context,
|
||||
columns []models.ColumnType,
|
||||
@@ -25,90 +77,40 @@ func (mssqlTr *MssqlTransformer) Consume(
|
||||
storagePlan := computeStorageTransformationPlan(ctx, mssqlTr.azureClient, mssqlTr.toStorage, columns, mssqlTr.sourceTable)
|
||||
transformationPlan = append(transformationPlan, storagePlan...)
|
||||
|
||||
var accRows []models.UnknownRowValues
|
||||
var parentBatchesId []uuid.UUID
|
||||
var firstPartitionId uuid.UUID
|
||||
|
||||
flush := func() bool {
|
||||
if len(accRows) == 0 {
|
||||
return true
|
||||
}
|
||||
out := models.Batch{
|
||||
Id: uuid.New(),
|
||||
PartitionId: firstPartitionId,
|
||||
ParentBatchesId: parentBatchesId,
|
||||
Rows: accRows,
|
||||
}
|
||||
select {
|
||||
case chBatchesOut <- out:
|
||||
wgActiveBatches.Add(1)
|
||||
case <-ctx.Done():
|
||||
return false
|
||||
}
|
||||
accRows = nil
|
||||
parentBatchesId = nil
|
||||
firstPartitionId = uuid.Nil
|
||||
return true
|
||||
}
|
||||
acc := &batchAccumulator{batchSize: batchSize}
|
||||
|
||||
for {
|
||||
if ctx.Err() != nil {
|
||||
return
|
||||
}
|
||||
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
|
||||
case batch, ok := <-chBatchesIn:
|
||||
if !ok {
|
||||
flush()
|
||||
acc.flush(ctx, chBatchesOut, wgActiveBatches)
|
||||
return
|
||||
}
|
||||
|
||||
if len(transformationPlan) > 0 {
|
||||
err := ProcessBatchWithRetries(ctx, &batch, transformationPlan, retryConfig)
|
||||
if err != nil {
|
||||
if errors.Is(err, ctx.Err()) {
|
||||
return
|
||||
}
|
||||
|
||||
if jobError, ok := errors.AsType[*custom_errors.JobError](err); ok {
|
||||
select {
|
||||
case chJobErrorsOut <- *jobError:
|
||||
case <-ctx.Done():
|
||||
return
|
||||
}
|
||||
} else {
|
||||
select {
|
||||
case chJobErrorsOut <- custom_errors.JobError{ShouldCancelJob: true, Msg: "Transformation failed", Prev: err}:
|
||||
case <-ctx.Done():
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
if err := ProcessBatchWithRetries(ctx, &batch, transformationPlan, retryConfig); err != nil {
|
||||
sendTransformError(ctx, err, chJobErrorsOut)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
if batchSize <= 0 {
|
||||
wgActiveBatches.Add(1)
|
||||
select {
|
||||
case chBatchesOut <- batch:
|
||||
wgActiveBatches.Add(1)
|
||||
case <-ctx.Done():
|
||||
wgActiveBatches.Done()
|
||||
return
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
if len(parentBatchesId) == 0 {
|
||||
firstPartitionId = batch.PartitionId
|
||||
}
|
||||
accRows = append(accRows, batch.Rows...)
|
||||
parentBatchesId = append(parentBatchesId, batch.Id)
|
||||
|
||||
if len(accRows) >= batchSize {
|
||||
if !flush() {
|
||||
acc.add(batch)
|
||||
if acc.ready() {
|
||||
if !acc.flush(ctx, chBatchesOut, wgActiveBatches) {
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user