spring-projects/spring-batch

AggregationMongoItemReader to perform MongoDB aggregations

GaetanoMar96 opened this issue · 0 comments

I saw that there is an existing issue related to this feature (#3666), but it was opened in 2020 and there has been no activity since then, so I decided to open a new one.

Expected Behavior

A MongoItemReader that can execute aggregations through the aggregation framework. The idea is to use this reader whenever data from MongoDB collections needs to be aggregated during the read phase.
I have attached the reader together with an example of its usage. You will also find the example collections (in txt format, to be converted to js).

import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import lombok.Setter;
import org.springframework.batch.item.data.MongoItemReader;
import org.springframework.data.mongodb.core.MongoOperations;
import org.springframework.data.mongodb.core.aggregation.Aggregation;
import org.springframework.data.mongodb.core.aggregation.AggregationOperation;
import org.springframework.data.mongodb.core.aggregation.AggregationResults;

@Setter
public class AggregationMongoItemReader<T> extends MongoItemReader<T> {

    private MongoOperations mongoTemplate;
    private Aggregation aggregation;
    private Class<T> classType;
    private String collection;
    private int pageSize = 5; // Should match the step's chunk size to avoid reprocessing?
    private final AtomicInteger currentPage = new AtomicInteger(0); // AtomicInteger already provides thread-safe access

    @Override
    protected Iterator<T> doPageRead() {

        // Each page maps to a $skip/$limit pair appended to the user-supplied pipeline
        int skip = currentPage.getAndIncrement() * pageSize;

        List<AggregationOperation> stages = new ArrayList<>(aggregation.getPipeline().getOperations());
        stages.add(Aggregation.skip((long) skip));
        stages.add(Aggregation.limit(pageSize));
        Aggregation limitedAggregation = Aggregation.newAggregation(stages);

        // Run the paged pipeline and map the results to the target type
        AggregationResults<T> results = mongoTemplate.aggregate(limitedAggregation, collection, classType);
        List<T> mappedResults = results.getMappedResults();

        if (mappedResults.isEmpty()) {
            return Collections.emptyIterator();
        }

        return mappedResults.iterator();
    }
}
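
One caveat with this approach: every page re-runs the whole pipeline with $skip/$limit appended, so pages only line up correctly if the pipeline produces a deterministic order. A minimal sketch of guarding against that by appending a $sort stage before the paging stages (the sort key is a hypothetical choice that assumes the pipeline output still carries an _id field):

import org.springframework.data.domain.Sort;

// Inside doPageRead(), before $skip/$limit: pin a stable order so consecutive
// executions of the pipeline cannot shuffle documents across pages.
stages.add(Aggregation.sort(Sort.by(Sort.Direction.ASC, "_id")));
stages.add(Aggregation.skip((long) skip));
stages.add(Aggregation.limit(pageSize));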

Example of usage
collections_mongo.txt

import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.batch.item.ItemReader;
import org.springframework.batch.item.data.MongoItemReader;
import org.springframework.batch.item.support.SynchronizedItemStreamReader;
import org.springframework.data.mongodb.core.MongoTemplate;
import org.springframework.data.mongodb.core.aggregation.Aggregation;
import org.springframework.data.mongodb.core.aggregation.ArrayOperators;
import org.springframework.data.mongodb.core.aggregation.ComparisonOperators;
import org.springframework.data.mongodb.core.aggregation.ConditionalOperators;
import org.springframework.data.mongodb.core.query.Criteria;
import org.springframework.stereotype.Component;

@Component
@Slf4j
@RequiredArgsConstructor
public class CertificationsReader {

    private final MongoTemplate mongoTemplate;

    public ItemReader<Certification> certificationReader(String certificationType, String ledger) {
        SynchronizedItemStreamReader<Certification> synchronizedItemStreamReader = new SynchronizedItemStreamReader<>();
        MongoItemReader<Certification> itemReader = getCertification(certificationType, ledger);
        synchronizedItemStreamReader.setDelegate(itemReader);
        return synchronizedItemStreamReader;
    }

    private MongoItemReader<Certification> getCertification(String certificationType, String ledger) {
        AggregationMongoItemReader<Certification> reader = new AggregationMongoItemReader<>();

        Aggregation aggregation = Aggregation.newAggregation(
            // $lookup stage
            Aggregation.lookup("certifications", "idCertification", "_id", "certification"),
            // $addFields stage
            Aggregation.addFields()
                .addField("certification")
                .withValueOf(ArrayOperators.ArrayElemAt.arrayOf("$certification").elementAt(0))
                .build(),
            // $match stage
            Aggregation.match(
                Criteria.where("certification.ledger").is(ledger)
                    .and("certification.certificationType").is(certificationType)
                    .and("certification.flagStatus").is("TD")
                    .and("certification.flagLocked").is("0")
            ),
            // $group stage
            Aggregation.group("$idCertification")
                .sum(
                    ConditionalOperators.Cond.when(ComparisonOperators.Eq.valueOf("$certificationResult").equalToValue("OK"))
                        .then(1)
                        .otherwise(0)
                ).as("ok")
                .sum(
                    ConditionalOperators.Cond.when(ComparisonOperators.Eq.valueOf("$certificationResult").equalToValue("KO"))
                        .then(1)
                        .otherwise(0)
                ).as("ko")
                .count().as("total")
                .addToSet("accountId").as("accounts"),
            // $project stage
            Aggregation.project("_id",
                                "ok",
                                "ko",
                                "total",
                                "accounts")
        );

        reader.setMongoTemplate(mongoTemplate);
        reader.setCollection("simulations");
        reader.setAggregation(aggregation);
        reader.setClassType(Certification.class);
        return reader;
    }
}
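
For completeness, here is a minimal sketch of wiring the reader into a chunk-oriented step so the chunk size matches the reader's pageSize (assuming Spring Batch 4.x; the step name, parameter values and writer are hypothetical placeholders):

import org.springframework.batch.core.Step;
import org.springframework.batch.core.configuration.annotation.StepBuilderFactory;
import org.springframework.batch.item.ItemWriter;
import org.springframework.context.annotation.Bean;

@Bean
public Step certificationStep(StepBuilderFactory stepBuilderFactory,
                              CertificationsReader certificationsReader,
                              ItemWriter<Certification> writer) {
    // Chunk size equals the reader's pageSize (5), so one aggregation page
    // feeds exactly one chunk and no page straddles a chunk boundary.
    return stepBuilderFactory.get("certificationStep")
            .<Certification, Certification>chunk(5)
            .reader(certificationsReader.certificationReader("SOME_TYPE", "SOME_LEDGER")) // hypothetical values
            .writer(writer)
            .build();
}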

Current Behavior

Right now, MongoItemReader does not expose the aggregation framework (maybe there is a reason for this that I am not aware of).
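
For context, the existing reader only accepts a query, either as a Query object or as a parameterized JSON string. A minimal sketch of what is possible today, with hypothetical field values:

import java.util.Collections;
import org.springframework.data.domain.Sort;
import org.springframework.data.mongodb.core.query.Criteria;
import org.springframework.data.mongodb.core.query.Query;

MongoItemReader<Certification> reader = new MongoItemReader<>();
reader.setTemplate(mongoTemplate);
reader.setCollection("simulations");
// Query-based reads only; there is no setter that accepts an aggregation pipeline
reader.setQuery(new Query(Criteria.where("certificationType").is("SOME_TYPE")));
reader.setTargetType(Certification.class);
reader.setSort(Collections.singletonMap("_id", Sort.Direction.ASC));
reader.setPageSize(5);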

Context

I needed this feature because I had to merge data from two different collections, and doing this in the read phase made all the following steps (processing and writing) easier, since the data arrive already aggregated and ready to be processed. I still have some concerns related to concurrency and chunk processing.
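
On the concurrency/chunk concern: if I read the base class correctly (assuming Spring Batch 4.x, where AbstractPaginatedDataItemReader keeps a protected page field, increments it after every doPageRead() call, and restores it via jumpToItem() on restart), the private AtomicInteger in my reader duplicates state the framework already manages and would fall out of sync on a restart. A sketch of doPageRead() reusing the inherited counters instead (this assumes the subclass drops its own pageSize and currentPage fields and relies on setPageSize()):

@Override
protected Iterator<T> doPageRead() {
    // 'page' and 'pageSize' are inherited from AbstractPaginatedDataItemReader;
    // the framework increments 'page' after this method returns and recomputes
    // it from the execution context on restart, so no extra counter is needed.
    int skip = page * pageSize;

    List<AggregationOperation> stages = new ArrayList<>(aggregation.getPipeline().getOperations());
    stages.add(Aggregation.skip((long) skip));
    stages.add(Aggregation.limit(pageSize));

    AggregationResults<T> results = mongoTemplate.aggregate(Aggregation.newAggregation(stages), collection, classType);
    // An empty iterator signals the end of data to the framework
    return results.getMappedResults().iterator();
}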