quantumgizmos/ldpc

`BpOsdDecoder` in v2 is slower


To test `BpOsdDecoder` from the ldpc_v2 branch, I sampled and decoded some surface code circuits with sinter.

For the baseline, I used stimbposd.sinter_decoders() (which internally uses ldpc.bposd_decoder from v1) to decode the samples. For v2, aware of #32, I added the SinterCompiledBpOsdDecoder below to use with sinter:

import stim
import numpy as np
import pathlib
from ldpc.bposd_decoder import BpOsdDecoder
import sinter
from beliefmatching import detector_error_model_to_check_matrices


MAX_BP_ITERS = 30
BP_METHOD = "ps"  # product-sum BP
OSD_ORDER = 60


class SinterCompiledBpOsdDecoder(sinter.CompiledDecoder):
    def __init__(self, bposd: BpOsdDecoder, num_dets: int, observables_matrix: np.ndarray):
        self.bposd = bposd
        self.num_dets = num_dets
        self.observables_matrix = observables_matrix

    def decode_shots_bit_packed(self, *, bit_packed_detection_event_data: np.ndarray) -> np.ndarray:
        # Unpack sinter's little-endian bit-packed detection events, one row per
        # shot (see the round-trip check after this listing).
        shots = np.unpackbits(bit_packed_detection_event_data, axis=1, count=self.num_dets, bitorder="little")
        # Decode each syndrome to a physical correction, map it to logical
        # observable flips, and repack for sinter.
        corrs = np.apply_along_axis(self.bposd.decode, axis=1, arr=shots)
        predictions = (corrs @ self.observables_matrix.T) % 2
        return np.packbits(predictions, axis=1, bitorder="little")


class SinterBpOsdDecoder(sinter.Decoder):
    def __init__(
        self,
        max_iter=0,
        bp_method="ms",
        ms_scaling_factor=0.625,
        schedule="parallel",
        omp_thread_count=1,
        serial_schedule_order=None,
        osd_method="osd0",
        osd_order=0,
    ):
        self.max_iter = max_iter
        self.bp_method = bp_method
        self.ms_scaling_factor = ms_scaling_factor
        self.schedule = schedule
        self.omp_thread_count = omp_thread_count
        self.serial_schedule_order = serial_schedule_order
        self.osd_method = osd_method
        self.osd_order = osd_order

    def compile_decoder_for_dem(
        self,
        *,
        dem: stim.DetectorErrorModel,
    ) -> sinter.CompiledDecoder:
        check_matrices = detector_error_model_to_check_matrices(dem)
        bposd = BpOsdDecoder(
            check_matrices.check_matrix,
            error_channel=list(check_matrices.priors),
            max_iter=self.max_iter,
            bp_method=self.bp_method,
            ms_scaling_factor=self.ms_scaling_factor,
            schedule=self.schedule,
            omp_thread_count=self.omp_thread_count,
            serial_schedule_order=self.serial_schedule_order,
            osd_method=self.osd_method,
            osd_order=self.osd_order,
        )
        return SinterCompiledBpOsdDecoder(
            bposd, num_dets=dem.num_detectors, observables_matrix=check_matrices.observables_matrix
        )

    def decode_via_files(
        self,
        *,
        num_shots: int,
        num_dets: int,
        num_obs: int,
        dem_path: pathlib.Path,
        dets_b8_in_path: pathlib.Path,
        obs_predictions_b8_out_path: pathlib.Path,
        tmp_dir: pathlib.Path,
    ) -> None:
        self.dem = stim.DetectorErrorModel.from_file(dem_path)
        self.matrices = detector_error_model_to_check_matrices(self.dem)
        self.bposd = BpOsdDecoder(
            self.matrices.check_matrix,
            error_channel=list(self.matrices.priors),
            max_iter=self.max_iter,
            bp_method=self.bp_method,
            ms_scaling_factor=self.ms_scaling_factor,
            schedule=self.schedule,
            omp_thread_count=self.omp_thread_count,
            serial_schedule_order=self.serial_schedule_order,
            osd_method=self.osd_method,
            osd_order=self.osd_order,
        )

        shots = stim.read_shot_data_file(path=dets_b8_in_path, format="b8", num_detectors=num_dets)
        predictions = np.zeros((num_shots, num_obs), dtype=bool)
        # Decode shot by shot; each row of `shots` is one syndrome.
        for i in range(num_shots):
            predictions[i, :] = self.decode(shots[i, :])

        stim.write_shot_data_file(
            data=predictions,
            path=obs_predictions_b8_out_path,
            format="b8",
            num_observables=num_obs,
        )

    def decode(self, syndrome: np.ndarray) -> np.ndarray:
        corr = self.bposd.decode(syndrome)
        return (self.matrices.observables_matrix @ corr) % 2


def bposd_sinter_decoder():
    return {
        "bposdv2": SinterBpOsdDecoder(
            max_iter=MAX_BP_ITERS,
            bp_method=BP_METHOD,
            osd_method="OSD_CS",
            osd_order=OSD_ORDER,
        )
    }
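
As a quick check on the bit-(un)packing convention used in decode_shots_bit_packed (sinter packs detection events little-endian within each byte), here is a standalone round trip:

import numpy as np

dets = np.array([[1, 0, 1, 1, 0]], dtype=np.uint8)     # 1 shot, 5 detectors
packed = np.packbits(dets, axis=1, bitorder="little")  # [[13]], i.e. 0b01101
unpacked = np.unpackbits(packed, axis=1, count=5, bitorder="little")
assert np.array_equal(unpacked, dets)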

The values of MAX_BP_ITERS/BP_METHOD/OSD_ORDER were chosen to match the defaults used by stimbposd.
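
As a sanity check that the v2 wiring works outside of sinter, here is a minimal sketch reusing the classes above (the circuit parameters are illustrative only):

circuit = stim.Circuit.generated(
    "surface_code:rotated_memory_x",
    rounds=3,
    distance=3,
    after_clifford_depolarization=0.01,
)
compiled = bposd_sinter_decoder()["bposdv2"].compile_decoder_for_dem(
    dem=circuit.detector_error_model()
)
dets, _ = circuit.compile_detector_sampler().sample(
    256, separate_observables=True, bit_packed=True
)
predictions = compiled.decode_shots_bit_packed(bit_packed_detection_event_data=dets)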

Then, I ran the sampling with the following script:

import stim
import sinter
import stimbposd

import bposdtest # where I defined the sinter decoder for bposd v2

def generate_example_tasks():
    for p in [0.004, 0.006, 0.008, 0.01, 0.012]:
        for d in [3, 5, 7]:
            circuit = stim.Circuit.generated(
                code_task="surface_code:rotated_memory_x",
                rounds=d,
                distance=d,
                after_clifford_depolarization=p,
                after_reset_flip_probability=p,
                before_measure_flip_probability=p,
                before_round_data_depolarization=p,
            )
            yield sinter.Task(
                circuit=circuit,
                json_metadata={"p": p, "d": d},
            )

if __name__ == "__main__":
    sinter.collect(
        num_workers=120, # I ran the simulation on a server
        max_shots=500_000,
        max_errors=500,
        tasks=generate_example_tasks(),
        # decoders=["bposd"],
        # custom_decoders=stimbposd.sinter_decoders(),
        decoders=["bposd2"],
        custom_decoders=bposdtest.bposd_sinter_decoder(),
        print_progress=True,
        save_resume_filepath="bposd.csv",
    )

I compared the decoding accuracy and the time per shot of the v1 and v2 implementations; here are the results:

[Two figures: logical error rate and decoding time per shot, v1 vs. v2]
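
(The numbers come from sinter's collected stats; a minimal sketch of reading them back from the CSV saved via save_resume_filepath:)

import sinter

for stats in sinter.read_stats_from_csv_files("bposd.csv"):
    print(
        stats.decoder,
        stats.json_metadata,
        stats.errors / stats.shots,   # logical error rate
        stats.seconds / stats.shots,  # time per shot, as reported by sinter
    )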

We can clearly see that for surface codes with distance > 3, the v2 implementation decodes more slowly than v1. I'm not sure whether all conditions/arguments were set identically for v1 and v2 in this benchmark; I might have missed something important.

The environment I used for the test is Ubuntu 22.04 LTS with Python 3.11.4.

Hi. Could you try setting bp_method='ps_log' for the v1 simulations? I think the speeds of the two versions should be more comparable once this change has been made.

The bp_method='ps' setting uses a version of BP where the messages are computed using probability ratios, rather than log probability ratios. This is quicker, as you avoid computing the arctanh(x) function needed for the LLR version. I haven't implemented the optimised version of product-sum for V2, but it is on my todo list.
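
For context, a minimal sketch (not the library's actual implementation) contrasting the two check-node update rules; msgs holds the incoming variable-to-check messages for a single check:

import numpy as np

def check_update_llr(msgs: np.ndarray) -> float:
    # Log-domain product-sum: needs tanh and arctanh on every update.
    return 2.0 * np.arctanh(np.prod(np.tanh(msgs / 2.0)))

def check_update_prob(msgs_p1: np.ndarray) -> float:
    # Probability-domain product-sum: P(parity odd) from the incoming
    # P(bit = 1) values, using only multiplications (no arctanh).
    return 0.5 * (1.0 - np.prod(1.0 - 2.0 * msgs_p1))

The two are equivalent under L = ln((1 - p) / p), since tanh(L / 2) = 1 - 2p.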

Cheers,
Joschka

Thanks for your reply. After I set bp_method='ps_log' for v1, the speeds of the two versions are close.