dblalock/bolt

basic matrix multiply example for C++ api?

troyliu0105 opened this issue · 9 comments

I couldn't find any C++ examples (including a basic matrix multiply) here. Any related code snippet would be very helpful... 😢

template<int M, bool NeedEncodeX>
void _run_bolt_matmul(const RowMatrix& X, const RowMatrix& Q,
ColMatrix centroids, RowMatrix<uint8_t> codes,
RowVector offsets, float scaleby,
ColMatrix<uint8_t> lut_out, RowMatrix<uint16_t> out)
{
auto ncols = X.cols();
auto nqueries = Q.rows();
auto nblocks = X.rows() / 32;
REQUIRE(ncols == Q.cols());
REQUIRE((nblocks * 32) == X.rows());

if (NeedEncodeX) {
    bolt_encode<M>(X.data(), X.rows(), (int)X.cols(), centroids.data(), codes.data());
}

for (int i = 0; i < nqueries; i++) {
    const float* q = Q.row(i).data();
    bolt_lut<M>(q, (int)ncols, centroids.data(), offsets.data(),
                scaleby, lut_out.data());
    bolt_scan<M, true>(codes.data(), lut_out.data(), out.row(i).data(), nblocks);
}

}
///#########################################################################
//##########################################################################
template
void _profile_bolt_matmul(int nrows, int ncols, int nqueries) {
static constexpr int ncodebooks = 2 * M;

// create random data
RowMatrix<float> X(nrows, ncols);
X.setRandom();

// create random queries
RowMatrix<float> Q(nqueries, ncols);
Q.setRandom();

// create random centroids
ColMatrix<float> centroids(ncentroids, ncols);
centroids.setRandom();

// create random codes in [0, 15]
ColMatrix<uint8_t> codes(nrows, ncodebooks);
codes.setRandom();
codes = codes.array() / 16;

// create random / arbitrary offsets and scale factor for luts
RowVector<float> offsets(ncols);
offsets.setRandom();
float scaleby = 3; // arbitrary number

// storage for luts, product
ColMatrix<uint8_t> luts(ncentroids, ncodebooks);
RowMatrix<uint16_t> out(nqueries, nrows);

// time it
std::string msg = string_with_format("bolt<%d> encode=%d matmul %d",
                                     M, false, nqueries);
REPEATED_PROFILE_DIST_COMPUTATION(kNreps, msg, kNtrials,
    out.data(), nrows * nqueries,
    (_run_bolt_matmul<M, false>(X, Q, centroids, codes, offsets, scaleby, luts, out)) );

std::string msg2 = string_with_format("bolt<%d> encode=%d matmul %d",
                                     M, true, nqueries);
REPEATED_PROFILE_DIST_COMPUTATION(kNreps, msg2, kNtrials,
    out.data(), nrows * nqueries,
    (_run_bolt_matmul<M, true>(X, Q, centroids, codes, offsets, scaleby, luts, out)) );

}
//################################################################

template
struct mithral_amm_task {
using traits = mithral_input_type_traits;
using scale_t = typename traits::encoding_scales_type;
using offset_t = typename traits::encoding_offsets_type;
using output_t = typename traits::output_type;
static constexpr int scan_block_nrows = 32;
static constexpr int ncentroids = 16;
static constexpr int nsplits_per_codebook = 4;
static constexpr int max_splitvals = 1 << 4;

mithral_amm_task(int N, int D, int M, int ncodebooks,
                 float lut_work_const):
    N_padded(N % scan_block_nrows == 0 ? N :
        N + (scan_block_nrows - (N % scan_block_nrows))),
    centroids(ncentroids * ncodebooks, D),
    nsplits(ncodebooks * nsplits_per_codebook),
    splitdims(nsplits),
    splitvals(max_splitvals, nsplits),
    encode_scales(nsplits),
    encode_offsets(nsplits),
    nnz_per_centroid(lut_work_const > 0 ?
        lut_work_const * D / ncodebooks : D),
    idxs(ncodebooks, nnz_per_centroid),
    amm(N_padded, D, M, ncodebooks, centroids.data(),
        splitdims.data(), splitvals.data(),
        encode_scales.data(), encode_offsets.data(),
        idxs.data(), nnz_per_centroid),
    X(N_padded, D),
    Q(D, M)
{
    centroids.setRandom();
    splitdims.setRandom();
    for (int i = 0; i < splitdims.size(); i++) {
        splitdims(i) = splitdims(i) % D;
    }
    splitvals.setRandom();
    encode_scales.setRandom();
    encode_offsets.setRandom();

    // randomly initialize idxs, ensuring all are unique and < D
    idxs.setRandom();
    int all_idxs[D];
    for (int i = 0; i < D; i++) {
        all_idxs[i] = i;
    }
    std::random_device rd;
    std::mt19937 g(rd());  // why can't shuffle just create its own...
    for (int c = 0; c < ncodebooks; c++) {  // random sequential idxs
        std::shuffle(all_idxs, all_idxs + D, g);
        std::sort(all_idxs, all_idxs + nnz_per_centroid);
        for (int j = 0; j < nnz_per_centroid; j++) {
            idxs(c, j) = all_idxs[j];
        }
    }

    X.setRandom();
    Q.setRandom();
}

void encode() { amm.encode(X.data()); }
void lut() { amm.lut(Q.data()); }
void scan() { amm.scan(); }

void run_matmul(bool create_lut=true) {
    encode();
    if (create_lut) {
        lut();
    }
    scan();
}

const ColMatrix<output_t>& output() const { return amm.out_mat; }

// stuff we pass into the amm object (would be learned during training)
int N_padded;
ColMatrix<float> centroids;
int nsplits;
RowVector<uint32_t> splitdims;
ColMatrix<int8_t> splitvals;
RowVector<scale_t> encode_scales;
RowVector<offset_t> encode_offsets;
int nnz_per_centroid;
RowMatrix<int> idxs;

// amm object
mithral_amm<InputT> amm;

// random data
ColMatrix<InputT> X;
ColMatrix<float> Q;

};
//#################################################################
// Excerpt of mithral_amm_task::run_matmul: encode the data, rebuild the
// lookup tables unless the caller opts out, then scan.
void run_matmul(bool create_lut=true) {
    encode();
    if (create_lut) { lut(); }
    scan();
}

@dumpinfo Thx for your reply. I'm going to test it. 😆

Hello, how do I use this test case? I have compiled the source with `bazel run :main`. What should I do afterwards? Could you tell me more about it?

To run any of the MADDNESS code, use cpp/test/main.cpp. I used catch tests for everything because it gives you a nice CLI and makes debugging easy.

You can run it with no arguments, but you probably want to pass some tags for which tests to run--the full suite takes a while. You probably want something like [amm] ~[old] to just run the MADDNESS tests and exclude legacy / debugging tests.

Hi @dumpinfo, @troyliu0105, it seems like you have worked with mithral in its C++ implementation. Could you find the files where the parameter optimization takes place? Running run_matmul() in mithral_amm_task() obviously does not work, since the parameters necessary for the encoding and LUT creation are set randomly. Or does one have to provide the learning phase oneself?

@fjrdev: there is no learning available in the C++ API, only the python implementation. So you'd have to generate the split thresholds, split values, and prototypes in python and pass them to C++. Which this repo doesn't support doing yet (and I likely won't add it in the foreseeable future because I wrote this code 2.5y ago and am super busy with work + family these days).

implemented in "python/clusterize.py"

image

@dumpinfo have you managed to port the python learning code to c++ or at least were able to extract the learnt parameters into the c++ codebase?