
gasp

Global Address SPace toolbox.

Description

gasp provides the Dtree distributed dynamic scheduler and a fast, minimal global arrays implementation with an interface based on GA. Both components share the common setup and teardown sketched after the feature list below.

  • MPI-2 asynchronous communication and MPI-3 one-sided RMA
  • C11/Linux; tested on Cray machines and Intel clusters
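
Every gasp program follows the same setup and teardown, shown here as orientation. This skeleton uses only calls that appear in the full examples under Usage:

#include "gasp.h"

int main(int argc, char **argv)
{
    gasp_t *g;

    /* initialize gasp */
    gasp_init(argc, argv, &g);

    /* this rank's index and the total number of ranks */
    int grank = gasp_rank();
    int ngranks = gasp_nranks();

    /* ... use Dtree and/or global arrays here ... */

    /* shut down gasp */
    gasp_shutdown(g);
    return 0;
}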

Usage

Dtree:

See the Dtree paper for details on the scheduler parameters; see test/dtreetest.c for a complete example.

#include <omp.h>
#include "gasp.h"

gasp_t *g;
int grank, ngranks;

int main(int argc, char **argv)
{
    /* initialize gasp */
    gasp_init(argc, argv, &g);
    grank = gasp_rank();
    ngranks = gasp_nranks();

    /* required scheduler parameters */
    int64_t num_work_items = 50000;
    int min_distrib = (int)(omp_get_max_threads() * 1.5);
    double first = 0.4;

    /* parameters that usually don't need changing */
    int fan_out = 1024;
    int can_parent = 1;
    int parents_work = 1;
    double grank_mul = 1.0;
    double rest = 0.4;

    /* create the scheduler */
    dtree_t *scheduler;
    int is_parent;
    dtree_create(g, fan_out, num_work_items, can_parent, parents_work,
            grank_mul, omp_get_max_threads(), omp_get_thread_num,
            first, rest, min_distrib, &scheduler, &is_parent);

    /* get initial work allocation */
    int64_t volatile num_items;
    int64_t cur_item, last_item;

    num_items = dtree_initwork(scheduler, &cur_item, &last_item);
    int run_dtree = dtree_run(scheduler);

    int64_t volatile lock __attribute__((aligned(8)));
    lock_init(lock);
    int64_t wi_done = num_items;    /* total work items claimed by this rank */

    /* work loop */
    #pragma omp parallel
    {
        int tnum = omp_get_thread_num();

        /* dedicate the last thread on parent ranks to driving the Dtree */
        if (is_parent && run_dtree && tnum == omp_get_num_threads() - 1) {
            while (dtree_run(scheduler))
                cpu_pause();
        }
        else {
            int64_t item;

            while (num_items) {
                lock_acquire(lock);
                if (last_item == 0) {
                    lock_release(lock);
                    break;
                }
                if (cur_item == last_item) {
                    num_items = dtree_getwork(scheduler, &cur_item, &last_item);
                    wi_done += num_items;
                    lock_release(lock);
                    continue;
                }
                item = cur_item++;
                lock_release(lock);

                /* process `item` here */
            }
        }
    }

    /* clean up */
    dtree_destroy(scheduler);

    gasp_shutdown(g);
    return 0;
}
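
In this example, every rank executes main(); on ranks that act as Dtree parents, the highest-numbered OpenMP thread runs the scheduler (dtree_run) while the remaining threads claim items from the locally cached [cur_item, last_item) range under the lock, refilling it with dtree_getwork when it runs dry. Since gasp is built on MPI, the program is launched like any MPI program, e.g. mpiexec -n 8 ./dtreetest; the launcher name and flags depend on your MPI installation.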
Global arrays:

See the Global Arrays documentation for the PGAS model and concepts. See test/garraytest.c for an example.

#include "gasp.h"

gasp_t *g;
int grank, ngranks;

typedef struct aelem {
    int64_t a, b;
} aelem_t;

int main(int argc, char **argv)
{
    /* initialize gasp */
    gasp_init(argc, argv, &g);
    grank = gasp_rank();
    ngranks = gasp_nranks();

    /* create a global array; currently chunks cannot be specified */
    garray_t *ga;
    int64_t nelems = ngranks * 100;
    garray_create(g, nelems, sizeof(aelem_t), NULL, &ga);

    /* get the local part of the global array; lo-hi inclusive */
    int64_t lo, hi;
    garray_distribution(ga, grank, &lo, &hi);

    int64_t nlocal_elems = hi - lo + 1;

    aelem_t *aptr;
    garray_access(ga, lo, hi, (void **)&aptr);
    for (int64_t i = 0;  i < nlocal_elems;  ++i) {
        aptr[i].a = (i + 1) * grank;
        aptr[i].b = grank;
    }
    garray_flush(ga);

    /* wait for all ranks to complete */
    gasp_sync();

    /* put into the next rank's first element */
    aelem_t tae;
    tae.a = 100 + grank;
    tae.b = grank;
    int64_t ti = (hi + 1) % nelems;
    garray_put(ga, ti, ti, &tae);

    /* wait for all ranks to complete */
    gasp_sync();

    /* destroy the global array */
    garray_destroy(ga);

    gasp_shutdown(g);
    return 0;
}
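
Reading remote elements is the symmetric operation. The fragment below is a hypothetical sketch: it assumes gasp provides a garray_get with the same (ga, lo, hi, buf) argument convention as the garray_put call above, and it would slot into the example just before garray_destroy; check gasp.h for the actual signature.

    /* hypothetical: read back the element this rank wrote into the next
       rank's slot; assumes garray_get mirrors garray_put's argument order */
    aelem_t check;
    garray_get(ga, ti, ti, &check);
    /* after the second gasp_sync, check.a == 100 + grank */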