/memset

Some example implementations of memset

Primary LanguageC

/* This file contains a variety of implementations of memset. If you don't know
 * what memset is `man memset` may enlighten you. Its definition is
 *
 *  void* memset(void* b, int c, size_t len);
 *
 * I wrote this code partly as a gentle introduction to memset and partly as a
 * reminder to myself of how memset works. This code is in the public domain and
 * you are free to use it for any purpose (commercial or non-commercial) with or
 * without attribution to me. I don't guarantee the correctness of any of the
 * code herein and if you spot a mistake please let me know and I'll correct it.
 *
 * I wrote this code during my free time, but at a period during which I was
 * employed by National ICT Australia and it was heavily related to my work. If
 * push comes to shove it may be considered their intellectual property, but as
 * it is not functional freestanding code and (better) implementations of memset
 * are widely available, I hope they won't mind :)
 *
 * Matthew Fernandez, 2011
 */

#include <string.h>
#include <stdio.h>
#include <stdint.h>
#include <assert.h>

/* Just for convenience let's setup a type for bytes. */
typedef unsigned char byte;

/* Let's start off with a fairly naive implementation of memset. This sets
 * memory byte-by-byte. While not being particularly efficient and being
 * slightly braindead, it does have the advantage of being readily
 * understandable and reasonably straightforward to implement without making
 * mistakes.
 */
void* bytewise_memset(void* s, int c, size_t sz) {
    byte* p = (byte*)s;

    /* c should only be a byte's worth of information anyway, but let's mask out
     * everything else just in case.
     */
    byte x = c & 0xff;

    while (sz--)
        *p++ = x;
    return s;
}

/* A smarter way of doing memset is word-by-word. Your architecture will have a
 * native word size (32 bits on x86) and reads/writes at this granularity will
 * be more efficient than those at finer granularities. Let's take a look at
 * memset for a 32-bit architecture.
 */
void* wordwise_32_memset(void* s, int c, size_t sz) {
    uint32_t* p = (uint32_t*)s;

    /* In this case the masking is actually important. */
    uint32_t x = c & 0xff;

    /* Construct a word's worth of the value we're supposed to be setting. */
    x |= x << 8;
    x |= x << 16;

    /* This technique (without a prologue and epilogue) will only cope with
     * sizes that are word-aligned. For example, you cannot use this function to
     * set a region 7 bytes long. Let's do some checks to make sure the size
     * passed is actually something we can cope with. It is worth noting that in
     * practice you would actually want to check the alignment of the start
     * pointer as well. Doing a word-wise memset on an unaligned pointer gains
     * you nothing and may even hurt performance.
     */
    assert(!(sz & 3)); /* Check sz is 4-byte-aligned. */
    sz >>= 2; /* Divide by number of bytes in a word. */

    while (sz--)
        *p++ = x;
    return s;
}

/* Now let's do an architecture-independent word-by-word memset. You would never
 * want to implement this in practice, as you are relying on the compiler to
 * optimise the statically determinable loops that could be manually written
 * much more efficiently if you know the target architecture, but this is a
 * useful thought exercise. Note that GCC defines the handy constant __WORDSIZE
 * that tells us the size (in bits) of words on this architecture.
 */
void* wordwise_memset(void* s, int c, size_t sz) {
    uintptr_t* p = (uintptr_t*)s;
    uintptr_t x = c & 0xff;
    int i;
    int bytes_per_word;

    /* Construct a word's worth of the byte value we need to set. */
    for (i = 3; (1<<i) < __WORDSIZE; ++i)
        x |= x << (1<<i);
    /* At this point i is the power of 2 which is equal to the word size. */

    /* 1<<i would be the number of bits per word, therefore (1<<i)/8 == 1<<(i-3)
     * is the number of bytes per word.
     */
    bytes_per_word = 1<<(i-3);

    /* Check sz is bytes_per_word-byte-aligned. */
    assert(!(sz & (bytes_per_word-1)));
    sz >>= i-3;

    while (sz--)
        *p++ = x;
    return s;
}

/* Let's return to the 32-bit word-wise version briefly to implement a version
 * that handles unaligned (with respect to word size) pointers and size. To
 * understand what's going on here it's best to look at an example:
 *
 *  Calling wordwise_32_unaligned_memset(2, 0, 7)...
 * Initial:             +-+-+-+-+-+-+-+
 *                      |2|3|4|5|6|7|8|  pp = 2, p = ?
 *                      +-+-+-+-+-+-+-+  sz = 7, tail = ?
 *                      |?|?|?|?|?|?|?|
 *                      +-+-+-+-+-+-+-+
 *
 * After the prologue:  +-+-+-+-+-+-+-+ Now the pointer (pp/p) is aligned.
 *                      |2|3|4|5|6|7|8|  pp = 4, p = 4
 *                      +-+-+-+-+-+-+-+  sz = 5, tail = 1
 *                      |0|0|?|?|?|?|?| (then we adjust sz to 5>>2 == 1)
 *                      +-+-+-+-+-+-+-+
 *
 * After the main loop: +-+-+-+-+-+-+-+ We can't set any more word length
 *                      |2|3|4|5|6|7|8| regions, but there's still one byte
 *                      +-+-+-+-+-+-+-+ remaining.
 *                      |0|0|0|0|0|0|?|  pp = 8, p = 8
 *                      +-+-+-+-+-+-+-+  sz = 0, tail = 1
 *
 * After the epilogue:  +-+-+-+-+-+-+-+ Now we're done.
 *                      |2|3|4|5|6|7|8|  pp = 9, p = 8
 *                      +-+-+-+-+-+-+-+  sz = 0, tail = 0
 *                      |0|0|0|0|0|0|0|
 *                      +-+-+-+-+-+-+-+
 */
void* wordwise_32_unaligned_memset(void* s, int c, size_t sz) {
    uint32_t* p;
    uint32_t x = c & 0xff;
    byte xx = c & 0xff;
    byte* pp = (byte*)s;
    size_t tail;

    /* Let's introduce a prologue to bump the starting location forward to the
     * next alignment boundary.
     */
    while (((unsigned int)pp & 3) && sz--)
        *pp++ = xx;
    p = (uint32_t*)pp;

    /* Let's figure out the number of bytes that will be trailing when the
     * word-wise loop taps out.
     */
    tail = sz & 3;

    /* The middle of this function is identical to the wordwise_32_memset minus
     * the alignment checks.
     */
    x |= x << 8;
    x |= x << 16;

    sz >>= 2;

    while (sz--)
        *p++ = x;

    /* Now we introduce an epilogue to account for the trailing bytes. */
    pp = (byte*)p;
    while (tail--)
        *pp++ = xx;

    return s;
}

/* Let's take the final logical step and implement an architecture-independent
 * memset that can cope with unaligned pointers and sizes. Note, the same
 * caveat as for wordwise_memset applies; you wouldn't write code like this in
 * real life.
 */
void* wordwise_unaligned_memset(void* s, int c, size_t sz) {
    uintptr_t* p;
    uintptr_t x = c & 0xff;
    byte* pp = (byte*)s;
    byte xx = c & 0xff;
    size_t tail;
    int i;
    int bytes_per_word;

    for (i = 3; (1<<i) < __WORDSIZE; ++i)
        x |= x << (1<<i);
    bytes_per_word = 1<<(i-3);

    /* Prologue. */
    while (((unsigned int)pp & (bytes_per_word-1)) && sz--)
        *pp++ = xx;
    tail = sz & (bytes_per_word-1);
    p = (uintptr_t*)pp;

    /* Main loop. */
    sz >>= i-3;
    while (sz--)
        *p++ = x;

    /* Epilogue. */
    pp = (byte*)p;
    while (tail--)
        *pp++ = xx;

    return s;
}

/* Finally, just for fun, let's look at memset using Duff's Device
 * (http://www.lysator.liu.se/c/duffs-device.html). The original Duff's Device
 * was not intended to target memset, but was aimed at solving the problem of
 * copying data from an array into a memory mapped hardware register. As a
 * result, it's not particularly suited to memset. Of course there's a big
 * "but" with this. As with all the algorithms in this file, the relative
 * performance is highly architecture- and compiler-dependent. If you want to
 * know the fastest algorithm for a given scenario you really have no choice
 * but to look at the generated assembly code.
 */
void* duffs_device_memset(void* s, int c, size_t sz) {
    byte* p = (byte*)s;
    byte x = c & 0xff;
    unsigned int leftover = sz & 0x7;

    /* Catch the pathological case of 0. */
    if (!sz)
        return s;

    /* To understand what's going on here, take a look at the original
     * bytewise_memset and consider unrolling the loop. For this situation
     * we'll unroll the loop 8 times (assuming a 32-bit architecture). Choosing
     * the level to which to unroll the loop can be a fine art...
     */
    sz = (sz + 7) >> 3;
    switch (leftover) {
        case 0: do { *p++ = x;
        case 7:      *p++ = x;
        case 6:      *p++ = x;
        case 5:      *p++ = x;
        case 4:      *p++ = x;
        case 3:      *p++ = x;
        case 2:      *p++ = x;
        case 1:      *p++ = x;
                } while (--sz > 0);
    }
    return s;
}

/* Lines below here are instrumentation for testing your implementation. */

#define CHECK(f, unaligned) \
    do { \
        int fail_byte = check_memset((f), (unaligned)); \
        if (fail_byte) \
            printf("%s %s check failed on byte %d.\n", \
                   (unaligned) ? "Unaligned" : "Aligned", #f, fail_byte - 1); \
    } while(0)

#define BUFFER_LEN 4096

/* This function does some very basic checking of your memset function. If you
 * really want to comprehensively test your function you should check the
 * regions either side of the memory you are setting to make sure your function
 * is actually obeying the limits passed to it.
 *
 * Note, the argument unaligned determines whether we pass an unaligned pointer
 * and size or not. Only certain of the above implementations will pass this
 * test.
 */
int check_memset(void* f(), int unaligned) {
    char buffer[BUFFER_LEN];
    int i;
    char set;

    for (set = 0; (unsigned int)set <= 0xff; ++set) {
        f(buffer + unaligned, set, BUFFER_LEN - 2*unaligned);
        for (i = unaligned; i < BUFFER_LEN - 2*unaligned; ++i)
            if (buffer[i] != set)
                return i + 1;
    }

    return 0;
}

/* When executed, this program will just validate the implementations in this
 * file. Note that the unaligned tests are only run on the functions that can
 * cope with unaligned values.
 */
int main(int argc, char** argv) {

    /* Use GCC's built-in memset to validate our checking function. */
    CHECK(memset, 0);
    CHECK(memset, 1);

    /* Check our implementations. */
    CHECK(bytewise_memset, 0);
    CHECK(bytewise_memset, 1);
    CHECK(wordwise_32_memset, 0);
    CHECK(wordwise_memset, 0);
    CHECK(wordwise_32_unaligned_memset, 0);
    CHECK(wordwise_32_unaligned_memset, 1);
    CHECK(wordwise_unaligned_memset, 0);
    CHECK(wordwise_unaligned_memset, 1);
    CHECK(duffs_device_memset, 0);
    CHECK(duffs_device_memset, 1);

    return 0;
}