Workiva/go-datastructures

ctrie data race

newhook opened this issue · 9 comments

I just got a report of a data race in the ctrie implementation on the `committed` field.

// rdcssRoot performs an RDCSS (restricted double-compare single-swap) on the
// trie root: it wraps old/expected/nv in a descriptor iNode, CASes the root
// to the descriptor, then asks rdcssComplete to commit or roll back the swap.
// Returns true only if the swap was committed.
//
// NOTE(review): the plain read of desc.rdcss.committed below races with the
// plain write `desc.committed = true` in rdcssComplete — this is exactly the
// race the detector reports (read at rdcssRoot, previous write at
// rdcssComplete). The field needs an atomic load/store pair.
func (c *Ctrie) rdcssRoot(old *iNode, expected *mainNode, nv *iNode) bool {
    desc := &iNode{
        rdcss: &rdcssDescriptor{
            old:      old,
            expected: expected,
            nv:       nv,
        },
    }
    // Try to swing the root from old to the descriptor node.
    if c.casRoot(old, desc) {
        c.rdcssComplete(false)
        // Racy read: rdcssComplete may set committed from another goroutine.
        return desc.rdcss.committed
    }
    return false
}
func (c *Ctrie) rdcssComplete(abort bool) *iNode {
...
            if oldeMain == exp {
            // Commit the RDCSS.
            if c.casRoot(r, nv) {
                desc.committed = true
                return nv
Read by goroutine 9:
  customerio/chgbuf/ctrie.(*Ctrie).rdcssRoot()
      /Users/matthew/src/services/src/customerio/chgbuf/ctrie/ctrie.go:881 +0x1e7
  customerio/chgbuf/ctrie.(*Ctrie).Snapshot()
      /Users/matthew/src/services/src/customerio/chgbuf/ctrie/ctrie.go:318 +0xb7
...
Previous write by goroutine 6:
  customerio/chgbuf/ctrie.(*Ctrie).rdcssComplete()
      /Users/matthew/src/services/src/customerio/chgbuf/ctrie/ctrie.go:912 +0x1a1
  customerio/chgbuf/ctrie.(*Ctrie).rdcssReadRoot()
      /Users/matthew/src/services/src/customerio/chgbuf/ctrie/ctrie.go:863 +0x78
  customerio/chgbuf/ctrie.(*Ctrie).readRoot()
      /Users/matthew/src/services/src/customerio/chgbuf/ctrie/ctrie.go:855 +0x33
  customerio/chgbuf/ctrie.(*Ctrie).ReadOnlySnapshot()
...

Thanks, will need to take a closer look, but that probably needs to be an
atomic.

On Thu, Dec 24, 2015, 11:59 AM Matthew Newhook notifications@github.com
wrote:

I just got a report of data race in the ctrie impl on committed.

func (c *Ctrie) rdcssRoot(old *iNode, expected *mainNode, nv *iNode) bool {
desc := &iNode{
rdcss: &rdcssDescriptor{
old: old,
expected: expected,
nv: nv,
},
}
if c.casRoot(old, desc) {
c.rdcssComplete(false)
return desc.rdcss.committed
}
return false
}

func (c *Ctrie) rdcssComplete(abort bool) *iNode {
...
if oldeMain == exp {
// Commit the RDCSS.
if c.casRoot(r, nv) {
desc.committed = true
return nv

Read by goroutine 9:
customerio/chgbuf/ctrie.(*Ctrie).rdcssRoot()
/Users/matthew/src/services/src/customerio/chgbuf/ctrie/ctrie.go:881 +0x1e7
customerio/chgbuf/ctrie.(*Ctrie).Snapshot()
/Users/matthew/src/services/src/customerio/chgbuf/ctrie/ctrie.go:318 +0xb7
...
Previous write by goroutine 6:
customerio/chgbuf/ctrie.(*Ctrie).rdcssComplete()
/Users/matthew/src/services/src/customerio/chgbuf/ctrie/ctrie.go:912 +0x1a1
customerio/chgbuf/ctrie.(*Ctrie).rdcssReadRoot()
/Users/matthew/src/services/src/customerio/chgbuf/ctrie/ctrie.go:863 +0x78
customerio/chgbuf/ctrie.(*Ctrie).readRoot()
/Users/matthew/src/services/src/customerio/chgbuf/ctrie/ctrie.go:855 +0x33
customerio/chgbuf/ctrie.(*Ctrie).ReadOnlySnapshot()
...


Reply to this email directly or view it on GitHub
#122.

-               return desc.rdcss.committed
+        return atomic.LoadInt32(&desc.rdcss.committed) == 1
        }
        return false
 }
@@ -909,7 +909,7 @@ func (c *Ctrie) rdcssComplete(abort bool) *iNode {
                if oldeMain == exp {
                        // Commit the RDCSS.
                        if c.casRoot(r, nv) {
-                               desc.committed = true
+                atomic.StoreInt32(&desc.committed, 1)
                                return nv

I made that fix locally, but I'm still getting very occasional crashes in a highly concurrent test case.

panic: runtime error: invalid memory address or nil pointer dereference [recovered]
    panic: runtime error: invalid memory address or nil pointer dereference
[signal 0xb code=0x1 addr=0x0 pc=0x19146b]

goroutine 5 [running]:
testing.tRunner.func1(0xc8200a8000)
    /private/var/folders/q8/bf_4b1ts2zj0l7b0p1dv36lr0000gp/T/workdir/go/src/testing/testing.go:450 +0x1f5
customerio/chgbuf/ctrie.(*Ctrie).ilookup(0xc82039cd00, 0xc8203b3ee0, 0xc8203994d0, 0x0, 0x0, 0x3f9990, 0x0, 0x0, 0x8)
    /Users/matthew/src/services/src/customerio/chgbuf/ctrie/ctrie.go:540 +0x7b
customerio/chgbuf/ctrie.(*Ctrie).lookup(0xc82039cd00, 0xc8203994d0, 0x0, 0x0, 0x340ca71c)
    /Users/matthew/src/services/src/customerio/chgbuf/ctrie/ctrie.go:430 +0x9d
customerio/chgbuf/ctrie.(*Ctrie).Lookup(0xc82039cd00, 0xc8203b6090, 0x1, 0x8, 0x0, 0x0, 0x1)
    /Users/matthew/src/services/src/customerio/chgbuf/ctrie/ctrie.go:303 +0x111
customerio/chgbuf.(*data).child(0xc82039cd40, 0xc8203b6090, 0x1, 0x8, 0x256a00, 0x2fdde8)
    /Users/matthew/src/services/src/customerio/chgbuf/data.go:17 +0x68
customerio/chgbuf.(*bucketData).child(0xc82039cd20, 0xc8203b6090, 0x1, 0x8, 0x2fe980, 0x39a00)

If you have a test which can reproduce this semi-reliably, that would be great.

On Thu, Dec 24, 2015, 3:15 PM Matthew Newhook notifications@github.com
wrote:

-               return desc.rdcss.committed
+        return atomic.LoadInt32(&desc.rdcss.committed) == 1
        }
        return false
 }
@@ -909,7 +909,7 @@ func (c *Ctrie) rdcssComplete(abort bool) *iNode {
                if oldeMain == exp {
                        // Commit the RDCSS.
                        if c.casRoot(r, nv) {
-                               desc.committed = true
+                atomic.StoreInt32(&desc.committed, 1)
                                return nv


I made that fix locally, but I'm still getting very occasional crashes in
a highly concurrent test case.

customerio/chgbuf/ctrie.(*Ctrie).ilookup(0xc82039cd00, 0xc8203b3ee0, 0xc8203994d0, 0x0, 0x0, 0x3f9990, 0x0, 0x0, 0x8)
/Users/matthew/src/services/src/customerio/chgbuf/ctrie/ctrie.go:540 +0x7b
customerio/chgbuf/ctrie.(*Ctrie).lookup(0xc82039cd00, 0xc8203994d0, 0x0, 0x0, 0x340ca71c)
/Users/matthew/src/services/src/customerio/chgbuf/ctrie/ctrie.go:430 +0x9d
customerio/chgbuf/ctrie.(*Ctrie).Lookup(0xc82039cd00, 0xc8203b6090, 0x1, 0x8, 0x0, 0x0, 0x1)
/Users/matthew/src/services/src/customerio/chgbuf/ctrie/ctrie.go:303 +0x111
customerio/chgbuf.(*data).child(0xc82039cd40, 0xc8203b6090, 0x1, 0x8, 0x256a00, 0x2fdde8)
/Users/matthew/src/services/src/customerio/chgbuf/data.go:17 +0x68
customerio/chgbuf.(*bucketData).child(0xc82039cd20, 0xc8203b6090, 0x1, 0x8, 0x2fe980, 0x39a00)


Reply to this email directly or view it on GitHub
#122 (comment)
.

The test is part of something much larger. I'll try and reproduce in something small.

I haven't hit the nil pointer issue, but there definitely appears to be an issue with snapshotting (I don't know if it's related to your issue or not). I ran the following program, which gives the correct output unless the two commented-out goroutines are re-enabled (with the wg count adjusted accordingly).

package main

import (
    "fmt"
    "strconv"
    "sync"

    "github.com/Workiva/go-datastructures/trie/ctrie"
)

// main hammers one ctrie with a concurrent writer and reader, then iterates
// the trie to verify the final contents. The commented-out goroutines
// re-enable the snapshot calls that expose the reported bug; bump wg.Add
// accordingly when uncommenting them.
func main() {
    trie := ctrie.New(nil)
    var wg sync.WaitGroup
    wg.Add(2)
    // Writer: insert keys "0".."999999".
    go func() {
        for i := 0; i < 1000000; i++ {
            trie.Insert([]byte(strconv.Itoa(i)), i)
        }
        wg.Done()
    }()
    // Reader: look up the same key range concurrently with the writes.
    go func() {
        for i := 0; i < 1000000; i++ {
            trie.Lookup([]byte(strconv.Itoa(i)))
        }
        wg.Done()
    }()
    //go func() {
    //  for i := 0; i < 1000000; i++ {
    //      trie.Snapshot()
    //  }
    //  wg.Done()
    //}()
    //go func() {
    //  for i := 0; i < 1000000; i++ {
    //      trie.ReadOnlySnapshot()
    //  }
    //  wg.Done()
    //}()
    wg.Wait()

    i := 1
    // The loop variable was unused: `for _ = range` is non-idiomatic and
    // gofmt rewrites it to the bare `for range` form.
    for range trie.Iterator(nil) {
        fmt.Println(i)
        i++
    }
    fmt.Println("size", trie.Size())
}

I have a branch with a few fixes, but snapshotting is still broken: https://github.com/tylertreat/go-datastructures/tree/fixes

Simpler test:

package main

import (
    "fmt"
    "strconv"
    "sync"

    "github.com/Workiva/go-datastructures/trie/ctrie"
)

// main reproduces the snapshot bug with a minimal workload: one goroutine
// inserts seven keys while another takes seven snapshots, then the final
// size is printed (expected 7; occasionally less when the bug fires).
func main() {
    trie := ctrie.New(nil)

    var wg sync.WaitGroup
    wg.Add(2)

    // Writer: insert keys "0".."6".
    go func() {
        defer wg.Done()
        for k := 0; k < 7; k++ {
            trie.Insert([]byte(strconv.Itoa(k)), k)
        }
    }()

    // Snapshotter: race seven snapshots against the writes.
    go func() {
        defer wg.Done()
        for k := 0; k < 7; k++ {
            trie.Snapshot()
        }
    }()

    wg.Wait()
    fmt.Println("size", trie.Size())
}

Expected size is 7, but if you run it enough times you should see values less than 7 occasionally.

Issue appears to be with GCAS. I made an incorrect assumption from Java (where new Object() != new Object()). In Go, this is not quite the case: https://play.golang.org/p/BBde4cX1GC. As a result, GCAS always succeeds, despite the root generation changing during a snapshot.

I believe I have it fixed with this: https://github.com/Workiva/go-datastructures/compare/master...tylertreat:fixes?expand=1. I haven't been able to reproduce after applying those fixes.

@newhook can you confirm this branch fixes your problem?

Awesome! I'll check on Tuesday :)