census-ecosystem/opencensus-go-exporter-aws

Trace ID not carried forward from parent

Opened this issue · 1 comments

I'm experiencing an issue when publishing traces that run longer than the publish interval. The first batch of spans is properly sent to X-Ray, but subsequent spans are dropped.

This seems to be happening because a new AmazonTraceID is generated on every publish bundle, regardless of whether the span has a parent or not. Specifically, convertToAmazonTraceID injects a fresh timestamp each time, resulting in a new unique trace ID, even when we want to be carrying forward the existing trace ID.

Included is a small program demonstrating the issue. I added some logging to opencensus-go-exporter-aws (not shown here) so we can see what's getting sent to AWS. The results show two different trace IDs.

{"id":"d4da803036c02aea","name":"/baz","start_time":1583097340.570889,"trace_id":"1-5e5c25fd-2693fa8d418af12b90e0377d","end_time":1583097340.570891,"parent_id":"779447f3308cda9b","type":"subsegment"}

{"id":"779447f3308cda9b","name":"/bar","start_time":1583097340.570889,"trace_id":"1-5e5c25fe-2693fa8d418af12b90e0377d","end_time":1583097342.571168,"parent_id":"1a4e0eb62b588a4d","type":"subsegment"}

{"id":"1a4e0eb62b588a4d","name":"/foo","start_time":1583097340.570887,"trace_id":"1-5e5c25fe-2693fa8d418af12b90e0377d","end_time":1583097342.571286,"service":{"version":"latest"}}

Screen Shot 2020-03-01 at 1 50 57 PM

package main

import (
	"context"
	"fmt"
	"log"
	"os"
	"time"

	"contrib.go.opencensus.io/exporter/aws"
	"go.opencensus.io/trace"
)

func main() {
	if os.Getenv("AWS_ACCESS_KEY_ID") == "" {
		log.Fatalln("AWS_ACCESS_KEY_ID must be set")
	}
	if os.Getenv("AWS_SECRET_ACCESS_KEY") == "" {
		log.Fatalln("AWS_SECRET_ACCESS_KEY must be set")
	}
	if os.Getenv("AWS_DEFAULT_REGION") == "" {
		log.Fatalln("AWS_DEFAULT_REGION must be set")
	}

	ctx := context.Background()

	{
		exporter, err := aws.NewExporter(
			aws.WithVersion("latest"),

			// one-second publish interval
			//aws.WithInterval(time.Second*1),

			aws.WithOnExport(func(in aws.OnExport) {
				fmt.Println("publishing trace,", in.TraceID)
			}),
		)
		if err != nil {
			log.Fatal(err)
		}
		trace.RegisterExporter(exporter)

		// For demoing purposes, always sample.
		trace.ApplyConfig(trace.Config{
			DefaultSampler: trace.AlwaysSample(),
		})
		defer exporter.Close()
	}

	ctx, span := trace.StartSpan(ctx, "/foo")
	bar(ctx)
	span.End()
}

func bar(ctx context.Context) {
	ctx, span := trace.StartSpan(ctx, "/bar")

	baz(ctx)

	// 2-second long task
	time.Sleep(time.Second * 2)

	defer span.End()
}

func baz(ctx context.Context) {
	ctx, span := trace.StartSpan(ctx, "/baz")
	defer span.End()
}

On further digging, it seems that convertToAmazonTraceID is expecting timestamp data in the first 4 bytes of the trace ID, which is not the case. From census-instrumentation/opencensus-go#643, it appears that support for this was anticipated but possibly not implemented in this project?

The workaround is to provide an X-Ray-specific ID Generator like so:

type xrayIDGenerator struct {
	sync.Mutex

	nextSpanID uint64
	spanIDInc  uint64

	traceIDAdd  [2]uint64
	traceIDRand *rand.Rand
}

func NewIDGenerator() *xrayIDGenerator {
	gen := &xrayIDGenerator{}
	// initialize traceID and spanID generators.

	var rngSeed int64
	for _, p := range []interface{}{
		&rngSeed, &gen.traceIDAdd, &gen.nextSpanID, &gen.spanIDInc,
	} {
		binary.Read(crand.Reader, binary.LittleEndian, p)
	}
	gen.traceIDRand = rand.New(rand.NewSource(rngSeed))
	gen.spanIDInc |= 1

	return gen

}

// NewSpanID returns a non-zero span ID from a randomly-chosen sequence.
func (gen *xrayIDGenerator) NewSpanID() [8]byte {
	var id uint64
	for id == 0 {
		id = atomic.AddUint64(&gen.nextSpanID, gen.spanIDInc)
	}
	var sid [8]byte
	binary.LittleEndian.PutUint64(sid[:], id)
	return sid
}

// NewTraceID returns a non-zero trace ID from a randomly-chosen sequence.
// mu should be held while this function is called.
func (gen *xrayIDGenerator) NewTraceID() [16]byte {
	var tid [16]byte
	// Construct the trace ID from two outputs of traceIDRand, with a constant
	// added to each half for additional entropy.
	gen.Lock()
	now := time.Now().Unix()

	//binary.LittleEndian.PutUint64(tid[0:4], uint64(now))
	binary.BigEndian.PutUint32(tid[0:4], uint32(now))
	binary.BigEndian.PutUint32(tid[4:8], gen.traceIDRand.Uint32())
	binary.BigEndian.PutUint64(tid[8:16], gen.traceIDRand.Uint64()+gen.traceIDAdd[1])
	gen.Unlock()
	return tid
}

And then wherever your trace is configured:

trace.ApplyConfig(trace.Config{
	DefaultSampler: trace.AlwaysSample(),
	IDGenerator:    xray.NewIDGenerator(),
})

If I have the time, I'll clean this up into a pull request.