bits-and-blooms/bitset

Overreading with v1.3.1

klauspost opened this issue · 10 comments

It seems you are still over-reading with the latest release. Here is a reproducer test:

func TestWriteTo(t *testing.T) {
	const length = 958506
	addBuf := []byte(`1234`)
	bs := New(length)
	var buf bytes.Buffer
	_, err := bs.WriteTo(&buf)
	if err != nil {
		t.Fatal(err)
	}
	buf.Write(addBuf)
	
	// Generate test input for regression tests:
	if false {
		gzout := bytes.NewBuffer(nil)
		gz, err := gzip.NewWriterLevel(gzout, 9)
		if err != nil {
			t.Fatal(err)
		}
		gz.Write(buf.Bytes())
		gz.Close()
		t.Log("Encoded:", base64.StdEncoding.EncodeToString(gzout.Bytes()))
	}
	bs = New(length)
	_, err = bs.ReadFrom(&buf)
	if err != nil {
		t.Fatal(err)
	}
	more, err := io.ReadAll(&buf)
	if err != nil {
		t.Fatal(err)
	}
	if !bytes.Equal(more, addBuf) {
		t.Fatalf("extra mismatch. got %v, want %v", more, addBuf)
	}
}

Works with v1.2.0, but outputs:

=== RUN   TestWriteTo
    bitset_test.go:1684: extra mismatch. got [], want [49 50 51 52]
--- FAIL: TestWriteTo (0.00s)

You can extend it for more sizes.

You should add regression tests. Here is a template:


func TestReadFrom(t *testing.T) {
	addBuf := []byte(`1234`)
	tests := []struct {
		length uint
		input  string // base64+gzipped
	}{{
		length: 958506,
		input:  "H4sIAAAAAAAC/+zAORUAIAwFsK8AIWwcgpHKq4lOScp4MwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATdY+9wcAAP//b+hYaRTUAQA=",
	}}

	for _, test := range tests {
		var buf bytes.Buffer
		b, err := base64.StdEncoding.DecodeString(test.input)
		if err != nil {
			t.Fatal(err)
		}
		gz, err := gzip.NewReader(bytes.NewBuffer(b))
		if err != nil {
			t.Fatal(err)
		}
		io.Copy(&buf, gz)
		gz.Close()

		bs := New(test.length)
		_, err = bs.ReadFrom(&buf)
		if err != nil {
			t.Fatal(err)
		}
		more, err := io.ReadAll(&buf)
		if err != nil {
			t.Fatal(err)
		}
		if !bytes.Equal(more, addBuf) {
			t.Fatalf("extra mismatch. got %v, want %v", more, addBuf)
		}
	}
}

You can also add some fixed content to the bf itself.

@klauspost Would you kindly produce a pull request ?

@lemire With the tests?

You have identified a bug in the ReadFrom function, I invite you to produce a patch and to submit it as a pull request.

Here is the function, can you point at the bug please?

// ReadFrom reads a BitSet from a stream written using WriteTo
func (b *BitSet) ReadFrom(stream io.Reader) (int64, error) {
	var length uint64

	// Read length first
	err := binary.Read(stream, binaryOrder, &length)
	if err != nil {
		return 0, err
	}
	newset := New(uint(length))

	if uint64(newset.length) != length {
		return 0, errors.New("unmarshalling error: type mismatch")
	}

	// Read remaining bytes as set
	// current implementation bufio.Reader is more memory efficient than
	// binary.Read for large set
	reader := bufio.NewReader(stream)
	var item = make([]byte, binary.Size(uint64(0))) // one uint64
	nWords := uint64(wordsNeeded(uint(length)))
	for i := uint64(0); i < nWords; i++ {
		if _, err := reader.Read(item); err != nil {
			return 0, err
		}
		newset.set[i] = binaryOrder.Uint64(item)
	}

	*b = *newset
	return int64(b.BinaryStorageSize()), nil
}

Bug is using bufio.NewReader, which reads ahead into buffer. I will submit a PR.

Thanks.

Global Endianness switch. oh, wow.

Anyway PR sent in #109