bodgit/sevenzip

low performance for aes7z

Closed this issue · 2 comments

I have an encrypted 7z file, about 60MB in size, with 100+ files in it. Extracting it takes over 20 seconds on my computer. Tracing the code, I found that the key is recalculated every time a file is opened in sevenz.File.Open, which is very slow. I tried optimizing aes7z.calculateKey by caching the key (I guess all files in a 7z archive share the same key, but I can't confirm it), and it seems to work OK. The aes7z/key.go I optimized is here:

package aes7z

import (
	"bytes"
	"crypto/sha256"
	"encoding/binary"
	"sync"

	"golang.org/x/text/encoding/unicode"
	"golang.org/x/text/transform"
)

// keyCacheItem is one memoised key derivation: the three inputs that identify
// it (password, cycles, salt) together with the key they produced.
type keyCacheItem struct {
	password string
	cycles   int
	salt     []byte
	key      []byte
}

// hittest reports whether this entry was recorded for exactly the given
// password, cycle count and salt.
func (c *keyCacheItem) hittest(password string, cycles int, salt []byte) bool {
	return c.password == password && c.cycles == cycles && bytes.Equal(salt, c.salt)
}

// keyCacheLimit caps how many derived keys are kept. The cache is a
// package-level variable, so without a cap it would grow for the lifetime of
// the process, one entry (including the password) per distinct derivation.
const keyCacheLimit = 16

var (
	// keyCache holds the remembered derivations, oldest first.
	keyCache []*keyCacheItem
	// keyCacheLock guards keyCache.
	keyCacheLock sync.RWMutex
)

// findKeyCached returns the key previously recorded for password, cycles and
// salt, or nil when there is no such entry. The returned slice is a copy, so
// callers may modify it without affecting the cache.
func findKeyCached(password string, cycles int, salt []byte) []byte {
	keyCacheLock.RLock()
	defer keyCacheLock.RUnlock()

	for _, kci := range keyCache {
		if kci.hittest(password, cycles, salt) {
			return append([]byte(nil), kci.key...)
		}
	}

	return nil
}

// recordKeyCached remembers key as the result for password, cycles and salt.
//
// salt and key are copied before being stored: the caller keeps ownership of
// its slices and may reuse or overwrite them afterwards. Recording inputs that
// are already present is a no-op, and once keyCacheLimit entries are held the
// oldest one is dropped to make room.
func recordKeyCached(password string, cycles int, salt []byte, key []byte) {
	keyCacheLock.Lock()
	defer keyCacheLock.Unlock()

	// Two callers can miss the cache at the same time and both arrive here;
	// keep only the first result.
	for _, kci := range keyCache {
		if kci.hittest(password, cycles, salt) {
			return
		}
	}

	item := &keyCacheItem{
		password: password,
		cycles:   cycles,
		salt:     append([]byte(nil), salt...),
		key:      append([]byte(nil), key...),
	}

	if len(keyCache) >= keyCacheLimit {
		// Shift everything down by one, discarding the oldest entry, and
		// reuse the freed last slot.
		copy(keyCache, keyCache[1:])
		keyCache[len(keyCache)-1] = item
		return
	}

	keyCache = append(keyCache, item)
}

// calculateKey derives the sha256.Size-byte key from password, the cycle
// exponent and salt, consulting the key cache first.
//
// The seed is salt followed by the password encoded as UTF-16LE. When cycles
// is 0x3f the seed itself, truncated or zero-padded to sha256.Size bytes, is
// the key. Otherwise the key is the SHA-256 digest of 1<<cycles rounds, each
// round feeding the seed followed by the round number as a 64-bit
// little-endian integer.
//
// salt is never modified, and the returned slice is not shared with the cache.
func calculateKey(password string, cycles int, salt []byte) []byte {
	if k := findKeyCached(password, cycles, salt); len(k) > 0 {
		// Key found in cache; hand out a copy so the caller cannot alter
		// the cached bytes.
		return append([]byte(nil), k...)
	}

	// bytes.NewBuffer takes ownership of the slice it is given and appends
	// in place, so building the seed directly on salt would write the
	// password into any spare capacity of the caller's slice. Start from a
	// private copy instead.
	b := bytes.NewBuffer(append([]byte(nil), salt...))

	// Convert password to UTF-16LE
	utf16le := unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM)
	t := transform.NewWriter(b, utf16le.NewEncoder())
	// The write error is deliberately discarded, matching existing behaviour.
	_, _ = t.Write([]byte(password))

	seed := b.Bytes()
	key := make([]byte, sha256.Size)

	if cycles == 0x3f {
		copy(key, seed)
	} else {
		h := sha256.New()

		// This loop runs 1<<cycles times, so encode the counter into a
		// reusable buffer rather than going through binary.Write on every
		// round; the bytes fed to the hash are the same.
		var counter [8]byte

		for i := uint64(0); i < 1<<cycles; i++ {
			binary.LittleEndian.PutUint64(counter[:], i)
			// These will never error
			_, _ = h.Write(seed)
			_, _ = h.Write(counter[:])
		}

		copy(key, h.Sum(nil))
	}

	// Give the cache its own copies so that neither the caller's salt nor
	// the key returned below aliases a cached entry.
	recordKeyCached(password, cycles, append([]byte(nil), salt...), append([]byte(nil), key...))

	return key
}

My test code is here:

package main

import (
	"io"
	"os"
	"time"

	sevenz "github.com/bodgit/sevenzip"
)

// open7zArchive opens the 7z archive at the given path. An empty password
// selects the plain reader; any other value is passed to the
// password-protected reader.
func open7zArchive(archive, password string) (*sevenz.ReadCloser, error) {
	if password == "" {
		return sevenz.OpenReader(archive)
	}

	return sevenz.OpenReaderWithPassword(archive, password)
}

// extract7zItem writes one archive entry to target.
//
// A directory entry is created with os.MkdirAll using the entry's mode. Any
// other entry is copied into target, which is created if missing and
// truncated if it already exists. The first error encountered is returned,
// including an error from closing the output file.
func extract7zItem(file *sevenz.File, target string) error {
	if file.FileInfo().IsDir() {
		return os.MkdirAll(target, file.Mode())
	}

	// Open the entry before touching the destination so that a failure here
	// neither creates nor truncates target. Opening each entry is the call
	// this program was written to time.
	fReader, err := file.Open()
	if err != nil {
		return err
	}
	defer fReader.Close()

	// O_TRUNC: without it, overwriting a longer existing file would leave
	// its old tail behind the newly written contents.
	f, err := os.OpenFile(target, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, file.Mode())
	if err != nil {
		return err
	}

	if _, err := io.Copy(f, fReader); err != nil {
		// The copy error is the one worth reporting; a close error on top
		// of it adds nothing.
		_ = f.Close()

		return err
	}

	// Close can report a write error that only surfaces at this point, so
	// it must not be dropped.
	return f.Close()
}

// extract7zWithCallback opens archive (using password when it is non-empty)
// and calls handler once per entry, in the order the reader lists them. It
// stops at, and returns, the first error from opening the archive or from
// handler. The archive is closed before returning.
func extract7zWithCallback(archive string, password string, handler func(*sevenz.File) error) error {
	rc, openErr := open7zArchive(archive, password)
	if openErr != nil {
		return openErr
	}
	defer rc.Close()

	for i := range rc.File {
		if handlerErr := handler(rc.File[i]); handlerErr != nil {
			return handlerErr
		}
	}

	return nil
}

// extract7zArchive extracts every entry of archive beneath path, using
// password when it is non-empty.
//
// A trailing separator is appended to path unless it already ends in '/' or
// '\\'. An empty path is left empty, so entries are then written relative to
// the current working directory.
func extract7zArchive(archive, password string, path string) error {
	// The length check keeps an empty path from indexing out of range.
	if n := len(path); n > 0 && path[n-1] != '/' && path[n-1] != '\\' {
		path += "/"
	}

	return extract7zWithCallback(archive, password, func(f *sevenz.File) error {
		// NOTE(review): f.Name comes from the archive and is appended
		// unchecked; a name containing ".." elements could resolve outside
		// path. Validate it before using this on untrusted archives.
		return extract7zItem(f, path+f.Name)
	})
}

// main extracts a fixed test archive, reports how long the extraction took
// in milliseconds, and exits with status 1 if the extraction failed.
func main() {
	start := time.Now()
	err := extract7zArchive("F:\\log4job\\日常工作\\2024\\03-29\\client_v2403.csp", "test", "F:\\log4job\\日常工作\\2024\\03-29\\client_v2403")
	elapsed := time.Since(start).Milliseconds()

	if err != nil {
		// println on an error value prints the interface's pointer words,
		// not its text, so the message has to be extracted explicitly.
		println("extract failed:", err.Error())
	}

	println(elapsed, " ms cost")

	if err != nil {
		os.Exit(1)
	}
}

Extracting my 7z file takes 20 seconds without the cache and 5 seconds with the key cached.

Normally files are all stored in a single stream, so the key should only need to be calculated once. However, if you enable encryption and then don't use compression, it will add the files individually. I managed to reproduce the problem by creating an archive with the following:

7z a -mhc=on -mhe=on -ppassword -m0=copy archive.7z ...

You can confirm this with 7z l:

Path = testdata/aes7z.7z
Type = 7z
Physical Size = 36552
Headers Size = 424
Method = Copy 7zAES
Solid = -
Blocks = 10

The Solid = - and Blocks = 10 confirms this, versus another archive:

Path = aes7z.7z
Type = 7z
Physical Size = 6167
Headers Size = 311
Method = LZMA2:48k 7zAES
Solid = +
Blocks = 1

Both archives contain 10 files.

I have released v1.5.1 which improves the performance of AES key calculation.

Thanks for your detailed report!