bodgit/sevenzip

Low performance of .7z files with password

hiroyukki opened this issue · 2 comments

I extract a file with password, my code likes:

func extract7zArchive(archive string, path string, password string) error {
	reader, err := sevenz.OpenReaderWithPassword(archive, password)
	if err != nil {
		return err
	}
	defer reader.Close()

	for _, f := range reader.File {
		target := PathAppend(path, f.Name)
		// seems no flag to check if f is directory, but it will end with / if is directory
		if IsStrEndWith(f.Name, "/", true) {
			if err := os.MkdirAll(target, f.Mode()); err != nil {
				return err
			}
		} else {
			_f, err := os.OpenFile(target, os.O_CREATE, f.Mode())
			if err != nil {
				return err
			}
			defer _f.Close()

			pre := time.Now().UnixMilli()
			fReader, err := f.Open()
			LogInfo("%d ms cost open %s", time.Now().UnixMilli()-pre, f.Name)
			if err != nil {
				return err
			}
			defer fReader.Close()

			if _, err = io.Copy(_f, fReader); err != nil {
				return err
			}
		}
	}

	return nil
}

it very slow on f.Open, I follow the code and found performance problem at internal/aes7z/key.go

func calculateKey(password string, cycles int, salt []byte) []byte {
	b := bytes.NewBuffer(salt)

	// Convert password to UTF-16LE
	utf16le := unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM)
	t := transform.NewWriter(b, utf16le.NewEncoder())
	_, _ = t.Write([]byte(password))

	key := make([]byte, sha256.Size)
	if cycles == 0x3f {
		copy(key, b.Bytes())
	} else {
		pre := time.Now().UnixMilli()
		h := sha256.New()
		count := 0
		for i := uint64(0); i < 1<<cycles; i++ {
			count++
			// These will never error
			_, _ = h.Write(b.Bytes())
			_ = binary.Write(h, binary.LittleEndian, i)
		}
		println(time.Now().UnixMilli()-pre, "ms cost, loop count", count)
		copy(key, h.Sum(nil))
	}

	return key
}

for loop run 500k counts+, cost about 250ms on my pc(cpu i7 9700)
output:
250 ms cost, loop count 524288 11-30 11:37:46,info,archivehlpr.go.155:251 ms cost open apps.json

Try something like this instead:

func extractFile(file *sevenzip.File, target string) error {
	_f, err := os.OpenFile(target, os.O_CREATE, file.Mode())
	if err != nil {
		return err
	}
	defer _f.Close()

	fReader, err := file.Open()
	if err != nil {
		return err
	}
	defer fReader.Close()

	_, err = io.Copy(_f, fReader)

        return err
}

func extract7zArchive(archive string, path string, password string) error {
	reader, err := sevenz.OpenReaderWithPassword(archive, password)
	if err != nil {
		return err
	}
	defer reader.Close()

	for _, f := range reader.File {
		target := PathAppend(path, f.Name)
		// seems no flag to check if f is directory, but it will end with / if is directory
		if IsStrEndWith(f.Name, "/", true) {
			if err := os.MkdirAll(target, f.Mode()); err != nil {
				return err
			}
		} else {
                	pre := time.Now().UnixMilli()
                	if err = extractFile(f, target); err != nil {
                                return err
                        }
                	LogInfo("%d ms cost open %s", time.Now().UnixMilli()-pre, f.Name)
		}
	}

	return nil
}

I've not compile-tested it but you should be able to get the idea.

The point is to close the io.ReadCloser as soon as you finished extracting a file before extracting the next one. The way your code is currently structured means that all of the io.ReadClosers are deferred until extract7zArchive() completes before closing all of them.

Internally, to read file n in an archive, you have to start at the beginning of the stream and read (and discard) the data for files 0 to n-1, so for increasing values of n it will get slower and slower as you have to read and discard more and more.

There's an optimisation in my library that will reuse the stream reader but that only works if you call Close() on the file reader before opening the next one.

You're also doing the same with your filehandles returned from os.OpenFile() so you can potentially run out of filehandles available to your process when dealing with very large archives.

Thanks a lot, I close reader before open next file item, it works now