dict.go

package gozstd

/*
#cgo CFLAGS: -O3

#define ZSTD_STATIC_LINKING_ONLY
#include "zstd.h"

#define ZDICT_STATIC_LINKING_ONLY
#include "zdict.h"

#include <stdint.h>  // for uintptr_t
#include <string.h>  // for memset

// The following *_wrapper functions allow avoiding memory allocations
// durting calls from Go.
// See https://github.com/golang/go/issues/24450 .

static ZSTD_CDict* ZSTD_createCDict_wrapper(uintptr_t dictBuffer, size_t dictSize, int compressionLevel) {
	return ZSTD_createCDict((const void *)dictBuffer, dictSize, compressionLevel);
}

static ZSTD_DDict* ZSTD_createDDict_wrapper(uintptr_t dictBuffer, size_t dictSize) {
	return ZSTD_createDDict((const void *)dictBuffer, dictSize);
}

static size_t ZDICT_optimizeTrainFromBuffer_fastCover_wrapper(void* dictBuffer, size_t dictBufferCapacity,
	const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
	unsigned kvalue, int compressionLevel, unsigned nbThreads)
{
    ZDICT_fastCover_params_t params;
    memset(&params, 0, sizeof(params));
    params.k = kvalue;
    params.d = 8;
    params.f = 20;
    params.steps = 4;
    params.splitPoint = 0.75;
    params.accel = 1;
    params.nbThreads = nbThreads;
    params.zParams.compressionLevel = compressionLevel;
    params.zParams.notificationLevel = 0;
    return ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, dictBufferCapacity,
        samplesBuffer, samplesSizes, nbSamples,
        &params);
}

*/
import "C"

import (
	"fmt"
	"runtime"
	"sync"
	"unsafe"
)

const minDictLen = C.ZDICT_DICTSIZE_MIN

// BuildDict returns dictionary built from the given samples.
//
// The resulting dictionary size will be close to desiredDictLen.
//
// The returned dictionary may be passed to NewCDict* and NewDDict.
func BuildDict(samples [][]byte, desiredDictLen int) []byte {
	if desiredDictLen < minDictLen {
		desiredDictLen = minDictLen
	}
	dict := make([]byte, desiredDictLen)

	// Calculate the total samples size.
	samplesBufLen := 0
	for _, sample := range samples {
		if len(sample) == 0 {
			// Skip empty samples.
			continue
		}
		samplesBufLen += len(sample)
	}

	// Construct flat samplesBuf and samplesSizes.
	samplesBuf := make([]byte, 0, samplesBufLen)
	samplesSizes := make([]C.size_t, 0, len(samples))
	for _, sample := range samples {
		samplesBuf = append(samplesBuf, sample...)
		samplesSizes = append(samplesSizes, C.size_t(len(sample)))
	}

	// Add fake samples if the original samples are too small.
	minSamplesBufLen := int(C.ZDICT_CONTENTSIZE_MIN)
	if minSamplesBufLen < minDictLen {
		minSamplesBufLen = minDictLen
	}
	for samplesBufLen < minSamplesBufLen {
		fakeSample := []byte(fmt.Sprintf("this is a fake sample %d", samplesBufLen))
		samplesBuf = append(samplesBuf, fakeSample...)
		samplesSizes = append(samplesSizes, C.size_t(len(fakeSample)))
		samplesBufLen += len(fakeSample)
	}

	// Run ZDICT_trainFromBuffer under lock, since it looks like it
	// is unsafe for concurrent usage (it just randomly crashes).
	// TODO: remove this restriction.

	buildDictLock.Lock()
	result := C.ZDICT_trainFromBuffer(
		unsafe.Pointer(&dict[0]),
		C.size_t(len(dict)),
		unsafe.Pointer(&samplesBuf[0]),
		&samplesSizes[0],
		C.unsigned(len(samplesSizes)))
	buildDictLock.Unlock()
	if C.ZDICT_isError(result) != 0 {
		// Return empty dictionary, since the original samples are too small.
		return nil
	}

	dictLen := int(result)
	return dict[:dictLen]
}

// BuildDictFast returns dictionary built from the given samples.
//
// The resulting dictionary size will be close to desiredDictLen.
//
// The returned dictionary may be passed to NewCDict* and NewDDict.
func BuildDictFast(samplesBuf []byte, desiredDictLen int, kValue uint, compressionLevel int, threads uint) []byte {
	if desiredDictLen < minDictLen {
		desiredDictLen = minDictLen
	}
	dict := make([]byte, desiredDictLen)

	// Calculate the total samples size.
	samplesBufLen := len(samplesBuf)
	samplesCount := samplesBufLen / 4096
	if samplesBufLen%4096 != 0 {
		samplesCount++
	}

	// Construct samplesSizes.
	samplesSizes := make([]C.size_t, 0, samplesCount)
	for i := 0; i < samplesBufLen/4096; i++ {
		samplesSizes = append(samplesSizes, C.size_t(4096))
	}
	if samplesBufLen%4096 != 0 {
		samplesSizes = append(samplesSizes, C.size_t(samplesBufLen%4096))
	}

	// Run ZDICT_trainFromBuffer under lock, since it looks like it
	// is unsafe for concurrent usage (it just randomly crashes).
	// TODO: remove this restriction.

	buildDictLock.Lock()
	result := C.ZDICT_optimizeTrainFromBuffer_fastCover_wrapper(
		unsafe.Pointer(&dict[0]),
		C.size_t(len(dict)),
		unsafe.Pointer(&samplesBuf[0]),
		&samplesSizes[0],
		C.unsigned(len(samplesSizes)),
		C.unsigned(kValue),
		C.int(compressionLevel),
		C.unsigned(threads),
	)
	buildDictLock.Unlock()
	if C.ZDICT_isError(result) != 0 {
		// Return empty dictionary, since the original samples are too small.
		return nil
	}

	dictLen := int(result)
	return dict[:dictLen]
}

var buildDictLock sync.Mutex

// CDict is a dictionary used for compression.
//
// A single CDict may be re-used in concurrently running goroutines.
type CDict struct {
	p                *C.ZSTD_CDict
	compressionLevel int
}

// NewCDict creates new CDict from the given dict.
//
// Call Release when the returned dict is no longer used.
func NewCDict(dict []byte) (*CDict, error) {
	return NewCDictLevel(dict, DefaultCompressionLevel)
}

// NewCDictLevel creates new CDict from the given dict
// using the given compressionLevel.
//
// Call Release when the returned dict is no longer used.
func NewCDictLevel(dict []byte, compressionLevel int) (*CDict, error) {
	if len(dict) == 0 {
		return nil, fmt.Errorf("dict cannot be empty")
	}

	cd := &CDict{
		p: C.ZSTD_createCDict_wrapper(
			C.uintptr_t(uintptr(unsafe.Pointer(&dict[0]))),
			C.size_t(len(dict)),
			C.int(compressionLevel)),
		compressionLevel: compressionLevel,
	}
	// Prevent from GC'ing of dict during CGO call above.
	runtime.KeepAlive(dict)
	runtime.SetFinalizer(cd, freeCDict)
	return cd, nil
}

// Release releases resources occupied by cd.
//
// cd cannot be used after the release.
func (cd *CDict) Release() {
	if cd.p == nil {
		return
	}
	result := C.ZSTD_freeCDict(cd.p)
	ensureNoError("ZSTD_freeCDict", result)
	cd.p = nil
}

func freeCDict(v interface{}) {
	v.(*CDict).Release()
}

// DDict is a dictionary used for decompression.
//
// A single DDict may be re-used in concurrently running goroutines.
type DDict struct {
	p *C.ZSTD_DDict
}

// NewDDict creates new DDict from the given dict.
//
// Call Release when the returned dict is no longer needed.
func NewDDict(dict []byte) (*DDict, error) {
	if len(dict) == 0 {
		return nil, fmt.Errorf("dict cannot be empty")
	}

	dd := &DDict{
		p: C.ZSTD_createDDict_wrapper(
			C.uintptr_t(uintptr(unsafe.Pointer(&dict[0]))),
			C.size_t(len(dict))),
	}
	// Prevent from GC'ing of dict during CGO call above.
	runtime.KeepAlive(dict)
	runtime.SetFinalizer(dd, freeDDict)
	return dd, nil
}

// Release releases resources occupied by dd.
//
// dd cannot be used after the release.
func (dd *DDict) Release() {
	if dd.p == nil {
		return
	}

	result := C.ZSTD_freeDDict(dd.p)
	ensureNoError("ZSTD_freeDDict", result)
	dd.p = nil
}

func freeDDict(v interface{}) {
	v.(*DDict).Release()
}