From 408235a9dcd2983684c87615a1bc2a8954f6eb47 Mon Sep 17 00:00:00 2001 From: Rithesh Kumar Date: Wed, 5 Jul 2023 11:31:35 -0400 Subject: [PATCH] Add final training configs as well as release 16 kHz model (#19) * adding final configs for all models * changes for 16khz * add latest version for 16khz model * update package version --------- Co-authored-by: Ishaan Kumar --- README.md | 3 +- conf/final/16khz.yml | 123 ++++++++++++++++++++++++++++++++++++++++++ conf/final/24khz.yml | 123 ++++++++++++++++++++++++++++++++++++++++++ conf/final/44khz.yml | 123 ++++++++++++++++++++++++++++++++++++++++++ dac/__init__.py | 2 +- dac/utils/__init__.py | 10 +++- dac/utils/decode.py | 2 +- dac/utils/encode.py | 2 +- setup.py | 2 +- tests/test_cli.py | 2 +- 10 files changed, 384 insertions(+), 8 deletions(-) create mode 100644 conf/final/16khz.yml create mode 100644 conf/final/24khz.yml create mode 100644 conf/final/44khz.yml diff --git a/README.md b/README.md index 24db863..fd8ba9c 100644 --- a/README.md +++ b/README.md @@ -32,12 +32,13 @@ pip install git+https://github.com/descriptinc/descript-audio-codec ### Weights Weights are released as part of this repo under MIT license. -We release weights for models that can natively support 24kHz and 44.1kHz sampling rates. +We release weights for models that can natively support 16kHz, 24kHz, and 44.1kHz sampling rates. Weights are automatically downloaded when you first run `encode` or `decode` command. You can cache them using one of the following commands ```bash python3 -m dac download # downloads the default 44kHz variant python3 -m dac download --model_type 44khz # downloads the 44kHz variant python3 -m dac download --model_type 24khz # downloads the 24kHz variant +python3 -m dac download --model_type 16khz # downloads the 16kHz variant ``` We provide a Dockerfile that installs all required dependencies for encoding and decoding. The build process caches the default model weights inside the image. 
This allows the image to be used without an internet connection. [Please refer to instructions below.](#docker-image) diff --git a/conf/final/16khz.yml b/conf/final/16khz.yml new file mode 100644 index 0000000..a86e107 --- /dev/null +++ b/conf/final/16khz.yml @@ -0,0 +1,123 @@ +# Model setup +DAC.sample_rate: 16000 +DAC.encoder_dim: 64 +DAC.encoder_rates: [2, 4, 5, 8] +DAC.decoder_dim: 1536 +DAC.decoder_rates: [8, 5, 4, 2] + +# Quantization +DAC.n_codebooks: 12 +DAC.codebook_size: 1024 +DAC.codebook_dim: 8 +DAC.quantizer_dropout: 0.5 + +# Discriminator +Discriminator.sample_rate: 16000 +Discriminator.rates: [] +Discriminator.periods: [2, 3, 5, 7, 11] +Discriminator.fft_sizes: [2048, 1024, 512] +Discriminator.bands: + - [0.0, 0.1] + - [0.1, 0.25] + - [0.25, 0.5] + - [0.5, 0.75] + - [0.75, 1.0] + +# Optimization +AdamW.betas: [0.8, 0.99] +AdamW.lr: 0.0001 +ExponentialLR.gamma: 0.999996 + +amp: false +val_batch_size: 100 +device: cuda +num_iters: 400000 +save_iters: [10000, 50000, 100000, 200000] +valid_freq: 1000 +sample_freq: 10000 +num_workers: 32 +val_idx: [0, 1, 2, 3, 4, 5, 6, 7] +seed: 0 +lambdas: + mel/loss: 15.0 + adv/feat_loss: 2.0 + adv/gen_loss: 1.0 + vq/commitment_loss: 0.25 + vq/codebook_loss: 1.0 + +VolumeNorm.db: [const, -16] + +# Transforms +build_transform.preprocess: + - Identity +build_transform.augment_prob: 0.0 +build_transform.augment: + - Identity +build_transform.postprocess: + - VolumeNorm + - RescaleAudio + - ShiftPhase + +# Loss setup +MultiScaleSTFTLoss.window_lengths: [2048, 512] +MelSpectrogramLoss.n_mels: [5, 10, 20, 40, 80, 160, 320] +MelSpectrogramLoss.window_lengths: [32, 64, 128, 256, 512, 1024, 2048] +MelSpectrogramLoss.mel_fmin: [0, 0, 0, 0, 0, 0, 0] +MelSpectrogramLoss.mel_fmax: [null, null, null, null, null, null, null] +MelSpectrogramLoss.pow: 1.0 +MelSpectrogramLoss.clamp_eps: 1.0e-5 +MelSpectrogramLoss.mag_weight: 0.0 + +# Data +batch_size: 72 +train/AudioDataset.duration: 0.38 +train/AudioDataset.n_examples: 10000000 + 
+val/AudioDataset.duration: 5.0 +val/build_transform.augment_prob: 1.0 +val/AudioDataset.n_examples: 250 + +test/AudioDataset.duration: 10.0 +test/build_transform.augment_prob: 1.0 +test/AudioDataset.n_examples: 1000 + +AudioLoader.shuffle: true +AudioDataset.without_replacement: true + +train/build_dataset.folders: + speech_fb: + - /data/daps/train + speech_hq: + - /data/vctk + - /data/vocalset + - /data/read_speech + - /data/french_speech + speech_uq: + - /data/emotional_speech/ + - /data/common_voice/ + - /data/german_speech/ + - /data/russian_speech/ + - /data/spanish_speech/ + music_hq: + - /data/musdb/train + music_uq: + - /data/jamendo + general: + - /data/audioset/data/unbalanced_train_segments/ + - /data/audioset/data/balanced_train_segments/ + +val/build_dataset.folders: + speech_hq: + - /data/daps/val + music_hq: + - /data/musdb/test + general: + - /data/audioset/data/eval_segments/ + +test/build_dataset.folders: + speech_hq: + - /data/daps/test + music_hq: + - /data/musdb/test + general: + - /data/audioset/data/eval_segments/ diff --git a/conf/final/24khz.yml b/conf/final/24khz.yml new file mode 100644 index 0000000..b20298a --- /dev/null +++ b/conf/final/24khz.yml @@ -0,0 +1,123 @@ +# Model setup +DAC.sample_rate: 24000 +DAC.encoder_dim: 64 +DAC.encoder_rates: [2, 4, 5, 8] +DAC.decoder_dim: 1536 +DAC.decoder_rates: [8, 5, 4, 2] + +# Quantization +DAC.n_codebooks: 32 +DAC.codebook_size: 1024 +DAC.codebook_dim: 8 +DAC.quantizer_dropout: 0.5 + +# Discriminator +Discriminator.sample_rate: 24000 +Discriminator.rates: [] +Discriminator.periods: [2, 3, 5, 7, 11] +Discriminator.fft_sizes: [2048, 1024, 512] +Discriminator.bands: + - [0.0, 0.1] + - [0.1, 0.25] + - [0.25, 0.5] + - [0.5, 0.75] + - [0.75, 1.0] + +# Optimization +AdamW.betas: [0.8, 0.99] +AdamW.lr: 0.0001 +ExponentialLR.gamma: 0.999996 + +amp: false +val_batch_size: 100 +device: cuda +num_iters: 400000 +save_iters: [10000, 50000, 100000, 200000] +valid_freq: 1000 +sample_freq: 10000 +num_workers: 32 
+val_idx: [0, 1, 2, 3, 4, 5, 6, 7] +seed: 0 +lambdas: + mel/loss: 15.0 + adv/feat_loss: 2.0 + adv/gen_loss: 1.0 + vq/commitment_loss: 0.25 + vq/codebook_loss: 1.0 + +VolumeNorm.db: [const, -16] + +# Transforms +build_transform.preprocess: + - Identity +build_transform.augment_prob: 0.0 +build_transform.augment: + - Identity +build_transform.postprocess: + - VolumeNorm + - RescaleAudio + - ShiftPhase + +# Loss setup +MultiScaleSTFTLoss.window_lengths: [2048, 512] +MelSpectrogramLoss.n_mels: [5, 10, 20, 40, 80, 160, 320] +MelSpectrogramLoss.window_lengths: [32, 64, 128, 256, 512, 1024, 2048] +MelSpectrogramLoss.mel_fmin: [0, 0, 0, 0, 0, 0, 0] +MelSpectrogramLoss.mel_fmax: [null, null, null, null, null, null, null] +MelSpectrogramLoss.pow: 1.0 +MelSpectrogramLoss.clamp_eps: 1.0e-5 +MelSpectrogramLoss.mag_weight: 0.0 + +# Data +batch_size: 72 +train/AudioDataset.duration: 0.38 +train/AudioDataset.n_examples: 10000000 + +val/AudioDataset.duration: 5.0 +val/build_transform.augment_prob: 1.0 +val/AudioDataset.n_examples: 250 + +test/AudioDataset.duration: 10.0 +test/build_transform.augment_prob: 1.0 +test/AudioDataset.n_examples: 1000 + +AudioLoader.shuffle: true +AudioDataset.without_replacement: true + +train/build_dataset.folders: + speech_fb: + - /data/daps/train + speech_hq: + - /data/vctk + - /data/vocalset + - /data/read_speech + - /data/french_speech + speech_uq: + - /data/emotional_speech/ + - /data/common_voice/ + - /data/german_speech/ + - /data/russian_speech/ + - /data/spanish_speech/ + music_hq: + - /data/musdb/train + music_uq: + - /data/jamendo + general: + - /data/audioset/data/unbalanced_train_segments/ + - /data/audioset/data/balanced_train_segments/ + +val/build_dataset.folders: + speech_hq: + - /data/daps/val + music_hq: + - /data/musdb/test + general: + - /data/audioset/data/eval_segments/ + +test/build_dataset.folders: + speech_hq: + - /data/daps/test + music_hq: + - /data/musdb/test + general: + - /data/audioset/data/eval_segments/ diff --git 
a/conf/final/44khz.yml b/conf/final/44khz.yml new file mode 100644 index 0000000..f3de25e --- /dev/null +++ b/conf/final/44khz.yml @@ -0,0 +1,123 @@ +# Model setup +DAC.sample_rate: 44100 +DAC.encoder_dim: 64 +DAC.encoder_rates: [2, 4, 8, 8] +DAC.decoder_dim: 1536 +DAC.decoder_rates: [8, 8, 4, 2] + +# Quantization +DAC.n_codebooks: 9 +DAC.codebook_size: 1024 +DAC.codebook_dim: 8 +DAC.quantizer_dropout: 0.5 + +# Discriminator +Discriminator.sample_rate: 44100 +Discriminator.rates: [] +Discriminator.periods: [2, 3, 5, 7, 11] +Discriminator.fft_sizes: [2048, 1024, 512] +Discriminator.bands: + - [0.0, 0.1] + - [0.1, 0.25] + - [0.25, 0.5] + - [0.5, 0.75] + - [0.75, 1.0] + +# Optimization +AdamW.betas: [0.8, 0.99] +AdamW.lr: 0.0001 +ExponentialLR.gamma: 0.999996 + +amp: false +val_batch_size: 100 +device: cuda +num_iters: 400000 +save_iters: [10000, 50000, 100000, 200000] +valid_freq: 1000 +sample_freq: 10000 +num_workers: 32 +val_idx: [0, 1, 2, 3, 4, 5, 6, 7] +seed: 0 +lambdas: + mel/loss: 15.0 + adv/feat_loss: 2.0 + adv/gen_loss: 1.0 + vq/commitment_loss: 0.25 + vq/codebook_loss: 1.0 + +VolumeNorm.db: [const, -16] + +# Transforms +build_transform.preprocess: + - Identity +build_transform.augment_prob: 0.0 +build_transform.augment: + - Identity +build_transform.postprocess: + - VolumeNorm + - RescaleAudio + - ShiftPhase + +# Loss setup +MultiScaleSTFTLoss.window_lengths: [2048, 512] +MelSpectrogramLoss.n_mels: [5, 10, 20, 40, 80, 160, 320] +MelSpectrogramLoss.window_lengths: [32, 64, 128, 256, 512, 1024, 2048] +MelSpectrogramLoss.mel_fmin: [0, 0, 0, 0, 0, 0, 0] +MelSpectrogramLoss.mel_fmax: [null, null, null, null, null, null, null] +MelSpectrogramLoss.pow: 1.0 +MelSpectrogramLoss.clamp_eps: 1.0e-5 +MelSpectrogramLoss.mag_weight: 0.0 + +# Data +batch_size: 72 +train/AudioDataset.duration: 0.38 +train/AudioDataset.n_examples: 10000000 + +val/AudioDataset.duration: 5.0 +val/build_transform.augment_prob: 1.0 +val/AudioDataset.n_examples: 250 + +test/AudioDataset.duration: 
10.0 +test/build_transform.augment_prob: 1.0 +test/AudioDataset.n_examples: 1000 + +AudioLoader.shuffle: true +AudioDataset.without_replacement: true + +train/build_dataset.folders: + speech_fb: + - /data/daps/train + speech_hq: + - /data/vctk + - /data/vocalset + - /data/read_speech + - /data/french_speech + speech_uq: + - /data/emotional_speech/ + - /data/common_voice/ + - /data/german_speech/ + - /data/russian_speech/ + - /data/spanish_speech/ + music_hq: + - /data/musdb/train + music_uq: + - /data/jamendo + general: + - /data/audioset/data/unbalanced_train_segments/ + - /data/audioset/data/balanced_train_segments/ + +val/build_dataset.folders: + speech_hq: + - /data/daps/val + music_hq: + - /data/musdb/test + general: + - /data/audioset/data/eval_segments/ + +test/build_dataset.folders: + speech_hq: + - /data/daps/test + music_hq: + - /data/musdb/test + general: + - /data/audioset/data/eval_segments/ diff --git a/dac/__init__.py b/dac/__init__.py index 231ebbc..7f988ef 100644 --- a/dac/__init__.py +++ b/dac/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.0.4" +__version__ = "0.0.5" # preserved here for legacy reasons __model_version__ = "latest" diff --git a/dac/utils/__init__.py b/dac/utils/__init__.py index 3693b82..c4def3c 100644 --- a/dac/utils/__init__.py +++ b/dac/utils/__init__.py @@ -11,6 +11,7 @@ __MODEL_LATEST_TAGS__ = { "44khz": "0.0.1", "24khz": "0.0.4", + "16khz": "0.0.5", } __MODEL_URLS__ = { @@ -22,6 +23,10 @@ "24khz", "0.0.4", ): "https://github.com/descriptinc/descript-audio-codec/releases/download/0.0.4/weights_24khz.pth", + ( + "16khz", + "0.0.5", + ): "https://github.com/descriptinc/descript-audio-codec/releases/download/0.0.5/weights_16khz.pth", } @@ -35,7 +40,7 @@ def ensure_default_model(tag: str = "latest", model_type: str = "44khz"): tag : str The tag of the model to download. Defaults to "latest". model_type : str - The type of model to download. Must be one of "44khz" or "24khz". Defaults to "44khz". + The type of model to download. 
Must be one of "44khz", "24khz", or "16khz". Defaults to "44khz". Returns ------- @@ -48,7 +53,8 @@ def ensure_default_model(tag: str = "latest", model_type: str = "44khz"): assert model_type in [ "44khz", "24khz", - ], "model_type must be one of '44khz' or '24khz'" + "16khz", + ], "model_type must be one of '44khz', '24khz', or '16khz'" if tag == "latest": tag = __MODEL_LATEST_TAGS__[model_type] diff --git a/dac/utils/decode.py b/dac/utils/decode.py index 69bdccd..c8f32ef 100644 --- a/dac/utils/decode.py +++ b/dac/utils/decode.py @@ -122,7 +122,7 @@ def decode( device : str, optional Device to use, by default "cuda". If "cpu", the model will be loaded on the CPU. model_type : str, optional - The type of model to download. Must be one of "44khz" or "24khz". Defaults to "44khz". Ignored if `weights_path` is specified. + The type of model to download. Must be one of "44khz", "24khz", or "16khz". Defaults to "44khz". Ignored if `weights_path` is specified. """ generator = load_model( tag=model_tag, diff --git a/dac/utils/encode.py b/dac/utils/encode.py index f45912e..0f1f620 100644 --- a/dac/utils/encode.py +++ b/dac/utils/encode.py @@ -146,7 +146,7 @@ def encode( device : str, optional Device to use, by default "cuda" model_type : str, optional - The type of model to download. Must be one of "44khz" or "24khz". Defaults to "44khz". Ignored if `weights_path` is specified. + The type of model to download. Must be one of "44khz", "24khz", or "16khz". Defaults to "44khz". Ignored if `weights_path` is specified. 
""" generator = load_model( tag=model_tag, diff --git a/setup.py b/setup.py index 490d0ec..68cac4d 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ setup( name="descript-audio-codec", - version="0.0.4", + version="0.0.5", classifiers=[ "Intended Audience :: Developers", "Natural Language :: English", diff --git a/tests/test_cli.py b/tests/test_cli.py index 60c5215..a39cc26 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -30,7 +30,7 @@ def teardown_module(module): subprocess.check_output(["rm", "-rf", f"{repo_root}/tests/assets"]) -@pytest.mark.parametrize("model_type", ["44khz", "24khz"]) +@pytest.mark.parametrize("model_type", ["44khz", "24khz", "16khz"]) def test_reconstruction(model_type): # Test encoding input_dir = Path(__file__).parent / "assets" / "input"