
Commit 20c8cfb

Upload pretrained model and synthesized samples
1 parent e763437 commit 20c8cfb

16 files changed (+133, -25 lines)

.gitignore

Lines changed: 8 additions & 3 deletions
```diff
@@ -1,9 +1,14 @@
 # Projects
-datasets/*Speech*/
-voienc/
+.vscode/
+datasets/*Speech*
 logdir/
+outputs/
+voienc/
 wandb/
-.vscode/
+
+# Git
+.git/
+.gitignore
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
```

README.md

Lines changed: 13 additions & 14 deletions
````diff
@@ -6,27 +6,26 @@
 
 ```
 .
-|--- config              # configuration files
+|--- config/             # configuration files
      |--- default.yaml
      |--- ...
-|--- datasets            # data processing
-|--- helpers             # helper classes
+|--- datasets/           # data processing
+|--- helpers/            # helper classes
      |--- trainer.py
      |--- synthesizer.py
      |--- ...
-|--- logdir              # training output directory
-|--- losses              # loss functions
-|--- melgan              # vocoder
+|--- logdir/             # training output directory
+|--- losses/             # loss functions
+|--- melgan/             # vocoder
      |--- generator.py
      |--- ...
-|--- models              # synthesis models
+|--- models/             # synthesis models
      |--- layers.py
      |--- duration.py
      |--- parallel.py
-|--- samples             # synthesized samples
-     |--- audio
-     |--- text
-|--- utils               # utility functions
+|--- pretrained/         # pretrained models (LJSpeech dataset)
+|--- samples/            # synthesized samples
+|--- utils/              # utility functions
 |--- LICENSE
 |--- prepare-dataset.py  # preparation script
 |--- extract-duration.py
@@ -39,11 +38,11 @@
 
 ## Synthesized Samples
 
-Some synthesized samples can be found [here](https://github.com/atomicoo/ParallelTTS/tree/main/samples/audio).
+Some synthesized samples can be found [here](https://github.com/atomicoo/ParallelTTS/tree/main/samples/).
 
 ## Pretrained Models
 
-Some pretrained models can be found [here](https://github.com/atomicoo/ParallelTTS/tree/main/logdir).
+Some pretrained models can be found [here](https://github.com/atomicoo/ParallelTTS/tree/main/pretrained/).
 
 ## Quick Start
 
@@ -97,5 +96,5 @@ $ python extract-duration.py
 $ python train-parallel.py
 ```
 
-Use `--ground_truth` to choose whether to train on ground-truth spectrograms.
+Use `--ground_truth` to choose whether to train the model on ground-truth spectrograms.
 
````
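For reference, the boolean flag mentioned in the last hunk would be passed like this, assuming a standard argparse `store_true` flag (the parser itself is not shown in this commit):

```
$ python train-parallel.py --ground_truth
```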

config/default.yaml

Lines changed: 2 additions & 2 deletions
```diff
@@ -70,5 +70,5 @@ trainer:
   disable_progress_bar: false
   logdir: './logdir'
 synthesizer:
-  inputs_file_path: './samples/text/synthesize.txt'
-  outputs_dir: './samples/audio'
+  inputs_file_path: './outputs/text/synthesize.txt'
+  outputs_dir: './outputs/audio'
```
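These paths are only fallbacks: the `synthesize.py` changes later in this commit read them as `args.input_texts or hparams.synthesizer.inputs_file_path`, so per-run overrides appear to be supported. A hypothetical invocation, assuming the argparse flags mirror those attribute names (not confirmed by this commit):

```
$ python synthesize.py --input_texts ./outputs/text/synthesize.txt --outputs_dir ./outputs/audio
```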

config/jpspeech.yaml

Lines changed: 74 additions & 0 deletions
```diff
@@ -0,0 +1,74 @@
+data:
+  datasets_path: './datasets'
+  dataset: 'jpspeech'
+  dataset_dir: 'JPSpeech-1.1'
+text:
+  graphemes: &gs !!python/object/apply:eval ['list("abcdefghijklmnopqrstuvwxyz1234567890")']
+  phonemes: &ps !!python/object/apply:eval ['["X"] * 70']
+  specials: &sp !!python/object/apply:eval ['["<pad>", "<unk>"]']
+  punctuations: &pt !!python/object/apply:eval ['[".", ",", "?", "!", " ", "-"]']
+  units_list: &ul !!python/object/apply:eval ['us+sp+pt', {'us': *gs, 'sp': *sp, 'pt': *pt}]
+  use_phonemes: &up false
+audio:
+  n_mel_channels: &nm 80
+  filter_length: 1024
+  hop_length: 256  # WARNING: this can't be changed.
+  win_length: 1024
+  sampling_rate: &sr 48000
+  segment_length: *sr
+  pad_short: 2000
+  mel_fmin: 0.0
+  mel_fmax: 8000.0
+  # Precomputed statistics of log-mel spectrograms for the speech dataset
+  spec_mean: -5.522    # for JPSpeech dataset
+  spec_std: 2.063      # for JPSpeech dataset
+  spec_min: -11.5129   # for JPSpeech dataset
+  spec_max: 2.0584     # for JPSpeech dataset
+  # Others
+  normalize: false
+  reduction_rate: 4
+parallel:
+  ground_truth: false
+  out_channels: *nm  # equal to ${audio.n_mel_channels}
+  alphabet_size: !!python/object/apply:eval ['len(ul)', {'ul': *ul}]
+  channels: 128
+  enc_kernel_size: 4
+  dec_kernel_size: 4
+  enc_dilations: !!python/object/apply:eval ['4 * [1,2,4] + [1]']    # receptive field is max 15
+  dec_dilations: !!python/object/apply:eval ['4 * [1,2,4,8] + [1]']  # receptive field is max 32
+  normalize: FreqNorm        # 'freq', 'layer', 'batch'
+  activation: torch.nn.ReLU  # 'relu', 'linear', 'sigmoid'
+  final_activation: torch.nn.Identity
+  pos_mode: 'duration'       # 'standard', 'duration'
+  interpolate: false         # true
+  separate_duration_grad: true
+  speaker_dim: 128           # speaker embedding dim
+duration:
+  out_channels: *nm  # equal to ${audio.n_mel_channels}
+  alphabet_size: !!python/object/apply:eval ['len(ul)', {'ul': *ul}]
+  channels: 40
+  hidden_channels: 80
+  kernel_size: 3
+  text_enc_dilations: !!python/object/apply:eval ['2 * [1,3,9,27] + [1,1]']
+  spec_enc_dilations: !!python/object/apply:eval ['2 * [1,3,9,27] + [1,1]']
+  spec_dec_dilations: !!python/object/apply:eval ['2 * [1,3,9,27] + [1,1]']
+  normalize: torch.nn.Linear  # 'layer' # 'freq', 'layer', 'batch'
+  activation: torch.nn.ReLU   # 'relu', 'linear', 'sigmoid'
+  final_activation: torch.nn.Sigmoid
+  att_noise: 0.1
+  att_hidden_channels: 80
+  pos_mode: 'duration'        # 'standard', 'duration'
+  # Spectrogram augmentation options
+  enable_augment: true
+  noise: 0.01         # 1. add normal noise to input spectrograms
+  feed_repeat: 2      # 2. feed spectrograms through the model `feed_repeat` times; use the degraded output as training input
+  feed_ratio: 0.5     #    how many items in a batch are degraded
+  replace_ratio: 0.1  # 3. replace random spectrogram frames with random noise
+melgan:
+  checkpoint: 'melgan-epoch3200.pth'
+trainer:
+  disable_progress_bar: false
+  logdir: './logdir'
+synthesizer:
+  inputs_file_path: './samples/text/synthesize-jp.txt'
+  outputs_dir: './samples/audio'
```
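The `!!python/object/apply:eval` tags used throughout this file compute values (unit lists, alphabet sizes) when the YAML is loaded. A minimal sketch of that mechanism, not this repo's loading code: PyYAML only resolves such tags with an unsafe loader, because they execute arbitrary Python, so configs like this must come from a trusted source.

```python
# Minimal sketch of how `!!python/object/apply:eval` resolves at load time.
# NOTE: requires yaml.UnsafeLoader -- the tag runs arbitrary Python.
import yaml

snippet = """
graphemes: &gs !!python/object/apply:eval ['list("abc")']
alphabet_size: !!python/object/apply:eval ['len(gs)', {'gs': *gs}]
"""

cfg = yaml.load(snippet, Loader=yaml.UnsafeLoader)
print(cfg['graphemes'])      # ['a', 'b', 'c']
print(cfg['alphabet_size'])  # 3
```

The `spec_mean`/`spec_std`/`spec_min`/`spec_max` entries are dataset-level statistics over log-mel spectrograms. How they were computed is not part of this commit; a sketch of one way to derive them, assuming precomputed mel spectrograms stored as `.npy` files under a hypothetical `mels/` directory:

```python
# Sketch: aggregate mean/std/min/max over all log-mel frames in a dataset.
import glob
import numpy as np

total, total_sq, count = 0.0, 0.0, 0
lo, hi = np.inf, -np.inf
for path in glob.glob('./datasets/JPSpeech-1.1/mels/*.npy'):  # hypothetical layout
    mel = np.load(path)  # (n_mel_channels, frames), already log-scaled
    total += mel.sum()
    total_sq += np.square(mel, dtype=np.float64).sum()
    count += mel.size
    lo, hi = min(lo, mel.min()), max(hi, mel.max())

mean = total / count
std = (total_sq / count - mean ** 2) ** 0.5
print(f'spec_mean: {mean:.3f}, spec_std: {std:.3f}, spec_min: {lo:.4f}, spec_max: {hi:.4f}')
```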

models/layers.py

Lines changed: 23 additions & 0 deletions
```diff
@@ -67,6 +67,29 @@ def forward(self, x):
         return x + self.blocks(x)
 
 
+class GatedConvBlock(nn.Module):
+    """(conv -> activation -> norm) repeated (n-1) times, then a gated linear unit."""
+
+    def __init__(self, channels, kernel_size, dilation, n=2, norm=FreqNorm, activation=nn.ReLU, causal=True):
+        super(GatedConvBlock, self).__init__()
+        self.blocks = [
+            nn.Sequential(
+                Conv1d(channels, channels, kernel_size, dilation=dilation),
+                ZeroTemporalPad(kernel_size, dilation, causal),
+                activation(),
+                norm(channels),  # normalize after activation; with ReLU first, half of the neurons would be dead
+            )
+            for _ in range(n - 1)
+        ]
+        # nn.GLU(dim=1) halves the channel dimension, so project to 2*channels first
+        self.blocks.extend([Conv1d(channels, 2 * channels, kernel_size, dilation=dilation),
+                            ZeroTemporalPad(kernel_size, dilation, causal), nn.GLU(dim=1)])
+        self.blocks = nn.Sequential(*self.blocks)
+
+    def forward(self, x):
+        return x + self.blocks(x)
+
+
 class ScaledDotAttention(nn.Module):
 
     def __init__(self, in_channels, hidden_channels, out_channels, noise=0, normalize=False, dropout=False):
```
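A quick shape check of the new block (hypothetical usage; `Conv1d`, `ZeroTemporalPad`, and `FreqNorm` are assumed to be the helpers already defined in `models/layers.py`, with `ZeroTemporalPad` preserving sequence length as in the other residual blocks):

```python
import torch
from models.layers import GatedConvBlock

block = GatedConvBlock(channels=128, kernel_size=4, dilation=1, n=2)
x = torch.randn(8, 128, 100)  # (batch, channels, time)
y = block(x)                  # GLU gates the projected 2*128 channels back to 128
print(y.shape)                # expected: torch.Size([8, 128, 100])
```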
(unnamed binary file, 16.3 MB, not shown)

(unnamed binary file, 16.6 MB, not shown)

samples/synthesize.txt

Lines changed: 5 additions & 0 deletions
```diff
@@ -0,0 +1,5 @@
+The complexity of modern TTS designs thus leads to substantial engineering efforts when building a new system.
+To install precompiled package of eSpeak NG on Linux, use standard package manager of your distribution.
+A text-to-speech synthesis system typically consists of multiple stages, such as a text analysis frontend, an acoustic model and an audio synthesis module.
+These components are based on extensive domain expertise and are laborious to design.
+Building these components often requires extensive domain expertise and may contain brittle design choices.
```

samples/text/synthesize-mb.txt

Lines changed: 0 additions & 3 deletions
This file was deleted.

samples/text/synthesize.txt

Lines changed: 0 additions & 3 deletions
This file was deleted.

samples/xxxx-xx-xx_001.wav (810 KB, binary file not shown)
samples/xxxx-xx-xx_002.wav (810 KB, binary file not shown)
samples/xxxx-xx-xx_003.wav (810 KB, binary file not shown)
samples/xxxx-xx-xx_004.wav (810 KB, binary file not shown)
samples/xxxx-xx-xx_005.wav (810 KB, binary file not shown)

synthesize.py

Lines changed: 8 additions & 0 deletions
```diff
@@ -58,10 +58,13 @@
     device=device
 )
 
+print('Synthesizing...')
+since = time.time()
 text_file = args.input_texts or hparams.synthesizer.inputs_file_path
 with open(text_file, 'r', encoding='utf-8') as fr:
     texts = fr.read().strip().split('\n')
 melspecs = synthesizer.inference(texts)
+print(f"Inferred {len(texts)} spectrograms, total elapsed {time.time()-since:.3f}s.")
 
 vocoder = Generator(hparams.audio.n_mel_channels).to(device)
 vocoder.eval(inference=True)
@@ -70,11 +73,16 @@
 vocoder.load_state_dict(torch.load(vocoder_checkpoint, map_location=device))
 
 waves = vocoder(melspecs).squeeze(1)
+print(f"Generated {len(texts)} waveforms, total elapsed {time.time()-since:.3f}s.")
 
+print('Saving audio...')
 outputs_dir = args.outputs_dir or hparams.synthesizer.outputs_dir
 os.makedirs(outputs_dir, exist_ok=True)
 for i, wav in enumerate(waves, start=1):
     wav = wav.cpu().detach().numpy()
     filename = osp.join(outputs_dir, f"{time.strftime('%Y-%m-%d')}_{i:03d}.wav")
     write(filename, hparams.audio.sampling_rate, wav)
+print(f"Audio saved to {outputs_dir}.")
+
+print(f'Done. ({time.time()-since:.3f}s)')
 
```
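The elapsed-time reporting added here repeats the `time.time() - since` pattern; a small context manager could factor it out. A sketch, not part of this commit:

```python
import time
from contextlib import contextmanager

@contextmanager
def timed(label):
    """Print how long the wrapped block took."""
    since = time.time()
    yield
    print(f'{label}: {time.time() - since:.3f}s')

# Hypothetical usage:
# with timed('Inference'):
#     melspecs = synthesizer.inference(texts)
```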
8088
