
Commit 20c8cfb

Upload pretrained model and synthesized samples
1 parent e763437 commit 20c8cfb

16 files changed (+133, -25 lines)

.gitignore

Lines changed: 8 additions & 3 deletions
```diff
@@ -1,9 +1,14 @@
 # Projects
-datasets/*Speech*/
-voienc/
+.vscode/
+datasets/*Speech*
 logdir/
+outputs/
+voienc/
 wandb/
-.vscode/
+
+# Git
+.git/
+.gitignore
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
```

README.md

Lines changed: 13 additions & 14 deletions
````diff
@@ -6,27 +6,26 @@
 
 ```
 .
-|--- config              # configuration files
+|--- config/             # configuration files
      |--- default.yaml
      |--- ...
-|--- datasets            # data processing
-|--- helpers             # helper classes
+|--- datasets/           # data processing
+|--- helpers/            # helper classes
      |--- trainer.py
      |--- synthesizer.py
      |--- ...
-|--- logdir              # training output directory
-|--- losses              # loss functions
-|--- melgan              # vocoder
+|--- logdir/             # training output directory
+|--- losses/             # loss functions
+|--- melgan/             # vocoder
      |--- generator.py
      |--- ...
-|--- models              # synthesis models
+|--- models/             # synthesis models
      |--- layers.py
      |--- duration.py
      |--- parallel.py
-|--- samples             # synthesized samples
-     |--- audio
-     |--- text
-|--- utils               # utility functions
+|--- pretrained/         # pretrained models (LJSpeech dataset)
+|--- samples/            # synthesized samples
+|--- utils/              # utility functions
 |--- LICENSE
 |--- prepare-dataset.py  # preparation script
 |--- extract-duration.py
@@ -39,11 +38,11 @@
 
 ## Synthesized Samples
 
-Some synthesized samples can be found [here](https://github.com/atomicoo/ParallelTTS/tree/main/samples/audio).
+Some synthesized samples can be found [here](https://github.com/atomicoo/ParallelTTS/tree/main/samples/).
 
 ## Pretrained Models
 
-Some pretrained models can be found [here](https://github.com/atomicoo/ParallelTTS/tree/main/logdir).
+Some pretrained models can be found [here](https://github.com/atomicoo/ParallelTTS/tree/main/pretrained/).
 
 ## Quick Start
 
@@ -97,5 +96,5 @@ $ python extract-duration.py
 $ python train-parallel.py
 ```
 
-Use `--ground_truth` to choose whether to train on ground-truth spectrograms.
+Use `--ground_truth` to choose whether to train the model on ground-truth spectrograms.
 
````
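For reference, the boolean flag mentioned in the last hunk would be passed like this, assuming a standard argparse `store_true` flag (the parser itself is not shown in this commit):

```
$ python train-parallel.py --ground_truth
```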

config/default.yaml

Lines changed: 2 additions & 2 deletions
```diff
@@ -70,5 +70,5 @@ trainer:
   disable_progress_bar: false
   logdir: './logdir'
 synthesizer:
-  inputs_file_path: './samples/text/synthesize.txt'
-  outputs_dir: './samples/audio'
+  inputs_file_path: './outputs/text/synthesize.txt'
+  outputs_dir: './outputs/audio'
```
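These paths are only fallbacks: the `synthesize.py` changes later in this commit read them as `args.input_texts or hparams.synthesizer.inputs_file_path`, so per-run overrides appear to be supported. A hypothetical invocation, assuming the argparse flags mirror those attribute names (not confirmed by this commit):

```
$ python synthesize.py --input_texts ./outputs/text/synthesize.txt --outputs_dir ./outputs/audio
```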

config/jpspeech.yaml

Lines changed: 74 additions & 0 deletions
```diff
@@ -0,0 +1,74 @@
+data:
+  datasets_path: './datasets'
+  dataset: 'jpspeech'
+  dataset_dir: 'JPSpeech-1.1'
+text:
+  graphemes: &gs !!python/object/apply:eval ['list("abcdefghijklmnopqrstuvwxyz1234567890")']
+  phonemes: &ps !!python/object/apply:eval ['["X"] * 70']
+  specials: &sp !!python/object/apply:eval ['["<pad>", "<unk>"]']
+  punctuations: &pt !!python/object/apply:eval ['[".", ",", "?", "!", " ", "-"]']
+  units_list: &ul !!python/object/apply:eval ['us+sp+pt', {'us': *gs, 'sp': *sp, 'pt': *pt}]
+  use_phonemes: &up false
+audio:
+  n_mel_channels: &nm 80
+  filter_length: 1024
+  hop_length: 256  # WARNING: this can't be changed.
+  win_length: 1024
+  sampling_rate: &sr 48000
+  segment_length: *sr
+  pad_short: 2000
+  mel_fmin: 0.0
+  mel_fmax: 8000.0
+  # Precomputed statistics of log-mel spectrograms for the speech dataset
+  spec_mean: -5.522    # for JPSpeech dataset
+  spec_std: 2.063      # for JPSpeech dataset
+  spec_min: -11.5129   # for JPSpeech dataset
+  spec_max: 2.0584     # for JPSpeech dataset
+  # Others
+  normalize: false
+  reduction_rate: 4
+parallel:
+  ground_truth: false
+  out_channels: *nm  # equal to ${audio.n_mel_channels}
+  alphabet_size: !!python/object/apply:eval ['len(ul)', {'ul': *ul}]
+  channels: 128
+  enc_kernel_size: 4
+  dec_kernel_size: 4
+  enc_dilations: !!python/object/apply:eval ['4 * [1,2,4] + [1]']    # receptive field is max 15
+  dec_dilations: !!python/object/apply:eval ['4 * [1,2,4,8] + [1]']  # receptive field is max 32
+  normalize: FreqNorm        # 'freq', 'layer', 'batch'
+  activation: torch.nn.ReLU  # 'relu', 'linear', 'sigmoid'
+  final_activation: torch.nn.Identity
+  pos_mode: 'duration'       # 'standard', 'duration'
+  interpolate: false         # true
+  separate_duration_grad: true
+  speaker_dim: 128           # speaker embedding dim
+duration:
+  out_channels: *nm  # equal to ${audio.n_mel_channels}
+  alphabet_size: !!python/object/apply:eval ['len(ul)', {'ul': *ul}]
+  channels: 40
+  hidden_channels: 80
+  kernel_size: 3
+  text_enc_dilations: !!python/object/apply:eval ['2 * [1,3,9,27] + [1,1]']
+  spec_enc_dilations: !!python/object/apply:eval ['2 * [1,3,9,27] + [1,1]']
+  spec_dec_dilations: !!python/object/apply:eval ['2 * [1,3,9,27] + [1,1]']
+  normalize: torch.nn.Linear  # 'layer' # 'freq', 'layer', 'batch'
+  activation: torch.nn.ReLU   # 'relu', 'linear', 'sigmoid'
+  final_activation: torch.nn.Sigmoid
+  att_noise: 0.1
+  att_hidden_channels: 80
+  pos_mode: 'duration'        # 'standard', 'duration'
+  # Spectrogram augmentation options
+  enable_augment: true
+  noise: 0.01         # 1. add normal noise to input spectrograms
+  feed_repeat: 2      # 2. feed spectrograms through the model `feed_repeat` times; use the degraded output as training input
+  feed_ratio: 0.5     #    how many items in a batch are degraded
+  replace_ratio: 0.1  # 3. replace random spectrogram frames with random noise
+melgan:
+  checkpoint: 'melgan-epoch3200.pth'
+trainer:
+  disable_progress_bar: false
+  logdir: './logdir'
+synthesizer:
+  inputs_file_path: './samples/text/synthesize-jp.txt'
+  outputs_dir: './samples/audio'
```
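The `!!python/object/apply:eval` tags used throughout this file compute values (unit lists, alphabet sizes) when the YAML is loaded. A minimal sketch of that mechanism, not this repo's loading code: PyYAML only resolves such tags with an unsafe loader, because they execute arbitrary Python, so configs like this must come from a trusted source.

```python
# Minimal sketch of how `!!python/object/apply:eval` resolves at load time.
# NOTE: requires yaml.UnsafeLoader -- the tag runs arbitrary Python.
import yaml

snippet = """
graphemes: &gs !!python/object/apply:eval ['list("abc")']
alphabet_size: !!python/object/apply:eval ['len(gs)', {'gs': *gs}]
"""

cfg = yaml.load(snippet, Loader=yaml.UnsafeLoader)
print(cfg['graphemes'])      # ['a', 'b', 'c']
print(cfg['alphabet_size'])  # 3
```

The `spec_mean`/`spec_std`/`spec_min`/`spec_max` entries are dataset-level statistics over log-mel spectrograms. How they were computed is not part of this commit; a sketch of one way to derive them, assuming precomputed mel spectrograms stored as `.npy` files under a hypothetical `mels/` directory:

```python
# Sketch: aggregate mean/std/min/max over all log-mel frames in a dataset.
import glob
import numpy as np

total, total_sq, count = 0.0, 0.0, 0
lo, hi = np.inf, -np.inf
for path in glob.glob('./datasets/JPSpeech-1.1/mels/*.npy'):  # hypothetical layout
    mel = np.load(path)  # (n_mel_channels, frames), already log-scaled
    total += mel.sum()
    total_sq += np.square(mel, dtype=np.float64).sum()
    count += mel.size
    lo, hi = min(lo, mel.min()), max(hi, mel.max())

mean = total / count
std = (total_sq / count - mean ** 2) ** 0.5
print(f'spec_mean: {mean:.3f}, spec_std: {std:.3f}, spec_min: {lo:.4f}, spec_max: {hi:.4f}')
```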

models/layers.py

Lines changed: 23 additions & 0 deletions
```diff
@@ -67,6 +67,29 @@ def forward(self, x):
         return x + self.blocks(x)
 
 
+class GatedConvBlock(nn.Module):
+    """(conv -> activation -> norm) repeated (n-1) times, then a gated linear unit."""
+
+    def __init__(self, channels, kernel_size, dilation, n=2, norm=FreqNorm, activation=nn.ReLU, causal=True):
+        super(GatedConvBlock, self).__init__()
+        self.blocks = [
+            nn.Sequential(
+                Conv1d(channels, channels, kernel_size, dilation=dilation),
+                ZeroTemporalPad(kernel_size, dilation, causal),
+                activation(),
+                norm(channels),  # normalize after activation; with ReLU first, half of the neurons would be dead
+            )
+            for _ in range(n - 1)
+        ]
+        # nn.GLU(dim=1) halves the channel dimension, so project to 2*channels first
+        self.blocks.extend([Conv1d(channels, 2 * channels, kernel_size, dilation=dilation),
+                            ZeroTemporalPad(kernel_size, dilation, causal), nn.GLU(dim=1)])
+        self.blocks = nn.Sequential(*self.blocks)
+
+    def forward(self, x):
+        return x + self.blocks(x)
+
+
 class ScaledDotAttention(nn.Module):
 
     def __init__(self, in_channels, hidden_channels, out_channels, noise=0, normalize=False, dropout=False):
```
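A quick shape check of the new block (hypothetical usage; `Conv1d`, `ZeroTemporalPad`, and `FreqNorm` are assumed to be the helpers already defined in `models/layers.py`, with `ZeroTemporalPad` preserving sequence length as in the other residual blocks):

```python
import torch
from models.layers import GatedConvBlock

block = GatedConvBlock(channels=128, kernel_size=4, dilation=1, n=2)
x = torch.randn(8, 128, 100)  # (batch, channels, time)
y = block(x)                  # GLU gates the projected 2*128 channels back to 128
print(y.shape)                # expected: torch.Size([8, 128, 100])
```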
(unnamed binary file, 16.3 MB, not shown)

(unnamed binary file, 16.6 MB, not shown)

samples/synthesize.txt

Lines changed: 5 additions & 0 deletions
```diff
@@ -0,0 +1,5 @@
+The complexity of modern TTS designs thus leads to substantial engineering efforts when building a new system.
+To install precompiled package of eSpeak NG on Linux, use standard package manager of your distribution.
+A text-to-speech synthesis system typically consists of multiple stages, such as a text analysis frontend, an acoustic model and an audio synthesis module.
+These components are based on extensive domain expertise and are laborious to design.
+Building these components often requires extensive domain expertise and may contain brittle design choices.
```

samples/text/synthesize-mb.txt

Lines changed: 0 additions & 3 deletions
This file was deleted.

samples/text/synthesize.txt

Lines changed: 0 additions & 3 deletions
This file was deleted.

samples/xxxx-xx-xx_001.wav (810 KB, binary file not shown)
samples/xxxx-xx-xx_002.wav (810 KB, binary file not shown)
samples/xxxx-xx-xx_003.wav (810 KB, binary file not shown)
samples/xxxx-xx-xx_004.wav (810 KB, binary file not shown)
samples/xxxx-xx-xx_005.wav (810 KB, binary file not shown)

synthesize.py

Lines changed: 8 additions & 0 deletions
```diff
@@ -58,10 +58,13 @@
     device=device
 )
 
+print('Synthesizing...')
+since = time.time()
 text_file = args.input_texts or hparams.synthesizer.inputs_file_path
 with open(text_file, 'r', encoding='utf-8') as fr:
     texts = fr.read().strip().split('\n')
 melspecs = synthesizer.inference(texts)
+print(f"Inferred {len(texts)} spectrograms, total elapsed {time.time()-since:.3f}s.")
 
 vocoder = Generator(hparams.audio.n_mel_channels).to(device)
 vocoder.eval(inference=True)
@@ -70,11 +73,16 @@
 vocoder.load_state_dict(torch.load(vocoder_checkpoint, map_location=device))
 
 waves = vocoder(melspecs).squeeze(1)
+print(f"Generated {len(texts)} waveforms, total elapsed {time.time()-since:.3f}s.")
 
+print('Saving audio...')
 outputs_dir = args.outputs_dir or hparams.synthesizer.outputs_dir
 os.makedirs(outputs_dir, exist_ok=True)
 for i, wav in enumerate(waves, start=1):
     wav = wav.cpu().detach().numpy()
     filename = osp.join(outputs_dir, f"{time.strftime('%Y-%m-%d')}_{i:03d}.wav")
     write(filename, hparams.audio.sampling_rate, wav)
+print(f"Audio saved to {outputs_dir}.")
+
+print(f'Done. ({time.time()-since:.3f}s)')
 
```
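The elapsed-time reporting added here repeats the `time.time() - since` pattern; a small context manager could factor it out. A sketch, not part of this commit:

```python
import time
from contextlib import contextmanager

@contextmanager
def timed(label):
    """Print how long the wrapped block took."""
    since = time.time()
    yield
    print(f'{label}: {time.time() - since:.3f}s')

# Hypothetical usage:
# with timed('Inference'):
#     melspecs = synthesizer.inference(texts)
```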
8088
