
Commit 7dbb0ba

Diff Update
1 parent 8cc7645 commit 7dbb0ba

9 files changed: 128 additions, 83 deletions

diffusion/data_loaders.py

Lines changed: 3 additions & 3 deletions
@@ -145,10 +145,10 @@ def __init__(
         if n_spk is not None and n_spk > 1:
             spk_name = name_ext.split("/")[-2]
             spk_id = spk[spk_name] if spk_name in spk else 0
-            if spk_id < 1 or spk_id > n_spk:
-                raise ValueError(' [x] Muiti-speaker traing error : spk_id must be a positive integer from 1 to n_spk ')
+            if spk_id < 0 or spk_id >= n_spk:
+                raise ValueError(' [x] Multi-speaker training error: spk_id must be an integer from 0 to n_spk-1 ')
         else:
-            spk_id = 1
+            spk_id = 0
         spk_id = torch.LongTensor(np.array([spk_id])).to(device)

         if load_all_data:
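
Note: this hunk moves the dataset's speaker IDs from a 1-based to a 0-based convention, so an ID can index an embedding table directly. A minimal sketch of the new convention (the mapping and path below are illustrative, not from the repo's config):

import numpy as np
import torch

n_spk = 2
spk = {"speaker_a": 0, "speaker_b": 1}  # hypothetical config mapping

spk_name = "dataset/speaker_a/000.wav".split("/")[-2]
spk_id = spk[spk_name] if spk_name in spk else 0
if spk_id < 0 or spk_id >= n_spk:  # valid range is now 0 .. n_spk-1
    raise ValueError("spk_id must be an integer from 0 to n_spk-1")
spk_id = torch.LongTensor(np.array([spk_id]))  # row index into nn.Embedding(n_spk, dim)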

diffusion/solver.py

Lines changed: 7 additions & 6 deletions
@@ -22,7 +22,8 @@ def test(args, model, vocoder, loader_test, saver):
     # run
     with torch.no_grad():
         for bidx, data in enumerate(loader_test):
-            fn = data['name'][0]
+            fn = data['name'][0].split("/")[-1]
+            speaker = data['name'][0].split("/")[-2]
             print('--------')
             print('{}/{} - {}'.format(bidx, num_batches, fn))

@@ -65,16 +66,15 @@ def test(args, model, vocoder, loader_test, saver):
             test_loss += loss.item()

             # log mel
-            saver.log_spec(data['name'][0], data['mel'], mel)
+            saver.log_spec(f"{speaker}_{fn}.wav", data['mel'], mel)

-            # log audio
-            path_audio = os.path.join(args.data.valid_path, 'audio', data['name_ext'][0])
+            # log audio
+            path_audio = data['name_ext'][0]
             audio, sr = librosa.load(path_audio, sr=args.data.sampling_rate)
             if len(audio.shape) > 1:
                 audio = librosa.to_mono(audio)
             audio = torch.from_numpy(audio).unsqueeze(0).to(signal)
-            saver.log_audio({fn+'/gt.wav': audio, fn+'/pred.wav': signal})
-
+            saver.log_audio({f"{speaker}_{fn}_gt.wav": audio, f"{speaker}_{fn}_pred.wav": signal})
     # report
     test_loss /= args.train.batch_size
     test_loss /= num_batches
@@ -107,6 +107,7 @@ def train(args, initial_global_step, model, optimizer, scheduler, vocoder, loade
             dtype = torch.bfloat16
         else:
             raise ValueError(' [x] Unknown amp_dtype: ' + args.train.amp_dtype)
+    saver.log_info("epoch|batch_idx/num_batches|output_dir|batch/s|lr|time|step")
     for epoch in range(args.train.epochs):
         for batch_idx, data in enumerate(loader_train):
             saver.global_step_increment()
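
Note: validation samples are now logged under speaker-prefixed names derived from the relative dataset path, so multi-speaker runs no longer overwrite each other's logs. A quick sketch of the naming (the entry below is hypothetical):

name = "speaker_a/000"          # hypothetical entry from data['name']
fn = name.split("/")[-1]        # "000"
speaker = name.split("/")[-2]   # "speaker_a"
print(f"{speaker}_{fn}_gt.wav", f"{speaker}_{fn}_pred.wav")  # speaker_a_000_gt.wav speaker_a_000_pred.wav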

diffusion/unit2mel.py

Lines changed: 8 additions & 4 deletions
@@ -18,8 +18,12 @@ def __getattr__(*args):

 def load_model_vocoder(
         model_path,
-        device='cpu'):
-    config_file = os.path.join(os.path.split(model_path)[0], 'config.yaml')
+        device='cpu',
+        config_path=None
+        ):
+    if config_path is None: config_file = os.path.join(os.path.split(model_path)[0], 'config.yaml')
+    else: config_file = config_path
+
     with open(config_file, "r") as config:
         args = yaml.safe_load(config)
     args = DotDict(args)
@@ -85,9 +89,9 @@ def forward(self, units, f0, volume, spk_id = None, spk_mix_dict = None, aug_shi
         if spk_mix_dict is not None:
             for k, v in spk_mix_dict.items():
                 spk_id_torch = torch.LongTensor(np.array([[k]])).to(units.device)
-                x = x + v * self.spk_embed(spk_id_torch - 1)
+                x = x + v * self.spk_embed(spk_id_torch)
         else:
-            x = x + self.spk_embed(spk_id - 1)
+            x = x + self.spk_embed(spk_id)
         if self.aug_shift_embed is not None and aug_shift is not None:
             x = x + self.aug_shift_embed(aug_shift / 5)
         x = self.decoder(x, gt_spec=gt_spec, infer=infer, infer_speedup=infer_speedup, method=method, k_step=k_step, use_tqdm=use_tqdm)
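
Note: load_model_vocoder previously assumed config.yaml sat next to the checkpoint; the new config_path parameter makes that lookup overridable, and the spk_embed hunk drops the "- 1" offset to match the new zero-based speaker IDs. A usage sketch (the paths are the defaults used elsewhere in this commit, not guaranteed for any given setup):

from diffusion.unit2mel import load_model_vocoder

# default behaviour: config.yaml is resolved next to the checkpoint
model, vocoder, args = load_model_vocoder("logs/44k/diffusion/model_0.pt", device="cpu")

# new behaviour: point at an explicit config file instead
model, vocoder, args = load_model_vocoder(
    "logs/44k/diffusion/model_0.pt",
    device="cpu",
    config_path="logs/44k/diffusion/config.yaml",
)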

inference/infer_tool.py

Lines changed: 73 additions & 12 deletions
@@ -19,6 +19,9 @@
 import utils
 from models import SynthesizerTrn

+from diffusion.unit2mel import load_model_vocoder
+import yaml
+
 logging.getLogger('matplotlib').setLevel(logging.WARNING)

@@ -114,7 +117,11 @@ class Svc(object):
     def __init__(self, net_g_path, config_path,
                  device=None,
                  cluster_model_path="logs/44k/kmeans_10000.pt",
-                 nsf_hifigan_enhance = False
+                 nsf_hifigan_enhance = False,
+                 diffusion_model_path="logs/44k/diffusion/model_0.pt",
+                 diffusion_config_path="configs/diffusion.yaml",
+                 shallow_diffusion = False,
+                 only_diffusion = False,
                  ):
         self.net_g_path = net_g_path
         if device is None:
@@ -127,19 +134,32 @@ def __init__(self, net_g_path, config_path,
         self.hop_size = self.hps_ms.data.hop_length
         self.spk2id = self.hps_ms.spk
         self.nsf_hifigan_enhance = nsf_hifigan_enhance
+        self.only_diffusion = only_diffusion
+        self.shallow_diffusion = shallow_diffusion
         try:
             self.speech_encoder = self.hps_ms.model.speech_encoder
         except Exception as e:
             self.speech_encoder = 'vec768l12'
-        # load hubert
-        self.hubert_model = utils.get_speech_encoder(self.speech_encoder,device=self.dev)
-        self.load_model()
+
+        if self.shallow_diffusion or self.only_diffusion:
+            self.diffusion_model,self.vocoder,self.diffusion_args = load_model_vocoder(diffusion_model_path,self.dev,config_path=diffusion_config_path)
+        # load hubert and model
+        if not self.only_diffusion:
+            self.load_model()
+            self.hubert_model = utils.get_speech_encoder(self.speech_encoder,device=self.dev)
+            self.volume_extractor = utils.Volume_Extractor(self.hps_ms.data.hop_length)
+            assert self.diffusion_args.data.encoder == self.hps_ms.model.speech_encoder
+        else:
+            self.hubert_model = utils.get_speech_encoder(self.diffusion_args.data.encoder,device=self.dev)
+            self.volume_extractor = utils.Volume_Extractor(self.diffusion_args.data.block_size)
+
         if os.path.exists(cluster_model_path):
             self.cluster_model = cluster.get_cluster_model(cluster_model_path)
+        if self.shallow_diffusion : self.nsf_hifigan_enhance = False
         if self.nsf_hifigan_enhance:
             from modules.enhancer import Enhancer
             self.enhancer = Enhancer('nsf-hifigan', 'pretrain/nsf_hifigan/model',device=self.dev)
-
+
     def load_model(self):
         # get model configuration
         self.net_g_ms = SynthesizerTrn(
@@ -157,7 +177,7 @@ def load_model(self):
     def get_unit_f0(self, in_path, tran, cluster_infer_ratio, speaker, f0_filter ,f0_predictor,cr_threshold=0.05):

         wav, sr = librosa.load(in_path, sr=self.target_sample)
-
+
         f0_predictor_object = utils.get_f0_predictor(f0_predictor,hop_length=self.hop_size,sampling_rate=self.target_sample,device=self.dev,threshold=cr_threshold)

         f0, uv = f0_predictor_object.compute_f0_uv(wav)
@@ -190,7 +210,8 @@ def infer(self, speaker, tran, raw_path,
               f0_filter=False,
               f0_predictor='pm',
               enhancer_adaptive_key = 0,
-              cr_threshold = 0.05
+              cr_threshold = 0.05,
+              k_step = 100
               ):

         speaker_id = self.spk2id.__dict__.get(speaker)
@@ -203,7 +224,44 @@ def infer(self, speaker, tran, raw_path,
             c = c.half()
         with torch.no_grad():
             start = time.time()
-            audio = self.net_g_ms.infer(c, f0=f0, g=sid, uv=uv, predict_f0=auto_predict_f0, noice_scale=noice_scale)[0,0].data.float()
+            if not self.only_diffusion:
+                audio,f0 = self.net_g_ms.infer(c, f0=f0, g=sid, uv=uv, predict_f0=auto_predict_f0, noice_scale=noice_scale)
+                audio = audio[0,0].data.float()
+                if self.shallow_diffusion:
+                    audio_mel = self.vocoder.extract(audio[None,:])
+                    vol = self.volume_extractor.extract(audio[None,:])[None,:,None].to(self.dev)
+                    f0 = f0[:,:,None]
+                    c = c.transpose(-1,-2)
+                    audio_mel = self.diffusion_model(
+                        c,
+                        f0,
+                        vol,
+                        spk_id = sid,
+                        spk_mix_dict = None,
+                        gt_spec=audio_mel,
+                        infer=True,
+                        infer_speedup=self.diffusion_args.infer.speedup,
+                        method=self.diffusion_args.infer.methold,
+                        k_step=k_step)
+                    audio = self.vocoder.infer(audio_mel, f0).squeeze()
+            else:
+                wav, sr = librosa.load(raw_path, sr=self.target_sample)
+                wav = torch.FloatTensor(wav).to(self.dev)
+                vol = self.volume_extractor.extract(wav[None,:])[None,:,None]
+                c = c.transpose(-1,-2)
+                f0 = f0[:,:,None]
+                audio_mel = self.diffusion_model(
+                    c,
+                    f0,
+                    vol,
+                    spk_id = sid,
+                    spk_mix_dict = None,
+                    gt_spec=None,
+                    infer=True,
+                    infer_speedup=self.diffusion_args.infer.speedup,
+                    method=self.diffusion_args.infer.methold,
+                    k_step=k_step)
+                audio = self.vocoder.infer(audio_mel, f0).squeeze()
             if self.nsf_hifigan_enhance:
                 audio, _ = self.enhancer.enhance(
                     audio[None,:],
@@ -243,9 +301,10 @@ def slice_inference(self,
                     lgr_num =0.75,
                     f0_predictor='pm',
                     enhancer_adaptive_key = 0,
-                    cr_threshold = 0.05
+                    cr_threshold = 0.05,
+                    k_step = 100
                     ):
-        wav_path = raw_audio_path
+        wav_path = Path(raw_audio_path).with_suffix('.wav')
         chunks = slicer.cut(wav_path, db_thresh=slice_db)
         audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks)
         per_size = int(clip_seconds*audio_sr)
@@ -284,7 +343,8 @@ def slice_inference(self,
                                                 noice_scale=noice_scale,
                                                 f0_predictor = f0_predictor,
                                                 enhancer_adaptive_key = enhancer_adaptive_key,
-                                                cr_threshold = cr_threshold
+                                                cr_threshold = cr_threshold,
+                                                k_step = k_step
                                                 )
                     _audio = out_audio.cpu().numpy()
                     pad_len = int(self.target_sample * pad_seconds)
@@ -327,7 +387,7 @@ def process(self, svc_model, speaker_id, f_pitch_change, input_wav_path,
                                           auto_predict_f0=auto_predict_f0,
                                           noice_scale=noice_scale,
                                           f0_filter=f0_filter)
-
+
         audio = audio.cpu().numpy()
         self.last_chunk = audio[-self.pre_len:]
         self.last_o = audio
@@ -348,3 +408,4 @@ def process(self, svc_model, speaker_id, f_pitch_change, input_wav_path,
         self.last_chunk = audio[-self.pre_len:]
         self.last_o = audio
         return ret[self.chunk_len:2 * self.chunk_len]
+
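
Note: taken together, these hunks add four diffusion-related constructor arguments to Svc and thread a k_step knob through infer and slice_inference. A sketch of the extended API (all paths are the defaults from this commit; a real setup may differ):

from inference.infer_tool import Svc

svc = Svc(
    "logs/44k/G_0.pth",                    # sovits checkpoint (example path)
    "configs/config.json",                 # sovits config (example path)
    device=None,                           # auto-selects cuda/cpu
    cluster_model_path="logs/44k/kmeans_10000.pt",
    nsf_hifigan_enhance=False,             # forced off when shallow diffusion is on
    diffusion_model_path="logs/44k/diffusion/model_0.pt",
    diffusion_config_path="configs/diffusion.yaml",
    shallow_diffusion=True,                # re-diffuses the sovits output mel
    only_diffusion=False,                  # True skips the sovits model entirely
)
audio = svc.slice_inference(
    raw_audio_path="raw/input.wav", spk="nen", tran=0, slice_db=-40,
    cluster_infer_ratio=0, auto_predict_f0=False, noice_scale=0.4,
    pad_seconds=0.5, clip_seconds=0, lg_num=0, lgr_num=0.75,
    f0_predictor="pm", enhancer_adaptive_key=0, cr_threshold=0.05,
    k_step=100,                            # diffusion steps used by the new code path
)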

inference_main.py

Lines changed: 33 additions & 56 deletions
@@ -29,14 +29,21 @@ def main():
     parser.add_argument('-n', '--clean_names', type=str, nargs='+', default=["君の知らない物語-src.wav"], help='list of wav file names, placed in the raw folder')
     parser.add_argument('-t', '--trans', type=int, nargs='+', default=[0], help='pitch shift, positive or negative (semitones)')
     parser.add_argument('-s', '--spk_list', type=str, nargs='+', default=['nen'], help='target speaker name(s) for synthesis')
-
+
     # optional settings
     parser.add_argument('-a', '--auto_predict_f0', action='store_true', default=False, help='automatically predict pitch during voice conversion; do not enable this when converting singing voice, or it will go badly out of tune')
     parser.add_argument('-cm', '--cluster_model_path', type=str, default="logs/44k/kmeans_10000.pt", help='path to the cluster model; any value is fine if no cluster model was trained')
     parser.add_argument('-cr', '--cluster_infer_ratio', type=float, default=0, help='cluster scheme ratio, range 0-1; leave at the default 0 if no cluster model was trained')
     parser.add_argument('-lg', '--linear_gradient', type=float, default=0, help='crossfade length between two audio slices, in seconds; adjust this if forced slicing produces discontinuous vocals, otherwise keep the default 0')
     parser.add_argument('-f0p', '--f0_predictor', type=str, default="pm", help='F0 predictor, one of crepe, pm, dio, harvest; default pm (note: crepe applies a mean filter to the raw F0)')
     parser.add_argument('-eh', '--enhance', action='store_true', default=False, help='whether to use the NSF_HIFIGAN enhancer; it can improve quality for models trained on little data, but degrades well-trained models; off by default')
+    parser.add_argument('-shd', '--shallow_diffusion', action='store_true', default=False, help='whether to use shallow diffusion; can fix some metallic artifacts; off by default; when enabled, the NSF_HIFIGAN enhancer is disabled')
+
+    # shallow diffusion settings
+    parser.add_argument('-dm', '--diffusion_model_path', type=str, default="logs/44k/diffusion/model_0.pt", help='path to the diffusion model')
+    parser.add_argument('-dc', '--diffusion_config_path', type=str, default="logs/44k/diffusion/config.yaml", help='path to the diffusion model config file')
+    parser.add_argument('-ks', '--k_step', type=int, default=100, help='number of diffusion steps; larger is closer to the pure diffusion result; default 100')
+    parser.add_argument('-od', '--only_diffusion', action='store_true', default=False, help='diffusion-only mode: does not load the sovits model, infers with the diffusion model alone')

     # settings that normally need no changes
     parser.add_argument('-sd', '--slice_db', type=int, default=-40, help='default -40; -30 for noisy audio, -50 to keep breathing in dry vocals')
@@ -67,70 +74,40 @@ def main():
     enhance = args.enhance
     enhancer_adaptive_key = args.enhancer_adaptive_key
     cr_threshold = args.f0_filter_threshold
+    diffusion_model_path = args.diffusion_model_path
+    diffusion_config_path = args.diffusion_config_path
+    k_step = args.k_step
+    only_diffusion = args.only_diffusion
+    shallow_diffusion = args.shallow_diffusion

-    svc_model = Svc(args.model_path, args.config_path, args.device, args.cluster_model_path,enhance)
+    svc_model = Svc(args.model_path, args.config_path, args.device, args.cluster_model_path,enhance,diffusion_model_path,diffusion_config_path,shallow_diffusion,only_diffusion)
     infer_tool.mkdir(["raw", "results"])
-
+
     infer_tool.fill_a_to_b(trans, clean_names)
     for clean_name, tran in zip(clean_names, trans):
         raw_audio_path = f"raw/{clean_name}"
         if "." not in raw_audio_path:
             raw_audio_path += ".wav"
         infer_tool.format_wav(raw_audio_path)
-        wav_path = Path(raw_audio_path).with_suffix('.wav')
-        chunks = slicer.cut(wav_path, db_thresh=slice_db)
-        audio_data, audio_sr = slicer.chunks2audio(wav_path, chunks)
-        per_size = int(clip*audio_sr)
-        lg_size = int(lg*audio_sr)
-        lg_size_r = int(lg_size*lgr)
-        lg_size_c_l = (lg_size-lg_size_r)//2
-        lg_size_c_r = lg_size-lg_size_r-lg_size_c_l
-        lg_2 = np.linspace(0,1,lg_size_r) if lg_size!=0 else 0
-
         for spk in spk_list:
-            audio = []
-            for (slice_tag, data) in audio_data:
-                print(f'#=====segment start, {round(len(data) / audio_sr, 3)}s======')
-
-                length = int(np.ceil(len(data) / audio_sr * svc_model.target_sample))
-                if slice_tag:
-                    print('jump empty segment')
-                    _audio = np.zeros(length)
-                    audio.extend(list(infer_tool.pad_array(_audio, length)))
-                    continue
-                if per_size != 0:
-                    datas = infer_tool.split_list_by_n(data, per_size,lg_size)
-                else:
-                    datas = [data]
-                for k,dat in enumerate(datas):
-                    per_length = int(np.ceil(len(dat) / audio_sr * svc_model.target_sample)) if clip!=0 else length
-                    if clip!=0: print(f'###=====segment clip start, {round(len(dat) / audio_sr, 3)}s======')
-                    # padd
-                    pad_len = int(audio_sr * pad_seconds)
-                    dat = np.concatenate([np.zeros([pad_len]), dat, np.zeros([pad_len])])
-                    raw_path = io.BytesIO()
-                    soundfile.write(raw_path, dat, audio_sr, format="wav")
-                    raw_path.seek(0)
-                    out_audio, out_sr = svc_model.infer(spk, tran, raw_path,
-                                                        cluster_infer_ratio=cluster_infer_ratio,
-                                                        auto_predict_f0=auto_predict_f0,
-                                                        noice_scale=noice_scale,
-                                                        f0_predictor = f0p,
-                                                        enhancer_adaptive_key = enhancer_adaptive_key,
-                                                        cr_threshold = cr_threshold
-                                                        )
-                    _audio = out_audio.cpu().numpy()
-                    pad_len = int(svc_model.target_sample * pad_seconds)
-                    _audio = _audio[pad_len:-pad_len]
-                    _audio = infer_tool.pad_array(_audio, per_length)
-                    if lg_size!=0 and k!=0:
-                        lg1 = audio[-(lg_size_r+lg_size_c_r):-lg_size_c_r] if lgr != 1 else audio[-lg_size:]
-                        lg2 = _audio[lg_size_c_l:lg_size_c_l+lg_size_r] if lgr != 1 else _audio[0:lg_size]
-                        lg_pre = lg1*(1-lg_2)+lg2*lg_2
-                        audio = audio[0:-(lg_size_r+lg_size_c_r)] if lgr != 1 else audio[0:-lg_size]
-                        audio.extend(lg_pre)
-                        _audio = _audio[lg_size_c_l+lg_size_r:] if lgr != 1 else _audio[lg_size:]
-                    audio.extend(list(_audio))
+            kwarg = {
+                "raw_audio_path" : raw_audio_path,
+                "spk" : spk,
+                "tran" : tran,
+                "slice_db" : slice_db,
+                "cluster_infer_ratio" : cluster_infer_ratio,
+                "auto_predict_f0" : auto_predict_f0,
+                "noice_scale" : noice_scale,
+                "pad_seconds" : pad_seconds,
+                "clip_seconds" : clip,
+                "lg_num": lg,
+                "lgr_num" : lgr,
+                "f0_predictor" : f0p,
+                "enhancer_adaptive_key" : enhancer_adaptive_key,
+                "cr_threshold" : cr_threshold,
+                "k_step":k_step
+            }
+            audio = svc_model.slice_inference(**kwarg)
             key = "auto" if auto_predict_f0 else f"{tran}key"
             cluster_name = "" if cluster_infer_ratio == 0 else f"_{cluster_infer_ratio}"
             res_path = f'./results/{clean_name}_{key}_{spk}{cluster_name}.{wav_format}'
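
Note: with this refactor the slicing and crossfade logic moves into Svc.slice_inference, and main() only assembles keyword arguments. A possible invocation using the new flags (assuming the repo's usual -m/--model_path and -c/--config_path options, which this hunk does not show):

python inference_main.py -m "logs/44k/G_0.pth" -c "configs/config.json" -n "input.wav" -t 0 -s "nen" -shd -dm "logs/44k/diffusion/model_0.pt" -dc "logs/44k/diffusion/config.yaml" -ks 100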

models.py

Lines changed: 1 addition & 1 deletion
@@ -417,4 +417,4 @@ def infer(self, c, f0, uv, g=None, noice_scale=0.35, predict_f0=False):
         z_p, m_p, logs_p, c_mask = self.enc_p(x, x_mask, f0=f0_to_coarse(f0), noice_scale=noice_scale)
         z = self.flow(z_p, c_mask, g=g, reverse=True)
         o = self.dec(z * c_mask, g=g, f0=f0)
-        return o
+        return o,f0

requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -19,3 +19,4 @@ librosa==0.9.1
 tensorboard
 tensorboardX
 edge_tts
+pyyaml

requirements_win.txt

Lines changed: 1 addition & 0 deletions
@@ -22,3 +22,4 @@ onnxsim
 onnxoptimizer
 tensorboardX
 edge_tts
+pyyaml
