@@ -174,10 +174,8 @@ def load_model(self):
-    def get_unit_f0(self, in_path, tran, cluster_infer_ratio, speaker, f0_filter, f0_predictor, cr_threshold=0.05):
+    def get_unit_f0(self, wav, tran, cluster_infer_ratio, speaker, f0_filter, f0_predictor, cr_threshold=0.05):
 
-        wav, sr = librosa.load(in_path, sr=self.target_sample)
-
         f0_predictor_object = utils.get_f0_predictor(f0_predictor, hop_length=self.hop_size, sampling_rate=self.target_sample, device=self.dev, threshold=cr_threshold)
 
         f0, uv = f0_predictor_object.compute_f0_uv(wav)
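This hunk changes `get_unit_f0` to accept a decoded waveform instead of a file path, so the caller decodes the audio once and reuses the array. A minimal caller-side sketch of the new convention, assuming an already-constructed `Svc` instance named `svc` (hypothetical) and illustrative argument values that are not taken from this PR:

```python
import librosa

# Sketch only: `svc` is an assumed Svc instance with the target_sample
# attribute seen in the diff; values below are illustrative defaults.
wav, sr = librosa.load("input.wav", sr=svc.target_sample)  # decode once

c, f0, uv = svc.get_unit_f0(
    wav,                    # decoded waveform (was: a file path)
    tran=0,                 # pitch transpose in semitones
    cluster_infer_ratio=0.0,
    speaker=0,
    f0_filter=False,
    f0_predictor="pm",      # an assumed predictor name for utils.get_f0_predictor
    cr_threshold=0.05,
)
```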
@@ -219,7 +217,8 @@ def infer(self, speaker, tran, raw_path,
         if len(self.spk2id.__dict__) >= speaker:
             speaker_id = speaker
         sid = torch.LongTensor([int(speaker_id)]).to(self.dev).unsqueeze(0)
-        c, f0, uv = self.get_unit_f0(raw_path, tran, cluster_infer_ratio, speaker, f0_filter, f0_predictor, cr_threshold=cr_threshold)
+        wav, sr = librosa.load(raw_path, sr=self.target_sample)
+        c, f0, uv = self.get_unit_f0(wav, tran, cluster_infer_ratio, speaker, f0_filter, f0_predictor, cr_threshold=cr_threshold)
         if "half" in self.net_g_path and torch.cuda.is_available():
             c = c.half()
         with torch.no_grad():
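The matching caller-side change: `infer` now decodes `raw_path` exactly once, and the same `wav` array feeds both `get_unit_f0` and the diffusion branch below (which previously re-read the file from disk, see the next hunk). A small self-contained sketch of the load-once pattern being applied, with hypothetical wrapper and argument names:

```python
import librosa

def extract_features(svc, raw_path, tran, speaker, f0_predictor):
    # Hypothetical wrapper illustrating the refactor: one disk read and
    # one resample per inference call...
    wav, _sr = librosa.load(raw_path, sr=svc.target_sample)

    # ...then every consumer receives the in-memory array, so no code
    # path can accidentally trigger a second librosa.load of raw_path.
    return svc.get_unit_f0(wav, tran, 0.0, speaker, False, f0_predictor)
```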
@@ -245,9 +244,8 @@ def infer(self, speaker, tran, raw_path,
                     k_step=k_step)
                 audio = self.vocoder.infer(audio_mel, f0).squeeze()
             else:
-                wav, sr = librosa.load(raw_path, sr=self.target_sample)
                 wav = torch.FloatTensor(wav).to(self.dev)
-                vol = self.volume_extractor.extract(wav[None,:])[None,:,None]
+                vol = self.volume_extractor.extract(wav[None,:])[None,:,None].to(self.dev)
                 c = c.transpose(-1,-2)
                 f0 = f0[:,:,None]
                 audio_mel = self.diffusion_model(
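Besides dropping the now-redundant `librosa.load`, this hunk pins the extracted volume envelope to `self.dev`. If the volume extractor's `extract` returns a CPU tensor while the model runs on CUDA, the subsequent `self.diffusion_model(...)` call would mix devices and raise a runtime error. A standalone sketch of that failure mode and the fix, using stand-in tensors rather than the real extractor, skipped when no GPU is present:

```python
import torch

if torch.cuda.is_available():
    dev = torch.device("cuda")
    f0 = torch.randn(1, 200, 1, device=dev)   # stand-in model-side tensor (GPU)
    vol = torch.randn(1, 200, 1)               # stand-in extractor output (CPU)

    try:
        # CPU x CUDA arithmetic fails with a device-mismatch RuntimeError
        # ("Expected all tensors to be on the same device ...").
        _ = f0 * vol
    except RuntimeError as err:
        print(err)

    vol = vol.to(dev)   # the fix mirrored by the diff's `.to(self.dev)`
    _ = f0 * vol        # now both operands live on the same device
```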