Commit 3cc6220

use of silero model instead of silero VadIterator

1 parent 70c5355 · commit 3cc6220

File tree: 5 files changed, +70 −59 lines

mic_test_whisper_simple.py

Lines changed: 4 additions & 4 deletions

@@ -39,23 +39,23 @@ def stream_process(self, vad_result):
         if chunk is not None:
             sf = soundfile.SoundFile(io.BytesIO(chunk), channels=1, endian="LITTLE", samplerate=SAMPLING_RATE, subtype="PCM_16", format="RAW")
             audio, _ = librosa.load(sf, sr=SAMPLING_RATE)
-            # self.audio_buffer.append(chunk)
             out = []
             out.append(audio)
             a = np.concatenate(out)
             self.audio_buffer = np.append(self.audio_buffer, a)

         if is_final and len(self.audio_buffer) > 0:
             res = self.asr.transcribe(self.audio_buffer, init_prompt=self.init_prompt)
-            # use custom ts_words
             tsw = self.ts_words(res)
+
             self.init_prompt = self.init_prompt + tsw
             self.init_prompt = self.init_prompt[-100:]
             self.audio_buffer.resize(0)
             iter_in_phrase = 0
+
             yield True, tsw
-        # show progress every 10 chunks
-        elif iter_in_phrase % 20 == 0 and len(self.audio_buffer) > 0:
+        # show progress every 50 chunks
+        elif iter_in_phrase % 50 == 0 and len(self.audio_buffer) > 0:
             res = self.asr.transcribe(self.audio_buffer, init_prompt=self.init_prompt)
             # use custom ts_words
             tsw = self.ts_words(res)

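Read as a whole, this hunk keeps one behaviour and tunes another: a final transcription is still emitted as soon as the VAD flags the end of a phrase, while intermediate "progress" transcriptions now run only every 50th chunk instead of every 20th, cutting the number of partial Whisper calls. A minimal sketch of that control flow follows; it is an illustrative assumption based on this diff alone (the real method decodes chunks through soundfile/librosa and lives on a class), with transcribe standing in for the Whisper call:

import numpy as np

def stream_process(vad_result, transcribe):
    """Sketch: buffer VAD-approved audio, transcribe on phrase end or every 50 chunks.

    `vad_result` is assumed to yield (chunk_bytes, is_final) pairs, matching
    VoiceActivityController.detect_user_speech in this repo.
    """
    audio_buffer = np.array([], dtype=np.float32)
    iter_in_phrase = 0

    for chunk, is_final in vad_result:
        iter_in_phrase += 1
        if chunk:
            # int16 PCM bytes -> float32 samples in [-1, 1] (simplified decode)
            audio = np.frombuffer(chunk, dtype=np.int16).astype(np.float32) / 32768.0
            audio_buffer = np.append(audio_buffer, audio)

        if is_final and len(audio_buffer) > 0:
            yield True, transcribe(audio_buffer)       # final result for the phrase
            audio_buffer = np.array([], dtype=np.float32)
            iter_in_phrase = 0
        elif iter_in_phrase % 50 == 0 and len(audio_buffer) > 0:
            yield False, transcribe(audio_buffer)      # progress preview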
mic_test_whisper_streaming.py

Lines changed: 1 addition & 1 deletion

@@ -13,7 +13,7 @@
 src_lan = "en"  # source language
 tgt_lan = "en"  # target language -- same as source for ASR, "en" if translate task is used
 use_vad_result = True
-min_sample_length = 1 * SAMPLING_RATE
+min_sample_length = 1.5 * SAMPLING_RATE

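For context, min_sample_length is measured in samples; at the 16 kHz rate used elsewhere in this commit (an assumption for the arithmetic below), the new value means roughly 1.5 s of audio must be buffered before the streaming ASR processes it, instead of 1 s:

SAMPLING_RATE = 16000                    # assumed 16 kHz, as in the rest of the repo
min_sample_length = 1.5 * SAMPLING_RATE  # 24000.0 samples, i.e. about 1.5 s per processing step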
microphone_stream.py

Lines changed: 1 addition & 1 deletion

@@ -29,7 +29,7 @@ def __init__(
         self._pyaudio = pyaudio.PyAudio()
         self.sample_rate = sample_rate

-        self._chunk_size = int(self.sample_rate * 0.1)
+        self._chunk_size = int(self.sample_rate * 40 / 1000)
         self._stream = self._pyaudio.open(
             format=pyaudio.paInt16,
             channels=1,

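The microphone now delivers 40 ms of audio per read instead of 100 ms, so the VAD sees smaller, more frequent windows. The arithmetic, as a worked example rather than code from the repo:

sample_rate = 16000
chunk_ms = 40
chunk_size = int(sample_rate * chunk_ms / 1000)  # 640 samples per read
bytes_per_read = chunk_size * 2                  # paInt16 mono -> 1280 bytes per chunk
print(chunk_size, bytes_per_read)                # 640 1280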
voice_activity_controller.py

Lines changed: 56 additions & 52 deletions

@@ -3,16 +3,27 @@
 # import sounddevice as sd
 import torch
 import numpy as np
+import datetime


+def int2float(sound):
+    abs_max = np.abs(sound).max()
+    sound = sound.astype('float32')
+    if abs_max > 0:
+        sound *= 1/32768
+    sound = sound.squeeze()  # depends on the use case
+    return sound
+
 class VoiceActivityController:
     def __init__(
         self,
         sampling_rate = 16000,
-        second_ofSilence = 0.5,
-        second_ofSpeech = 0.25,
+        min_silence_to_final_ms = 500,
+        min_speech_to_final_ms = 100,
+        min_silence_duration_ms = 100,
         use_vad_result = True,
         activity_detected_callback=None,
+        threshold = 0.3
     ):
         self.activity_detected_callback = activity_detected_callback
         self.model, self.utils = torch.hub.load(

@@ -26,84 +37,77 @@ def __init__(
             collect_chunks) = self.utils

         self.sampling_rate = sampling_rate
-        self.silence_limit = second_ofSilence * self.sampling_rate
-        self.speech_limit = second_ofSpeech * self.sampling_rate
+        self.final_silence_limit = min_silence_to_final_ms * self.sampling_rate / 1000
+        self.final_speech_limit = min_speech_to_final_ms * self.sampling_rate / 1000
+        self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000

         self.use_vad_result = use_vad_result
-        self.vad_iterator = VADIterator(
-            model = self.model,
-            threshold = 0.3,  # 0.5
-            sampling_rate = self.sampling_rate,
-            min_silence_duration_ms = 500,  # 100
-            speech_pad_ms = 400  # 30
-        )
         self.last_marked_chunk = None
-
-    def int2float(self, sound):
-        abs_max = np.abs(sound).max()
-        sound = sound.astype('float32')
-        if abs_max > 0:
-            sound *= 1/32768
-        sound = sound.squeeze()  # depends on the use case
-        return sound
+        self.threshold = threshold
+        self.reset_states()
+
+    def reset_states(self):
+        self.model.reset_states()
+        self.temp_end = 0
+        self.current_sample = 0

     def apply_vad(self, audio):
-        audio_float32 = self.int2float(audio)
-        chunk = self.vad_iterator(audio_float32, return_seconds=False)
-
-        if chunk is not None:
-            if "start" in chunk:
-                start = chunk["start"]
-                self.last_marked_chunk = chunk
-                return audio[start:] if self.use_vad_result else audio, (len(audio) - start), 0
-
-            if "end" in chunk:
-                # todo: pending get the padding from the next chunk
-                end = chunk["end"] if chunk["end"] < len(audio) else len(audio)
-                self.last_marked_chunk = chunk
-                return audio[:end] if self.use_vad_result else audio, end, len(audio) - end
+        x = int2float(audio)
+        if not torch.is_tensor(x):
+            try:
+                x = torch.Tensor(x)
+            except:
+                raise TypeError("Audio cannot be casted to tensor. Cast it manually")
+
+        speech_prob = self.model(x, self.sampling_rate).item()
+
+        window_size_samples = len(x[0]) if x.dim() == 2 else len(x)
+        self.current_sample += window_size_samples

-        if self.last_marked_chunk is not None:
-            if "start" in self.last_marked_chunk:
-                return audio, len(audio), 0
+        if speech_prob >= self.threshold:
+            self.temp_end = 0
+            return audio, window_size_samples, 0
+
+        else:
+            if not self.temp_end:
+                self.temp_end = self.current_sample
+
+            if self.current_sample - self.temp_end < self.min_silence_samples:
+                return audio, 0, window_size_samples
+            else:
+                return np.array([], dtype=np.float16), 0, window_size_samples

-            if "end" in self.last_marked_chunk:
-                return np.array([], dtype=np.float16) if self.use_vad_result else audio, 0, len(audio)
-
-        return np.array([], dtype=np.float16) if self.use_vad_result else audio, 0, 0


     def detect_user_speech(self, audio_stream, audio_in_int16 = False):
-        silence_len = 0
+        last_silence_len = 0
         speech_len = 0

         for data in audio_stream:  # replace with your condition of choice
-            # if isinstance(data, EndOfTransmission):
-            #     raise EndOfTransmission("End of transmission detected")

             audio_block = np.frombuffer(data, dtype=np.int16) if not audio_in_int16 else data
             wav = audio_block

             is_final = False
-            voice_audio, speech_in_wav, last_silent_duration_in_wav = self.apply_vad(wav)
-            # print(f'----r> speech_in_wav: {speech_in_wav} last_silent_duration_in_wav: {last_silent_duration_in_wav}')
+            voice_audio, speech_in_wav, last_silent_in_wav = self.apply_vad(wav)

             if speech_in_wav > 0:
-                silence_len = 0
+                last_silence_len = 0
                 speech_len += speech_in_wav
                 if self.activity_detected_callback is not None:
                     self.activity_detected_callback()

-            silence_len = silence_len + last_silent_duration_in_wav
-            if silence_len >= self.silence_limit and speech_len >= self.speech_limit:
+            last_silence_len += last_silent_in_wav
+            if last_silence_len >= self.final_silence_limit and speech_len >= self.final_speech_limit:
+
                 is_final = True
-                silence_len = 0
-                speech_len = 0
+                last_silence_len = 0
+                speech_len = 0

             yield voice_audio.tobytes(), is_final

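This file carries the change named in the commit message: apply_vad no longer wraps the model in Silero's VADIterator; it calls the model directly on each window, tracks current_sample and temp_end itself, and only starts reporting silence once the quiet run exceeds min_silence_samples. A stripped-down sketch of that per-window decision is below. The model call, threshold, and counter names follow the diff; everything else (the standalone function, the int16 input) is an assumption, and newer Silero VAD releases may require fixed 512-sample windows at 16 kHz, so window sizes may need adjusting:

import numpy as np
import torch

# Load Silero VAD the same way the controller does (downloads on first use).
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', model='silero_vad')

SAMPLING_RATE = 16000
THRESHOLD = 0.3                                    # speech-probability cutoff from the diff
MIN_SILENCE_SAMPLES = SAMPLING_RATE * 100 // 1000  # min_silence_duration_ms = 100

temp_end = 0        # sample index where the current silence run began (0 = not in silence)
current_sample = 0  # running count of samples seen so far

def classify_window(window_int16: np.ndarray):
    """Return (speech_samples, silence_samples) for one int16 audio window."""
    global temp_end, current_sample
    x = torch.from_numpy(window_int16.astype(np.float32) / 32768.0)
    speech_prob = model(x, SAMPLING_RATE).item()   # direct model call, no VADIterator
    current_sample += len(x)

    if speech_prob >= THRESHOLD:
        temp_end = 0                               # speech resets the silence run
        return len(x), 0
    if not temp_end:
        temp_end = current_sample                  # mark where silence started
    return 0, len(x)                               # report the window as silence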
whisper_online.py

Lines changed: 8 additions & 1 deletion

@@ -4,7 +4,7 @@
 import librosa
 from functools import lru_cache
 import time
-
+import datetime


 @lru_cache

@@ -118,14 +118,21 @@ def load_model(self, modelsize=None, cache_dir=None, model_dir=None):
         return model

     def transcribe(self, audio, init_prompt=""):
+
+        # tiempo_inicio = datetime.datetime.now()
         # tested: beam_size=5 is faster and better than 1 (on one 200 second document from En ESIC, min chunk 0.01)
         segments, info = self.model.transcribe(audio, language=self.original_language, initial_prompt=init_prompt, beam_size=5, word_timestamps=True, condition_on_previous_text=True, **self.transcribe_kargs)
+
+        # print(f'({datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")})----------r> whisper transcribe take {(datetime.datetime.now() - tiempo_inicio)} ms.')
+
         return list(segments)

     def ts_words(self, segments):
         o = []
         for segment in segments:
             for word in segment.words:
+                if segment.no_speech_prob > 0.9:
+                    continue
                 # not stripping the spaces -- should not be merged with them!
                 w = word.word
                 t = (word.start, word.end, w)

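Finally, ts_words now ignores word timestamps coming from segments that Whisper itself scores as very likely non-speech. A hedged, slightly restructured sketch of the same filter over faster-whisper segment objects (the 0.9 cutoff is the value added in this commit; hoisting the check out of the word loop is an equivalent simplification):

def ts_words(segments, no_speech_cutoff=0.9):
    """Collect (start, end, word) triples, skipping segments that are probably not speech."""
    out = []
    for segment in segments:
        if segment.no_speech_prob > no_speech_cutoff:
            continue  # drop hallucinated words from silent stretches
        for word in segment.words:
            # word.word keeps its leading space, matching the original code's comment
            out.append((word.start, word.end, word.word))
    return out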