Skip to content

Commit ffc67b8

Browse files
feat(stt): readd interimResults and lowLatency wss params
1 parent f55b18d commit ffc67b8

File tree

4 files changed

+93
-11
lines changed

4 files changed

+93
-11
lines changed

examples/microphone-speech-to-text.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,8 @@ def recognize_using_weboscket(*args):
7272
mycallback = MyRecognizeCallback()
7373
speech_to_text.recognize_using_websocket(audio=audio_source,
7474
content_type='audio/l16; rate=44100',
75-
recognize_callback=mycallback)
75+
recognize_callback=mycallback,
76+
interim_results=True)
7677

7778
###############################################
7879
#### Prepare for recording using PyAudio ######

ibm_watson/speech_to_text_v1_adapter.py

+21-1
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ def recognize_using_websocket(self,
3333
customization_weight=None,
3434
base_model_version=None,
3535
inactivity_timeout=None,
36+
interim_results=None,
3637
keywords=None,
3738
keywords_threshold=None,
3839
max_alternatives=None,
@@ -54,6 +55,7 @@ def recognize_using_websocket(self,
5455
split_transcript_at_phrase_end=None,
5556
speech_detector_sensitivity=None,
5657
background_audio_suppression=None,
58+
low_latency=None,
5759
character_insertion_bias=None,
5860
**kwargs):
5961
"""
@@ -269,6 +271,22 @@ def recognize_using_websocket(self,
269271
* 1.0 suppresses all audio (no audio is transcribed).
270272
The values increase on a monotonic curve. See [Background audio
271273
suppression](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#detection-parameters-suppression).
274+
:param bool low_latency: (optional) If `true` for next-generation
275+
`Multimedia` and `Telephony` models that support low latency, directs the
276+
service to produce results even more quickly than it usually does.
277+
Next-generation models produce transcription results faster than
278+
previous-generation models. The `low_latency` parameter causes the models
279+
to produce results even more quickly, though the results might be less
280+
accurate when the parameter is used.
281+
**Note:** The parameter is beta functionality. It is not available for
282+
previous-generation `Broadband` and `Narrowband` models. It is available
283+
only for some next-generation models.
284+
* For a list of next-generation models that support low latency, see
285+
[Supported language
286+
models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models-ng#models-ng-supported)
287+
for next-generation models.
288+
* For more information about the `low_latency` parameter, see [Low
289+
latency](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-interim#low-latency).
272290
:param float character_insertion_bias: (optional) For next-generation
273291
`Multimedia` and `Telephony` models, an indication of whether the service
274292
is biased to recognize shorter or longer strings of characters when
@@ -337,6 +355,7 @@ def recognize_using_websocket(self,
337355
'customization_weight': customization_weight,
338356
'content_type': content_type,
339357
'inactivity_timeout': inactivity_timeout,
358+
'interim_results': interim_results,
340359
'keywords': keywords,
341360
'keywords_threshold': keywords_threshold,
342361
'max_alternatives': max_alternatives,
@@ -356,7 +375,8 @@ def recognize_using_websocket(self,
356375
'split_transcript_at_phrase_end': split_transcript_at_phrase_end,
357376
'speech_detector_sensitivity': speech_detector_sensitivity,
358377
'background_audio_suppression': background_audio_suppression,
359-
'character_insertion_bias': character_insertion_bias
378+
'character_insertion_bias': character_insertion_bias,
379+
'low_latency': low_latency,
360380
}
361381
options = {k: v for k, v in options.items() if v is not None}
362382
request['options'] = options

ibm_watson/websocket/recognize_listener.py

+10-9
Original file line numberDiff line numberDiff line change
@@ -196,15 +196,16 @@ def on_data(self, ws, message, message_type, fin):
196196
# set of transcriptions and send them to the appropriate callbacks.
197197
results = json_object.get('results')
198198
if results:
199-
b_final = (results[0].get('final') is True)
200-
alternatives = results[0].get('alternatives')
201-
if alternatives:
202-
hypothesis = alternatives[0].get('transcript')
203-
transcripts = self.extract_transcripts(alternatives)
204-
if b_final:
205-
self.callback.on_transcription(transcripts)
206-
if hypothesis:
207-
self.callback.on_hypothesis(hypothesis)
199+
if (self.options.get('interim_results') is True):
200+
b_final = (results[0].get('final') is True)
201+
alternatives = results[0].get('alternatives')
202+
if alternatives:
203+
hypothesis = alternatives[0].get('transcript')
204+
transcripts = self.extract_transcripts(alternatives)
205+
if b_final:
206+
self.callback.on_transcription(transcripts)
207+
if hypothesis:
208+
self.callback.on_hypothesis(hypothesis)
208209
else:
209210
final_transcript = []
210211
for result in results:

test/integration/test_speech_to_text_v1.py

+60
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,66 @@ def on_data(self, data):
118118
assert test_callback.data['results'][0]['alternatives'][0]
119119
['transcript'] == 'thunderstorms could produce large hail isolated tornadoes and heavy rain '
120120

121+
def test_on_transcription_interim_results_false(self):
122+
class MyRecognizeCallback(RecognizeCallback):
123+
def __init__(self):
124+
RecognizeCallback.__init__(self)
125+
self.error = None
126+
self.transcript = None
127+
def on_error(self, error):
128+
self.error = error
129+
def on_transcription(self, transcript):
130+
self.transcript = transcript
131+
test_callback = MyRecognizeCallback()
132+
with open(os.path.join(os.path.dirname(__file__), '../../resources/speech_with_pause.wav'), 'rb') as audio_file:
133+
audio_source = AudioSource(audio_file, False)
134+
self.speech_to_text.recognize_using_websocket(audio_source, "audio/wav", test_callback, model="en-US_Telephony",
135+
interim_results=False, low_latency=False)
136+
assert test_callback.error is None
137+
assert test_callback.transcript is not None
138+
assert test_callback.transcript[0][0]['transcript'] in ['isolated tornadoes ', 'isolated tornados ']
139+
assert test_callback.transcript[1][0]['transcript'] == 'and heavy rain '
140+
def test_on_transcription_interim_results_true(self):
141+
class MyRecognizeCallback(RecognizeCallback):
142+
def __init__(self):
143+
RecognizeCallback.__init__(self)
144+
self.error = None
145+
self.transcript = None
146+
def on_error(self, error):
147+
self.error = error
148+
def on_transcription(self, transcript):
149+
self.transcript = transcript
150+
assert transcript[0]['confidence'] is not None
151+
assert transcript[0]['transcript'] is not None
152+
test_callback = MyRecognizeCallback()
153+
with open(os.path.join(os.path.dirname(__file__), '../../resources/speech_with_pause.wav'), 'rb') as audio_file:
154+
audio_source = AudioSource(audio_file, False)
155+
self.speech_to_text.recognize_using_websocket(audio_source, "audio/wav", test_callback, model="en-US_Telephony",
156+
interim_results=True, low_latency=True)
157+
assert test_callback.error is None
158+
assert test_callback.transcript is not None
159+
assert test_callback.transcript[0]['transcript'] == 'and heavy rain '
160+
def test_on_transcription_interim_results_true_low_latency_false(self):
161+
class MyRecognizeCallback(RecognizeCallback):
162+
def __init__(self):
163+
RecognizeCallback.__init__(self)
164+
self.error = None
165+
self.transcript = None
166+
def on_error(self, error):
167+
self.error = error
168+
def on_transcription(self, transcript):
169+
self.transcript = transcript
170+
assert transcript[0]['confidence'] is not None
171+
assert transcript[0]['transcript'] is not None
172+
test_callback = MyRecognizeCallback()
173+
with open(os.path.join(os.path.dirname(__file__), '../../resources/speech_with_pause.wav'), 'rb') as audio_file:
174+
audio_source = AudioSource(audio_file, False)
175+
self.speech_to_text.recognize_using_websocket(audio_source, "audio/wav", test_callback, model="en-US_Telephony",
176+
interim_results=True, low_latency=False)
177+
assert test_callback.error is None
178+
assert test_callback.transcript is not None
179+
assert test_callback.transcript[0]['transcript'] == 'and heavy rain '
180+
121181
def test_custom_grammars(self):
122182
customization_id = None
123183
for custom_model in self.custom_models.get('customizations'):

0 commit comments

Comments
 (0)