diff --git a/nvdaHelper/local/nvdaHelperLocal.def b/nvdaHelper/local/nvdaHelperLocal.def index ed63733f45..5f95d8cea8 100644 --- a/nvdaHelper/local/nvdaHelperLocal.def +++ b/nvdaHelper/local/nvdaHelperLocal.def @@ -82,6 +82,7 @@ EXPORTS wasPlay_pause wasPlay_resume wasPlay_setChannelVolume + wasPlay_startTrimmingLeadingSilence wasPlay_startup wasSilence_init wasSilence_playFor diff --git a/nvdaHelper/local/silenceDetect.h b/nvdaHelper/local/silenceDetect.h new file mode 100644 index 0000000000..16e9004471 --- /dev/null +++ b/nvdaHelper/local/silenceDetect.h @@ -0,0 +1,219 @@ +// A part of NonVisual Desktop Access (NVDA) +// This file is covered by the GNU General Public License. +// See the file COPYING for more details. +// Copyright (C) 2025 NV Access Limited, gexgd0419 + +#ifndef SILENCEDETECT_H +#define SILENCEDETECT_H + +#include +#include +#include +#include +#include + +namespace SilenceDetect { + +/** + * Compile-time wave format tag. + * Supports integer and floating-point formats. + * `SampleType` should be the smallest numeric type that can hold a sample, for example, 32-bit int for 24-bit format. + * Signedness of `SampleType` matters. For unsigned types, the zero point is at middle, e.g. 128 for 8-bit unsigned. + * `bytesPerSample` should be <= `sizeof(SampleType)` for integer formats, + * and == `sizeof(SampleType)` for floating-point formats. + * Assumes C++20 standard. 
/**
 * Compile-time wave format tag for integer and floating-point samples.
 *
 * @tparam SampleT     Arithmetic type able to hold one sample, e.g. a 32-bit int for
 *                     24-bit audio. Signedness matters: unsigned types are treated as
 *                     having their zero point at mid-scale (128 for 8-bit unsigned).
 * @tparam SampleBytes Number of bytes a sample occupies in the wave data. Must be
 *                     in (0, sizeof(SampleT)] for integer types and exactly
 *                     sizeof(SampleT) for floating-point types. Defaults to sizeof(SampleT).
 *
 * The template parameters deliberately use names different from the public members
 * `SampleType` / `bytesPerSample`: redeclaring a template parameter inside its own
 * scope is ill-formed in standard C++, even though some compilers accept it.
 */
template <typename SampleT, size_t SampleBytes = sizeof(SampleT)>
struct WaveFormat {
	static_assert(std::is_arithmetic_v<SampleT>, "SampleType should be an integer or floating-point type");
	static_assert(!(std::is_floating_point_v<SampleT> && SampleBytes != sizeof(SampleT)),
		"When SampleType is a floating-point type, bytesPerSample should be equal to sizeof(SampleType)");
	static_assert(!(std::is_integral_v<SampleT> && !(SampleBytes <= sizeof(SampleT) && SampleBytes > 0)),
		"When SampleType is an integer type, bytesPerSample should be less than or equal to sizeof(SampleType) and greater than 0");

	/// Type used to hold one sample in memory.
	using SampleType = SampleT;
	/// Size of one sample in the wave data, in bytes.
	static constexpr size_t bytesPerSample = SampleBytes;

	/// Sample value that represents zero amplitude.
	static constexpr SampleType zeroPoint() {
		if constexpr (std::is_unsigned_v<SampleType>) {
			// Unsigned formats put zero at mid-scale: the top bit of the used bytes.
			return static_cast<SampleType>(SampleType(1) << (bytesPerSample * 8 - 1));
		} else {
			// Signed integer and floating-point formats are centred on 0.
			return SampleType();
		}
	}

	/// Largest sample value. The name is parenthesised so a `max` macro cannot expand it.
	static constexpr SampleType (max)() {
		if constexpr (std::is_floating_point_v<SampleType>) {
			// For floating-point samples, maximum value is 1.0
			return SampleType(1);
		} else {
			// Drop the bytes of SampleType that the format does not use.
			constexpr auto unusedBits = (sizeof(SampleType) - bytesPerSample) * 8;
			return static_cast<SampleType>((std::numeric_limits<SampleType>::max)() >> unusedBits);
		}
	}

	/// Smallest sample value. The name is parenthesised so a `min` macro cannot expand it.
	static constexpr SampleType (min)() {
		if constexpr (std::is_floating_point_v<SampleType>) {
			// For floating-point samples, minimum value is -1.0
			return SampleType(-1);
		} else {
			// Drop the unused bytes. For signed types this right-shifts a negative
			// value, which is an arithmetic shift under the C++20 rules this header assumes.
			constexpr auto unusedBits = (sizeof(SampleType) - bytesPerSample) * 8;
			return static_cast<SampleType>((std::numeric_limits<SampleType>::min)() >> unusedBits);
		}
	}

	/**
	 * Default silence threshold.
	 * Floating-point: 1 / 2^10 (0.0009765625).
	 * Integer formats wider than 10 bits: 1 << (bits - 10).
	 * Integer formats of 10 bits or fewer: 0.
	 */
	static constexpr SampleType defaultThreshold() {
		if constexpr (std::is_floating_point_v<SampleType>)
			return SampleType(1) / (1 << 10);
		else if constexpr (bytesPerSample * 8 > 10)
			return static_cast<SampleType>(SampleType(1) << (bytesPerSample * 8 - 10));
		else
			return SampleType();
	}

	/**
	 * Re-centre a sample on zero.
	 * Integer samples are returned as the signed counterpart of SampleType;
	 * floating-point samples are returned unchanged.
	 */
	static constexpr auto toSigned(SampleType smp) {
		if constexpr (std::is_integral_v<SampleType>) {
			// In C++20, signed integer types must use two's complement,
			// so the following conversion is well-defined.
			using SignedType = std::make_signed_t<SampleType>;
			return static_cast<SignedType>(smp - zeroPoint());
		} else {
			return smp;
		}
	}

	/**
	 * Inverse of toSigned(): move a zero-centred value back to the format's own range.
	 * The parameter type is SampleType, so a signed argument is converted on entry.
	 */
	static constexpr SampleType fromSigned(SampleType smp) {
		if constexpr (std::is_integral_v<SampleType>) {
			// Signed overflow is undefined behavior,
			// so convert to unsigned first.
			using UnsignedType = std::make_unsigned_t<SampleType>;
			return static_cast<SampleType>(static_cast<UnsignedType>(smp) + zeroPoint());
		} else {
			return smp;
		}
	}

	/**
	 * Propagate the sign bit of a `bytesPerSample`-byte signed sample into the
	 * unused high bytes of SampleType. Unsigned and full-width samples are returned as-is.
	 */
	static constexpr SampleType signExtend(SampleType smp) {
		if constexpr (std::is_unsigned_v<SampleType> || bytesPerSample == sizeof(SampleType)) {
			return smp;
		} else {
			constexpr auto shift = (sizeof(SampleType) - bytesPerSample) * 8;
			// Convert to unsigned first to prevent left-shifting negative numbers
			using UnsignedType = std::make_unsigned_t<SampleType>;
			return static_cast<SampleType>(
				static_cast<SampleType>(static_cast<UnsignedType>(smp) << shift) >> shift);
		}
	}
};
/**
 * Return the length, in bytes, of the leading silence in a block of wave data.
 * Treats the data as a single channel (mono).
 *
 * @tparam Fmt      A `WaveFormat`-style type. This function uses `Fmt::SampleType`,
 *                  `Fmt::bytesPerSample`, `Fmt::zeroPoint()` and `Fmt::signExtend()`.
 * @param waveData  Start of the wave data. Not dereferenced when `size` is smaller
 *                  than one sample.
 * @param size      Size of the wave data, in bytes.
 * @param threshold Largest deviation from the zero point still counted as silence.
 *                  Expected to be non-negative and small enough that
 *                  `zeroPoint ± threshold` fits in `SampleType`; the bounds are
 *                  computed without saturation.
 * @return Byte offset of the first sample outside [zeroPoint - threshold,
 *         zeroPoint + threshold]; `size` if every whole sample is silent
 *         (trailing bytes that do not form a whole sample are not inspected);
 *         0 if the data is shorter than one sample.
 *
 * Samples are read by copying `bytesPerSample` bytes into the low-address bytes of
 * a `SampleType`, so narrower-than-type samples assume a little-endian host.
 */
template <typename Fmt>
size_t getLeadingSilenceSizeMono(
	const unsigned char* waveData,
	size_t size,
	typename Fmt::SampleType threshold
) {
	// `typename` is required for a dependent type name before C++20.
	using SampleType = typename Fmt::SampleType;
	constexpr size_t bytesPerSample = Fmt::bytesPerSample;

	if (size < bytesPerSample)
		return 0;

	constexpr SampleType zeroPoint = Fmt::zeroPoint();
	const SampleType lowestSilent = static_cast<SampleType>(zeroPoint - threshold);
	const SampleType highestSilent = static_cast<SampleType>(zeroPoint + threshold);

	// Only whole samples are examined; a trailing partial sample is ignored.
	const size_t wholeSamplesSize = size - (size % bytesPerSample);
	for (size_t offset = 0; offset < wholeSamplesSize; offset += bytesPerSample) {
		// Start from zero so the bytes not covered by the copy are well-defined.
		SampleType smp = SampleType();
		memcpy(&smp, waveData + offset, bytesPerSample);
		smp = Fmt::signExtend(smp);
		// This sample is out of range, so the leading silence ends just before it.
		if (smp < lowestSilent || smp > highestSilent)
			return offset;
	}

	// The whole data block is silence
	return size;
}
+ */ +inline size_t getLeadingSilenceSize( + const WAVEFORMATEX* wfx, + const unsigned char* waveData, + size_t size +) { + size_t len; + if (!callByWaveFormat(wfx, [=, &len](auto fmtTag) { + using Fmt = decltype(fmtTag); + len = getLeadingSilenceSizeMono( + waveData, size, Fmt::defaultThreshold()); + })) + return 0; + + return len - len % wfx->nBlockAlign; // round down to block (channel) boundaries +} + +} // namespace SilenceDetect + +#endif // SILENCEDETECT_H diff --git a/nvdaHelper/local/wasapi.cpp b/nvdaHelper/local/wasapi.cpp index e942f6952d..540815c33b 100644 --- a/nvdaHelper/local/wasapi.cpp +++ b/nvdaHelper/local/wasapi.cpp @@ -24,6 +24,7 @@ This license can be found at: #include #include #include +#include "silenceDetect.h" /** * Support for audio playback using WASAPI. @@ -194,6 +195,8 @@ class WasapiPlayer { HRESULT resume(); HRESULT setChannelVolume(unsigned int channel, float level); + void startTrimmingLeadingSilence(bool start); + private: void maybeFireCallback(); @@ -245,6 +248,7 @@ class WasapiPlayer { unsigned int defaultDeviceChangeCount; unsigned int deviceStateChangeCount; bool isUsingPreferredDevice = false; + bool isTrimmingLeadingSilence = false; }; WasapiPlayer::WasapiPlayer(wchar_t* endpointId, WAVEFORMATEX format, @@ -342,6 +346,19 @@ HRESULT WasapiPlayer::feed(unsigned char* data, unsigned int size, return true; }; + if (isTrimmingLeadingSilence) { + size_t silenceSize = SilenceDetect::getLeadingSilenceSize(&format, data, size); + if (silenceSize >= size) { + // The whole chunk is silence. Continue checking for silence in the next chunk. + remainingFrames = 0; + } else { + // Silence ends in this chunk. Skip the silence and continue. 
+ data += silenceSize; + remainingFrames = (size - silenceSize) / format.nBlockAlign; + isTrimmingLeadingSilence = false; // Stop checking for silence + } + } + while (remainingFrames > 0) { UINT32 paddingFrames; @@ -643,6 +660,10 @@ HRESULT WasapiPlayer::setChannelVolume(unsigned int channel, float level) { return volume->SetChannelVolume(channel, level); } +void WasapiPlayer::startTrimmingLeadingSilence(bool start) { + isTrimmingLeadingSilence = start; +} + HRESULT WasapiPlayer::disableCommunicationDucking(IMMDevice* device) { // Disable the default ducking experience used when a communication audio // session is active, as we never want NVDA's audio to be ducked. @@ -839,6 +860,10 @@ HRESULT wasPlay_setChannelVolume( return player->setChannelVolume(channel, level); } +void wasPlay_startTrimmingLeadingSilence(WasapiPlayer* player, bool start) { + player->startTrimmingLeadingSilence(start); +} + /** * This must be called once per session at startup before wasPlay_create is * called. diff --git a/projectDocs/dev/developerGuide/developerGuide.md b/projectDocs/dev/developerGuide/developerGuide.md index d0b8c24712..2a60cbe475 100644 --- a/projectDocs/dev/developerGuide/developerGuide.md +++ b/projectDocs/dev/developerGuide/developerGuide.md @@ -1393,6 +1393,7 @@ For examples of how to define and use new extension points, please see the code |`Action` |`synthIndexReached` |Notifies when a synthesizer reaches an index during speech.| |`Action` |`synthDoneSpeaking` |Notifies when a synthesizer finishes speaking.| |`Action` |`synthChanged` |Notifies of synthesizer changes.| +|`Action` |`pre_synthSpeak` |Notifies when the current synthesizer is about to speak something.| ### tones {#tonesExtPts} diff --git a/source/config/configSpec.py b/source/config/configSpec.py index 4653f5ebc9..9ca325cfc8 100644 --- a/source/config/configSpec.py +++ b/source/config/configSpec.py @@ -45,6 +45,7 @@ autoDialectSwitching = boolean(default=false) delayedCharacterDescriptions = 
boolean(default=false) excludedSpeechModes = int_list(default=list()) + trimLeadingSilence = boolean(default=true) [[__many__]] capPitchChange = integer(default=30,min=-100,max=100) diff --git a/source/gui/settingsDialogs.py b/source/gui/settingsDialogs.py index 6278c2440b..ee764e079e 100644 --- a/source/gui/settingsDialogs.py +++ b/source/gui/settingsDialogs.py @@ -3720,6 +3720,7 @@ def __init__(self, parent): # Advanced settings panel label = _("Speech") speechSizer = wx.StaticBoxSizer(wx.VERTICAL, self, label=label) + speechBox = speechSizer.GetStaticBox() speechGroup = guiHelper.BoxSizerHelper(speechSizer, sizer=speechSizer) sHelper.addItem(speechGroup) @@ -3750,6 +3751,14 @@ def __init__(self, parent): ["featureFlag", "cancelExpiredFocusSpeech"], ) + # Translators: This is the label for a checkbox control in the + # Advanced settings panel. + label = _("Trim leading silence in speech audio") + self.trimLeadingSilenceCheckBox = speechGroup.addItem(wx.CheckBox(speechBox, label=label)) + self.bindHelpEvent("TrimLeadingSilenceSpeech", self.trimLeadingSilenceCheckBox) + self.trimLeadingSilenceCheckBox.SetValue(config.conf["speech"]["trimLeadingSilence"]) + self.trimLeadingSilenceCheckBox.defaultValue = self._getDefaultValue(["speech", "trimLeadingSilence"]) + # Translators: This is the label for a group of advanced options in the # Advanced settings panel label = _("Virtual Buffers") @@ -3934,6 +3943,7 @@ def haveConfigDefaultsBeenRestored(self): and self.wtStrategyCombo.isValueConfigSpecDefault() and self.cancelExpiredFocusSpeechCombo.GetSelection() == self.cancelExpiredFocusSpeechCombo.defaultValue + and self.trimLeadingSilenceCheckBox.IsChecked() == self.trimLeadingSilenceCheckBox.defaultValue and self.loadChromeVBufWhenBusyCombo.isValueConfigSpecDefault() and self.caretMoveTimeoutSpinControl.GetValue() == self.caretMoveTimeoutSpinControl.defaultValue and self.reportTransparentColorCheckBox.GetValue() @@ -3963,6 +3973,7 @@ def restoreToDefaults(self): 
self.diffAlgoCombo.SetSelection(self.diffAlgoCombo.defaultValue) self.wtStrategyCombo.resetToConfigSpecDefault() self.cancelExpiredFocusSpeechCombo.SetSelection(self.cancelExpiredFocusSpeechCombo.defaultValue) + self.trimLeadingSilenceCheckBox.SetValue(self.trimLeadingSilenceCheckBox.defaultValue) self.loadChromeVBufWhenBusyCombo.resetToConfigSpecDefault() self.caretMoveTimeoutSpinControl.SetValue(self.caretMoveTimeoutSpinControl.defaultValue) self.reportTransparentColorCheckBox.SetValue(self.reportTransparentColorCheckBox.defaultValue) @@ -3974,6 +3985,14 @@ def restoreToDefaults(self): def onSave(self): log.debug("Saving advanced config") + + if config.conf["speech"]["trimLeadingSilence"] != self.trimLeadingSilenceCheckBox.IsChecked(): + # Reload the synthesizer if "trimLeadingSilence" changes + config.conf["speech"]["trimLeadingSilence"] = self.trimLeadingSilenceCheckBox.IsChecked() + currentSynth = getSynth() + if not setSynth(currentSynth.name): + _synthWarningDialog(currentSynth.name) + config.conf["development"]["enableScratchpadDir"] = self.scratchpadCheckBox.IsChecked() selectiveUIAEventRegistrationChoice = self.selectiveUIAEventRegistrationCombo.GetSelection() config.conf["UIA"]["eventRegistration"] = self.selectiveUIAEventRegistrationVals[ diff --git a/source/nvwave.py b/source/nvwave.py index 59ce04d6fd..9a2390dfd2 100644 --- a/source/nvwave.py +++ b/source/nvwave.py @@ -39,6 +39,9 @@ import core import globalVars from pycaw.utils import AudioUtilities +from speech import SpeechSequence +from speech.commands import BreakCommand +from synthDriverHandler import pre_synthSpeak from utils.mmdevice import _getOutputDevices @@ -289,6 +292,14 @@ def __init__( if config.conf["audio"]["audioAwakeTime"] > 0: NVDAHelper.localLib.wasSilence_init(outputDevice) WasapiWavePlayer._silenceDevice = outputDevice + # Enable trimming by default for speech only + self.enableTrimmingLeadingSilence( + purpose is AudioPurpose.SPEECH and 
config.conf["speech"]["trimLeadingSilence"], + ) + if self._enableTrimmingLeadingSilence: + self.startTrimmingLeadingSilence() + self._isLeadingSilenceInserted: bool = False + pre_synthSpeak.register(self._onPreSpeak) @wasPlay_callback def _callback(cppPlayer, feedId): @@ -313,6 +324,7 @@ def __del__(self): # a weakref callback can run before __del__ in some cases, which would mean # it has already been removed from _instances. self._player = None + pre_synthSpeak.unregister(self._onPreSpeak) def open(self): """Open the output device. @@ -323,7 +335,7 @@ def open(self): NVDAHelper.localLib.wasPlay_open(self._player) except WindowsError: log.warning( - "Couldn't open specified or default audio device. " "There may be no audio devices.", + "Couldn't open specified or default audio device. There may be no audio devices.", ) WavePlayer.audioDeviceError_static = True raise @@ -357,6 +369,10 @@ def feed( feedId = c_uint() if onDone else None # Never treat this instance as idle while we're feeding. self._lastActiveTime = None + # If a BreakCommand is used to insert leading silence in this utterance, + # turn off trimming temporarily. + if self._purpose is AudioPurpose.SPEECH and self._isLeadingSilenceInserted: + self.startTrimmingLeadingSilence(False) try: NVDAHelper.localLib.wasPlay_feed( self._player, @@ -393,6 +409,8 @@ def sync(self): def idle(self): """Indicate that this player is now idle; i.e. 
the current continuous segment of audio is complete.""" self.sync() + if self._enableTrimmingLeadingSilence: + self.startTrimmingLeadingSilence() if self._audioDucker: self._audioDucker.disable() @@ -401,6 +419,8 @@ def stop(self): if self._audioDucker: self._audioDucker.disable() NVDAHelper.localLib.wasPlay_stop(self._player) + if self._enableTrimmingLeadingSilence: + self.startTrimmingLeadingSilence() self._lastActiveTime = None self._isPaused = False self._doneCallbacks = {} @@ -455,6 +475,17 @@ def setVolume( if not (all and e.winerror == E_INVALIDARG): raise + def enableTrimmingLeadingSilence(self, enable: bool) -> None: + """Enable or disable automatic leading silence removal. + This is by default enabled for speech audio, and disabled for non-speech audio.""" + self._enableTrimmingLeadingSilence = enable + if not enable: + self.startTrimmingLeadingSilence(False) + + def startTrimmingLeadingSilence(self, start: bool = True) -> None: + """Start or stop trimming the leading silence from the next audio chunk.""" + NVDAHelper.localLib.wasPlay_startTrimmingLeadingSilence(self._player, start) + def _setVolumeFromConfig(self): if self._purpose is not AudioPurpose.SOUNDS: return @@ -508,6 +539,7 @@ def _idleCheck(cls): if player._lastActiveTime <= threshold: try: NVDAHelper.localLib.wasPlay_idle(player._player) + player.startTrimmingLeadingSilence() except OSError: # #16125: IAudioClock::GetPosition sometimes fails with an access # violation on a device which has been invalidated. This shouldn't happen @@ -524,6 +556,16 @@ def _idleCheck(cls): # Schedule another check here in case feed isn't called for a while. cls._scheduleIdleCheck() + def _onPreSpeak(self, speechSequence: SpeechSequence): + self._isLeadingSilenceInserted = False + # Check if leading silence of the current utterance is inserted by a BreakCommand. 
+ for item in speechSequence: + if isinstance(item, BreakCommand): + self._isLeadingSilenceInserted = True + break + elif isinstance(item, str): + break + WavePlayer = WasapiWavePlayer fileWavePlayer: Optional[WavePlayer] = None diff --git a/source/speech/manager.py b/source/speech/manager.py index b642bfdf83..d5b374ccaf 100644 --- a/source/speech/manager.py +++ b/source/speech/manager.py @@ -22,7 +22,7 @@ from .priorities import Spri, SPEECH_PRIORITIES from logHandler import log -from synthDriverHandler import getSynth +from synthDriverHandler import getSynth, pre_synthSpeak from typing import ( Dict, Any, @@ -431,6 +431,7 @@ def _pushNextSpeech(self, doneSpeaking: bool): self._indexesSpeaking.append(item.index) self._cancelledLastSpeechWithSynth = False log._speechManagerUnitTest(f"Synth Gets: {seq}") + pre_synthSpeak.notify(speechSequence=seq) getSynth().speak(seq) def _getNextPriority(self): diff --git a/source/synthDriverHandler.py b/source/synthDriverHandler.py index 1e4bb9b02c..07a4a997cf 100644 --- a/source/synthDriverHandler.py +++ b/source/synthDriverHandler.py @@ -567,3 +567,11 @@ def isDebugForSynthDriver(): @param isFallback: Whether the synth is set as fallback synth due to another synth's failure @type isFallback: bool """ + +pre_synthSpeak = extensionPoints.Action() +""" +Notifies when speak() of the current synthesizer is about to be called. + +:param speechSequence: the speech sequence to pass to speak() +:type speechSequence: speech.SpeechSequence +""" diff --git a/user_docs/en/changes.md b/user_docs/en/changes.md index fe7c3b241d..665b6da51f 100644 --- a/user_docs/en/changes.md +++ b/user_docs/en/changes.md @@ -59,6 +59,7 @@ Prefix matching on command line flags, e.g. using `--di` for `--disable-addons` * Microsoft Speech API version 5 and Microsoft Speech Platform voices now use WASAPI for audio output, which may improve the responsiveness of those voices. 
(#13284, @gexgd0419) * The keyboard settings for "Speak typed characters" and "Speak typed words" now have three options: Off, Only in edit controls, and Always. (#17505, @Cary-rowen) * By default, "Speak typed characters" is now set to "Only in edit controls". +* The silence at the beginning of speech will now be trimmed when using OneCore voices, SAPI5 voices, and some third-party voice add-ons to improve their responsiveness. (#17614, @gexgd0419) ### Bug Fixes @@ -147,6 +148,7 @@ Add-ons will need to be re-tested and have their manifest updated. * Added the `matchFunc` parameter to `addUsbDevices` which is also available on `addUsbDevice`. * This way device detection can be constrained further in cases where a VID/PID-combination is shared by multiple devices across multiple drivers, or when a HID device offers multiple endpoints, for example. * See the method documentation as well as examples in the albatross and brailliantB drivers for more information. +* Added a new extension point `pre_synthSpeak` in `synthDriverHandler`, which will be called before the speech manager calls `speak` of the current synthesizer. #### API Breaking Changes diff --git a/user_docs/en/userGuide.md b/user_docs/en/userGuide.md index dfc6698346..c1e631a2a6 100644 --- a/user_docs/en/userGuide.md +++ b/user_docs/en/userGuide.md @@ -3321,6 +3321,12 @@ This option enables behaviour which attempts to cancel speech for expired focus In particular moving quickly through messages in Gmail with Chrome can cause NVDA to speak outdated information. This functionality is enabled by default as of NVDA 2021.1. +##### Trim leading silence in speech audio {#TrimLeadingSilenceSpeech} + +When enabled, NVDA will remove silence from the start of speech audio, which may improve the responsiveness of some speech synthesizers. +This option is enabled by default, and should only affect the silence at the beginning of speech. +If you find that some necessary silence periods are also missing (e.g. 
pause between two sentences) when using a speech synthesizer add-on, you may turn this feature off entirely to resolve the issue. + ##### Caret move timeout (in MS) {#AdvancedSettingsCaretMoveTimeout} This option allows you to configure the number of milliseconds NVDA will wait for the caret (insertion point) to move in editable text controls.