diff --git a/source/config/configSpec.py b/source/config/configSpec.py index c08f31546ce..b1d3146df13 100644 --- a/source/config/configSpec.py +++ b/source/config/configSpec.py @@ -297,6 +297,8 @@ [uwpOcr] language = string(default="") + autoRefresh = boolean(default=false) + autoRefreshInterval = integer(default=1500, min=100) [upgrade] newLaptopKeyboardLayout = boolean(default=false) diff --git a/source/contentRecog/__init__.py b/source/contentRecog/__init__.py index 600bc281f30..66a4127b7a1 100644 --- a/source/contentRecog/__init__.py +++ b/source/contentRecog/__init__.py @@ -1,8 +1,7 @@ -#contentRecog/__init__.py -#A part of NonVisual Desktop Access (NVDA) -#Copyright (C) 2017 NV Access Limited -#This file is covered by the GNU General Public License. -#See the file COPYING for more details. +# A part of NonVisual Desktop Access (NVDA) +# Copyright (C) 2017-2023 NV Access Limited, James Teh, Leonard de Ruijter +# This file is covered by the GNU General Public License. +# See the file COPYING for more details. """Framework for recognition of content; OCR, image recognition, etc. When authors don't provide sufficient information for a screen reader user to determine the content of something, @@ -14,11 +13,17 @@ """ from collections import namedtuple +import ctypes +from typing import Callable, Dict, List, Union import garbageHandler +from baseObject import AutoPropertyObject import cursorManager import textInfos.offsets from abc import ABCMeta, abstractmethod from locationHelper import RectLTWH +from NVDAObjects import NVDAObject + +onRecognizeResultCallbackT = Callable[[Union["RecognitionResult", Exception]], None] class BaseContentRecogTextInfo(cursorManager._ReviewCursorManagerTextInfo): @@ -27,24 +32,31 @@ class BaseContentRecogTextInfo(cursorManager._ReviewCursorManagerTextInfo): """ -class ContentRecognizer(garbageHandler.TrackedObject, metaclass=ABCMeta): +class ContentRecognizer(AutoPropertyObject): """Implementation of a content recognizer. 
""" - def getResizeFactor(self, width, height): + allowAutoRefresh: bool = False + """ + Whether to allow automatic, periodic refresh when using this recognizer. + This allows the user to see live changes as they occur. However, if a + recognizer uses an internet service or is very resource intensive, this + may be undesirable. + """ + autoRefreshInterval: int = 1500 + """How often (in ms) to perform recognition.""" + + def getResizeFactor(self, width: int, height: int) -> Union[int, float]: """Return the factor by which an image must be resized before it is passed to this recognizer. @param width: The width of the image in pixels. - @type width: int @param height: The height of the image in pixels. - @type height: int @return: The resize factor, C{1} for no resizing. - @rtype: int or float """ return 1 @abstractmethod - def recognize(self, pixels, imageInfo, onResult): + def recognize(self, pixels: ctypes.Array, imageInfo: "RecogImageInfo", onResult: onRecognizeResultCallbackT): """Asynchronously recognize content from an image. This method should not block. Only one recognition can be performed at a time. @@ -56,9 +68,8 @@ def recognize(self, pixels, imageInfo, onResult): However, the alpha channel should be ignored. @type pixels: Two dimensional array (y then x) of L{winGDI.RGBQUAD} @param imageInfo: Information about the image for recognition. - @type imageInfo: L{RecogImageInfo} - @param onResult: A callable which takes a L{RecognitionResult} (or an exception on failure) as its only argument. - @type onResult: callable + @param onResult: A callable which takes a L{RecognitionResult} (or an exception on failure) + as its only argument. 
""" raise NotImplementedError @@ -73,17 +84,16 @@ def validateCaptureBounds(self, location: RectLTWH) -> bool: """ return True - def validateObject(self, nav): + def validateObject(self, nav: NVDAObject) -> bool: """Validation to be performed on the navigator object before content recognition @param nav: The navigator object to be validated - @type nav: L{NVDAObjects.NVDAObject} @return: C{True} or C{False}, depending on whether the navigator object is valid or not. C{True} for no validation. - @rtype: bool """ return True -class RecogImageInfo(object): + +class RecogImageInfo: """Encapsulates information about a recognized image and provides functionality to convert coordinates. An image captured for recognition can begin at any point on the screen. @@ -97,18 +107,20 @@ class RecogImageInfo(object): This is done using the L{convertXToScreen} and L{convertYToScreen} methods. """ - def __init__(self, screenLeft, screenTop, screenWidth, screenHeight, resizeFactor): + def __init__( + self, + screenLeft: int, + screenTop: int, + screenWidth: int, + screenHeight: int, + resizeFactor: Union[int, float] + ): """ @param screenLeft: The x screen coordinate of the upper-left corner of the image. - @type screenLeft: int @param screenTop: The y screen coordinate of the upper-left corner of the image. - @type screenTop: int @param screenWidth: The width of the image on the screen. - @type screenWidth: int @param screenHeight: The height of the image on the screen. - @type screenHeight: int @param resizeFactor: The factor by which the image must be resized for recognition. - @type resizeFactor: int or float @raise ValueError: If the supplied screen coordinates indicate that the image is not visible; e.g. width or height of 0. 
""" @@ -125,7 +137,14 @@ def __init__(self, screenLeft, screenTop, screenWidth, screenHeight, resizeFacto self.recogHeight = int(screenHeight * resizeFactor) @classmethod - def createFromRecognizer(cls, screenLeft, screenTop, screenWidth, screenHeight, recognizer): + def createFromRecognizer( + cls, + screenLeft: int, + screenTop: int, + screenWidth: int, + screenHeight: int, + recognizer: ContentRecognizer + ): """Convenience method to construct an instance using a L{ContentRecognizer}. The resize factor is obtained by calling L{ContentRecognizer.getResizeFactor}. """ @@ -172,10 +191,12 @@ def makeTextInfo(self, obj, position) -> BaseContentRecogTextInfo: """ raise NotImplementedError + # Used internally by LinesWordsResult. # (Lwr is short for LinesWordsResult.) LwrWord = namedtuple("LwrWord", ("offset", "left", "top", "width", "height")) + class LinesWordsResult(RecognitionResult): """A L{RecognizerResult} which can create TextInfos based on a simple lines/words data structure. The data structure is a list of lines, wherein each line is a list of words, @@ -183,7 +204,7 @@ class LinesWordsResult(RecognitionResult): Several OCR engines produce output in a format which can be easily converted to this. """ - def __init__(self, data, imageInfo): + def __init__(self, data: List[List[Dict[str, Union[str, int]]]], imageInfo: RecogImageInfo): """Constructor. @param data: The lines/words data structure. For example: [ @@ -196,11 +217,9 @@ def __init__(self, data, imageInfo): {"x": 117, "y": 105, "width": 11, "height": 9, "text": "Word4"} ] ] - @type data: list of lists of dicts @param imageInfo: Information about the recognized image. This is used to convert coordinates in the recognized image to screen coordinates. - @type imageInfo: L{RecogImageInfo} """ self.data = data self.imageInfo = imageInfo @@ -223,11 +242,13 @@ def _parseData(self): # Separate with a space. 
self._textList.append(" ") self.textLen += 1 - self.words.append(LwrWord(self.textLen, + self.words.append(LwrWord( + self.textLen, self.imageInfo.convertXToScreen(word["x"]), self.imageInfo.convertYToScreen(word["y"]), self.imageInfo.convertWidthToScreen(word["width"]), - self.imageInfo.convertHeightToScreen(word["height"]))) + self.imageInfo.convertHeightToScreen(word["height"])) + ) text = word["text"] self._textList.append(text) self.textLen += len(text) @@ -249,7 +270,7 @@ class LwrTextInfo(BaseContentRecogTextInfo, textInfos.offsets.OffsetsTextInfo): def __init__(self, obj, position, result): self.result = result - super(LwrTextInfo, self).__init__(obj, position) + super().__init__(obj, position) def copy(self): return self.__class__(self.obj, self.bookmark, self.result) @@ -315,7 +336,7 @@ class SimpleResultTextInfo(BaseContentRecogTextInfo, textInfos.offsets.OffsetsTe def __init__(self, obj, position, result): self.result = result - super(SimpleResultTextInfo, self).__init__(obj, position) + super().__init__(obj, position) def copy(self): return self.__class__(self.obj, self.bookmark, self.result) @@ -325,6 +346,3 @@ def _getStoryText(self): return self.result.text def _getStoryLength(self): return len(self.result.text) - - def _getStoryText(self): - return self.result.text diff --git a/source/contentRecog/recogUi.py b/source/contentRecog/recogUi.py index ef2df600a94..bb11140ee7e 100644 --- a/source/contentRecog/recogUi.py +++ b/source/contentRecog/recogUi.py @@ -1,8 +1,7 @@ -#contentRecog/recogUi.py -#A part of NonVisual Desktop Access (NVDA) -#Copyright (C) 2017 NV Access Limited -#This file is covered by the GNU General Public License. -#See the file COPYING for more details. +# A part of NonVisual Desktop Access (NVDA) +# Copyright (C) 2017-2023 NV Access Limited, James Teh, Leonard de Ruijter +# This file is covered by the GNU General Public License. +# See the file COPYING for more details. """User interface for content recognition. 
This module provides functionality to capture an image from the screen @@ -11,10 +10,12 @@ NVDA scripts or GUI call the L{recognizeNavigatorObject} function with the recognizer they wish to use. """ +from typing import Optional, Union import api import ui import screenBitmap import NVDAObjects.window +from NVDAObjects.behaviors import LiveText import controlTypes import browseMode import cursorManager @@ -22,7 +23,8 @@ import textInfos from logHandler import log import queueHandler -from . import RecogImageInfo, BaseContentRecogTextInfo +import core +from . import RecogImageInfo, ContentRecognizer, RecognitionResult, onRecognizeResultCallbackT class RecogResultNVDAObject(cursorManager.CursorManager, NVDAObjects.window.Window): @@ -40,8 +42,9 @@ class RecogResultNVDAObject(cursorManager.CursorManager, NVDAObjects.window.Wind def __init__(self, result=None, obj=None): self.parent = parent = api.getFocusObject() self.result = result - self._selection = self.makeTextInfo(textInfos.POSITION_FIRST) - super(RecogResultNVDAObject, self).__init__(windowHandle=parent.windowHandle) + if result: + self._selection = self.makeTextInfo(textInfos.POSITION_FIRST) + super().__init__(windowHandle=parent.windowHandle) def makeTextInfo(self, position): # Maintain our own fake selection/caret. @@ -64,6 +67,9 @@ def setFocus(self): # This might get called from a background thread and all NVDA events must run in the main thread. eventHandler.queueEvent("gainFocus", self) + def _get_hasFocus(self) -> bool: + return self is api.getFocusObject() + def script_activatePosition(self, gesture): try: self._selection.activate() @@ -98,13 +104,105 @@ def script_findPrevious(self, gesture): "kb:escape": "exit", } + +class RefreshableRecogResultNVDAObject(RecogResultNVDAObject, LiveText): + """NVDA Object that itself is responsible for fetching the recognition result. + It is also able to refresh the result at intervals when the recognizer supports it. 
+ """ + + def __init__( + self, + recognizer: ContentRecognizer, + imageInfo: RecogImageInfo, + obj: Optional[NVDAObjects.NVDAObject] = None + ): + self.recognizer = recognizer + self.imageInfo = imageInfo + super().__init__(result=None, obj=obj) + LiveText.initOverlayClass(self) + + def _recognize(self, onResult: onRecognizeResultCallbackT): + if self.result and not self.hasFocus: + # We've already recognized once, so we did have focus, but we don't any + # more. This means the user dismissed the recognition result, so we + # shouldn't recognize again. + return + imgInfo = self.imageInfo + sb = screenBitmap.ScreenBitmap(imgInfo.recogWidth, imgInfo.recogHeight) + pixels = sb.captureImage( + imgInfo.screenLeft, imgInfo.screenTop, + imgInfo.screenWidth, imgInfo.screenHeight + ) + self.recognizer.recognize(pixels, self.imageInfo, onResult) + + def _onFirstResult(self, result: Union[RecognitionResult, Exception]): + global _activeRecog + _activeRecog = None + # This might get called from a background thread, so any UI calls must be queued to the main thread. + if isinstance(result, Exception): + log.error(f"Recognition failed: {result}") + queueHandler.queueFunction( + queueHandler.eventQueue, + ui.message, + # Translators: Reported when recognition (e.g. OCR) fails. + _("Recognition failed") + ) + return + self.result = result + self._selection = self.makeTextInfo(textInfos.POSITION_FIRST) + # This method queues an event to the main thread. + self.setFocus() + if self.recognizer.allowAutoRefresh: + self._scheduleRecognize() + + def _scheduleRecognize(self): + core.callLater(self.recognizer.autoRefreshInterval, self._recognize, self._onResult) + + def _onResult(self, result: Union[RecognitionResult, Exception]): + if not self.hasFocus: + # The user has dismissed the recognition result. 
+ return + if isinstance(result, Exception): + log.error(f"Subsequent recognition failed: {result}") + queueHandler.queueFunction( + queueHandler.eventQueue, + ui.message, + # Translators: Reported when recognition (e.g. OCR) fails during automatic refresh. + _("Automatic refresh of recognition result failed") + ) + self.stopMonitoring() + return + self.result = result + # The current selection refers to the old result. We need to refresh that, + # but try to keep the same cursor position. + self.selection = self.makeTextInfo(self._selection.bookmark) + # Tell LiveText that our text has changed. + self.event_textChange() + self._scheduleRecognize() + + def event_gainFocus(self): + super().event_gainFocus() + if self.recognizer.allowAutoRefresh: + # Make LiveText watch for and report new text. + self.startMonitoring() + + def event_loseFocus(self): + # note: If monitoring has not been started, this will have no effect. + self.stopMonitoring() + super().event_loseFocus() + + def start(self): + self._recognize(self._onFirstResult) + + #: Keeps track of the recognition in progress, if any. _activeRecog = None -def recognizeNavigatorObject(recognizer): + + +def recognizeNavigatorObject(recognizer: ContentRecognizer): """User interface function to recognize content in the navigator object. This should be called from a script or in response to a GUI action. @param recognizer: The content recognizer to use. - @type recognizer: L{contentRecog.ContentRecognizer} """ global _activeRecog if isinstance(api.getFocusObject(), RecogResultNVDAObject): @@ -132,24 +230,8 @@ def recognizeNavigatorObject(recognizer): ui.message(notVisibleMsg) return if _activeRecog: - _activeRecog.cancel() + _activeRecog.recognizer.cancel() # Translators: Reporting when content recognition (e.g. OCR) begins. 
ui.message(_("Recognizing")) - sb = screenBitmap.ScreenBitmap(imgInfo.recogWidth, imgInfo.recogHeight) - pixels = sb.captureImage(left, top, width, height) - _activeRecog = recognizer - recognizer.recognize(pixels, imgInfo, _recogOnResult) - -def _recogOnResult(result): - global _activeRecog - _activeRecog = None - # This might get called from a background thread, so any UI calls must be queued to the main thread. - if isinstance(result, Exception): - # Translators: Reported when recognition (e.g. OCR) fails. - log.error("Recognition failed: %s" % result) - queueHandler.queueFunction(queueHandler.eventQueue, - ui.message, _("Recognition failed")) - return - resObj = RecogResultNVDAObject(result=result) - # This method queues an event to the main thread. - resObj.setFocus() + _activeRecog = RefreshableRecogResultNVDAObject(recognizer=recognizer, imageInfo=imgInfo) + _activeRecog.start() diff --git a/source/contentRecog/uwpOcr.py b/source/contentRecog/uwpOcr.py index c743604c94c..57ed15ab97c 100644 --- a/source/contentRecog/uwpOcr.py +++ b/source/contentRecog/uwpOcr.py @@ -66,8 +66,17 @@ def getConfigLanguage(): config.conf["uwpOcr"]["language"] = initial return initial + class UwpOcr(ContentRecognizer): + @classmethod + def _get_allowAutoRefresh(cls) -> bool: + return config.conf['uwpOcr']['autoRefresh'] + + @classmethod + def _get_autoRefreshInterval(cls) -> int: + return config.conf['uwpOcr']['autoRefreshInterval'] + def getResizeFactor(self, width, height): # UWP OCR performs poorly with small images, so increase their size. 
if width < 100 or height < 100: diff --git a/source/globalCommands.py b/source/globalCommands.py index 98885cc4a4f..6ec1c47f4fe 100755 --- a/source/globalCommands.py +++ b/source/globalCommands.py @@ -4263,6 +4263,13 @@ def _enableScreenCurtain(doEnable: bool = True): ) ) else: + from contentRecog.recogUi import RefreshableRecogResultNVDAObject + focusObj = api.getFocusObject() + if isinstance(focusObj, RefreshableRecogResultNVDAObject) and focusObj.recognizer.allowAutoRefresh: + # Translators: Warning message when trying to enable the screen curtain when OCR is active. + warningMessage = _("Could not enable screen curtain when performing content recognition") + ui.message(warningMessage, speechPriority=speech.priorities.Spri.NOW) + return _enableScreenCurtain() @script( diff --git a/source/gui/settingsDialogs.py b/source/gui/settingsDialogs.py index e5ebc2ec508..3d3e3e1c9f5 100644 --- a/source/gui/settingsDialogs.py +++ b/source/gui/settingsDialogs.py @@ -2660,9 +2660,18 @@ def makeSettings(self, settingsSizer): except ValueError: self.languageChoice.Selection = 0 + # Translators: Label for an option in the Windows OCR settings panel. 
+ autoRefreshText = _("Periodically &refresh recognized content") + self.autoRefreshCheckbox = sHelper.addItem( + wx.CheckBox(self, label=autoRefreshText) + ) + self.bindHelpEvent("Win10OcrSettingsAutoRefresh", self.autoRefreshCheckbox) + self.autoRefreshCheckbox.SetValue(config.conf["uwpOcr"]["autoRefresh"]) + def onSave(self): lang = self.languageCodes[self.languageChoice.Selection] config.conf["uwpOcr"]["language"] = lang + config.conf["uwpOcr"]["autoRefresh"] = self.autoRefreshCheckbox.IsChecked() class AdvancedPanelControls( diff --git a/source/visionEnhancementProviders/screenCurtain.py b/source/visionEnhancementProviders/screenCurtain.py index 1f957e0f599..319cf8a3a35 100644 --- a/source/visionEnhancementProviders/screenCurtain.py +++ b/source/visionEnhancementProviders/screenCurtain.py @@ -279,11 +279,27 @@ def _onCheckEvent(self, evt: wx.CommandEvent): if evt.GetEventObject() is self._enabledCheckbox: self._ensureEnableState(evt.IsChecked()) + def _ocrActive(self) -> bool: + """Outputs a message when trying to activate screen curtain when OCR is active. + @returns: C{True} when OCR is active, C{False} otherwise. + """ + import api + from contentRecog.recogUi import RefreshableRecogResultNVDAObject + import speech + import ui + focusObj = api.getFocusObject() + if isinstance(focusObj, RefreshableRecogResultNVDAObject) and focusObj.recognizer.allowAutoRefresh: + # Translators: Warning message when trying to enable the screen curtain when OCR is active. 
+ warningMessage = _("Could not enable screen curtain when performing content recognition") + ui.message(warningMessage, speechPriority=speech.priorities.Spri.NOW) + return True + return False + def _ensureEnableState(self, shouldBeEnabled: bool): currentlyEnabled = bool(self._providerControl.getProviderInstance()) if shouldBeEnabled and not currentlyEnabled: confirmed = self.confirmInitWithUser() - if not confirmed or not self._providerControl.startProvider(): + if not confirmed or self._ocrActive() or not self._providerControl.startProvider(): self._enabledCheckbox.SetValue(False) elif not shouldBeEnabled and currentlyEnabled: self._providerControl.terminateProvider() diff --git a/tests/checkPot.py b/tests/checkPot.py index 5a21dbc682b..f6163f2df51 100644 --- a/tests/checkPot.py +++ b/tests/checkPot.py @@ -56,7 +56,6 @@ 'Display', 'left', 'right', - 'Recognition failed', 'NVDA &web site', 'E&xit', 'Error renaming profile.', diff --git a/user_docs/en/changes.t2t b/user_docs/en/changes.t2t index 4d0bcd3d72c..2676f753866 100644 --- a/user_docs/en/changes.t2t +++ b/user_docs/en/changes.t2t @@ -15,6 +15,10 @@ What's New in NVDA - An option to separately configure the volume of NVDA sounds. (#1409, #15038) - - +- NVDA is now able to continually update the result when performing optical character recognition (OCR), speaking new text as it appears. (#2797) + - To enable this functionality, enable the option "Periodically refresh recognized content" in the Windows OCR category of NVDA's settings dialog. + - Once enabled, you can toggle speaking new text by toggling report dynamic content changes (pressing ``NVDA+5``). + - - When using automatic detection of braille displays, it is now possible to opt-out drivers from detection from the braille display selection dialog. (#15196) - A new option in Document Formatting settings, "Ignore blank lines for line indentation reporting". 
(#13394) - diff --git a/user_docs/en/userGuide.t2t b/user_docs/en/userGuide.t2t index fe0bd40fa30..3ff435a5885 100644 --- a/user_docs/en/userGuide.t2t +++ b/user_docs/en/userGuide.t2t @@ -1096,6 +1096,9 @@ NVDA can use this to recognize text from images or inaccessible applications. You can set the language to use for text recognition in the [Windows OCR category #Win10OcrSettings] of the [NVDA Settings #NVDASettings] dialog. Additional languages can be installed by opening the Start menu, choosing Settings, selecting Time & Language -> Region & Language and then choosing Add a language. +When you want to monitor constantly changing content, such as when watching a video with subtitles, you can optionally enable automatic refresh of the recognized content. +This can also be done in the [Windows OCR category #Win10OcrSettings] of the [NVDA Settings #NVDASettings] dialog. + Windows OCR may be partially or fully incompatible with [NVDA vision enhancements #Vision] or other external visual aids. You will need to disable these aids before proceeding to a recognition. %kc:beginInclude @@ -2196,6 +2199,12 @@ This category contains the following options: This combo box allows you to choose the language to be used for text recognition. To cycle through available languages from anywhere, please assign a custom gesture using the [Input Gestures dialog #InputGestures]. +==== Periodically refresh recognized content ====[Win10OcrSettingsAutoRefresh] +When this checkbox is enabled, NVDA will automatically refresh the recognized content when a recognition result has focus. +This can be very useful when you want to monitor constantly changing content, such as when watching a video with subtitles. +The refresh takes place every one and a half seconds. +This option is disabled by default. + +++ Advanced Settings +++[AdvancedSettings] Warning! The settings in this category are for advanced users and may cause NVDA to not function correctly if configured in the wrong way. 
Only make changes to these settings if you are sure you know what you are doing or if you have been specifically instructed to by an NVDA developer.