diff --git a/python/packages/autogen-ext/src/autogen_ext/agents/web_surfer/_multimodal_web_surfer.py b/python/packages/autogen-ext/src/autogen_ext/agents/web_surfer/_multimodal_web_surfer.py
index f90dc01cdda2..748f61e42f61 100644
--- a/python/packages/autogen-ext/src/autogen_ext/agents/web_surfer/_multimodal_web_surfer.py
+++ b/python/packages/autogen-ext/src/autogen_ext/agents/web_surfer/_multimodal_web_surfer.py
@@ -264,8 +264,6 @@ def _download_handler(download: Download) -> None:
             TOOL_SLEEP,
             TOOL_HOVER,
         ]
-        # Number of lines of text to extract from the page in the absence of OCR
-        self.n_lines_page_text = 50
         self.did_lazy_init = False  # flag to check if we have initialized the browser
 
     async def _lazy_init(
@@ -743,7 +741,7 @@ async def _execute_tool(
         ocr_text = (
             await self._get_ocr_text(new_screenshot, cancellation_token=cancellation_token)
             if self.use_ocr is True
-            else await self._playwright_controller.get_webpage_text(self._page, n_lines=self.n_lines_page_text)
+            else await self._playwright_controller.get_visible_text(self._page)
         )
 
         # Return the complete observation
@@ -752,7 +750,7 @@ async def _execute_tool(
         if self.use_ocr:
             message_content += f"Automatic OCR of the page screenshot has detected the following text:\n\n{ocr_text}"
         else:
-            message_content += f"The first {self.n_lines_page_text} lines of the page text is:\n\n{ocr_text}"
+            message_content += f"The following text is visible in the viewport:\n\n{ocr_text}"
 
         return [
             message_content,
diff --git a/python/packages/autogen-ext/src/autogen_ext/agents/web_surfer/page_script.js b/python/packages/autogen-ext/src/autogen_ext/agents/web_surfer/page_script.js
index 95b32d5b9902..1363e83dbd70 100644
--- a/python/packages/autogen-ext/src/autogen_ext/agents/web_surfer/page_script.js
+++ b/python/packages/autogen-ext/src/autogen_ext/agents/web_surfer/page_script.js
@@ -367,10 +367,70 @@ var MultimodalWebSurfer = MultimodalWebSurfer || (function() {
         return results;
     };
 
+
+    let getVisibleText = function() {
+        // A document without a <body> (e.g. a standalone SVG or XML page) has no text to walk,
+        // and createTreeWalker would throw on a null root
+        if (!document.body) {
+            return "";
+        }
+
+        // Get the window's current viewport boundaries
+        const viewportHeight = window.innerHeight || document.documentElement.clientHeight;
+        const viewportWidth = window.innerWidth || document.documentElement.clientWidth;
+
+        let textInView = "";
+        const walker = document.createTreeWalker(document.body, NodeFilter.SHOW_TEXT);
+
+        while (walker.nextNode()) {
+            const textNode = walker.currentNode;
+            // Create a range to retrieve bounding rectangles of the current text node
+            const range = document.createRange();
+            range.selectNodeContents(textNode);
+
+            const rects = range.getClientRects();
+
+            // Check if any rect is inside (or partially inside) the viewport
+            for (const rect of rects) {
+                const isInViewport =
+                    rect.width > 0 &&
+                    rect.height > 0 &&
+                    rect.bottom >= 0 &&
+                    rect.right >= 0 &&
+                    rect.top <= viewportHeight &&
+                    rect.left <= viewportWidth;
+
+                if (isInViewport) {
+                    const parent = textNode.parentElement;
+                    const style = parent ? window.getComputedStyle(parent) : null;
+
+                    // Text under visibility: hidden/collapse is still laid out (it has client
+                    // rects) but is never painted, so it must not be reported as visible
+                    const isHidden =
+                        style !== null && (style.visibility === "hidden" || style.visibility === "collapse");
+
+                    if (!isHidden) {
+                        textInView += textNode.nodeValue.replace(/\s+/g, " ");
+                        // Is the parent a block element? If so, end the line after its text
+                        if (style !== null && ["inline", "none"].indexOf(style.display) === -1) {
+                            textInView += "\n";
+                        }
+                    }
+                    break; // No need to check other rects once found in the viewport
+                }
+            }
+        }
+
+        // Remove blank lines from textInView
+        textInView = textInView.replace(/^\s*\n/gm, "").trim().replace(/\n+/g, "\n");
+        return textInView;
+    };
+
     return {
         getInteractiveRects: getInteractiveRects,
         getVisualViewport: getVisualViewport,
         getFocusedElementId: getFocusedElementId,
         getPageMetadata: getPageMetadata,
+        getVisibleText: getVisibleText,
     };
 })();
diff --git a/python/packages/autogen-ext/src/autogen_ext/agents/web_surfer/playwright_controller.py b/python/packages/autogen-ext/src/autogen_ext/agents/web_surfer/playwright_controller.py
index d691826a45ee..412bc07dba74 100644
--- a/python/packages/autogen-ext/src/autogen_ext/agents/web_surfer/playwright_controller.py
+++ b/python/packages/autogen-ext/src/autogen_ext/agents/web_surfer/playwright_controller.py
@@ -527,6 +527,26 @@ async def get_webpage_text(self, page: Page, n_lines: int = 50) -> str:
         except Exception:
             return ""
 
+    async def get_visible_text(self, page: Page) -> str:
+        """
+        Retrieve the text that is currently visible in the browser viewport (approximately).
+
+        Args:
+            page (Page): The Playwright page object.
+
+        Returns:
+            str: The visible text, newline-separated, with blank lines removed.
+        """
+        assert page is not None
+        try:
+            # Best effort: make sure the MultimodalWebSurfer helpers are defined in the page
+            await page.evaluate(self._page_script)
+        except Exception:
+            pass
+        result = await page.evaluate("MultimodalWebSurfer.getVisibleText();")
+        assert isinstance(result, str)
+        return result
+
     async def get_page_markdown(self, page: Page) -> str:
         """
         Retrieve the markdown content of the web page.