Skip to content

Commit

Permalink
WebSurfer: print viewport text (#5329)
Browse files Browse the repository at this point in the history
This PR adds a method that approximately extracts the text visible in
the viewport of the web browser (as opposed to always printing the first
50 lines, or relying entirely on OCR).
  • Loading branch information
afourney authored Feb 3, 2025
1 parent 227b875 commit 877796d
Show file tree
Hide file tree
Showing 3 changed files with 74 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -264,8 +264,6 @@ def _download_handler(download: Download) -> None:
TOOL_SLEEP,
TOOL_HOVER,
]
# Number of lines of text to extract from the page in the absence of OCR
self.n_lines_page_text = 50
self.did_lazy_init = False # flag to check if we have initialized the browser

async def _lazy_init(
Expand Down Expand Up @@ -743,7 +741,7 @@ async def _execute_tool(
ocr_text = (
await self._get_ocr_text(new_screenshot, cancellation_token=cancellation_token)
if self.use_ocr is True
else await self._playwright_controller.get_webpage_text(self._page, n_lines=self.n_lines_page_text)
else await self._playwright_controller.get_visible_text(self._page)
)

# Return the complete observation
Expand All @@ -752,7 +750,7 @@ async def _execute_tool(
if self.use_ocr:
message_content += f"Automatic OCR of the page screenshot has detected the following text:\n\n{ocr_text}"
else:
message_content += f"The first {self.n_lines_page_text} lines of the page text is:\n\n{ocr_text}"
message_content += f"The following text is visible in the viewport:\n\n{ocr_text}"

return [
message_content,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -367,10 +367,63 @@ var MultimodalWebSurfer = MultimodalWebSurfer || (function() {
return results;
};


let getVisibleText = function() {
// Get the window’s current viewport boundaries
const viewportHeight = window.innerHeight || document.documentElement.clientHeight;
const viewportWidth = window.innerWidth || document.documentElement.clientWidth;

let textInView = "";
const walker = document.createTreeWalker(
document.body,
NodeFilter.SHOW_TEXT,
null,
false
);

while (walker.nextNode()) {
const textNode = walker.currentNode;
// Create a range to retrieve bounding rectangles of the current text node
const range = document.createRange();
range.selectNodeContents(textNode);

const rects = range.getClientRects();

// Check if any rect is inside (or partially inside) the viewport
for (const rect of rects) {
const isVisible =
rect.width > 0 &&
rect.height > 0 &&
rect.bottom >= 0 &&
rect.right >= 0 &&
rect.top <= viewportHeight &&
rect.left <= viewportWidth;

if (isVisible) {
textInView += textNode.nodeValue.replace(/\s+/g, " ");
// Is the parent a block element?
if (textNode.parentNode) {
const parent = textNode.parentNode;
const style = window.getComputedStyle(parent);
if (["inline", "hidden", "none"].indexOf(style.display) === -1) {
textInView += "\n";
}
}
break; // No need to check other rects once found visible
}
}
}

// Remove blank lines from textInView
textInView = textInView.replace(/^\s*\n/gm, "").trim().replace(/\n+/g, "\n");
return textInView;
};

return {
getInteractiveRects: getInteractiveRects,
getVisualViewport: getVisualViewport,
getFocusedElementId: getFocusedElementId,
getPageMetadata: getPageMetadata,
getVisibleText: getVisibleText,
};
})();
Original file line number Diff line number Diff line change
Expand Up @@ -527,6 +527,25 @@ async def get_webpage_text(self, page: Page, n_lines: int = 50) -> str:
except Exception:
return ""

async def get_visible_text(self, page: Page) -> str:
    """
    Return an approximation of the text currently shown in the browser viewport.

    The helper script stored in ``self._page_script`` is evaluated in the page
    first, then ``MultimodalWebSurfer.getVisibleText()`` is called in the page
    and its result returned.

    Args:
        page (Page): The Playwright page object; must not be None.

    Returns:
        str: The string produced by the in-page ``getVisibleText`` call.

    Raises:
        AssertionError: If ``page`` is None or the in-page call does not
            return a string.
    """
    assert page is not None

    # Best-effort injection: any failure here is ignored on purpose
    # (presumably the script may already be loaded -- confirm against callers).
    try:
        await page.evaluate(self._page_script)
    except Exception:
        pass

    visible_text = await page.evaluate("MultimodalWebSurfer.getVisibleText();")
    assert isinstance(visible_text, str)
    return visible_text

async def get_page_markdown(self, page: Page) -> str:
"""
Retrieve the markdown content of the web page.
Expand Down

0 comments on commit 877796d

Please sign in to comment.