From 74ee05757d312b9ea91449e15e79223be201efca Mon Sep 17 00:00:00 2001 From: Mahmudul Alam Date: Mon, 30 Mar 2026 06:13:08 +0600 Subject: [PATCH 1/7] feat: add Windows UI automation inspection and background DOM monitoring endpoints --- server/main.py | 3 + server/windows.py | 263 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 266 insertions(+) create mode 100644 server/windows.py diff --git a/server/main.py b/server/main.py index 3e295b8d..d0ccb2cf 100644 --- a/server/main.py +++ b/server/main.py @@ -11,6 +11,7 @@ from server.mobile import router as mobile_router, start_ui_dump_uploads from server.mac import router as mac_router from server.linux import router as linux_router +from server.windows import router as windows_router, upload_windows_ui_dump from server.installers import router as installers_router import asyncio @@ -43,12 +44,14 @@ def main() -> FastAPI: v1router.include_router(mobile_router) v1router.include_router(mac_router) v1router.include_router(linux_router) + v1router.include_router(windows_router) v1router.include_router(installers_router) app = FastAPI() @app.on_event("startup") async def _start_background_uploads(): start_ui_dump_uploads() + asyncio.create_task(upload_windows_ui_dump()) app.include_router(v1router) origins = [ diff --git a/server/windows.py b/server/windows.py new file mode 100644 index 00000000..37b3dfc3 --- /dev/null +++ b/server/windows.py @@ -0,0 +1,263 @@ +import hashlib +import os +import sys +import asyncio +import requests +import time +from typing import Literal +from fastapi import APIRouter +from pydantic import BaseModel + +from Framework.Utilities import ConfigModule, CommonUtil + + +router = APIRouter(prefix="/windows", tags=["windows"]) + +_TARGET_APP_NAME: str | None = None +_TARGET_APP_SET_TIME: float = 0.0 + + +class InspectorResponse(BaseModel): + """Response model for the /inspect endpoint.""" + status: Literal["ok", "error"] = "ok" + ui_xml: str | None = None + error: str | None = None + + +class AppInfo(BaseModel): + """Model for an active application/window.""" + name: str + pid: int + class_name: str + automation_id: str + + +def _xml_escape(value: str) -> str: + """Escape special characters for XML attributes.""" + return ( + value + .replace("&", "&") + .replace("<", "<") + .replace(">", ">") + .replace('"', """) + ) + + +_automation_loaded = False + + +def _get_automation_imports(): + """Lazily import UIAutomation types (only available on Windows with pythonnet). + + Mirrors the clr setup from Framework/Built_In_Automation/Desktop/Windows/BuiltInFunctions.py. + """ + global _automation_loaded + if not _automation_loaded: + import clr + dll_path = os.getcwd().split("Framework")[0] + "Framework" + os.sep + "windows_dll_files" + os.sep + clr.AddReference(dll_path + "UIAutomationClient") + clr.AddReference(dll_path + "UIAutomationTypes") + clr.AddReference(dll_path + "UIAutomationProvider") + _automation_loaded = True + + from System.Windows.Automation import ( + AutomationElement, + TreeScope, + Condition, + TreeWalker, + ) + return AutomationElement, TreeScope, Condition, TreeWalker + + +def _dump_element_to_xml(element, indent_level: int = 0, max_depth: int = 30) -> list[str]: + """Recursively dump a UIAutomation element tree to XML strings.""" + if indent_level > max_depth: + return [] + + lines: list[str] = [] + indent = " " * indent_level + + try: + current = element.Current + control_type = current.LocalizedControlType or "unknown" + # Sanitize the tag name: replace spaces with underscores + tag = control_type.replace(" ", "_") + name = _xml_escape(current.Name or "") + class_name = _xml_escape(current.ClassName or "") + automation_id = _xml_escape(current.AutomationId or "") + + attrs = f'name="{name}"' + if class_name: + attrs += f' class="{class_name}"' + if automation_id: + attrs += f' automation_id="{automation_id}"' + + # Add bounding rectangle if available + try: + rect = current.BoundingRectangle + if rect.Width > 0 or rect.Height > 0: + attrs += f' x="{int(rect.Left)}" y="{int(rect.Top)}"' + attrs += f' width="{int(rect.Width)}" height="{int(rect.Height)}"' + except Exception: + pass + + # Get children + _, TreeScope, Condition, _ = _get_automation_imports() + children = element.FindAll(TreeScope.Children, Condition.TrueCondition) + + if children.Count > 0: + lines.append(f'{indent}<{tag} {attrs}>') + for i in range(children.Count): + child = children[i] + lines.extend(_dump_element_to_xml(child, indent_level + 1, max_depth)) + lines.append(f'{indent}') + else: + lines.append(f'{indent}<{tag} {attrs}/>') + + except Exception: + pass + + return lines + + +def _find_window_by_name(app_name: str): + """Find the top-level window element matching app_name (case-insensitive substring match).""" + AutomationElement, TreeScope, Condition, _ = _get_automation_imports() + root = AutomationElement.RootElement + windows = root.FindAll(TreeScope.Children, Condition.TrueCondition) + + app_lower = app_name.lower() + for i in range(windows.Count): + try: + win = windows[i] + win_name = win.Current.Name or "" + if app_lower in win_name.lower(): + return win + except Exception: + continue + return None + + +def _get_ui_tree_xml(app_name: str) -> str | None: + """Get the full UI tree of a window as XML.""" + window = _find_window_by_name(app_name) + if window is None: + return None + + xml_lines = [''] + xml_lines.extend(_dump_element_to_xml(window, indent_level=0)) + return "\n".join(xml_lines) + + +def _get_active_apps() -> list[AppInfo]: + """Return all top-level windows (active apps) from the UIAutomation tree.""" + AutomationElement, TreeScope, Condition, _ = _get_automation_imports() + root = AutomationElement.RootElement + windows = root.FindAll(TreeScope.Children, Condition.TrueCondition) + + apps: list[AppInfo] = [] + for i in range(windows.Count): + try: + win = windows[i] + name = win.Current.Name or "" + # Skip empty-named windows (usually invisible system windows) + if not name.strip(): + continue + apps.append(AppInfo( + name=name, + pid=win.Current.ProcessId, + class_name=win.Current.ClassName or "", + automation_id=win.Current.AutomationId or "", + )) + except Exception: + continue + return apps + + +@router.get("/inspect") +def inspect(app_name: str): + """Get the Windows UI DOM (XML tree) for a given application. + + Args: + app_name: Name (or substring) of the target application window. Required. + """ + global _TARGET_APP_NAME, _TARGET_APP_SET_TIME + _TARGET_APP_NAME = app_name + _TARGET_APP_SET_TIME = time.time() + + if sys.platform != "win32": + return InspectorResponse(status="error", error="This endpoint is only available on Windows") + + try: + xml_content = _get_ui_tree_xml(app_name) + if not xml_content: + return InspectorResponse( + status="error", + error=f"No window found matching '{app_name}'. Use /apps to list active windows.", + ) + return InspectorResponse(status="ok", ui_xml=xml_content) + except Exception as e: + return InspectorResponse(status="error", error=str(e)) + + +@router.get("/apps", response_model=list[AppInfo]) +def get_apps(): + """Return all opened/active application windows.""" + if sys.platform != "win32": + return [] + + try: + return _get_active_apps() + except Exception: + return [] + + +async def upload_windows_ui_dump(): + """Continuously upload Windows UI dump if changed. + + Only runs on Windows. Uploads to the server with key 'dom_windows'. + """ + global _TARGET_APP_NAME, _TARGET_APP_SET_TIME + + if sys.platform != "win32": + return + + prev_xml_hash = "" + while True: + try: + if _TARGET_APP_NAME and (time.time() - _TARGET_APP_SET_TIME) > 8 * 3600: + _TARGET_APP_NAME = None + + target_app = _TARGET_APP_NAME + + if target_app: + xml_content = await asyncio.to_thread(_get_ui_tree_xml, target_app) + if xml_content: + new_xml_hash = hashlib.sha256(xml_content.encode("utf-8")).hexdigest() + + if prev_xml_hash != new_xml_hash: + prev_xml_hash = new_xml_hash + + url = ( + ConfigModule.get_config_value("Authentication", "server_address").strip() + + "/node_ai_contents/" + ) + apiKey = ConfigModule.get_config_value("Authentication", "api-key").strip() + + res = await asyncio.to_thread( + requests.post, + url, + headers={"X-Api-Key": apiKey}, + json={ + "dom_win": {"dom": xml_content}, + "node_id": CommonUtil.MachineInfo().getLocalUser().lower(), + "app_name": target_app, + }, + timeout=10, + ) + if res.ok: + CommonUtil.ExecLog("", "Windows UI dump uploaded successfully", iLogLevel=1) + except Exception as e: + CommonUtil.ExecLog("", f"Error uploading Windows UI dump: {str(e)}", iLogLevel=3) + + await asyncio.sleep(5) From e41e0845ab39484ca3fed1631d8fb25350154a12 Mon Sep 17 00:00:00 2001 From: Mahmudul Alam Date: Tue, 31 Mar 2026 09:11:50 +0600 Subject: [PATCH 2/7] changed from * import to Specific import --- .../Desktop/Windows/BuiltInFunctions.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/Framework/Built_In_Automation/Desktop/Windows/BuiltInFunctions.py b/Framework/Built_In_Automation/Desktop/Windows/BuiltInFunctions.py index 6bf3e4a1..fe27c073 100644 --- a/Framework/Built_In_Automation/Desktop/Windows/BuiltInFunctions.py +++ b/Framework/Built_In_Automation/Desktop/Windows/BuiltInFunctions.py @@ -66,12 +66,22 @@ # this needs to be here on top, otherwise will return error import clr, System dll_path = os.getcwd().split("Framework")[0] + "Framework" + os.sep + "windows_dll_files" + os.sep -clr.AddReference(dll_path+"UIAutomationClient") -clr.AddReference(dll_path+"UIAutomationTypes") -clr.AddReference(dll_path+"UIAutomationProvider") +clr.AddReference(dll_path + "UIAutomationClient") +clr.AddReference(dll_path + "UIAutomationTypes") +clr.AddReference(dll_path + "UIAutomationProvider") clr.AddReference("System.Windows.Forms") -from System.Windows.Automation import * +from System.Windows.Automation import ( + AutomationElement, + TreeScope, + Condition, + Automation, + InvokePattern, + ValuePattern, + TogglePattern, + SelectionItemPattern, + ExpandCollapsePattern, +) import pyautogui # Should be removed after we complete sequential actions import autoit # The likely method we'll use From aa83c44dd46084cb3a8748a54ee39324b80bd810 Mon Sep 17 00:00:00 2001 From: Mahmudul Alam Date: Wed, 1 Apr 2026 12:00:08 +0600 Subject: [PATCH 3/7] improved windows ai chatbot inspection --- .../Desktop/Windows/BuiltInFunctions.py | 2 +- server/windows.py | 26 +++++++++++++++---- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/Framework/Built_In_Automation/Desktop/Windows/BuiltInFunctions.py b/Framework/Built_In_Automation/Desktop/Windows/BuiltInFunctions.py index fe27c073..bfbbcfcb 100644 --- a/Framework/Built_In_Automation/Desktop/Windows/BuiltInFunctions.py +++ b/Framework/Built_In_Automation/Desktop/Windows/BuiltInFunctions.py @@ -319,7 +319,7 @@ def Check_uncheck(data_set): if command == "check" and is_selected == "On": CommonUtil.ExecLog(sModuleInfo, "The element is already checked so skipped it", 1) return "passed" - elif command == "uncheck" and not is_selected: + elif command == "uncheck" and is_selected == "Off": CommonUtil.ExecLog(sModuleInfo, "The element is already unchecked so skipped it", 1) return "passed" try: diff --git a/server/windows.py b/server/windows.py index 37b3dfc3..3008f195 100644 --- a/server/windows.py +++ b/server/windows.py @@ -15,6 +15,7 @@ _TARGET_APP_NAME: str | None = None _TARGET_APP_SET_TIME: float = 0.0 +_active_ui_requests: dict[str, asyncio.Task] = {} class InspectorResponse(BaseModel): @@ -149,6 +150,19 @@ def _get_ui_tree_xml(app_name: str) -> str | None: return "\n".join(xml_lines) +async def _get_ui_tree_xml_async(app_name: str) -> str | None: + """Run _get_ui_tree_xml async and avoid concurrent duplicate requests for the same app.""" + if app_name in _active_ui_requests: + return await _active_ui_requests[app_name] + + task = asyncio.create_task(asyncio.to_thread(_get_ui_tree_xml, app_name)) + _active_ui_requests[app_name] = task + try: + return await task + finally: + _active_ui_requests.pop(app_name, None) + + def _get_active_apps() -> list[AppInfo]: """Return all top-level windows (active apps) from the UIAutomation tree.""" AutomationElement, TreeScope, Condition, _ = _get_automation_imports() @@ -175,7 +189,7 @@ def _get_active_apps() -> list[AppInfo]: @router.get("/inspect") -def inspect(app_name: str): +async def inspect(app_name: str): """Get the Windows UI DOM (XML tree) for a given application. Args: @@ -189,7 +203,7 @@ def inspect(app_name: str): return InspectorResponse(status="error", error="This endpoint is only available on Windows") try: - xml_content = _get_ui_tree_xml(app_name) + xml_content = await _get_ui_tree_xml_async(app_name) if not xml_content: return InspectorResponse( status="error", @@ -200,14 +214,15 @@ def inspect(app_name: str): return InspectorResponse(status="error", error=str(e)) + @router.get("/apps", response_model=list[AppInfo]) -def get_apps(): +async def get_apps(): """Return all opened/active application windows.""" if sys.platform != "win32": return [] try: - return _get_active_apps() + return await asyncio.to_thread(_get_active_apps) except Exception: return [] @@ -231,10 +246,11 @@ async def upload_windows_ui_dump(): target_app = _TARGET_APP_NAME if target_app: - xml_content = await asyncio.to_thread(_get_ui_tree_xml, target_app) + xml_content = await _get_ui_tree_xml_async(target_app) if xml_content: new_xml_hash = hashlib.sha256(xml_content.encode("utf-8")).hexdigest() + if prev_xml_hash != new_xml_hash: prev_xml_hash = new_xml_hash From b0f08bec1565599491d1c466750f789c9fdc54d1 Mon Sep 17 00:00:00 2001 From: Mahmudul Alam Date: Thu, 2 Apr 2026 09:22:29 +0600 Subject: [PATCH 4/7] fix special character app not supporting --- .../Built_In_Automation/Desktop/Windows/BuiltInFunctions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Framework/Built_In_Automation/Desktop/Windows/BuiltInFunctions.py b/Framework/Built_In_Automation/Desktop/Windows/BuiltInFunctions.py index bfbbcfcb..274c857e 100644 --- a/Framework/Built_In_Automation/Desktop/Windows/BuiltInFunctions.py +++ b/Framework/Built_In_Automation/Desktop/Windows/BuiltInFunctions.py @@ -2162,7 +2162,7 @@ def Run_Application(data_set): #last_start_time = time.time() autoit.send("^{ESC}") time.sleep(keypress_interval) - autoit.send(Desktop_app) + autoit.send(Desktop_app, 1) time.sleep(keypress_interval) autoit.send("{ENTER}") CommonUtil.ExecLog(sModuleInfo, "Successfully launched your app", 1) From 3185ace1bdc8a1b3d0c1135c93f0ce9fa6129263 Mon Sep 17 00:00:00 2001 From: Mahmudul Alam Date: Thu, 2 Apr 2026 10:02:01 +0600 Subject: [PATCH 5/7] Automatically fallback to gui if normal click failed --- .../Desktop/Windows/BuiltInFunctions.py | 119 ++++++++++-------- 1 file changed, 65 insertions(+), 54 deletions(-) diff --git a/Framework/Built_In_Automation/Desktop/Windows/BuiltInFunctions.py b/Framework/Built_In_Automation/Desktop/Windows/BuiltInFunctions.py index 274c857e..80ec9bf3 100644 --- a/Framework/Built_In_Automation/Desktop/Windows/BuiltInFunctions.py +++ b/Framework/Built_In_Automation/Desktop/Windows/BuiltInFunctions.py @@ -219,65 +219,62 @@ def Click_Element_None_Mouse(Element, Expand=True, Gui=False, offset: str | None pattern_name = Automation.PatternName(each) CommonUtil.ExecLog(sModuleInfo, "Pattern name attached to the current element is: %s " % pattern_name, 1) - # Expand and collapse actions - if pattern_name == "ExpandCollapse": - if Expand: - # check to see if its expanded, if expanded, then do nothing... if not, expand it - status = Element.GetCurrentPattern( - ExpandCollapsePattern.Pattern - ).Current.ExpandCollapseState - if status == 0: - CommonUtil.ExecLog(sModuleInfo, "Expanding the item", 1) - Element.GetCurrentPattern( + try: + # Expand and collapse actions + if pattern_name == "ExpandCollapse": + if Expand: + # check to see if its expanded, if expanded, then do nothing... if not, expand it + status = Element.GetCurrentPattern( ExpandCollapsePattern.Pattern - ).Expand() - return "passed" - elif status == 1: - CommonUtil.ExecLog(sModuleInfo, "Already expanded", 1) - return "passed" - else: - # check to see if its Collapsed, if Collapsed, then do nothing... if not, Collapse it - status = Element.GetCurrentPattern( - ExpandCollapsePattern.Pattern - ).Current.ExpandCollapseState - if status == 1: - CommonUtil.ExecLog(sModuleInfo, "Collapsing the item", 1) - Element.GetCurrentPattern( + ).Current.ExpandCollapseState + if status == 0: + CommonUtil.ExecLog(sModuleInfo, "Expanding the item", 1) + Element.GetCurrentPattern( + ExpandCollapsePattern.Pattern + ).Expand() + return "passed" + elif status == 1: + CommonUtil.ExecLog(sModuleInfo, "Already expanded", 1) + return "passed" + else: + # check to see if its Collapsed, if Collapsed, then do nothing... if not, Collapse it + status = Element.GetCurrentPattern( ExpandCollapsePattern.Pattern - ).Collapse() - return "passed" - elif status == 0: - CommonUtil.ExecLog(sModuleInfo, "Already collapsed", 1) - return "passed" - # Invoking actions - elif pattern_name == "Invoke": - CommonUtil.ExecLog(sModuleInfo, "Invoking the object", 1) - time.sleep(unnecessary_sleep) - Element.GetCurrentPattern(InvokePattern.Pattern).Invoke() - return "passed" - # Selection of an item - elif pattern_name == "SelectionItem": - CommonUtil.ExecLog(sModuleInfo, "Selecting an item", 1) - Element.GetCurrentPattern(SelectionItemPattern.Pattern).Select() - time.sleep(unnecessary_sleep) - return "passed" - # Toggling action - - elif pattern_name == "Toggle": - CommonUtil.ExecLog(sModuleInfo, "Toggling an item", 1) - Element.GetCurrentPattern(TogglePattern.Pattern).Toggle() - time.sleep(unnecessary_sleep) - return "passed" - # if no patterns are found, then we do an actual mouse click - else: - # x = int (Element.Current.BoundingRectangle.X) - # y = int (Element.Current.BoundingRectangle.Y) + ).Current.ExpandCollapseState + if status == 1: + CommonUtil.ExecLog(sModuleInfo, "Collapsing the item", 1) + Element.GetCurrentPattern( + ExpandCollapsePattern.Pattern + ).Collapse() + return "passed" + elif status == 0: + CommonUtil.ExecLog(sModuleInfo, "Already collapsed", 1) + return "passed" + # Invoking actions + elif pattern_name == "Invoke": + CommonUtil.ExecLog(sModuleInfo, "Invoking the object", 1) + time.sleep(unnecessary_sleep) + Element.GetCurrentPattern(InvokePattern.Pattern).Invoke() + return "passed" + # Selection of an item + elif pattern_name == "SelectionItem": + CommonUtil.ExecLog(sModuleInfo, "Selecting an item", 1) + Element.GetCurrentPattern(SelectionItemPattern.Pattern).Select() + time.sleep(unnecessary_sleep) + return "passed" + # Toggling action + elif pattern_name == "Toggle": + CommonUtil.ExecLog(sModuleInfo, "Toggling an item", 1) + Element.GetCurrentPattern(TogglePattern.Pattern).Toggle() + time.sleep(unnecessary_sleep) + return "passed" + except Exception as e: CommonUtil.ExecLog( sModuleInfo, - "We did not find any pattern for this object, so we will click by mouse with location", - 1, + f"Normal click ({pattern_name}) failed or did nothing ({e}). Automatically using GUI click.", + 2, ) - x, y = get_coords(Element) + x, y = get_coords(Element, offset) win32api.SetCursorPos((x, y)) win32api.mouse_event(win32con.MOUSEEVENTF_LEFTDOWN, x, y, 0, 0) time.sleep(0.1) @@ -285,6 +282,20 @@ def Click_Element_None_Mouse(Element, Expand=True, Gui=False, offset: str | None time.sleep(unnecessary_sleep) return "passed" + # if no patterns matched the standard ones, then we do an actual mouse click as fallback + CommonUtil.ExecLog( + sModuleInfo, + "We did not find any suitable pattern for this object, so we will click by mouse with location", + 1, + ) + x, y = get_coords(Element, offset) + win32api.SetCursorPos((x, y)) + win32api.mouse_event(win32con.MOUSEEVENTF_LEFTDOWN, x, y, 0, 0) + time.sleep(0.1) + win32api.mouse_event(win32con.MOUSEEVENTF_LEFTUP, x, y, 0, 0) + time.sleep(unnecessary_sleep) + return "passed" + CommonUtil.ExecLog(sModuleInfo, "Unable to perform the action on the object", 3) return "zeuz_failed" except Exception: From 178cb17a639b5d6565959859a01c254fbe6155f3 Mon Sep 17 00:00:00 2001 From: Mahmudul Alam Date: Tue, 7 Apr 2026 04:55:55 +0600 Subject: [PATCH 6/7] feat: enhance UI tree XML generation with ElementTree and improved structure --- server/windows.py | 150 +++++++++++++++++++++++++++------------------- 1 file changed, 89 insertions(+), 61 deletions(-) diff --git a/server/windows.py b/server/windows.py index 3008f195..1f3dacc9 100644 --- a/server/windows.py +++ b/server/windows.py @@ -4,6 +4,7 @@ import asyncio import requests import time +import xml.etree.ElementTree as ET from typing import Literal from fastapi import APIRouter from pydantic import BaseModel @@ -15,7 +16,6 @@ _TARGET_APP_NAME: str | None = None _TARGET_APP_SET_TIME: float = 0.0 -_active_ui_requests: dict[str, asyncio.Task] = {} class InspectorResponse(BaseModel): @@ -70,55 +70,59 @@ def _get_automation_imports(): return AutomationElement, TreeScope, Condition, TreeWalker -def _dump_element_to_xml(element, indent_level: int = 0, max_depth: int = 30) -> list[str]: - """Recursively dump a UIAutomation element tree to XML strings.""" - if indent_level > max_depth: - return [] +def _build_element_tree(xml_parent, ui_element, max_depth: int = 50, _depth: int = 0): + """Recursively build an ET tree from a UIAutomation element. - lines: list[str] = [] - indent = " " * indent_level + Mirrors create_tree() from ZeuZ_Windows_Inspector.py:
tags with + Name, AutomationId, LocalizedControlType, ClassName, Left, Right, Top, Bottom. + """ + if _depth > max_depth: + return + _, TreeScope, Condition, _ = _get_automation_imports() try: - current = element.Current - control_type = current.LocalizedControlType or "unknown" - # Sanitize the tag name: replace spaces with underscores - tag = control_type.replace(" ", "_") - name = _xml_escape(current.Name or "") - class_name = _xml_escape(current.ClassName or "") - automation_id = _xml_escape(current.AutomationId or "") - - attrs = f'name="{name}"' - if class_name: - attrs += f' class="{class_name}"' - if automation_id: - attrs += f' automation_id="{automation_id}"' - - # Add bounding rectangle if available + child_elements = ui_element.FindAll(TreeScope.Children, Condition.TrueCondition) + except Exception: + return + + for i in range(child_elements.Count): + each_child = child_elements[i] try: - rect = current.BoundingRectangle - if rect.Width > 0 or rect.Height > 0: - attrs += f' x="{int(rect.Left)}" y="{int(rect.Top)}"' - attrs += f' width="{int(rect.Width)}" height="{int(rect.Height)}"' + elem_name = _xml_escape(each_child.Current.Name or "") + elem_automationid = _xml_escape(each_child.Current.AutomationId or "") + elem_control = _xml_escape(each_child.Current.LocalizedControlType or "") + elem_class = _xml_escape(each_child.Current.ClassName or "") + try: + left = str(each_child.Current.BoundingRectangle.Left) + right = str(each_child.Current.BoundingRectangle.Right) + top = str(each_child.Current.BoundingRectangle.Top) + bottom = str(each_child.Current.BoundingRectangle.Bottom) + except Exception: + left, right, top, bottom = "", "", "", "" + + attribs = { + "Name": elem_name, + "AutomationId": elem_automationid, + "LocalizedControlType": elem_control, + "ClassName": elem_class, + "Left": left, + "Right": right, + "Top": top, + "Bottom": bottom, + } + xml_child = ET.SubElement(xml_parent, "div", **attribs) + _build_element_tree(xml_child, each_child, max_depth, _depth + 1) except Exception: - pass - - # Get children - _, TreeScope, Condition, _ = _get_automation_imports() - children = element.FindAll(TreeScope.Children, Condition.TrueCondition) - - if children.Count > 0: - lines.append(f'{indent}<{tag} {attrs}>') - for i in range(children.Count): - child = children[i] - lines.extend(_dump_element_to_xml(child, indent_level + 1, max_depth)) - lines.append(f'{indent}') - else: - lines.append(f'{indent}<{tag} {attrs}/>') + continue - except Exception: - pass - return lines +def _remove_coordinates(root): + """Remove Left/Right/Top/Bottom attributes from all elements. Matches inspector's Remove_coordinate().""" + for each in root: + att = each.attrib + for key in ("Left", "Right", "Top", "Bottom"): + att.pop(key, None) + _remove_coordinates(each) def _find_window_by_name(app_name: str): @@ -139,28 +143,53 @@ def _find_window_by_name(app_name: str): return None -def _get_ui_tree_xml(app_name: str) -> str | None: - """Get the full UI tree of a window as XML.""" +def _get_ui_tree(app_name: str) -> ET.Element | None: + """Build the UI tree as an ET Element matching ZeuZ_Windows_Inspector format. + + Returns a root with Name, AutomationId, LocalizedControlType, ClassName, pid. + Children are
elements with the same attributes plus Left/Right/Top/Bottom. + """ window = _find_window_by_name(app_name) if window is None: return None - xml_lines = [''] - xml_lines.extend(_dump_element_to_xml(window, indent_level=0)) - return "\n".join(xml_lines) + current = window.Current + attribs = { + "Name": _xml_escape(current.Name or ""), + "AutomationId": _xml_escape(current.AutomationId or ""), + "LocalizedControlType": _xml_escape(current.LocalizedControlType or ""), + "ClassName": _xml_escape(current.ClassName or ""), + "pid": str(current.ProcessId) if hasattr(current, "ProcessId") else "", + } + root = ET.Element("body", **attribs) + _build_element_tree(root, window) + return root + + +def _get_ui_tree_xml(app_name: str) -> str | None: + """Get the full UI tree of a window as XML string (with coordinates, for /inspect).""" + root = _get_ui_tree(app_name) + if root is None: + return None + try: + ET.indent(root) + except AttributeError: + pass + return ET.tostring(root, encoding="unicode") -async def _get_ui_tree_xml_async(app_name: str) -> str | None: - """Run _get_ui_tree_xml async and avoid concurrent duplicate requests for the same app.""" - if app_name in _active_ui_requests: - return await _active_ui_requests[app_name] - - task = asyncio.create_task(asyncio.to_thread(_get_ui_tree_xml, app_name)) - _active_ui_requests[app_name] = task +def _get_ui_tree_xml_for_upload(app_name: str) -> str | None: + """Get the UI tree XML for upload (without coordinates, matching inspector's uploaded version).""" + root = _get_ui_tree(app_name) + if root is None: + return None + _remove_coordinates(root) try: - return await task - finally: - _active_ui_requests.pop(app_name, None) + ET.indent(root, "") + except AttributeError: + pass + return ET.tostring(root, encoding="unicode") + def _get_active_apps() -> list[AppInfo]: @@ -203,7 +232,7 @@ async def inspect(app_name: str): return InspectorResponse(status="error", error="This endpoint is only available on Windows") try: - xml_content = await _get_ui_tree_xml_async(app_name) + xml_content = await asyncio.to_thread(_get_ui_tree_xml, app_name) if not xml_content: return InspectorResponse( status="error", @@ -246,11 +275,10 @@ async def upload_windows_ui_dump(): target_app = _TARGET_APP_NAME if target_app: - xml_content = await _get_ui_tree_xml_async(target_app) + xml_content = await asyncio.to_thread(_get_ui_tree_xml_for_upload, target_app) if xml_content: new_xml_hash = hashlib.sha256(xml_content.encode("utf-8")).hexdigest() - if prev_xml_hash != new_xml_hash: prev_xml_hash = new_xml_hash From f317f728ef878c479b7a0afb74e91db5d87afbd5 Mon Sep 17 00:00:00 2001 From: Mahmudul Alam Date: Wed, 8 Apr 2026 07:44:04 +0600 Subject: [PATCH 7/7] feat: add hotkey functionality to capture UI tree on demand --- server/windows.py | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/server/windows.py b/server/windows.py index 1f3dacc9..a9e63cd4 100644 --- a/server/windows.py +++ b/server/windows.py @@ -17,6 +17,8 @@ _TARGET_APP_NAME: str | None = None _TARGET_APP_SET_TIME: float = 0.0 +_HOTKEY = "ctrl+shift+i" + class InspectorResponse(BaseModel): """Response model for the /inspect endpoint.""" @@ -192,6 +194,16 @@ def _get_ui_tree_xml_for_upload(app_name: str) -> str | None: +def _wait_hotkey_and_capture(app_name: str) -> str | None: + """Block until user presses the hotkey, then immediately capture the UI tree. + + This runs in a thread so the menu stays open (no focus change). + """ + import keyboard + keyboard.wait(_HOTKEY) + return _get_ui_tree_xml(app_name) + + def _get_active_apps() -> list[AppInfo]: """Return all top-level windows (active apps) from the UIAutomation tree.""" AutomationElement, TreeScope, Condition, _ = _get_automation_imports() @@ -244,6 +256,31 @@ async def inspect(app_name: str): +@router.get("/snapshot") +async def snapshot(app_name: str): + """Wait for hotkey press, then capture and return the UI tree. + + The request blocks until the user presses the hotkey (Ctrl+Shift+I). + This allows capturing menus/popups that disappear on focus change. + + Args: + app_name: Name (or substring) of the target application window. + """ + if sys.platform != "win32": + return InspectorResponse(status="error", error="This endpoint is only available on Windows") + + try: + xml_content = await asyncio.to_thread(_wait_hotkey_and_capture, app_name) + if not xml_content: + return InspectorResponse( + status="error", + error=f"No window found matching '{app_name}'. Use /apps to list active windows.", + ) + return InspectorResponse(status="ok", ui_xml=xml_content) + except Exception as e: + return InspectorResponse(status="error", error=str(e)) + + @router.get("/apps", response_model=list[AppInfo]) async def get_apps(): """Return all opened/active application windows."""