From 4aff01e3f38c5be200704865b7279d18d2662b49 Mon Sep 17 00:00:00 2001
From: pk-zipstack <praveen@zipstack.com>
Date: Tue, 7 Apr 2026 11:55:52 +0530
Subject: [PATCH 01/12] [FEAT] Add document_insights mode to LLMWhisperer V2
 adapter with signature metadata in LLM context
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add document_insights as a new processing mode in the LLMWhisperer V2
  adapter (Modes enum + JSON schema dropdown); also add the `table`
  member to the Modes enum, which the JSON schema already listed
- Extract signature_metadata from LLMWhisperer response when using
  document_insights mode and surface it in TextExtractionMetadata
- Thread signature_metadata through the workers pipeline
  (extract → answer_params → construct_prompt)
- Format signature metadata as a human-readable context block injected
  into the LLM prompt's Context section
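- Update prompt-service extraction endpoint to return signature_metadata
  (ExtractionService.perform_extraction now returns a dict with
  extracted_text and signature_metadata instead of a plain str)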
- Mirror construct_prompt changes in prompt-service for parity

The workers execution path (API deployments, workflow runs) is fully
functional. The prompt-service path has its endpoints ready, but
threading signature_metadata through the structure tool is a follow-up.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../src/unstract/prompt_service/constants.py  |  1 +
 .../prompt_service/controllers/extraction.py  |  9 +++-
 .../prompt_service/services/answer_prompt.py  | 41 +++++++++++++++-
 .../prompt_service/services/extraction.py     | 16 +++++-
 .../src/unstract/sdk1/adapters/x2text/dto.py  |  1 +
 .../x2text/llm_whisperer_v2/src/constants.py  |  2 +
 .../llm_whisperer_v2/src/llm_whisperer_v2.py  | 15 ++++++
 .../src/static/json_schema.json               |  3 +-
 workers/executor/executors/answer_prompt.py   | 49 ++++++++++++++++++-
 workers/executor/executors/constants.py       |  1 +
 workers/executor/executors/legacy_executor.py | 18 +++++++
 11 files changed, 149 insertions(+), 7 deletions(-)

diff --git a/prompt-service/src/unstract/prompt_service/constants.py b/prompt-service/src/unstract/prompt_service/constants.py
index 9eddab8423..16bb364ce3 100644
--- a/prompt-service/src/unstract/prompt_service/constants.py
+++ b/prompt-service/src/unstract/prompt_service/constants.py
@@ -84,6 +84,7 @@ class PromptServiceConstants:
     LINE_ITEM = "line-item"
     LINE_NUMBERS = "line_numbers"
     WHISPER_HASH = "whisper_hash"
+    SIGNATURE_METADATA = "signature_metadata"
     PAID_FEATURE_MSG = (
         "It is a cloud / enterprise feature. If you have purchased a plan and still "
         "face this issue, please contact support"
diff --git a/prompt-service/src/unstract/prompt_service/controllers/extraction.py b/prompt-service/src/unstract/prompt_service/controllers/extraction.py
index 516894f429..4a5d47bb91 100644
--- a/prompt-service/src/unstract/prompt_service/controllers/extraction.py
+++ b/prompt-service/src/unstract/prompt_service/controllers/extraction.py
@@ -36,7 +36,7 @@ def extract() -> Any:
     tool_exec_metadata = payload.get(IKeys.TOOL_EXECUTION_METATADA, {})
     execution_run_data_folder = payload.get(IKeys.EXECUTION_DATA_DIR, "")
 
-    extracted_text = ExtractionService.perform_extraction(
+    extraction_result = ExtractionService.perform_extraction(
         file_path=file_path,
         x2text_instance_id=x2text_instance_id,
         output_file_path=output_file_path,
@@ -49,5 +49,10 @@ def extract() -> Any:
         tool_exec_metadata=tool_exec_metadata,
         execution_run_data_folder=execution_run_data_folder,
     )
-    response = {IKeys.EXTRACTED_TEXT: extracted_text}
+    response = {
+        IKeys.EXTRACTED_TEXT: extraction_result["extracted_text"],
+    }
+    signature_metadata = extraction_result.get("signature_metadata")
+    if signature_metadata:
+        response["signature_metadata"] = signature_metadata
     return response
diff --git a/prompt-service/src/unstract/prompt_service/services/answer_prompt.py b/prompt-service/src/unstract/prompt_service/services/answer_prompt.py
index 9f8cbf9c28..9ccf20abe3 100644
--- a/prompt-service/src/unstract/prompt_service/services/answer_prompt.py
+++ b/prompt-service/src/unstract/prompt_service/services/answer_prompt.py
@@ -141,6 +141,7 @@ def construct_and_run_prompt(
             platform_postamble=platform_postamble,
             word_confidence_postamble=word_confidence_postamble,
             prompt_type=prompt_type,
+            signature_metadata=tool_settings.get(PSKeys.SIGNATURE_METADATA),
         )
         output[PSKeys.COMBINED_PROMPT] = prompt
         return AnswerPromptService.run_completion(
@@ -155,6 +156,36 @@ def construct_and_run_prompt(
             execution_source=execution_source,
         )
 
+    @staticmethod
+    def _format_signature_metadata(
+        signature_metadata: dict[str, list[Any]],
+    ) -> str:
+        """Format signature metadata as a human-readable context block."""
+        lines: list[str] = []
+        for page_num, signatures in sorted(
+            signature_metadata.items(), key=lambda x: int(x[0])
+        ):
+            if not signatures:
+                continue
+            for sig in signatures:
+                name = sig.get("name", "Unknown")
+                sig_type = sig.get("type", "signature")
+                desc = sig.get("desc", "")
+                page_display = int(page_num) + 1  # 0-indexed to 1-indexed
+                entry = f"- Page {page_display}: {name} ({sig_type})"
+                if desc:
+                    entry += f" — {desc}"
+                lines.append(entry)
+        if not lines:
+            return ""
+        header = (
+            "\n\n[Document Signature Information]\n"
+            "The following signatures were detected in this document. "
+            "Use this information to answer any questions about signatories, "
+            "signing parties, or document execution status.\n"
+        )
+        return header + "\n".join(lines)
+
     @staticmethod
     def construct_prompt(
         preamble: str,
@@ -165,6 +196,7 @@ def construct_prompt(
         platform_postamble: str,
         word_confidence_postamble: str,
         prompt_type: str = PSKeys.TEXT,
+        signature_metadata: dict[str, list[Any]] | None = None,
     ) -> str:
         prompt = f"{preamble}\n\nQuestion or Instruction: {prompt}"
         if grammar_list is not None and len(grammar_list) > 0:
@@ -190,8 +222,15 @@ def construct_prompt(
             platform_postamble += "\n\n"
             if word_confidence_postamble:
                 platform_postamble += f"{word_confidence_postamble}\n\n"
+        # Append signature metadata to context if present
+        signature_context = ""
+        if signature_metadata:
+            signature_context = AnswerPromptService._format_signature_metadata(
+                signature_metadata
+            )
         prompt += (
-            f"\n\n{postamble}\n\nContext:\n---------------\n{context}\n"
+            f"\n\n{postamble}\n\nContext:\n---------------\n{context}"
+            f"{signature_context}\n"
             f"-----------------\n\n{platform_postamble}Answer:"
         )
         return prompt
diff --git a/prompt-service/src/unstract/prompt_service/services/extraction.py b/prompt-service/src/unstract/prompt_service/services/extraction.py
index 76430657f9..d7e9824bbb 100644
--- a/prompt-service/src/unstract/prompt_service/services/extraction.py
+++ b/prompt-service/src/unstract/prompt_service/services/extraction.py
@@ -30,7 +30,7 @@ def perform_extraction(
         execution_source: str | None = None,
         tool_exec_metadata: dict[str, Any] | None = None,
         execution_run_data_folder: str | None = None,
-    ) -> str:
+    ) -> dict[str, Any]:
         extracted_text = ""
         util = PromptServiceBaseTool(platform_key=platform_key)
         x2text = X2Text(
@@ -64,7 +64,19 @@ def perform_extraction(
                     fs=fs,
                 )
             extracted_text = process_response.extracted_text
-            return extracted_text
+            # Extract signature metadata if present
+            signature_metadata = None
+            if (
+                process_response.extraction_metadata
+                and process_response.extraction_metadata.signature_metadata
+            ):
+                signature_metadata = (
+                    process_response.extraction_metadata.signature_metadata
+                )
+            return {
+                "extracted_text": extracted_text,
+                "signature_metadata": signature_metadata,
+            }
         except AdapterError as e:
             msg = f"Error from text extractor '{x2text.x2text_instance.get_name()}'. "
             msg += str(e)
diff --git a/unstract/sdk1/src/unstract/sdk1/adapters/x2text/dto.py b/unstract/sdk1/src/unstract/sdk1/adapters/x2text/dto.py
index 95c60bbe8c..4f4a92d812 100644
--- a/unstract/sdk1/src/unstract/sdk1/adapters/x2text/dto.py
+++ b/unstract/sdk1/src/unstract/sdk1/adapters/x2text/dto.py
@@ -6,6 +6,7 @@
 class TextExtractionMetadata:
     whisper_hash: str
     line_metadata: dict[Any, Any] | None = None
+    signature_metadata: dict[str, list[Any]] | None = None
 
 
 @dataclass
diff --git a/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/constants.py b/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/constants.py
index 090a3bf6f4..722e1a2f3f 100644
--- a/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/constants.py
+++ b/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/constants.py
@@ -7,6 +7,8 @@ class Modes(Enum):
     LOW_COST = "low_cost"
     HIGH_QUALITY = "high_quality"
     FORM = "form"
+    TABLE = "table"
+    DOCUMENT_INSIGHTS = "document_insights"
 
 
 class OutputModes(Enum):
diff --git a/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py b/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py
index 3a48a57647..3d1e1f6a97 100644
--- a/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py
+++ b/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py
@@ -10,6 +10,8 @@
     TextExtractionResult,
 )
 from unstract.sdk1.adapters.x2text.llm_whisperer_v2.src.constants import (
+    Modes,
+    WhispererConfig,
     WhispererEndpoint,
 )
 from unstract.sdk1.adapters.x2text.llm_whisperer_v2.src.dto import (
@@ -96,9 +98,22 @@ def process(
             fs=fs,
             extra_params=extra_params,
         )
+        # Extract signature_metadata when using document_insights mode
+        signature_metadata = None
+        mode = self.config.get(WhispererConfig.MODE, Modes.FORM.value)
+        if mode == Modes.DOCUMENT_INSIGHTS.value:
+            response_metadata = response.get("metadata", {})
+            signature_metadata = {}
+            for page_num, page_data in response_metadata.items():
+                if isinstance(page_data, dict) and "signature_metadata" in page_data:
+                    signature_metadata[page_num] = page_data["signature_metadata"]
+            if not any(signature_metadata.values()):
+                signature_metadata = None
+
         metadata = TextExtractionMetadata(
             whisper_hash=response.get(X2TextConstants.WHISPER_HASH_V2, ""),
             line_metadata=response.get("line_metadata"),
+            signature_metadata=signature_metadata,
         )
 
         return TextExtractionResult(
diff --git a/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/static/json_schema.json b/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/static/json_schema.json
index 1215ede56a..00da534c37 100644
--- a/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/static/json_schema.json
+++ b/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/static/json_schema.json
@@ -34,7 +34,8 @@
         "low_cost",
         "high_quality",
         "form",
-        "table"
+        "table",
+        "document_insights"
       ],
       "default": "form",
       "description": "Processing mode to use, described in the [LLMWhisperer documentation](https://docs.unstract.com/llmwhisperer/llm_whisperer/apis/llm_whisperer_text_extraction_api/#modes)."
diff --git a/workers/executor/executors/answer_prompt.py b/workers/executor/executors/answer_prompt.py
index 89936fe598..b1a5998dde 100644
--- a/workers/executor/executors/answer_prompt.py
+++ b/workers/executor/executors/answer_prompt.py
@@ -157,6 +157,7 @@ def construct_and_run_prompt(
             platform_postamble=platform_postamble,
             word_confidence_postamble=word_confidence_postamble,
             prompt_type=prompt_type,
+            signature_metadata=tool_settings.get(PSKeys.SIGNATURE_METADATA),
         )
         output[PSKeys.COMBINED_PROMPT] = prompt
         return AnswerPromptService.run_completion(
@@ -189,6 +190,44 @@ def _build_grammar_notes(grammar_list: list[dict[str, Any]]) -> str:
                 )
         return notes
 
+    @staticmethod
+    def _format_signature_metadata(
+        signature_metadata: dict[str, list[Any]],
+    ) -> str:
+        """Format signature metadata as a human-readable context block.
+
+        Args:
+            signature_metadata: Dict keyed by page number (str) with lists
+                of signature entries, each having 'type', 'name', 'desc'.
+
+        Returns:
+            Formatted string for LLM context injection.
+        """
+        lines: list[str] = []
+        for page_num, signatures in sorted(
+            signature_metadata.items(), key=lambda x: int(x[0])
+        ):
+            if not signatures:
+                continue
+            for sig in signatures:
+                name = sig.get("name", "Unknown")
+                sig_type = sig.get("type", "signature")
+                desc = sig.get("desc", "")
+                page_display = int(page_num) + 1  # 0-indexed to 1-indexed
+                entry = f"- Page {page_display}: {name} ({sig_type})"
+                if desc:
+                    entry += f" — {desc}"
+                lines.append(entry)
+        if not lines:
+            return ""
+        header = (
+            "\n\n[Document Signature Information]\n"
+            "The following signatures were detected in this document. "
+            "Use this information to answer any questions about signatories, "
+            "signing parties, or document execution status.\n"
+        )
+        return header + "\n".join(lines)
+
     @staticmethod
     def construct_prompt(
         preamble: str,
@@ -199,6 +238,7 @@ def construct_prompt(
         platform_postamble: str,
         word_confidence_postamble: str,
         prompt_type: str = "text",
+        signature_metadata: dict[str, list[Any]] | None = None,
     ) -> str:
         """Build the full prompt string with preamble, grammar, postamble, context."""
         prompt = f"{preamble}\n\nQuestion or Instruction: {prompt}"
@@ -212,8 +252,15 @@ def construct_prompt(
             platform_postamble += "\n\n"
             if word_confidence_postamble:
                 platform_postamble += f"{word_confidence_postamble}\n\n"
+        # Append signature metadata to context if present
+        signature_context = ""
+        if signature_metadata:
+            signature_context = AnswerPromptService._format_signature_metadata(
+                signature_metadata
+            )
         prompt += (
-            f"\n\n{postamble}\n\nContext:\n---------------\n{context}\n"
+            f"\n\n{postamble}\n\nContext:\n---------------\n{context}"
+            f"{signature_context}\n"
             f"-----------------\n\n{platform_postamble}Answer:"
         )
         return prompt
diff --git a/workers/executor/executors/constants.py b/workers/executor/executors/constants.py
index 9eddab8423..16bb364ce3 100644
--- a/workers/executor/executors/constants.py
+++ b/workers/executor/executors/constants.py
@@ -84,6 +84,7 @@ class PromptServiceConstants:
     LINE_ITEM = "line-item"
     LINE_NUMBERS = "line_numbers"
     WHISPER_HASH = "whisper_hash"
+    SIGNATURE_METADATA = "signature_metadata"
     PAID_FEATURE_MSG = (
         "It is a cloud / enterprise feature. If you have purchased a plan and still "
         "face this issue, please contact support"
diff --git a/workers/executor/executors/legacy_executor.py b/workers/executor/executors/legacy_executor.py
index cf33c43212..012bd89937 100644
--- a/workers/executor/executors/legacy_executor.py
+++ b/workers/executor/executors/legacy_executor.py
@@ -261,6 +261,15 @@ def _handle_extract(self, context: ExecutionContext) -> ExecutionResult:
                 result_data["highlight_metadata"] = (
                     process_response.extraction_metadata.line_metadata
                 )
+            # Include signature metadata when available
+            # (from document_insights mode)
+            if (
+                process_response.extraction_metadata
+                and process_response.extraction_metadata.signature_metadata
+            ):
+                result_data["signature_metadata"] = (
+                    process_response.extraction_metadata.signature_metadata
+                )
             return ExecutionResult(
                 success=True,
                 data=result_data,
@@ -536,6 +545,15 @@ def _handle_structure_pipeline(self, context: ExecutionContext) -> ExecutionResu
                 return extract_result
             extracted_text = extract_result.data.get(IKeys.EXTRACTED_TEXT, "")
 
+            # Pass signature metadata to answer phase via tool_settings
+            from executor.executors.constants import PromptServiceConstants as PSKeys
+
+            signature_metadata = extract_result.data.get("signature_metadata")
+            if signature_metadata:
+                tool_settings = answer_params.get(PSKeys.TOOL_SETTINGS, {})
+                tool_settings[PSKeys.SIGNATURE_METADATA] = signature_metadata
+                answer_params[PSKeys.TOOL_SETTINGS] = tool_settings
+
         # ---- Step 2: Summarize (if enabled) ----
         if is_summarization:
             shim.stream_log(f"Pipeline step {step}: Summarizing extracted text...")

From b982e581bd65a28cdf9099073befebf8ff479e37 Mon Sep 17 00:00:00 2001
From: pk-zipstack <praveen@zipstack.com>
Date: Thu, 9 Apr 2026 16:03:19 +0530
Subject: [PATCH 02/12] [FEAT] Compute signature page references for frontend
 page navigation

When document_insights mode detects signatures, compute page references
by finding the first line_metadata entry for each page with signatures
and converting that entry's index to a 1-indexed hex string (index 0
becomes "0x01"). Each reference also carries the raw line_metadata
index and the signer names. This enables the frontend to navigate/jump
to pages containing signatures without highlighting.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../src/unstract/sdk1/adapters/x2text/dto.py  |  1 +
 .../llm_whisperer_v2/src/llm_whisperer_v2.py  | 64 +++++++++++++++++++
 2 files changed, 65 insertions(+)

diff --git a/unstract/sdk1/src/unstract/sdk1/adapters/x2text/dto.py b/unstract/sdk1/src/unstract/sdk1/adapters/x2text/dto.py
index 4f4a92d812..4a0885a01f 100644
--- a/unstract/sdk1/src/unstract/sdk1/adapters/x2text/dto.py
+++ b/unstract/sdk1/src/unstract/sdk1/adapters/x2text/dto.py
@@ -7,6 +7,7 @@ class TextExtractionMetadata:
     whisper_hash: str
     line_metadata: dict[Any, Any] | None = None
     signature_metadata: dict[str, list[Any]] | None = None
+    signature_page_references: dict[str, Any] | None = None
 
 
 @dataclass
diff --git a/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py b/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py
index 3d1e1f6a97..7c3ce92a3a 100644
--- a/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py
+++ b/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py
@@ -56,6 +56,59 @@ def get_description() -> str:
     def get_icon() -> str:
         return "/icons/adapter-icons/LLMWhispererV2.png"
 
+    @staticmethod
+    def _build_signature_page_references(
+        signature_metadata: dict[str, list[Any]],
+        line_metadata: list[list[int]],
+    ) -> dict[str, Any] | None:
+        """Build page references for frontend navigation to signature pages.
+
+        For each page that has signatures, finds the first line_metadata
+        entry for that page and converts its index to a 1-indexed hex
+        value. This allows the frontend to jump to the correct page.
+
+        Args:
+            signature_metadata: Dict keyed by page number (str, 0-indexed)
+                with lists of signature entries.
+            line_metadata: List of [page, y_pos, height, page_height] arrays.
+
+        Returns:
+            Dict mapping page number to hex reference and signer names,
+            or None if no references could be built.
+        """
+        if not line_metadata:
+            return None
+
+        # Build a map of page number -> first line_metadata index
+        page_first_line: dict[int, int] = {}
+        for idx, entry in enumerate(line_metadata):
+            if isinstance(entry, list) and len(entry) >= 1:
+                page = entry[0]
+                if page not in page_first_line:
+                    page_first_line[page] = idx
+
+        references: dict[str, Any] = {}
+        for page_str, signatures in signature_metadata.items():
+            if not signatures:
+                continue
+            page_num = int(page_str)
+            if page_num not in page_first_line:
+                continue
+            line_index = page_first_line[page_num]
+            hex_value = f"0x{line_index + 1:02X}"  # 1-indexed hex
+            signers = [
+                sig.get("name", "Unknown")
+                for sig in signatures
+                if isinstance(sig, dict)
+            ]
+            references[page_str] = {
+                "hex": hex_value,
+                "line_metadata_index": line_index,
+                "signers": signers,
+            }
+
+        return references if references else None
+
     def test_connection(self) -> bool:
         LLMWhispererHelper.test_connection_request(
             config=self.config,
@@ -110,10 +163,21 @@ def process(
             if not any(signature_metadata.values()):
                 signature_metadata = None
 
+        # Compute signature page references for frontend navigation
+        signature_page_references = None
+        if signature_metadata:
+            raw_line_metadata = response.get("line_metadata", [])
+            signature_page_references = (
+                LLMWhispererV2._build_signature_page_references(
+                    signature_metadata, raw_line_metadata
+                )
+            )
+
         metadata = TextExtractionMetadata(
             whisper_hash=response.get(X2TextConstants.WHISPER_HASH_V2, ""),
             line_metadata=response.get("line_metadata"),
             signature_metadata=signature_metadata,
+            signature_page_references=signature_page_references,
         )
 
         return TextExtractionResult(

From 2cfcede7811ac662e6d96a2005d065fcd19704e0 Mon Sep 17 00:00:00 2001
From: pk-zipstack <praveen@zipstack.com>
Date: Wed, 15 Apr 2026 12:57:55 +0530
Subject: [PATCH 03/12] [MISC] Add DOC_INSIGHTS debug loggers across signature
 metadata flow

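Add log statements at key points to trace signature metadata through
the pipeline: adapter extraction, workers pipeline (_handle_extract,
tool_settings injection), prompt construction, and prompt-service
extraction endpoint. All messages use the DOC_INSIGHTS prefix for
easy grep-filtering during UI testing. Most are emitted at INFO or
WARNING level; only the page_first_line map and the truncated
signature_context are logged at DEBUG.

Besides logging, _handle_extract now also copies
signature_page_references into the extraction result data when the
adapter provides them.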

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../prompt_service/services/answer_prompt.py  |  9 +++++
 .../prompt_service/services/extraction.py     |  8 ++++
 .../llm_whisperer_v2/src/llm_whisperer_v2.py  | 39 +++++++++++++++++++
 workers/executor/executors/answer_prompt.py   |  9 +++++
 workers/executor/executors/legacy_executor.py | 25 ++++++++++++
 5 files changed, 90 insertions(+)

diff --git a/prompt-service/src/unstract/prompt_service/services/answer_prompt.py b/prompt-service/src/unstract/prompt_service/services/answer_prompt.py
index 9ccf20abe3..3fd69e1d91 100644
--- a/prompt-service/src/unstract/prompt_service/services/answer_prompt.py
+++ b/prompt-service/src/unstract/prompt_service/services/answer_prompt.py
@@ -225,9 +225,18 @@ def construct_prompt(
         # Append signature metadata to context if present
         signature_context = ""
         if signature_metadata:
+            app.logger.info(
+                "DOC_INSIGHTS construct_prompt: injecting signature context "
+                "for %d page(s)",
+                len(signature_metadata),
+            )
             signature_context = AnswerPromptService._format_signature_metadata(
                 signature_metadata
             )
+            app.logger.debug(
+                "DOC_INSIGHTS construct_prompt: signature_context=%s",
+                signature_context[:200] if signature_context else "empty",
+            )
         prompt += (
             f"\n\n{postamble}\n\nContext:\n---------------\n{context}"
             f"{signature_context}\n"
diff --git a/prompt-service/src/unstract/prompt_service/services/extraction.py b/prompt-service/src/unstract/prompt_service/services/extraction.py
index d7e9824bbb..290f13f482 100644
--- a/prompt-service/src/unstract/prompt_service/services/extraction.py
+++ b/prompt-service/src/unstract/prompt_service/services/extraction.py
@@ -1,6 +1,9 @@
+import logging
 from pathlib import Path
 from typing import Any
 
+logger = logging.getLogger(__name__)
+
 from unstract.prompt_service.constants import ExecutionSource
 from unstract.prompt_service.constants import IndexingConstants as IKeys
 from unstract.prompt_service.exceptions import ExtractionError
@@ -73,6 +76,11 @@ def perform_extraction(
                 signature_metadata = (
                     process_response.extraction_metadata.signature_metadata
                 )
+                logger.info(
+                    "DOC_INSIGHTS extraction: signature_metadata found "
+                    "for pages: %s",
+                    list(signature_metadata.keys()),
+                )
             return {
                 "extracted_text": extracted_text,
                 "signature_metadata": signature_metadata,
diff --git a/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py b/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py
index 7c3ce92a3a..984bc63a15 100644
--- a/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py
+++ b/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py
@@ -77,6 +77,8 @@ def _build_signature_page_references(
             or None if no references could be built.
         """
         if not line_metadata:
+            logger.warning("DOC_INSIGHTS: no line_metadata available, "
+                           "cannot build page references")
             return None
 
         # Build a map of page number -> first line_metadata index
@@ -86,6 +88,9 @@ def _build_signature_page_references(
                 page = entry[0]
                 if page not in page_first_line:
                     page_first_line[page] = idx
+        logger.debug(
+            "DOC_INSIGHTS: page_first_line map: %s", page_first_line
+        )
 
         references: dict[str, Any] = {}
         for page_str, signatures in signature_metadata.items():
@@ -93,6 +98,10 @@ def _build_signature_page_references(
                 continue
             page_num = int(page_str)
             if page_num not in page_first_line:
+                logger.warning(
+                    "DOC_INSIGHTS: page %d not found in line_metadata",
+                    page_num,
+                )
                 continue
             line_index = page_first_line[page_num]
             hex_value = f"0x{line_index + 1:02X}"  # 1-indexed hex
@@ -154,24 +163,54 @@ def process(
         # Extract signature_metadata when using document_insights mode
         signature_metadata = None
         mode = self.config.get(WhispererConfig.MODE, Modes.FORM.value)
+        logger.info(
+            "DOC_INSIGHTS: mode=%s, is_document_insights=%s",
+            mode,
+            mode == Modes.DOCUMENT_INSIGHTS.value,
+        )
         if mode == Modes.DOCUMENT_INSIGHTS.value:
             response_metadata = response.get("metadata", {})
+            logger.info(
+                "DOC_INSIGHTS: response has metadata keys: %s",
+                list(response_metadata.keys()) if response_metadata else "None",
+            )
             signature_metadata = {}
             for page_num, page_data in response_metadata.items():
                 if isinstance(page_data, dict) and "signature_metadata" in page_data:
                     signature_metadata[page_num] = page_data["signature_metadata"]
+                    logger.info(
+                        "DOC_INSIGHTS: page %s has %d signature(s): %s",
+                        page_num,
+                        len(page_data["signature_metadata"]),
+                        [s.get("name") for s in page_data["signature_metadata"]],
+                    )
             if not any(signature_metadata.values()):
+                logger.info("DOC_INSIGHTS: no signatures found across any page")
                 signature_metadata = None
+            else:
+                logger.info(
+                    "DOC_INSIGHTS: signature_metadata extracted for pages: %s",
+                    list(signature_metadata.keys()),
+                )
 
         # Compute signature page references for frontend navigation
         signature_page_references = None
         if signature_metadata:
             raw_line_metadata = response.get("line_metadata", [])
+            logger.info(
+                "DOC_INSIGHTS: line_metadata has %d entries, "
+                "computing page references",
+                len(raw_line_metadata),
+            )
             signature_page_references = (
                 LLMWhispererV2._build_signature_page_references(
                     signature_metadata, raw_line_metadata
                 )
             )
+            logger.info(
+                "DOC_INSIGHTS: signature_page_references=%s",
+                signature_page_references,
+            )
 
         metadata = TextExtractionMetadata(
             whisper_hash=response.get(X2TextConstants.WHISPER_HASH_V2, ""),
diff --git a/workers/executor/executors/answer_prompt.py b/workers/executor/executors/answer_prompt.py
index b1a5998dde..163ea294a1 100644
--- a/workers/executor/executors/answer_prompt.py
+++ b/workers/executor/executors/answer_prompt.py
@@ -255,9 +255,18 @@ def construct_prompt(
         # Append signature metadata to context if present
         signature_context = ""
         if signature_metadata:
+            logger.info(
+                "DOC_INSIGHTS construct_prompt: injecting signature context "
+                "for %d page(s)",
+                len(signature_metadata),
+            )
             signature_context = AnswerPromptService._format_signature_metadata(
                 signature_metadata
             )
+            logger.debug(
+                "DOC_INSIGHTS construct_prompt: signature_context=%s",
+                signature_context[:200] if signature_context else "empty",
+            )
         prompt += (
             f"\n\n{postamble}\n\nContext:\n---------------\n{context}"
             f"{signature_context}\n"
diff --git a/workers/executor/executors/legacy_executor.py b/workers/executor/executors/legacy_executor.py
index 012bd89937..08951b3446 100644
--- a/workers/executor/executors/legacy_executor.py
+++ b/workers/executor/executors/legacy_executor.py
@@ -270,6 +270,26 @@ def _handle_extract(self, context: ExecutionContext) -> ExecutionResult:
                 result_data["signature_metadata"] = (
                     process_response.extraction_metadata.signature_metadata
                 )
+                logger.info(
+                    "DOC_INSIGHTS _handle_extract: signature_metadata found "
+                    "for pages: %s",
+                    list(process_response.extraction_metadata
+                         .signature_metadata.keys()),
+                )
+            if (
+                process_response.extraction_metadata
+                and process_response.extraction_metadata.signature_page_references
+            ):
+                result_data["signature_page_references"] = (
+                    process_response.extraction_metadata
+                    .signature_page_references
+                )
+                logger.info(
+                    "DOC_INSIGHTS _handle_extract: "
+                    "signature_page_references=%s",
+                    process_response.extraction_metadata
+                    .signature_page_references,
+                )
             return ExecutionResult(
                 success=True,
                 data=result_data,
@@ -553,6 +573,11 @@ def _handle_structure_pipeline(self, context: ExecutionContext) -> ExecutionResu
                 tool_settings = answer_params.get(PSKeys.TOOL_SETTINGS, {})
                 tool_settings[PSKeys.SIGNATURE_METADATA] = signature_metadata
                 answer_params[PSKeys.TOOL_SETTINGS] = tool_settings
+                logger.info(
+                    "DOC_INSIGHTS pipeline: injected signature_metadata "
+                    "into tool_settings for pages: %s",
+                    list(signature_metadata.keys()),
+                )
 
         # ---- Step 2: Summarize (if enabled) ----
         if is_summarization:

From cc63bdd375dfde3186e6c855295bdeee49873d8d Mon Sep 17 00:00:00 2001
From: pk-zipstack <praveen@zipstack.com>
Date: Wed, 15 Apr 2026 16:42:55 +0530
Subject: [PATCH 04/12] [FIX] Allow empty user_id in indexing-status internal
 endpoint

user_id may be empty for mock auth users (default OSS setup). It's only
used as a Redis cache key fragment, so empty values are acceptable.
Dropping user_id from the required-fields validation unblocks indexing
for these users.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../prompt_studio/prompt_studio_core_v2/internal_views.py   | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/backend/prompt_studio/prompt_studio_core_v2/internal_views.py b/backend/prompt_studio/prompt_studio_core_v2/internal_views.py
index 3ad3a5db16..8c3391f838 100644
--- a/backend/prompt_studio/prompt_studio_core_v2/internal_views.py
+++ b/backend/prompt_studio/prompt_studio_core_v2/internal_views.py
@@ -244,11 +244,13 @@ def indexing_status(request):
     user_id = data.get("user_id", "")
     doc_id_key = data.get("doc_id_key", "")
 
-    if not action or not org_id or not user_id or not doc_id_key:
+    # user_id may be empty (e.g. mock auth users) - it's only used as a
+    # Redis cache key fragment, so empty is acceptable.
+    if not action or not org_id or not doc_id_key:
         return JsonResponse(
             {
                 "success": False,
-                "error": "action, org_id, user_id, doc_id_key are required",
+                "error": "action, org_id, doc_id_key are required",
             },
             status=status.HTTP_400_BAD_REQUEST,
         )

From ea011a480fd3da7b97d9fce7a07d4b2e5f5e12a7 Mon Sep 17 00:00:00 2001
From: pk-zipstack <praveen@zipstack.com>
Date: Thu, 14 May 2026 14:43:03 +0530
Subject: [PATCH 05/12] [FEAT] Surface signature page highlights in Prompt
 Studio for document_insights mode

Wire the signature data captured by the LLMWhisperer V2 adapter through
to Prompt Studio's existing highlight pipeline, so clicking a
signer-related answer jumps the PDF viewer to the page containing the
signature without any frontend changes.

- Adapter: signature_page_references now also carries resolved coords
  [page, y, height, page_height] alongside the existing hex / line index.
- Workers _handle_extract writes a <extract>.doc_insights.json sidecar
  so Prompt Studio cache hits don't lose signature data; the pipeline
  path threads signature_page_references into tool_settings alongside
  signature_metadata.
- AnswerPromptService._attach_signature_highlights (mirrored in workers
  and prompt-service) scans the LLM answer for signer names
  (case-insensitive substring) and appends the matching page coords to
  metadata[HIGHLIGHT_DATA][prompt_key]. When no signer name matches,
  falls back to all signature pages if the answer mentions signing
  generically. De-dupes against hex-comment highlights.
- Prompt Studio backend: dynamic_extractor now returns ExtractResult
  (text + signature_metadata + signature_page_references), reading the
  sidecar on cache hits. All five answer_prompt dispatch sites inject
  the signature data into tool_settings.
- Prompt-service: extraction service + controller surface
  signature_page_references for parity.
- Tests: 7 new unit tests for _attach_signature_highlights covering
  name match, multi-page coords, keyword fallback, no-op cases, and
  preservation/dedup of existing highlight entries.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../prompt_studio_core_v2/constants.py        |   2 +
 .../prompt_studio_helper.py                   | 146 ++++++++++++++-
 .../src/unstract/prompt_service/constants.py  |   1 +
 .../prompt_service/controllers/extraction.py  |   3 +
 .../prompt_service/services/answer_prompt.py  | 104 ++++++++++-
 .../prompt_service/services/extraction.py     |  14 ++
 .../llm_whisperer_v2/src/llm_whisperer_v2.py  |   7 +
 workers/executor/executors/answer_prompt.py   | 115 +++++++++++-
 workers/executor/executors/constants.py       |   1 +
 workers/executor/executors/legacy_executor.py |  77 +++++++-
 workers/tests/test_answer_prompt.py           | 173 ++++++++++++++++++
 11 files changed, 626 insertions(+), 17 deletions(-)

diff --git a/backend/prompt_studio/prompt_studio_core_v2/constants.py b/backend/prompt_studio/prompt_studio_core_v2/constants.py
index c1ab14d380..fa7eea68fc 100644
--- a/backend/prompt_studio/prompt_studio_core_v2/constants.py
+++ b/backend/prompt_studio/prompt_studio_core_v2/constants.py
@@ -104,6 +104,8 @@ class ToolStudioPromptKeys:
     EXECUTION_SOURCE = "execution_source"
     LINE_ITEM = "line-item"
     CUSTOM_DATA = "custom_data"
+    SIGNATURE_METADATA = "signature_metadata"
+    SIGNATURE_PAGE_REFERENCES = "signature_page_references"
     # Webhook postprocessing settings
     ENABLE_POSTPROCESSING_WEBHOOK = "enable_postprocessing_webhook"
     POSTPROCESSING_WEBHOOK_URL = "postprocessing_webhook_url"
diff --git a/backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py b/backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py
index d0ffef3114..9eccd1d98f 100644
--- a/backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py
+++ b/backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py
@@ -4,7 +4,22 @@
 import time
 import uuid
 from pathlib import Path
-from typing import Any
+from typing import Any, NamedTuple
+
+
+class ExtractResult(NamedTuple):
+    """Return value of ``PromptStudioHelper.dynamic_extractor``.
+
+    ``signature_metadata`` and ``signature_page_references`` are populated
+    only when the x2text adapter is LLMWhisperer V2 in ``document_insights``
+    mode and the document contains signatures. They are read either from
+    the live extract dispatch result (cache miss) or from the on-disk
+    ``.doc_insights.json`` sidecar (cache hit).
+    """
+
+    text: str
+    signature_metadata: dict[str, Any] | None = None
+    signature_page_references: dict[str, Any] | None = None
 
 from account_v2.constants import Common
 from account_v2.models import User
@@ -734,7 +749,7 @@ def build_fetch_response_payload(
         )
 
         # Extract (blocking, usually cached)
-        extracted_text = PromptStudioHelper.dynamic_extractor(
+        extract_result = PromptStudioHelper.dynamic_extractor(
             profile_manager=profile_manager,
             file_path=file_path,
             org_id=org_id,
@@ -742,6 +757,7 @@ def build_fetch_response_payload(
             run_id=run_id,
             enable_highlight=tool.enable_highlight,
         )
+        extracted_text = extract_result.text
 
         is_summary = tool.summarize_as_source
         if is_summary:
@@ -836,6 +852,14 @@ def build_fetch_response_payload(
         tool_settings[TSPKeys.WORD_CONFIDENCE_POSTAMBLE] = getattr(
             settings, TSPKeys.WORD_CONFIDENCE_POSTAMBLE.upper(), ""
         )
+        if extract_result.signature_metadata:
+            tool_settings[TSPKeys.SIGNATURE_METADATA] = (
+                extract_result.signature_metadata
+            )
+        if extract_result.signature_page_references:
+            tool_settings[TSPKeys.SIGNATURE_PAGE_REFERENCES] = (
+                extract_result.signature_page_references
+            )
 
         file_hash = fs_instance.get_hash_from_file(path=extract_path)
 
@@ -951,7 +975,7 @@ def build_bulk_fetch_response_payload(
         )
 
         # Extract ONCE (blocking, usually cached)
-        extracted_text = PromptStudioHelper.dynamic_extractor(
+        extract_result = PromptStudioHelper.dynamic_extractor(
             profile_manager=profile_manager,
             file_path=file_path,
             org_id=org_id,
@@ -959,6 +983,7 @@ def build_bulk_fetch_response_payload(
             run_id=run_id,
             enable_highlight=tool.enable_highlight,
         )
+        extracted_text = extract_result.text
 
         is_summary = tool.summarize_as_source
         if is_summary:
@@ -1026,6 +1051,14 @@ def build_bulk_fetch_response_payload(
         tool_settings[TSPKeys.WORD_CONFIDENCE_POSTAMBLE] = getattr(
             settings, TSPKeys.WORD_CONFIDENCE_POSTAMBLE.upper(), ""
         )
+        if extract_result.signature_metadata:
+            tool_settings[TSPKeys.SIGNATURE_METADATA] = (
+                extract_result.signature_metadata
+            )
+        if extract_result.signature_page_references:
+            tool_settings[TSPKeys.SIGNATURE_PAGE_REFERENCES] = (
+                extract_result.signature_page_references
+            )
 
         file_hash = fs_instance.get_hash_from_file(path=extract_path)
 
@@ -1126,7 +1159,7 @@ def build_single_pass_payload(
         )
 
         # Extract (blocking, usually cached)
-        PromptStudioHelper.dynamic_extractor(
+        extract_result = PromptStudioHelper.dynamic_extractor(
             profile_manager=default_profile,
             file_path=doc_path,
             org_id=org_id,
@@ -1165,6 +1198,14 @@ def build_single_pass_payload(
             or TSPKeys.SIMPLE,
             TSPKeys.SIMILARITY_TOP_K: default_profile.similarity_top_k,
         }
+        if extract_result.signature_metadata:
+            tool_settings[TSPKeys.SIGNATURE_METADATA] = (
+                extract_result.signature_metadata
+            )
+        if extract_result.signature_page_references:
+            tool_settings[TSPKeys.SIGNATURE_PAGE_REFERENCES] = (
+                extract_result.signature_page_references
+            )
 
         for p in prompts:
             if not p.prompt:
@@ -1366,7 +1407,7 @@ def index_document(
             tool=util,
         )
 
-        extracted_text = PromptStudioHelper.dynamic_extractor(
+        extract_result = PromptStudioHelper.dynamic_extractor(
             profile_manager=default_profile,
             file_path=file_path,
             org_id=org_id,
@@ -1374,6 +1415,7 @@ def index_document(
             run_id=run_id,
             enable_highlight=tool.enable_highlight,
         )
+        extracted_text = extract_result.text
         if tool.summarize_context:
             summarize_file_path = PromptStudioHelper.summarize(
                 file_name, org_id, run_id, tool
@@ -1817,7 +1859,7 @@ def _fetch_response(
             tool=util,
         )
         logger.info(f"Extracting text from {file_path} for {doc_id}")
-        extracted_text = PromptStudioHelper.dynamic_extractor(
+        extract_result = PromptStudioHelper.dynamic_extractor(
             profile_manager=profile_manager,
             file_path=file_path,
             org_id=org_id,
@@ -1825,6 +1867,7 @@ def _fetch_response(
             run_id=run_id,
             enable_highlight=tool.enable_highlight,
         )
+        extracted_text = extract_result.text
         logger.info(f"Extracted text from {file_path} for {doc_id}")
         if is_summary:
             profile_manager.chunk_size = 0
@@ -1933,6 +1976,14 @@ def _fetch_response(
         tool_settings[TSPKeys.WORD_CONFIDENCE_POSTAMBLE] = getattr(
             settings, TSPKeys.WORD_CONFIDENCE_POSTAMBLE.upper(), ""
         )
+        if extract_result.signature_metadata:
+            tool_settings[TSPKeys.SIGNATURE_METADATA] = (
+                extract_result.signature_metadata
+            )
+        if extract_result.signature_page_references:
+            tool_settings[TSPKeys.SIGNATURE_PAGE_REFERENCES] = (
+                extract_result.signature_page_references
+            )
         file_hash = fs_instance.get_hash_from_file(path=doc_path)
 
         payload = {
@@ -2194,7 +2245,7 @@ def _fetch_single_pass_response(
         file_path = os.path.join(
             directory, "extract", os.path.splitext(filename)[0] + ".txt"
         )
-        PromptStudioHelper.dynamic_extractor(
+        extract_result = PromptStudioHelper.dynamic_extractor(
             profile_manager=default_profile,
             file_path=input_file_path,
             org_id=org_id,
@@ -2232,6 +2283,14 @@ def _fetch_single_pass_response(
             default_profile.retrieval_strategy or TSPKeys.SIMPLE
         )
         tool_settings[TSPKeys.SIMILARITY_TOP_K] = default_profile.similarity_top_k
+        if extract_result.signature_metadata:
+            tool_settings[TSPKeys.SIGNATURE_METADATA] = (
+                extract_result.signature_metadata
+            )
+        if extract_result.signature_page_references:
+            tool_settings[TSPKeys.SIGNATURE_PAGE_REFERENCES] = (
+                extract_result.signature_page_references
+            )
         for prompt in prompts:
             if not prompt.prompt:
                 raise EmptyPromptError()
@@ -2291,6 +2350,49 @@ def get_tool_from_tool_id(tool_id: str) -> CustomTool | None:
         except CustomTool.DoesNotExist:
             return None
 
+    @staticmethod
+    def _signature_sidecar_path(extract_file_path: str) -> str:
+        p = Path(extract_file_path)
+        return str(p.with_suffix("")) + ".doc_insights.json"
+
+    @staticmethod
+    def _load_signature_sidecar(
+        extract_file_path: str,
+        fs_instance: Any,
+    ) -> tuple[dict[str, Any] | None, dict[str, Any] | None]:
+        """Return ``(signature_metadata, signature_page_references)`` from the
+        sidecar, or ``(None, None)`` if the sidecar is missing or unreadable.
+
+        Signature data is only written by the executor when a document
+        contains signatures in document_insights mode; cache-hit calls
+        for documents extracted in other modes legitimately have no
+        sidecar, so absence is not an error.
+        """
+        sidecar_path = PromptStudioHelper._signature_sidecar_path(extract_file_path)
+        try:
+            raw = fs_instance.read(path=sidecar_path, mode="r")
+        except FileNotFoundError:
+            return None, None
+        except Exception as e:
+            logger.warning(
+                "DOC_INSIGHTS sidecar: failed to read %s: %s",
+                sidecar_path,
+                e,
+            )
+            return None, None
+        try:
+            data = json.loads(raw)
+        except (TypeError, ValueError) as e:
+            logger.warning(
+                "DOC_INSIGHTS sidecar: failed to parse %s: %s",
+                sidecar_path,
+                e,
+            )
+            return None, None
+        sig_meta = data.get("signature_metadata") or None
+        sig_refs = data.get("signature_page_references") or None
+        return sig_meta, sig_refs
+
     @staticmethod
     def dynamic_extractor(
         file_path: str,
@@ -2299,7 +2401,7 @@ def dynamic_extractor(
         org_id: str,
         profile_manager: ProfileManager,
         document_id: str,
-    ) -> str:
+    ) -> ExtractResult:
         # Guard against None metadata (when adapter_metadata_b is None)
         metadata = profile_manager.x2text.metadata or {}
         x2text_config_hash = ToolUtils.hash_str(json.dumps(metadata, sort_keys=True))
@@ -2329,7 +2431,15 @@ def dynamic_extractor(
             try:
                 extracted_text = fs_instance.read(path=extract_file_path, mode="r")
                 logger.info("Extracted text found. Reading from file..")
-                return extracted_text
+                sig_meta, sig_refs = PromptStudioHelper._load_signature_sidecar(
+                    extract_file_path=extract_file_path,
+                    fs_instance=fs_instance,
+                )
+                return ExtractResult(
+                    text=extracted_text,
+                    signature_metadata=sig_meta,
+                    signature_page_references=sig_refs,
+                )
             except FileNotFoundError as e:
                 logger.warning(
                     f"File not found for extraction. {extract_file_path}. {e}"
@@ -2383,6 +2493,18 @@ def dynamic_extractor(
             )
 
         extracted_text = result.data.get("extracted_text", "")
+        signature_metadata = result.data.get("signature_metadata")
+        signature_page_references = result.data.get("signature_page_references")
+        if signature_metadata or signature_page_references:
+            logger.info(
+                "DOC_INSIGHTS dynamic_extractor: captured signature data "
+                "(pages=%s, refs=%s) for document %s",
+                list(signature_metadata.keys()) if signature_metadata else [],
+                list(signature_page_references.keys())
+                if signature_page_references
+                else [],
+                document_id,
+            )
         success = PromptStudioIndexHelper.mark_extraction_status(
             document_id=document_id,
             profile_manager=profile_manager,
@@ -2395,7 +2517,11 @@ def dynamic_extractor(
                 f"Extraction completed but status not saved."
             )
 
-        return extracted_text
+        return ExtractResult(
+            text=extracted_text,
+            signature_metadata=signature_metadata,
+            signature_page_references=signature_page_references,
+        )
 
     @staticmethod
     def export_project_settings(tool: CustomTool) -> dict:
diff --git a/prompt-service/src/unstract/prompt_service/constants.py b/prompt-service/src/unstract/prompt_service/constants.py
index 16bb364ce3..58a6b72fcd 100644
--- a/prompt-service/src/unstract/prompt_service/constants.py
+++ b/prompt-service/src/unstract/prompt_service/constants.py
@@ -85,6 +85,7 @@ class PromptServiceConstants:
     LINE_NUMBERS = "line_numbers"
     WHISPER_HASH = "whisper_hash"
     SIGNATURE_METADATA = "signature_metadata"
+    SIGNATURE_PAGE_REFERENCES = "signature_page_references"
     PAID_FEATURE_MSG = (
         "It is a cloud / enterprise feature. If you have purchased a plan and still "
         "face this issue, please contact support"
diff --git a/prompt-service/src/unstract/prompt_service/controllers/extraction.py b/prompt-service/src/unstract/prompt_service/controllers/extraction.py
index 4a5d47bb91..588e561491 100644
--- a/prompt-service/src/unstract/prompt_service/controllers/extraction.py
+++ b/prompt-service/src/unstract/prompt_service/controllers/extraction.py
@@ -55,4 +55,7 @@ def extract() -> Any:
     signature_metadata = extraction_result.get("signature_metadata")
     if signature_metadata:
         response["signature_metadata"] = signature_metadata
+    signature_page_references = extraction_result.get("signature_page_references")
+    if signature_page_references:
+        response["signature_page_references"] = signature_page_references
     return response
diff --git a/prompt-service/src/unstract/prompt_service/services/answer_prompt.py b/prompt-service/src/unstract/prompt_service/services/answer_prompt.py
index 3fd69e1d91..0a4a96f333 100644
--- a/prompt-service/src/unstract/prompt_service/services/answer_prompt.py
+++ b/prompt-service/src/unstract/prompt_service/services/answer_prompt.py
@@ -144,7 +144,7 @@ def construct_and_run_prompt(
             signature_metadata=tool_settings.get(PSKeys.SIGNATURE_METADATA),
         )
         output[PSKeys.COMBINED_PROMPT] = prompt
-        return AnswerPromptService.run_completion(
+        answer = AnswerPromptService.run_completion(
             llm=llm,
             prompt=prompt,
             metadata=metadata,
@@ -155,6 +155,108 @@ def construct_and_run_prompt(
             file_path=file_path,
             execution_source=execution_source,
         )
+        AnswerPromptService._attach_signature_highlights(
+            answer=answer,
+            signature_metadata=tool_settings.get(PSKeys.SIGNATURE_METADATA),
+            signature_page_references=tool_settings.get(
+                PSKeys.SIGNATURE_PAGE_REFERENCES
+            ),
+            metadata=metadata,
+            prompt_key=output[PSKeys.NAME],
+        )
+        return answer
+
+    # Generic signature-related terms used as a fallback trigger when the
+    # LLM answer doesn't mention any specific signer name but does talk
+    # about signing in general. Matched as case-insensitive substrings.
+    _SIGNATURE_KEYWORDS = (
+        "signature",
+        "signed",
+        "signatory",
+        "signatories",
+        "signing",
+        "executed",
+    )
+
+    @staticmethod
+    def _attach_signature_highlights(
+        answer: str,
+        signature_metadata: dict[str, list[Any]] | None,
+        signature_page_references: dict[str, Any] | None,
+        metadata: dict[str, Any] | None,
+        prompt_key: str | None,
+    ) -> None:
+        """Attach signature page highlights to ``metadata`` when the LLM
+        answer references a known signer or signatures generally.
+
+        Mirror of the workers post-processor — see
+        ``executor.executors.answer_prompt.AnswerPromptService._attach_signature_highlights``
+        for behavior details.
+        """
+        if not signature_page_references or not signature_metadata:
+            return
+        if metadata is None or not prompt_key:
+            return
+        if not isinstance(answer, str) or not answer.strip():
+            return
+
+        page_coords: dict[str, list[int]] = {}
+        for page_str, ref in signature_page_references.items():
+            if not isinstance(ref, dict):
+                continue
+            coords = ref.get("coords")
+            if isinstance(coords, list) and len(coords) >= 4:
+                page_coords[page_str] = list(coords[:4])
+        if not page_coords:
+            return
+
+        answer_lower = answer.lower()
+        matched_pages: list[str] = []
+        for page_str, signatures in signature_metadata.items():
+            if page_str not in page_coords or not signatures:
+                continue
+            for sig in signatures:
+                if not isinstance(sig, dict):
+                    continue
+                name = (sig.get("name") or "").strip()
+                if name and name.lower() in answer_lower:
+                    matched_pages.append(page_str)
+                    break
+
+        if not matched_pages:
+            if any(
+                kw in answer_lower for kw in AnswerPromptService._SIGNATURE_KEYWORDS
+            ):
+                matched_pages = list(page_coords.keys())
+
+        if not matched_pages:
+            return
+
+        seen: set[tuple[int, ...]] = set()
+        new_coords: list[list[int]] = []
+        for page_str in matched_pages:
+            coords = page_coords[page_str]
+            key = tuple(coords)
+            if key in seen:
+                continue
+            seen.add(key)
+            new_coords.append(coords)
+
+        bucket = metadata.setdefault(PSKeys.HIGHLIGHT_DATA, {})
+        existing = bucket.get(prompt_key)
+        if not isinstance(existing, list):
+            existing = []
+        for coords in new_coords:
+            if coords not in existing:
+                existing.append(coords)
+        bucket[prompt_key] = existing
+        app.logger.info(
+            "DOC_INSIGHTS attach_signature_highlights: prompt=%s, added %d "
+            "signature highlight(s) on pages %s",
+            prompt_key,
+            len(new_coords),
+            matched_pages,
+        )
 
     @staticmethod
     def _format_signature_metadata(
diff --git a/prompt-service/src/unstract/prompt_service/services/extraction.py b/prompt-service/src/unstract/prompt_service/services/extraction.py
index 290f13f482..bd6fc9212b 100644
--- a/prompt-service/src/unstract/prompt_service/services/extraction.py
+++ b/prompt-service/src/unstract/prompt_service/services/extraction.py
@@ -69,6 +69,7 @@ def perform_extraction(
             extracted_text = process_response.extracted_text
             # Extract signature metadata if present
             signature_metadata = None
+            signature_page_references = None
             if (
                 process_response.extraction_metadata
                 and process_response.extraction_metadata.signature_metadata
@@ -81,9 +82,22 @@ def perform_extraction(
                     "for pages: %s",
                     list(signature_metadata.keys()),
                 )
+            if (
+                process_response.extraction_metadata
+                and process_response.extraction_metadata.signature_page_references
+            ):
+                signature_page_references = (
+                    process_response.extraction_metadata.signature_page_references
+                )
+                logger.info(
+                    "DOC_INSIGHTS extraction: signature_page_references "
+                    "found for pages: %s",
+                    list(signature_page_references.keys()),
+                )
             return {
                 "extracted_text": extracted_text,
                 "signature_metadata": signature_metadata,
+                "signature_page_references": signature_page_references,
             }
         except AdapterError as e:
             msg = f"Error from text extractor '{x2text.x2text_instance.get_name()}'. "
diff --git a/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py b/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py
index 984bc63a15..72c4d2f026 100644
--- a/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py
+++ b/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py
@@ -110,10 +110,17 @@ def _build_signature_page_references(
                 for sig in signatures
                 if isinstance(sig, dict)
             ]
+            coords_entry = line_metadata[line_index]
+            coords = (
+                list(coords_entry[:4])
+                if isinstance(coords_entry, list) and len(coords_entry) >= 4
+                else None
+            )
             references[page_str] = {
                 "hex": hex_value,
                 "line_metadata_index": line_index,
                 "signers": signers,
+                "coords": coords,
             }
 
         return references if references else None
diff --git a/workers/executor/executors/answer_prompt.py b/workers/executor/executors/answer_prompt.py
index 61f9f2319d..d3ba2f4749 100644
--- a/workers/executor/executors/answer_prompt.py
+++ b/workers/executor/executors/answer_prompt.py
@@ -160,7 +160,7 @@ def construct_and_run_prompt(
             signature_metadata=tool_settings.get(PSKeys.SIGNATURE_METADATA),
         )
         output[PSKeys.COMBINED_PROMPT] = prompt
-        return AnswerPromptService.run_completion(
+        answer = AnswerPromptService.run_completion(
             llm=llm,
             prompt=prompt,
             metadata=metadata,
@@ -172,6 +172,16 @@ def construct_and_run_prompt(
             execution_source=execution_source,
             process_text=process_text,
         )
+        AnswerPromptService._attach_signature_highlights(
+            answer=answer,
+            signature_metadata=tool_settings.get(PSKeys.SIGNATURE_METADATA),
+            signature_page_references=tool_settings.get(
+                PSKeys.SIGNATURE_PAGE_REFERENCES
+            ),
+            metadata=metadata,
+            prompt_key=output[PSKeys.NAME],
+        )
+        return answer
 
     @staticmethod
     def _build_grammar_notes(grammar_list: list[dict[str, Any]]) -> str:
@@ -190,6 +200,109 @@ def _build_grammar_notes(grammar_list: list[dict[str, Any]]) -> str:
                 )
         return notes
 
+    # Generic signature-related terms used as a fallback trigger when the
+    # LLM answer doesn't mention any specific signer name but does talk
+    # about signing in general (e.g. "Is this signed?" → "Yes, the document
+    # is signed."). Matched as case-insensitive substrings.
+    _SIGNATURE_KEYWORDS = (
+        "signature",
+        "signed",
+        "signatory",
+        "signatories",
+        "signing",
+        "executed",
+    )
+
+    @staticmethod
+    def _attach_signature_highlights(
+        answer: str,
+        signature_metadata: dict[str, list[Any]] | None,
+        signature_page_references: dict[str, Any] | None,
+        metadata: dict[str, Any] | None,
+        prompt_key: str | None,
+    ) -> None:
+        """Attach signature page highlights to ``metadata`` when the LLM
+        answer references a known signer or signatures generally.
+
+        - For each signer name in ``signature_metadata`` found as a
+          case-insensitive substring in ``answer``, append that page's
+          coords (from ``signature_page_references``) to
+          ``metadata[HIGHLIGHT_DATA][prompt_key]``.
+        - If no signer-name match is found but the answer mentions
+          generic signature keywords (signature, signed, signatory,
+          executed, signing), append every signature page's coords.
+
+        ``metadata[HIGHLIGHT_DATA][prompt_key]`` is mutated in place; the
+        existing list (populated by hex-comment processing in
+        ``run_completion``) is preserved and extended.
+        """
+        if not signature_page_references or not signature_metadata:
+            return
+        if metadata is None or not prompt_key:
+            return
+        if not isinstance(answer, str) or not answer.strip():
+            return
+
+        # Build page → coords map (one coord array per signature page).
+        page_coords: dict[str, list[int]] = {}
+        for page_str, ref in signature_page_references.items():
+            if not isinstance(ref, dict):
+                continue
+            coords = ref.get("coords")
+            if isinstance(coords, list) and len(coords) >= 4:
+                page_coords[page_str] = list(coords[:4])
+        if not page_coords:
+            return
+
+        answer_lower = answer.lower()
+        matched_pages: list[str] = []
+        for page_str, signatures in signature_metadata.items():
+            if page_str not in page_coords or not signatures:
+                continue
+            for sig in signatures:
+                if not isinstance(sig, dict):
+                    continue
+                name = (sig.get("name") or "").strip()
+                if name and name.lower() in answer_lower:
+                    matched_pages.append(page_str)
+                    break  # one match per page is enough
+
+        if not matched_pages:
+            # No specific signer matched — fall back to all signature pages
+            # when the answer talks about signing generically.
+            if any(kw in answer_lower for kw in AnswerPromptService._SIGNATURE_KEYWORDS):
+                matched_pages = list(page_coords.keys())
+
+        if not matched_pages:
+            return
+
+        seen: set[tuple[int, ...]] = set()
+        new_coords: list[list[int]] = []
+        for page_str in matched_pages:
+            coords = page_coords[page_str]
+            key = tuple(coords)
+            if key in seen:
+                continue
+            seen.add(key)
+            new_coords.append(coords)
+
+        bucket = metadata.setdefault(PSKeys.HIGHLIGHT_DATA, {})
+        existing = bucket.get(prompt_key)
+        if not isinstance(existing, list):
+            existing = []
+        # Avoid duplicating coords already present from hex-comment processing.
+        for coords in new_coords:
+            if coords not in existing:
+                existing.append(coords)
+        bucket[prompt_key] = existing
+        logger.info(
+            "DOC_INSIGHTS attach_signature_highlights: prompt=%s, added %d "
+            "signature highlight(s) on pages %s",
+            prompt_key,
+            len(new_coords),
+            matched_pages,
+        )
+
     @staticmethod
     def _format_signature_metadata(
         signature_metadata: dict[str, list[Any]],
diff --git a/workers/executor/executors/constants.py b/workers/executor/executors/constants.py
index 16bb364ce3..58a6b72fcd 100644
--- a/workers/executor/executors/constants.py
+++ b/workers/executor/executors/constants.py
@@ -85,6 +85,7 @@ class PromptServiceConstants:
     LINE_NUMBERS = "line_numbers"
     WHISPER_HASH = "whisper_hash"
     SIGNATURE_METADATA = "signature_metadata"
+    SIGNATURE_PAGE_REFERENCES = "signature_page_references"
     PAID_FEATURE_MSG = (
         "It is a cloud / enterprise feature. If you have purchased a plan and still "
         "face this issue, please contact support"
diff --git a/workers/executor/executors/legacy_executor.py b/workers/executor/executors/legacy_executor.py
index 1b9f20bce3..4ddccaa5be 100644
--- a/workers/executor/executors/legacy_executor.py
+++ b/workers/executor/executors/legacy_executor.py
@@ -290,6 +290,14 @@ def _handle_extract(self, context: ExecutionContext) -> ExecutionResult:
                     process_response.extraction_metadata
                     .signature_page_references,
                 )
+            self._write_signature_sidecar(
+                fs=fs,
+                output_file_path=output_file_path,
+                signature_metadata=result_data.get("signature_metadata"),
+                signature_page_references=result_data.get(
+                    "signature_page_references"
+                ),
+            )
             return ExecutionResult(
                 success=True,
                 data=result_data,
@@ -305,6 +313,54 @@ def _handle_extract(self, context: ExecutionContext) -> ExecutionResult:
             msg = f"Error from text extractor '{name}'. {e}"
             raise ExtractionError(message=msg) from e
 
+    @staticmethod
+    def _signature_sidecar_path(output_file_path: str) -> str:
+        """Sidecar JSON for document_insights signature data.
+
+        Lives next to the extracted ``.txt`` file so cache hits in
+        Prompt Studio can recover signature data without re-extracting.
+        """
+        p = Path(output_file_path)
+        return str(p.with_suffix("") ) + ".doc_insights.json"
+
+    @staticmethod
+    def _write_signature_sidecar(
+        fs: Any,
+        output_file_path: str | None,
+        signature_metadata: dict[str, Any] | None,
+        signature_page_references: dict[str, Any] | None,
+    ) -> None:
+        """Persist signature data alongside the extracted-text file.
+
+        Skipped if there's no signature data or no output path (e.g.,
+        when running without disk output).
+        """
+        if not output_file_path:
+            return
+        if not signature_metadata and not signature_page_references:
+            return
+        sidecar_path = LegacyExecutor._signature_sidecar_path(output_file_path)
+        payload = {
+            "signature_metadata": signature_metadata or {},
+            "signature_page_references": signature_page_references or {},
+        }
+        try:
+            ToolUtils.dump_json(
+                file_to_dump=sidecar_path,
+                json_to_dump=payload,
+                fs=fs,
+            )
+            logger.info(
+                "DOC_INSIGHTS sidecar: wrote signature data to %s",
+                sidecar_path,
+            )
+        except Exception as e:
+            logger.warning(
+                "DOC_INSIGHTS sidecar: failed to write %s: %s",
+                sidecar_path,
+                e,
+            )
+
     @staticmethod
     def _update_exec_metadata(
         fs: Any,
@@ -578,14 +634,25 @@ def _handle_structure_pipeline(self, context: ExecutionContext) -> ExecutionResu
             from executor.executors.constants import PromptServiceConstants as PSKeys
 
             signature_metadata = extract_result.data.get("signature_metadata")
-            if signature_metadata:
+            signature_page_references = extract_result.data.get(
+                "signature_page_references"
+            )
+            if signature_metadata or signature_page_references:
                 tool_settings = answer_params.get(PSKeys.TOOL_SETTINGS, {})
-                tool_settings[PSKeys.SIGNATURE_METADATA] = signature_metadata
+                if signature_metadata:
+                    tool_settings[PSKeys.SIGNATURE_METADATA] = signature_metadata
+                if signature_page_references:
+                    tool_settings[PSKeys.SIGNATURE_PAGE_REFERENCES] = (
+                        signature_page_references
+                    )
                 answer_params[PSKeys.TOOL_SETTINGS] = tool_settings
                 logger.info(
-                    "DOC_INSIGHTS pipeline: injected signature_metadata "
-                    "into tool_settings for pages: %s",
-                    list(signature_metadata.keys()),
+                    "DOC_INSIGHTS pipeline: injected signature data into "
+                    "tool_settings (pages=%s, refs=%s)",
+                    list(signature_metadata.keys()) if signature_metadata else [],
+                    list(signature_page_references.keys())
+                    if signature_page_references
+                    else [],
                 )
 
         # ---- Step 2: Summarize (if enabled) ----
diff --git a/workers/tests/test_answer_prompt.py b/workers/tests/test_answer_prompt.py
index 6c9fb9fce9..ca1f93b556 100644
--- a/workers/tests/test_answer_prompt.py
+++ b/workers/tests/test_answer_prompt.py
@@ -900,6 +900,179 @@ def test_construct_prompt_with_grammar(self):
         assert "sum, total" in result
 
 
+class TestAttachSignatureHighlights:
+    """Tests for the signature-highlight post-processor."""
+
+    @staticmethod
+    def _fixture_signatures():
+        """Build a minimal signature fixture set covering two pages."""
+        signature_metadata = {
+            "0": [
+                {"name": "Mr Dagan", "type": "signature", "desc": ""},
+                {"name": "Carmela Avner", "type": "signature", "desc": ""},
+            ],
+            "1": [
+                {"name": "Eve Other", "type": "signature", "desc": ""},
+            ],
+        }
+        signature_page_references = {
+            "0": {
+                "hex": "0x10",
+                "line_metadata_index": 15,
+                "signers": ["Mr Dagan", "Carmela Avner"],
+                "coords": [0, 320, 31, 3168],
+            },
+            "1": {
+                "hex": "0x20",
+                "line_metadata_index": 31,
+                "signers": ["Eve Other"],
+                "coords": [1, 100, 40, 3168],
+            },
+        }
+        return signature_metadata, signature_page_references
+
+    def test_name_match_attaches_only_matched_page(self):
+        from executor.executors.answer_prompt import AnswerPromptService
+
+        sig_meta, sig_refs = self._fixture_signatures()
+        metadata = {}
+        AnswerPromptService._attach_signature_highlights(
+            answer="The document was signed by Mr Dagan on Jan 1.",
+            signature_metadata=sig_meta,
+            signature_page_references=sig_refs,
+            metadata=metadata,
+            prompt_key="signer",
+        )
+        # Only page 0's coords (Mr Dagan) should be attached.
+        assert metadata["highlight_data"]["signer"] == [[0, 320, 31, 3168]]
+
+    def test_case_insensitive_substring_match(self):
+        from executor.executors.answer_prompt import AnswerPromptService
+
+        sig_meta, sig_refs = self._fixture_signatures()
+        metadata = {}
+        AnswerPromptService._attach_signature_highlights(
+            answer="signed by mr dagan, with sign-off from carmela avner.",
+            signature_metadata=sig_meta,
+            signature_page_references=sig_refs,
+            metadata=metadata,
+            prompt_key="signers",
+        )
+        # Both names matched but both are on page 0 → single coord, deduped.
+        assert metadata["highlight_data"]["signers"] == [[0, 320, 31, 3168]]
+
+    def test_multi_page_names_attach_distinct_coords(self):
+        from executor.executors.answer_prompt import AnswerPromptService
+
+        sig_meta, sig_refs = self._fixture_signatures()
+        metadata = {}
+        AnswerPromptService._attach_signature_highlights(
+            answer="Signed by Mr Dagan and Eve Other.",
+            signature_metadata=sig_meta,
+            signature_page_references=sig_refs,
+            metadata=metadata,
+            prompt_key="signers",
+        )
+        # Page 0 and page 1 coords both attached.
+        coords = metadata["highlight_data"]["signers"]
+        assert [0, 320, 31, 3168] in coords
+        assert [1, 100, 40, 3168] in coords
+        assert len(coords) == 2
+
+    def test_keyword_fallback_attaches_all_signature_pages(self):
+        """Generic signature mention with no name match → all pages."""
+        from executor.executors.answer_prompt import AnswerPromptService
+
+        sig_meta, sig_refs = self._fixture_signatures()
+        metadata = {}
+        AnswerPromptService._attach_signature_highlights(
+            answer="Yes, the document is signed.",
+            signature_metadata=sig_meta,
+            signature_page_references=sig_refs,
+            metadata=metadata,
+            prompt_key="is_signed",
+        )
+        coords = metadata["highlight_data"]["is_signed"]
+        assert [0, 320, 31, 3168] in coords
+        assert [1, 100, 40, 3168] in coords
+        assert len(coords) == 2
+
+    def test_no_match_no_keyword_no_op(self):
+        """Answer with neither name match nor keyword → no highlights added."""
+        from executor.executors.answer_prompt import AnswerPromptService
+
+        sig_meta, sig_refs = self._fixture_signatures()
+        metadata = {}
+        AnswerPromptService._attach_signature_highlights(
+            answer="The total amount is $42.",
+            signature_metadata=sig_meta,
+            signature_page_references=sig_refs,
+            metadata=metadata,
+            prompt_key="total",
+        )
+        assert "highlight_data" not in metadata
+
+    def test_preserves_existing_highlight_entries(self):
+        """Coords already in metadata[HIGHLIGHT_DATA][key] are kept; no dups."""
+        from executor.executors.answer_prompt import AnswerPromptService
+
+        sig_meta, sig_refs = self._fixture_signatures()
+        metadata = {
+            "highlight_data": {
+                "signer": [
+                    [9, 9, 9, 9],  # pre-existing, unrelated highlight
+                    [0, 320, 31, 3168],  # would duplicate the page-0 sig
+                ]
+            }
+        }
+        AnswerPromptService._attach_signature_highlights(
+            answer="Signed by Mr Dagan.",
+            signature_metadata=sig_meta,
+            signature_page_references=sig_refs,
+            metadata=metadata,
+            prompt_key="signer",
+        )
+        # Pre-existing entries preserved, page-0 coord not duplicated.
+        assert metadata["highlight_data"]["signer"] == [
+            [9, 9, 9, 9],
+            [0, 320, 31, 3168],
+        ]
+
+    def test_missing_inputs_no_op(self):
+        """No-op when signature data or metadata pieces are missing."""
+        from executor.executors.answer_prompt import AnswerPromptService
+
+        # No signature_metadata
+        metadata = {}
+        AnswerPromptService._attach_signature_highlights(
+            answer="signed by Mr Dagan",
+            signature_metadata=None,
+            signature_page_references={"0": {"coords": [0, 0, 0, 0]}},
+            metadata=metadata,
+            prompt_key="k",
+        )
+        assert metadata == {}
+        # No signature_page_references
+        AnswerPromptService._attach_signature_highlights(
+            answer="signed by Mr Dagan",
+            signature_metadata={"0": [{"name": "Mr Dagan"}]},
+            signature_page_references=None,
+            metadata=metadata,
+            prompt_key="k",
+        )
+        assert metadata == {}
+        # Empty/None answer
+        sig_meta, sig_refs = self._fixture_signatures()
+        AnswerPromptService._attach_signature_highlights(
+            answer="",
+            signature_metadata=sig_meta,
+            signature_page_references=sig_refs,
+            metadata=metadata,
+            prompt_key="k",
+        )
+        assert metadata == {}
+
+
 class TestVariableReplacementService:
     """Tests for the VariableReplacementService."""
 

From e5333bcc6c5f78c25e5b65b521dd5abf87fb7fdb Mon Sep 17 00:00:00 2001
From: pk-zipstack <praveen@zipstack.com>
Date: Thu, 14 May 2026 15:43:39 +0530
Subject: [PATCH 06/12] [FIX] Pick content line per page and use word-boundary
 matching for signature highlights
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two bugs surfaced when testing in Prompt Studio with a multi-page
signed PDF where the LLM answer was a single signer name.

1) Adapter was selecting unusable line_metadata entries

   _build_signature_page_references picked the first line_metadata
   entry per page, but the first entry is often an empty marker row
   like [0, 0, 0, 3168] or [1, 0, 0, 0]. Zero height makes the overlay
   invisible; zero page_height causes divide-by-zero in the frontend's
   percentage calc. Now skip entries with height <= 0 or
   page_height <= 0 and pick the first true content line.

2) Post-processor matched signer initials inside other names

   "P S" (a signer on page 0) was matching across word boundaries
   inside "Pradeep Surukanti" — the case-insensitive substring "p s"
   spans the two words, as in "Pradee[p s]urukanti". Both pages got
   highlights, and the viewer jumped to the first (wrong) page.
   Switched to a regex with \b anchors so the signer name has to
   appear as a whole token or phrase.

Added a regression test (test_short_initials_do_not_falsely_match_across_words)
that locks in the fix for this exact scenario.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../prompt_service/services/answer_prompt.py  | 11 ++++++-
 .../llm_whisperer_v2/src/llm_whisperer_v2.py  | 17 +++++++---
 workers/executor/executors/answer_prompt.py   | 11 ++++++-
 workers/tests/test_answer_prompt.py           | 33 +++++++++++++++++++
 4 files changed, 65 insertions(+), 7 deletions(-)

diff --git a/prompt-service/src/unstract/prompt_service/services/answer_prompt.py b/prompt-service/src/unstract/prompt_service/services/answer_prompt.py
index 0a4a96f333..8224441a48 100644
--- a/prompt-service/src/unstract/prompt_service/services/answer_prompt.py
+++ b/prompt-service/src/unstract/prompt_service/services/answer_prompt.py
@@ -1,4 +1,5 @@
 import ipaddress
+import re
 import socket
 from logging import Logger
 from typing import Any
@@ -219,7 +220,15 @@ def _attach_signature_highlights(
                 if not isinstance(sig, dict):
                     continue
                 name = (sig.get("name") or "").strip()
-                if name and name.lower() in answer_lower:
+                if not name:
+                    continue
+                # Word-boundary regex avoids false positives like
+                # signer "P S" matching the gap between "Pradeep" and
+                # "Surukanti" inside "Pradeep Surukanti".
+                pattern = re.compile(
+                    r"\b" + re.escape(name) + r"\b", re.IGNORECASE
+                )
+                if pattern.search(answer):
                     matched_pages.append(page_str)
                     break
 
diff --git a/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py b/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py
index 72c4d2f026..a980b5b4f0 100644
--- a/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py
+++ b/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py
@@ -81,13 +81,20 @@ def _build_signature_page_references(
                            "cannot build page references")
             return None
 
-        # Build a map of page number -> first line_metadata index
+        # Build a map of page number -> first *content* line index.
+        # Skip marker/empty rows like [0, 0, 0, 3168] or [1, 0, 0, 0]:
+        # they have zero height or zero page_height and produce an
+        # invisible overlay (and divide-by-zero in the frontend's
+        # percentage calculations).
         page_first_line: dict[int, int] = {}
         for idx, entry in enumerate(line_metadata):
-            if isinstance(entry, list) and len(entry) >= 1:
-                page = entry[0]
-                if page not in page_first_line:
-                    page_first_line[page] = idx
+            if not isinstance(entry, list) or len(entry) < 4:
+                continue
+            page, _y, height, page_height = entry[0], entry[1], entry[2], entry[3]
+            if height <= 0 or page_height <= 0:
+                continue
+            if page not in page_first_line:
+                page_first_line[page] = idx
         logger.debug(
             "DOC_INSIGHTS: page_first_line map: %s", page_first_line
         )
diff --git a/workers/executor/executors/answer_prompt.py b/workers/executor/executors/answer_prompt.py
index d3ba2f4749..d1451c8111 100644
--- a/workers/executor/executors/answer_prompt.py
+++ b/workers/executor/executors/answer_prompt.py
@@ -13,6 +13,7 @@
 import ipaddress
 import logging
 import os
+import re
 import socket
 from typing import Any
 from urllib.parse import urlparse
@@ -263,7 +264,15 @@ def _attach_signature_highlights(
                 if not isinstance(sig, dict):
                     continue
                 name = (sig.get("name") or "").strip()
-                if name and name.lower() in answer_lower:
+                if not name:
+                    continue
+                # Word-boundary regex avoids false positives like
+                # signer "P S" matching the gap between "Pradeep" and
+                # "Surukanti" inside "Pradeep Surukanti".
+                pattern = re.compile(
+                    r"\b" + re.escape(name) + r"\b", re.IGNORECASE
+                )
+                if pattern.search(answer):
                     matched_pages.append(page_str)
                     break  # one match per page is enough
 
diff --git a/workers/tests/test_answer_prompt.py b/workers/tests/test_answer_prompt.py
index ca1f93b556..e7b534591a 100644
--- a/workers/tests/test_answer_prompt.py
+++ b/workers/tests/test_answer_prompt.py
@@ -1038,6 +1038,39 @@ def test_preserves_existing_highlight_entries(self):
             [0, 320, 31, 3168],
         ]
 
+    def test_short_initials_do_not_falsely_match_across_words(self):
+        """Regression: signer "P S" must not match across "Pradeep Surukanti".
+
+        Pure substring matching incorrectly fired because "p s" appears
+        between "Pradee[p s]urukanti". Word-boundary matching prevents
+        the false positive.
+        """
+        from executor.executors.answer_prompt import AnswerPromptService
+
+        signature_metadata = {
+            "0": [
+                {"name": "P S", "type": "signature"},
+                {"name": "H S", "type": "signature"},
+            ],
+            "1": [
+                {"name": "Pradeep Surukanti", "type": "signature"},
+            ],
+        }
+        signature_page_references = {
+            "0": {"coords": [0, 100, 30, 3168]},
+            "1": {"coords": [1, 200, 30, 3168]},
+        }
+        metadata = {}
+        AnswerPromptService._attach_signature_highlights(
+            answer="Pradeep Surukanti",
+            signature_metadata=signature_metadata,
+            signature_page_references=signature_page_references,
+            metadata=metadata,
+            prompt_key="signer",
+        )
+        # Only the actual signer's page should be attached, not page 0.
+        assert metadata["highlight_data"]["signer"] == [[1, 200, 30, 3168]]
+
     def test_missing_inputs_no_op(self):
         """No-op when signature data or metadata pieces are missing."""
         from executor.executors.answer_prompt import AnswerPromptService

From df77c3eb95ffab6fd3899c06738878e2a2b77eb1 Mon Sep 17 00:00:00 2001
From: pk-zipstack <praveen@zipstack.com>
Date: Thu, 14 May 2026 17:24:24 +0530
Subject: [PATCH 07/12] [FIX] Allow signature page jumps without the
 enable_highlight toggle

Two clickability gates in the frontend prevented signature-driven page
jumps from firing when the tool's enable_highlight setting was off,
even though the backend was correctly producing highlight_data from
document_insights signature extraction.

1) TextResult only rendered the clickable Typography.Text variant when
   enableHighlight was true, so the answer stayed a plain <div>.
2) handleSelectHighlight in PromptCard.jsx returned silently when
   enable_highlight was false, so even if the click fired,
   selectedHighlight state never updated and PdfViewer never received
   a non-empty highlightData prop (jumpToPage was never called).

Both gates now also pass through when highlight_data is present, so the
signature feature works on tools that have document_insights mode on
LLMWhisperer V2 even without flipping the separate enable_highlight
toggle. Existing flows (with enable_highlight=false and no highlight
data) are unchanged because their highlight_data is empty.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../prompt-card/DisplayPromptResult.jsx       | 12 +++++-
 .../custom-tools/prompt-card/PromptCard.jsx   | 41 +++++++++++--------
 2 files changed, 35 insertions(+), 18 deletions(-)

diff --git a/frontend/src/components/custom-tools/prompt-card/DisplayPromptResult.jsx b/frontend/src/components/custom-tools/prompt-card/DisplayPromptResult.jsx
index f5f9474024..857d96f33a 100644
--- a/frontend/src/components/custom-tools/prompt-card/DisplayPromptResult.jsx
+++ b/frontend/src/components/custom-tools/prompt-card/DisplayPromptResult.jsx
@@ -415,14 +415,22 @@ const TextResult = ({
 
   const confidence = getConfidenceForText();
 
-  return enableHighlight ? (
+  // Make the answer clickable when the tool has highlighting enabled OR
+  // when the backend produced highlight_data (e.g. signature page refs
+  // from LLMWhisperer's document_insights mode), so signature highlights
+  // still work without requiring the separate enable_highlight toggle.
+  const hasHighlightData =
+    Array.isArray(highlightData) && highlightData.length > 0;
+  const isClickable = enableHighlight || hasHighlightData;
+
+  return isClickable ? (
     <Typography.Text
       wrap
       onClick={() =>
         onSelectHighlight(highlightData, promptId, profileId, confidence)
       }
       className={`prompt-output-result json-value ${
-        highlightData ? "clickable" : ""
+        hasHighlightData ? "clickable" : ""
       } ${selectedHighlight?.highlightedPrompt === promptId ? "selected" : ""}`}
     >
       {parsedOutput}
diff --git a/frontend/src/components/custom-tools/prompt-card/PromptCard.jsx b/frontend/src/components/custom-tools/prompt-card/PromptCard.jsx
index 2e579108c2..11558a5323 100644
--- a/frontend/src/components/custom-tools/prompt-card/PromptCard.jsx
+++ b/frontend/src/components/custom-tools/prompt-card/PromptCard.jsx
@@ -240,23 +240,32 @@ const PromptCard = memo(
       highlightedProfile,
       confidenceData,
     ) => {
-      if (details?.enable_highlight) {
-        const processedHighlight =
-          singlePassExtractMode &&
-          typeof highlightData === "object" &&
-          !Array.isArray(highlightData)
-            ? flattenHighlightData(highlightData)
-            : highlightData;
-
-        updateCustomTool({
-          selectedHighlight: {
-            highlight: processedHighlight,
-            highlightedPrompt: highlightedPrompt,
-            highlightedProfile: highlightedProfile,
-            confidence: confidenceData,
-          },
-        });
+      // Allow highlight state to update when the tool has highlighting
+      // enabled OR when the backend produced highlight_data (e.g.
+      // signature page refs from LLMWhisperer's document_insights mode),
+      // so signature-driven page jumps work without the separate
+      // enable_highlight toggle.
+      const hasHighlightData = Array.isArray(highlightData)
+        ? highlightData.length > 0
+        : Boolean(highlightData);
+      if (!details?.enable_highlight && !hasHighlightData) {
+        return;
       }
+      const processedHighlight =
+        singlePassExtractMode &&
+        typeof highlightData === "object" &&
+        !Array.isArray(highlightData)
+          ? flattenHighlightData(highlightData)
+          : highlightData;
+
+      updateCustomTool({
+        selectedHighlight: {
+          highlight: processedHighlight,
+          highlightedPrompt: highlightedPrompt,
+          highlightedProfile: highlightedProfile,
+          confidence: confidenceData,
+        },
+      });
     };
 
     const handleTypeChange = (value) => {

From d49d924ad83d7b05ffbd7a5f99f7eefec1d76ab3 Mon Sep 17 00:00:00 2001
From: pk-zipstack <praveen@zipstack.com>
Date: Thu, 14 May 2026 17:37:00 +0530
Subject: [PATCH 08/12] [MISC] Address pre-commit + SonarCloud findings on
 signature highlights

- Extract duplicated signature-highlight post-processor logic into a
  shared helper at ``unstract/sdk1/utils/signature_highlights.py`` and
  delegate from both workers and prompt-service. Cuts SonarCloud's
  duplication metric and brings the cognitive-complexity score below
  the gate.
- Split LLMWhispererV2._build_signature_page_references into
  _index_first_content_line_per_page + _build_page_reference_entry to
  cut its cognitive complexity below 15.
- Move the ExtractResult NamedTuple in prompt_studio_helper below the
  import block to silence ruff E402 (and drop a duplicate
  `logger = ...` assignment).
- Move the `logger = ...` line in prompt-service extraction.py below
  all imports to fix the same E402 issue.
- Apply ruff-format normalisation across the touched files.

No behaviour change. All 8 signature-highlight unit tests still pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../prompt_studio_helper.py                   |  50 +++---
 .../prompt_service/services/answer_prompt.py  | 104 +++---------
 .../prompt_service/services/extraction.py     |   7 +-
 .../llm_whisperer_v2/src/llm_whisperer_v2.py  | 113 +++++++------
 .../sdk1/utils/signature_highlights.py        | 156 ++++++++++++++++++
 workers/executor/executors/answer_prompt.py   | 116 +++----------
 workers/executor/executors/legacy_executor.py |  18 +-
 workers/tests/test_answer_prompt.py           |   5 +-
 8 files changed, 295 insertions(+), 274 deletions(-)
 create mode 100644 unstract/sdk1/src/unstract/sdk1/utils/signature_highlights.py

diff --git a/backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py b/backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py
index 9eccd1d98f..fcd9720289 100644
--- a/backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py
+++ b/backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py
@@ -6,21 +6,6 @@
 from pathlib import Path
 from typing import Any, NamedTuple
 
-
-class ExtractResult(NamedTuple):
-    """Return value of ``PromptStudioHelper.dynamic_extractor``.
-
-    ``signature_metadata`` and ``signature_page_references`` are populated
-    only when the x2text adapter is LLMWhisperer V2 in ``document_insights``
-    mode and the document contains signatures. They are read either from
-    the live extract dispatch result (cache miss) or from the on-disk
-    ``.doc_insights.json`` sidecar (cache hit).
-    """
-
-    text: str
-    signature_metadata: dict[str, Any] | None = None
-    signature_page_references: dict[str, Any] | None = None
-
 from account_v2.constants import Common
 from account_v2.models import User
 from adapter_processor_v2.constants import AdapterKeys
@@ -95,7 +80,20 @@ class ExtractResult(NamedTuple):
 CHOICES_JSON = "/static/select_choices.json"
 ERROR_MSG = "User %s doesn't have access to adapter %s"
 
-logger = logging.getLogger(__name__)
+
+class ExtractResult(NamedTuple):
+    """Return value of ``PromptStudioHelper.dynamic_extractor``.
+
+    ``signature_metadata`` and ``signature_page_references`` are populated
+    only when the x2text adapter is LLMWhisperer V2 in ``document_insights``
+    mode and the document contains signatures. They are read either from
+    the live extract dispatch result (cache miss) or from the on-disk
+    ``.doc_insights.json`` sidecar (cache hit).
+    """
+
+    text: str
+    signature_metadata: dict[str, Any] | None = None
+    signature_page_references: dict[str, Any] | None = None
 
 
 class PromptStudioHelper:
@@ -853,9 +851,7 @@ def build_fetch_response_payload(
             settings, TSPKeys.WORD_CONFIDENCE_POSTAMBLE.upper(), ""
         )
         if extract_result.signature_metadata:
-            tool_settings[TSPKeys.SIGNATURE_METADATA] = (
-                extract_result.signature_metadata
-            )
+            tool_settings[TSPKeys.SIGNATURE_METADATA] = extract_result.signature_metadata
         if extract_result.signature_page_references:
             tool_settings[TSPKeys.SIGNATURE_PAGE_REFERENCES] = (
                 extract_result.signature_page_references
@@ -1052,9 +1048,7 @@ def build_bulk_fetch_response_payload(
             settings, TSPKeys.WORD_CONFIDENCE_POSTAMBLE.upper(), ""
         )
         if extract_result.signature_metadata:
-            tool_settings[TSPKeys.SIGNATURE_METADATA] = (
-                extract_result.signature_metadata
-            )
+            tool_settings[TSPKeys.SIGNATURE_METADATA] = extract_result.signature_metadata
         if extract_result.signature_page_references:
             tool_settings[TSPKeys.SIGNATURE_PAGE_REFERENCES] = (
                 extract_result.signature_page_references
@@ -1199,9 +1193,7 @@ def build_single_pass_payload(
             TSPKeys.SIMILARITY_TOP_K: default_profile.similarity_top_k,
         }
         if extract_result.signature_metadata:
-            tool_settings[TSPKeys.SIGNATURE_METADATA] = (
-                extract_result.signature_metadata
-            )
+            tool_settings[TSPKeys.SIGNATURE_METADATA] = extract_result.signature_metadata
         if extract_result.signature_page_references:
             tool_settings[TSPKeys.SIGNATURE_PAGE_REFERENCES] = (
                 extract_result.signature_page_references
@@ -1977,9 +1969,7 @@ def _fetch_response(
             settings, TSPKeys.WORD_CONFIDENCE_POSTAMBLE.upper(), ""
         )
         if extract_result.signature_metadata:
-            tool_settings[TSPKeys.SIGNATURE_METADATA] = (
-                extract_result.signature_metadata
-            )
+            tool_settings[TSPKeys.SIGNATURE_METADATA] = extract_result.signature_metadata
         if extract_result.signature_page_references:
             tool_settings[TSPKeys.SIGNATURE_PAGE_REFERENCES] = (
                 extract_result.signature_page_references
@@ -2284,9 +2274,7 @@ def _fetch_single_pass_response(
         )
         tool_settings[TSPKeys.SIMILARITY_TOP_K] = default_profile.similarity_top_k
         if extract_result.signature_metadata:
-            tool_settings[TSPKeys.SIGNATURE_METADATA] = (
-                extract_result.signature_metadata
-            )
+            tool_settings[TSPKeys.SIGNATURE_METADATA] = extract_result.signature_metadata
         if extract_result.signature_page_references:
             tool_settings[TSPKeys.SIGNATURE_PAGE_REFERENCES] = (
                 extract_result.signature_page_references
diff --git a/prompt-service/src/unstract/prompt_service/services/answer_prompt.py b/prompt-service/src/unstract/prompt_service/services/answer_prompt.py
index 8224441a48..49a1009a67 100644
--- a/prompt-service/src/unstract/prompt_service/services/answer_prompt.py
+++ b/prompt-service/src/unstract/prompt_service/services/answer_prompt.py
@@ -1,5 +1,4 @@
 import ipaddress
-import re
 import socket
 from logging import Logger
 from typing import Any
@@ -25,6 +24,10 @@
 from unstract.sdk1.file_storage.constants import StorageType
 from unstract.sdk1.file_storage.env_helper import EnvHelper
 from unstract.sdk1.llm import LLM
+from unstract.sdk1.utils.signature_highlights import (
+    merge_into_highlight_data,
+    resolve_signature_highlight_coords,
+)
 
 
 def _is_safe_public_url(url: str) -> bool:
@@ -159,26 +162,12 @@ def construct_and_run_prompt(
         AnswerPromptService._attach_signature_highlights(
             answer=answer,
             signature_metadata=tool_settings.get(PSKeys.SIGNATURE_METADATA),
-            signature_page_references=tool_settings.get(
-                PSKeys.SIGNATURE_PAGE_REFERENCES
-            ),
+            signature_page_references=tool_settings.get(PSKeys.SIGNATURE_PAGE_REFERENCES),
             metadata=metadata,
             prompt_key=output[PSKeys.NAME],
         )
         return answer
 
-    # Generic signature-related terms used as a fallback trigger when the
-    # LLM answer doesn't mention any specific signer name but does talk
-    # about signing in general. Matched as case-insensitive substrings.
-    _SIGNATURE_KEYWORDS = (
-        "signature",
-        "signed",
-        "signatory",
-        "signatories",
-        "signing",
-        "executed",
-    )
-
     @staticmethod
     def _attach_signature_highlights(
         answer: str,
@@ -190,81 +179,30 @@ def _attach_signature_highlights(
         """Attach signature page highlights to ``metadata`` when the LLM
         answer references a known signer or signatures generally.
 
-        Mirror of the workers post-processor — see
-        ``executor.executors.answer_prompt.AnswerPromptService._attach_signature_highlights``
-        for behavior details.
+        Delegates the matching logic to
+        ``unstract.sdk1.utils.signature_highlights`` so workers and
+        prompt-service stay in sync.
         """
-        if not signature_page_references or not signature_metadata:
-            return
         if metadata is None or not prompt_key:
             return
-        if not isinstance(answer, str) or not answer.strip():
-            return
-
-        page_coords: dict[str, list[int]] = {}
-        for page_str, ref in signature_page_references.items():
-            if not isinstance(ref, dict):
-                continue
-            coords = ref.get("coords")
-            if isinstance(coords, list) and len(coords) >= 4:
-                page_coords[page_str] = list(coords[:4])
-        if not page_coords:
-            return
-
-        answer_lower = answer.lower()
-        matched_pages: list[str] = []
-        for page_str, signatures in signature_metadata.items():
-            if page_str not in page_coords or not signatures:
-                continue
-            for sig in signatures:
-                if not isinstance(sig, dict):
-                    continue
-                name = (sig.get("name") or "").strip()
-                if not name:
-                    continue
-                # Word-boundary regex avoids false positives like
-                # signer "P S" matching the gap between "Pradeep" and
-                # "Surukanti" inside "Pradeep Surukanti".
-                pattern = re.compile(
-                    r"\b" + re.escape(name) + r"\b", re.IGNORECASE
-                )
-                if pattern.search(answer):
-                    matched_pages.append(page_str)
-                    break
-
-        if not matched_pages:
-            if any(
-                kw in answer_lower for kw in AnswerPromptService._SIGNATURE_KEYWORDS
-            ):
-                matched_pages = list(page_coords.keys())
-
-        if not matched_pages:
+        new_coords = resolve_signature_highlight_coords(
+            answer=answer,
+            signature_metadata=signature_metadata,
+            signature_page_references=signature_page_references,
+        )
+        if not new_coords:
             return
-
-        seen: set[tuple[int, ...]] = set()
-        new_coords: list[list[int]] = []
-        for page_str in matched_pages:
-            coords = page_coords[page_str]
-            key = tuple(coords)
-            if key in seen:
-                continue
-            seen.add(key)
-            new_coords.append(coords)
-
-        bucket = metadata.setdefault(PSKeys.HIGHLIGHT_DATA, {})
-        existing = bucket.get(prompt_key)
-        if not isinstance(existing, list):
-            existing = []
-        for coords in new_coords:
-            if coords not in existing:
-                existing.append(coords)
-        bucket[prompt_key] = existing
+        merge_into_highlight_data(
+            metadata=metadata,
+            prompt_key=prompt_key,
+            new_coords=new_coords,
+            highlight_data_key=PSKeys.HIGHLIGHT_DATA,
+        )
         app.logger.info(
             "DOC_INSIGHTS attach_signature_highlights: prompt=%s, added %d "
-            "signature highlight(s) on pages %s",
+            "signature highlight(s)",
             prompt_key,
             len(new_coords),
-            matched_pages,
         )
 
     @staticmethod
diff --git a/prompt-service/src/unstract/prompt_service/services/extraction.py b/prompt-service/src/unstract/prompt_service/services/extraction.py
index bd6fc9212b..8b252e8b55 100644
--- a/prompt-service/src/unstract/prompt_service/services/extraction.py
+++ b/prompt-service/src/unstract/prompt_service/services/extraction.py
@@ -2,8 +2,6 @@
 from pathlib import Path
 from typing import Any
 
-logger = logging.getLogger(__name__)
-
 from unstract.prompt_service.constants import ExecutionSource
 from unstract.prompt_service.constants import IndexingConstants as IKeys
 from unstract.prompt_service.exceptions import ExtractionError
@@ -17,6 +15,8 @@
 from unstract.sdk1.utils.tool import ToolUtils
 from unstract.sdk1.x2txt import TextExtractionResult, X2Text
 
+logger = logging.getLogger(__name__)
+
 
 class ExtractionService:
     @staticmethod
@@ -78,8 +78,7 @@ def perform_extraction(
                     process_response.extraction_metadata.signature_metadata
                 )
                 logger.info(
-                    "DOC_INSIGHTS extraction: signature_metadata found "
-                    "for pages: %s",
+                    "DOC_INSIGHTS extraction: signature_metadata found " "for pages: %s",
                     list(signature_metadata.keys()),
                 )
             if (
diff --git a/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py b/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py
index a980b5b4f0..066c8a4a57 100644
--- a/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py
+++ b/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py
@@ -56,6 +56,50 @@ def get_description() -> str:
     def get_icon() -> str:
         return "/icons/adapter-icons/LLMWhispererV2.png"
 
+    @staticmethod
+    def _index_first_content_line_per_page(
+        line_metadata: list[list[int]],
+    ) -> dict[int, int]:
+        """Map each page to its first content-line index in ``line_metadata``.
+
+        Marker/empty rows like ``[0, 0, 0, 3168]`` or ``[1, 0, 0, 0]`` are
+        skipped because they have zero height or zero page_height and
+        produce an invisible overlay (and divide-by-zero in the frontend's
+        percentage calculation).
+        """
+        page_first_line: dict[int, int] = {}
+        for idx, entry in enumerate(line_metadata):
+            if not isinstance(entry, list) or len(entry) < 4:
+                continue
+            page, _y, height, page_height = entry[0], entry[1], entry[2], entry[3]
+            if height <= 0 or page_height <= 0:
+                continue
+            if page not in page_first_line:
+                page_first_line[page] = idx
+        return page_first_line
+
+    @staticmethod
+    def _build_page_reference_entry(
+        line_index: int,
+        signatures: list[Any],
+        line_metadata: list[list[int]],
+    ) -> dict[str, Any]:
+        """Build a single ``signature_page_references`` entry for one page."""
+        coords_entry = line_metadata[line_index]
+        coords = (
+            list(coords_entry[:4])
+            if isinstance(coords_entry, list) and len(coords_entry) >= 4
+            else None
+        )
+        return {
+            "hex": f"0x{line_index + 1:02X}",  # 1-indexed hex
+            "line_metadata_index": line_index,
+            "signers": [
+                sig.get("name", "Unknown") for sig in signatures if isinstance(sig, dict)
+            ],
+            "coords": coords,
+        }
+
     @staticmethod
     def _build_signature_page_references(
         signature_metadata: dict[str, list[Any]],
@@ -63,9 +107,12 @@ def _build_signature_page_references(
     ) -> dict[str, Any] | None:
         """Build page references for frontend navigation to signature pages.
 
-        For each page that has signatures, finds the first line_metadata
-        entry for that page and converts its index to a 1-indexed hex
-        value. This allows the frontend to jump to the correct page.
+        For each page that has signatures, finds the first **content**
+        line in ``line_metadata`` (skipping zero-height marker rows) and
+        emits its 1-indexed hex value plus resolved coords. The frontend
+        uses ``coords`` directly in its highlight overlay; the workers
+        executor caches the result in a sidecar JSON next to the
+        extracted text file so cached extracts retain it.
 
         Args:
             signature_metadata: Dict keyed by page number (str, 0-indexed)
@@ -73,31 +120,18 @@ def _build_signature_page_references(
             line_metadata: List of [page, y_pos, height, page_height] arrays.
 
         Returns:
-            Dict mapping page number to hex reference and signer names,
-            or None if no references could be built.
+            Dict mapping page number to ``{hex, line_metadata_index,
+            signers, coords}``, or None if no references could be built.
         """
         if not line_metadata:
-            logger.warning("DOC_INSIGHTS: no line_metadata available, "
-                           "cannot build page references")
+            logger.warning(
+                "DOC_INSIGHTS: no line_metadata available, "
+                "cannot build page references"
+            )
             return None
 
-        # Build a map of page number -> first *content* line index.
-        # Skip marker/empty rows like [0, 0, 0, 3168] or [1, 0, 0, 0]:
-        # they have zero height or zero page_height and produce an
-        # invisible overlay (and divide-by-zero in the frontend's
-        # percentage calculations).
-        page_first_line: dict[int, int] = {}
-        for idx, entry in enumerate(line_metadata):
-            if not isinstance(entry, list) or len(entry) < 4:
-                continue
-            page, _y, height, page_height = entry[0], entry[1], entry[2], entry[3]
-            if height <= 0 or page_height <= 0:
-                continue
-            if page not in page_first_line:
-                page_first_line[page] = idx
-        logger.debug(
-            "DOC_INSIGHTS: page_first_line map: %s", page_first_line
-        )
+        page_first_line = LLMWhispererV2._index_first_content_line_per_page(line_metadata)
+        logger.debug("DOC_INSIGHTS: page_first_line map: %s", page_first_line)
 
         references: dict[str, Any] = {}
         for page_str, signatures in signature_metadata.items():
@@ -106,29 +140,14 @@ def _build_signature_page_references(
             page_num = int(page_str)
             if page_num not in page_first_line:
                 logger.warning(
-                    "DOC_INSIGHTS: page %d not found in line_metadata",
-                    page_num,
+                    "DOC_INSIGHTS: page %d not found in line_metadata", page_num
                 )
                 continue
-            line_index = page_first_line[page_num]
-            hex_value = f"0x{line_index + 1:02X}"  # 1-indexed hex
-            signers = [
-                sig.get("name", "Unknown")
-                for sig in signatures
-                if isinstance(sig, dict)
-            ]
-            coords_entry = line_metadata[line_index]
-            coords = (
-                list(coords_entry[:4])
-                if isinstance(coords_entry, list) and len(coords_entry) >= 4
-                else None
+            references[page_str] = LLMWhispererV2._build_page_reference_entry(
+                line_index=page_first_line[page_num],
+                signatures=signatures,
+                line_metadata=line_metadata,
             )
-            references[page_str] = {
-                "hex": hex_value,
-                "line_metadata_index": line_index,
-                "signers": signers,
-                "coords": coords,
-            }
 
         return references if references else None
 
@@ -216,10 +235,8 @@ def process(
                 "computing page references",
                 len(raw_line_metadata),
             )
-            signature_page_references = (
-                LLMWhispererV2._build_signature_page_references(
-                    signature_metadata, raw_line_metadata
-                )
+            signature_page_references = LLMWhispererV2._build_signature_page_references(
+                signature_metadata, raw_line_metadata
             )
             logger.info(
                 "DOC_INSIGHTS: signature_page_references=%s",
diff --git a/unstract/sdk1/src/unstract/sdk1/utils/signature_highlights.py b/unstract/sdk1/src/unstract/sdk1/utils/signature_highlights.py
new file mode 100644
index 0000000000..9736754ca9
--- /dev/null
+++ b/unstract/sdk1/src/unstract/sdk1/utils/signature_highlights.py
@@ -0,0 +1,156 @@
+"""Shared helpers for surfacing LLMWhisperer signature page highlights.
+
+The workers executor and the prompt-service answer-prompt service both
+need to post-process LLM answers against the signature metadata that
+LLMWhisperer V2's ``document_insights`` mode produces. This module owns
+the matching logic so both services stay in lock-step without copy-paste
+drift.
+"""
+
+from __future__ import annotations
+
+import re
+from typing import Any
+
+# Generic signature-related terms used as a fallback trigger when the
+# LLM answer doesn't mention any specific signer name but does talk
+# about signing in general (e.g. "Is this signed?" → "Yes, the document
+# is signed."). Matched as case-insensitive substrings.
+SIGNATURE_KEYWORDS: tuple[str, ...] = (
+    "signature",
+    "signed",
+    "signatory",
+    "signatories",
+    "signing",
+    "executed",
+)
+
+
+def _build_page_coords(
+    signature_page_references: dict[str, Any],
+) -> dict[str, list[int]]:
+    """Pick the resolved coords array per signature page.
+
+    Entries lacking a ``coords`` list of four or more items are skipped.
+    """
+    page_coords: dict[str, list[int]] = {}
+    for page_str, ref in signature_page_references.items():
+        if not isinstance(ref, dict):
+            continue
+        coords = ref.get("coords")
+        if isinstance(coords, list) and len(coords) >= 4:
+            page_coords[page_str] = list(coords[:4])
+    return page_coords
+
+
+def _find_pages_matching_signers(
+    answer: str,
+    signature_metadata: dict[str, list[Any]],
+    eligible_pages: set[str],
+) -> list[str]:
+    """Return the pages whose signer names appear in ``answer``.
+
+    Each name is matched as a whole token/phrase (case-insensitive,
+    word-boundary anchored) to avoid signer initials like ``"P S"``
+    matching the gap between ``"Pradeep"`` and ``"Surukanti"`` inside
+    ``"Pradeep Surukanti"``.
+    """
+    matched: list[str] = []
+    for page_str, signatures in signature_metadata.items():
+        if page_str not in eligible_pages or not signatures:
+            continue
+        for sig in signatures:
+            if not isinstance(sig, dict):
+                continue
+            name = (sig.get("name") or "").strip()
+            if not name:
+                continue
+            pattern = re.compile(r"\b" + re.escape(name) + r"\b", re.IGNORECASE)
+            if pattern.search(answer):
+                matched.append(page_str)
+                break
+    return matched
+
+
+def _dedupe_coords(
+    matched_pages: list[str],
+    page_coords: dict[str, list[int]],
+) -> list[list[int]]:
+    """Map matched pages to their coords, preserving order and dropping dups."""
+    seen: set[tuple[int, ...]] = set()
+    new_coords: list[list[int]] = []
+    for page_str in matched_pages:
+        coords = page_coords[page_str]
+        key = tuple(coords)
+        if key in seen:
+            continue
+        seen.add(key)
+        new_coords.append(coords)
+    return new_coords
+
+
+def resolve_signature_highlight_coords(
+    answer: str,
+    signature_metadata: dict[str, list[Any]] | None,
+    signature_page_references: dict[str, Any] | None,
+) -> list[list[int]]:
+    """Return the page coords that the LLM answer should highlight.
+
+    Matching rules:
+
+    - For each signer name in ``signature_metadata`` that appears as a
+      whole word/phrase (case-insensitive) inside ``answer``, the
+      corresponding page's coords are included.
+    - When no signer name matches but the answer mentions a generic
+      signature keyword (any entry of ``SIGNATURE_KEYWORDS``, e.g.
+      ``signed`` or ``executed``), every signature page's coords are
+      included as a fallback.
+    - Returns an empty list when there's nothing to attach.
+
+    Returned coords are de-duplicated by content while preserving order.
+    """
+    if not signature_page_references or not signature_metadata:
+        return []
+    if not isinstance(answer, str) or not answer.strip():
+        return []
+
+    page_coords = _build_page_coords(signature_page_references)
+    if not page_coords:
+        return []
+
+    matched_pages = _find_pages_matching_signers(
+        answer=answer,
+        signature_metadata=signature_metadata,
+        eligible_pages=set(page_coords.keys()),
+    )
+
+    if not matched_pages and any(kw in answer.lower() for kw in SIGNATURE_KEYWORDS):
+        matched_pages = list(page_coords.keys())
+
+    if not matched_pages:
+        return []
+
+    return _dedupe_coords(matched_pages, page_coords)
+
+
+def merge_into_highlight_data(
+    metadata: dict[str, Any],
+    prompt_key: str,
+    new_coords: list[list[int]],
+    highlight_data_key: str = "highlight_data",
+) -> None:
+    """Append signature coords to ``metadata[highlight_data_key][prompt_key]``.
+
+    Skips duplicates against existing entries (e.g. those populated by
+    the hex-comment highlight pipeline). Mutates ``metadata`` in place.
+    """
+    if not new_coords:
+        return
+    bucket = metadata.setdefault(highlight_data_key, {})
+    existing = bucket.get(prompt_key)
+    if not isinstance(existing, list):
+        existing = []
+    for coords in new_coords:
+        if coords not in existing:
+            existing.append(coords)
+    bucket[prompt_key] = existing
diff --git a/workers/executor/executors/answer_prompt.py b/workers/executor/executors/answer_prompt.py
index d1451c8111..6b781cad33 100644
--- a/workers/executor/executors/answer_prompt.py
+++ b/workers/executor/executors/answer_prompt.py
@@ -13,7 +13,6 @@
 import ipaddress
 import logging
 import os
-import re
 import socket
 from typing import Any
 from urllib.parse import urlparse
@@ -21,6 +20,11 @@
 from executor.executors.constants import PromptServiceConstants as PSKeys
 from executor.executors.exceptions import LegacyExecutorError, RateLimitError
 
+from unstract.sdk1.utils.signature_highlights import (
+    merge_into_highlight_data,
+    resolve_signature_highlight_coords,
+)
+
 logger = logging.getLogger(__name__)
 
 
@@ -176,9 +180,7 @@ def construct_and_run_prompt(
         AnswerPromptService._attach_signature_highlights(
             answer=answer,
             signature_metadata=tool_settings.get(PSKeys.SIGNATURE_METADATA),
-            signature_page_references=tool_settings.get(
-                PSKeys.SIGNATURE_PAGE_REFERENCES
-            ),
+            signature_page_references=tool_settings.get(PSKeys.SIGNATURE_PAGE_REFERENCES),
             metadata=metadata,
             prompt_key=output[PSKeys.NAME],
         )
@@ -201,19 +203,6 @@ def _build_grammar_notes(grammar_list: list[dict[str, Any]]) -> str:
                 )
         return notes
 
-    # Generic signature-related terms used as a fallback trigger when the
-    # LLM answer doesn't mention any specific signer name but does talk
-    # about signing in general (e.g. "Is this signed?" → "Yes, the document
-    # is signed."). Matched as case-insensitive substrings.
-    _SIGNATURE_KEYWORDS = (
-        "signature",
-        "signed",
-        "signatory",
-        "signatories",
-        "signing",
-        "executed",
-    )
-
     @staticmethod
     def _attach_signature_highlights(
         answer: str,
@@ -225,91 +214,30 @@ def _attach_signature_highlights(
         """Attach signature page highlights to ``metadata`` when the LLM
         answer references a known signer or signatures generally.
 
-        - For each signer name in ``signature_metadata`` found as a
-          case-insensitive substring in ``answer``, append that page's
-          coords (from ``signature_page_references``) to
-          ``metadata[HIGHLIGHT_DATA][prompt_key]``.
-        - If no signer-name match is found but the answer mentions
-          generic signature keywords (signature, signed, signatory,
-          executed, signing), append every signature page's coords.
-
-        ``metadata[HIGHLIGHT_DATA][prompt_key]`` is mutated in place; the
-        existing list (populated by hex-comment processing in
-        ``run_completion``) is preserved and extended.
+        Delegates the matching logic to
+        ``unstract.sdk1.utils.signature_highlights`` so workers and
+        prompt-service stay in sync.
         """
-        if not signature_page_references or not signature_metadata:
-            return
         if metadata is None or not prompt_key:
             return
-        if not isinstance(answer, str) or not answer.strip():
-            return
-
-        # Build page → coords map (one coord array per signature page).
-        page_coords: dict[str, list[int]] = {}
-        for page_str, ref in signature_page_references.items():
-            if not isinstance(ref, dict):
-                continue
-            coords = ref.get("coords")
-            if isinstance(coords, list) and len(coords) >= 4:
-                page_coords[page_str] = list(coords[:4])
-        if not page_coords:
-            return
-
-        answer_lower = answer.lower()
-        matched_pages: list[str] = []
-        for page_str, signatures in signature_metadata.items():
-            if page_str not in page_coords or not signatures:
-                continue
-            for sig in signatures:
-                if not isinstance(sig, dict):
-                    continue
-                name = (sig.get("name") or "").strip()
-                if not name:
-                    continue
-                # Word-boundary regex avoids false positives like
-                # signer "P S" matching the gap between "Pradeep" and
-                # "Surukanti" inside "Pradeep Surukanti".
-                pattern = re.compile(
-                    r"\b" + re.escape(name) + r"\b", re.IGNORECASE
-                )
-                if pattern.search(answer):
-                    matched_pages.append(page_str)
-                    break  # one match per page is enough
-
-        if not matched_pages:
-            # No specific signer matched — fall back to all signature pages
-            # when the answer talks about signing generically.
-            if any(kw in answer_lower for kw in AnswerPromptService._SIGNATURE_KEYWORDS):
-                matched_pages = list(page_coords.keys())
-
-        if not matched_pages:
+        new_coords = resolve_signature_highlight_coords(
+            answer=answer,
+            signature_metadata=signature_metadata,
+            signature_page_references=signature_page_references,
+        )
+        if not new_coords:
             return
-
-        seen: set[tuple[int, ...]] = set()
-        new_coords: list[list[int]] = []
-        for page_str in matched_pages:
-            coords = page_coords[page_str]
-            key = tuple(coords)
-            if key in seen:
-                continue
-            seen.add(key)
-            new_coords.append(coords)
-
-        bucket = metadata.setdefault(PSKeys.HIGHLIGHT_DATA, {})
-        existing = bucket.get(prompt_key)
-        if not isinstance(existing, list):
-            existing = []
-        # Avoid duplicating coords already present from hex-comment processing.
-        for coords in new_coords:
-            if coords not in existing:
-                existing.append(coords)
-        bucket[prompt_key] = existing
+        merge_into_highlight_data(
+            metadata=metadata,
+            prompt_key=prompt_key,
+            new_coords=new_coords,
+            highlight_data_key=PSKeys.HIGHLIGHT_DATA,
+        )
         logger.info(
             "DOC_INSIGHTS attach_signature_highlights: prompt=%s, added %d "
-            "signature highlight(s) on pages %s",
+            "signature highlight(s)",
             prompt_key,
             len(new_coords),
-            matched_pages,
         )
 
     @staticmethod
diff --git a/workers/executor/executors/legacy_executor.py b/workers/executor/executors/legacy_executor.py
index 4ddccaa5be..23ffa6a873 100644
--- a/workers/executor/executors/legacy_executor.py
+++ b/workers/executor/executors/legacy_executor.py
@@ -273,30 +273,24 @@ def _handle_extract(self, context: ExecutionContext) -> ExecutionResult:
                 logger.info(
                     "DOC_INSIGHTS _handle_extract: signature_metadata found "
                     "for pages: %s",
-                    list(process_response.extraction_metadata
-                         .signature_metadata.keys()),
+                    list(process_response.extraction_metadata.signature_metadata.keys()),
                 )
             if (
                 process_response.extraction_metadata
                 and process_response.extraction_metadata.signature_page_references
             ):
                 result_data["signature_page_references"] = (
-                    process_response.extraction_metadata
-                    .signature_page_references
+                    process_response.extraction_metadata.signature_page_references
                 )
                 logger.info(
-                    "DOC_INSIGHTS _handle_extract: "
-                    "signature_page_references=%s",
-                    process_response.extraction_metadata
-                    .signature_page_references,
+                    "DOC_INSIGHTS _handle_extract: " "signature_page_references=%s",
+                    process_response.extraction_metadata.signature_page_references,
                 )
             self._write_signature_sidecar(
                 fs=fs,
                 output_file_path=output_file_path,
                 signature_metadata=result_data.get("signature_metadata"),
-                signature_page_references=result_data.get(
-                    "signature_page_references"
-                ),
+                signature_page_references=result_data.get("signature_page_references"),
             )
             return ExecutionResult(
                 success=True,
@@ -321,7 +315,7 @@ def _signature_sidecar_path(output_file_path: str) -> str:
         Prompt Studio can recover signature data without re-extracting.
         """
         p = Path(output_file_path)
-        return str(p.with_suffix("") ) + ".doc_insights.json"
+        return str(p.with_suffix("")) + ".doc_insights.json"
 
     @staticmethod
     def _write_signature_sidecar(
diff --git a/workers/tests/test_answer_prompt.py b/workers/tests/test_answer_prompt.py
index e7b534591a..708b720af7 100644
--- a/workers/tests/test_answer_prompt.py
+++ b/workers/tests/test_answer_prompt.py
@@ -11,7 +11,6 @@
 from executor.executors.constants import (
     PromptServiceConstants as PSKeys,
 )
-
 from unstract.sdk1.execution.context import ExecutionContext, Operation
 
 # ---------------------------------------------------------------------------
@@ -109,7 +108,9 @@ def _mock_deps(llm=None):
         llm = _mock_llm()
 
     # AnswerPromptService — use the real class
-    from executor.executors.answer_prompt import AnswerPromptService as answer_prompt_svc_cls
+    from executor.executors.answer_prompt import (
+        AnswerPromptService as answer_prompt_svc_cls,
+    )
 
     retrieval_svc = MagicMock(name="RetrievalService")
     retrieval_svc.run_retrieval.return_value = ["chunk1", "chunk2"]

From 4cfbc8c8a9c0c830e9a983c24905402fe6de23b3 Mon Sep 17 00:00:00 2001
From: pk-zipstack <praveen@zipstack.com>
Date: Thu, 14 May 2026 17:49:53 +0530
Subject: [PATCH 09/12] [MISC] Reduce cognitive complexity flagged by
 SonarCloud

- Extract _inject_signature_data_into_tool_settings helper in
  prompt_studio_helper.py; the 5 call sites now invoke it instead of
  inlining the if-blocks. Drops the cognitive complexity of
  build_fetch_response_payload, build_bulk_fetch_response_payload, and
  dynamic_extractor below the SonarCloud gate (15).
- Extract _capture_signature_data on LegacyExecutor for the
  signature_metadata / signature_page_references capture +
  sidecar write. _handle_extract is back below 15.
- Split _find_pages_matching_signers in signature_highlights.py into
  a helper _any_signer_matches so the outer pages walk becomes a list
  comprehension. Complexity below 15.
- Fix two implicit-string-concatenation log lines that ruff-format
  collapsed onto one line (S5799).

No behaviour change. All 8 signature-highlight unit tests still pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../prompt_studio_helper.py                   | 62 +++++++++--------
 .../prompt_service/services/extraction.py     |  2 +-
 .../sdk1/utils/signature_highlights.py        | 46 +++++++------
 workers/executor/executors/legacy_executor.py | 69 +++++++++++--------
 4 files changed, 100 insertions(+), 79 deletions(-)

diff --git a/backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py b/backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py
index 7e2e5cae4e..6273f7f363 100644
--- a/backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py
+++ b/backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py
@@ -861,12 +861,9 @@ def build_fetch_response_payload(
         tool_settings[TSPKeys.WORD_CONFIDENCE_POSTAMBLE] = getattr(
             settings, TSPKeys.WORD_CONFIDENCE_POSTAMBLE.upper(), ""
         )
-        if extract_result.signature_metadata:
-            tool_settings[TSPKeys.SIGNATURE_METADATA] = extract_result.signature_metadata
-        if extract_result.signature_page_references:
-            tool_settings[TSPKeys.SIGNATURE_PAGE_REFERENCES] = (
-                extract_result.signature_page_references
-            )
+        PromptStudioHelper._inject_signature_data_into_tool_settings(
+            tool_settings, extract_result
+        )
 
         file_hash = fs_instance.get_hash_from_file(path=extract_path)
 
@@ -1058,12 +1055,9 @@ def build_bulk_fetch_response_payload(
         tool_settings[TSPKeys.WORD_CONFIDENCE_POSTAMBLE] = getattr(
             settings, TSPKeys.WORD_CONFIDENCE_POSTAMBLE.upper(), ""
         )
-        if extract_result.signature_metadata:
-            tool_settings[TSPKeys.SIGNATURE_METADATA] = extract_result.signature_metadata
-        if extract_result.signature_page_references:
-            tool_settings[TSPKeys.SIGNATURE_PAGE_REFERENCES] = (
-                extract_result.signature_page_references
-            )
+        PromptStudioHelper._inject_signature_data_into_tool_settings(
+            tool_settings, extract_result
+        )
 
         file_hash = fs_instance.get_hash_from_file(path=extract_path)
 
@@ -1203,12 +1197,9 @@ def build_single_pass_payload(
             or TSPKeys.SIMPLE,
             TSPKeys.SIMILARITY_TOP_K: default_profile.similarity_top_k,
         }
-        if extract_result.signature_metadata:
-            tool_settings[TSPKeys.SIGNATURE_METADATA] = extract_result.signature_metadata
-        if extract_result.signature_page_references:
-            tool_settings[TSPKeys.SIGNATURE_PAGE_REFERENCES] = (
-                extract_result.signature_page_references
-            )
+        PromptStudioHelper._inject_signature_data_into_tool_settings(
+            tool_settings, extract_result
+        )
 
         lookup_configs = get_lookup_configs_for_tool(tool, prompts=prompts)
         if lookup_configs:
@@ -2009,12 +2000,9 @@ def _fetch_response(
         tool_settings[TSPKeys.WORD_CONFIDENCE_POSTAMBLE] = getattr(
             settings, TSPKeys.WORD_CONFIDENCE_POSTAMBLE.upper(), ""
         )
-        if extract_result.signature_metadata:
-            tool_settings[TSPKeys.SIGNATURE_METADATA] = extract_result.signature_metadata
-        if extract_result.signature_page_references:
-            tool_settings[TSPKeys.SIGNATURE_PAGE_REFERENCES] = (
-                extract_result.signature_page_references
-            )
+        PromptStudioHelper._inject_signature_data_into_tool_settings(
+            tool_settings, extract_result
+        )
         file_hash = fs_instance.get_hash_from_file(path=doc_path)
 
         payload = {
@@ -2314,12 +2302,9 @@ def _fetch_single_pass_response(
             default_profile.retrieval_strategy or TSPKeys.SIMPLE
         )
         tool_settings[TSPKeys.SIMILARITY_TOP_K] = default_profile.similarity_top_k
-        if extract_result.signature_metadata:
-            tool_settings[TSPKeys.SIGNATURE_METADATA] = extract_result.signature_metadata
-        if extract_result.signature_page_references:
-            tool_settings[TSPKeys.SIGNATURE_PAGE_REFERENCES] = (
-                extract_result.signature_page_references
-            )
+        PromptStudioHelper._inject_signature_data_into_tool_settings(
+            tool_settings, extract_result
+        )
         for prompt in prompts:
             if not prompt.prompt:
                 raise EmptyPromptError()
@@ -2379,6 +2364,23 @@ def get_tool_from_tool_id(tool_id: str) -> CustomTool | None:
         except CustomTool.DoesNotExist:
             return None
 
+    @staticmethod
+    def _inject_signature_data_into_tool_settings(
+        tool_settings: dict[str, Any],
+        extract_result: "ExtractResult",
+    ) -> None:
+        """Inject ``signature_metadata`` / ``signature_page_references``
+        from the extract result into ``tool_settings`` (mutated in place).
+
+        No-op when document_insights mode produced no signature data.
+        """
+        if extract_result.signature_metadata:
+            tool_settings[TSPKeys.SIGNATURE_METADATA] = extract_result.signature_metadata
+        if extract_result.signature_page_references:
+            tool_settings[TSPKeys.SIGNATURE_PAGE_REFERENCES] = (
+                extract_result.signature_page_references
+            )
+
     @staticmethod
     def _signature_sidecar_path(extract_file_path: str) -> str:
         p = Path(extract_file_path)
diff --git a/prompt-service/src/unstract/prompt_service/services/extraction.py b/prompt-service/src/unstract/prompt_service/services/extraction.py
index 8b252e8b55..f431a33648 100644
--- a/prompt-service/src/unstract/prompt_service/services/extraction.py
+++ b/prompt-service/src/unstract/prompt_service/services/extraction.py
@@ -78,7 +78,7 @@ def perform_extraction(
                     process_response.extraction_metadata.signature_metadata
                 )
                 logger.info(
-                    "DOC_INSIGHTS extraction: signature_metadata found " "for pages: %s",
+                    "DOC_INSIGHTS extraction: signature_metadata found for pages: %s",
                     list(signature_metadata.keys()),
                 )
             if (
diff --git a/unstract/sdk1/src/unstract/sdk1/utils/signature_highlights.py b/unstract/sdk1/src/unstract/sdk1/utils/signature_highlights.py
index 9736754ca9..ac140bd649 100644
--- a/unstract/sdk1/src/unstract/sdk1/utils/signature_highlights.py
+++ b/unstract/sdk1/src/unstract/sdk1/utils/signature_highlights.py
@@ -43,33 +43,39 @@ def _build_page_coords(
     return page_coords
 
 
-def _find_pages_matching_signers(
-    answer: str,
-    signature_metadata: dict[str, list[Any]],
-    eligible_pages: set[str],
-) -> list[str]:
-    """Return the pages whose signer names appear in ``answer``.
+def _any_signer_matches(signatures: list[Any], answer: str) -> bool:
+    """Return True if any signer name in ``signatures`` appears in ``answer``.
 
     Each name is matched as a whole token/phrase (case-insensitive,
     word-boundary anchored) to avoid signer initials like ``"P S"``
     matching the gap between ``"Pradeep"`` and ``"Surukanti"`` inside
     ``"Pradeep Surukanti"``.
     """
-    matched: list[str] = []
-    for page_str, signatures in signature_metadata.items():
-        if page_str not in eligible_pages or not signatures:
+    for sig in signatures:
+        if not isinstance(sig, dict):
+            continue
+        name = (sig.get("name") or "").strip()
+        if not name:
             continue
-        for sig in signatures:
-            if not isinstance(sig, dict):
-                continue
-            name = (sig.get("name") or "").strip()
-            if not name:
-                continue
-            pattern = re.compile(r"\b" + re.escape(name) + r"\b", re.IGNORECASE)
-            if pattern.search(answer):
-                matched.append(page_str)
-                break
-    return matched
+        pattern = re.compile(r"\b" + re.escape(name) + r"\b", re.IGNORECASE)
+        if pattern.search(answer):
+            return True
+    return False
+
+
+def _find_pages_matching_signers(
+    answer: str,
+    signature_metadata: dict[str, list[Any]],
+    eligible_pages: set[str],
+) -> list[str]:
+    """Return the pages whose signer names appear in ``answer``."""
+    return [
+        page_str
+        for page_str, signatures in signature_metadata.items()
+        if page_str in eligible_pages
+        and signatures
+        and _any_signer_matches(signatures, answer)
+    ]
 
 
 def _dedupe_coords(
diff --git a/workers/executor/executors/legacy_executor.py b/workers/executor/executors/legacy_executor.py
index 1d00daad26..9119850e02 100644
--- a/workers/executor/executors/legacy_executor.py
+++ b/workers/executor/executors/legacy_executor.py
@@ -305,36 +305,12 @@ def _handle_extract(self, context: ExecutionContext) -> ExecutionResult:
                 result_data["highlight_metadata"] = (
                     process_response.extraction_metadata.line_metadata
                 )
-            # Include signature metadata when available
-            # (from document_insights mode)
-            if (
-                process_response.extraction_metadata
-                and process_response.extraction_metadata.signature_metadata
-            ):
-                result_data["signature_metadata"] = (
-                    process_response.extraction_metadata.signature_metadata
-                )
-                logger.info(
-                    "DOC_INSIGHTS _handle_extract: signature_metadata found "
-                    "for pages: %s",
-                    list(process_response.extraction_metadata.signature_metadata.keys()),
-                )
-            if (
-                process_response.extraction_metadata
-                and process_response.extraction_metadata.signature_page_references
-            ):
-                result_data["signature_page_references"] = (
-                    process_response.extraction_metadata.signature_page_references
-                )
-                logger.info(
-                    "DOC_INSIGHTS _handle_extract: " "signature_page_references=%s",
-                    process_response.extraction_metadata.signature_page_references,
-                )
-            self._write_signature_sidecar(
+            # Include signature metadata when available (document_insights mode)
+            self._capture_signature_data(
                 fs=fs,
                 output_file_path=output_file_path,
-                signature_metadata=result_data.get("signature_metadata"),
-                signature_page_references=result_data.get("signature_page_references"),
+                process_response=process_response,
+                result_data=result_data,
             )
             return ExecutionResult(
                 success=True,
@@ -351,6 +327,43 @@ def _handle_extract(self, context: ExecutionContext) -> ExecutionResult:
             msg = f"Error from text extractor '{name}'. {e}"
             raise ExtractionError(message=msg) from e
 
+    def _capture_signature_data(
+        self,
+        fs: Any,
+        output_file_path: str | None,
+        process_response: TextExtractionResult,
+        result_data: dict[str, Any],
+    ) -> None:
+        """Move document_insights signature fields onto the result dict and
+        persist them in a sidecar JSON next to the extracted text file.
+
+        No-op when the adapter did not produce signature data (e.g.
+        non-LLMWhisperer-V2 adapters or modes other than ``document_insights``).
+        """
+        extraction_metadata = process_response.extraction_metadata
+        if not extraction_metadata:
+            return
+        signature_metadata = extraction_metadata.signature_metadata
+        signature_page_references = extraction_metadata.signature_page_references
+        if signature_metadata:
+            result_data["signature_metadata"] = signature_metadata
+            logger.info(
+                "DOC_INSIGHTS _handle_extract: signature_metadata found for " "pages: %s",
+                list(signature_metadata.keys()),
+            )
+        if signature_page_references:
+            result_data["signature_page_references"] = signature_page_references
+            logger.info(
+                "DOC_INSIGHTS _handle_extract: signature_page_references=%s",
+                signature_page_references,
+            )
+        self._write_signature_sidecar(
+            fs=fs,
+            output_file_path=output_file_path,
+            signature_metadata=signature_metadata,
+            signature_page_references=signature_page_references,
+        )
+
     @staticmethod
     def _signature_sidecar_path(output_file_path: str) -> str:
         """Sidecar JSON for document_insights signature data.

From fd3e2002db9c8bfa8e30550d3155a352ce14ccef Mon Sep 17 00:00:00 2001
From: pk-zipstack <praveen@zipstack.com>
Date: Thu, 14 May 2026 17:59:45 +0530
Subject: [PATCH 10/12] [MISC] Final SonarCloud cleanup: exception logging,
 complexity, string concat

- Use logger.exception() in three except blocks in prompt_studio_helper.py
  (S8572) so the traceback is always captured.
- Extract _log_signature_capture from dynamic_extractor to bring its
  cognitive complexity below 15 (S3776).
- Merge a string-pair that ruff-format collapsed onto one line in
  legacy_executor.py (S5799).

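No functional behaviour change; the two logger.error -> logger.exception
sites that previously logged without exc_info now also emit the
traceback. All 8 signature-highlight unit tests still pass.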

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../prompt_studio_helper.py                   | 36 ++++++++++++-------
 workers/executor/executors/legacy_executor.py |  2 +-
 2 files changed, 24 insertions(+), 14 deletions(-)

diff --git a/backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py b/backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py
index 6273f7f363..5b54bc58be 100644
--- a/backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py
+++ b/backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py
@@ -1651,7 +1651,7 @@ def _execute_single_prompt(
             # Validation responses are user-facing; DRF renders them as-is.
             raise
         except Exception as e:
-            logger.error(
+            logger.exception(
                 f"[{tool.tool_id}] Error while fetching response for "
                 f"prompt {id} and doc {document_id}: {e}"
             )
@@ -1719,7 +1719,7 @@ def _execute_prompts_in_single_pass(
             # Validation responses are user-facing; DRF renders them as-is.
             raise
         except Exception as e:
-            logger.error(
+            logger.exception(
                 f"[{tool.tool_id}] Error while fetching single pass response: {e}"
             )
             PromptStudioHelper._publish_log(
@@ -2213,7 +2213,7 @@ def dynamic_indexer(
                 msg = e.actual_err.response.json().get("error", str(e))
 
             msg = f"Error while indexing '{filename}'. {msg}"
-            logger.error(msg, stack_info=True, exc_info=True)
+            logger.exception(msg, stack_info=True)
             PromptStudioHelper._publish_log(
                 {"tool_id": tool_id, "run_id": run_id, "doc_name": filename},
                 LogLevels.ERROR,
@@ -2364,6 +2364,23 @@ def get_tool_from_tool_id(tool_id: str) -> CustomTool | None:
         except CustomTool.DoesNotExist:
             return None
 
+    @staticmethod
+    def _log_signature_capture(
+        signature_metadata: dict[str, Any] | None,
+        signature_page_references: dict[str, Any] | None,
+        document_id: str,
+    ) -> None:
+        """Log signature data capture from a fresh extract dispatch."""
+        if not (signature_metadata or signature_page_references):
+            return
+        logger.info(
+            "DOC_INSIGHTS dynamic_extractor: captured signature data "
+            "(pages=%s, refs=%s) for document %s",
+            list(signature_metadata.keys()) if signature_metadata else [],
+            list(signature_page_references.keys()) if signature_page_references else [],
+            document_id,
+        )
+
     @staticmethod
     def _inject_signature_data_into_tool_settings(
         tool_settings: dict[str, Any],
@@ -2526,16 +2543,9 @@ def dynamic_extractor(
         extracted_text = result.data.get("extracted_text", "")
         signature_metadata = result.data.get("signature_metadata")
         signature_page_references = result.data.get("signature_page_references")
-        if signature_metadata or signature_page_references:
-            logger.info(
-                "DOC_INSIGHTS dynamic_extractor: captured signature data "
-                "(pages=%s, refs=%s) for document %s",
-                list(signature_metadata.keys()) if signature_metadata else [],
-                list(signature_page_references.keys())
-                if signature_page_references
-                else [],
-                document_id,
-            )
+        PromptStudioHelper._log_signature_capture(
+            signature_metadata, signature_page_references, document_id
+        )
         success = PromptStudioIndexHelper.mark_extraction_status(
             document_id=document_id,
             profile_manager=profile_manager,
diff --git a/workers/executor/executors/legacy_executor.py b/workers/executor/executors/legacy_executor.py
index 9119850e02..3e571b66ec 100644
--- a/workers/executor/executors/legacy_executor.py
+++ b/workers/executor/executors/legacy_executor.py
@@ -348,7 +348,7 @@ def _capture_signature_data(
         if signature_metadata:
             result_data["signature_metadata"] = signature_metadata
             logger.info(
-                "DOC_INSIGHTS _handle_extract: signature_metadata found for " "pages: %s",
+                "DOC_INSIGHTS _handle_extract: signature_metadata found for pages: %s",
                 list(signature_metadata.keys()),
             )
         if signature_page_references:

From 108dddec9356a629699a3b125504de0ddd27f6e8 Mon Sep 17 00:00:00 2001
From: pk-zipstack <praveen@zipstack.com>
Date: Thu, 14 May 2026 18:10:48 +0530
Subject: [PATCH 11/12] [MISC] Move signature LLM-context formatter into shared
 SDK helper

Cuts the SonarCloud duplication metric: the formatter that turns
``signature_metadata`` into the ``[Document Signature Information]``
context block was the same code in both workers and prompt-service.
Now both services import ``format_signature_metadata_context`` from
``unstract.sdk1.utils.signature_highlights``.

No behaviour change. All 8 signature-highlight unit tests still pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../prompt_service/services/answer_prompt.py  | 35 +--------------
 .../sdk1/utils/signature_highlights.py        | 34 +++++++++++++++
 workers/executor/executors/answer_prompt.py   | 43 +------------------
 3 files changed, 38 insertions(+), 74 deletions(-)

diff --git a/prompt-service/src/unstract/prompt_service/services/answer_prompt.py b/prompt-service/src/unstract/prompt_service/services/answer_prompt.py
index 49a1009a67..be620238ac 100644
--- a/prompt-service/src/unstract/prompt_service/services/answer_prompt.py
+++ b/prompt-service/src/unstract/prompt_service/services/answer_prompt.py
@@ -25,6 +25,7 @@
 from unstract.sdk1.file_storage.env_helper import EnvHelper
 from unstract.sdk1.llm import LLM
 from unstract.sdk1.utils.signature_highlights import (
+    format_signature_metadata_context,
     merge_into_highlight_data,
     resolve_signature_highlight_coords,
 )
@@ -205,36 +206,6 @@ def _attach_signature_highlights(
             len(new_coords),
         )
 
-    @staticmethod
-    def _format_signature_metadata(
-        signature_metadata: dict[str, list[Any]],
-    ) -> str:
-        """Format signature metadata as a human-readable context block."""
-        lines: list[str] = []
-        for page_num, signatures in sorted(
-            signature_metadata.items(), key=lambda x: int(x[0])
-        ):
-            if not signatures:
-                continue
-            for sig in signatures:
-                name = sig.get("name", "Unknown")
-                sig_type = sig.get("type", "signature")
-                desc = sig.get("desc", "")
-                page_display = int(page_num) + 1  # 0-indexed to 1-indexed
-                entry = f"- Page {page_display}: {name} ({sig_type})"
-                if desc:
-                    entry += f" — {desc}"
-                lines.append(entry)
-        if not lines:
-            return ""
-        header = (
-            "\n\n[Document Signature Information]\n"
-            "The following signatures were detected in this document. "
-            "Use this information to answer any questions about signatories, "
-            "signing parties, or document execution status.\n"
-        )
-        return header + "\n".join(lines)
-
     @staticmethod
     def construct_prompt(
         preamble: str,
@@ -279,9 +250,7 @@ def construct_prompt(
                 "for %d page(s)",
                 len(signature_metadata),
             )
-            signature_context = AnswerPromptService._format_signature_metadata(
-                signature_metadata
-            )
+            signature_context = format_signature_metadata_context(signature_metadata)
             app.logger.debug(
                 "DOC_INSIGHTS construct_prompt: signature_context=%s",
                 signature_context[:200] if signature_context else "empty",
diff --git a/unstract/sdk1/src/unstract/sdk1/utils/signature_highlights.py b/unstract/sdk1/src/unstract/sdk1/utils/signature_highlights.py
index ac140bd649..cf29292ac5 100644
--- a/unstract/sdk1/src/unstract/sdk1/utils/signature_highlights.py
+++ b/unstract/sdk1/src/unstract/sdk1/utils/signature_highlights.py
@@ -139,6 +139,40 @@ def resolve_signature_highlight_coords(
     return _dedupe_coords(matched_pages, page_coords)
 
 
+def format_signature_metadata_context(
+    signature_metadata: dict[str, list[Any]],
+) -> str:
+    """Format ``signature_metadata`` as a human-readable LLM context block.
+
+    Returns an empty string when no signatures are present. Page numbers
+    are converted from 0-indexed to 1-indexed for display.
+    """
+    lines: list[str] = []
+    for page_num, signatures in sorted(
+        signature_metadata.items(), key=lambda x: int(x[0])
+    ):
+        if not signatures:
+            continue
+        for sig in signatures:
+            name = sig.get("name", "Unknown")
+            sig_type = sig.get("type", "signature")
+            desc = sig.get("desc", "")
+            page_display = int(page_num) + 1  # 0-indexed → 1-indexed
+            entry = f"- Page {page_display}: {name} ({sig_type})"
+            if desc:
+                entry += f" — {desc}"
+            lines.append(entry)
+    if not lines:
+        return ""
+    header = (
+        "\n\n[Document Signature Information]\n"
+        "The following signatures were detected in this document. "
+        "Use this information to answer any questions about signatories, "
+        "signing parties, or document execution status.\n"
+    )
+    return header + "\n".join(lines)
+
+
 def merge_into_highlight_data(
     metadata: dict[str, Any],
     prompt_key: str,
diff --git a/workers/executor/executors/answer_prompt.py b/workers/executor/executors/answer_prompt.py
index 06c66934f9..8bfc00bf1a 100644
--- a/workers/executor/executors/answer_prompt.py
+++ b/workers/executor/executors/answer_prompt.py
@@ -21,6 +21,7 @@
 from executor.executors.exceptions import LegacyExecutorError, RateLimitError
 
 from unstract.sdk1.utils.signature_highlights import (
+    format_signature_metadata_context,
     merge_into_highlight_data,
     resolve_signature_highlight_coords,
 )
@@ -240,44 +241,6 @@ def _attach_signature_highlights(
             len(new_coords),
         )
 
-    @staticmethod
-    def _format_signature_metadata(
-        signature_metadata: dict[str, list[Any]],
-    ) -> str:
-        """Format signature metadata as a human-readable context block.
-
-        Args:
-            signature_metadata: Dict keyed by page number (str) with lists
-                of signature entries, each having 'type', 'name', 'desc'.
-
-        Returns:
-            Formatted string for LLM context injection.
-        """
-        lines: list[str] = []
-        for page_num, signatures in sorted(
-            signature_metadata.items(), key=lambda x: int(x[0])
-        ):
-            if not signatures:
-                continue
-            for sig in signatures:
-                name = sig.get("name", "Unknown")
-                sig_type = sig.get("type", "signature")
-                desc = sig.get("desc", "")
-                page_display = int(page_num) + 1  # 0-indexed to 1-indexed
-                entry = f"- Page {page_display}: {name} ({sig_type})"
-                if desc:
-                    entry += f" — {desc}"
-                lines.append(entry)
-        if not lines:
-            return ""
-        header = (
-            "\n\n[Document Signature Information]\n"
-            "The following signatures were detected in this document. "
-            "Use this information to answer any questions about signatories, "
-            "signing parties, or document execution status.\n"
-        )
-        return header + "\n".join(lines)
-
     @staticmethod
     def construct_prompt(
         preamble: str,
@@ -310,9 +273,7 @@ def construct_prompt(
                 "for %d page(s)",
                 len(signature_metadata),
             )
-            signature_context = AnswerPromptService._format_signature_metadata(
-                signature_metadata
-            )
+            signature_context = format_signature_metadata_context(signature_metadata)
             logger.debug(
                 "DOC_INSIGHTS construct_prompt: signature_context=%s",
                 signature_context[:200] if signature_context else "empty",

From 91fd22f40739024686634cfe7685739eddcd58dc Mon Sep 17 00:00:00 2001
From: pk-zipstack <praveen@zipstack.com>
Date: Thu, 14 May 2026 18:19:56 +0530
Subject: [PATCH 12/12] [MISC] Extract SSRF webhook-URL helper into shared SDK
 module

Workers and prompt-service both had a ~50-line ``_is_safe_public_url``
helper for SSRF protection on postprocessing webhook URLs. Moved it
to ``unstract.sdk1.utils.url_safety.is_safe_public_url`` and import
from there. Brings the SonarCloud new-code duplication metric down
further without touching behaviour.

No behaviour change. All 8 signature-highlight unit tests still pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../prompt_service/services/answer_prompt.py  | 58 +---------------
 .../src/unstract/sdk1/utils/url_safety.py     | 68 +++++++++++++++++++
 workers/executor/executors/answer_prompt.py   | 59 +---------------
 3 files changed, 72 insertions(+), 113 deletions(-)
 create mode 100644 unstract/sdk1/src/unstract/sdk1/utils/url_safety.py

diff --git a/prompt-service/src/unstract/prompt_service/services/answer_prompt.py b/prompt-service/src/unstract/prompt_service/services/answer_prompt.py
index be620238ac..62503f1890 100644
--- a/prompt-service/src/unstract/prompt_service/services/answer_prompt.py
+++ b/prompt-service/src/unstract/prompt_service/services/answer_prompt.py
@@ -1,8 +1,5 @@
-import ipaddress
-import socket
 from logging import Logger
 from typing import Any
-from urllib.parse import urlparse
 
 from flask import current_app as app
 
@@ -29,58 +26,7 @@
     merge_into_highlight_data,
     resolve_signature_highlight_coords,
 )
-
-
-def _is_safe_public_url(url: str) -> bool:
-    """Validate webhook URL for SSRF protection.
-
-    Only allows HTTPS and blocks private/loopback/internal addresses.
-    Resolves all DNS records (A/AAAA) to prevent DNS rebinding attacks.
-    """
-    try:
-        p = urlparse(url)
-        if p.scheme not in ("https",):  # Only allow HTTPS for security
-            return False
-        host = p.hostname or ""
-        # Block obvious local hosts
-        if host in ("localhost",):
-            return False
-
-        addrs: set[str] = set()
-        # If literal IP, validate directly; else resolve all records (A/AAAA)
-        try:
-            ipaddress.ip_address(host)
-            addrs.add(host)
-        except ValueError:
-            try:
-                for family, _type, _proto, _canonname, sockaddr in socket.getaddrinfo(
-                    host, None, type=socket.SOCK_STREAM
-                ):
-                    addr = sockaddr[0]
-                    addrs.add(addr)
-            except Exception:
-                return False
-
-        if not addrs:
-            return False
-
-        # Validate all resolved addresses
-        for addr in addrs:
-            try:
-                ip = ipaddress.ip_address(addr)
-            except ValueError:
-                return False
-            if (
-                ip.is_private
-                or ip.is_loopback
-                or ip.is_link_local
-                or ip.is_reserved
-                or ip.is_multicast
-            ):
-                return False
-        return True
-    except Exception:
-        return False
+from unstract.sdk1.utils.url_safety import is_safe_public_url
 
 
 class AnswerPromptService:
@@ -498,7 +444,7 @@ def handle_json(
                         app.logger.warning(
                             "Postprocessing webhook enabled but URL missing; skipping."
                         )
-                    elif not _is_safe_public_url(webhook_url):
+                    elif not is_safe_public_url(webhook_url):
                         app.logger.warning(
                             "Postprocessing webhook URL is not allowed; skipping."
                         )
diff --git a/unstract/sdk1/src/unstract/sdk1/utils/url_safety.py b/unstract/sdk1/src/unstract/sdk1/utils/url_safety.py
new file mode 100644
index 0000000000..40257399cf
--- /dev/null
+++ b/unstract/sdk1/src/unstract/sdk1/utils/url_safety.py
@@ -0,0 +1,68 @@
+"""URL safety helpers (SSRF protection).
+
+Shared between the workers executor and the prompt-service answer-prompt
+service because both need to validate webhook URLs before issuing
+postprocessing callbacks.
+"""
+
+from __future__ import annotations
+
+import ipaddress
+import socket
+from urllib.parse import urlparse
+
+
+def _resolve_host_addresses(host: str) -> set[str]:
+    """Resolve a hostname or IP string to a set of IP address strings."""
+    try:
+        ipaddress.ip_address(host)
+        return {host}
+    except ValueError:
+        pass
+    try:
+        return {
+            sockaddr[0]
+            for _family, _type, _proto, _canonname, sockaddr in socket.getaddrinfo(
+                host, None, type=socket.SOCK_STREAM
+            )
+        }
+    except Exception:
+        return set()
+
+
+def is_safe_public_url(url: str) -> bool:
+    """Validate a URL for use as an outbound webhook target (SSRF protection).
+
+    Only HTTPS URLs are allowed, and the resolved host must not point to
+    a private, loopback, link-local, reserved, or multicast address.
+    All DNS records (A/AAAA) are resolved and checked. NOTE: this narrows but
+    may not close DNS rebinding if the HTTP client re-resolves the host later.
+    """
+    try:
+        p = urlparse(url)
+        if p.scheme not in ("https",):  # only HTTPS
+            return False
+        host = p.hostname or ""
+        if host == "localhost":
+            return False
+
+        addrs = _resolve_host_addresses(host)
+        if not addrs:
+            return False
+
+        for addr in addrs:
+            try:
+                ip = ipaddress.ip_address(addr)
+            except ValueError:
+                return False
+            if (
+                ip.is_private
+                or ip.is_loopback
+                or ip.is_link_local
+                or ip.is_reserved
+                or ip.is_multicast
+            ):
+                return False
+        return True
+    except Exception:
+        return False
diff --git a/workers/executor/executors/answer_prompt.py b/workers/executor/executors/answer_prompt.py
index 8bfc00bf1a..510bfbbc9b 100644
--- a/workers/executor/executors/answer_prompt.py
+++ b/workers/executor/executors/answer_prompt.py
@@ -10,12 +10,9 @@
 are integrated at the caller level (LegacyExecutor).
 """
 
-import ipaddress
 import logging
 import os
-import socket
 from typing import Any
-from urllib.parse import urlparse
 
 from executor.executors.constants import PromptServiceConstants as PSKeys
 from executor.executors.exceptions import LegacyExecutorError, RateLimitError
@@ -25,63 +22,11 @@
     merge_into_highlight_data,
     resolve_signature_highlight_coords,
 )
+from unstract.sdk1.utils.url_safety import is_safe_public_url
 
 logger = logging.getLogger(__name__)
 
 
-def _resolve_host_addresses(host: str) -> set[str]:
-    """Resolve a hostname or IP string to a set of IP address strings."""
-    try:
-        ipaddress.ip_address(host)
-        return {host}
-    except ValueError:
-        pass
-    try:
-        return {
-            sockaddr[0]
-            for _family, _type, _proto, _canonname, sockaddr in socket.getaddrinfo(
-                host, None, type=socket.SOCK_STREAM
-            )
-        }
-    except Exception:
-        return set()
-
-
-def _is_safe_public_url(url: str) -> bool:
-    """Validate webhook URL for SSRF protection.
-
-    Only allows HTTPS and blocks private/loopback/internal addresses.
-    """
-    try:
-        p = urlparse(url)
-        if p.scheme not in ("https",):
-            return False
-        host = p.hostname or ""
-        if host in ("localhost",):
-            return False
-
-        addrs = _resolve_host_addresses(host)
-        if not addrs:
-            return False
-
-        for addr in addrs:
-            try:
-                ip = ipaddress.ip_address(addr)
-            except ValueError:
-                return False
-            if (
-                ip.is_private
-                or ip.is_loopback
-                or ip.is_link_local
-                or ip.is_reserved
-                or ip.is_multicast
-            ):
-                return False
-        return True
-    except Exception:
-        return False
-
-
 class AnswerPromptService:
     @staticmethod
     def extract_variable(
@@ -359,7 +304,7 @@ def _run_webhook_postprocess(
         if not webhook_url:
             logger.warning("Postprocessing webhook enabled but URL missing; skipping.")
             return parsed_data, None
-        if not _is_safe_public_url(webhook_url):
+        if not is_safe_public_url(webhook_url):
             logger.warning("Postprocessing webhook URL is not allowed; skipping.")
             return parsed_data, None
         try: