From 4aff01e3f38c5be200704865b7279d18d2662b49 Mon Sep 17 00:00:00 2001 From: pk-zipstack Date: Tue, 7 Apr 2026 11:55:52 +0530 Subject: [PATCH 01/12] [FEAT] Add document_insights mode to LLMWhisperer V2 adapter with signature metadata in LLM context MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add document_insights as a new processing mode in the LLMWhisperer V2 adapter (Modes enum + JSON schema dropdown) - Extract signature_metadata from LLMWhisperer response when using document_insights mode and surface it in TextExtractionMetadata - Thread signature_metadata through the workers pipeline (extract → answer_params → construct_prompt) - Format signature metadata as a human-readable context block injected into the LLM prompt's Context section - Update prompt-service extraction endpoint to return signature_metadata - Mirror construct_prompt changes in prompt-service for parity Workers execution path (API deployments, workflow runs) is fully functional. Prompt-service path has endpoints ready but structure tool threading is a follow-up. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../src/unstract/prompt_service/constants.py | 1 + .../prompt_service/controllers/extraction.py | 9 +++- .../prompt_service/services/answer_prompt.py | 41 +++++++++++++++- .../prompt_service/services/extraction.py | 16 +++++- .../src/unstract/sdk1/adapters/x2text/dto.py | 1 + .../x2text/llm_whisperer_v2/src/constants.py | 2 + .../llm_whisperer_v2/src/llm_whisperer_v2.py | 15 ++++++ .../src/static/json_schema.json | 3 +- workers/executor/executors/answer_prompt.py | 49 ++++++++++++++++++- workers/executor/executors/constants.py | 1 + workers/executor/executors/legacy_executor.py | 18 +++++++ 11 files changed, 149 insertions(+), 7 deletions(-) diff --git a/prompt-service/src/unstract/prompt_service/constants.py b/prompt-service/src/unstract/prompt_service/constants.py index 9eddab8423..16bb364ce3 100644 --- a/prompt-service/src/unstract/prompt_service/constants.py +++ b/prompt-service/src/unstract/prompt_service/constants.py @@ -84,6 +84,7 @@ class PromptServiceConstants: LINE_ITEM = "line-item" LINE_NUMBERS = "line_numbers" WHISPER_HASH = "whisper_hash" + SIGNATURE_METADATA = "signature_metadata" PAID_FEATURE_MSG = ( "It is a cloud / enterprise feature. If you have purchased a plan and still " "face this issue, please contact support" diff --git a/prompt-service/src/unstract/prompt_service/controllers/extraction.py b/prompt-service/src/unstract/prompt_service/controllers/extraction.py index 516894f429..4a5d47bb91 100644 --- a/prompt-service/src/unstract/prompt_service/controllers/extraction.py +++ b/prompt-service/src/unstract/prompt_service/controllers/extraction.py @@ -36,7 +36,7 @@ def extract() -> Any: tool_exec_metadata = payload.get(IKeys.TOOL_EXECUTION_METATADA, {}) execution_run_data_folder = payload.get(IKeys.EXECUTION_DATA_DIR, "") - extracted_text = ExtractionService.perform_extraction( + extraction_result = ExtractionService.perform_extraction( file_path=file_path, x2text_instance_id=x2text_instance_id, output_file_path=output_file_path, @@ -49,5 +49,10 @@ def extract() -> Any: tool_exec_metadata=tool_exec_metadata, execution_run_data_folder=execution_run_data_folder, ) - response = {IKeys.EXTRACTED_TEXT: extracted_text} + response = { + IKeys.EXTRACTED_TEXT: extraction_result["extracted_text"], + } + signature_metadata = extraction_result.get("signature_metadata") + if signature_metadata: + response["signature_metadata"] = signature_metadata return response diff --git a/prompt-service/src/unstract/prompt_service/services/answer_prompt.py b/prompt-service/src/unstract/prompt_service/services/answer_prompt.py index 9f8cbf9c28..9ccf20abe3 100644 --- a/prompt-service/src/unstract/prompt_service/services/answer_prompt.py +++ b/prompt-service/src/unstract/prompt_service/services/answer_prompt.py @@ -141,6 +141,7 @@ def construct_and_run_prompt( platform_postamble=platform_postamble, word_confidence_postamble=word_confidence_postamble, prompt_type=prompt_type, + signature_metadata=tool_settings.get(PSKeys.SIGNATURE_METADATA), ) output[PSKeys.COMBINED_PROMPT] = prompt return AnswerPromptService.run_completion( @@ -155,6 +156,36 @@ def construct_and_run_prompt( execution_source=execution_source, ) + @staticmethod + def _format_signature_metadata( + signature_metadata: dict[str, list[Any]], + ) -> str: + """Format signature metadata as a human-readable context block.""" + lines: list[str] = [] + for page_num, signatures in sorted( + signature_metadata.items(), key=lambda x: int(x[0]) + ): + if not signatures: + continue + for sig in signatures: + name = sig.get("name", "Unknown") + sig_type = sig.get("type", "signature") + desc = sig.get("desc", "") + page_display = int(page_num) + 1 # 0-indexed to 1-indexed + entry = f"- Page {page_display}: {name} ({sig_type})" + if desc: + entry += f" — {desc}" + lines.append(entry) + if not lines: + return "" + header = ( + "\n\n[Document Signature Information]\n" + "The following signatures were detected in this document. " + "Use this information to answer any questions about signatories, " + "signing parties, or document execution status.\n" + ) + return header + "\n".join(lines) + @staticmethod def construct_prompt( preamble: str, @@ -165,6 +196,7 @@ def construct_prompt( platform_postamble: str, word_confidence_postamble: str, prompt_type: str = PSKeys.TEXT, + signature_metadata: dict[str, list[Any]] | None = None, ) -> str: prompt = f"{preamble}\n\nQuestion or Instruction: {prompt}" if grammar_list is not None and len(grammar_list) > 0: @@ -190,8 +222,15 @@ def construct_prompt( platform_postamble += "\n\n" if word_confidence_postamble: platform_postamble += f"{word_confidence_postamble}\n\n" + # Append signature metadata to context if present + signature_context = "" + if signature_metadata: + signature_context = AnswerPromptService._format_signature_metadata( + signature_metadata + ) prompt += ( - f"\n\n{postamble}\n\nContext:\n---------------\n{context}\n" + f"\n\n{postamble}\n\nContext:\n---------------\n{context}" + f"{signature_context}\n" f"-----------------\n\n{platform_postamble}Answer:" ) return prompt diff --git a/prompt-service/src/unstract/prompt_service/services/extraction.py b/prompt-service/src/unstract/prompt_service/services/extraction.py index 76430657f9..d7e9824bbb 100644 --- a/prompt-service/src/unstract/prompt_service/services/extraction.py +++ b/prompt-service/src/unstract/prompt_service/services/extraction.py @@ -30,7 +30,7 @@ def perform_extraction( execution_source: str | None = None, tool_exec_metadata: dict[str, Any] | None = None, execution_run_data_folder: str | None = None, - ) -> str: + ) -> dict[str, Any]: extracted_text = "" util = PromptServiceBaseTool(platform_key=platform_key) x2text = X2Text( @@ -64,7 +64,19 @@ def perform_extraction( fs=fs, ) extracted_text = process_response.extracted_text - return extracted_text + # Extract signature metadata if present + signature_metadata = None + if ( + process_response.extraction_metadata + and process_response.extraction_metadata.signature_metadata + ): + signature_metadata = ( + process_response.extraction_metadata.signature_metadata + ) + return { + "extracted_text": extracted_text, + "signature_metadata": signature_metadata, + } except AdapterError as e: msg = f"Error from text extractor '{x2text.x2text_instance.get_name()}'. " msg += str(e) diff --git a/unstract/sdk1/src/unstract/sdk1/adapters/x2text/dto.py b/unstract/sdk1/src/unstract/sdk1/adapters/x2text/dto.py index 95c60bbe8c..4f4a92d812 100644 --- a/unstract/sdk1/src/unstract/sdk1/adapters/x2text/dto.py +++ b/unstract/sdk1/src/unstract/sdk1/adapters/x2text/dto.py @@ -6,6 +6,7 @@ class TextExtractionMetadata: whisper_hash: str line_metadata: dict[Any, Any] | None = None + signature_metadata: dict[str, list[Any]] | None = None @dataclass diff --git a/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/constants.py b/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/constants.py index 090a3bf6f4..722e1a2f3f 100644 --- a/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/constants.py +++ b/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/constants.py @@ -7,6 +7,8 @@ class Modes(Enum): LOW_COST = "low_cost" HIGH_QUALITY = "high_quality" FORM = "form" + TABLE = "table" + DOCUMENT_INSIGHTS = "document_insights" class OutputModes(Enum): diff --git a/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py b/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py index 3a48a57647..3d1e1f6a97 100644 --- a/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py +++ b/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py @@ -10,6 +10,8 @@ TextExtractionResult, ) from unstract.sdk1.adapters.x2text.llm_whisperer_v2.src.constants import ( + Modes, + WhispererConfig, WhispererEndpoint, ) from unstract.sdk1.adapters.x2text.llm_whisperer_v2.src.dto import ( @@ -96,9 +98,22 @@ def process( fs=fs, extra_params=extra_params, ) + # Extract signature_metadata when using document_insights mode + signature_metadata = None + mode = self.config.get(WhispererConfig.MODE, Modes.FORM.value) + if mode == Modes.DOCUMENT_INSIGHTS.value: + response_metadata = response.get("metadata", {}) + signature_metadata = {} + for page_num, page_data in response_metadata.items(): + if isinstance(page_data, dict) and "signature_metadata" in page_data: + signature_metadata[page_num] = page_data["signature_metadata"] + if not any(signature_metadata.values()): + signature_metadata = None + metadata = TextExtractionMetadata( whisper_hash=response.get(X2TextConstants.WHISPER_HASH_V2, ""), line_metadata=response.get("line_metadata"), + signature_metadata=signature_metadata, ) return TextExtractionResult( diff --git a/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/static/json_schema.json b/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/static/json_schema.json index 1215ede56a..00da534c37 100644 --- a/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/static/json_schema.json +++ b/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/static/json_schema.json @@ -34,7 +34,8 @@ "low_cost", "high_quality", "form", - "table" + "table", + "document_insights" ], "default": "form", "description": "Processing mode to use, described in the [LLMWhisperer documentation](https://docs.unstract.com/llmwhisperer/llm_whisperer/apis/llm_whisperer_text_extraction_api/#modes)." diff --git a/workers/executor/executors/answer_prompt.py b/workers/executor/executors/answer_prompt.py index 89936fe598..b1a5998dde 100644 --- a/workers/executor/executors/answer_prompt.py +++ b/workers/executor/executors/answer_prompt.py @@ -157,6 +157,7 @@ def construct_and_run_prompt( platform_postamble=platform_postamble, word_confidence_postamble=word_confidence_postamble, prompt_type=prompt_type, + signature_metadata=tool_settings.get(PSKeys.SIGNATURE_METADATA), ) output[PSKeys.COMBINED_PROMPT] = prompt return AnswerPromptService.run_completion( @@ -189,6 +190,44 @@ def _build_grammar_notes(grammar_list: list[dict[str, Any]]) -> str: ) return notes + @staticmethod + def _format_signature_metadata( + signature_metadata: dict[str, list[Any]], + ) -> str: + """Format signature metadata as a human-readable context block. + + Args: + signature_metadata: Dict keyed by page number (str) with lists + of signature entries, each having 'type', 'name', 'desc'. + + Returns: + Formatted string for LLM context injection. + """ + lines: list[str] = [] + for page_num, signatures in sorted( + signature_metadata.items(), key=lambda x: int(x[0]) + ): + if not signatures: + continue + for sig in signatures: + name = sig.get("name", "Unknown") + sig_type = sig.get("type", "signature") + desc = sig.get("desc", "") + page_display = int(page_num) + 1 # 0-indexed to 1-indexed + entry = f"- Page {page_display}: {name} ({sig_type})" + if desc: + entry += f" — {desc}" + lines.append(entry) + if not lines: + return "" + header = ( + "\n\n[Document Signature Information]\n" + "The following signatures were detected in this document. " + "Use this information to answer any questions about signatories, " + "signing parties, or document execution status.\n" + ) + return header + "\n".join(lines) + @staticmethod def construct_prompt( preamble: str, @@ -199,6 +238,7 @@ def construct_prompt( platform_postamble: str, word_confidence_postamble: str, prompt_type: str = "text", + signature_metadata: dict[str, list[Any]] | None = None, ) -> str: """Build the full prompt string with preamble, grammar, postamble, context.""" prompt = f"{preamble}\n\nQuestion or Instruction: {prompt}" @@ -212,8 +252,15 @@ def construct_prompt( platform_postamble += "\n\n" if word_confidence_postamble: platform_postamble += f"{word_confidence_postamble}\n\n" + # Append signature metadata to context if present + signature_context = "" + if signature_metadata: + signature_context = AnswerPromptService._format_signature_metadata( + signature_metadata + ) prompt += ( - f"\n\n{postamble}\n\nContext:\n---------------\n{context}\n" + f"\n\n{postamble}\n\nContext:\n---------------\n{context}" + f"{signature_context}\n" f"-----------------\n\n{platform_postamble}Answer:" ) return prompt diff --git a/workers/executor/executors/constants.py b/workers/executor/executors/constants.py index 9eddab8423..16bb364ce3 100644 --- a/workers/executor/executors/constants.py +++ b/workers/executor/executors/constants.py @@ -84,6 +84,7 @@ class PromptServiceConstants: LINE_ITEM = "line-item" LINE_NUMBERS = "line_numbers" WHISPER_HASH = "whisper_hash" + SIGNATURE_METADATA = "signature_metadata" PAID_FEATURE_MSG = ( "It is a cloud / enterprise feature. If you have purchased a plan and still " "face this issue, please contact support" diff --git a/workers/executor/executors/legacy_executor.py b/workers/executor/executors/legacy_executor.py index cf33c43212..012bd89937 100644 --- a/workers/executor/executors/legacy_executor.py +++ b/workers/executor/executors/legacy_executor.py @@ -261,6 +261,15 @@ def _handle_extract(self, context: ExecutionContext) -> ExecutionResult: result_data["highlight_metadata"] = ( process_response.extraction_metadata.line_metadata ) + # Include signature metadata when available + # (from document_insights mode) + if ( + process_response.extraction_metadata + and process_response.extraction_metadata.signature_metadata + ): + result_data["signature_metadata"] = ( + process_response.extraction_metadata.signature_metadata + ) return ExecutionResult( success=True, data=result_data, @@ -536,6 +545,15 @@ def _handle_structure_pipeline(self, context: ExecutionContext) -> ExecutionResu return extract_result extracted_text = extract_result.data.get(IKeys.EXTRACTED_TEXT, "") + # Pass signature metadata to answer phase via tool_settings + from executor.executors.constants import PromptServiceConstants as PSKeys + + signature_metadata = extract_result.data.get("signature_metadata") + if signature_metadata: + tool_settings = answer_params.get(PSKeys.TOOL_SETTINGS, {}) + tool_settings[PSKeys.SIGNATURE_METADATA] = signature_metadata + answer_params[PSKeys.TOOL_SETTINGS] = tool_settings + # ---- Step 2: Summarize (if enabled) ---- if is_summarization: shim.stream_log(f"Pipeline step {step}: Summarizing extracted text...") From b982e581bd65a28cdf9099073befebf8ff479e37 Mon Sep 17 00:00:00 2001 From: pk-zipstack Date: Thu, 9 Apr 2026 16:03:19 +0530 Subject: [PATCH 02/12] [FEAT] Compute signature page references for frontend page navigation When document_insights mode detects signatures, compute page references by finding the first line_metadata entry for each page with signatures and converting to a 1-indexed hex value. This enables the frontend to navigate/jump to pages containing signatures without highlighting. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../src/unstract/sdk1/adapters/x2text/dto.py | 1 + .../llm_whisperer_v2/src/llm_whisperer_v2.py | 64 +++++++++++++++++++ 2 files changed, 65 insertions(+) diff --git a/unstract/sdk1/src/unstract/sdk1/adapters/x2text/dto.py b/unstract/sdk1/src/unstract/sdk1/adapters/x2text/dto.py index 4f4a92d812..4a0885a01f 100644 --- a/unstract/sdk1/src/unstract/sdk1/adapters/x2text/dto.py +++ b/unstract/sdk1/src/unstract/sdk1/adapters/x2text/dto.py @@ -7,6 +7,7 @@ class TextExtractionMetadata: whisper_hash: str line_metadata: dict[Any, Any] | None = None signature_metadata: dict[str, list[Any]] | None = None + signature_page_references: dict[str, Any] | None = None @dataclass diff --git a/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py b/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py index 3d1e1f6a97..7c3ce92a3a 100644 --- a/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py +++ b/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py @@ -56,6 +56,59 @@ def get_description() -> str: def get_icon() -> str: return "/icons/adapter-icons/LLMWhispererV2.png" + @staticmethod + def _build_signature_page_references( + signature_metadata: dict[str, list[Any]], + line_metadata: list[list[int]], + ) -> dict[str, Any] | None: + """Build page references for frontend navigation to signature pages. + + For each page that has signatures, finds the first line_metadata + entry for that page and converts its index to a 1-indexed hex + value. This allows the frontend to jump to the correct page. + + Args: + signature_metadata: Dict keyed by page number (str, 0-indexed) + with lists of signature entries. + line_metadata: List of [page, y_pos, height, page_height] arrays. + + Returns: + Dict mapping page number to hex reference and signer names, + or None if no references could be built. + """ + if not line_metadata: + return None + + # Build a map of page number -> first line_metadata index + page_first_line: dict[int, int] = {} + for idx, entry in enumerate(line_metadata): + if isinstance(entry, list) and len(entry) >= 1: + page = entry[0] + if page not in page_first_line: + page_first_line[page] = idx + + references: dict[str, Any] = {} + for page_str, signatures in signature_metadata.items(): + if not signatures: + continue + page_num = int(page_str) + if page_num not in page_first_line: + continue + line_index = page_first_line[page_num] + hex_value = f"0x{line_index + 1:02X}" # 1-indexed hex + signers = [ + sig.get("name", "Unknown") + for sig in signatures + if isinstance(sig, dict) + ] + references[page_str] = { + "hex": hex_value, + "line_metadata_index": line_index, + "signers": signers, + } + + return references if references else None + def test_connection(self) -> bool: LLMWhispererHelper.test_connection_request( config=self.config, @@ -110,10 +163,21 @@ def process( if not any(signature_metadata.values()): signature_metadata = None + # Compute signature page references for frontend navigation + signature_page_references = None + if signature_metadata: + raw_line_metadata = response.get("line_metadata", []) + signature_page_references = ( + LLMWhispererV2._build_signature_page_references( + signature_metadata, raw_line_metadata + ) + ) + metadata = TextExtractionMetadata( whisper_hash=response.get(X2TextConstants.WHISPER_HASH_V2, ""), line_metadata=response.get("line_metadata"), signature_metadata=signature_metadata, + signature_page_references=signature_page_references, ) return TextExtractionResult( From 2cfcede7811ac662e6d96a2005d065fcd19704e0 Mon Sep 17 00:00:00 2001 From: pk-zipstack Date: Wed, 15 Apr 2026 12:57:55 +0530 Subject: [PATCH 03/12] [MISC] Add DOC_INSIGHTS debug loggers across signature metadata flow Add loggers at key points to trace signature metadata through the pipeline: adapter extraction, workers pipeline (_handle_extract, tool_settings injection), prompt construction, and prompt-service extraction endpoint. All loggers use the DOC_INSIGHTS prefix for easy grep-filtering during UI testing. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../prompt_service/services/answer_prompt.py | 9 +++++ .../prompt_service/services/extraction.py | 8 ++++ .../llm_whisperer_v2/src/llm_whisperer_v2.py | 39 +++++++++++++++++++ workers/executor/executors/answer_prompt.py | 9 +++++ workers/executor/executors/legacy_executor.py | 25 ++++++++++++ 5 files changed, 90 insertions(+) diff --git a/prompt-service/src/unstract/prompt_service/services/answer_prompt.py b/prompt-service/src/unstract/prompt_service/services/answer_prompt.py index 9ccf20abe3..3fd69e1d91 100644 --- a/prompt-service/src/unstract/prompt_service/services/answer_prompt.py +++ b/prompt-service/src/unstract/prompt_service/services/answer_prompt.py @@ -225,9 +225,18 @@ def construct_prompt( # Append signature metadata to context if present signature_context = "" if signature_metadata: + app.logger.info( + "DOC_INSIGHTS construct_prompt: injecting signature context " + "for %d page(s)", + len(signature_metadata), + ) signature_context = AnswerPromptService._format_signature_metadata( signature_metadata ) + app.logger.debug( + "DOC_INSIGHTS construct_prompt: signature_context=%s", + signature_context[:200] if signature_context else "empty", + ) prompt += ( f"\n\n{postamble}\n\nContext:\n---------------\n{context}" f"{signature_context}\n" diff --git a/prompt-service/src/unstract/prompt_service/services/extraction.py b/prompt-service/src/unstract/prompt_service/services/extraction.py index d7e9824bbb..290f13f482 100644 --- a/prompt-service/src/unstract/prompt_service/services/extraction.py +++ b/prompt-service/src/unstract/prompt_service/services/extraction.py @@ -1,6 +1,9 @@ +import logging from pathlib import Path from typing import Any +logger = logging.getLogger(__name__) + from unstract.prompt_service.constants import ExecutionSource from unstract.prompt_service.constants import IndexingConstants as IKeys from unstract.prompt_service.exceptions import ExtractionError @@ -73,6 +76,11 @@ def perform_extraction( signature_metadata = ( process_response.extraction_metadata.signature_metadata ) + logger.info( + "DOC_INSIGHTS extraction: signature_metadata found " + "for pages: %s", + list(signature_metadata.keys()), + ) return { "extracted_text": extracted_text, "signature_metadata": signature_metadata, diff --git a/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py b/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py index 7c3ce92a3a..984bc63a15 100644 --- a/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py +++ b/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py @@ -77,6 +77,8 @@ def _build_signature_page_references( or None if no references could be built. """ if not line_metadata: + logger.warning("DOC_INSIGHTS: no line_metadata available, " + "cannot build page references") return None # Build a map of page number -> first line_metadata index @@ -86,6 +88,9 @@ def _build_signature_page_references( page = entry[0] if page not in page_first_line: page_first_line[page] = idx + logger.debug( + "DOC_INSIGHTS: page_first_line map: %s", page_first_line + ) references: dict[str, Any] = {} for page_str, signatures in signature_metadata.items(): @@ -93,6 +98,10 @@ def _build_signature_page_references( continue page_num = int(page_str) if page_num not in page_first_line: + logger.warning( + "DOC_INSIGHTS: page %d not found in line_metadata", + page_num, + ) continue line_index = page_first_line[page_num] hex_value = f"0x{line_index + 1:02X}" # 1-indexed hex @@ -154,24 +163,54 @@ def process( # Extract signature_metadata when using document_insights mode signature_metadata = None mode = self.config.get(WhispererConfig.MODE, Modes.FORM.value) + logger.info( + "DOC_INSIGHTS: mode=%s, is_document_insights=%s", + mode, + mode == Modes.DOCUMENT_INSIGHTS.value, + ) if mode == Modes.DOCUMENT_INSIGHTS.value: response_metadata = response.get("metadata", {}) + logger.info( + "DOC_INSIGHTS: response has metadata keys: %s", + list(response_metadata.keys()) if response_metadata else "None", + ) signature_metadata = {} for page_num, page_data in response_metadata.items(): if isinstance(page_data, dict) and "signature_metadata" in page_data: signature_metadata[page_num] = page_data["signature_metadata"] + logger.info( + "DOC_INSIGHTS: page %s has %d signature(s): %s", + page_num, + len(page_data["signature_metadata"]), + [s.get("name") for s in page_data["signature_metadata"]], + ) if not any(signature_metadata.values()): + logger.info("DOC_INSIGHTS: no signatures found across any page") signature_metadata = None + else: + logger.info( + "DOC_INSIGHTS: signature_metadata extracted for pages: %s", + list(signature_metadata.keys()), + ) # Compute signature page references for frontend navigation signature_page_references = None if signature_metadata: raw_line_metadata = response.get("line_metadata", []) + logger.info( + "DOC_INSIGHTS: line_metadata has %d entries, " + "computing page references", + len(raw_line_metadata), + ) signature_page_references = ( LLMWhispererV2._build_signature_page_references( signature_metadata, raw_line_metadata ) ) + logger.info( + "DOC_INSIGHTS: signature_page_references=%s", + signature_page_references, + ) metadata = TextExtractionMetadata( whisper_hash=response.get(X2TextConstants.WHISPER_HASH_V2, ""), diff --git a/workers/executor/executors/answer_prompt.py b/workers/executor/executors/answer_prompt.py index b1a5998dde..163ea294a1 100644 --- a/workers/executor/executors/answer_prompt.py +++ b/workers/executor/executors/answer_prompt.py @@ -255,9 +255,18 @@ def construct_prompt( # Append signature metadata to context if present signature_context = "" if signature_metadata: + logger.info( + "DOC_INSIGHTS construct_prompt: injecting signature context " + "for %d page(s)", + len(signature_metadata), + ) signature_context = AnswerPromptService._format_signature_metadata( signature_metadata ) + logger.debug( + "DOC_INSIGHTS construct_prompt: signature_context=%s", + signature_context[:200] if signature_context else "empty", + ) prompt += ( f"\n\n{postamble}\n\nContext:\n---------------\n{context}" f"{signature_context}\n" diff --git a/workers/executor/executors/legacy_executor.py b/workers/executor/executors/legacy_executor.py index 012bd89937..08951b3446 100644 --- a/workers/executor/executors/legacy_executor.py +++ b/workers/executor/executors/legacy_executor.py @@ -270,6 +270,26 @@ def _handle_extract(self, context: ExecutionContext) -> ExecutionResult: result_data["signature_metadata"] = ( process_response.extraction_metadata.signature_metadata ) + logger.info( + "DOC_INSIGHTS _handle_extract: signature_metadata found " + "for pages: %s", + list(process_response.extraction_metadata + .signature_metadata.keys()), + ) + if ( + process_response.extraction_metadata + and process_response.extraction_metadata.signature_page_references + ): + result_data["signature_page_references"] = ( + process_response.extraction_metadata + .signature_page_references + ) + logger.info( + "DOC_INSIGHTS _handle_extract: " + "signature_page_references=%s", + process_response.extraction_metadata + .signature_page_references, + ) return ExecutionResult( success=True, data=result_data, @@ -553,6 +573,11 @@ def _handle_structure_pipeline(self, context: ExecutionContext) -> ExecutionResu tool_settings = answer_params.get(PSKeys.TOOL_SETTINGS, {}) tool_settings[PSKeys.SIGNATURE_METADATA] = signature_metadata answer_params[PSKeys.TOOL_SETTINGS] = tool_settings + logger.info( + "DOC_INSIGHTS pipeline: injected signature_metadata " + "into tool_settings for pages: %s", + list(signature_metadata.keys()), + ) # ---- Step 2: Summarize (if enabled) ---- if is_summarization: From cc63bdd375dfde3186e6c855295bdeee49873d8d Mon Sep 17 00:00:00 2001 From: pk-zipstack Date: Wed, 15 Apr 2026 16:42:55 +0530 Subject: [PATCH 04/12] [FIX] Allow empty user_id in indexing-status internal endpoint user_id may be empty for mock auth users (default OSS setup). It's only used as a Redis cache key fragment, so empty values are acceptable. Dropping user_id from the required-fields validation unblocks indexing for these users. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../prompt_studio/prompt_studio_core_v2/internal_views.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/backend/prompt_studio/prompt_studio_core_v2/internal_views.py b/backend/prompt_studio/prompt_studio_core_v2/internal_views.py index 3ad3a5db16..8c3391f838 100644 --- a/backend/prompt_studio/prompt_studio_core_v2/internal_views.py +++ b/backend/prompt_studio/prompt_studio_core_v2/internal_views.py @@ -244,11 +244,13 @@ def indexing_status(request): user_id = data.get("user_id", "") doc_id_key = data.get("doc_id_key", "") - if not action or not org_id or not user_id or not doc_id_key: + # user_id may be empty (e.g. mock auth users) - it's only used as a + # Redis cache key fragment, so empty is acceptable. + if not action or not org_id or not doc_id_key: return JsonResponse( { "success": False, - "error": "action, org_id, user_id, doc_id_key are required", + "error": "action, org_id, doc_id_key are required", }, status=status.HTTP_400_BAD_REQUEST, ) From ea011a480fd3da7b97d9fce7a07d4b2e5f5e12a7 Mon Sep 17 00:00:00 2001 From: pk-zipstack Date: Thu, 14 May 2026 14:43:03 +0530 Subject: [PATCH 05/12] [FEAT] Surface signature page highlights in Prompt Studio for document_insights mode Wire the signature data captured by the LLMWhisperer V2 adapter through to Prompt Studio's existing highlight pipeline, so clicking a signer- related answer jumps the PDF viewer to the page containing the signature without any frontend changes. - Adapter: signature_page_references now also carries resolved coords [page, y, height, page_height] alongside the existing hex / line index. - Workers _handle_extract writes a .doc_insights.json sidecar so Prompt Studio cache hits don't lose signature data; the pipeline path threads signature_page_references into tool_settings alongside signature_metadata. - AnswerPromptService._attach_signature_highlights (mirrored in workers and prompt-service) scans the LLM answer for signer names (case- insensitive substring) and appends the matching page coords to metadata[HIGHLIGHT_DATA][prompt_key]. Falls back to all signature pages when the answer mentions signing generically. De-dupes against hex-comment highlights. - Prompt Studio backend: dynamic_extractor now returns ExtractResult (text + signature_metadata + signature_page_references), reading the sidecar on cache hits. All five answer_prompt dispatch sites inject the signature data into tool_settings. - Prompt-service: extraction service + controller surface signature_page_references for parity. - Tests: 7 new unit tests for _attach_signature_highlights covering name match, multi-page coords, keyword fallback, no-op cases, and preservation/dedup of existing highlight entries. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../prompt_studio_core_v2/constants.py | 2 + .../prompt_studio_helper.py | 146 ++++++++++++++- .../src/unstract/prompt_service/constants.py | 1 + .../prompt_service/controllers/extraction.py | 3 + .../prompt_service/services/answer_prompt.py | 104 ++++++++++- .../prompt_service/services/extraction.py | 14 ++ .../llm_whisperer_v2/src/llm_whisperer_v2.py | 7 + workers/executor/executors/answer_prompt.py | 115 +++++++++++- workers/executor/executors/constants.py | 1 + workers/executor/executors/legacy_executor.py | 77 +++++++- workers/tests/test_answer_prompt.py | 173 ++++++++++++++++++ 11 files changed, 626 insertions(+), 17 deletions(-) diff --git a/backend/prompt_studio/prompt_studio_core_v2/constants.py b/backend/prompt_studio/prompt_studio_core_v2/constants.py index c1ab14d380..fa7eea68fc 100644 --- a/backend/prompt_studio/prompt_studio_core_v2/constants.py +++ b/backend/prompt_studio/prompt_studio_core_v2/constants.py @@ -104,6 +104,8 @@ class ToolStudioPromptKeys: EXECUTION_SOURCE = "execution_source" LINE_ITEM = "line-item" CUSTOM_DATA = "custom_data" + SIGNATURE_METADATA = "signature_metadata" + SIGNATURE_PAGE_REFERENCES = "signature_page_references" # Webhook postprocessing settings ENABLE_POSTPROCESSING_WEBHOOK = "enable_postprocessing_webhook" POSTPROCESSING_WEBHOOK_URL = "postprocessing_webhook_url" diff --git a/backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py b/backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py index d0ffef3114..9eccd1d98f 100644 --- a/backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py +++ b/backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py @@ -4,7 +4,22 @@ import time import uuid from pathlib import Path -from typing import Any +from typing import Any, NamedTuple + + +class ExtractResult(NamedTuple): + """Return value of ``PromptStudioHelper.dynamic_extractor``. + + ``signature_metadata`` and ``signature_page_references`` are populated + only when the x2text adapter is LLMWhisperer V2 in ``document_insights`` + mode and the document contains signatures. They are read either from + the live extract dispatch result (cache miss) or from the on-disk + ``.doc_insights.json`` sidecar (cache hit). + """ + + text: str + signature_metadata: dict[str, Any] | None = None + signature_page_references: dict[str, Any] | None = None from account_v2.constants import Common from account_v2.models import User @@ -734,7 +749,7 @@ def build_fetch_response_payload( ) # Extract (blocking, usually cached) - extracted_text = PromptStudioHelper.dynamic_extractor( + extract_result = PromptStudioHelper.dynamic_extractor( profile_manager=profile_manager, file_path=file_path, org_id=org_id, @@ -742,6 +757,7 @@ def build_fetch_response_payload( run_id=run_id, enable_highlight=tool.enable_highlight, ) + extracted_text = extract_result.text is_summary = tool.summarize_as_source if is_summary: @@ -836,6 +852,14 @@ def build_fetch_response_payload( tool_settings[TSPKeys.WORD_CONFIDENCE_POSTAMBLE] = getattr( settings, TSPKeys.WORD_CONFIDENCE_POSTAMBLE.upper(), "" ) + if extract_result.signature_metadata: + tool_settings[TSPKeys.SIGNATURE_METADATA] = ( + extract_result.signature_metadata + ) + if extract_result.signature_page_references: + tool_settings[TSPKeys.SIGNATURE_PAGE_REFERENCES] = ( + extract_result.signature_page_references + ) file_hash = fs_instance.get_hash_from_file(path=extract_path) @@ -951,7 +975,7 @@ def build_bulk_fetch_response_payload( ) # Extract ONCE (blocking, usually cached) - extracted_text = PromptStudioHelper.dynamic_extractor( + extract_result = PromptStudioHelper.dynamic_extractor( profile_manager=profile_manager, file_path=file_path, org_id=org_id, @@ -959,6 +983,7 @@ def build_bulk_fetch_response_payload( run_id=run_id, enable_highlight=tool.enable_highlight, ) + extracted_text = extract_result.text is_summary = tool.summarize_as_source if is_summary: @@ -1026,6 +1051,14 @@ def build_bulk_fetch_response_payload( tool_settings[TSPKeys.WORD_CONFIDENCE_POSTAMBLE] = getattr( settings, TSPKeys.WORD_CONFIDENCE_POSTAMBLE.upper(), "" ) + if extract_result.signature_metadata: + tool_settings[TSPKeys.SIGNATURE_METADATA] = ( + extract_result.signature_metadata + ) + if extract_result.signature_page_references: + tool_settings[TSPKeys.SIGNATURE_PAGE_REFERENCES] = ( + extract_result.signature_page_references + ) file_hash = fs_instance.get_hash_from_file(path=extract_path) @@ -1126,7 +1159,7 @@ def build_single_pass_payload( ) # Extract (blocking, usually cached) - PromptStudioHelper.dynamic_extractor( + extract_result = PromptStudioHelper.dynamic_extractor( profile_manager=default_profile, file_path=doc_path, org_id=org_id, @@ -1165,6 +1198,14 @@ def build_single_pass_payload( or TSPKeys.SIMPLE, TSPKeys.SIMILARITY_TOP_K: default_profile.similarity_top_k, } + if extract_result.signature_metadata: + tool_settings[TSPKeys.SIGNATURE_METADATA] = ( + extract_result.signature_metadata + ) + if extract_result.signature_page_references: + tool_settings[TSPKeys.SIGNATURE_PAGE_REFERENCES] = ( + extract_result.signature_page_references + ) for p in prompts: if not p.prompt: @@ -1366,7 +1407,7 @@ def index_document( tool=util, ) - extracted_text = PromptStudioHelper.dynamic_extractor( + extract_result = PromptStudioHelper.dynamic_extractor( profile_manager=default_profile, file_path=file_path, org_id=org_id, @@ -1374,6 +1415,7 @@ def index_document( run_id=run_id, enable_highlight=tool.enable_highlight, ) + extracted_text = extract_result.text if tool.summarize_context: summarize_file_path = PromptStudioHelper.summarize( file_name, org_id, run_id, tool @@ -1817,7 +1859,7 @@ def _fetch_response( tool=util, ) logger.info(f"Extracting text from {file_path} for {doc_id}") - extracted_text = PromptStudioHelper.dynamic_extractor( + extract_result = PromptStudioHelper.dynamic_extractor( profile_manager=profile_manager, file_path=file_path, org_id=org_id, @@ -1825,6 +1867,7 @@ def _fetch_response( run_id=run_id, enable_highlight=tool.enable_highlight, ) + extracted_text = extract_result.text logger.info(f"Extracted text from {file_path} for {doc_id}") if is_summary: profile_manager.chunk_size = 0 @@ -1933,6 +1976,14 @@ def _fetch_response( tool_settings[TSPKeys.WORD_CONFIDENCE_POSTAMBLE] = getattr( settings, TSPKeys.WORD_CONFIDENCE_POSTAMBLE.upper(), "" ) + if extract_result.signature_metadata: + tool_settings[TSPKeys.SIGNATURE_METADATA] = ( + extract_result.signature_metadata + ) + if extract_result.signature_page_references: + tool_settings[TSPKeys.SIGNATURE_PAGE_REFERENCES] = ( + extract_result.signature_page_references + ) file_hash = fs_instance.get_hash_from_file(path=doc_path) payload = { @@ -2194,7 +2245,7 @@ def _fetch_single_pass_response( file_path = os.path.join( directory, "extract", os.path.splitext(filename)[0] + ".txt" ) - PromptStudioHelper.dynamic_extractor( + extract_result = PromptStudioHelper.dynamic_extractor( profile_manager=default_profile, file_path=input_file_path, org_id=org_id, @@ -2232,6 +2283,14 @@ def _fetch_single_pass_response( default_profile.retrieval_strategy or TSPKeys.SIMPLE ) tool_settings[TSPKeys.SIMILARITY_TOP_K] = default_profile.similarity_top_k + if extract_result.signature_metadata: + tool_settings[TSPKeys.SIGNATURE_METADATA] = ( + extract_result.signature_metadata + ) + if extract_result.signature_page_references: + tool_settings[TSPKeys.SIGNATURE_PAGE_REFERENCES] = ( + extract_result.signature_page_references + ) for prompt in prompts: if not prompt.prompt: raise EmptyPromptError() @@ -2291,6 +2350,49 @@ def get_tool_from_tool_id(tool_id: str) -> CustomTool | None: except CustomTool.DoesNotExist: return None + @staticmethod + def _signature_sidecar_path(extract_file_path: str) -> str: + p = Path(extract_file_path) + return str(p.with_suffix("")) + ".doc_insights.json" + + @staticmethod + def _load_signature_sidecar( + extract_file_path: str, + fs_instance: Any, + ) -> tuple[dict[str, Any] | None, dict[str, Any] | None]: + """Return ``(signature_metadata, signature_page_references)`` from the + sidecar, or ``(None, None)`` if the sidecar is missing or unreadable. + + Signature data is only written by the executor when a document + contains signatures in document_insights mode; cache-hit calls + for documents extracted in other modes legitimately have no + sidecar, so absence is not an error. + """ + sidecar_path = PromptStudioHelper._signature_sidecar_path(extract_file_path) + try: + raw = fs_instance.read(path=sidecar_path, mode="r") + except FileNotFoundError: + return None, None + except Exception as e: + logger.warning( + "DOC_INSIGHTS sidecar: failed to read %s: %s", + sidecar_path, + e, + ) + return None, None + try: + data = json.loads(raw) + except (TypeError, ValueError) as e: + logger.warning( + "DOC_INSIGHTS sidecar: failed to parse %s: %s", + sidecar_path, + e, + ) + return None, None + sig_meta = data.get("signature_metadata") or None + sig_refs = data.get("signature_page_references") or None + return sig_meta, sig_refs + @staticmethod def dynamic_extractor( file_path: str, @@ -2299,7 +2401,7 @@ def dynamic_extractor( org_id: str, profile_manager: ProfileManager, document_id: str, - ) -> str: + ) -> ExtractResult: # Guard against None metadata (when adapter_metadata_b is None) metadata = profile_manager.x2text.metadata or {} x2text_config_hash = ToolUtils.hash_str(json.dumps(metadata, sort_keys=True)) @@ -2329,7 +2431,15 @@ def dynamic_extractor( try: extracted_text = fs_instance.read(path=extract_file_path, mode="r") logger.info("Extracted text found. Reading from file..") - return extracted_text + sig_meta, sig_refs = PromptStudioHelper._load_signature_sidecar( + extract_file_path=extract_file_path, + fs_instance=fs_instance, + ) + return ExtractResult( + text=extracted_text, + signature_metadata=sig_meta, + signature_page_references=sig_refs, + ) except FileNotFoundError as e: logger.warning( f"File not found for extraction. {extract_file_path}. {e}" @@ -2383,6 +2493,18 @@ def dynamic_extractor( ) extracted_text = result.data.get("extracted_text", "") + signature_metadata = result.data.get("signature_metadata") + signature_page_references = result.data.get("signature_page_references") + if signature_metadata or signature_page_references: + logger.info( + "DOC_INSIGHTS dynamic_extractor: captured signature data " + "(pages=%s, refs=%s) for document %s", + list(signature_metadata.keys()) if signature_metadata else [], + list(signature_page_references.keys()) + if signature_page_references + else [], + document_id, + ) success = PromptStudioIndexHelper.mark_extraction_status( document_id=document_id, profile_manager=profile_manager, @@ -2395,7 +2517,11 @@ def dynamic_extractor( f"Extraction completed but status not saved." ) - return extracted_text + return ExtractResult( + text=extracted_text, + signature_metadata=signature_metadata, + signature_page_references=signature_page_references, + ) @staticmethod def export_project_settings(tool: CustomTool) -> dict: diff --git a/prompt-service/src/unstract/prompt_service/constants.py b/prompt-service/src/unstract/prompt_service/constants.py index 16bb364ce3..58a6b72fcd 100644 --- a/prompt-service/src/unstract/prompt_service/constants.py +++ b/prompt-service/src/unstract/prompt_service/constants.py @@ -85,6 +85,7 @@ class PromptServiceConstants: LINE_NUMBERS = "line_numbers" WHISPER_HASH = "whisper_hash" SIGNATURE_METADATA = "signature_metadata" + SIGNATURE_PAGE_REFERENCES = "signature_page_references" PAID_FEATURE_MSG = ( "It is a cloud / enterprise feature. If you have purchased a plan and still " "face this issue, please contact support" diff --git a/prompt-service/src/unstract/prompt_service/controllers/extraction.py b/prompt-service/src/unstract/prompt_service/controllers/extraction.py index 4a5d47bb91..588e561491 100644 --- a/prompt-service/src/unstract/prompt_service/controllers/extraction.py +++ b/prompt-service/src/unstract/prompt_service/controllers/extraction.py @@ -55,4 +55,7 @@ def extract() -> Any: signature_metadata = extraction_result.get("signature_metadata") if signature_metadata: response["signature_metadata"] = signature_metadata + signature_page_references = extraction_result.get("signature_page_references") + if signature_page_references: + response["signature_page_references"] = signature_page_references return response diff --git a/prompt-service/src/unstract/prompt_service/services/answer_prompt.py b/prompt-service/src/unstract/prompt_service/services/answer_prompt.py index 3fd69e1d91..0a4a96f333 100644 --- a/prompt-service/src/unstract/prompt_service/services/answer_prompt.py +++ b/prompt-service/src/unstract/prompt_service/services/answer_prompt.py @@ -144,7 +144,7 @@ def construct_and_run_prompt( signature_metadata=tool_settings.get(PSKeys.SIGNATURE_METADATA), ) output[PSKeys.COMBINED_PROMPT] = prompt - return AnswerPromptService.run_completion( + answer = AnswerPromptService.run_completion( llm=llm, prompt=prompt, metadata=metadata, @@ -155,6 +155,108 @@ def construct_and_run_prompt( file_path=file_path, execution_source=execution_source, ) + AnswerPromptService._attach_signature_highlights( + answer=answer, + signature_metadata=tool_settings.get(PSKeys.SIGNATURE_METADATA), + signature_page_references=tool_settings.get( + PSKeys.SIGNATURE_PAGE_REFERENCES + ), + metadata=metadata, + prompt_key=output[PSKeys.NAME], + ) + return answer + + # Generic signature-related terms used as a fallback trigger when the + # LLM answer doesn't mention any specific signer name but does talk + # about signing in general. Matched as case-insensitive substrings. + _SIGNATURE_KEYWORDS = ( + "signature", + "signed", + "signatory", + "signatories", + "signing", + "executed", + ) + + @staticmethod + def _attach_signature_highlights( + answer: str, + signature_metadata: dict[str, list[Any]] | None, + signature_page_references: dict[str, Any] | None, + metadata: dict[str, Any] | None, + prompt_key: str | None, + ) -> None: + """Attach signature page highlights to ``metadata`` when the LLM + answer references a known signer or signatures generally. + + Mirror of the workers post-processor — see + ``executor.executors.answer_prompt.AnswerPromptService._attach_signature_highlights`` + for behavior details. + """ + if not signature_page_references or not signature_metadata: + return + if metadata is None or not prompt_key: + return + if not isinstance(answer, str) or not answer.strip(): + return + + page_coords: dict[str, list[int]] = {} + for page_str, ref in signature_page_references.items(): + if not isinstance(ref, dict): + continue + coords = ref.get("coords") + if isinstance(coords, list) and len(coords) >= 4: + page_coords[page_str] = list(coords[:4]) + if not page_coords: + return + + answer_lower = answer.lower() + matched_pages: list[str] = [] + for page_str, signatures in signature_metadata.items(): + if page_str not in page_coords or not signatures: + continue + for sig in signatures: + if not isinstance(sig, dict): + continue + name = (sig.get("name") or "").strip() + if name and name.lower() in answer_lower: + matched_pages.append(page_str) + break + + if not matched_pages: + if any( + kw in answer_lower for kw in AnswerPromptService._SIGNATURE_KEYWORDS + ): + matched_pages = list(page_coords.keys()) + + if not matched_pages: + return + + seen: set[tuple[int, ...]] = set() + new_coords: list[list[int]] = [] + for page_str in matched_pages: + coords = page_coords[page_str] + key = tuple(coords) + if key in seen: + continue + seen.add(key) + new_coords.append(coords) + + bucket = metadata.setdefault(PSKeys.HIGHLIGHT_DATA, {}) + existing = bucket.get(prompt_key) + if not isinstance(existing, list): + existing = [] + for coords in new_coords: + if coords not in existing: + existing.append(coords) + bucket[prompt_key] = existing + app.logger.info( + "DOC_INSIGHTS attach_signature_highlights: prompt=%s, added %d " + "signature highlight(s) on pages %s", + prompt_key, + len(new_coords), + matched_pages, + ) @staticmethod def _format_signature_metadata( diff --git a/prompt-service/src/unstract/prompt_service/services/extraction.py b/prompt-service/src/unstract/prompt_service/services/extraction.py index 290f13f482..bd6fc9212b 100644 --- a/prompt-service/src/unstract/prompt_service/services/extraction.py +++ b/prompt-service/src/unstract/prompt_service/services/extraction.py @@ -69,6 +69,7 @@ def perform_extraction( extracted_text = process_response.extracted_text # Extract signature metadata if present signature_metadata = None + signature_page_references = None if ( process_response.extraction_metadata and process_response.extraction_metadata.signature_metadata @@ -81,9 +82,22 @@ def perform_extraction( "for pages: %s", list(signature_metadata.keys()), ) + if ( + process_response.extraction_metadata + and process_response.extraction_metadata.signature_page_references + ): + signature_page_references = ( + process_response.extraction_metadata.signature_page_references + ) + logger.info( + "DOC_INSIGHTS extraction: signature_page_references " + "found for pages: %s", + list(signature_page_references.keys()), + ) return { "extracted_text": extracted_text, "signature_metadata": signature_metadata, + "signature_page_references": signature_page_references, } except AdapterError as e: msg = f"Error from text extractor '{x2text.x2text_instance.get_name()}'. " diff --git a/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py b/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py index 984bc63a15..72c4d2f026 100644 --- a/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py +++ b/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py @@ -110,10 +110,17 @@ def _build_signature_page_references( for sig in signatures if isinstance(sig, dict) ] + coords_entry = line_metadata[line_index] + coords = ( + list(coords_entry[:4]) + if isinstance(coords_entry, list) and len(coords_entry) >= 4 + else None + ) references[page_str] = { "hex": hex_value, "line_metadata_index": line_index, "signers": signers, + "coords": coords, } return references if references else None diff --git a/workers/executor/executors/answer_prompt.py b/workers/executor/executors/answer_prompt.py index 61f9f2319d..d3ba2f4749 100644 --- a/workers/executor/executors/answer_prompt.py +++ b/workers/executor/executors/answer_prompt.py @@ -160,7 +160,7 @@ def construct_and_run_prompt( signature_metadata=tool_settings.get(PSKeys.SIGNATURE_METADATA), ) output[PSKeys.COMBINED_PROMPT] = prompt - return AnswerPromptService.run_completion( + answer = AnswerPromptService.run_completion( llm=llm, prompt=prompt, metadata=metadata, @@ -172,6 +172,16 @@ def construct_and_run_prompt( execution_source=execution_source, process_text=process_text, ) + AnswerPromptService._attach_signature_highlights( + answer=answer, + signature_metadata=tool_settings.get(PSKeys.SIGNATURE_METADATA), + signature_page_references=tool_settings.get( + PSKeys.SIGNATURE_PAGE_REFERENCES + ), + metadata=metadata, + prompt_key=output[PSKeys.NAME], + ) + return answer @staticmethod def _build_grammar_notes(grammar_list: list[dict[str, Any]]) -> str: @@ -190,6 +200,109 @@ def _build_grammar_notes(grammar_list: list[dict[str, Any]]) -> str: ) return notes + # Generic signature-related terms used as a fallback trigger when the + # LLM answer doesn't mention any specific signer name but does talk + # about signing in general (e.g. "Is this signed?" → "Yes, the document + # is signed."). Matched as case-insensitive substrings. + _SIGNATURE_KEYWORDS = ( + "signature", + "signed", + "signatory", + "signatories", + "signing", + "executed", + ) + + @staticmethod + def _attach_signature_highlights( + answer: str, + signature_metadata: dict[str, list[Any]] | None, + signature_page_references: dict[str, Any] | None, + metadata: dict[str, Any] | None, + prompt_key: str | None, + ) -> None: + """Attach signature page highlights to ``metadata`` when the LLM + answer references a known signer or signatures generally. + + - For each signer name in ``signature_metadata`` found as a + case-insensitive substring in ``answer``, append that page's + coords (from ``signature_page_references``) to + ``metadata[HIGHLIGHT_DATA][prompt_key]``. + - If no signer-name match is found but the answer mentions + generic signature keywords (signature, signed, signatory, + executed, signing), append every signature page's coords. + + ``metadata[HIGHLIGHT_DATA][prompt_key]`` is mutated in place; the + existing list (populated by hex-comment processing in + ``run_completion``) is preserved and extended. + """ + if not signature_page_references or not signature_metadata: + return + if metadata is None or not prompt_key: + return + if not isinstance(answer, str) or not answer.strip(): + return + + # Build page → coords map (one coord array per signature page). + page_coords: dict[str, list[int]] = {} + for page_str, ref in signature_page_references.items(): + if not isinstance(ref, dict): + continue + coords = ref.get("coords") + if isinstance(coords, list) and len(coords) >= 4: + page_coords[page_str] = list(coords[:4]) + if not page_coords: + return + + answer_lower = answer.lower() + matched_pages: list[str] = [] + for page_str, signatures in signature_metadata.items(): + if page_str not in page_coords or not signatures: + continue + for sig in signatures: + if not isinstance(sig, dict): + continue + name = (sig.get("name") or "").strip() + if name and name.lower() in answer_lower: + matched_pages.append(page_str) + break # one match per page is enough + + if not matched_pages: + # No specific signer matched — fall back to all signature pages + # when the answer talks about signing generically. + if any(kw in answer_lower for kw in AnswerPromptService._SIGNATURE_KEYWORDS): + matched_pages = list(page_coords.keys()) + + if not matched_pages: + return + + seen: set[tuple[int, ...]] = set() + new_coords: list[list[int]] = [] + for page_str in matched_pages: + coords = page_coords[page_str] + key = tuple(coords) + if key in seen: + continue + seen.add(key) + new_coords.append(coords) + + bucket = metadata.setdefault(PSKeys.HIGHLIGHT_DATA, {}) + existing = bucket.get(prompt_key) + if not isinstance(existing, list): + existing = [] + # Avoid duplicating coords already present from hex-comment processing. + for coords in new_coords: + if coords not in existing: + existing.append(coords) + bucket[prompt_key] = existing + logger.info( + "DOC_INSIGHTS attach_signature_highlights: prompt=%s, added %d " + "signature highlight(s) on pages %s", + prompt_key, + len(new_coords), + matched_pages, + ) + @staticmethod def _format_signature_metadata( signature_metadata: dict[str, list[Any]], diff --git a/workers/executor/executors/constants.py b/workers/executor/executors/constants.py index 16bb364ce3..58a6b72fcd 100644 --- a/workers/executor/executors/constants.py +++ b/workers/executor/executors/constants.py @@ -85,6 +85,7 @@ class PromptServiceConstants: LINE_NUMBERS = "line_numbers" WHISPER_HASH = "whisper_hash" SIGNATURE_METADATA = "signature_metadata" + SIGNATURE_PAGE_REFERENCES = "signature_page_references" PAID_FEATURE_MSG = ( "It is a cloud / enterprise feature. If you have purchased a plan and still " "face this issue, please contact support" diff --git a/workers/executor/executors/legacy_executor.py b/workers/executor/executors/legacy_executor.py index 1b9f20bce3..4ddccaa5be 100644 --- a/workers/executor/executors/legacy_executor.py +++ b/workers/executor/executors/legacy_executor.py @@ -290,6 +290,14 @@ def _handle_extract(self, context: ExecutionContext) -> ExecutionResult: process_response.extraction_metadata .signature_page_references, ) + self._write_signature_sidecar( + fs=fs, + output_file_path=output_file_path, + signature_metadata=result_data.get("signature_metadata"), + signature_page_references=result_data.get( + "signature_page_references" + ), + ) return ExecutionResult( success=True, data=result_data, @@ -305,6 +313,54 @@ def _handle_extract(self, context: ExecutionContext) -> ExecutionResult: msg = f"Error from text extractor '{name}'. {e}" raise ExtractionError(message=msg) from e + @staticmethod + def _signature_sidecar_path(output_file_path: str) -> str: + """Sidecar JSON for document_insights signature data. + + Lives next to the extracted ``.txt`` file so cache hits in + Prompt Studio can recover signature data without re-extracting. + """ + p = Path(output_file_path) + return str(p.with_suffix("") ) + ".doc_insights.json" + + @staticmethod + def _write_signature_sidecar( + fs: Any, + output_file_path: str | None, + signature_metadata: dict[str, Any] | None, + signature_page_references: dict[str, Any] | None, + ) -> None: + """Persist signature data alongside the extracted-text file. + + Skipped if there's no signature data or no output path (e.g., + when running without disk output). + """ + if not output_file_path: + return + if not signature_metadata and not signature_page_references: + return + sidecar_path = LegacyExecutor._signature_sidecar_path(output_file_path) + payload = { + "signature_metadata": signature_metadata or {}, + "signature_page_references": signature_page_references or {}, + } + try: + ToolUtils.dump_json( + file_to_dump=sidecar_path, + json_to_dump=payload, + fs=fs, + ) + logger.info( + "DOC_INSIGHTS sidecar: wrote signature data to %s", + sidecar_path, + ) + except Exception as e: + logger.warning( + "DOC_INSIGHTS sidecar: failed to write %s: %s", + sidecar_path, + e, + ) + @staticmethod def _update_exec_metadata( fs: Any, @@ -578,14 +634,25 @@ def _handle_structure_pipeline(self, context: ExecutionContext) -> ExecutionResu from executor.executors.constants import PromptServiceConstants as PSKeys signature_metadata = extract_result.data.get("signature_metadata") - if signature_metadata: + signature_page_references = extract_result.data.get( + "signature_page_references" + ) + if signature_metadata or signature_page_references: tool_settings = answer_params.get(PSKeys.TOOL_SETTINGS, {}) - tool_settings[PSKeys.SIGNATURE_METADATA] = signature_metadata + if signature_metadata: + tool_settings[PSKeys.SIGNATURE_METADATA] = signature_metadata + if signature_page_references: + tool_settings[PSKeys.SIGNATURE_PAGE_REFERENCES] = ( + signature_page_references + ) answer_params[PSKeys.TOOL_SETTINGS] = tool_settings logger.info( - "DOC_INSIGHTS pipeline: injected signature_metadata " - "into tool_settings for pages: %s", - list(signature_metadata.keys()), + "DOC_INSIGHTS pipeline: injected signature data into " + "tool_settings (pages=%s, refs=%s)", + list(signature_metadata.keys()) if signature_metadata else [], + list(signature_page_references.keys()) + if signature_page_references + else [], ) # ---- Step 2: Summarize (if enabled) ---- diff --git a/workers/tests/test_answer_prompt.py b/workers/tests/test_answer_prompt.py index 6c9fb9fce9..ca1f93b556 100644 --- a/workers/tests/test_answer_prompt.py +++ b/workers/tests/test_answer_prompt.py @@ -900,6 +900,179 @@ def test_construct_prompt_with_grammar(self): assert "sum, total" in result +class TestAttachSignatureHighlights: + """Tests for the signature-highlight post-processor.""" + + @staticmethod + def _fixture_signatures(): + """Build a minimal signature fixture set covering two pages.""" + signature_metadata = { + "0": [ + {"name": "Mr Dagan", "type": "signature", "desc": ""}, + {"name": "Carmela Avner", "type": "signature", "desc": ""}, + ], + "1": [ + {"name": "Eve Other", "type": "signature", "desc": ""}, + ], + } + signature_page_references = { + "0": { + "hex": "0x10", + "line_metadata_index": 15, + "signers": ["Mr Dagan", "Carmela Avner"], + "coords": [0, 320, 31, 3168], + }, + "1": { + "hex": "0x20", + "line_metadata_index": 31, + "signers": ["Eve Other"], + "coords": [1, 100, 40, 3168], + }, + } + return signature_metadata, signature_page_references + + def test_name_match_attaches_only_matched_page(self): + from executor.executors.answer_prompt import AnswerPromptService + + sig_meta, sig_refs = self._fixture_signatures() + metadata = {} + AnswerPromptService._attach_signature_highlights( + answer="The document was signed by Mr Dagan on Jan 1.", + signature_metadata=sig_meta, + signature_page_references=sig_refs, + metadata=metadata, + prompt_key="signer", + ) + # Only page 0's coords (Mr Dagan) should be attached. + assert metadata["highlight_data"]["signer"] == [[0, 320, 31, 3168]] + + def test_case_insensitive_substring_match(self): + from executor.executors.answer_prompt import AnswerPromptService + + sig_meta, sig_refs = self._fixture_signatures() + metadata = {} + AnswerPromptService._attach_signature_highlights( + answer="signed by mr dagan, with sign-off from carmela avner.", + signature_metadata=sig_meta, + signature_page_references=sig_refs, + metadata=metadata, + prompt_key="signers", + ) + # Both names matched but both are on page 0 → single coord, deduped. + assert metadata["highlight_data"]["signers"] == [[0, 320, 31, 3168]] + + def test_multi_page_names_attach_distinct_coords(self): + from executor.executors.answer_prompt import AnswerPromptService + + sig_meta, sig_refs = self._fixture_signatures() + metadata = {} + AnswerPromptService._attach_signature_highlights( + answer="Signed by Mr Dagan and Eve Other.", + signature_metadata=sig_meta, + signature_page_references=sig_refs, + metadata=metadata, + prompt_key="signers", + ) + # Page 0 and page 1 coords both attached. + coords = metadata["highlight_data"]["signers"] + assert [0, 320, 31, 3168] in coords + assert [1, 100, 40, 3168] in coords + assert len(coords) == 2 + + def test_keyword_fallback_attaches_all_signature_pages(self): + """Generic signature mention with no name match → all pages.""" + from executor.executors.answer_prompt import AnswerPromptService + + sig_meta, sig_refs = self._fixture_signatures() + metadata = {} + AnswerPromptService._attach_signature_highlights( + answer="Yes, the document is signed.", + signature_metadata=sig_meta, + signature_page_references=sig_refs, + metadata=metadata, + prompt_key="is_signed", + ) + coords = metadata["highlight_data"]["is_signed"] + assert [0, 320, 31, 3168] in coords + assert [1, 100, 40, 3168] in coords + assert len(coords) == 2 + + def test_no_match_no_keyword_no_op(self): + """Answer with neither name match nor keyword → no highlights added.""" + from executor.executors.answer_prompt import AnswerPromptService + + sig_meta, sig_refs = self._fixture_signatures() + metadata = {} + AnswerPromptService._attach_signature_highlights( + answer="The total amount is $42.", + signature_metadata=sig_meta, + signature_page_references=sig_refs, + metadata=metadata, + prompt_key="total", + ) + assert "highlight_data" not in metadata + + def test_preserves_existing_highlight_entries(self): + """Coords already in metadata[HIGHLIGHT_DATA][key] are kept; no dups.""" + from executor.executors.answer_prompt import AnswerPromptService + + sig_meta, sig_refs = self._fixture_signatures() + metadata = { + "highlight_data": { + "signer": [ + [9, 9, 9, 9], # pre-existing, unrelated highlight + [0, 320, 31, 3168], # would duplicate the page-0 sig + ] + } + } + AnswerPromptService._attach_signature_highlights( + answer="Signed by Mr Dagan.", + signature_metadata=sig_meta, + signature_page_references=sig_refs, + metadata=metadata, + prompt_key="signer", + ) + # Pre-existing entries preserved, page-0 coord not duplicated. + assert metadata["highlight_data"]["signer"] == [ + [9, 9, 9, 9], + [0, 320, 31, 3168], + ] + + def test_missing_inputs_no_op(self): + """No-op when signature data or metadata pieces are missing.""" + from executor.executors.answer_prompt import AnswerPromptService + + # No signature_metadata + metadata = {} + AnswerPromptService._attach_signature_highlights( + answer="signed by Mr Dagan", + signature_metadata=None, + signature_page_references={"0": {"coords": [0, 0, 0, 0]}}, + metadata=metadata, + prompt_key="k", + ) + assert metadata == {} + # No signature_page_references + AnswerPromptService._attach_signature_highlights( + answer="signed by Mr Dagan", + signature_metadata={"0": [{"name": "Mr Dagan"}]}, + signature_page_references=None, + metadata=metadata, + prompt_key="k", + ) + assert metadata == {} + # Empty/None answer + sig_meta, sig_refs = self._fixture_signatures() + AnswerPromptService._attach_signature_highlights( + answer="", + signature_metadata=sig_meta, + signature_page_references=sig_refs, + metadata=metadata, + prompt_key="k", + ) + assert metadata == {} + + class TestVariableReplacementService: """Tests for the VariableReplacementService.""" From e5333bcc6c5f78c25e5b65b521dd5abf87fb7fdb Mon Sep 17 00:00:00 2001 From: pk-zipstack Date: Thu, 14 May 2026 15:43:39 +0530 Subject: [PATCH 06/12] [FIX] Pick content line per page and use word-boundary matching for signature highlights MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two bugs surfaced when testing in Prompt Studio with a multi-page signed PDF where the LLM answer was a single signer name. 1) Adapter was selecting unusable line_metadata entries _build_signature_page_references picked the first line_metadata entry per page, but the first entry is often an empty marker row like [0, 0, 0, 3168] or [1, 0, 0, 0]. Zero height makes the overlay invisible; zero page_height causes divide-by-zero in the frontend's percentage calc. Now skip entries with height <= 0 or page_height <= 0 and pick the first true content line. 2) Post-processor matched signer initials inside other names "P S" (a signer on page 0) was matching across word boundaries inside "Pradeep Surukanti" — case-insensitive substring "p s" appears between "Pradee[p s]urukanti". Both pages got highlights, the viewer jumped to the first (wrong) page. Switched to a regex with \b anchors so the signer name has to appear as a whole token or phrase. Added a regression test (test_short_initials_do_not_falsely_match_across_words) that locks in the fix for this exact scenario. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../prompt_service/services/answer_prompt.py | 11 ++++++- .../llm_whisperer_v2/src/llm_whisperer_v2.py | 17 +++++++--- workers/executor/executors/answer_prompt.py | 11 ++++++- workers/tests/test_answer_prompt.py | 33 +++++++++++++++++++ 4 files changed, 65 insertions(+), 7 deletions(-) diff --git a/prompt-service/src/unstract/prompt_service/services/answer_prompt.py b/prompt-service/src/unstract/prompt_service/services/answer_prompt.py index 0a4a96f333..8224441a48 100644 --- a/prompt-service/src/unstract/prompt_service/services/answer_prompt.py +++ b/prompt-service/src/unstract/prompt_service/services/answer_prompt.py @@ -1,4 +1,5 @@ import ipaddress +import re import socket from logging import Logger from typing import Any @@ -219,7 +220,15 @@ def _attach_signature_highlights( if not isinstance(sig, dict): continue name = (sig.get("name") or "").strip() - if name and name.lower() in answer_lower: + if not name: + continue + # Word-boundary regex avoids false positives like + # signer "P S" matching the gap between "Pradeep" and + # "Surukanti" inside "Pradeep Surukanti". + pattern = re.compile( + r"\b" + re.escape(name) + r"\b", re.IGNORECASE + ) + if pattern.search(answer): matched_pages.append(page_str) break diff --git a/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py b/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py index 72c4d2f026..a980b5b4f0 100644 --- a/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py +++ b/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py @@ -81,13 +81,20 @@ def _build_signature_page_references( "cannot build page references") return None - # Build a map of page number -> first line_metadata index + # Build a map of page number -> first *content* line index. + # Skip marker/empty rows like [0, 0, 0, 3168] or [1, 0, 0, 0]: + # they have zero height or zero page_height and produce an + # invisible overlay (and divide-by-zero in the frontend's + # percentage calculations). page_first_line: dict[int, int] = {} for idx, entry in enumerate(line_metadata): - if isinstance(entry, list) and len(entry) >= 1: - page = entry[0] - if page not in page_first_line: - page_first_line[page] = idx + if not isinstance(entry, list) or len(entry) < 4: + continue + page, _y, height, page_height = entry[0], entry[1], entry[2], entry[3] + if height <= 0 or page_height <= 0: + continue + if page not in page_first_line: + page_first_line[page] = idx logger.debug( "DOC_INSIGHTS: page_first_line map: %s", page_first_line ) diff --git a/workers/executor/executors/answer_prompt.py b/workers/executor/executors/answer_prompt.py index d3ba2f4749..d1451c8111 100644 --- a/workers/executor/executors/answer_prompt.py +++ b/workers/executor/executors/answer_prompt.py @@ -13,6 +13,7 @@ import ipaddress import logging import os +import re import socket from typing import Any from urllib.parse import urlparse @@ -263,7 +264,15 @@ def _attach_signature_highlights( if not isinstance(sig, dict): continue name = (sig.get("name") or "").strip() - if name and name.lower() in answer_lower: + if not name: + continue + # Word-boundary regex avoids false positives like + # signer "P S" matching the gap between "Pradeep" and + # "Surukanti" inside "Pradeep Surukanti". + pattern = re.compile( + r"\b" + re.escape(name) + r"\b", re.IGNORECASE + ) + if pattern.search(answer): matched_pages.append(page_str) break # one match per page is enough diff --git a/workers/tests/test_answer_prompt.py b/workers/tests/test_answer_prompt.py index ca1f93b556..e7b534591a 100644 --- a/workers/tests/test_answer_prompt.py +++ b/workers/tests/test_answer_prompt.py @@ -1038,6 +1038,39 @@ def test_preserves_existing_highlight_entries(self): [0, 320, 31, 3168], ] + def test_short_initials_do_not_falsely_match_across_words(self): + """Regression: signer "P S" must not match across "Pradeep Surukanti". + + Pure substring matching incorrectly fired because "p s" appears + between "Pradee[p s]urukanti". Word-boundary matching prevents + the false positive. + """ + from executor.executors.answer_prompt import AnswerPromptService + + signature_metadata = { + "0": [ + {"name": "P S", "type": "signature"}, + {"name": "H S", "type": "signature"}, + ], + "1": [ + {"name": "Pradeep Surukanti", "type": "signature"}, + ], + } + signature_page_references = { + "0": {"coords": [0, 100, 30, 3168]}, + "1": {"coords": [1, 200, 30, 3168]}, + } + metadata = {} + AnswerPromptService._attach_signature_highlights( + answer="Pradeep Surukanti", + signature_metadata=signature_metadata, + signature_page_references=signature_page_references, + metadata=metadata, + prompt_key="signer", + ) + # Only the actual signer's page should be attached, not page 0. + assert metadata["highlight_data"]["signer"] == [[1, 200, 30, 3168]] + def test_missing_inputs_no_op(self): """No-op when signature data or metadata pieces are missing.""" from executor.executors.answer_prompt import AnswerPromptService From df77c3eb95ffab6fd3899c06738878e2a2b77eb1 Mon Sep 17 00:00:00 2001 From: pk-zipstack Date: Thu, 14 May 2026 17:24:24 +0530 Subject: [PATCH 07/12] [FIX] Allow signature page jumps without the enable_highlight toggle Two clickability gates in the frontend prevented signature-driven page jumps from firing when the tool's enable_highlight setting was off, even though the backend was correctly producing highlight_data from document_insights signature extraction. 1) TextResult only rendered the clickable Typography.Text variant when enableHighlight was true, so the answer stayed a plain
. 2) handleSelectHighlight in PromptCard.jsx returned silently when enable_highlight was false, so even if the click fired, selectedHighlight state never updated and PdfViewer never received a non-empty highlightData prop (jumpToPage was never called). Both gates now also pass through when highlight_data is present, so the signature feature works on tools that have document_insights mode on LLMWhisperer V2 even without flipping the separate enable_highlight toggle. Existing flows (with enable_highlight=false and no highlight data) are unchanged because their highlight_data is empty. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../prompt-card/DisplayPromptResult.jsx | 12 +++++- .../custom-tools/prompt-card/PromptCard.jsx | 41 +++++++++++-------- 2 files changed, 35 insertions(+), 18 deletions(-) diff --git a/frontend/src/components/custom-tools/prompt-card/DisplayPromptResult.jsx b/frontend/src/components/custom-tools/prompt-card/DisplayPromptResult.jsx index f5f9474024..857d96f33a 100644 --- a/frontend/src/components/custom-tools/prompt-card/DisplayPromptResult.jsx +++ b/frontend/src/components/custom-tools/prompt-card/DisplayPromptResult.jsx @@ -415,14 +415,22 @@ const TextResult = ({ const confidence = getConfidenceForText(); - return enableHighlight ? ( + // Make the answer clickable when the tool has highlighting enabled OR + // when the backend produced highlight_data (e.g. signature page refs + // from LLMWhisperer's document_insights mode), so signature highlights + // still work without requiring the separate enable_highlight toggle. + const hasHighlightData = + Array.isArray(highlightData) && highlightData.length > 0; + const isClickable = enableHighlight || hasHighlightData; + + return isClickable ? ( onSelectHighlight(highlightData, promptId, profileId, confidence) } className={`prompt-output-result json-value ${ - highlightData ? "clickable" : "" + hasHighlightData ? "clickable" : "" } ${selectedHighlight?.highlightedPrompt === promptId ? "selected" : ""}`} > {parsedOutput} diff --git a/frontend/src/components/custom-tools/prompt-card/PromptCard.jsx b/frontend/src/components/custom-tools/prompt-card/PromptCard.jsx index 2e579108c2..11558a5323 100644 --- a/frontend/src/components/custom-tools/prompt-card/PromptCard.jsx +++ b/frontend/src/components/custom-tools/prompt-card/PromptCard.jsx @@ -240,23 +240,32 @@ const PromptCard = memo( highlightedProfile, confidenceData, ) => { - if (details?.enable_highlight) { - const processedHighlight = - singlePassExtractMode && - typeof highlightData === "object" && - !Array.isArray(highlightData) - ? flattenHighlightData(highlightData) - : highlightData; - - updateCustomTool({ - selectedHighlight: { - highlight: processedHighlight, - highlightedPrompt: highlightedPrompt, - highlightedProfile: highlightedProfile, - confidence: confidenceData, - }, - }); + // Allow highlight state to update when the tool has highlighting + // enabled OR when the backend produced highlight_data (e.g. + // signature page refs from LLMWhisperer's document_insights mode), + // so signature-driven page jumps work without the separate + // enable_highlight toggle. + const hasHighlightData = Array.isArray(highlightData) + ? highlightData.length > 0 + : Boolean(highlightData); + if (!details?.enable_highlight && !hasHighlightData) { + return; } + const processedHighlight = + singlePassExtractMode && + typeof highlightData === "object" && + !Array.isArray(highlightData) + ? flattenHighlightData(highlightData) + : highlightData; + + updateCustomTool({ + selectedHighlight: { + highlight: processedHighlight, + highlightedPrompt: highlightedPrompt, + highlightedProfile: highlightedProfile, + confidence: confidenceData, + }, + }); }; const handleTypeChange = (value) => { From d49d924ad83d7b05ffbd7a5f99f7eefec1d76ab3 Mon Sep 17 00:00:00 2001 From: pk-zipstack Date: Thu, 14 May 2026 17:37:00 +0530 Subject: [PATCH 08/12] [MISC] Address pre-commit + SonarCloud findings on signature highlights - Extract duplicated signature-highlight post-processor logic into a shared helper at ``unstract/sdk1/utils/signature_highlights.py`` and delegate from both workers and prompt-service. Cuts SonarCloud's duplication metric and brings the cognitive-complexity score below the gate. - Split LLMWhispererV2._build_signature_page_references into _index_first_content_line_per_page + _build_page_reference_entry to cut its cognitive complexity below 15. - Move ExtractResult NamedTuple in prompt_studio_helper below the import block to silence ruff E402 (and drop a duplicate logger= line that was already present). - Move the `logger = ...` line in prompt-service extraction.py below all imports to fix the same E402 issue. - Apply ruff-format normalisation across the touched files. No behaviour change. All 8 signature-highlight unit tests still pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../prompt_studio_helper.py | 50 +++--- .../prompt_service/services/answer_prompt.py | 104 +++--------- .../prompt_service/services/extraction.py | 7 +- .../llm_whisperer_v2/src/llm_whisperer_v2.py | 113 +++++++------ .../sdk1/utils/signature_highlights.py | 156 ++++++++++++++++++ workers/executor/executors/answer_prompt.py | 116 +++---------- workers/executor/executors/legacy_executor.py | 18 +- workers/tests/test_answer_prompt.py | 5 +- 8 files changed, 295 insertions(+), 274 deletions(-) create mode 100644 unstract/sdk1/src/unstract/sdk1/utils/signature_highlights.py diff --git a/backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py b/backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py index 9eccd1d98f..fcd9720289 100644 --- a/backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py +++ b/backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py @@ -6,21 +6,6 @@ from pathlib import Path from typing import Any, NamedTuple - -class ExtractResult(NamedTuple): - """Return value of ``PromptStudioHelper.dynamic_extractor``. - - ``signature_metadata`` and ``signature_page_references`` are populated - only when the x2text adapter is LLMWhisperer V2 in ``document_insights`` - mode and the document contains signatures. They are read either from - the live extract dispatch result (cache miss) or from the on-disk - ``.doc_insights.json`` sidecar (cache hit). - """ - - text: str - signature_metadata: dict[str, Any] | None = None - signature_page_references: dict[str, Any] | None = None - from account_v2.constants import Common from account_v2.models import User from adapter_processor_v2.constants import AdapterKeys @@ -95,7 +80,20 @@ class ExtractResult(NamedTuple): CHOICES_JSON = "/static/select_choices.json" ERROR_MSG = "User %s doesn't have access to adapter %s" -logger = logging.getLogger(__name__) + +class ExtractResult(NamedTuple): + """Return value of ``PromptStudioHelper.dynamic_extractor``. + + ``signature_metadata`` and ``signature_page_references`` are populated + only when the x2text adapter is LLMWhisperer V2 in ``document_insights`` + mode and the document contains signatures. They are read either from + the live extract dispatch result (cache miss) or from the on-disk + ``.doc_insights.json`` sidecar (cache hit). + """ + + text: str + signature_metadata: dict[str, Any] | None = None + signature_page_references: dict[str, Any] | None = None class PromptStudioHelper: @@ -853,9 +851,7 @@ def build_fetch_response_payload( settings, TSPKeys.WORD_CONFIDENCE_POSTAMBLE.upper(), "" ) if extract_result.signature_metadata: - tool_settings[TSPKeys.SIGNATURE_METADATA] = ( - extract_result.signature_metadata - ) + tool_settings[TSPKeys.SIGNATURE_METADATA] = extract_result.signature_metadata if extract_result.signature_page_references: tool_settings[TSPKeys.SIGNATURE_PAGE_REFERENCES] = ( extract_result.signature_page_references @@ -1052,9 +1048,7 @@ def build_bulk_fetch_response_payload( settings, TSPKeys.WORD_CONFIDENCE_POSTAMBLE.upper(), "" ) if extract_result.signature_metadata: - tool_settings[TSPKeys.SIGNATURE_METADATA] = ( - extract_result.signature_metadata - ) + tool_settings[TSPKeys.SIGNATURE_METADATA] = extract_result.signature_metadata if extract_result.signature_page_references: tool_settings[TSPKeys.SIGNATURE_PAGE_REFERENCES] = ( extract_result.signature_page_references @@ -1199,9 +1193,7 @@ def build_single_pass_payload( TSPKeys.SIMILARITY_TOP_K: default_profile.similarity_top_k, } if extract_result.signature_metadata: - tool_settings[TSPKeys.SIGNATURE_METADATA] = ( - extract_result.signature_metadata - ) + tool_settings[TSPKeys.SIGNATURE_METADATA] = extract_result.signature_metadata if extract_result.signature_page_references: tool_settings[TSPKeys.SIGNATURE_PAGE_REFERENCES] = ( extract_result.signature_page_references @@ -1977,9 +1969,7 @@ def _fetch_response( settings, TSPKeys.WORD_CONFIDENCE_POSTAMBLE.upper(), "" ) if extract_result.signature_metadata: - tool_settings[TSPKeys.SIGNATURE_METADATA] = ( - extract_result.signature_metadata - ) + tool_settings[TSPKeys.SIGNATURE_METADATA] = extract_result.signature_metadata if extract_result.signature_page_references: tool_settings[TSPKeys.SIGNATURE_PAGE_REFERENCES] = ( extract_result.signature_page_references @@ -2284,9 +2274,7 @@ def _fetch_single_pass_response( ) tool_settings[TSPKeys.SIMILARITY_TOP_K] = default_profile.similarity_top_k if extract_result.signature_metadata: - tool_settings[TSPKeys.SIGNATURE_METADATA] = ( - extract_result.signature_metadata - ) + tool_settings[TSPKeys.SIGNATURE_METADATA] = extract_result.signature_metadata if extract_result.signature_page_references: tool_settings[TSPKeys.SIGNATURE_PAGE_REFERENCES] = ( extract_result.signature_page_references diff --git a/prompt-service/src/unstract/prompt_service/services/answer_prompt.py b/prompt-service/src/unstract/prompt_service/services/answer_prompt.py index 8224441a48..49a1009a67 100644 --- a/prompt-service/src/unstract/prompt_service/services/answer_prompt.py +++ b/prompt-service/src/unstract/prompt_service/services/answer_prompt.py @@ -1,5 +1,4 @@ import ipaddress -import re import socket from logging import Logger from typing import Any @@ -25,6 +24,10 @@ from unstract.sdk1.file_storage.constants import StorageType from unstract.sdk1.file_storage.env_helper import EnvHelper from unstract.sdk1.llm import LLM +from unstract.sdk1.utils.signature_highlights import ( + merge_into_highlight_data, + resolve_signature_highlight_coords, +) def _is_safe_public_url(url: str) -> bool: @@ -159,26 +162,12 @@ def construct_and_run_prompt( AnswerPromptService._attach_signature_highlights( answer=answer, signature_metadata=tool_settings.get(PSKeys.SIGNATURE_METADATA), - signature_page_references=tool_settings.get( - PSKeys.SIGNATURE_PAGE_REFERENCES - ), + signature_page_references=tool_settings.get(PSKeys.SIGNATURE_PAGE_REFERENCES), metadata=metadata, prompt_key=output[PSKeys.NAME], ) return answer - # Generic signature-related terms used as a fallback trigger when the - # LLM answer doesn't mention any specific signer name but does talk - # about signing in general. Matched as case-insensitive substrings. - _SIGNATURE_KEYWORDS = ( - "signature", - "signed", - "signatory", - "signatories", - "signing", - "executed", - ) - @staticmethod def _attach_signature_highlights( answer: str, @@ -190,81 +179,30 @@ def _attach_signature_highlights( """Attach signature page highlights to ``metadata`` when the LLM answer references a known signer or signatures generally. - Mirror of the workers post-processor — see - ``executor.executors.answer_prompt.AnswerPromptService._attach_signature_highlights`` - for behavior details. + Delegates the matching logic to + ``unstract.sdk1.utils.signature_highlights`` so workers and + prompt-service stay in sync. """ - if not signature_page_references or not signature_metadata: - return if metadata is None or not prompt_key: return - if not isinstance(answer, str) or not answer.strip(): - return - - page_coords: dict[str, list[int]] = {} - for page_str, ref in signature_page_references.items(): - if not isinstance(ref, dict): - continue - coords = ref.get("coords") - if isinstance(coords, list) and len(coords) >= 4: - page_coords[page_str] = list(coords[:4]) - if not page_coords: - return - - answer_lower = answer.lower() - matched_pages: list[str] = [] - for page_str, signatures in signature_metadata.items(): - if page_str not in page_coords or not signatures: - continue - for sig in signatures: - if not isinstance(sig, dict): - continue - name = (sig.get("name") or "").strip() - if not name: - continue - # Word-boundary regex avoids false positives like - # signer "P S" matching the gap between "Pradeep" and - # "Surukanti" inside "Pradeep Surukanti". - pattern = re.compile( - r"\b" + re.escape(name) + r"\b", re.IGNORECASE - ) - if pattern.search(answer): - matched_pages.append(page_str) - break - - if not matched_pages: - if any( - kw in answer_lower for kw in AnswerPromptService._SIGNATURE_KEYWORDS - ): - matched_pages = list(page_coords.keys()) - - if not matched_pages: + new_coords = resolve_signature_highlight_coords( + answer=answer, + signature_metadata=signature_metadata, + signature_page_references=signature_page_references, + ) + if not new_coords: return - - seen: set[tuple[int, ...]] = set() - new_coords: list[list[int]] = [] - for page_str in matched_pages: - coords = page_coords[page_str] - key = tuple(coords) - if key in seen: - continue - seen.add(key) - new_coords.append(coords) - - bucket = metadata.setdefault(PSKeys.HIGHLIGHT_DATA, {}) - existing = bucket.get(prompt_key) - if not isinstance(existing, list): - existing = [] - for coords in new_coords: - if coords not in existing: - existing.append(coords) - bucket[prompt_key] = existing + merge_into_highlight_data( + metadata=metadata, + prompt_key=prompt_key, + new_coords=new_coords, + highlight_data_key=PSKeys.HIGHLIGHT_DATA, + ) app.logger.info( "DOC_INSIGHTS attach_signature_highlights: prompt=%s, added %d " - "signature highlight(s) on pages %s", + "signature highlight(s)", prompt_key, len(new_coords), - matched_pages, ) @staticmethod diff --git a/prompt-service/src/unstract/prompt_service/services/extraction.py b/prompt-service/src/unstract/prompt_service/services/extraction.py index bd6fc9212b..8b252e8b55 100644 --- a/prompt-service/src/unstract/prompt_service/services/extraction.py +++ b/prompt-service/src/unstract/prompt_service/services/extraction.py @@ -2,8 +2,6 @@ from pathlib import Path from typing import Any -logger = logging.getLogger(__name__) - from unstract.prompt_service.constants import ExecutionSource from unstract.prompt_service.constants import IndexingConstants as IKeys from unstract.prompt_service.exceptions import ExtractionError @@ -17,6 +15,8 @@ from unstract.sdk1.utils.tool import ToolUtils from unstract.sdk1.x2txt import TextExtractionResult, X2Text +logger = logging.getLogger(__name__) + class ExtractionService: @staticmethod @@ -78,8 +78,7 @@ def perform_extraction( process_response.extraction_metadata.signature_metadata ) logger.info( - "DOC_INSIGHTS extraction: signature_metadata found " - "for pages: %s", + "DOC_INSIGHTS extraction: signature_metadata found " "for pages: %s", list(signature_metadata.keys()), ) if ( diff --git a/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py b/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py index a980b5b4f0..066c8a4a57 100644 --- a/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py +++ b/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py @@ -56,6 +56,50 @@ def get_description() -> str: def get_icon() -> str: return "/icons/adapter-icons/LLMWhispererV2.png" + @staticmethod + def _index_first_content_line_per_page( + line_metadata: list[list[int]], + ) -> dict[int, int]: + """Map each page to its first content-line index in ``line_metadata``. + + Marker/empty rows like ``[0, 0, 0, 3168]`` or ``[1, 0, 0, 0]`` are + skipped because they have zero height or zero page_height and + produce an invisible overlay (and divide-by-zero in the frontend's + percentage calculation). + """ + page_first_line: dict[int, int] = {} + for idx, entry in enumerate(line_metadata): + if not isinstance(entry, list) or len(entry) < 4: + continue + page, _y, height, page_height = entry[0], entry[1], entry[2], entry[3] + if height <= 0 or page_height <= 0: + continue + if page not in page_first_line: + page_first_line[page] = idx + return page_first_line + + @staticmethod + def _build_page_reference_entry( + line_index: int, + signatures: list[Any], + line_metadata: list[list[int]], + ) -> dict[str, Any]: + """Build a single ``signature_page_references`` entry for one page.""" + coords_entry = line_metadata[line_index] + coords = ( + list(coords_entry[:4]) + if isinstance(coords_entry, list) and len(coords_entry) >= 4 + else None + ) + return { + "hex": f"0x{line_index + 1:02X}", # 1-indexed hex + "line_metadata_index": line_index, + "signers": [ + sig.get("name", "Unknown") for sig in signatures if isinstance(sig, dict) + ], + "coords": coords, + } + @staticmethod def _build_signature_page_references( signature_metadata: dict[str, list[Any]], @@ -63,9 +107,12 @@ def _build_signature_page_references( ) -> dict[str, Any] | None: """Build page references for frontend navigation to signature pages. - For each page that has signatures, finds the first line_metadata - entry for that page and converts its index to a 1-indexed hex - value. This allows the frontend to jump to the correct page. + For each page that has signatures, finds the first **content** + line in ``line_metadata`` (skipping zero-height marker rows) and + emits its 1-indexed hex value plus resolved coords. The frontend + uses ``coords`` directly in its highlight overlay; the workers + executor caches the result in a sidecar JSON next to the + extracted text file so cached extracts retain it. Args: signature_metadata: Dict keyed by page number (str, 0-indexed) @@ -73,31 +120,18 @@ def _build_signature_page_references( line_metadata: List of [page, y_pos, height, page_height] arrays. Returns: - Dict mapping page number to hex reference and signer names, - or None if no references could be built. + Dict mapping page number to ``{hex, line_metadata_index, + signers, coords}``, or None if no references could be built. """ if not line_metadata: - logger.warning("DOC_INSIGHTS: no line_metadata available, " - "cannot build page references") + logger.warning( + "DOC_INSIGHTS: no line_metadata available, " + "cannot build page references" + ) return None - # Build a map of page number -> first *content* line index. - # Skip marker/empty rows like [0, 0, 0, 3168] or [1, 0, 0, 0]: - # they have zero height or zero page_height and produce an - # invisible overlay (and divide-by-zero in the frontend's - # percentage calculations). - page_first_line: dict[int, int] = {} - for idx, entry in enumerate(line_metadata): - if not isinstance(entry, list) or len(entry) < 4: - continue - page, _y, height, page_height = entry[0], entry[1], entry[2], entry[3] - if height <= 0 or page_height <= 0: - continue - if page not in page_first_line: - page_first_line[page] = idx - logger.debug( - "DOC_INSIGHTS: page_first_line map: %s", page_first_line - ) + page_first_line = LLMWhispererV2._index_first_content_line_per_page(line_metadata) + logger.debug("DOC_INSIGHTS: page_first_line map: %s", page_first_line) references: dict[str, Any] = {} for page_str, signatures in signature_metadata.items(): @@ -106,29 +140,14 @@ def _build_signature_page_references( page_num = int(page_str) if page_num not in page_first_line: logger.warning( - "DOC_INSIGHTS: page %d not found in line_metadata", - page_num, + "DOC_INSIGHTS: page %d not found in line_metadata", page_num ) continue - line_index = page_first_line[page_num] - hex_value = f"0x{line_index + 1:02X}" # 1-indexed hex - signers = [ - sig.get("name", "Unknown") - for sig in signatures - if isinstance(sig, dict) - ] - coords_entry = line_metadata[line_index] - coords = ( - list(coords_entry[:4]) - if isinstance(coords_entry, list) and len(coords_entry) >= 4 - else None + references[page_str] = LLMWhispererV2._build_page_reference_entry( + line_index=page_first_line[page_num], + signatures=signatures, + line_metadata=line_metadata, ) - references[page_str] = { - "hex": hex_value, - "line_metadata_index": line_index, - "signers": signers, - "coords": coords, - } return references if references else None @@ -216,10 +235,8 @@ def process( "computing page references", len(raw_line_metadata), ) - signature_page_references = ( - LLMWhispererV2._build_signature_page_references( - signature_metadata, raw_line_metadata - ) + signature_page_references = LLMWhispererV2._build_signature_page_references( + signature_metadata, raw_line_metadata ) logger.info( "DOC_INSIGHTS: signature_page_references=%s", diff --git a/unstract/sdk1/src/unstract/sdk1/utils/signature_highlights.py b/unstract/sdk1/src/unstract/sdk1/utils/signature_highlights.py new file mode 100644 index 0000000000..9736754ca9 --- /dev/null +++ b/unstract/sdk1/src/unstract/sdk1/utils/signature_highlights.py @@ -0,0 +1,156 @@ +"""Shared helpers for surfacing LLMWhisperer signature page highlights. + +The workers executor and the prompt-service answer-prompt service both +need to post-process LLM answers against the signature metadata that +LLMWhisperer V2's ``document_insights`` mode produces. This module owns +the matching logic so both services stay in lock-step without copy-paste +drift. +""" + +from __future__ import annotations + +import re +from typing import Any + +# Generic signature-related terms used as a fallback trigger when the +# LLM answer doesn't mention any specific signer name but does talk +# about signing in general (e.g. "Is this signed?" → "Yes, the document +# is signed."). Matched as case-insensitive substrings. +SIGNATURE_KEYWORDS: tuple[str, ...] = ( + "signature", + "signed", + "signatory", + "signatories", + "signing", + "executed", +) + + +def _build_page_coords( + signature_page_references: dict[str, Any], +) -> dict[str, list[int]]: + """Pick the resolved coords array per signature page. + + Entries without a four-element ``coords`` list are skipped. + """ + page_coords: dict[str, list[int]] = {} + for page_str, ref in signature_page_references.items(): + if not isinstance(ref, dict): + continue + coords = ref.get("coords") + if isinstance(coords, list) and len(coords) >= 4: + page_coords[page_str] = list(coords[:4]) + return page_coords + + +def _find_pages_matching_signers( + answer: str, + signature_metadata: dict[str, list[Any]], + eligible_pages: set[str], +) -> list[str]: + """Return the pages whose signer names appear in ``answer``. + + Each name is matched as a whole token/phrase (case-insensitive, + word-boundary anchored) to avoid signer initials like ``"P S"`` + matching the gap between ``"Pradeep"`` and ``"Surukanti"`` inside + ``"Pradeep Surukanti"``. + """ + matched: list[str] = [] + for page_str, signatures in signature_metadata.items(): + if page_str not in eligible_pages or not signatures: + continue + for sig in signatures: + if not isinstance(sig, dict): + continue + name = (sig.get("name") or "").strip() + if not name: + continue + pattern = re.compile(r"\b" + re.escape(name) + r"\b", re.IGNORECASE) + if pattern.search(answer): + matched.append(page_str) + break + return matched + + +def _dedupe_coords( + matched_pages: list[str], + page_coords: dict[str, list[int]], +) -> list[list[int]]: + """Map matched pages to their coords, preserving order and dropping dups.""" + seen: set[tuple[int, ...]] = set() + new_coords: list[list[int]] = [] + for page_str in matched_pages: + coords = page_coords[page_str] + key = tuple(coords) + if key in seen: + continue + seen.add(key) + new_coords.append(coords) + return new_coords + + +def resolve_signature_highlight_coords( + answer: str, + signature_metadata: dict[str, list[Any]] | None, + signature_page_references: dict[str, Any] | None, +) -> list[list[int]]: + """Return the page coords that the LLM answer should highlight. + + Matching rules: + + - For each signer name in ``signature_metadata`` that appears as a + whole word/phrase (case-insensitive) inside ``answer``, the + corresponding page's coords are included. + - When no signer name matches but the answer mentions a generic + signature keyword (``signature``, ``signed``, ``signatory``, + ``signing``, ``executed``), every signature page's coords are + included as a fallback. + - Returns an empty list when there's nothing to attach. + + Returned coords are de-duplicated by content while preserving order. + """ + if not signature_page_references or not signature_metadata: + return [] + if not isinstance(answer, str) or not answer.strip(): + return [] + + page_coords = _build_page_coords(signature_page_references) + if not page_coords: + return [] + + matched_pages = _find_pages_matching_signers( + answer=answer, + signature_metadata=signature_metadata, + eligible_pages=set(page_coords.keys()), + ) + + if not matched_pages and any(kw in answer.lower() for kw in SIGNATURE_KEYWORDS): + matched_pages = list(page_coords.keys()) + + if not matched_pages: + return [] + + return _dedupe_coords(matched_pages, page_coords) + + +def merge_into_highlight_data( + metadata: dict[str, Any], + prompt_key: str, + new_coords: list[list[int]], + highlight_data_key: str = "highlight_data", +) -> None: + """Append signature coords to ``metadata[highlight_data_key][prompt_key]``. + + Skips duplicates against existing entries (e.g. those populated by + the hex-comment highlight pipeline). Mutates ``metadata`` in place. + """ + if not new_coords: + return + bucket = metadata.setdefault(highlight_data_key, {}) + existing = bucket.get(prompt_key) + if not isinstance(existing, list): + existing = [] + for coords in new_coords: + if coords not in existing: + existing.append(coords) + bucket[prompt_key] = existing diff --git a/workers/executor/executors/answer_prompt.py b/workers/executor/executors/answer_prompt.py index d1451c8111..6b781cad33 100644 --- a/workers/executor/executors/answer_prompt.py +++ b/workers/executor/executors/answer_prompt.py @@ -13,7 +13,6 @@ import ipaddress import logging import os -import re import socket from typing import Any from urllib.parse import urlparse @@ -21,6 +20,11 @@ from executor.executors.constants import PromptServiceConstants as PSKeys from executor.executors.exceptions import LegacyExecutorError, RateLimitError +from unstract.sdk1.utils.signature_highlights import ( + merge_into_highlight_data, + resolve_signature_highlight_coords, +) + logger = logging.getLogger(__name__) @@ -176,9 +180,7 @@ def construct_and_run_prompt( AnswerPromptService._attach_signature_highlights( answer=answer, signature_metadata=tool_settings.get(PSKeys.SIGNATURE_METADATA), - signature_page_references=tool_settings.get( - PSKeys.SIGNATURE_PAGE_REFERENCES - ), + signature_page_references=tool_settings.get(PSKeys.SIGNATURE_PAGE_REFERENCES), metadata=metadata, prompt_key=output[PSKeys.NAME], ) @@ -201,19 +203,6 @@ def _build_grammar_notes(grammar_list: list[dict[str, Any]]) -> str: ) return notes - # Generic signature-related terms used as a fallback trigger when the - # LLM answer doesn't mention any specific signer name but does talk - # about signing in general (e.g. "Is this signed?" → "Yes, the document - # is signed."). Matched as case-insensitive substrings. - _SIGNATURE_KEYWORDS = ( - "signature", - "signed", - "signatory", - "signatories", - "signing", - "executed", - ) - @staticmethod def _attach_signature_highlights( answer: str, @@ -225,91 +214,30 @@ def _attach_signature_highlights( """Attach signature page highlights to ``metadata`` when the LLM answer references a known signer or signatures generally. - - For each signer name in ``signature_metadata`` found as a - case-insensitive substring in ``answer``, append that page's - coords (from ``signature_page_references``) to - ``metadata[HIGHLIGHT_DATA][prompt_key]``. - - If no signer-name match is found but the answer mentions - generic signature keywords (signature, signed, signatory, - executed, signing), append every signature page's coords. - - ``metadata[HIGHLIGHT_DATA][prompt_key]`` is mutated in place; the - existing list (populated by hex-comment processing in - ``run_completion``) is preserved and extended. + Delegates the matching logic to + ``unstract.sdk1.utils.signature_highlights`` so workers and + prompt-service stay in sync. """ - if not signature_page_references or not signature_metadata: - return if metadata is None or not prompt_key: return - if not isinstance(answer, str) or not answer.strip(): - return - - # Build page → coords map (one coord array per signature page). - page_coords: dict[str, list[int]] = {} - for page_str, ref in signature_page_references.items(): - if not isinstance(ref, dict): - continue - coords = ref.get("coords") - if isinstance(coords, list) and len(coords) >= 4: - page_coords[page_str] = list(coords[:4]) - if not page_coords: - return - - answer_lower = answer.lower() - matched_pages: list[str] = [] - for page_str, signatures in signature_metadata.items(): - if page_str not in page_coords or not signatures: - continue - for sig in signatures: - if not isinstance(sig, dict): - continue - name = (sig.get("name") or "").strip() - if not name: - continue - # Word-boundary regex avoids false positives like - # signer "P S" matching the gap between "Pradeep" and - # "Surukanti" inside "Pradeep Surukanti". - pattern = re.compile( - r"\b" + re.escape(name) + r"\b", re.IGNORECASE - ) - if pattern.search(answer): - matched_pages.append(page_str) - break # one match per page is enough - - if not matched_pages: - # No specific signer matched — fall back to all signature pages - # when the answer talks about signing generically. - if any(kw in answer_lower for kw in AnswerPromptService._SIGNATURE_KEYWORDS): - matched_pages = list(page_coords.keys()) - - if not matched_pages: + new_coords = resolve_signature_highlight_coords( + answer=answer, + signature_metadata=signature_metadata, + signature_page_references=signature_page_references, + ) + if not new_coords: return - - seen: set[tuple[int, ...]] = set() - new_coords: list[list[int]] = [] - for page_str in matched_pages: - coords = page_coords[page_str] - key = tuple(coords) - if key in seen: - continue - seen.add(key) - new_coords.append(coords) - - bucket = metadata.setdefault(PSKeys.HIGHLIGHT_DATA, {}) - existing = bucket.get(prompt_key) - if not isinstance(existing, list): - existing = [] - # Avoid duplicating coords already present from hex-comment processing. - for coords in new_coords: - if coords not in existing: - existing.append(coords) - bucket[prompt_key] = existing + merge_into_highlight_data( + metadata=metadata, + prompt_key=prompt_key, + new_coords=new_coords, + highlight_data_key=PSKeys.HIGHLIGHT_DATA, + ) logger.info( "DOC_INSIGHTS attach_signature_highlights: prompt=%s, added %d " - "signature highlight(s) on pages %s", + "signature highlight(s)", prompt_key, len(new_coords), - matched_pages, ) @staticmethod diff --git a/workers/executor/executors/legacy_executor.py b/workers/executor/executors/legacy_executor.py index 4ddccaa5be..23ffa6a873 100644 --- a/workers/executor/executors/legacy_executor.py +++ b/workers/executor/executors/legacy_executor.py @@ -273,30 +273,24 @@ def _handle_extract(self, context: ExecutionContext) -> ExecutionResult: logger.info( "DOC_INSIGHTS _handle_extract: signature_metadata found " "for pages: %s", - list(process_response.extraction_metadata - .signature_metadata.keys()), + list(process_response.extraction_metadata.signature_metadata.keys()), ) if ( process_response.extraction_metadata and process_response.extraction_metadata.signature_page_references ): result_data["signature_page_references"] = ( - process_response.extraction_metadata - .signature_page_references + process_response.extraction_metadata.signature_page_references ) logger.info( - "DOC_INSIGHTS _handle_extract: " - "signature_page_references=%s", - process_response.extraction_metadata - .signature_page_references, + "DOC_INSIGHTS _handle_extract: " "signature_page_references=%s", + process_response.extraction_metadata.signature_page_references, ) self._write_signature_sidecar( fs=fs, output_file_path=output_file_path, signature_metadata=result_data.get("signature_metadata"), - signature_page_references=result_data.get( - "signature_page_references" - ), + signature_page_references=result_data.get("signature_page_references"), ) return ExecutionResult( success=True, @@ -321,7 +315,7 @@ def _signature_sidecar_path(output_file_path: str) -> str: Prompt Studio can recover signature data without re-extracting. """ p = Path(output_file_path) - return str(p.with_suffix("") ) + ".doc_insights.json" + return str(p.with_suffix("")) + ".doc_insights.json" @staticmethod def _write_signature_sidecar( diff --git a/workers/tests/test_answer_prompt.py b/workers/tests/test_answer_prompt.py index e7b534591a..708b720af7 100644 --- a/workers/tests/test_answer_prompt.py +++ b/workers/tests/test_answer_prompt.py @@ -11,7 +11,6 @@ from executor.executors.constants import ( PromptServiceConstants as PSKeys, ) - from unstract.sdk1.execution.context import ExecutionContext, Operation # --------------------------------------------------------------------------- @@ -109,7 +108,9 @@ def _mock_deps(llm=None): llm = _mock_llm() # AnswerPromptService — use the real class - from executor.executors.answer_prompt import AnswerPromptService as answer_prompt_svc_cls + from executor.executors.answer_prompt import ( + AnswerPromptService as answer_prompt_svc_cls, + ) retrieval_svc = MagicMock(name="RetrievalService") retrieval_svc.run_retrieval.return_value = ["chunk1", "chunk2"] From 4cfbc8c8a9c0c830e9a983c24905402fe6de23b3 Mon Sep 17 00:00:00 2001 From: pk-zipstack Date: Thu, 14 May 2026 17:49:53 +0530 Subject: [PATCH 09/12] [MISC] Reduce cognitive complexity flagged by SonarCloud - Extract _inject_signature_data_into_tool_settings helper in prompt_studio_helper.py; the 5 call sites now invoke it instead of inlining the if-blocks. Drops complexity of build_fetch_response_payload, build_bulk_fetch_response_payload, and dynamic_extractor below the gate (15). - Extract _capture_signature_data on LegacyExecutor for the signature_metadata / signature_page_references capture + sidecar write. _handle_extract is back below 15. - Split _find_pages_matching_signers in signature_highlights.py into a helper _any_signer_matches so the outer pages walk becomes a list comprehension. Complexity below 15. - Fix two implicit-string-concatenation log lines that ruff-format collapsed onto one line (S5799). No behaviour change. All 8 signature-highlight unit tests still pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../prompt_studio_helper.py | 62 +++++++++-------- .../prompt_service/services/extraction.py | 2 +- .../sdk1/utils/signature_highlights.py | 46 +++++++------ workers/executor/executors/legacy_executor.py | 69 +++++++++++-------- 4 files changed, 100 insertions(+), 79 deletions(-) diff --git a/backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py b/backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py index 7e2e5cae4e..6273f7f363 100644 --- a/backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py +++ b/backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py @@ -861,12 +861,9 @@ def build_fetch_response_payload( tool_settings[TSPKeys.WORD_CONFIDENCE_POSTAMBLE] = getattr( settings, TSPKeys.WORD_CONFIDENCE_POSTAMBLE.upper(), "" ) - if extract_result.signature_metadata: - tool_settings[TSPKeys.SIGNATURE_METADATA] = extract_result.signature_metadata - if extract_result.signature_page_references: - tool_settings[TSPKeys.SIGNATURE_PAGE_REFERENCES] = ( - extract_result.signature_page_references - ) + PromptStudioHelper._inject_signature_data_into_tool_settings( + tool_settings, extract_result + ) file_hash = fs_instance.get_hash_from_file(path=extract_path) @@ -1058,12 +1055,9 @@ def build_bulk_fetch_response_payload( tool_settings[TSPKeys.WORD_CONFIDENCE_POSTAMBLE] = getattr( settings, TSPKeys.WORD_CONFIDENCE_POSTAMBLE.upper(), "" ) - if extract_result.signature_metadata: - tool_settings[TSPKeys.SIGNATURE_METADATA] = extract_result.signature_metadata - if extract_result.signature_page_references: - tool_settings[TSPKeys.SIGNATURE_PAGE_REFERENCES] = ( - extract_result.signature_page_references - ) + PromptStudioHelper._inject_signature_data_into_tool_settings( + tool_settings, extract_result + ) file_hash = fs_instance.get_hash_from_file(path=extract_path) @@ -1203,12 +1197,9 @@ def build_single_pass_payload( or TSPKeys.SIMPLE, TSPKeys.SIMILARITY_TOP_K: default_profile.similarity_top_k, } - if extract_result.signature_metadata: - tool_settings[TSPKeys.SIGNATURE_METADATA] = extract_result.signature_metadata - if extract_result.signature_page_references: - tool_settings[TSPKeys.SIGNATURE_PAGE_REFERENCES] = ( - extract_result.signature_page_references - ) + PromptStudioHelper._inject_signature_data_into_tool_settings( + tool_settings, extract_result + ) lookup_configs = get_lookup_configs_for_tool(tool, prompts=prompts) if lookup_configs: @@ -2009,12 +2000,9 @@ def _fetch_response( tool_settings[TSPKeys.WORD_CONFIDENCE_POSTAMBLE] = getattr( settings, TSPKeys.WORD_CONFIDENCE_POSTAMBLE.upper(), "" ) - if extract_result.signature_metadata: - tool_settings[TSPKeys.SIGNATURE_METADATA] = extract_result.signature_metadata - if extract_result.signature_page_references: - tool_settings[TSPKeys.SIGNATURE_PAGE_REFERENCES] = ( - extract_result.signature_page_references - ) + PromptStudioHelper._inject_signature_data_into_tool_settings( + tool_settings, extract_result + ) file_hash = fs_instance.get_hash_from_file(path=doc_path) payload = { @@ -2314,12 +2302,9 @@ def _fetch_single_pass_response( default_profile.retrieval_strategy or TSPKeys.SIMPLE ) tool_settings[TSPKeys.SIMILARITY_TOP_K] = default_profile.similarity_top_k - if extract_result.signature_metadata: - tool_settings[TSPKeys.SIGNATURE_METADATA] = extract_result.signature_metadata - if extract_result.signature_page_references: - tool_settings[TSPKeys.SIGNATURE_PAGE_REFERENCES] = ( - extract_result.signature_page_references - ) + PromptStudioHelper._inject_signature_data_into_tool_settings( + tool_settings, extract_result + ) for prompt in prompts: if not prompt.prompt: raise EmptyPromptError() @@ -2379,6 +2364,23 @@ def get_tool_from_tool_id(tool_id: str) -> CustomTool | None: except CustomTool.DoesNotExist: return None + @staticmethod + def _inject_signature_data_into_tool_settings( + tool_settings: dict[str, Any], + extract_result: "ExtractResult", + ) -> None: + """Inject ``signature_metadata`` / ``signature_page_references`` + from the extract result into ``tool_settings`` (mutated in place). + + No-op when document_insights mode produced no signature data. + """ + if extract_result.signature_metadata: + tool_settings[TSPKeys.SIGNATURE_METADATA] = extract_result.signature_metadata + if extract_result.signature_page_references: + tool_settings[TSPKeys.SIGNATURE_PAGE_REFERENCES] = ( + extract_result.signature_page_references + ) + @staticmethod def _signature_sidecar_path(extract_file_path: str) -> str: p = Path(extract_file_path) diff --git a/prompt-service/src/unstract/prompt_service/services/extraction.py b/prompt-service/src/unstract/prompt_service/services/extraction.py index 8b252e8b55..f431a33648 100644 --- a/prompt-service/src/unstract/prompt_service/services/extraction.py +++ b/prompt-service/src/unstract/prompt_service/services/extraction.py @@ -78,7 +78,7 @@ def perform_extraction( process_response.extraction_metadata.signature_metadata ) logger.info( - "DOC_INSIGHTS extraction: signature_metadata found " "for pages: %s", + "DOC_INSIGHTS extraction: signature_metadata found for pages: %s", list(signature_metadata.keys()), ) if ( diff --git a/unstract/sdk1/src/unstract/sdk1/utils/signature_highlights.py b/unstract/sdk1/src/unstract/sdk1/utils/signature_highlights.py index 9736754ca9..ac140bd649 100644 --- a/unstract/sdk1/src/unstract/sdk1/utils/signature_highlights.py +++ b/unstract/sdk1/src/unstract/sdk1/utils/signature_highlights.py @@ -43,33 +43,39 @@ def _build_page_coords( return page_coords -def _find_pages_matching_signers( - answer: str, - signature_metadata: dict[str, list[Any]], - eligible_pages: set[str], -) -> list[str]: - """Return the pages whose signer names appear in ``answer``. +def _any_signer_matches(signatures: list[Any], answer: str) -> bool: + """Return True if any signer name in ``signatures`` appears in ``answer``. Each name is matched as a whole token/phrase (case-insensitive, word-boundary anchored) to avoid signer initials like ``"P S"`` matching the gap between ``"Pradeep"`` and ``"Surukanti"`` inside ``"Pradeep Surukanti"``. """ - matched: list[str] = [] - for page_str, signatures in signature_metadata.items(): - if page_str not in eligible_pages or not signatures: + for sig in signatures: + if not isinstance(sig, dict): + continue + name = (sig.get("name") or "").strip() + if not name: continue - for sig in signatures: - if not isinstance(sig, dict): - continue - name = (sig.get("name") or "").strip() - if not name: - continue - pattern = re.compile(r"\b" + re.escape(name) + r"\b", re.IGNORECASE) - if pattern.search(answer): - matched.append(page_str) - break - return matched + pattern = re.compile(r"\b" + re.escape(name) + r"\b", re.IGNORECASE) + if pattern.search(answer): + return True + return False + + +def _find_pages_matching_signers( + answer: str, + signature_metadata: dict[str, list[Any]], + eligible_pages: set[str], +) -> list[str]: + """Return the pages whose signer names appear in ``answer``.""" + return [ + page_str + for page_str, signatures in signature_metadata.items() + if page_str in eligible_pages + and signatures + and _any_signer_matches(signatures, answer) + ] def _dedupe_coords( diff --git a/workers/executor/executors/legacy_executor.py b/workers/executor/executors/legacy_executor.py index 1d00daad26..9119850e02 100644 --- a/workers/executor/executors/legacy_executor.py +++ b/workers/executor/executors/legacy_executor.py @@ -305,36 +305,12 @@ def _handle_extract(self, context: ExecutionContext) -> ExecutionResult: result_data["highlight_metadata"] = ( process_response.extraction_metadata.line_metadata ) - # Include signature metadata when available - # (from document_insights mode) - if ( - process_response.extraction_metadata - and process_response.extraction_metadata.signature_metadata - ): - result_data["signature_metadata"] = ( - process_response.extraction_metadata.signature_metadata - ) - logger.info( - "DOC_INSIGHTS _handle_extract: signature_metadata found " - "for pages: %s", - list(process_response.extraction_metadata.signature_metadata.keys()), - ) - if ( - process_response.extraction_metadata - and process_response.extraction_metadata.signature_page_references - ): - result_data["signature_page_references"] = ( - process_response.extraction_metadata.signature_page_references - ) - logger.info( - "DOC_INSIGHTS _handle_extract: " "signature_page_references=%s", - process_response.extraction_metadata.signature_page_references, - ) - self._write_signature_sidecar( + # Include signature metadata when available (document_insights mode) + self._capture_signature_data( fs=fs, output_file_path=output_file_path, - signature_metadata=result_data.get("signature_metadata"), - signature_page_references=result_data.get("signature_page_references"), + process_response=process_response, + result_data=result_data, ) return ExecutionResult( success=True, @@ -351,6 +327,43 @@ def _handle_extract(self, context: ExecutionContext) -> ExecutionResult: msg = f"Error from text extractor '{name}'. {e}" raise ExtractionError(message=msg) from e + def _capture_signature_data( + self, + fs: Any, + output_file_path: str | None, + process_response: TextExtractionResult, + result_data: dict[str, Any], + ) -> None: + """Move document_insights signature fields onto the result dict and + persist them in a sidecar JSON next to the extracted text file. + + No-op when the adapter did not produce signature data (e.g. + non-LLMWhisperer-V2 adapters or modes other than ``document_insights``). + """ + extraction_metadata = process_response.extraction_metadata + if not extraction_metadata: + return + signature_metadata = extraction_metadata.signature_metadata + signature_page_references = extraction_metadata.signature_page_references + if signature_metadata: + result_data["signature_metadata"] = signature_metadata + logger.info( + "DOC_INSIGHTS _handle_extract: signature_metadata found for " "pages: %s", + list(signature_metadata.keys()), + ) + if signature_page_references: + result_data["signature_page_references"] = signature_page_references + logger.info( + "DOC_INSIGHTS _handle_extract: signature_page_references=%s", + signature_page_references, + ) + self._write_signature_sidecar( + fs=fs, + output_file_path=output_file_path, + signature_metadata=signature_metadata, + signature_page_references=signature_page_references, + ) + @staticmethod def _signature_sidecar_path(output_file_path: str) -> str: """Sidecar JSON for document_insights signature data. From fd3e2002db9c8bfa8e30550d3155a352ce14ccef Mon Sep 17 00:00:00 2001 From: pk-zipstack Date: Thu, 14 May 2026 17:59:45 +0530 Subject: [PATCH 10/12] [MISC] Final SonarCloud cleanup: exception logging, complexity, string concat - Use logger.exception() in three except blocks in prompt_studio_helper.py (S8572) so the traceback is always captured. - Extract _log_signature_capture from dynamic_extractor to bring its cognitive complexity below 15 (S3776). - Merge a string-pair that ruff-format collapsed onto one line in legacy_executor.py (S5799). No behaviour change. All 8 signature-highlight unit tests still pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../prompt_studio_helper.py | 36 ++++++++++++------- workers/executor/executors/legacy_executor.py | 2 +- 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py b/backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py index 6273f7f363..5b54bc58be 100644 --- a/backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py +++ b/backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py @@ -1651,7 +1651,7 @@ def _execute_single_prompt( # Validation responses are user-facing; DRF renders them as-is. raise except Exception as e: - logger.error( + logger.exception( f"[{tool.tool_id}] Error while fetching response for " f"prompt {id} and doc {document_id}: {e}" ) @@ -1719,7 +1719,7 @@ def _execute_prompts_in_single_pass( # Validation responses are user-facing; DRF renders them as-is. raise except Exception as e: - logger.error( + logger.exception( f"[{tool.tool_id}] Error while fetching single pass response: {e}" ) PromptStudioHelper._publish_log( @@ -2213,7 +2213,7 @@ def dynamic_indexer( msg = e.actual_err.response.json().get("error", str(e)) msg = f"Error while indexing '{filename}'. {msg}" - logger.error(msg, stack_info=True, exc_info=True) + logger.exception(msg, stack_info=True) PromptStudioHelper._publish_log( {"tool_id": tool_id, "run_id": run_id, "doc_name": filename}, LogLevels.ERROR, @@ -2364,6 +2364,23 @@ def get_tool_from_tool_id(tool_id: str) -> CustomTool | None: except CustomTool.DoesNotExist: return None + @staticmethod + def _log_signature_capture( + signature_metadata: dict[str, Any] | None, + signature_page_references: dict[str, Any] | None, + document_id: str, + ) -> None: + """Log signature data capture from a fresh extract dispatch.""" + if not (signature_metadata or signature_page_references): + return + logger.info( + "DOC_INSIGHTS dynamic_extractor: captured signature data " + "(pages=%s, refs=%s) for document %s", + list(signature_metadata.keys()) if signature_metadata else [], + list(signature_page_references.keys()) if signature_page_references else [], + document_id, + ) + @staticmethod def _inject_signature_data_into_tool_settings( tool_settings: dict[str, Any], @@ -2526,16 +2543,9 @@ def dynamic_extractor( extracted_text = result.data.get("extracted_text", "") signature_metadata = result.data.get("signature_metadata") signature_page_references = result.data.get("signature_page_references") - if signature_metadata or signature_page_references: - logger.info( - "DOC_INSIGHTS dynamic_extractor: captured signature data " - "(pages=%s, refs=%s) for document %s", - list(signature_metadata.keys()) if signature_metadata else [], - list(signature_page_references.keys()) - if signature_page_references - else [], - document_id, - ) + PromptStudioHelper._log_signature_capture( + signature_metadata, signature_page_references, document_id + ) success = PromptStudioIndexHelper.mark_extraction_status( document_id=document_id, profile_manager=profile_manager, diff --git a/workers/executor/executors/legacy_executor.py b/workers/executor/executors/legacy_executor.py index 9119850e02..3e571b66ec 100644 --- a/workers/executor/executors/legacy_executor.py +++ b/workers/executor/executors/legacy_executor.py @@ -348,7 +348,7 @@ def _capture_signature_data( if signature_metadata: result_data["signature_metadata"] = signature_metadata logger.info( - "DOC_INSIGHTS _handle_extract: signature_metadata found for " "pages: %s", + "DOC_INSIGHTS _handle_extract: signature_metadata found for pages: %s", list(signature_metadata.keys()), ) if signature_page_references: From 108dddec9356a629699a3b125504de0ddd27f6e8 Mon Sep 17 00:00:00 2001 From: pk-zipstack Date: Thu, 14 May 2026 18:10:48 +0530 Subject: [PATCH 11/12] [MISC] Move signature LLM-context formatter into shared SDK helper Cuts the SonarCloud duplication metric: the formatter that turns ``signature_metadata`` into the ``[Document Signature Information]`` context block was the same code in both workers and prompt-service. Now both services import ``format_signature_metadata_context`` from ``unstract.sdk1.utils.signature_highlights``. No behaviour change. All 8 signature-highlight unit tests still pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../prompt_service/services/answer_prompt.py | 35 +-------------- .../sdk1/utils/signature_highlights.py | 34 +++++++++++++++ workers/executor/executors/answer_prompt.py | 43 +------------------ 3 files changed, 38 insertions(+), 74 deletions(-) diff --git a/prompt-service/src/unstract/prompt_service/services/answer_prompt.py b/prompt-service/src/unstract/prompt_service/services/answer_prompt.py index 49a1009a67..be620238ac 100644 --- a/prompt-service/src/unstract/prompt_service/services/answer_prompt.py +++ b/prompt-service/src/unstract/prompt_service/services/answer_prompt.py @@ -25,6 +25,7 @@ from unstract.sdk1.file_storage.env_helper import EnvHelper from unstract.sdk1.llm import LLM from unstract.sdk1.utils.signature_highlights import ( + format_signature_metadata_context, merge_into_highlight_data, resolve_signature_highlight_coords, ) @@ -205,36 +206,6 @@ def _attach_signature_highlights( len(new_coords), ) - @staticmethod - def _format_signature_metadata( - signature_metadata: dict[str, list[Any]], - ) -> str: - """Format signature metadata as a human-readable context block.""" - lines: list[str] = [] - for page_num, signatures in sorted( - signature_metadata.items(), key=lambda x: int(x[0]) - ): - if not signatures: - continue - for sig in signatures: - name = sig.get("name", "Unknown") - sig_type = sig.get("type", "signature") - desc = sig.get("desc", "") - page_display = int(page_num) + 1 # 0-indexed to 1-indexed - entry = f"- Page {page_display}: {name} ({sig_type})" - if desc: - entry += f" — {desc}" - lines.append(entry) - if not lines: - return "" - header = ( - "\n\n[Document Signature Information]\n" - "The following signatures were detected in this document. " - "Use this information to answer any questions about signatories, " - "signing parties, or document execution status.\n" - ) - return header + "\n".join(lines) - @staticmethod def construct_prompt( preamble: str, @@ -279,9 +250,7 @@ def construct_prompt( "for %d page(s)", len(signature_metadata), ) - signature_context = AnswerPromptService._format_signature_metadata( - signature_metadata - ) + signature_context = format_signature_metadata_context(signature_metadata) app.logger.debug( "DOC_INSIGHTS construct_prompt: signature_context=%s", signature_context[:200] if signature_context else "empty", diff --git a/unstract/sdk1/src/unstract/sdk1/utils/signature_highlights.py b/unstract/sdk1/src/unstract/sdk1/utils/signature_highlights.py index ac140bd649..cf29292ac5 100644 --- a/unstract/sdk1/src/unstract/sdk1/utils/signature_highlights.py +++ b/unstract/sdk1/src/unstract/sdk1/utils/signature_highlights.py @@ -139,6 +139,40 @@ def resolve_signature_highlight_coords( return _dedupe_coords(matched_pages, page_coords) +def format_signature_metadata_context( + signature_metadata: dict[str, list[Any]], +) -> str: + """Format ``signature_metadata`` as a human-readable LLM context block. + + Returns an empty string when no signatures are present. Page numbers + are converted from 0-indexed to 1-indexed for display. + """ + lines: list[str] = [] + for page_num, signatures in sorted( + signature_metadata.items(), key=lambda x: int(x[0]) + ): + if not signatures: + continue + for sig in signatures: + name = sig.get("name", "Unknown") + sig_type = sig.get("type", "signature") + desc = sig.get("desc", "") + page_display = int(page_num) + 1 # 0-indexed → 1-indexed + entry = f"- Page {page_display}: {name} ({sig_type})" + if desc: + entry += f" — {desc}" + lines.append(entry) + if not lines: + return "" + header = ( + "\n\n[Document Signature Information]\n" + "The following signatures were detected in this document. " + "Use this information to answer any questions about signatories, " + "signing parties, or document execution status.\n" + ) + return header + "\n".join(lines) + + def merge_into_highlight_data( metadata: dict[str, Any], prompt_key: str, diff --git a/workers/executor/executors/answer_prompt.py b/workers/executor/executors/answer_prompt.py index 06c66934f9..8bfc00bf1a 100644 --- a/workers/executor/executors/answer_prompt.py +++ b/workers/executor/executors/answer_prompt.py @@ -21,6 +21,7 @@ from executor.executors.exceptions import LegacyExecutorError, RateLimitError from unstract.sdk1.utils.signature_highlights import ( + format_signature_metadata_context, merge_into_highlight_data, resolve_signature_highlight_coords, ) @@ -240,44 +241,6 @@ def _attach_signature_highlights( len(new_coords), ) - @staticmethod - def _format_signature_metadata( - signature_metadata: dict[str, list[Any]], - ) -> str: - """Format signature metadata as a human-readable context block. - - Args: - signature_metadata: Dict keyed by page number (str) with lists - of signature entries, each having 'type', 'name', 'desc'. - - Returns: - Formatted string for LLM context injection. - """ - lines: list[str] = [] - for page_num, signatures in sorted( - signature_metadata.items(), key=lambda x: int(x[0]) - ): - if not signatures: - continue - for sig in signatures: - name = sig.get("name", "Unknown") - sig_type = sig.get("type", "signature") - desc = sig.get("desc", "") - page_display = int(page_num) + 1 # 0-indexed to 1-indexed - entry = f"- Page {page_display}: {name} ({sig_type})" - if desc: - entry += f" — {desc}" - lines.append(entry) - if not lines: - return "" - header = ( - "\n\n[Document Signature Information]\n" - "The following signatures were detected in this document. " - "Use this information to answer any questions about signatories, " - "signing parties, or document execution status.\n" - ) - return header + "\n".join(lines) - @staticmethod def construct_prompt( preamble: str, @@ -310,9 +273,7 @@ def construct_prompt( "for %d page(s)", len(signature_metadata), ) - signature_context = AnswerPromptService._format_signature_metadata( - signature_metadata - ) + signature_context = format_signature_metadata_context(signature_metadata) logger.debug( "DOC_INSIGHTS construct_prompt: signature_context=%s", signature_context[:200] if signature_context else "empty", From 91fd22f40739024686634cfe7685739eddcd58dc Mon Sep 17 00:00:00 2001 From: pk-zipstack Date: Thu, 14 May 2026 18:19:56 +0530 Subject: [PATCH 12/12] [MISC] Extract SSRF webhook-URL helper into shared SDK module Workers and prompt-service both had a ~50-line ``_is_safe_public_url`` helper for SSRF protection on postprocessing webhook URLs. Moved it to ``unstract.sdk1.utils.url_safety.is_safe_public_url`` and import from there. Brings the SonarCloud new-code duplication metric down further without touching behaviour. No behaviour change. All 8 signature-highlight unit tests still pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../prompt_service/services/answer_prompt.py | 58 +--------------- .../src/unstract/sdk1/utils/url_safety.py | 68 +++++++++++++++++++ workers/executor/executors/answer_prompt.py | 59 +--------------- 3 files changed, 72 insertions(+), 113 deletions(-) create mode 100644 unstract/sdk1/src/unstract/sdk1/utils/url_safety.py diff --git a/prompt-service/src/unstract/prompt_service/services/answer_prompt.py b/prompt-service/src/unstract/prompt_service/services/answer_prompt.py index be620238ac..62503f1890 100644 --- a/prompt-service/src/unstract/prompt_service/services/answer_prompt.py +++ b/prompt-service/src/unstract/prompt_service/services/answer_prompt.py @@ -1,8 +1,5 @@ -import ipaddress -import socket from logging import Logger from typing import Any -from urllib.parse import urlparse from flask import current_app as app @@ -29,58 +26,7 @@ merge_into_highlight_data, resolve_signature_highlight_coords, ) - - -def _is_safe_public_url(url: str) -> bool: - """Validate webhook URL for SSRF protection. - - Only allows HTTPS and blocks private/loopback/internal addresses. - Resolves all DNS records (A/AAAA) to prevent DNS rebinding attacks. - """ - try: - p = urlparse(url) - if p.scheme not in ("https",): # Only allow HTTPS for security - return False - host = p.hostname or "" - # Block obvious local hosts - if host in ("localhost",): - return False - - addrs: set[str] = set() - # If literal IP, validate directly; else resolve all records (A/AAAA) - try: - ipaddress.ip_address(host) - addrs.add(host) - except ValueError: - try: - for family, _type, _proto, _canonname, sockaddr in socket.getaddrinfo( - host, None, type=socket.SOCK_STREAM - ): - addr = sockaddr[0] - addrs.add(addr) - except Exception: - return False - - if not addrs: - return False - - # Validate all resolved addresses - for addr in addrs: - try: - ip = ipaddress.ip_address(addr) - except ValueError: - return False - if ( - ip.is_private - or ip.is_loopback - or ip.is_link_local - or ip.is_reserved - or ip.is_multicast - ): - return False - return True - except Exception: - return False +from unstract.sdk1.utils.url_safety import is_safe_public_url class AnswerPromptService: @@ -498,7 +444,7 @@ def handle_json( app.logger.warning( "Postprocessing webhook enabled but URL missing; skipping." ) - elif not _is_safe_public_url(webhook_url): + elif not is_safe_public_url(webhook_url): app.logger.warning( "Postprocessing webhook URL is not allowed; skipping." ) diff --git a/unstract/sdk1/src/unstract/sdk1/utils/url_safety.py b/unstract/sdk1/src/unstract/sdk1/utils/url_safety.py new file mode 100644 index 0000000000..40257399cf --- /dev/null +++ b/unstract/sdk1/src/unstract/sdk1/utils/url_safety.py @@ -0,0 +1,68 @@ +"""URL safety helpers (SSRF protection). + +Shared between the workers executor and the prompt-service answer-prompt +service because both need to validate webhook URLs before issuing +postprocessing callbacks. +""" + +from __future__ import annotations + +import ipaddress +import socket +from urllib.parse import urlparse + + +def _resolve_host_addresses(host: str) -> set[str]: + """Resolve a hostname or IP string to a set of IP address strings.""" + try: + ipaddress.ip_address(host) + return {host} + except ValueError: + pass + try: + return { + sockaddr[0] + for _family, _type, _proto, _canonname, sockaddr in socket.getaddrinfo( + host, None, type=socket.SOCK_STREAM + ) + } + except Exception: + return set() + + +def is_safe_public_url(url: str) -> bool: + """Validate a URL for use as an outbound webhook target (SSRF protection). + + Only HTTPS URLs are allowed, and the resolved host must not point to + a private, loopback, link-local, reserved, or multicast address. + All DNS records (A/AAAA) are resolved to prevent DNS rebinding + attacks. + """ + try: + p = urlparse(url) + if p.scheme not in ("https",): # only HTTPS + return False + host = p.hostname or "" + if host == "localhost": + return False + + addrs = _resolve_host_addresses(host) + if not addrs: + return False + + for addr in addrs: + try: + ip = ipaddress.ip_address(addr) + except ValueError: + return False + if ( + ip.is_private + or ip.is_loopback + or ip.is_link_local + or ip.is_reserved + or ip.is_multicast + ): + return False + return True + except Exception: + return False diff --git a/workers/executor/executors/answer_prompt.py b/workers/executor/executors/answer_prompt.py index 8bfc00bf1a..510bfbbc9b 100644 --- a/workers/executor/executors/answer_prompt.py +++ b/workers/executor/executors/answer_prompt.py @@ -10,12 +10,9 @@ are integrated at the caller level (LegacyExecutor). """ -import ipaddress import logging import os -import socket from typing import Any -from urllib.parse import urlparse from executor.executors.constants import PromptServiceConstants as PSKeys from executor.executors.exceptions import LegacyExecutorError, RateLimitError @@ -25,63 +22,11 @@ merge_into_highlight_data, resolve_signature_highlight_coords, ) +from unstract.sdk1.utils.url_safety import is_safe_public_url logger = logging.getLogger(__name__) -def _resolve_host_addresses(host: str) -> set[str]: - """Resolve a hostname or IP string to a set of IP address strings.""" - try: - ipaddress.ip_address(host) - return {host} - except ValueError: - pass - try: - return { - sockaddr[0] - for _family, _type, _proto, _canonname, sockaddr in socket.getaddrinfo( - host, None, type=socket.SOCK_STREAM - ) - } - except Exception: - return set() - - -def _is_safe_public_url(url: str) -> bool: - """Validate webhook URL for SSRF protection. - - Only allows HTTPS and blocks private/loopback/internal addresses. - """ - try: - p = urlparse(url) - if p.scheme not in ("https",): - return False - host = p.hostname or "" - if host in ("localhost",): - return False - - addrs = _resolve_host_addresses(host) - if not addrs: - return False - - for addr in addrs: - try: - ip = ipaddress.ip_address(addr) - except ValueError: - return False - if ( - ip.is_private - or ip.is_loopback - or ip.is_link_local - or ip.is_reserved - or ip.is_multicast - ): - return False - return True - except Exception: - return False - - class AnswerPromptService: @staticmethod def extract_variable( @@ -359,7 +304,7 @@ def _run_webhook_postprocess( if not webhook_url: logger.warning("Postprocessing webhook enabled but URL missing; skipping.") return parsed_data, None - if not _is_safe_public_url(webhook_url): + if not is_safe_public_url(webhook_url): logger.warning("Postprocessing webhook URL is not allowed; skipping.") return parsed_data, None try: