diff --git a/backend/prompt_studio/prompt_studio_core_v2/constants.py b/backend/prompt_studio/prompt_studio_core_v2/constants.py index 03bd68c1d8..f927708c20 100644 --- a/backend/prompt_studio/prompt_studio_core_v2/constants.py +++ b/backend/prompt_studio/prompt_studio_core_v2/constants.py @@ -105,6 +105,8 @@ class ToolStudioPromptKeys: EXECUTION_SOURCE = "execution_source" LINE_ITEM = "line-item" CUSTOM_DATA = "custom_data" + SIGNATURE_METADATA = "signature_metadata" + SIGNATURE_PAGE_REFERENCES = "signature_page_references" # Webhook postprocessing settings ENABLE_POSTPROCESSING_WEBHOOK = "enable_postprocessing_webhook" POSTPROCESSING_WEBHOOK_URL = "postprocessing_webhook_url" diff --git a/backend/prompt_studio/prompt_studio_core_v2/internal_views.py b/backend/prompt_studio/prompt_studio_core_v2/internal_views.py index 3ad3a5db16..8c3391f838 100644 --- a/backend/prompt_studio/prompt_studio_core_v2/internal_views.py +++ b/backend/prompt_studio/prompt_studio_core_v2/internal_views.py @@ -244,11 +244,13 @@ def indexing_status(request): user_id = data.get("user_id", "") doc_id_key = data.get("doc_id_key", "") - if not action or not org_id or not user_id or not doc_id_key: + # user_id may be empty (e.g. mock auth users) - it's only used as a + # Redis cache key fragment, so empty is acceptable. 
+ if not action or not org_id or not doc_id_key: return JsonResponse( { "success": False, - "error": "action, org_id, user_id, doc_id_key are required", + "error": "action, org_id, doc_id_key are required", }, status=status.HTTP_400_BAD_REQUEST, ) diff --git a/backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py b/backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py index 62cbf2d9a2..5b54bc58be 100644 --- a/backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py +++ b/backend/prompt_studio/prompt_studio_core_v2/prompt_studio_helper.py @@ -4,7 +4,7 @@ import time import uuid from pathlib import Path -from typing import Any +from typing import Any, NamedTuple from account_v2.constants import Common from account_v2.models import User @@ -85,7 +85,20 @@ CHOICES_JSON = "/static/select_choices.json" ERROR_MSG = "User %s doesn't have access to adapter %s" -logger = logging.getLogger(__name__) + +class ExtractResult(NamedTuple): + """Return value of ``PromptStudioHelper.dynamic_extractor``. + + ``signature_metadata`` and ``signature_page_references`` are populated + only when the x2text adapter is LLMWhisperer V2 in ``document_insights`` + mode and the document contains signatures. They are read either from + the live extract dispatch result (cache miss) or from the on-disk + ``.doc_insights.json`` sidecar (cache hit). 
+ """ + + text: str + signature_metadata: dict[str, Any] | None = None + signature_page_references: dict[str, Any] | None = None class PromptStudioHelper: @@ -742,7 +755,7 @@ def build_fetch_response_payload( ) # Extract (blocking, usually cached) - extracted_text = PromptStudioHelper.dynamic_extractor( + extract_result = PromptStudioHelper.dynamic_extractor( profile_manager=profile_manager, file_path=file_path, org_id=org_id, @@ -750,6 +763,7 @@ def build_fetch_response_payload( run_id=run_id, enable_highlight=tool.enable_highlight, ) + extracted_text = extract_result.text is_summary = tool.summarize_as_source if is_summary: @@ -847,6 +861,9 @@ def build_fetch_response_payload( tool_settings[TSPKeys.WORD_CONFIDENCE_POSTAMBLE] = getattr( settings, TSPKeys.WORD_CONFIDENCE_POSTAMBLE.upper(), "" ) + PromptStudioHelper._inject_signature_data_into_tool_settings( + tool_settings, extract_result + ) file_hash = fs_instance.get_hash_from_file(path=extract_path) @@ -962,7 +979,7 @@ def build_bulk_fetch_response_payload( ) # Extract ONCE (blocking, usually cached) - extracted_text = PromptStudioHelper.dynamic_extractor( + extract_result = PromptStudioHelper.dynamic_extractor( profile_manager=profile_manager, file_path=file_path, org_id=org_id, @@ -970,6 +987,7 @@ def build_bulk_fetch_response_payload( run_id=run_id, enable_highlight=tool.enable_highlight, ) + extracted_text = extract_result.text is_summary = tool.summarize_as_source if is_summary: @@ -1037,6 +1055,9 @@ def build_bulk_fetch_response_payload( tool_settings[TSPKeys.WORD_CONFIDENCE_POSTAMBLE] = getattr( settings, TSPKeys.WORD_CONFIDENCE_POSTAMBLE.upper(), "" ) + PromptStudioHelper._inject_signature_data_into_tool_settings( + tool_settings, extract_result + ) file_hash = fs_instance.get_hash_from_file(path=extract_path) @@ -1137,7 +1158,7 @@ def build_single_pass_payload( ) # Extract (blocking, usually cached) - PromptStudioHelper.dynamic_extractor( + extract_result = PromptStudioHelper.dynamic_extractor( 
profile_manager=default_profile, file_path=doc_path, org_id=org_id, @@ -1176,6 +1197,9 @@ def build_single_pass_payload( or TSPKeys.SIMPLE, TSPKeys.SIMILARITY_TOP_K: default_profile.similarity_top_k, } + PromptStudioHelper._inject_signature_data_into_tool_settings( + tool_settings, extract_result + ) lookup_configs = get_lookup_configs_for_tool(tool, prompts=prompts) if lookup_configs: @@ -1381,7 +1405,7 @@ def index_document( tool=util, ) - extracted_text = PromptStudioHelper.dynamic_extractor( + extract_result = PromptStudioHelper.dynamic_extractor( profile_manager=default_profile, file_path=file_path, org_id=org_id, @@ -1389,6 +1413,7 @@ def index_document( run_id=run_id, enable_highlight=tool.enable_highlight, ) + extracted_text = extract_result.text if tool.summarize_context: summarize_file_path = PromptStudioHelper.summarize( file_name, org_id, run_id, tool @@ -1626,7 +1651,7 @@ def _execute_single_prompt( # Validation responses are user-facing; DRF renders them as-is. raise except Exception as e: - logger.error( + logger.exception( f"[{tool.tool_id}] Error while fetching response for " f"prompt {id} and doc {document_id}: {e}" ) @@ -1694,7 +1719,7 @@ def _execute_prompts_in_single_pass( # Validation responses are user-facing; DRF renders them as-is. 
raise except Exception as e: - logger.error( + logger.exception( f"[{tool.tool_id}] Error while fetching single pass response: {e}" ) PromptStudioHelper._publish_log( @@ -1856,7 +1881,7 @@ def _fetch_response( tool=util, ) logger.info(f"Extracting text from {file_path} for {doc_id}") - extracted_text = PromptStudioHelper.dynamic_extractor( + extract_result = PromptStudioHelper.dynamic_extractor( profile_manager=profile_manager, file_path=file_path, org_id=org_id, @@ -1864,6 +1889,7 @@ def _fetch_response( run_id=run_id, enable_highlight=tool.enable_highlight, ) + extracted_text = extract_result.text logger.info(f"Extracted text from {file_path} for {doc_id}") if is_summary: profile_manager.chunk_size = 0 @@ -1974,6 +2000,9 @@ def _fetch_response( tool_settings[TSPKeys.WORD_CONFIDENCE_POSTAMBLE] = getattr( settings, TSPKeys.WORD_CONFIDENCE_POSTAMBLE.upper(), "" ) + PromptStudioHelper._inject_signature_data_into_tool_settings( + tool_settings, extract_result + ) file_hash = fs_instance.get_hash_from_file(path=doc_path) payload = { @@ -2184,7 +2213,7 @@ def dynamic_indexer( msg = e.actual_err.response.json().get("error", str(e)) msg = f"Error while indexing '{filename}'. 
{msg}" - logger.error(msg, stack_info=True, exc_info=True) + logger.exception(msg, stack_info=True) PromptStudioHelper._publish_log( {"tool_id": tool_id, "run_id": run_id, "doc_name": filename}, LogLevels.ERROR, @@ -2235,7 +2264,7 @@ def _fetch_single_pass_response( file_path = os.path.join( directory, "extract", os.path.splitext(filename)[0] + ".txt" ) - PromptStudioHelper.dynamic_extractor( + extract_result = PromptStudioHelper.dynamic_extractor( profile_manager=default_profile, file_path=input_file_path, org_id=org_id, @@ -2273,6 +2302,9 @@ def _fetch_single_pass_response( default_profile.retrieval_strategy or TSPKeys.SIMPLE ) tool_settings[TSPKeys.SIMILARITY_TOP_K] = default_profile.similarity_top_k + PromptStudioHelper._inject_signature_data_into_tool_settings( + tool_settings, extract_result + ) for prompt in prompts: if not prompt.prompt: raise EmptyPromptError() @@ -2332,6 +2364,83 @@ def get_tool_from_tool_id(tool_id: str) -> CustomTool | None: except CustomTool.DoesNotExist: return None + @staticmethod + def _log_signature_capture( + signature_metadata: dict[str, Any] | None, + signature_page_references: dict[str, Any] | None, + document_id: str, + ) -> None: + """Log signature data capture from a fresh extract dispatch.""" + if not (signature_metadata or signature_page_references): + return + logger.info( + "DOC_INSIGHTS dynamic_extractor: captured signature data " + "(pages=%s, refs=%s) for document %s", + list(signature_metadata.keys()) if signature_metadata else [], + list(signature_page_references.keys()) if signature_page_references else [], + document_id, + ) + + @staticmethod + def _inject_signature_data_into_tool_settings( + tool_settings: dict[str, Any], + extract_result: "ExtractResult", + ) -> None: + """Inject ``signature_metadata`` / ``signature_page_references`` + from the extract result into ``tool_settings`` (mutated in place). + + No-op when document_insights mode produced no signature data. 
+ """ + if extract_result.signature_metadata: + tool_settings[TSPKeys.SIGNATURE_METADATA] = extract_result.signature_metadata + if extract_result.signature_page_references: + tool_settings[TSPKeys.SIGNATURE_PAGE_REFERENCES] = ( + extract_result.signature_page_references + ) + + @staticmethod + def _signature_sidecar_path(extract_file_path: str) -> str: + p = Path(extract_file_path) + return str(p.with_suffix("")) + ".doc_insights.json" + + @staticmethod + def _load_signature_sidecar( + extract_file_path: str, + fs_instance: Any, + ) -> tuple[dict[str, Any] | None, dict[str, Any] | None]: + """Return ``(signature_metadata, signature_page_references)`` from the + sidecar, or ``(None, None)`` if the sidecar is missing or unreadable. + + Signature data is only written by the executor when a document + contains signatures in document_insights mode; cache-hit calls + for documents extracted in other modes legitimately have no + sidecar, so absence is not an error. + """ + sidecar_path = PromptStudioHelper._signature_sidecar_path(extract_file_path) + try: + raw = fs_instance.read(path=sidecar_path, mode="r") + except FileNotFoundError: + return None, None + except Exception as e: + logger.warning( + "DOC_INSIGHTS sidecar: failed to read %s: %s", + sidecar_path, + e, + ) + return None, None + try: + data = json.loads(raw) + except (TypeError, ValueError) as e: + logger.warning( + "DOC_INSIGHTS sidecar: failed to parse %s: %s", + sidecar_path, + e, + ) + return None, None + sig_meta = data.get("signature_metadata") or None + sig_refs = data.get("signature_page_references") or None + return sig_meta, sig_refs + @staticmethod def dynamic_extractor( file_path: str, @@ -2340,7 +2449,7 @@ def dynamic_extractor( org_id: str, profile_manager: ProfileManager, document_id: str, - ) -> str: + ) -> ExtractResult: # Guard against None metadata (when adapter_metadata_b is None) metadata = profile_manager.x2text.metadata or {} x2text_config_hash = ToolUtils.hash_str(json.dumps(metadata, 
sort_keys=True)) @@ -2370,7 +2479,15 @@ def dynamic_extractor( try: extracted_text = fs_instance.read(path=extract_file_path, mode="r") logger.info("Extracted text found. Reading from file..") - return extracted_text + sig_meta, sig_refs = PromptStudioHelper._load_signature_sidecar( + extract_file_path=extract_file_path, + fs_instance=fs_instance, + ) + return ExtractResult( + text=extracted_text, + signature_metadata=sig_meta, + signature_page_references=sig_refs, + ) except FileNotFoundError as e: logger.warning( f"File not found for extraction. {extract_file_path}. {e}" @@ -2424,6 +2541,11 @@ def dynamic_extractor( ) extracted_text = result.data.get("extracted_text", "") + signature_metadata = result.data.get("signature_metadata") + signature_page_references = result.data.get("signature_page_references") + PromptStudioHelper._log_signature_capture( + signature_metadata, signature_page_references, document_id + ) success = PromptStudioIndexHelper.mark_extraction_status( document_id=document_id, profile_manager=profile_manager, @@ -2436,7 +2558,11 @@ def dynamic_extractor( f"Extraction completed but status not saved." ) - return extracted_text + return ExtractResult( + text=extracted_text, + signature_metadata=signature_metadata, + signature_page_references=signature_page_references, + ) @staticmethod def export_project_settings(tool: CustomTool) -> dict: diff --git a/frontend/src/components/custom-tools/prompt-card/DisplayPromptResult.jsx b/frontend/src/components/custom-tools/prompt-card/DisplayPromptResult.jsx index f5f9474024..857d96f33a 100644 --- a/frontend/src/components/custom-tools/prompt-card/DisplayPromptResult.jsx +++ b/frontend/src/components/custom-tools/prompt-card/DisplayPromptResult.jsx @@ -415,14 +415,22 @@ const TextResult = ({ const confidence = getConfidenceForText(); - return enableHighlight ? ( + // Make the answer clickable when the tool has highlighting enabled OR + // when the backend produced highlight_data (e.g. 
signature page refs + // from LLMWhisperer's document_insights mode), so signature highlights + // still work without requiring the separate enable_highlight toggle. + const hasHighlightData = + Array.isArray(highlightData) && highlightData.length > 0; + const isClickable = enableHighlight || hasHighlightData; + + return isClickable ? ( onSelectHighlight(highlightData, promptId, profileId, confidence) } className={`prompt-output-result json-value ${ - highlightData ? "clickable" : "" + hasHighlightData ? "clickable" : "" } ${selectedHighlight?.highlightedPrompt === promptId ? "selected" : ""}`} > {parsedOutput} diff --git a/frontend/src/components/custom-tools/prompt-card/PromptCard.jsx b/frontend/src/components/custom-tools/prompt-card/PromptCard.jsx index 29fda260e2..ee1328b3ac 100644 --- a/frontend/src/components/custom-tools/prompt-card/PromptCard.jsx +++ b/frontend/src/components/custom-tools/prompt-card/PromptCard.jsx @@ -254,23 +254,32 @@ const PromptCard = memo( highlightedProfile, confidenceData, ) => { - if (details?.enable_highlight) { - const processedHighlight = - singlePassExtractMode && - typeof highlightData === "object" && - !Array.isArray(highlightData) - ? flattenHighlightData(highlightData) - : highlightData; - - updateCustomTool({ - selectedHighlight: { - highlight: processedHighlight, - highlightedPrompt: highlightedPrompt, - highlightedProfile: highlightedProfile, - confidence: confidenceData, - }, - }); + // Allow highlight state to update when the tool has highlighting + // enabled OR when the backend produced highlight_data (e.g. + // signature page refs from LLMWhisperer's document_insights mode), + // so signature-driven page jumps work without the separate + // enable_highlight toggle. + const hasHighlightData = Array.isArray(highlightData) + ? 
highlightData.length > 0 + : Boolean(highlightData); + if (!details?.enable_highlight && !hasHighlightData) { + return; } + const processedHighlight = + singlePassExtractMode && + typeof highlightData === "object" && + !Array.isArray(highlightData) + ? flattenHighlightData(highlightData) + : highlightData; + + updateCustomTool({ + selectedHighlight: { + highlight: processedHighlight, + highlightedPrompt: highlightedPrompt, + highlightedProfile: highlightedProfile, + confidence: confidenceData, + }, + }); }; const handleTypeChange = (value) => { diff --git a/prompt-service/src/unstract/prompt_service/constants.py b/prompt-service/src/unstract/prompt_service/constants.py index 9eddab8423..58a6b72fcd 100644 --- a/prompt-service/src/unstract/prompt_service/constants.py +++ b/prompt-service/src/unstract/prompt_service/constants.py @@ -84,6 +84,8 @@ class PromptServiceConstants: LINE_ITEM = "line-item" LINE_NUMBERS = "line_numbers" WHISPER_HASH = "whisper_hash" + SIGNATURE_METADATA = "signature_metadata" + SIGNATURE_PAGE_REFERENCES = "signature_page_references" PAID_FEATURE_MSG = ( "It is a cloud / enterprise feature. 
If you have purchased a plan and still " "face this issue, please contact support" diff --git a/prompt-service/src/unstract/prompt_service/controllers/extraction.py b/prompt-service/src/unstract/prompt_service/controllers/extraction.py index 516894f429..588e561491 100644 --- a/prompt-service/src/unstract/prompt_service/controllers/extraction.py +++ b/prompt-service/src/unstract/prompt_service/controllers/extraction.py @@ -36,7 +36,7 @@ def extract() -> Any: tool_exec_metadata = payload.get(IKeys.TOOL_EXECUTION_METATADA, {}) execution_run_data_folder = payload.get(IKeys.EXECUTION_DATA_DIR, "") - extracted_text = ExtractionService.perform_extraction( + extraction_result = ExtractionService.perform_extraction( file_path=file_path, x2text_instance_id=x2text_instance_id, output_file_path=output_file_path, @@ -49,5 +49,13 @@ def extract() -> Any: tool_exec_metadata=tool_exec_metadata, execution_run_data_folder=execution_run_data_folder, ) - response = {IKeys.EXTRACTED_TEXT: extracted_text} + response = { + IKeys.EXTRACTED_TEXT: extraction_result["extracted_text"], + } + signature_metadata = extraction_result.get("signature_metadata") + if signature_metadata: + response["signature_metadata"] = signature_metadata + signature_page_references = extraction_result.get("signature_page_references") + if signature_page_references: + response["signature_page_references"] = signature_page_references return response diff --git a/prompt-service/src/unstract/prompt_service/services/answer_prompt.py b/prompt-service/src/unstract/prompt_service/services/answer_prompt.py index 9f8cbf9c28..62503f1890 100644 --- a/prompt-service/src/unstract/prompt_service/services/answer_prompt.py +++ b/prompt-service/src/unstract/prompt_service/services/answer_prompt.py @@ -1,8 +1,5 @@ -import ipaddress -import socket from logging import Logger from typing import Any -from urllib.parse import urlparse from flask import current_app as app @@ -24,58 +21,12 @@ from unstract.sdk1.file_storage.constants 
import StorageType from unstract.sdk1.file_storage.env_helper import EnvHelper from unstract.sdk1.llm import LLM - - -def _is_safe_public_url(url: str) -> bool: - """Validate webhook URL for SSRF protection. - - Only allows HTTPS and blocks private/loopback/internal addresses. - Resolves all DNS records (A/AAAA) to prevent DNS rebinding attacks. - """ - try: - p = urlparse(url) - if p.scheme not in ("https",): # Only allow HTTPS for security - return False - host = p.hostname or "" - # Block obvious local hosts - if host in ("localhost",): - return False - - addrs: set[str] = set() - # If literal IP, validate directly; else resolve all records (A/AAAA) - try: - ipaddress.ip_address(host) - addrs.add(host) - except ValueError: - try: - for family, _type, _proto, _canonname, sockaddr in socket.getaddrinfo( - host, None, type=socket.SOCK_STREAM - ): - addr = sockaddr[0] - addrs.add(addr) - except Exception: - return False - - if not addrs: - return False - - # Validate all resolved addresses - for addr in addrs: - try: - ip = ipaddress.ip_address(addr) - except ValueError: - return False - if ( - ip.is_private - or ip.is_loopback - or ip.is_link_local - or ip.is_reserved - or ip.is_multicast - ): - return False - return True - except Exception: - return False +from unstract.sdk1.utils.signature_highlights import ( + format_signature_metadata_context, + merge_into_highlight_data, + resolve_signature_highlight_coords, +) +from unstract.sdk1.utils.url_safety import is_safe_public_url class AnswerPromptService: @@ -141,9 +92,10 @@ def construct_and_run_prompt( platform_postamble=platform_postamble, word_confidence_postamble=word_confidence_postamble, prompt_type=prompt_type, + signature_metadata=tool_settings.get(PSKeys.SIGNATURE_METADATA), ) output[PSKeys.COMBINED_PROMPT] = prompt - return AnswerPromptService.run_completion( + answer = AnswerPromptService.run_completion( llm=llm, prompt=prompt, metadata=metadata, @@ -154,6 +106,51 @@ def construct_and_run_prompt( 
file_path=file_path, execution_source=execution_source, ) + AnswerPromptService._attach_signature_highlights( + answer=answer, + signature_metadata=tool_settings.get(PSKeys.SIGNATURE_METADATA), + signature_page_references=tool_settings.get(PSKeys.SIGNATURE_PAGE_REFERENCES), + metadata=metadata, + prompt_key=output[PSKeys.NAME], + ) + return answer + + @staticmethod + def _attach_signature_highlights( + answer: str, + signature_metadata: dict[str, list[Any]] | None, + signature_page_references: dict[str, Any] | None, + metadata: dict[str, Any] | None, + prompt_key: str | None, + ) -> None: + """Attach signature page highlights to ``metadata`` when the LLM + answer references a known signer or signatures generally. + + Delegates the matching logic to + ``unstract.sdk1.utils.signature_highlights`` so workers and + prompt-service stay in sync. + """ + if metadata is None or not prompt_key: + return + new_coords = resolve_signature_highlight_coords( + answer=answer, + signature_metadata=signature_metadata, + signature_page_references=signature_page_references, + ) + if not new_coords: + return + merge_into_highlight_data( + metadata=metadata, + prompt_key=prompt_key, + new_coords=new_coords, + highlight_data_key=PSKeys.HIGHLIGHT_DATA, + ) + app.logger.info( + "DOC_INSIGHTS attach_signature_highlights: prompt=%s, added %d " + "signature highlight(s)", + prompt_key, + len(new_coords), + ) @staticmethod def construct_prompt( @@ -165,6 +162,7 @@ def construct_prompt( platform_postamble: str, word_confidence_postamble: str, prompt_type: str = PSKeys.TEXT, + signature_metadata: dict[str, list[Any]] | None = None, ) -> str: prompt = f"{preamble}\n\nQuestion or Instruction: {prompt}" if grammar_list is not None and len(grammar_list) > 0: @@ -190,8 +188,22 @@ def construct_prompt( platform_postamble += "\n\n" if word_confidence_postamble: platform_postamble += f"{word_confidence_postamble}\n\n" + # Append signature metadata to context if present + signature_context = "" + if 
signature_metadata: + app.logger.info( + "DOC_INSIGHTS construct_prompt: injecting signature context " + "for %d page(s)", + len(signature_metadata), + ) + signature_context = format_signature_metadata_context(signature_metadata) + app.logger.debug( + "DOC_INSIGHTS construct_prompt: signature_context=%s", + signature_context[:200] if signature_context else "empty", + ) prompt += ( - f"\n\n{postamble}\n\nContext:\n---------------\n{context}\n" + f"\n\n{postamble}\n\nContext:\n---------------\n{context}" + f"{signature_context}\n" f"-----------------\n\n{platform_postamble}Answer:" ) return prompt @@ -432,7 +444,7 @@ def handle_json( app.logger.warning( "Postprocessing webhook enabled but URL missing; skipping." ) - elif not _is_safe_public_url(webhook_url): + elif not is_safe_public_url(webhook_url): app.logger.warning( "Postprocessing webhook URL is not allowed; skipping." ) diff --git a/prompt-service/src/unstract/prompt_service/services/extraction.py b/prompt-service/src/unstract/prompt_service/services/extraction.py index 76430657f9..f431a33648 100644 --- a/prompt-service/src/unstract/prompt_service/services/extraction.py +++ b/prompt-service/src/unstract/prompt_service/services/extraction.py @@ -1,3 +1,4 @@ +import logging from pathlib import Path from typing import Any @@ -14,6 +15,8 @@ from unstract.sdk1.utils.tool import ToolUtils from unstract.sdk1.x2txt import TextExtractionResult, X2Text +logger = logging.getLogger(__name__) + class ExtractionService: @staticmethod @@ -30,7 +33,7 @@ def perform_extraction( execution_source: str | None = None, tool_exec_metadata: dict[str, Any] | None = None, execution_run_data_folder: str | None = None, - ) -> str: + ) -> dict[str, Any]: extracted_text = "" util = PromptServiceBaseTool(platform_key=platform_key) x2text = X2Text( @@ -64,7 +67,37 @@ def perform_extraction( fs=fs, ) extracted_text = process_response.extracted_text - return extracted_text + # Extract signature metadata if present + signature_metadata = None + 
signature_page_references = None + if ( + process_response.extraction_metadata + and process_response.extraction_metadata.signature_metadata + ): + signature_metadata = ( + process_response.extraction_metadata.signature_metadata + ) + logger.info( + "DOC_INSIGHTS extraction: signature_metadata found for pages: %s", + list(signature_metadata.keys()), + ) + if ( + process_response.extraction_metadata + and process_response.extraction_metadata.signature_page_references + ): + signature_page_references = ( + process_response.extraction_metadata.signature_page_references + ) + logger.info( + "DOC_INSIGHTS extraction: signature_page_references " + "found for pages: %s", + list(signature_page_references.keys()), + ) + return { + "extracted_text": extracted_text, + "signature_metadata": signature_metadata, + "signature_page_references": signature_page_references, + } except AdapterError as e: msg = f"Error from text extractor '{x2text.x2text_instance.get_name()}'. " msg += str(e) diff --git a/unstract/sdk1/src/unstract/sdk1/adapters/x2text/dto.py b/unstract/sdk1/src/unstract/sdk1/adapters/x2text/dto.py index 95c60bbe8c..4a0885a01f 100644 --- a/unstract/sdk1/src/unstract/sdk1/adapters/x2text/dto.py +++ b/unstract/sdk1/src/unstract/sdk1/adapters/x2text/dto.py @@ -6,6 +6,8 @@ class TextExtractionMetadata: whisper_hash: str line_metadata: dict[Any, Any] | None = None + signature_metadata: dict[str, list[Any]] | None = None + signature_page_references: dict[str, Any] | None = None @dataclass diff --git a/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/constants.py b/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/constants.py index 090a3bf6f4..722e1a2f3f 100644 --- a/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/constants.py +++ b/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/constants.py @@ -7,6 +7,8 @@ class Modes(Enum): LOW_COST = "low_cost" HIGH_QUALITY = "high_quality" FORM = "form" + 
TABLE = "table" + DOCUMENT_INSIGHTS = "document_insights" class OutputModes(Enum): diff --git a/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py b/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py index 3a48a57647..066c8a4a57 100644 --- a/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py +++ b/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/llm_whisperer_v2.py @@ -10,6 +10,8 @@ TextExtractionResult, ) from unstract.sdk1.adapters.x2text.llm_whisperer_v2.src.constants import ( + Modes, + WhispererConfig, WhispererEndpoint, ) from unstract.sdk1.adapters.x2text.llm_whisperer_v2.src.dto import ( @@ -54,6 +56,101 @@ def get_description() -> str: def get_icon() -> str: return "/icons/adapter-icons/LLMWhispererV2.png" + @staticmethod + def _index_first_content_line_per_page( + line_metadata: list[list[int]], + ) -> dict[int, int]: + """Map each page to its first content-line index in ``line_metadata``. + + Marker/empty rows like ``[0, 0, 0, 3168]`` or ``[1, 0, 0, 0]`` are + skipped because they have zero height or zero page_height and + produce an invisible overlay (and divide-by-zero in the frontend's + percentage calculation). 
+ """ + page_first_line: dict[int, int] = {} + for idx, entry in enumerate(line_metadata): + if not isinstance(entry, list) or len(entry) < 4: + continue + page, _y, height, page_height = entry[0], entry[1], entry[2], entry[3] + if height <= 0 or page_height <= 0: + continue + if page not in page_first_line: + page_first_line[page] = idx + return page_first_line + + @staticmethod + def _build_page_reference_entry( + line_index: int, + signatures: list[Any], + line_metadata: list[list[int]], + ) -> dict[str, Any]: + """Build a single ``signature_page_references`` entry for one page.""" + coords_entry = line_metadata[line_index] + coords = ( + list(coords_entry[:4]) + if isinstance(coords_entry, list) and len(coords_entry) >= 4 + else None + ) + return { + "hex": f"0x{line_index + 1:02X}", # 1-indexed hex + "line_metadata_index": line_index, + "signers": [ + sig.get("name", "Unknown") for sig in signatures if isinstance(sig, dict) + ], + "coords": coords, + } + + @staticmethod + def _build_signature_page_references( + signature_metadata: dict[str, list[Any]], + line_metadata: list[list[int]], + ) -> dict[str, Any] | None: + """Build page references for frontend navigation to signature pages. + + For each page that has signatures, finds the first **content** + line in ``line_metadata`` (skipping zero-height marker rows) and + emits its 1-indexed hex value plus resolved coords. The frontend + uses ``coords`` directly in its highlight overlay; the workers + executor caches the result in a sidecar JSON next to the + extracted text file so cached extracts retain it. + + Args: + signature_metadata: Dict keyed by page number (str, 0-indexed) + with lists of signature entries. + line_metadata: List of [page, y_pos, height, page_height] arrays. + + Returns: + Dict mapping page number to ``{hex, line_metadata_index, + signers, coords}``, or None if no references could be built. 
+ """ + if not line_metadata: + logger.warning( + "DOC_INSIGHTS: no line_metadata available, " + "cannot build page references" + ) + return None + + page_first_line = LLMWhispererV2._index_first_content_line_per_page(line_metadata) + logger.debug("DOC_INSIGHTS: page_first_line map: %s", page_first_line) + + references: dict[str, Any] = {} + for page_str, signatures in signature_metadata.items(): + if not signatures: + continue + page_num = int(page_str) + if page_num not in page_first_line: + logger.warning( + "DOC_INSIGHTS: page %d not found in line_metadata", page_num + ) + continue + references[page_str] = LLMWhispererV2._build_page_reference_entry( + line_index=page_first_line[page_num], + signatures=signatures, + line_metadata=line_metadata, + ) + + return references if references else None + def test_connection(self) -> bool: LLMWhispererHelper.test_connection_request( config=self.config, @@ -96,9 +193,61 @@ def process( fs=fs, extra_params=extra_params, ) + # Extract signature_metadata when using document_insights mode + signature_metadata = None + mode = self.config.get(WhispererConfig.MODE, Modes.FORM.value) + logger.info( + "DOC_INSIGHTS: mode=%s, is_document_insights=%s", + mode, + mode == Modes.DOCUMENT_INSIGHTS.value, + ) + if mode == Modes.DOCUMENT_INSIGHTS.value: + response_metadata = response.get("metadata", {}) + logger.info( + "DOC_INSIGHTS: response has metadata keys: %s", + list(response_metadata.keys()) if response_metadata else "None", + ) + signature_metadata = {} + for page_num, page_data in response_metadata.items(): + if isinstance(page_data, dict) and "signature_metadata" in page_data: + signature_metadata[page_num] = page_data["signature_metadata"] + logger.info( + "DOC_INSIGHTS: page %s has %d signature(s): %s", + page_num, + len(page_data["signature_metadata"]), + [s.get("name") for s in page_data["signature_metadata"]], + ) + if not any(signature_metadata.values()): + logger.info("DOC_INSIGHTS: no signatures found across any page") + 
signature_metadata = None + else: + logger.info( + "DOC_INSIGHTS: signature_metadata extracted for pages: %s", + list(signature_metadata.keys()), + ) + + # Compute signature page references for frontend navigation + signature_page_references = None + if signature_metadata: + raw_line_metadata = response.get("line_metadata", []) + logger.info( + "DOC_INSIGHTS: line_metadata has %d entries, " + "computing page references", + len(raw_line_metadata), + ) + signature_page_references = LLMWhispererV2._build_signature_page_references( + signature_metadata, raw_line_metadata + ) + logger.info( + "DOC_INSIGHTS: signature_page_references=%s", + signature_page_references, + ) + metadata = TextExtractionMetadata( whisper_hash=response.get(X2TextConstants.WHISPER_HASH_V2, ""), line_metadata=response.get("line_metadata"), + signature_metadata=signature_metadata, + signature_page_references=signature_page_references, ) return TextExtractionResult( diff --git a/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/static/json_schema.json b/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/static/json_schema.json index 1215ede56a..00da534c37 100644 --- a/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/static/json_schema.json +++ b/unstract/sdk1/src/unstract/sdk1/adapters/x2text/llm_whisperer_v2/src/static/json_schema.json @@ -34,7 +34,8 @@ "low_cost", "high_quality", "form", - "table" + "table", + "document_insights" ], "default": "form", "description": "Processing mode to use, described in the [LLMWhisperer documentation](https://docs.unstract.com/llmwhisperer/llm_whisperer/apis/llm_whisperer_text_extraction_api/#modes)." 
diff --git a/unstract/sdk1/src/unstract/sdk1/utils/signature_highlights.py b/unstract/sdk1/src/unstract/sdk1/utils/signature_highlights.py new file mode 100644 index 0000000000..cf29292ac5 --- /dev/null +++ b/unstract/sdk1/src/unstract/sdk1/utils/signature_highlights.py @@ -0,0 +1,196 @@ +"""Shared helpers for surfacing LLMWhisperer signature page highlights. + +The workers executor and the prompt-service answer-prompt service both +need to post-process LLM answers against the signature metadata that +LLMWhisperer V2's ``document_insights`` mode produces. This module owns +the matching logic so both services stay in lock-step without copy-paste +drift. +""" + +from __future__ import annotations + +import re +from typing import Any + +# Generic signature-related terms used as a fallback trigger when the +# LLM answer doesn't mention any specific signer name but does talk +# about signing in general (e.g. "Is this signed?" → "Yes, the document +# is signed."). Matched as case-insensitive substrings. +SIGNATURE_KEYWORDS: tuple[str, ...] = ( + "signature", + "signed", + "signatory", + "signatories", + "signing", + "executed", +) + + +def _build_page_coords( + signature_page_references: dict[str, Any], +) -> dict[str, list[int]]: + """Pick the resolved coords array per signature page. + + Entries without a four-element ``coords`` list are skipped. + """ + page_coords: dict[str, list[int]] = {} + for page_str, ref in signature_page_references.items(): + if not isinstance(ref, dict): + continue + coords = ref.get("coords") + if isinstance(coords, list) and len(coords) >= 4: + page_coords[page_str] = list(coords[:4]) + return page_coords + + +def _any_signer_matches(signatures: list[Any], answer: str) -> bool: + """Return True if any signer name in ``signatures`` appears in ``answer``. 
+ + Each name is matched as a whole token/phrase (case-insensitive, + word-boundary anchored) to avoid signer initials like ``"P S"`` + matching the gap between ``"Pradeep"`` and ``"Surukanti"`` inside + ``"Pradeep Surukanti"``. + """ + for sig in signatures: + if not isinstance(sig, dict): + continue + name = (sig.get("name") or "").strip() + if not name: + continue + pattern = re.compile(r"\b" + re.escape(name) + r"\b", re.IGNORECASE) + if pattern.search(answer): + return True + return False + + +def _find_pages_matching_signers( + answer: str, + signature_metadata: dict[str, list[Any]], + eligible_pages: set[str], +) -> list[str]: + """Return the pages whose signer names appear in ``answer``.""" + return [ + page_str + for page_str, signatures in signature_metadata.items() + if page_str in eligible_pages + and signatures + and _any_signer_matches(signatures, answer) + ] + + +def _dedupe_coords( + matched_pages: list[str], + page_coords: dict[str, list[int]], +) -> list[list[int]]: + """Map matched pages to their coords, preserving order and dropping dups.""" + seen: set[tuple[int, ...]] = set() + new_coords: list[list[int]] = [] + for page_str in matched_pages: + coords = page_coords[page_str] + key = tuple(coords) + if key in seen: + continue + seen.add(key) + new_coords.append(coords) + return new_coords + + +def resolve_signature_highlight_coords( + answer: str, + signature_metadata: dict[str, list[Any]] | None, + signature_page_references: dict[str, Any] | None, +) -> list[list[int]]: + """Return the page coords that the LLM answer should highlight. + + Matching rules: + + - For each signer name in ``signature_metadata`` that appears as a + whole word/phrase (case-insensitive) inside ``answer``, the + corresponding page's coords are included. + - When no signer name matches but the answer mentions a generic + signature keyword (``signature``, ``signed``, ``signatory``, + ``signing``, ``executed``), every signature page's coords are + included as a fallback. 
+ - Returns an empty list when there's nothing to attach. + + Returned coords are de-duplicated by content while preserving order. + """ + if not signature_page_references or not signature_metadata: + return [] + if not isinstance(answer, str) or not answer.strip(): + return [] + + page_coords = _build_page_coords(signature_page_references) + if not page_coords: + return [] + + matched_pages = _find_pages_matching_signers( + answer=answer, + signature_metadata=signature_metadata, + eligible_pages=set(page_coords.keys()), + ) + + if not matched_pages and any(kw in answer.lower() for kw in SIGNATURE_KEYWORDS): + matched_pages = list(page_coords.keys()) + + if not matched_pages: + return [] + + return _dedupe_coords(matched_pages, page_coords) + + +def format_signature_metadata_context( + signature_metadata: dict[str, list[Any]], +) -> str: + """Format ``signature_metadata`` as a human-readable LLM context block. + + Returns an empty string when no signatures are present. Page numbers + are converted from 0-indexed to 1-indexed for display. + """ + lines: list[str] = [] + for page_num, signatures in sorted( + signature_metadata.items(), key=lambda x: int(x[0]) + ): + if not signatures: + continue + for sig in signatures: + name = sig.get("name", "Unknown") + sig_type = sig.get("type", "signature") + desc = sig.get("desc", "") + page_display = int(page_num) + 1 # 0-indexed → 1-indexed + entry = f"- Page {page_display}: {name} ({sig_type})" + if desc: + entry += f" — {desc}" + lines.append(entry) + if not lines: + return "" + header = ( + "\n\n[Document Signature Information]\n" + "The following signatures were detected in this document. 
" + "Use this information to answer any questions about signatories, " + "signing parties, or document execution status.\n" + ) + return header + "\n".join(lines) + + +def merge_into_highlight_data( + metadata: dict[str, Any], + prompt_key: str, + new_coords: list[list[int]], + highlight_data_key: str = "highlight_data", +) -> None: + """Append signature coords to ``metadata[highlight_data_key][prompt_key]``. + + Skips duplicates against existing entries (e.g. those populated by + the hex-comment highlight pipeline). Mutates ``metadata`` in place. + """ + if not new_coords: + return + bucket = metadata.setdefault(highlight_data_key, {}) + existing = bucket.get(prompt_key) + if not isinstance(existing, list): + existing = [] + for coords in new_coords: + if coords not in existing: + existing.append(coords) + bucket[prompt_key] = existing diff --git a/unstract/sdk1/src/unstract/sdk1/utils/url_safety.py b/unstract/sdk1/src/unstract/sdk1/utils/url_safety.py new file mode 100644 index 0000000000..40257399cf --- /dev/null +++ b/unstract/sdk1/src/unstract/sdk1/utils/url_safety.py @@ -0,0 +1,68 @@ +"""URL safety helpers (SSRF protection). + +Shared between the workers executor and the prompt-service answer-prompt +service because both need to validate webhook URLs before issuing +postprocessing callbacks. +""" + +from __future__ import annotations + +import ipaddress +import socket +from urllib.parse import urlparse + + +def _resolve_host_addresses(host: str) -> set[str]: + """Resolve a hostname or IP string to a set of IP address strings.""" + try: + ipaddress.ip_address(host) + return {host} + except ValueError: + pass + try: + return { + sockaddr[0] + for _family, _type, _proto, _canonname, sockaddr in socket.getaddrinfo( + host, None, type=socket.SOCK_STREAM + ) + } + except Exception: + return set() + + +def is_safe_public_url(url: str) -> bool: + """Validate a URL for use as an outbound webhook target (SSRF protection). 
+ + Only HTTPS URLs are allowed, and the resolved host must not point to + a private, loopback, link-local, reserved, or multicast address. + All DNS records (A/AAAA) are resolved to prevent DNS rebinding + attacks. + """ + try: + p = urlparse(url) + if p.scheme not in ("https",): # only HTTPS + return False + host = p.hostname or "" + if host == "localhost": + return False + + addrs = _resolve_host_addresses(host) + if not addrs: + return False + + for addr in addrs: + try: + ip = ipaddress.ip_address(addr) + except ValueError: + return False + if ( + ip.is_private + or ip.is_loopback + or ip.is_link_local + or ip.is_reserved + or ip.is_multicast + ): + return False + return True + except Exception: + return False diff --git a/workers/executor/executors/answer_prompt.py b/workers/executor/executors/answer_prompt.py index d1eef5b3be..510bfbbc9b 100644 --- a/workers/executor/executors/answer_prompt.py +++ b/workers/executor/executors/answer_prompt.py @@ -10,70 +10,21 @@ are integrated at the caller level (LegacyExecutor). """ -import ipaddress import logging import os -import socket from typing import Any -from urllib.parse import urlparse from executor.executors.constants import PromptServiceConstants as PSKeys from executor.executors.exceptions import LegacyExecutorError, RateLimitError -logger = logging.getLogger(__name__) - - -def _resolve_host_addresses(host: str) -> set[str]: - """Resolve a hostname or IP string to a set of IP address strings.""" - try: - ipaddress.ip_address(host) - return {host} - except ValueError: - pass - try: - return { - sockaddr[0] - for _family, _type, _proto, _canonname, sockaddr in socket.getaddrinfo( - host, None, type=socket.SOCK_STREAM - ) - } - except Exception: - return set() - - -def _is_safe_public_url(url: str) -> bool: - """Validate webhook URL for SSRF protection. - - Only allows HTTPS and blocks private/loopback/internal addresses. 
- """ - try: - p = urlparse(url) - if p.scheme not in ("https",): - return False - host = p.hostname or "" - if host in ("localhost",): - return False +from unstract.sdk1.utils.signature_highlights import ( + format_signature_metadata_context, + merge_into_highlight_data, + resolve_signature_highlight_coords, +) +from unstract.sdk1.utils.url_safety import is_safe_public_url - addrs = _resolve_host_addresses(host) - if not addrs: - return False - - for addr in addrs: - try: - ip = ipaddress.ip_address(addr) - except ValueError: - return False - if ( - ip.is_private - or ip.is_loopback - or ip.is_link_local - or ip.is_reserved - or ip.is_multicast - ): - return False - return True - except Exception: - return False +logger = logging.getLogger(__name__) class AnswerPromptService: @@ -157,9 +108,10 @@ def construct_and_run_prompt( platform_postamble=platform_postamble, word_confidence_postamble=word_confidence_postamble, prompt_type=prompt_type, + signature_metadata=tool_settings.get(PSKeys.SIGNATURE_METADATA), ) output[PSKeys.COMBINED_PROMPT] = prompt - return AnswerPromptService.run_completion( + answer = AnswerPromptService.run_completion( llm=llm, prompt=prompt, metadata=metadata, @@ -171,6 +123,14 @@ def construct_and_run_prompt( execution_source=execution_source, process_text=process_text, ) + AnswerPromptService._attach_signature_highlights( + answer=answer, + signature_metadata=tool_settings.get(PSKeys.SIGNATURE_METADATA), + signature_page_references=tool_settings.get(PSKeys.SIGNATURE_PAGE_REFERENCES), + metadata=metadata, + prompt_key=output[PSKeys.NAME], + ) + return answer @staticmethod def _build_grammar_notes(grammar_list: list[dict[str, Any]]) -> str: @@ -189,6 +149,43 @@ def _build_grammar_notes(grammar_list: list[dict[str, Any]]) -> str: ) return notes + @staticmethod + def _attach_signature_highlights( + answer: str, + signature_metadata: dict[str, list[Any]] | None, + signature_page_references: dict[str, Any] | None, + metadata: dict[str, Any] | None, 
+ prompt_key: str | None, + ) -> None: + """Attach signature page highlights to ``metadata`` when the LLM + answer references a known signer or signatures generally. + + Delegates the matching logic to + ``unstract.sdk1.utils.signature_highlights`` so workers and + prompt-service stay in sync. + """ + if metadata is None or not prompt_key: + return + new_coords = resolve_signature_highlight_coords( + answer=answer, + signature_metadata=signature_metadata, + signature_page_references=signature_page_references, + ) + if not new_coords: + return + merge_into_highlight_data( + metadata=metadata, + prompt_key=prompt_key, + new_coords=new_coords, + highlight_data_key=PSKeys.HIGHLIGHT_DATA, + ) + logger.info( + "DOC_INSIGHTS attach_signature_highlights: prompt=%s, added %d " + "signature highlight(s)", + prompt_key, + len(new_coords), + ) + @staticmethod def construct_prompt( preamble: str, @@ -199,6 +196,7 @@ def construct_prompt( platform_postamble: str, word_confidence_postamble: str, prompt_type: str = "text", + signature_metadata: dict[str, list[Any]] | None = None, ) -> str: """Build the full prompt string with preamble, grammar, postamble, context.""" prompt = f"{preamble}\n\nQuestion or Instruction: {prompt}" @@ -212,8 +210,22 @@ def construct_prompt( platform_postamble += "\n\n" if word_confidence_postamble: platform_postamble += f"{word_confidence_postamble}\n\n" + # Append signature metadata to context if present + signature_context = "" + if signature_metadata: + logger.info( + "DOC_INSIGHTS construct_prompt: injecting signature context " + "for %d page(s)", + len(signature_metadata), + ) + signature_context = format_signature_metadata_context(signature_metadata) + logger.debug( + "DOC_INSIGHTS construct_prompt: signature_context=%s", + signature_context[:200] if signature_context else "empty", + ) prompt += ( - f"\n\n{postamble}\n\nContext:\n---------------\n{context}\n" + f"\n\n{postamble}\n\nContext:\n---------------\n{context}" + f"{signature_context}\n" 
f"-----------------\n\n{platform_postamble}Answer:" ) return prompt @@ -292,7 +304,7 @@ def _run_webhook_postprocess( if not webhook_url: logger.warning("Postprocessing webhook enabled but URL missing; skipping.") return parsed_data, None - if not _is_safe_public_url(webhook_url): + if not is_safe_public_url(webhook_url): logger.warning("Postprocessing webhook URL is not allowed; skipping.") return parsed_data, None try: diff --git a/workers/executor/executors/constants.py b/workers/executor/executors/constants.py index 9eddab8423..58a6b72fcd 100644 --- a/workers/executor/executors/constants.py +++ b/workers/executor/executors/constants.py @@ -84,6 +84,8 @@ class PromptServiceConstants: LINE_ITEM = "line-item" LINE_NUMBERS = "line_numbers" WHISPER_HASH = "whisper_hash" + SIGNATURE_METADATA = "signature_metadata" + SIGNATURE_PAGE_REFERENCES = "signature_page_references" PAID_FEATURE_MSG = ( "It is a cloud / enterprise feature. If you have purchased a plan and still " "face this issue, please contact support" diff --git a/workers/executor/executors/legacy_executor.py b/workers/executor/executors/legacy_executor.py index cfd98cceca..3e571b66ec 100644 --- a/workers/executor/executors/legacy_executor.py +++ b/workers/executor/executors/legacy_executor.py @@ -305,6 +305,13 @@ def _handle_extract(self, context: ExecutionContext) -> ExecutionResult: result_data["highlight_metadata"] = ( process_response.extraction_metadata.line_metadata ) + # Include signature metadata when available (document_insights mode) + self._capture_signature_data( + fs=fs, + output_file_path=output_file_path, + process_response=process_response, + result_data=result_data, + ) return ExecutionResult( success=True, data=result_data, @@ -320,6 +327,91 @@ def _handle_extract(self, context: ExecutionContext) -> ExecutionResult: msg = f"Error from text extractor '{name}'. 
{e}" raise ExtractionError(message=msg) from e + def _capture_signature_data( + self, + fs: Any, + output_file_path: str | None, + process_response: TextExtractionResult, + result_data: dict[str, Any], + ) -> None: + """Move document_insights signature fields onto the result dict and + persist them in a sidecar JSON next to the extracted text file. + + No-op when the adapter did not produce signature data (e.g. + non-LLMWhisperer-V2 adapters or modes other than ``document_insights``). + """ + extraction_metadata = process_response.extraction_metadata + if not extraction_metadata: + return + signature_metadata = extraction_metadata.signature_metadata + signature_page_references = extraction_metadata.signature_page_references + if signature_metadata: + result_data["signature_metadata"] = signature_metadata + logger.info( + "DOC_INSIGHTS _handle_extract: signature_metadata found for pages: %s", + list(signature_metadata.keys()), + ) + if signature_page_references: + result_data["signature_page_references"] = signature_page_references + logger.info( + "DOC_INSIGHTS _handle_extract: signature_page_references=%s", + signature_page_references, + ) + self._write_signature_sidecar( + fs=fs, + output_file_path=output_file_path, + signature_metadata=signature_metadata, + signature_page_references=signature_page_references, + ) + + @staticmethod + def _signature_sidecar_path(output_file_path: str) -> str: + """Sidecar JSON for document_insights signature data. + + Lives next to the extracted ``.txt`` file so cache hits in + Prompt Studio can recover signature data without re-extracting. + """ + p = Path(output_file_path) + return str(p.with_suffix("")) + ".doc_insights.json" + + @staticmethod + def _write_signature_sidecar( + fs: Any, + output_file_path: str | None, + signature_metadata: dict[str, Any] | None, + signature_page_references: dict[str, Any] | None, + ) -> None: + """Persist signature data alongside the extracted-text file. 
+ + Skipped if there's no signature data or no output path (e.g., + when running without disk output). + """ + if not output_file_path: + return + if not signature_metadata and not signature_page_references: + return + sidecar_path = LegacyExecutor._signature_sidecar_path(output_file_path) + payload = { + "signature_metadata": signature_metadata or {}, + "signature_page_references": signature_page_references or {}, + } + try: + ToolUtils.dump_json( + file_to_dump=sidecar_path, + json_to_dump=payload, + fs=fs, + ) + logger.info( + "DOC_INSIGHTS sidecar: wrote signature data to %s", + sidecar_path, + ) + except Exception as e: + logger.warning( + "DOC_INSIGHTS sidecar: failed to write %s: %s", + sidecar_path, + e, + ) + @staticmethod def _update_exec_metadata( fs: Any, @@ -641,6 +733,30 @@ def _failure(child_result: ExecutionResult) -> ExecutionResult: _absorb(extract_result) extracted_text = extract_result.data.get(IKeys.EXTRACTED_TEXT, "") + # Pass signature data captured by document_insights mode to + # the answer phase via tool_settings. 
class TestAttachSignatureHighlights:
    """Tests for the signature-highlight post-processor."""

    @staticmethod
    def _fixture_signatures():
        """Build a minimal signature fixture set covering two pages."""
        signature_metadata = {
            "0": [
                {"name": "Mr Dagan", "type": "signature", "desc": ""},
                {"name": "Carmela Avner", "type": "signature", "desc": ""},
            ],
            "1": [
                {"name": "Eve Other", "type": "signature", "desc": ""},
            ],
        }
        signature_page_references = {
            "0": {
                "hex": "0x10",
                "line_metadata_index": 15,
                "signers": ["Mr Dagan", "Carmela Avner"],
                "coords": [0, 320, 31, 3168],
            },
            "1": {
                "hex": "0x20",
                "line_metadata_index": 31,
                "signers": ["Eve Other"],
                "coords": [1, 100, 40, 3168],
            },
        }
        return signature_metadata, signature_page_references

    @staticmethod
    def _attach(answer, signature_metadata, signature_page_references, metadata, prompt_key):
        """Call the post-processor under test (imported lazily)."""
        from executor.executors.answer_prompt import AnswerPromptService

        AnswerPromptService._attach_signature_highlights(
            answer=answer,
            signature_metadata=signature_metadata,
            signature_page_references=signature_page_references,
            metadata=metadata,
            prompt_key=prompt_key,
        )

    def test_name_match_attaches_only_matched_page(self):
        sig_meta, sig_refs = self._fixture_signatures()
        metadata = {}
        self._attach(
            "The document was signed by Mr Dagan on Jan 1.",
            sig_meta,
            sig_refs,
            metadata,
            "signer",
        )
        # Only page 0's coords (Mr Dagan) should be attached.
        assert metadata["highlight_data"]["signer"] == [[0, 320, 31, 3168]]

    def test_case_insensitive_name_match(self):
        """Names match regardless of case (as whole phrases, not substrings)."""
        sig_meta, sig_refs = self._fixture_signatures()
        metadata = {}
        self._attach(
            "signed by mr dagan, with sign-off from carmela avner.",
            sig_meta,
            sig_refs,
            metadata,
            "signers",
        )
        # Both names matched but both are on page 0 → single coord, deduped.
        assert metadata["highlight_data"]["signers"] == [[0, 320, 31, 3168]]

    def test_multi_page_names_attach_distinct_coords(self):
        sig_meta, sig_refs = self._fixture_signatures()
        metadata = {}
        self._attach(
            "Signed by Mr Dagan and Eve Other.",
            sig_meta,
            sig_refs,
            metadata,
            "signers",
        )
        # Page 0 and page 1 coords both attached.
        coords = metadata["highlight_data"]["signers"]
        assert [0, 320, 31, 3168] in coords
        assert [1, 100, 40, 3168] in coords
        assert len(coords) == 2

    def test_keyword_fallback_attaches_all_signature_pages(self):
        """Generic signature mention with no name match → all pages."""
        sig_meta, sig_refs = self._fixture_signatures()
        metadata = {}
        self._attach(
            "Yes, the document is signed.",
            sig_meta,
            sig_refs,
            metadata,
            "is_signed",
        )
        coords = metadata["highlight_data"]["is_signed"]
        assert [0, 320, 31, 3168] in coords
        assert [1, 100, 40, 3168] in coords
        assert len(coords) == 2

    def test_no_match_no_keyword_no_op(self):
        """Answer with neither name match nor keyword → no highlights added."""
        sig_meta, sig_refs = self._fixture_signatures()
        metadata = {}
        self._attach("The total amount is $42.", sig_meta, sig_refs, metadata, "total")
        assert "highlight_data" not in metadata

    def test_preserves_existing_highlight_entries(self):
        """Coords already in metadata[HIGHLIGHT_DATA][key] are kept; no dups."""
        sig_meta, sig_refs = self._fixture_signatures()
        metadata = {
            "highlight_data": {
                "signer": [
                    [9, 9, 9, 9],  # pre-existing, unrelated highlight
                    [0, 320, 31, 3168],  # would duplicate the page-0 sig
                ]
            }
        }
        self._attach("Signed by Mr Dagan.", sig_meta, sig_refs, metadata, "signer")
        # Pre-existing entries preserved, page-0 coord not duplicated.
        assert metadata["highlight_data"]["signer"] == [
            [9, 9, 9, 9],
            [0, 320, 31, 3168],
        ]

    def test_short_initials_do_not_falsely_match_across_words(self):
        """Regression: signer "P S" must not match across "Pradeep Surukanti".

        Pure substring matching incorrectly fired because "p s" appears
        between "Pradee[p s]urukanti". Whole-phrase matching prevents
        the false positive.
        """
        signature_metadata = {
            "0": [
                {"name": "P S", "type": "signature"},
                {"name": "H S", "type": "signature"},
            ],
            "1": [
                {"name": "Pradeep Surukanti", "type": "signature"},
            ],
        }
        signature_page_references = {
            "0": {"coords": [0, 100, 30, 3168]},
            "1": {"coords": [1, 200, 30, 3168]},
        }
        metadata = {}
        self._attach(
            "Pradeep Surukanti",
            signature_metadata,
            signature_page_references,
            metadata,
            "signer",
        )
        # Only the actual signer's page should be attached, not page 0.
        assert metadata["highlight_data"]["signer"] == [[1, 200, 30, 3168]]

    def test_missing_inputs_no_op(self):
        """No-op when signature data or metadata pieces are missing."""
        metadata = {}
        # No signature_metadata
        self._attach(
            "signed by Mr Dagan",
            None,
            {"0": {"coords": [0, 0, 0, 0]}},
            metadata,
            "k",
        )
        assert metadata == {}
        # No signature_page_references
        self._attach(
            "signed by Mr Dagan",
            {"0": [{"name": "Mr Dagan"}]},
            None,
            metadata,
            "k",
        )
        assert metadata == {}
        # Empty answer, then None answer
        sig_meta, sig_refs = self._fixture_signatures()
        for blank_answer in ("", None):
            self._attach(blank_answer, sig_meta, sig_refs, metadata, "k")
            assert metadata == {}