7 changes: 7 additions & 0 deletions .gitignore
@@ -73,3 +73,10 @@ pipeline_ui/backend/uploads/*
# Frontend (Node.js)
pipeline_ui/frontend/node_modules/*
pipeline_ui/frontend/package-lock.json


I_origin_0/*
I_origin_1/*
I_origin_2/*

output/*
68 changes: 53 additions & 15 deletions generate_synthetic_table/flow.py
Expand Up @@ -99,23 +99,47 @@ def _call_llm(
if return_token_usage:
# Extract token usage from response metadata
token_usage = 0
if hasattr(response, 'response_metadata'):
usage_metadata = response.response_metadata.get('usage', {})
# OpenAI/Gemini format

logger.info(f"=== TOKEN DEBUG START ===")
logger.info(f"Response type: {type(response)}")
logger.info(f"Has usage_metadata: {hasattr(response, 'usage_metadata')}")
logger.info(f"Has response_metadata: {hasattr(response, 'response_metadata')}")

# Try response.usage_metadata first (Gemini pool format)
if hasattr(response, 'usage_metadata') and response.usage_metadata:
usage = response.usage_metadata
logger.info(f"usage_metadata type: {type(usage)}")
logger.info(f"usage_metadata value: {usage}")

if isinstance(usage, dict):
token_usage = usage.get('total_tokens', 0)
if not token_usage:
token_usage = usage.get('input_tokens', 0) + usage.get('output_tokens', 0)
else:
token_usage = getattr(usage, 'total_tokens', 0)
if not token_usage:
token_usage = getattr(usage, 'input_tokens', 0) + getattr(usage, 'output_tokens', 0)

logger.info(f"Extracted token_usage from usage_metadata: {token_usage}")

# Fallback: response.response_metadata (OpenAI format)
if not token_usage and hasattr(response, 'response_metadata'):
metadata = response.response_metadata
logger.info(f"response_metadata: {metadata}")
usage_metadata = metadata.get('usage', {})
logger.info(f"usage from response_metadata: {usage_metadata}")

token_usage = usage_metadata.get('total_tokens', 0)
# Fallback: prompt_tokens + completion_tokens
if not token_usage:
token_usage = usage_metadata.get('prompt_tokens', 0) + usage_metadata.get('completion_tokens', 0)
# Fallback: input_tokens + output_tokens
if not token_usage:
token_usage = usage_metadata.get('input_tokens', 0) + usage_metadata.get('output_tokens', 0)
# Alternative: usage_metadata attribute (dict or object)
if not token_usage and hasattr(response, 'usage_metadata') and response.usage_metadata:
usage = response.usage_metadata
if isinstance(usage, dict):
token_usage = usage.get('total_tokens', 0) or (usage.get('input_tokens', 0) + usage.get('output_tokens', 0))
else:
token_usage = getattr(usage, 'total_tokens', 0) or (getattr(usage, 'input_tokens', 0) + getattr(usage, 'output_tokens', 0))

logger.info(f"Extracted token_usage from response_metadata: {token_usage}")

logger.info(f"Final token_usage: {token_usage}")
logger.info(f"=== TOKEN DEBUG END ===")

return response_content, token_usage

return response_content
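
The extraction order above (usage_metadata first, then response_metadata['usage'] with two key-name fallbacks) can be checked without a live model call. The sketch below mirrors that logic against hand-built stubs; the stub shapes are assumptions for illustration, not actual LangChain response objects.

```python
# Sketch of the token-extraction fallback chain used in _call_llm.
# The SimpleNamespace stubs only imitate the attribute shapes; they are
# not real LangChain AIMessage objects.
from types import SimpleNamespace

def extract_token_usage(response) -> int:
    token_usage = 0
    usage = getattr(response, "usage_metadata", None)
    if usage:  # Gemini-pool style: usage_metadata dict or object
        if isinstance(usage, dict):
            token_usage = usage.get("total_tokens", 0) or (
                usage.get("input_tokens", 0) + usage.get("output_tokens", 0)
            )
        else:
            token_usage = getattr(usage, "total_tokens", 0) or (
                getattr(usage, "input_tokens", 0) + getattr(usage, "output_tokens", 0)
            )
    if not token_usage and hasattr(response, "response_metadata"):
        u = response.response_metadata.get("usage", {})  # OpenAI style
        token_usage = (
            u.get("total_tokens", 0)
            or (u.get("prompt_tokens", 0) + u.get("completion_tokens", 0))
            or (u.get("input_tokens", 0) + u.get("output_tokens", 0))
        )
    return token_usage

gemini_like = SimpleNamespace(
    usage_metadata={"input_tokens": 120, "output_tokens": 80}, response_metadata={}
)
openai_like = SimpleNamespace(
    usage_metadata=None,
    response_metadata={"usage": {"prompt_tokens": 90, "completion_tokens": 10}},
)
assert extract_token_usage(gemini_like) == 200
assert extract_token_usage(openai_like) == 100
```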
@@ -641,7 +665,11 @@ def _node(state: TableState) -> TableState:
errors.append(f"QA prompt missing placeholder: {e}")
return {**state, "errors": errors}

response_text = _call_llm(llm, prompt)
response_text, token_usage = _call_llm(llm, prompt, return_token_usage=True)

# Debug log for token usage
logger.info(f"QA generation token usage: {token_usage}")

response_json = robust_json_parse(response_text)

qa_results = []
@@ -650,7 +678,8 @@
else:
logger.warning("QA generation did not return valid JSON or 'qa_pairs' key.")

return {**state, "qa_results": qa_results}
logger.info(f"Returning token_usage: {token_usage}")
return {**state, "qa_results": qa_results, "token_usage": token_usage}

return _node

@@ -685,6 +714,10 @@ def _node(state: TableState) -> TableState:
prompt = prompt_template

response_text, token_usage = _call_llm(llm, prompt, image_urls=image_data_urls, return_token_usage=True)

# Debug log for token usage
logger.info(f"QA generation token usage: {token_usage}")

response_json = robust_json_parse(response_text)

qa_results = []
@@ -693,6 +726,7 @@
else:
logger.warning("QA generation from image did not return valid JSON or 'qa_pairs' key.")

logger.info(f"Returning token_usage: {token_usage}")
return {**state, "qa_results": qa_results, "token_usage": token_usage}

return _node
@@ -877,6 +911,8 @@ def run_synthetic_table_flow(
temperature: float = 0.2,
base_url: str | None = None,
config_path: str | None = None,
azure_deployment: str | None = None,
azure_endpoint: str | None = None,
qa_only: bool = False,
image_paths: List[str] | None = None,
domain: str | None = None,
@@ -891,11 +927,13 @@

Args:
image_path: Path to the input image or HTML file
provider: LLM provider (openai, gemini, gemini_pool, claude, vllm)
provider: LLM provider (openai, azure, gemini, gemini_pool, claude, vllm)
model: Model name
temperature: Sampling temperature
base_url: Custom base URL for vLLM
config_path: Config path for gemini_pool
azure_deployment: Azure OpenAI deployment name
azure_endpoint: Azure OpenAI endpoint URL
qa_only: If True, skip synthetic data generation and only generate QA from image
image_paths: Optional list of image paths for multi-image processing
domain: Optional domain for prompt customization (e.g. 'public')
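
For context, the updated entry point could be driven with the new Azure arguments roughly as follows. This is a hedged sketch: the deployment name, endpoint, and input path are placeholders, and it assumes the flow returns its final state dict so qa_results and token_usage can be read back.

```python
# Hypothetical invocation of the updated flow with the azure provider.
# Deployment name, endpoint, and image path are placeholders.
from generate_synthetic_table.flow import run_synthetic_table_flow

state = run_synthetic_table_flow(
    image_path="I_origin_0/sample_table.png",            # placeholder input
    provider="azure",
    model="gpt-4o",
    temperature=0.2,
    azure_deployment="my-gpt4o-deployment",              # placeholder
    azure_endpoint="https://example.openai.azure.com/",  # placeholder
    qa_only=True,
    domain="public",
)
# Assuming the final state dict is returned, the token accounting added by
# the QA nodes is available alongside the QA pairs.
print(len(state.get("qa_results", [])), state.get("token_usage", 0))
```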
57 changes: 55 additions & 2 deletions generate_synthetic_table/llm_factory.py
@@ -1,35 +1,64 @@
from __future__ import annotations

import os
import yaml
from pathlib import Path
from typing import Optional

from langchain_core.language_models import BaseChatModel
from langchain_openai import ChatOpenAI
from langchain_openai import ChatOpenAI, AzureChatOpenAI
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_anthropic import ChatAnthropic

# Import GeminiPoolChatModel from the polling_gemini module
from polling_gemini import GeminiPoolChatModel, create_gemini_chat_model


def _load_azure_config_from_yaml(config_path: Optional[str] = None) -> dict:
"""Load Azure OpenAI configuration from gemini_keys.yaml file."""
if not config_path:
config_path = "apis/gemini_keys.yaml"

config_file = Path(config_path)
if not config_file.exists():
return {}

try:
with open(config_file, 'r', encoding='utf-8') as f:
config = yaml.safe_load(f)

return {
'api_key': config.get('AZURE_OPENAI_API_KEY'),
'endpoint': config.get('AZURE_OPENAI_ENDPOINT'),
'api_version': config.get('AZURE_OPENAI_API_VERSION'),
'deployment': config.get('AZURE_OPENAI_DEPLOYMENT_NAME'),
}
except Exception:
return {}


def get_llm(
provider: str,
model: str,
temperature: float = 0.2,
base_url: Optional[str] = None,
api_key: Optional[str] = None,
config_path: Optional[str] = None,
azure_deployment: Optional[str] = None,
azure_endpoint: Optional[str] = None,
) -> BaseChatModel:
"""
Factory to create a Chat Model based on the provider.

Args:
provider: 'openai', 'gemini', 'gemini_pool', 'claude', or 'vllm'
provider: 'openai', 'azure', 'gemini', 'gemini_pool', 'claude', or 'vllm'
model: Model name (e.g., 'gpt-4', 'gemini-1.5-flash', 'claude-sonnet-4-20250514')
temperature: Sampling temperature
base_url: Optional base URL for vLLM or custom OpenAI endpoints
api_key: Optional API key override
config_path: Optional config path for gemini_pool (apis/gemini_keys.yaml)
azure_deployment: Azure OpenAI deployment name (required for azure provider)
azure_endpoint: Azure OpenAI endpoint URL (required for azure provider)

Returns:
A configured LangChain Chat Model
@@ -66,6 +95,30 @@ def get_llm(
temperature=temperature,
)

elif provider == "azure":
# Azure OpenAI
# Priority: CLI arguments > yaml file at config_path > environment variables
yaml_config = _load_azure_config_from_yaml(config_path)

azure_key = api_key or yaml_config.get('api_key') or os.getenv("AZURE_OPENAI_API_KEY")
azure_ep = azure_endpoint or yaml_config.get('endpoint') or os.getenv("AZURE_OPENAI_ENDPOINT")
azure_dep = azure_deployment or yaml_config.get('deployment') or model
azure_ver = yaml_config.get('api_version') or os.getenv("AZURE_OPENAI_API_VERSION") or "2024-02-15-preview"

if not azure_key or not azure_ep:
raise ValueError(
"Azure OpenAI requires AZURE_OPENAI_API_KEY and AZURE_OPENAI_ENDPOINT. "
"Set via environment variables, --azure-* arguments, or in apis/gemini_keys.yaml."
)

return AzureChatOpenAI(
azure_deployment=azure_dep,
azure_endpoint=azure_ep,
api_key=azure_key,
api_version=azure_ver,
temperature=temperature,
)

elif provider == "claude":
# Anthropic Claude API
# Uses the ANTHROPIC_API_KEY environment variable or the api_key parameter
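
A minimal sketch of exercising the new azure branch is shown below; the endpoint and deployment values are placeholders, and the comment lists the YAML keys that _load_azure_config_from_yaml reads from apis/gemini_keys.yaml.

```python
# Hedged sketch: resolution order is explicit arguments >
# apis/gemini_keys.yaml (AZURE_OPENAI_API_KEY, AZURE_OPENAI_ENDPOINT,
# AZURE_OPENAI_API_VERSION, AZURE_OPENAI_DEPLOYMENT_NAME) >
# environment variables. All values below are placeholders.
import os
from generate_synthetic_table.llm_factory import get_llm

os.environ.setdefault("AZURE_OPENAI_API_KEY", "placeholder-key")

llm = get_llm(
    provider="azure",
    model="gpt-4o",                                      # used as deployment fallback
    azure_deployment="my-gpt4o-deployment",              # placeholder
    azure_endpoint="https://example.openai.azure.com/",  # placeholder
)
print(type(llm).__name__)  # AzureChatOpenAI
```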
10 changes: 8 additions & 2 deletions generate_synthetic_table/prompts/insurance.yaml
@@ -15,14 +15,17 @@ generate_qa: |
3. **Language:** The questions and answers MUST be in Korean.
4. **Reasoning Language:** reasoning_annotation MUST be written in English and MUST be a single string (not a list).
5. **Accuracy:** Ensure all answers are factually correct based on the data in the table.
6. **Context:** Include the specific table cells or rows that were used to answer the question.

**Output Format (JSON):**
{{
"qa_pairs": [
{{
"question": "...",
"answer": "...",
"type": "lookup"
"type": "lookup",
"reasoning_annotation": "Detailed explanation of how the answer was derived (in English, single string)",
"context": "Specific table cells/rows used (e.g., 'Row 2, Column 3: Premium amount')"
}},
...
]
@@ -43,14 +46,17 @@ generate_qa_from_image: |
4. **Language:** The questions and answers MUST be in Korean.
5. **Reasoning Language:** reasoning_annotation MUST be written in English and MUST be a single string (not a list).
6. **Accuracy:** Ensure 100% factual correctness.
7. **Context:** Include specific cell references or table sections used to derive the answer.

**Output Format (JSON):**
{{
"qa_pairs": [
{{
"question": "...",
"answer": "...",
"type": "lookup"
"type": "lookup",
"reasoning_annotation": "Step-by-step reasoning process in English (single string)",
"context": "Table location used (e.g., 'Premium column, Row 3')"
}},
...
]
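
Both templates now ask for two extra keys per QA pair (reasoning_annotation and context). A small sketch of validating that downstream, with an invented sample payload:

```python
# Sketch: check that each parsed qa_pair carries the fields required by the
# updated prompts. The sample payload below is invented for illustration.
import json

sample = json.loads("""
{
  "qa_pairs": [
    {
      "question": "월 보험료는 얼마입니까?",
      "answer": "월 35,000원입니다.",
      "type": "lookup",
      "reasoning_annotation": "The premium is read directly from the monthly premium cell.",
      "context": "Row 2, Column 3: Premium amount"
    }
  ]
}
""")

required = {"question", "answer", "type", "reasoning_annotation", "context"}
for pair in sample["qa_pairs"]:
    missing = required - pair.keys()
    assert not missing, f"qa_pair missing keys: {missing}"
# reasoning_annotation must be a single string, not a list
assert isinstance(sample["qa_pairs"][0]["reasoning_annotation"], str)
```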