# vectorless-engine/config.server.example.yaml (hallelx2/vectorless-engine, main)
# Vectorless Server — example configuration.
#
# Every value below has a sensible default. Override with env vars
# (VLS_* for server, VLE_* for engine) or edit this file.
#
# Precedence: env vars > YAML file > built-in defaults.

# ── Server ─────────────────────────────────────────────────────────
server:
  # Listen address; only the port is specified here.
  addr: ':8080'
  # Timeouts, written as duration strings with a unit suffix.
  read_timeout: 30s
  write_timeout: 120s
  drain_timeout: 15s

  # Optional direct TLS. With cert_file and key_file left empty, TLS is
  # expected to be terminated at your proxy instead.
  tls:
    cert_file: ''
    key_file: ''
    # Accepted values: '1.2' or '1.3'. Keep it quoted so it stays a string.
    min_version: '1.2'

# ── Authentication ─────────────────────────────────────────────────
auth:
  # mode selects how requests are authenticated:
  #   'none'    — every request is anonymous (default, for local dev).
  #   'api_key' — requests must send Authorization: Bearer <key>.
  mode: 'none'
  # In production, supply the key through VLS_AUTH_API_KEY.
  api_key: ''

# ── Prometheus Metrics ─────────────────────────────────────────────
metrics:
  # When true, the /metrics endpoint is served.
  enabled: true

# ── OpenTelemetry Tracing ──────────────────────────────────────────
tracing:
  enabled: false
  # Address of the OTLP gRPC collector.
  endpoint: 'localhost:4317'
  # true disables TLS — meant for local dev.
  insecure: true
  service_name: 'vectorless-server'
  # Range 0.0–1.0; 1.0 samples everything.
  sample_rate: 1.0

# ── Rate Limiting ──────────────────────────────────────────────────
rate_limit:
  # Rate limiting is off in this example. requests_per_minute presumably
  # only takes effect once enabled is true — TODO confirm against the
  # server implementation.
  enabled: false
  requests_per_minute: 600

# ── Engine Configuration ───────────────────────────────────────────
# Everything below is passed through to the vectorless engine.
engine:
  database:
    # Postgres connection URL. This example value points at localhost,
    # carries inline credentials, and sets sslmode=disable.
    url: 'postgres://vectorless:vectorless@localhost:5432/vectorless?sslmode=disable'
    max_conns: 10

  storage:
    # Supported drivers: 'local' or 's3'.
    driver: 'local'
    local:
      root: './data/documents'
    # Example s3 block, left commented out:
    # s3:
    #   endpoint: 'http://localhost:9000'
    #   region: 'us-east-1'
    #   bucket: 'vectorless'
    #   access_key: 'minioadmin'
    #   secret_key: 'minioadmin'
    #   use_path_style: true

  queue:
    # Supported drivers: 'river', 'qstash', or 'asynq'.
    driver: 'river'
    river:
      num_workers: 10
    # Example qstash block, left commented out:
    # qstash:
    #   token: ''
    #   webhook_base_url: 'https://your-server.com'
    #   current_signing_key: ''
    #   next_signing_key: ''
    # Example asynq block, left commented out:
    # asynq:
    #   addr: 'localhost:6379'
    #   password: ''
    #   db: 0
    #   concurrency: 20

  llm:
    # Supported drivers: 'anthropic', 'openai', or 'gemini'.
    driver: 'anthropic'
    anthropic:
      # Set via VLS_ANTHROPIC_API_KEY.
      api_key: ''
      model: 'claude-sonnet-4-20250514'
      reasoning_model: ''
    # Example openai block, left commented out:
    # openai:
    #   api_key: ''
    #   model: 'gpt-4o'
    #   reasoning_model: ''
    # Example gemini block, left commented out:
    # gemini:
    #   api_key: ''
    #   model: 'gemini-2.0-flash'
    #   reasoning_model: ''

  retrieval:
    # Supported strategies: 'single-pass' or 'chunked-tree'.
    strategy: 'chunked-tree'
    # Settings under the chunked_tree key.
    chunked_tree:
      max_tokens_per_call: 60000
      max_parallel_calls: 8
      include_sibling_breadcrumbs: true

  ingest:
    # Ingest mode — 'full' (default) or 'minimal':
    #   full     parse -> persist -> summarize -> HyDE -> multi-axis ->
    #            TOC build. Maximum retrieval quality, but it takes
    #            minutes on a large filing.
    #   minimal  parse -> persist -> ready. Every LLM enrichment stage
    #            and table extraction are skipped, so the document is
    #            queryable in seconds. The page-based strategy
    #            (/v1/answer/treewalk) works on it unchanged
    #            (synthesised TOC + raw page reads).
    # To flip the live service without a secret edit, set
    # VLS_INGEST_MODE=minimal.
    mode: 'full'

    # Cap on the total number of LLM calls in flight across the summarize
    # and HyDE stages combined (the two stages run concurrently).
    # 0 disables the global cap; the default is 12.
    # Ignored when mode is 'minimal', because no LLM stages run.
    global_llm_concurrency: 12

    # HyDE: candidate questions generated per leaf section. At query time
    # they are folded into the retrieval prompt to widen recall on
    # queries that don't echo the section's exact wording.
    hyde:
      enabled: true
      # Empty means the same model as summarization.
      model: ''
      num_questions: 5
      concurrency: 4

    # Multi-axis structured summaries (Phase 2.5). The JSON-mode
    # summarizer returns {topics, entities, numbers, one_line}. Entities
    # and numbers are surfaced on the section line of the retrieval
    # prompt; one_line continues to populate the flat `summary` field
    # for backward compatibility.
    summary_axes:
      enabled: true
      max_topics: 4
      max_entities: 8
      max_numbers: 6

  log:
    # One of 'debug', 'info', 'warn', 'error'.
    level: 'info'
    # Either 'json' or 'console'.
    format: 'json'