From 17b87eff67e9bf3c8b7c53877863603aef66d7a9 Mon Sep 17 00:00:00 2001 From: Deborah Jacob Date: Fri, 6 Feb 2026 10:12:35 -0500 Subject: [PATCH 1/2] feat: Botanu SDK for Python - OpenTelemetry-native cost attribution Initial implementation of the Botanu SDK providing run-level cost attribution for AI workflows. Features: - @botanu_use_case decorator for entry point registration - UUIDv7 run_id generation and W3C Baggage propagation - Auto-instrumentation for LLM providers (OpenAI, Anthropic, etc.) - Auto-instrumentation for databases, HTTP clients, frameworks - Kubernetes zero-code deployment via OTel Operator - AttemptLedger for durable cost tracking - 100% trace capture (no sampling) for accurate cost attribution Documentation: - Getting started guide and quickstart - Kubernetes deployment guide for 2000+ services - API reference for decorators and configuration - Best practices and anti-patterns Co-Authored-By: Claude Opus 4.5 Signed-off-by: Deborah Jacob --- .clomonitor.yml | 27 + .github/ISSUE_TEMPLATE/bug_report.yml | 88 +++ .github/ISSUE_TEMPLATE/config.yml | 11 + .github/ISSUE_TEMPLATE/feature_request.yml | 53 ++ .github/PULL_REQUEST_TEMPLATE.md | 38 ++ .github/workflows/ci.yml | 104 ++++ .github/workflows/codeql.yml | 40 ++ .github/workflows/release.yml | 137 ++++ .github/workflows/repolinter.yml | 24 + .github/workflows/scorecard.yml | 35 ++ .pre-commit-config.yaml | 46 ++ .repolinterrc.yml | 124 ++++ CHANGELOG.md | 75 +++ CODE_OF_CONDUCT.md | 3 + CONTRIBUTING.md | 87 +++ DCO | 34 + GOVERNANCE.md | 39 ++ LICENSE | 212 ++++++- MAINTAINERS.md | 29 + NOTICE | 17 + README.md | 97 +++ RELEASE.md | 199 ++++++ SECURITY.md | 36 ++ docs/api/configuration.md | 417 +++++++++++++ docs/api/decorators.md | 99 +++ docs/api/tracking.md | 511 +++++++++++++++ docs/concepts/architecture.md | 265 ++++++++ docs/concepts/context-propagation.md | 239 +++++++ docs/concepts/run-context.md | 188 ++++++ docs/getting-started/configuration.md | 271 ++++++++ 
docs/getting-started/installation.md | 141 +++++ docs/getting-started/quickstart.md | 71 +++ docs/index.md | 65 ++ docs/integration/auto-instrumentation.md | 130 ++++ docs/integration/collector.md | 422 +++++++++++++ docs/integration/existing-otel.md | 295 +++++++++ docs/integration/kubernetes.md | 382 ++++++++++++ docs/patterns/anti-patterns.md | 490 +++++++++++++++ docs/patterns/best-practices.md | 416 +++++++++++++ docs/tracking/data-tracking.md | 412 ++++++++++++ docs/tracking/llm-tracking.md | 332 ++++++++++ docs/tracking/outcomes.md | 363 +++++++++++ pyproject.toml | 220 +++++++ src/botanu/__init__.py | 76 +++ src/botanu/_version.py | 13 + src/botanu/models/__init__.py | 10 + src/botanu/models/run_context.py | 320 ++++++++++ src/botanu/processors/__init__.py | 12 + src/botanu/processors/enricher.py | 81 +++ src/botanu/py.typed | 0 src/botanu/resources/__init__.py | 8 + src/botanu/resources/detector.py | 366 +++++++++++ src/botanu/sdk/__init__.py | 38 ++ src/botanu/sdk/bootstrap.py | 309 +++++++++ src/botanu/sdk/config.py | 294 +++++++++ src/botanu/sdk/context.py | 68 ++ src/botanu/sdk/decorators.py | 280 +++++++++ src/botanu/sdk/middleware.py | 99 +++ src/botanu/sdk/span_helpers.py | 93 +++ src/botanu/tracking/__init__.py | 77 +++ src/botanu/tracking/data.py | 488 +++++++++++++++ src/botanu/tracking/ledger.py | 420 +++++++++++++ src/botanu/tracking/llm.py | 688 +++++++++++++++++++++ tests/conftest.py | 58 ++ tests/integration/__init__.py | 2 + tests/unit/__init__.py | 2 + tests/unit/test_config.py | 203 ++++++ tests/unit/test_context.py | 63 ++ tests/unit/test_data_tracking.py | 209 +++++++ tests/unit/test_decorators.py | 124 ++++ tests/unit/test_enricher.py | 160 +++++ tests/unit/test_ledger.py | 277 +++++++++ tests/unit/test_llm_tracking.py | 307 +++++++++ tests/unit/test_resource_detector.py | 269 ++++++++ tests/unit/test_run_context.py | 204 ++++++ tests/unit/test_span_helpers.py | 124 ++++ 76 files changed, 13009 insertions(+), 17 deletions(-) create 
mode 100644 .clomonitor.yml create mode 100644 .github/ISSUE_TEMPLATE/bug_report.yml create mode 100644 .github/ISSUE_TEMPLATE/config.yml create mode 100644 .github/ISSUE_TEMPLATE/feature_request.yml create mode 100644 .github/PULL_REQUEST_TEMPLATE.md create mode 100644 .github/workflows/ci.yml create mode 100644 .github/workflows/codeql.yml create mode 100644 .github/workflows/release.yml create mode 100644 .github/workflows/repolinter.yml create mode 100644 .github/workflows/scorecard.yml create mode 100644 .pre-commit-config.yaml create mode 100644 .repolinterrc.yml create mode 100644 CHANGELOG.md create mode 100644 CODE_OF_CONDUCT.md create mode 100644 CONTRIBUTING.md create mode 100644 DCO create mode 100644 GOVERNANCE.md create mode 100644 MAINTAINERS.md create mode 100644 NOTICE create mode 100644 README.md create mode 100644 RELEASE.md create mode 100644 SECURITY.md create mode 100644 docs/api/configuration.md create mode 100644 docs/api/decorators.md create mode 100644 docs/api/tracking.md create mode 100644 docs/concepts/architecture.md create mode 100644 docs/concepts/context-propagation.md create mode 100644 docs/concepts/run-context.md create mode 100644 docs/getting-started/configuration.md create mode 100644 docs/getting-started/installation.md create mode 100644 docs/getting-started/quickstart.md create mode 100644 docs/index.md create mode 100644 docs/integration/auto-instrumentation.md create mode 100644 docs/integration/collector.md create mode 100644 docs/integration/existing-otel.md create mode 100644 docs/integration/kubernetes.md create mode 100644 docs/patterns/anti-patterns.md create mode 100644 docs/patterns/best-practices.md create mode 100644 docs/tracking/data-tracking.md create mode 100644 docs/tracking/llm-tracking.md create mode 100644 docs/tracking/outcomes.md create mode 100644 pyproject.toml create mode 100644 src/botanu/__init__.py create mode 100644 src/botanu/_version.py create mode 100644 src/botanu/models/__init__.py create 
mode 100644 src/botanu/models/run_context.py create mode 100644 src/botanu/processors/__init__.py create mode 100644 src/botanu/processors/enricher.py create mode 100644 src/botanu/py.typed create mode 100644 src/botanu/resources/__init__.py create mode 100644 src/botanu/resources/detector.py create mode 100644 src/botanu/sdk/__init__.py create mode 100644 src/botanu/sdk/bootstrap.py create mode 100644 src/botanu/sdk/config.py create mode 100644 src/botanu/sdk/context.py create mode 100644 src/botanu/sdk/decorators.py create mode 100644 src/botanu/sdk/middleware.py create mode 100644 src/botanu/sdk/span_helpers.py create mode 100644 src/botanu/tracking/__init__.py create mode 100644 src/botanu/tracking/data.py create mode 100644 src/botanu/tracking/ledger.py create mode 100644 src/botanu/tracking/llm.py create mode 100644 tests/conftest.py create mode 100644 tests/integration/__init__.py create mode 100644 tests/unit/__init__.py create mode 100644 tests/unit/test_config.py create mode 100644 tests/unit/test_context.py create mode 100644 tests/unit/test_data_tracking.py create mode 100644 tests/unit/test_decorators.py create mode 100644 tests/unit/test_enricher.py create mode 100644 tests/unit/test_ledger.py create mode 100644 tests/unit/test_llm_tracking.py create mode 100644 tests/unit/test_resource_detector.py create mode 100644 tests/unit/test_run_context.py create mode 100644 tests/unit/test_span_helpers.py diff --git a/.clomonitor.yml b/.clomonitor.yml new file mode 100644 index 0000000..81639fe --- /dev/null +++ b/.clomonitor.yml @@ -0,0 +1,27 @@ +# SPDX-FileCopyrightText: 2026 The Botanu Authors +# SPDX-License-Identifier: Apache-2.0 +# +# CLOMonitor metadata — used by LF AI & Data Foundation to track +# project maturity and best-practice adoption. 
+# See: https://clomonitor.io/docs/topics/checks/ + +# Documentation +documentation: + adopters: "https://github.com/botanu-ai/botanu-sdk-python/blob/main/ADOPTERS.md" + changelog: "https://github.com/botanu-ai/botanu-sdk-python/blob/main/CHANGELOG.md" + code_of_conduct: "https://github.com/botanu-ai/botanu-sdk-python/blob/main/CODE_OF_CONDUCT.md" + contributing: "https://github.com/botanu-ai/botanu-sdk-python/blob/main/CONTRIBUTING.md" + governance: "https://github.com/botanu-ai/botanu-sdk-python/blob/main/GOVERNANCE.md" + maintainers: "https://github.com/botanu-ai/botanu-sdk-python/blob/main/MAINTAINERS.md" + readme: "https://github.com/botanu-ai/botanu-sdk-python/blob/main/README.md" + security: "https://github.com/botanu-ai/botanu-sdk-python/blob/main/SECURITY.md" + +# License +license: + approved: true + spdx_id: "Apache-2.0" + +# Best practices +best_practices: + dco: true + openssf_badge: false # TODO: apply at https://www.bestpractices.dev/ diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml new file mode 100644 index 0000000..da664ab --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -0,0 +1,88 @@ +# SPDX-FileCopyrightText: 2026 The Botanu Authors +# SPDX-License-Identifier: Apache-2.0 + +name: Bug Report +description: Report a bug in the Botanu SDK +labels: ["bug", "triage"] +body: + - type: markdown + attributes: + value: | + Thanks for taking the time to report a bug. + Please fill in the details below to help us reproduce and fix the issue. 
+ + - type: input + id: version + attributes: + label: Botanu SDK version + description: "Output of `python -c 'import botanu; print(botanu.__version__)'`" + placeholder: "0.1.0" + validations: + required: true + + - type: input + id: python-version + attributes: + label: Python version + description: "Output of `python --version`" + placeholder: "3.12.1" + validations: + required: true + + - type: dropdown + id: init-mode + attributes: + label: Initialization mode + options: + - Standalone (no existing TracerProvider) + - Attach (OTEL-native vendor — Splunk, Honeycomb, etc.) + - Alongside (proprietary agent — Datadog, New Relic, etc.) + - Unknown / not sure + validations: + required: true + + - type: textarea + id: description + attributes: + label: Description + description: A clear and concise description of the bug. + validations: + required: true + + - type: textarea + id: reproduce + attributes: + label: Steps to reproduce + description: Minimal code or steps to reproduce the issue. + render: python + validations: + required: true + + - type: textarea + id: expected + attributes: + label: Expected behavior + description: What you expected to happen. + validations: + required: true + + - type: textarea + id: actual + attributes: + label: Actual behavior + description: What actually happened. Include tracebacks if applicable. + render: shell + validations: + required: true + + - type: textarea + id: context + attributes: + label: Additional context + description: | + - OS and platform + - OTel SDK / instrumentation versions + - Existing observability vendor (Datadog, Splunk, etc.) 
+ - Collector configuration + validations: + required: false diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000..4acc5ec --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,11 @@ +# SPDX-FileCopyrightText: 2026 The Botanu Authors +# SPDX-License-Identifier: Apache-2.0 + +blank_issues_enabled: false +contact_links: + - name: Questions & Discussions + url: https://github.com/botanu-ai/botanu-sdk-python/discussions + about: Ask questions and discuss ideas + - name: Security Vulnerabilities + url: https://github.com/botanu-ai/botanu-sdk-python/blob/main/SECURITY.md + about: Report security vulnerabilities privately (do NOT open a public issue) diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml new file mode 100644 index 0000000..d35d736 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -0,0 +1,53 @@ +# SPDX-FileCopyrightText: 2026 The Botanu Authors +# SPDX-License-Identifier: Apache-2.0 + +name: Feature Request +description: Suggest a new feature or enhancement +labels: ["enhancement"] +body: + - type: markdown + attributes: + value: | + Thanks for suggesting an improvement to Botanu SDK! + + - type: textarea + id: problem + attributes: + label: Problem statement + description: What problem does this feature solve? Is this related to a frustration? + validations: + required: true + + - type: textarea + id: solution + attributes: + label: Proposed solution + description: Describe the solution you'd like. Include API sketches if possible. + validations: + required: true + + - type: textarea + id: alternatives + attributes: + label: Alternatives considered + description: Any alternative approaches you've considered. + validations: + required: false + + - type: dropdown + id: scope + attributes: + label: Which component does this affect? 
+ multiple: true + options: + - Core SDK (bootstrap / attach) + - Run context / decorators + - Span processors + - Carrier propagation (SQS, Kafka, Celery) + - LLM / GenAI tracking + - Resource detection + - Collector configuration + - Documentation + - Other + validations: + required: true diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000..42cfbe4 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,38 @@ + + + +## Summary + + + +## Changes + + + +- + +## Type of change + + + +- [ ] Bug fix (non-breaking change that fixes an issue) +- [ ] New feature (non-breaking change that adds functionality) +- [ ] Breaking change (fix or feature that would cause existing functionality to change) +- [ ] Documentation update +- [ ] CI / build / tooling + +## Testing + + + +- [ ] Unit tests pass (`pytest`) +- [ ] Lint passes (`ruff check`) +- [ ] Type check passes (`mypy`) + +## Checklist + +- [ ] My code follows the project's coding style +- [ ] I have added SPDX headers to new files +- [ ] I have added tests for my changes +- [ ] I have updated documentation if needed +- [ ] All commits are signed off (`git commit -s`) per the [DCO](../DCO) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..757042e --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,104 @@ +# SPDX-FileCopyrightText: 2026 The Botanu Authors +# SPDX-License-Identifier: Apache-2.0 + +name: CI + +on: + push: + branches: [main, developer-deborah] + pull_request: + branches: [main] + +permissions: + contents: read + +jobs: + # ------------------------------------------------------------------- + # Lint & format check + # ------------------------------------------------------------------- + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + - run: pip install ruff + - run: ruff check src/ tests/ + - 
run: ruff format --check src/ tests/ + + # ------------------------------------------------------------------- + # Type checking + # ------------------------------------------------------------------- + typecheck: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + - run: pip install -e ".[dev]" + - run: mypy src/botanu/ + + # ------------------------------------------------------------------- + # Test matrix — Python 3.9 → 3.13 + # ------------------------------------------------------------------- + test: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 # hatch-vcs needs full history + + - uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: pip install -e ".[dev]" + + - name: Run tests with coverage + run: pytest --cov=botanu --cov-report=xml --cov-report=term-missing + + - name: Upload coverage + if: matrix.python-version == '3.12' + uses: codecov/codecov-action@v4 + with: + file: coverage.xml + fail_ci_if_error: false + + # ------------------------------------------------------------------- + # Build verification — ensure the package builds cleanly + # ------------------------------------------------------------------- + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + - run: pip install build + - run: python -m build + - uses: actions/upload-artifact@v4 + with: + name: dist + path: dist/ + + # ------------------------------------------------------------------- + # DCO sign-off check (required by Linux Foundation) + # ------------------------------------------------------------------- + dco: + runs-on: ubuntu-latest + if: github.event_name == 'pull_request' + 
steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: DCO check + run: pip install -U dco-check && dco-check --verbose diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml new file mode 100644 index 0000000..b0d5105 --- /dev/null +++ b/.github/workflows/codeql.yml @@ -0,0 +1,40 @@ +# SPDX-FileCopyrightText: 2026 The Botanu Authors +# SPDX-License-Identifier: Apache-2.0 + +name: CodeQL + +on: + push: + branches: [main] + pull_request: + branches: [main] + schedule: + - cron: "23 4 * * 1" # Weekly Monday 04:23 UTC + +permissions: + contents: read + +jobs: + analyze: + runs-on: ubuntu-latest + permissions: + security-events: write + strategy: + fail-fast: false + matrix: + language: [python] + steps: + - uses: actions/checkout@v4 + + - name: Initialize CodeQL + uses: github/codeql-action/init@v3 + with: + languages: ${{ matrix.language }} + + - name: Autobuild + uses: github/codeql-action/autobuild@v3 + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v3 + with: + category: "/language:${{ matrix.language }}" diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..a2ed4b5 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,137 @@ +# SPDX-FileCopyrightText: 2026 The Botanu Authors +# SPDX-License-Identifier: Apache-2.0 + +name: Release to PyPI + +on: + push: + tags: + - "v*" + workflow_dispatch: + inputs: + publish_target: + description: 'Publish target' + required: true + default: 'testpypi' + type: choice + options: + - testpypi + - pypi + +permissions: + contents: read + +jobs: + # ------------------------------------------------------------------- + # Build the package + # ------------------------------------------------------------------- + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 # hatch-vcs needs full history + + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install build tools + run: pip
install build twine + + - name: Build sdist and wheel + run: python -m build + + - name: Check package with twine + run: twine check dist/* + + - name: List build artifacts + run: ls -la dist/ + + - uses: actions/upload-artifact@v4 + with: + name: dist + path: dist/ + + # ------------------------------------------------------------------- + # Publish to TestPyPI (manual trigger or pre-release tags) + # Uses Trusted Publishing (OIDC — no API tokens needed) + # Requires TestPyPI project to be configured for GitHub OIDC: + # https://test.pypi.org/manage/project/botanu/settings/publishing/ + # ------------------------------------------------------------------- + publish-testpypi: + needs: build + if: | + (github.event_name == 'workflow_dispatch' && github.event.inputs.publish_target == 'testpypi') + || (github.event_name == 'push' && (contains(github.ref, '-alpha') || contains(github.ref, '-beta') || contains(github.ref, '-rc'))) + runs-on: ubuntu-latest + environment: + name: testpypi + url: https://test.pypi.org/p/botanu + permissions: + id-token: write # required for OIDC trusted publishing + steps: + - uses: actions/download-artifact@v4 + with: + name: dist + path: dist/ + + - name: Publish to TestPyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + repository-url: https://test.pypi.org/legacy/ + skip-existing: true + + # ------------------------------------------------------------------- + # Publish to PyPI via Trusted Publishing (OIDC — no API tokens) + # Requires PyPI project to be configured for GitHub OIDC: + # https://pypi.org/manage/project/botanu/settings/publishing/ + # ------------------------------------------------------------------- + publish-pypi: + needs: build + if: | + (github.event_name == 'workflow_dispatch' && github.event.inputs.publish_target == 'pypi') + || (github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && !contains(github.ref, '-')) + runs-on: ubuntu-latest + environment: + name: pypi + url:
https://pypi.org/p/botanu + permissions: + id-token: write # required for OIDC trusted publishing + steps: + - uses: actions/download-artifact@v4 + with: + name: dist + path: dist/ + + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + + # ------------------------------------------------------------------- + # Create GitHub Release with auto-generated notes + # ------------------------------------------------------------------- + github-release: + needs: [build, publish-pypi] + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') + runs-on: ubuntu-latest + permissions: + contents: write + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - uses: actions/download-artifact@v4 + with: + name: dist + path: dist/ + + - name: Create GitHub Release + env: + GH_TOKEN: ${{ github.token }} + run: | + if [[ "${{ github.ref_name }}" == *"-"* ]]; then + gh release create "${{ github.ref_name }}" dist/* --generate-notes --prerelease + else + gh release create "${{ github.ref_name }}" dist/* --generate-notes + fi diff --git a/.github/workflows/repolinter.yml b/.github/workflows/repolinter.yml new file mode 100644 index 0000000..1c07d88 --- /dev/null +++ b/.github/workflows/repolinter.yml @@ -0,0 +1,24 @@ +# SPDX-FileCopyrightText: 2026 The Botanu Authors +# SPDX-License-Identifier: Apache-2.0 + +name: Repolinter + +on: + push: + branches: [main] + pull_request: + branches: [main] + +permissions: + contents: read + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Run Repolinter + uses: todogroup/repolinter-action@v1 + with: + config_url: https://raw.githubusercontent.com/lfai/foundation/main/repolinter.json diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml new file mode 100644 index 0000000..2e56bfc --- /dev/null +++ b/.github/workflows/scorecard.yml @@ -0,0 +1,35 @@ +# SPDX-FileCopyrightText: 2026 The Botanu Authors +# SPDX-License-Identifier: 
Apache-2.0 + +name: OpenSSF Scorecard + +on: + push: + branches: [main] + schedule: + - cron: "30 1 * * 1" # Weekly Monday 01:30 UTC + +permissions: read-all + +jobs: + analysis: + runs-on: ubuntu-latest + permissions: + security-events: write # upload SARIF + id-token: write # publish results + steps: + - uses: actions/checkout@v4 + with: + persist-credentials: false + + - name: Run OpenSSF Scorecard + uses: ossf/scorecard-action@v2 + with: + results_file: results.sarif + results_format: sarif + publish_results: true + + - name: Upload SARIF to GitHub Security tab + uses: github/codeql-action/upload-sarif@v3 + with: + sarif_file: results.sarif diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..7aba505 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,46 @@ +# SPDX-FileCopyrightText: 2026 The Botanu Authors +# SPDX-License-Identifier: Apache-2.0 + +repos: + # General file hygiene + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.6.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-yaml + - id: check-toml + - id: check-added-large-files + args: ["--maxkb=500"] + - id: check-merge-conflict + - id: detect-private-key + + # Ruff — linter + formatter (replaces flake8, isort, black) + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.4.8 + hooks: + - id: ruff + args: [--fix, --exit-non-zero-on-fix] + - id: ruff-format + + # Type checking + - repo: https://github.com/pre-commit/mirrors-mypy + rev: v1.10.0 + hooks: + - id: mypy + additional_dependencies: ["opentelemetry-api>=1.20.0"] + args: [--ignore-missing-imports] + pass_filenames: false + entry: mypy src/botanu/ + + # SPDX license header check + - repo: https://github.com/fsfe/reuse-tool + rev: v3.0.2 + hooks: + - id: reuse + + # DCO sign-off check (local — CI uses dcoapp/app) + - repo: https://github.com/christophebedard/dco-check + rev: v1.1.0 + hooks: + - id: dco-check diff --git a/.repolinterrc.yml 
b/.repolinterrc.yml new file mode 100644 index 0000000..d692b3b --- /dev/null +++ b/.repolinterrc.yml @@ -0,0 +1,124 @@ +# SPDX-FileCopyrightText: 2026 The Botanu Authors +# SPDX-License-Identifier: Apache-2.0 +# +# Repolinter configuration for LF AI & Data Foundation compliance. +# See: https://github.com/todogroup/repolinter + +version: 2 +axioms: + linguist: language + licensee: license + packagers: packager + +rules: + # ---- License ---- + license-file-exists: + level: error + rule: + type: file-existence + options: + globsAny: + - LICENSE* + - COPYING* + + # ---- README ---- + readme-file-exists: + level: error + rule: + type: file-existence + options: + globsAny: + - README* + + # ---- CONTRIBUTING ---- + contributing-file-exists: + level: error + rule: + type: file-existence + options: + globsAny: + - CONTRIBUTING* + - .github/CONTRIBUTING* + + # ---- Code of Conduct ---- + code-of-conduct-file-exists: + level: error + rule: + type: file-existence + options: + globsAny: + - CODE_OF_CONDUCT* + - .github/CODE_OF_CONDUCT* + + # ---- SECURITY ---- + security-file-exists: + level: warning + rule: + type: file-existence + options: + globsAny: + - SECURITY* + - .github/SECURITY* + + # ---- NOTICE / attribution ---- + notice-file-exists: + level: warning + rule: + type: file-existence + options: + globsAny: + - NOTICE* + + # ---- DCO ---- + dco-file-exists: + level: warning + rule: + type: file-existence + options: + globsAny: + - DCO* + + # ---- CHANGELOG ---- + changelog-file-exists: + level: warning + rule: + type: file-existence + options: + globsAny: + - CHANGELOG* + - HISTORY* + + # ---- No binaries ---- + binaries-not-present: + level: error + rule: + type: file-type-exclusion + options: + type: + - "**/*.exe" + - "**/*.dll" + - "**/*.so" + - "**/*.dylib" + - "**/*.pyc" + - "**/*.pyo" + + # ---- Source files have SPDX headers ---- + source-license-headers-exist: + level: warning + rule: + type: file-contents + options: + globsAll: + - "src/**/*.py" + 
content: "SPDX-License-Identifier" + fail-on-non-existent: false + + # ---- Tests directory exists ---- + test-directory-exists: + level: warning + rule: + type: file-existence + options: + globsAny: + - tests/* + - test/* diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..9eed0fc --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,75 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [Unreleased] + +## [0.1.0] - 2026-02-05 + +### Added + +- Initial open-source release under Apache-2.0 license +- **Core SDK** + - `enable()` / `disable()` bootstrap functions for SDK initialization + - `@botanu_use_case` decorator with UUIDv7 run_id generation + - `@botanu_outcome` decorator for sub-function outcome tracking + - `emit_outcome()` helper for recording business outcomes + - `set_business_context()` for cost attribution dimensions + - `RunContextEnricher` span processor for automatic run_id propagation + +- **LLM Tracking** (aligned with OTel GenAI semantic conventions) + - `track_llm_call()` context manager for LLM/model operations + - `track_tool_call()` context manager for tool/function calls + - Token usage tracking (input, output, cached) + - Provider normalization for 15+ LLM providers + - Support for all GenAI operations (chat, embeddings, etc.)
+ +- **Data Tracking** + - `track_db_operation()` for database operations + - `track_storage_operation()` for object storage (S3, GCS, Azure Blob) + - `track_messaging_operation()` for message queues (SQS, Kafka, Pub/Sub) + - System normalization for 30+ database/storage systems + +- **Context Propagation** + - W3C Baggage propagation for cross-service run_id correlation + - Lean mode (default) and full mode propagation options + - `RunContext` model with retry tracking and deadline support + +- **Resource Detection** + - Kubernetes (pod, namespace, container) + - AWS (EC2, ECS, Lambda, Fargate) + - GCP (GCE, Cloud Run, Cloud Functions) + - Azure (VM, Container Apps, Functions) + +- **Auto-Instrumentation Support** + - HTTP clients: requests, httpx, urllib3, aiohttp + - Web frameworks: FastAPI, Flask, Django, Starlette + - Databases: SQLAlchemy, psycopg2, asyncpg, pymongo, Redis + - Messaging: Celery, Kafka + - GenAI: OpenAI, Anthropic, Vertex AI, Google GenAI, LangChain + +- **Optional Extras** + - `[sdk]` - OTel SDK + OTLP exporter + - `[instruments]` - Common library instrumentation + - `[genai]` - GenAI provider instrumentation + - `[carriers]` - Cross-service propagation helpers + - `[all]` - Everything included + - `[dev]` - Development and testing tools + +- **Documentation** + - Comprehensive docs in `/docs` following LF format + - Getting started guides + - API reference + - Best practices and anti-patterns + +### Dependencies + +- Core: `opentelemetry-api >= 1.20.0` +- SDK extra: `opentelemetry-sdk`, `opentelemetry-exporter-otlp-proto-http` +- Python: `>= 3.9` + +[Unreleased]: https://github.com/botanu-ai/botanu-sdk-python/compare/v0.1.0...HEAD +[0.1.0]: https://github.com/botanu-ai/botanu-sdk-python/releases/tag/v0.1.0 diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..643856c --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,3 @@ +# Botanu Code of Conduct + +In the interest of fostering an open and welcoming 
environment, we as contributors and maintainers agree to abide by the Code of Conduct available at https://lfprojects.org/policies/code-of-conduct/ \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..6d13cd5 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,87 @@ +# Contributing to Botanu SDK + +Thank you for your interest in contributing to Botanu SDK! This document provides guidelines and instructions for contributing. + +## Developer Certificate of Origin (DCO) + +This project requires all commits to be signed off in accordance with the [Developer Certificate of Origin (DCO)](https://developercertificate.org/). This certifies that you have the right to submit your contribution under the project's open source license. + +To sign off your commits, add the `-s` flag to your git commit command: + +```bash +git commit -s -m "Your commit message" +``` + +This will add a `Signed-off-by` line to your commit message: + +``` +Signed-off-by: Your Name +``` + +If you've already made commits without signing off, you can amend them: + +```bash +# Amend the last commit +git commit --amend -s + +# Rebase and sign off multiple commits +git rebase --signoff HEAD~N # where N is the number of commits +``` + +## Development Setup + +1. Clone the repository: + ```bash + git clone https://github.com/botanu-ai/botanu-sdk-python.git + cd botanu-sdk-python + ``` + +2. Create a virtual environment and install dependencies: + ```bash + python -m venv .venv + source .venv/bin/activate # On Windows: .venv\Scripts\activate + pip install -e ".[dev]" + ``` + +3. Run tests: + ```bash + pytest tests/ + ``` + +4. Run linting and type checks: + ```bash + ruff check src/ tests/ + ruff format src/ tests/ + mypy src/botanu/ + ``` + +## Pull Request Process + +1. Fork the repository and create a feature branch +2. Make your changes with appropriate tests +3. Ensure all tests pass and linting is clean +4. Sign off all commits with DCO +5. 
Submit a pull request with a clear description + +## Code Style + +- Follow [PEP 8](https://pep8.org/) style guidelines +- Use type hints for all function signatures +- Write docstrings for public APIs +- Keep commits focused and atomic + +## Reporting Issues + +Please use GitHub Issues to report bugs or request features. Include: +- A clear description of the issue +- Steps to reproduce (for bugs) +- Expected vs actual behavior +- Python version and OS + +## Code of Conduct + +This project follows the [LF Projects Code of Conduct](https://lfprojects.org/policies/code-of-conduct/). + +## License + +By contributing, you agree that your contributions will be licensed under the Apache License 2.0. diff --git a/DCO b/DCO new file mode 100644 index 0000000..49b8cb0 --- /dev/null +++ b/DCO @@ -0,0 +1,34 @@ +Developer Certificate of Origin +Version 1.1 + +Copyright (C) 2004, 2006 The Linux Foundation and its contributors. + +Everyone is permitted to copy and distribute verbatim copies of this +license document, but changing it is not allowed. + + +Developer's Certificate of Origin 1.1 + +By making a contribution to this project, I certify that: + +(a) The contribution was created in whole or in part by me and I + have the right to submit it under the open source license + indicated in the file; or + +(b) The contribution is based upon previous work that, to the best + of my knowledge, is covered under an appropriate open source + license and I have the right under that license to submit that + work with modifications, whether created in whole or in part + by me, under the same open source license (unless I am + permitted to submit under a different license), as indicated + in the file; or + +(c) The contribution was provided directly to me by some other + person who certified (a), (b) or (c) and I have not modified + it. 
+ +(d) I understand and agree that this project and the contribution + are public and that a record of the contribution (including all + personal information I submit with it, including my sign-off) is + maintained indefinitely and may be redistributed consistent with + this project or the open source license(s) involved. diff --git a/GOVERNANCE.md b/GOVERNANCE.md new file mode 100644 index 0000000..9f7a9f0 --- /dev/null +++ b/GOVERNANCE.md @@ -0,0 +1,39 @@ +# Governance + +This project follows the governance model of the [LF AI & Data Foundation](https://lfaidata.foundation/). + +## Roles + +### Maintainers + +Maintainers are responsible for: +- Reviewing and merging pull requests +- Triaging issues +- Releasing new versions +- Ensuring project quality and direction + +Current maintainers are listed in [MAINTAINERS.md](./MAINTAINERS.md). + +### Contributors + +Anyone can contribute by: +- Opening issues +- Submitting pull requests +- Participating in discussions +- Improving documentation + +See [CONTRIBUTING.md](./CONTRIBUTING.md) for contribution guidelines. + +## Decision Making + +- Technical decisions are made through pull request reviews +- Significant changes require approval from at least one maintainer +- Disputes are resolved by maintainer consensus + +## Code of Conduct + +All participants must follow the [Code of Conduct](./CODE_OF_CONDUCT.md). + +## License + +This project is licensed under Apache-2.0. See [LICENSE](./LICENSE). diff --git a/LICENSE b/LICENSE index 49d106a..454411d 100644 --- a/LICENSE +++ b/LICENSE @@ -1,22 +1,200 @@ -BOTANU SOFTWARE LICENSE AGREEMENT -Copyright (c) 2026 Botanu, Inc. -All rights reserved. + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ -This software and associated documentation files are proprietary -and confidential to Botanu, Inc. 
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION -Permission is hereby granted to install and use the Software solely for internal -business purposes and only in connection with authorized use of Botanu services, -subject to the terms of a separate written agreement between you and Botanu, Inc. + 1. Definitions. -You may not: -- copy, modify, merge, publish, distribute, sublicense, or sell copies of the Software; -- reverse engineer, decompile, or disassemble the Software; -- remove or alter any proprietary notices contained in the Software. + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR -PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE -FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to the Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by the Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding any notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. Please also get the + boilerplate text of the NOTICE file for your work. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/MAINTAINERS.md b/MAINTAINERS.md new file mode 100644 index 0000000..cdd0615 --- /dev/null +++ b/MAINTAINERS.md @@ -0,0 +1,29 @@ +# Maintainers + +This file lists the maintainers of the Botanu SDK Python project. + +## Current Maintainers + +The maintainers are listed in alphabetical order by GitHub handle. + +| Name | GitHub | Role | +|------|--------|------| +| Deborah Jacob | [@deborahjacob-botanu](https://github.com/deborahjacob-botanu) | Lead Maintainer | + +## Becoming a Maintainer + +Maintainers are contributors who have demonstrated: + +- Sustained contributions to the project +- Deep understanding of the codebase +- Commitment to the project's goals and community + +If you're interested in becoming a maintainer, start by making regular contributions and engaging with the community. + +## Maintainer Responsibilities + +- Review and merge pull requests +- Triage issues +- Participate in project planning +- Uphold the Code of Conduct +- Help onboard new contributors diff --git a/NOTICE b/NOTICE new file mode 100644 index 0000000..0ff65a4 --- /dev/null +++ b/NOTICE @@ -0,0 +1,17 @@ +Botanu SDK for Python +Copyright 2026 The Botanu Authors + +This product includes software developed at +Botanu, Inc. (https://botanu.ai/). + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +This product includes software from the following open source projects: + +- OpenTelemetry Python (https://github.com/open-telemetry/opentelemetry-python) + Copyright The OpenTelemetry Authors + Licensed under the Apache License, Version 2.0 diff --git a/README.md b/README.md new file mode 100644 index 0000000..3605c12 --- /dev/null +++ b/README.md @@ -0,0 +1,97 @@ +# Botanu SDK for Python + +[![CI](https://github.com/botanu-ai/botanu-sdk-python/actions/workflows/ci.yml/badge.svg)](https://github.com/botanu-ai/botanu-sdk-python/actions/workflows/ci.yml) +[![PyPI version](https://img.shields.io/pypi/v/botanu)](https://pypi.org/project/botanu/) +[![Python](https://img.shields.io/badge/python-3.9%20|%203.10%20|%203.11%20|%203.12%20|%203.13-blue)](https://www.python.org/) +[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](LICENSE) + +OpenTelemetry-native run-level cost attribution for AI workflows. + +## Overview + +Botanu adds **runs** on top of distributed tracing. A run represents a single business transaction that may span multiple LLM calls, database queries, and services. By correlating all operations to a stable `run_id`, you get accurate cost attribution without sampling artifacts. + +## Quick Start + +```python +from botanu import enable, botanu_use_case + +enable(service_name="my-service") + +@botanu_use_case(name="my_workflow") +def my_function(): + data = db.query(...) + result = llm.complete(...) 
+ return result +``` + +## Installation + +```bash +pip install "botanu[all]" +``` + +| Extra | Description | +|-------|-------------| +| `sdk` | OpenTelemetry SDK + OTLP exporter | +| `instruments` | Auto-instrumentation for HTTP, databases | +| `genai` | Auto-instrumentation for LLM providers | +| `all` | All of the above (recommended) | + +## What Gets Auto-Instrumented + +When you install `botanu[all]`, the following are automatically tracked: + +- **LLM Providers** — OpenAI, Anthropic, Vertex AI, Bedrock, Azure OpenAI +- **Databases** — PostgreSQL, MySQL, SQLite, MongoDB, Redis +- **HTTP** — requests, httpx, urllib3, aiohttp +- **Frameworks** — FastAPI, Flask, Django, Starlette +- **Messaging** — Celery, Kafka + +No manual instrumentation required. + +## Kubernetes Deployment + +For large-scale deployments (2000+ services): + +| Service Type | Code Change | Kubernetes Config | +|--------------|-------------|-------------------| +| Entry point | `@botanu_use_case` decorator | Annotation | +| Intermediate | None | Annotation only | + +```yaml +# Intermediate services - annotation only, no code changes +metadata: + annotations: + instrumentation.opentelemetry.io/inject-python: "true" +``` + +Auto-instrumentation captures all HTTP calls including retries (requests, httpx, aiohttp, urllib3). + +See [Kubernetes Deployment Guide](./docs/integration/kubernetes.md) for details. + +## Documentation + +- [Getting Started](./docs/getting-started/) +- [Concepts](./docs/concepts/) +- [Integration](./docs/integration/) +- [API Reference](./docs/api/) + +## Requirements + +- Python 3.9+ +- OpenTelemetry Collector (recommended for production) + +## Contributing + +See [CONTRIBUTING.md](./CONTRIBUTING.md). This project uses DCO sign-off. + +```bash +git commit -s -m "Your commit message" +``` + +## License + +[Apache-2.0](./LICENSE) + +This project is an [LF AI & Data Foundation](https://lfaidata.foundation/) project. 
diff --git a/RELEASE.md b/RELEASE.md new file mode 100644 index 0000000..d2454ea --- /dev/null +++ b/RELEASE.md @@ -0,0 +1,199 @@ +# Release Process + +This document describes the release process for Botanu SDK. + +## Versioning + +Botanu SDK follows [Semantic Versioning](https://semver.org/): + +- **MAJOR** (1.0.0): Breaking changes to public API +- **MINOR** (0.2.0): New features, backwards compatible +- **PATCH** (0.1.1): Bug fixes, backwards compatible + +Pre-release versions use suffixes: +- `-alpha.N`: Early development, unstable +- `-beta.N`: Feature complete, testing +- `-rc.N`: Release candidate, final testing + +## Prerequisites + +Before releasing, ensure: + +1. All CI checks pass on `main` branch +2. CHANGELOG.md is updated with release notes +3. Documentation is up to date +4. Test coverage meets threshold (70%+) + +## Release Workflow + +### 1. Prepare the Release + +```bash +# Ensure you're on main with latest changes +git checkout main +git pull origin main + +# Update CHANGELOG.md +# - Move items from [Unreleased] to new version section +# - Add release date +# - Update comparison links at bottom + +# Commit changelog +git add CHANGELOG.md +git commit -s -m "docs: prepare release v0.1.0" +git push origin main +``` + +### 2. Create a Release Tag + +```bash +# For production release +git tag -a v0.1.0 -m "Release v0.1.0" + +# For pre-release +git tag -a v0.1.0-alpha.1 -m "Release v0.1.0-alpha.1" + +# Push tag +git push origin v0.1.0 +``` + +### 3. Automated Publishing + +When a tag is pushed: + +- **Pre-release tags** (`v*-alpha*`, `v*-beta*`, `v*-rc*`) → TestPyPI +- **Release tags** (`v*` without suffix) → PyPI + GitHub Release + +The workflow uses [Trusted Publishing (OIDC)](https://docs.pypi.org/trusted-publishers/) — no API tokens needed. + +### 4. Manual Publishing (if needed) + +You can manually trigger publishing from the Actions tab: + +1. Go to Actions → "Release to PyPI" +2. Click "Run workflow" +3. Select target: `testpypi` or `pypi` +4. 
Click "Run workflow" + +## TestPyPI Verification + +After publishing to TestPyPI, verify installation: + +```bash +# Create a test environment +python -m venv test-env +source test-env/bin/activate # or test-env\Scripts\activate on Windows + +# Install from TestPyPI +pip install --index-url https://test.pypi.org/simple/ \ + --extra-index-url https://pypi.org/simple/ \ + botanu + +# Verify import +python -c "import botanu; print(botanu.__version__)" + +# Run quick test +python -c " +from botanu import enable, botanu_use_case +enable(service_name='test') +print('Botanu SDK loaded successfully!') +" +``` + +## PyPI Trusted Publishing Setup + +### Initial Setup (One-time) + +1. **Create PyPI project** (if not exists): + - Go to https://pypi.org/manage/projects/ + - Create new project named `botanu` + +2. **Configure Trusted Publisher on PyPI**: + - Go to https://pypi.org/manage/project/botanu/settings/publishing/ + - Add new publisher: + - Owner: `botanu-ai` + - Repository: `botanu-sdk-python` + - Workflow: `release.yml` + - Environment: `pypi` + +3. **Configure Trusted Publisher on TestPyPI**: + - Go to https://test.pypi.org/manage/project/botanu/settings/publishing/ + - Add new publisher with same settings, environment: `testpypi` + +4. **Create GitHub Environments**: + - Go to repo Settings → Environments + - Create `pypi` environment (for production) + - Create `testpypi` environment (for testing) + - Optionally add protection rules (required reviewers, etc.) 
+ +## Local Build Verification + +Before releasing, verify the build locally: + +```bash +# Install build tools +pip install build twine + +# Build the package +python -m build + +# Check the package +twine check dist/* + +# List contents +tar -tvf dist/botanu-*.tar.gz +unzip -l dist/botanu-*.whl + +# Test installation from local wheel +pip install dist/botanu-*.whl +python -c "import botanu; print(botanu.__version__)" +``` + +## Version Determination + +The version is determined by `hatch-vcs` from git tags: + +- Tagged commit: `0.1.0` +- Commits after tag: `0.1.1.dev3+g1234567` +- No tags: `0.0.0.dev0` + +To see what version will be used: + +```bash +pip install setuptools-scm # the version backend used by hatch-vcs +python -c "from setuptools_scm import get_version; print(get_version())" +``` + +## Rollback Procedure + +If a release has issues: + +1. **Yank from PyPI** (prevents new installs unless pinned to the exact version; does not delete the files): + ```bash + # Via web UI: PyPI project → Release history → Yank + # Or via API (requires token) + ``` + +2. **Delete GitHub Release** (if needed): + ```bash + gh release delete v0.1.0 --yes + git push origin --delete v0.1.0 + ``` + +3. **Fix and re-release** with a new patch version (e.g., `v0.1.1`) + +## Release Checklist + +- [ ] All CI checks pass +- [ ] CHANGELOG.md updated +- [ ] Documentation updated +- [ ] Version tag follows semver +- [ ] Tag pushed to origin +- [ ] TestPyPI verification passed (for major releases) +- [ ] PyPI package visible +- [ ] GitHub Release created +- [ ] Announcement posted (if applicable) + +## Maintainers + +See [MAINTAINERS.md](./MAINTAINERS.md) for the list of release maintainers. diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000..4f89ae0 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,36 @@ +# Security Policy + +## Supported Versions + +| Version | Supported | +| ------- | ------------------ | +| 0.1.x | :white_check_mark: | + +Please upgrade to the latest stable version of Botanu, which includes fixes for all known security issues. 
+ +## Reporting a Vulnerability + +The Botanu team takes security vulnerabilities seriously. We appreciate your efforts to responsibly disclose your findings. + +### How to Report + +To report a security vulnerability, please use one of the following methods: + +1. **GitHub Security Advisories** (Preferred): Use the [Security tab](https://github.com/botanu-ai/botanu-sdk-python/security/advisories/new) to privately report a vulnerability. + +2. **Email**: Contact the [maintainer team](https://github.com/botanu-ai/botanu-sdk-python/blob/main/MAINTAINERS.md) + +Please do **not** post security vulnerabilities to the public issue tracker. + +### What to Include + +- Type of vulnerability +- Full paths of affected source files +- Step-by-step instructions to reproduce the issue +- Impact of the issue and potential attack scenarios + +### Response Timeline + +- **Initial Response**: Within 48 hours +- **Status Update**: Within 7 days +- **Resolution Target**: Within 90 days (depending on complexity) diff --git a/docs/api/configuration.md b/docs/api/configuration.md new file mode 100644 index 0000000..cf417ac --- /dev/null +++ b/docs/api/configuration.md @@ -0,0 +1,417 @@ +# Configuration API Reference + +## BotanuConfig + +Dataclass for SDK configuration. 
+ +```python +from botanu.sdk.config import BotanuConfig +``` + +### Fields + +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `service_name` | `str` | `"unknown_service"` | Service name (from `OTEL_SERVICE_NAME`) | +| `service_version` | `str` | `None` | Service version (from `OTEL_SERVICE_VERSION`) | +| `service_namespace` | `str` | `None` | Service namespace (from `OTEL_SERVICE_NAMESPACE`) | +| `deployment_environment` | `str` | `"production"` | Environment (from `OTEL_DEPLOYMENT_ENVIRONMENT` or `BOTANU_ENVIRONMENT`) | +| `auto_detect_resources` | `bool` | `True` | Auto-detect cloud resources | +| `otlp_endpoint` | `str` | `"http://localhost:4318/v1/traces"` | OTLP endpoint | +| `otlp_headers` | `dict` | `None` | Custom headers for OTLP exporter | +| `max_export_batch_size` | `int` | `512` | Max spans per batch | +| `max_queue_size` | `int` | `2048` | Max spans in queue | +| `schedule_delay_millis` | `int` | `5000` | Delay between batch exports | +| `propagation_mode` | `str` | `"lean"` | `"lean"` or `"full"` | +| `auto_instrument_packages` | `list` | `[...]` | Packages to auto-instrument | + +### Constructor + +```python +config = BotanuConfig( + service_name="my-service", + deployment_environment="production", + otlp_endpoint="http://collector:4318/v1/traces", +) +``` + +### Class Methods + +#### from_yaml() + +Load configuration from a YAML file. + +```python +@classmethod +def from_yaml(cls, path: Optional[str] = None) -> BotanuConfig +``` + +**Parameters:** +- `path`: Path to YAML config file + +**Raises:** +- `FileNotFoundError`: If config file doesn't exist +- `ValueError`: If YAML is malformed +- `ImportError`: If PyYAML is not installed + +**Example:** + +```python +config = BotanuConfig.from_yaml("config/botanu.yaml") +``` + +#### from_file_or_env() + +Load config from file if exists, otherwise use environment variables. 
+ +```python +@classmethod +def from_file_or_env(cls, path: Optional[str] = None) -> BotanuConfig +``` + +**Search order:** +1. Explicit `path` argument +2. `BOTANU_CONFIG_FILE` environment variable +3. `./botanu.yaml` +4. `./botanu.yml` +5. `./config/botanu.yaml` +6. `./config/botanu.yml` +7. Falls back to environment-only config + +**Example:** + +```python +# Auto-discovers config file +config = BotanuConfig.from_file_or_env() + +# Explicit path +config = BotanuConfig.from_file_or_env("my-config.yaml") +``` + +### Instance Methods + +#### to_dict() + +Export configuration as dictionary. + +```python +def to_dict(self) -> Dict[str, Any] +``` + +**Example:** + +```python +config = BotanuConfig(service_name="my-service") +print(config.to_dict()) +# { +# "service": {"name": "my-service", ...}, +# "otlp": {"endpoint": "...", ...}, +# ... +# } +``` + +--- + +## YAML Configuration Format + +### Full Schema + +```yaml +service: + name: string # Service name + version: string # Service version + namespace: string # Service namespace + environment: string # Deployment environment + +resource: + auto_detect: boolean # Auto-detect cloud resources + +otlp: + endpoint: string # OTLP endpoint URL + headers: # Custom headers + header-name: value + +export: + batch_size: integer # Max spans per batch + queue_size: integer # Max spans in queue + delay_ms: integer # Delay between exports + +propagation: + mode: string # "lean" or "full" + +auto_instrument_packages: # List of packages to instrument + - package_name +``` + +### Environment Variable Interpolation + +```yaml +service: + name: ${OTEL_SERVICE_NAME:-default-service} + environment: ${ENVIRONMENT} + +otlp: + endpoint: ${COLLECTOR_URL:-http://localhost:4318}/v1/traces + headers: + Authorization: Bearer ${API_TOKEN} +``` + +Syntax: +- `${VAR_NAME}` - Required variable +- `${VAR_NAME:-default}` - Variable with default value + +--- + +## enable() + +Bootstrap function to initialize the SDK. 
+ +```python +from botanu import enable + +enable( + service_name: Optional[str] = None, + otlp_endpoint: Optional[str] = None, + config: Optional[BotanuConfig] = None, + auto_instrument: bool = True, + auto_instrument_packages: Optional[List[str]] = None, + propagation_mode: Optional[str] = None, + **kwargs: Any, +) -> None +``` + +### Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `service_name` | `str` | From env | Service name | +| `otlp_endpoint` | `str` | From env | OTLP endpoint URL | +| `config` | `BotanuConfig` | `None` | Pre-built configuration | +| `auto_instrument` | `bool` | `True` | Enable auto-instrumentation | +| `auto_instrument_packages` | `list` | `None` | Override default packages | +| `propagation_mode` | `str` | `None` | `"lean"` or `"full"` | +| `**kwargs` | `Any` | `{}` | Additional config fields | + +### Behavior + +1. Creates/merges `BotanuConfig` +2. Configures `TracerProvider` with `RunContextEnricher` +3. Sets up OTLP exporter (if SDK extras installed) +4. Enables auto-instrumentation (if requested) +5. Configures W3C Baggage propagation + +### Examples + +#### Minimal + +```python +from botanu import enable + +enable(service_name="my-service") +``` + +#### With Config Object + +```python +from botanu import enable +from botanu.sdk.config import BotanuConfig + +config = BotanuConfig.from_yaml("config/botanu.yaml") +enable(config=config) +``` + +#### Custom Options + +```python +enable( + service_name="my-service", + otlp_endpoint="http://collector:4318/v1/traces", + auto_instrument_packages=["fastapi", "openai_v2"], + propagation_mode="full", +) +``` + +--- + +## disable() + +Disable the SDK and clean up resources. + +```python +from botanu import disable + +disable() -> None +``` + +### Behavior + +1. Flushes pending spans +2. Shuts down span processors +3. Disables instrumentation + +--- + +## is_enabled() + +Check if the SDK is currently enabled. 
+ +```python +from botanu import is_enabled + +is_enabled() -> bool +``` + +### Example + +```python +if not is_enabled(): + enable(service_name="my-service") +``` + +--- + +## Environment Variables + +### OpenTelemetry Standard + +| Variable | Description | Default | +|----------|-------------|---------| +| `OTEL_SERVICE_NAME` | Service name | `"unknown_service"` | +| `OTEL_SERVICE_VERSION` | Service version | None | +| `OTEL_SERVICE_NAMESPACE` | Service namespace | None | +| `OTEL_DEPLOYMENT_ENVIRONMENT` | Deployment environment | `"production"` | +| `OTEL_EXPORTER_OTLP_ENDPOINT` | OTLP base endpoint | `"http://localhost:4318"` | +| `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` | OTLP traces endpoint (full URL) | None | +| `OTEL_EXPORTER_OTLP_HEADERS` | OTLP headers (key=value pairs) | None | + +### Botanu-Specific + +| Variable | Description | Default | +|----------|-------------|---------| +| `BOTANU_ENVIRONMENT` | Fallback for environment | `"production"` | +| `BOTANU_PROPAGATION_MODE` | `"lean"` or `"full"` | `"lean"` | +| `BOTANU_AUTO_DETECT_RESOURCES` | Auto-detect cloud resources | `"true"` | +| `BOTANU_CONFIG_FILE` | Path to YAML config file | None | + +--- + +## RunContext + +Model for run metadata. + +```python +from botanu.models.run_context import RunContext +``` + +### Class Methods + +#### create() + +Create a new run context. + +```python +@classmethod +def create( + cls, + use_case: str, + workflow: Optional[str] = None, + workflow_version: Optional[str] = None, + environment: Optional[str] = None, + tenant_id: Optional[str] = None, + parent_run_id: Optional[str] = None, + deadline_seconds: Optional[float] = None, +) -> RunContext +``` + +#### create_retry() + +Create a retry context from an original run. + +```python +@classmethod +def create_retry(cls, original: RunContext) -> RunContext +``` + +#### from_baggage() + +Reconstruct context from baggage dictionary. 
+ +```python +@classmethod +def from_baggage(cls, baggage: Dict[str, str]) -> Optional[RunContext] +``` + +### Instance Methods + +#### to_baggage_dict() + +Serialize to baggage format. + +```python +def to_baggage_dict(self, lean_mode: bool = True) -> Dict[str, str] +``` + +#### to_span_attributes() + +Serialize to span attributes. + +```python +def to_span_attributes(self) -> Dict[str, Any] +``` + +#### as_current() + +Context manager to set this as the current run. + +```python +def as_current(self) -> ContextManager +``` + +#### complete() + +Mark the run as complete. + +```python +def complete( + self, + status: RunStatus, + error_class: Optional[str] = None, +) -> None +``` + +### Fields + +| Field | Type | Description | +|-------|------|-------------| +| `run_id` | `str` | Unique UUIDv7 identifier | +| `root_run_id` | `str` | Root run ID (same as run_id for first attempt) | +| `use_case` | `str` | Business use case name | +| `workflow` | `str` | Workflow/function name | +| `workflow_version` | `str` | Version hash | +| `environment` | `str` | Deployment environment | +| `tenant_id` | `str` | Tenant identifier | +| `parent_run_id` | `str` | Parent run ID | +| `attempt` | `int` | Attempt number | +| `start_time` | `datetime` | Run start time | +| `outcome` | `RunOutcome` | Recorded outcome | + +--- + +## RunStatus + +Enum for run status. 
+ +```python +from botanu.models.run_context import RunStatus + +class RunStatus(Enum): + SUCCESS = "success" + FAILURE = "failure" + PARTIAL = "partial" +``` + +## See Also + +- [Configuration Guide](../getting-started/configuration.md) - Configuration how-to +- [Architecture](../concepts/architecture.md) - SDK design +- [Existing OTel Setup](../integration/existing-otel.md) - Integration patterns diff --git a/docs/api/decorators.md b/docs/api/decorators.md new file mode 100644 index 0000000..36eb768 --- /dev/null +++ b/docs/api/decorators.md @@ -0,0 +1,99 @@ +# Decorators API Reference + +## @botanu_use_case + +The primary decorator for creating runs with automatic context propagation. + +```python +from botanu import botanu_use_case + +@botanu_use_case( + name: str, + workflow: Optional[str] = None, + environment: Optional[str] = None, + tenant_id: Optional[str] = None, +) +``` + +### Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `name` | `str` | Required | Use case name for grouping | +| `workflow` | `str` | Function name | Workflow identifier | +| `environment` | `str` | From env | Deployment environment | +| `tenant_id` | `str` | `None` | Tenant identifier for multi-tenant systems | + +### Example + +```python +from botanu import botanu_use_case + +@botanu_use_case(name="my_workflow") +def my_function(): + data = db.query(...) + result = llm.complete(...) + return result +``` + +### Span Attributes + +| Attribute | Description | +|-----------|-------------| +| `botanu.run_id` | Generated UUIDv7 | +| `botanu.use_case` | `name` parameter | +| `botanu.workflow` | `workflow` parameter or function name | +| `botanu.environment` | Deployment environment | +| `botanu.tenant_id` | Tenant identifier (if provided) | + +### Alias + +`use_case` is an alias for `botanu_use_case`: + +```python +from botanu import use_case + +@use_case(name="my_workflow") +def my_function(): + return db.query(...) 
+``` + +## @botanu_outcome + +Decorator for sub-functions to emit outcomes based on success/failure. + +```python +from botanu import botanu_outcome + +@botanu_outcome() +def extract_data(): + return fetch_from_source() +``` + +- Emits "success" on completion +- Emits "failed" with exception class name if exception raised +- Does NOT create a new run + +### Example + +```python +from botanu import botanu_use_case, botanu_outcome + +@botanu_use_case(name="my_workflow") +def my_function(): + step_one() + step_two() + +@botanu_outcome() +def step_one(): + return do_work() + +@botanu_outcome() +def step_two(): + return do_more_work() +``` + +## See Also + +- [Quickstart](../getting-started/quickstart.md) +- [Run Context](../concepts/run-context.md) diff --git a/docs/api/tracking.md b/docs/api/tracking.md new file mode 100644 index 0000000..dcd35f7 --- /dev/null +++ b/docs/api/tracking.md @@ -0,0 +1,511 @@ +# Tracking API Reference + +## LLM Tracking + +### track_llm_call() + +Context manager for tracking LLM/model calls. + +```python +from botanu.tracking.llm import track_llm_call + +with track_llm_call( + provider: str, + model: str, + operation: str = ModelOperation.CHAT, + client_request_id: Optional[str] = None, + **kwargs: Any, +) -> Generator[LLMTracker, None, None]: +``` + +#### Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `provider` | `str` | Required | LLM provider (openai, anthropic, etc.) | +| `model` | `str` | Required | Model name/ID (gpt-4, claude-3-opus, etc.) | +| `operation` | `str` | `"chat"` | Operation type (see ModelOperation) | +| `client_request_id` | `str` | `None` | Your tracking ID | +| `**kwargs` | `Any` | `{}` | Additional span attributes | + +#### Returns + +Yields an `LLMTracker` instance. + +#### Example + +```python +with track_llm_call(provider="openai", model="gpt-4") as tracker: + response = await client.chat.completions.create(...) 
+ tracker.set_tokens( + input_tokens=response.usage.prompt_tokens, + output_tokens=response.usage.completion_tokens, + ) + tracker.set_request_id(response.id) +``` + +--- + +### LLMTracker + +Tracker object for recording LLM call details. + +#### Methods + +##### set_tokens() + +```python +def set_tokens( + input_tokens: int = 0, + output_tokens: int = 0, + cached_tokens: int = 0, + cache_read_tokens: int = 0, + cache_write_tokens: int = 0, +) -> LLMTracker +``` + +Records token usage. + +##### set_request_id() + +```python +def set_request_id( + provider_request_id: Optional[str] = None, + client_request_id: Optional[str] = None, +) -> LLMTracker +``` + +Records request IDs for billing reconciliation. + +##### set_response_model() + +```python +def set_response_model(model: str) -> LLMTracker +``` + +Records the actual model used in response. + +##### set_finish_reason() + +```python +def set_finish_reason(reason: str) -> LLMTracker +``` + +Records the stop reason (stop, length, content_filter, etc.). + +##### set_streaming() + +```python +def set_streaming(is_streaming: bool = True) -> LLMTracker +``` + +Marks request as streaming. + +##### set_cache_hit() + +```python +def set_cache_hit(cache_hit: bool = True) -> LLMTracker +``` + +Marks as a cache hit. + +##### set_attempt() + +```python +def set_attempt(attempt_number: int) -> LLMTracker +``` + +Sets retry attempt number. + +##### set_request_params() + +```python +def set_request_params( + temperature: Optional[float] = None, + top_p: Optional[float] = None, + max_tokens: Optional[int] = None, + stop_sequences: Optional[List[str]] = None, + frequency_penalty: Optional[float] = None, + presence_penalty: Optional[float] = None, +) -> LLMTracker +``` + +Records request parameters. + +##### set_error() + +```python +def set_error(error: Exception) -> LLMTracker +``` + +Records an error. + +##### add_metadata() + +```python +def add_metadata(**kwargs: Any) -> LLMTracker +``` + +Adds custom span attributes. 
+ +--- + +### track_tool_call() + +Context manager for tracking tool/function calls. + +```python +from botanu.tracking.llm import track_tool_call + +with track_tool_call( + tool_name: str, + tool_call_id: Optional[str] = None, + provider: Optional[str] = None, + **kwargs: Any, +) -> Generator[ToolTracker, None, None]: +``` + +#### Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `tool_name` | `str` | Required | Name of the tool/function | +| `tool_call_id` | `str` | `None` | Tool call ID from LLM response | +| `provider` | `str` | `None` | Tool provider if external | + +--- + +### ModelOperation + +Constants for operation types. + +| Constant | Value | +|----------|-------| +| `CHAT` | `"chat"` | +| `TEXT_COMPLETION` | `"text_completion"` | +| `EMBEDDINGS` | `"embeddings"` | +| `GENERATE_CONTENT` | `"generate_content"` | +| `EXECUTE_TOOL` | `"execute_tool"` | +| `CREATE_AGENT` | `"create_agent"` | +| `INVOKE_AGENT` | `"invoke_agent"` | +| `RERANK` | `"rerank"` | +| `IMAGE_GENERATION` | `"image_generation"` | +| `SPEECH_TO_TEXT` | `"speech_to_text"` | +| `TEXT_TO_SPEECH` | `"text_to_speech"` | + +--- + +## Data Tracking + +### track_db_operation() + +Context manager for tracking database operations. + +```python +from botanu.tracking.data import track_db_operation + +with track_db_operation( + system: str, + operation: str, + database: Optional[str] = None, + **kwargs: Any, +) -> Generator[DBTracker, None, None]: +``` + +#### Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `system` | `str` | Required | Database system (postgresql, mongodb, etc.) | +| `operation` | `str` | Required | Operation type (SELECT, INSERT, etc.) 
| +| `database` | `str` | `None` | Database name | + +#### Example + +```python +with track_db_operation(system="postgresql", operation="SELECT") as db: + result = await cursor.execute(query) + db.set_result(rows_returned=len(result)) +``` + +--- + +### DBTracker + +#### Methods + +##### set_result() + +```python +def set_result( + rows_returned: int = 0, + rows_affected: int = 0, + bytes_read: int = 0, + bytes_written: int = 0, +) -> DBTracker +``` + +##### set_table() + +```python +def set_table(table_name: str, schema: Optional[str] = None) -> DBTracker +``` + +##### set_query_id() + +```python +def set_query_id(query_id: str) -> DBTracker +``` + +##### set_bytes_scanned() + +```python +def set_bytes_scanned(bytes_scanned: int) -> DBTracker +``` + +##### set_error() + +```python +def set_error(error: Exception) -> DBTracker +``` + +##### add_metadata() + +```python +def add_metadata(**kwargs: Any) -> DBTracker +``` + +--- + +### track_storage_operation() + +Context manager for tracking object storage operations. + +```python +from botanu.tracking.data import track_storage_operation + +with track_storage_operation( + system: str, + operation: str, + **kwargs: Any, +) -> Generator[StorageTracker, None, None]: +``` + +#### Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `system` | `str` | Required | Storage system (s3, gcs, azure_blob, etc.) | +| `operation` | `str` | Required | Operation type (GET, PUT, DELETE, etc.) 
| + +--- + +### StorageTracker + +#### Methods + +##### set_result() + +```python +def set_result( + objects_count: int = 0, + bytes_read: int = 0, + bytes_written: int = 0, +) -> StorageTracker +``` + +##### set_bucket() + +```python +def set_bucket(bucket: str) -> StorageTracker +``` + +##### set_error() + +```python +def set_error(error: Exception) -> StorageTracker +``` + +##### add_metadata() + +```python +def add_metadata(**kwargs: Any) -> StorageTracker +``` + +--- + +### track_messaging_operation() + +Context manager for tracking messaging operations. + +```python +from botanu.tracking.data import track_messaging_operation + +with track_messaging_operation( + system: str, + operation: str, + destination: str, + **kwargs: Any, +) -> Generator[MessagingTracker, None, None]: +``` + +#### Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `system` | `str` | Required | Messaging system (sqs, kafka, pubsub, etc.) | +| `operation` | `str` | Required | Operation type (publish, consume, etc.) | +| `destination` | `str` | Required | Queue/topic name | + +--- + +### MessagingTracker + +#### Methods + +##### set_result() + +```python +def set_result( + message_count: int = 0, + bytes_transferred: int = 0, +) -> MessagingTracker +``` + +##### set_error() + +```python +def set_error(error: Exception) -> MessagingTracker +``` + +##### add_metadata() + +```python +def add_metadata(**kwargs: Any) -> MessagingTracker +``` + +--- + +## Span Helpers + +### emit_outcome() + +Emit a business outcome for the current span. 
+ +```python +from botanu import emit_outcome + +emit_outcome( + status: str, + *, + value_type: Optional[str] = None, + value_amount: Optional[float] = None, + confidence: Optional[float] = None, + reason: Optional[str] = None, +) -> None +``` + +#### Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `status` | `str` | Required | Outcome status ("success", "partial", "failed") | +| `value_type` | `str` | `None` | Type of business value achieved | +| `value_amount` | `float` | `None` | Quantified value amount | +| `confidence` | `float` | `None` | Confidence score (0.0-1.0) | +| `reason` | `str` | `None` | Reason for the outcome | + +#### Example + +```python +emit_outcome("success", value_type="tickets_resolved", value_amount=1) +emit_outcome("failed", reason="rate_limit_exceeded") +``` + +--- + +### set_business_context() + +Set business context attributes on the current span. + +```python +from botanu import set_business_context + +set_business_context( + *, + customer_id: Optional[str] = None, + team: Optional[str] = None, + cost_center: Optional[str] = None, + region: Optional[str] = None, +) -> None +``` + +#### Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `customer_id` | `str` | `None` | Customer identifier | +| `team` | `str` | `None` | Team or department | +| `cost_center` | `str` | `None` | Cost center for financial tracking | +| `region` | `str` | `None` | Geographic region | + +--- + +## Context Helpers + +### get_run_id() + +Get the current run ID from baggage. + +```python +from botanu import get_run_id + +run_id = get_run_id() +``` + +### get_use_case() + +Get the current use case from baggage. + +```python +from botanu import get_use_case + +use_case = get_use_case() +``` + +### get_baggage() + +Get a baggage value by key. 
+ +```python +from botanu import get_baggage + +value = get_baggage("botanu.tenant_id") +``` + +### set_baggage() + +Set a baggage value. + +```python +from botanu import set_baggage + +set_baggage("botanu.custom_field", "my_value") +``` + +### get_current_span() + +Get the current active span. + +```python +from botanu import get_current_span + +span = get_current_span() +span.set_attribute("custom.attribute", "value") +``` + +## See Also + +- [LLM Tracking](../tracking/llm-tracking.md) - Detailed LLM tracking guide +- [Data Tracking](../tracking/data-tracking.md) - Data operation tracking +- [Outcomes](../tracking/outcomes.md) - Outcome recording diff --git a/docs/concepts/architecture.md b/docs/concepts/architecture.md new file mode 100644 index 0000000..2d87ccb --- /dev/null +++ b/docs/concepts/architecture.md @@ -0,0 +1,265 @@ +# Architecture + +Botanu SDK follows a "thin SDK, smart collector" architecture. The SDK does minimal work in your application's hot path, delegating heavy processing to the OpenTelemetry Collector. + +## Design Principles + +### 1. Minimal Hot-Path Overhead + +The SDK only performs lightweight operations during request processing: +- Generate UUIDv7 `run_id` +- Read/write W3C Baggage +- Record token counts as span attributes + +**Target overhead**: < 0.5ms per request + +### 2. OTel-Native + +Built on OpenTelemetry primitives, not alongside them: +- Uses standard `TracerProvider` +- Standard `SpanProcessor` for enrichment +- Standard OTLP export +- W3C Baggage for propagation + +### 3. 
Collector-Side Processing + +Heavy operations happen in the OTel Collector: +- PII redaction +- Cost calculation from token counts +- Vendor normalization +- Cardinality management +- Aggregation and sampling + +## Component Overview + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ Your Application │ +├─────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ │ +│ │ @botanu_use_ │ │ track_llm_ │ │ track_db_ │ │ +│ │ case() │ │ call() │ │ operation() │ │ +│ └────────┬────────┘ └────────┬────────┘ └────────┬────────┘ │ +│ │ │ │ │ +│ └──────────────────────┼──────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌───────────────────────────────────────────────────────────────────────┐ │ +│ │ Botanu SDK Core │ │ +│ ├───────────────────────────────────────────────────────────────────────┤ │ +│ │ RunContext │ RunContextEnricher │ BotanuConfig │ │ +│ │ - generate_run_id() │ - on_start() │ - service_name │ │ +│ │ - to_baggage_dict() │ - reads baggage │ - otlp_endpoint │ │ +│ │ - to_span_attrs() │ - writes to spans │ - propagation_mode │ │ +│ └───────────────────────────────────────────────────────────────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌───────────────────────────────────────────────────────────────────────┐ │ +│ │ OpenTelemetry SDK │ │ +│ │ TracerProvider → BatchSpanProcessor → OTLPSpanExporter │ │ +│ └───────────────────────────────────────────────────────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────────────┘ + │ + │ OTLP (HTTP or gRPC) + ▼ +┌─────────────────────────────────────────────────────────────────────────────┐ +│ OpenTelemetry Collector │ +├─────────────────────────────────────────────────────────────────────────────┤ +│ receivers: │ +│ otlp: │ +│ │ +│ processors: │ +│ transform: # Normalize vendor names │ +│ redaction: # Remove PII from gen_ai.content.* │ +│ attributes: # Cardinality limits │ +│ 
botanu/cost: # Calculate $ from tokens │ +│ │ +│ exporters: │ +│ clickhouse: # Or your preferred backend │ +└─────────────────────────────────────────────────────────────────────────────┘ +``` + +## SDK Components + +### BotanuConfig + +Central configuration for the SDK: + +```python +@dataclass +class BotanuConfig: + service_name: str + deployment_environment: str + otlp_endpoint: str + propagation_mode: str # "lean" or "full" + auto_instrument_packages: List[str] +``` + +### RunContext + +Holds run metadata and provides serialization: + +```python +@dataclass +class RunContext: + run_id: str + root_run_id: str + use_case: str + workflow: Optional[str] + attempt: int + # ... +``` + +### RunContextEnricher + +The only span processor in the SDK. Reads baggage, writes to spans: + +```python +class RunContextEnricher(SpanProcessor): + def on_start(self, span, parent_context): + for key in self._baggage_keys: + value = baggage.get_baggage(key, parent_context) + if value: + span.set_attribute(key, value) +``` + +### Tracking Helpers + +Context managers for manual instrumentation: + +- `track_llm_call()` - LLM/model operations +- `track_db_operation()` - Database operations +- `track_storage_operation()` - Object storage operations +- `track_messaging_operation()` - Message queue operations + +## Data Flow + +### 1. Run Initiation + +```python +@botanu_use_case("Customer Support") +def handle_ticket(): + pass +``` + +1. Generate UUIDv7 `run_id` +2. Create `RunContext` +3. Set baggage in current context +4. Start root span with run attributes + +### 2. Context Propagation + +```python +# Within the run +response = requests.get("https://api.example.com") +``` + +1. HTTP instrumentation reads current context +2. Baggage is injected into request headers +3. Downstream service extracts baggage +4. Context continues propagating + +### 3. Span Enrichment + +Every span (including auto-instrumented): + +1. `RunContextEnricher.on_start()` is called +2. 
Reads `botanu.run_id` from baggage +3. Writes to span attributes +4. Span is exported with run context + +### 4. Export and Processing + +1. `BatchSpanProcessor` batches spans +2. `OTLPSpanExporter` sends to collector +3. Collector processes (cost calc, PII redaction) +4. Spans written to backend + +## Why This Architecture? + +### SDK Stays Thin + +| Operation | Location | Reason | +|-----------|----------|--------| +| run_id generation | SDK | Must be synchronous | +| Baggage propagation | SDK | Process-local | +| Token counting | SDK | Available at call site | +| Cost calculation | Collector | Pricing tables change | +| PII redaction | Collector | Consistent policy | +| Aggregation | Collector | Reduces data volume | + +### No Vendor Lock-in + +- Standard OTel export format +- Any OTel-compatible backend works +- Collector processors are configurable + +### Minimal Dependencies + +Core SDK only requires `opentelemetry-api`: + +```toml +dependencies = [ + "opentelemetry-api >= 1.20.0", +] +``` + +Full SDK adds export capabilities: + +```toml +[project.optional-dependencies] +sdk = [ + "opentelemetry-sdk >= 1.20.0", + "opentelemetry-exporter-otlp-proto-http >= 1.20.0", +] +``` + +## Integration Points + +### Existing TracerProvider + +If you already have OTel configured: + +```python +from opentelemetry import trace +from botanu.processors.enricher import RunContextEnricher + +# Add our processor to your existing provider +provider = trace.get_tracer_provider() +provider.add_span_processor(RunContextEnricher()) +``` + +### Existing Instrumentation + +Botanu works alongside existing instrumentation: + +```python +# Your existing setup +from opentelemetry.instrumentation.requests import RequestsInstrumentor +RequestsInstrumentor().instrument() + +# Add Botanu +from botanu import enable +enable(service_name="my-service") + +# Both work together - requests are instrumented AND get run_id +``` + +## Performance Characteristics + +| Operation | Typical Latency 
| +|-----------|-----------------| +| `generate_run_id()` | < 0.01ms | +| `RunContextEnricher.on_start()` | < 0.05ms | +| `track_llm_call()` overhead | < 0.1ms | +| Baggage injection | < 0.01ms | + +Total SDK overhead per request: **< 0.5ms** + +## See Also + +- [Run Context](run-context.md) - RunContext model details +- [Context Propagation](context-propagation.md) - How context flows +- [Collector Configuration](../integration/collector.md) - Collector setup diff --git a/docs/concepts/context-propagation.md b/docs/concepts/context-propagation.md new file mode 100644 index 0000000..80bf319 --- /dev/null +++ b/docs/concepts/context-propagation.md @@ -0,0 +1,239 @@ +# Context Propagation + +Context propagation ensures that the `run_id` and other metadata flow through your entire application — across function calls, HTTP requests, message queues, and async workers. + +## How It Works + +Botanu uses **W3C Baggage** for context propagation, the same standard used by OpenTelemetry for distributed tracing. + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ HTTP Request Headers │ +├─────────────────────────────────────────────────────────────────┤ +│ traceparent: 00-{trace_id}-{span_id}-01 │ +│ baggage: botanu.run_id=019abc12...,botanu.use_case=Support │ +└─────────────────────────────────────────────────────────────────┘ +``` + +When you make an outbound HTTP request, the `botanu.run_id` travels in the `baggage` header alongside the trace context. 
+ +## Propagation Modes + +### Lean Mode (Default) + +Only propagates essential fields to minimize header size: +- `botanu.run_id` +- `botanu.use_case` + +```python +# Lean mode baggage (~100 bytes) +baggage: botanu.run_id=019abc12-def3-7890-abcd-1234567890ab,botanu.use_case=Customer%20Support +``` + +### Full Mode + +Propagates all context fields: +- `botanu.run_id` +- `botanu.use_case` +- `botanu.workflow` +- `botanu.environment` +- `botanu.tenant_id` +- `botanu.parent_run_id` + +```python +# Enable full mode +import os +os.environ["BOTANU_PROPAGATION_MODE"] = "full" +``` + +## In-Process Propagation + +Within a single process, context is propagated via Python's `contextvars`: + +```python +from botanu import botanu_use_case + +@botanu_use_case("Customer Support") +def handle_ticket(ticket_id: str): + # Context is set here + + fetch_context(ticket_id) # Inherits context + call_llm() # Inherits context + save_result() # Inherits context +``` + +The `RunContextEnricher` span processor automatically reads baggage and writes to span attributes: + +```python +class RunContextEnricher(SpanProcessor): + def on_start(self, span, parent_context): + for key in ["botanu.run_id", "botanu.use_case"]: + value = baggage.get_baggage(key, parent_context) + if value: + span.set_attribute(key, value) +``` + +This ensures **every span** — including auto-instrumented ones — gets the `run_id`. 
+ +## HTTP Propagation + +### Outbound Requests + +When using instrumented HTTP clients (`requests`, `httpx`, `urllib3`), baggage is automatically propagated: + +```python +import requests + +@botanu_use_case("Fetch Data") +def fetch_data(): + # Baggage is automatically added to headers + response = requests.get("https://api.example.com/data") +``` + +### Inbound Requests (Frameworks) + +For web frameworks (`FastAPI`, `Flask`, `Django`), use the middleware to extract context: + +```python +# FastAPI +from botanu.sdk.middleware import BotanuMiddleware + +app = FastAPI() +app.add_middleware(BotanuMiddleware) + +@app.post("/tickets") +def create_ticket(request: Request): + # RunContext is extracted from incoming baggage + # or created if not present + pass +``` + +## Message Queue Propagation + +For async messaging systems, you need to manually inject and extract context. + +### Injecting Context (Producer) + +```python +from botanu.sdk.context import get_current_run_context + +def publish_message(queue, payload): + ctx = get_current_run_context() + + message = { + "payload": payload, + "metadata": { + "baggage": ctx.to_baggage_dict() if ctx else {} + } + } + queue.publish(message) +``` + +### Extracting Context (Consumer) + +```python +from botanu.models.run_context import RunContext + +def process_message(message): + baggage = message.get("metadata", {}).get("baggage", {}) + ctx = RunContext.from_baggage(baggage) + + if ctx: + # Continue with existing context + with ctx.as_current(): + handle_message(message["payload"]) + else: + # Create new context + with RunContext.create(use_case="Message Processing").as_current(): + handle_message(message["payload"]) +``` + +## Cross-Service Propagation + +``` +┌──────────────┐ HTTP ┌──────────────┐ Kafka ┌──────────────┐ +│ Service A │ ────────────► │ Service B │ ────────────► │ Service C │ +│ │ baggage: │ │ message │ │ +│ run_id=X │ run_id=X │ run_id=X │ run_id=X │ run_id=X │ +└──────────────┘ └──────────────┘ 
└──────────────┘ +``` + +The same `run_id` flows through all services, enabling: +- End-to-end cost attribution +- Cross-service trace correlation +- Distributed debugging + +## Baggage Size Limits + +W3C Baggage has practical size limits. The SDK uses lean mode by default to stay well under these limits: + +| Mode | Typical Size | Recommendation | +|------|--------------|----------------| +| Lean | ~100 bytes | Use for most cases | +| Full | ~300 bytes | Use when you need all context downstream | + +## Propagation and Auto-Instrumentation + +The SDK works seamlessly with OTel auto-instrumentation: + +```python +from botanu import enable + +enable( + service_name="my-service", + auto_instrument=True, # Enable auto-instrumentation +) +``` + +Auto-instrumented libraries will automatically propagate baggage: +- `requests`, `httpx`, `urllib3` (HTTP clients) +- `fastapi`, `flask`, `django` (Web frameworks) +- `celery` (Task queues) +- `grpc` (gRPC) + +## Debugging Propagation + +### Check Current Context + +```python +from botanu.sdk.context import get_baggage, get_run_id + +run_id = get_run_id() +print(f"Current run_id: {run_id}") + +use_case = get_baggage("botanu.use_case") +print(f"Current use_case: {use_case}") +``` + +### Verify Header Propagation + +```python +# In your HTTP client +import httpx + +def debug_request(): + with httpx.Client() as client: + response = client.get( + "https://httpbin.org/headers", + ) + print(response.json()) + # Check for 'baggage' header in response +``` + +## Common Issues + +### Context Not Propagating + +1. **Missing initialization**: Ensure `enable()` is called at startup +2. **Missing middleware**: Add `BotanuMiddleware` to your web framework +3. **Async context loss**: Use `contextvars`-aware async patterns + +### Duplicate run_ids + +1. **Multiple decorators**: Only use `@botanu_use_case` at the entry point +2. 
**Middleware + decorator**: Choose one, not both + +## See Also + +- [Run Context](run-context.md) - Understanding the RunContext model +- [Architecture](architecture.md) - Overall SDK architecture diff --git a/docs/concepts/run-context.md b/docs/concepts/run-context.md new file mode 100644 index 0000000..436be03 --- /dev/null +++ b/docs/concepts/run-context.md @@ -0,0 +1,188 @@ +# Run Context + +The Run Context is the core concept in Botanu SDK. It represents a single business transaction or workflow execution that you want to track for cost attribution. + +## What is a Run? + +A **run** is a logical unit of work that produces a business outcome. Examples: + +- Resolving a customer support ticket +- Processing a document +- Generating a report +- Handling a chatbot conversation + +A single run may involve: +- Multiple LLM calls (possibly to different providers) +- Database queries +- Storage operations +- External API calls +- Message queue operations + +## The run_id + +Every run is identified by a unique `run_id` — a UUIDv7 that is: + +- **Time-sortable**: IDs generated later sort after earlier ones +- **Globally unique**: No collisions across services +- **Propagated automatically**: Flows through your entire application via W3C Baggage + +```python +from botanu.models.run_context import generate_run_id + +run_id = generate_run_id() +# "019abc12-def3-7890-abcd-1234567890ab" +``` + +## RunContext Model + +The `RunContext` dataclass holds all metadata for a run: + +```python +from botanu.models.run_context import RunContext + +ctx = RunContext.create( + use_case="Customer Support", + workflow="handle_ticket", + environment="production", + tenant_id="tenant-123", +) + +print(ctx.run_id) # "019abc12-def3-7890-..." 
+print(ctx.root_run_id) # Same as run_id for top-level runs +print(ctx.attempt) # 1 (first attempt) +``` + +### Key Fields + +| Field | Description | +|-------|-------------| +| `run_id` | Unique identifier for this run (UUIDv7) | +| `root_run_id` | ID of the original run (for retries, same as `run_id` for first attempt) | +| `use_case` | Business use case name (e.g., "Customer Support") | +| `workflow` | Optional workflow/function name | +| `environment` | Deployment environment (production, staging, etc.) | +| `attempt` | Attempt number (1 for first, 2+ for retries) | +| `tenant_id` | Optional tenant identifier for multi-tenant systems | + +## Creating Runs + +### Using the Decorator (Recommended) + +```python +from botanu import botanu_use_case + +@botanu_use_case("Customer Support") +def handle_ticket(ticket_id: str): + # RunContext is automatically created and propagated + # All operations inside inherit the same run_id + pass +``` + +### Manual Creation + +```python +from botanu.models.run_context import RunContext + +ctx = RunContext.create( + use_case="Document Processing", + workflow="extract_entities", + tenant_id="acme-corp", +) + +# Use ctx.to_baggage_dict() to propagate via HTTP headers +# Use ctx.to_span_attributes() to add to spans +``` + +## Retry Handling + +When a run fails and is retried, use `create_retry()` to maintain lineage: + +```python +original = RunContext.create(use_case="Process Order") + +# First attempt fails... 
+ +retry = RunContext.create_retry(original) +print(retry.attempt) # 2 +print(retry.retry_of_run_id) # Original run_id +print(retry.root_run_id) # Same as original.run_id +print(retry.run_id) # New unique ID +``` + +This enables: +- Tracking total attempts for a business operation +- Correlating retries back to the original request +- Calculating aggregate cost across all attempts + +## Deadlines and Cancellation + +RunContext supports deadline and cancellation tracking: + +```python +ctx = RunContext.create( + use_case="Long Running Task", + deadline_seconds=30.0, # 30 second deadline +) + +# Check deadline +if ctx.is_past_deadline(): + raise TimeoutError("Deadline exceeded") + +# Check remaining time +remaining = ctx.remaining_time_seconds() + +# Request cancellation +ctx.request_cancellation(reason="user") +if ctx.is_cancelled(): + # Clean up and exit + pass +``` + +## Serialization + +### To Baggage (for HTTP propagation) + +```python +# Lean mode (default): only run_id and use_case +baggage = ctx.to_baggage_dict() +# {"botanu.run_id": "...", "botanu.use_case": "..."} + +# Full mode: all fields +baggage = ctx.to_baggage_dict(lean_mode=False) +# Includes workflow, environment, tenant_id, etc. +``` + +### To Span Attributes + +```python +attrs = ctx.to_span_attributes() +# {"botanu.run_id": "...", "botanu.use_case": "...", ...} +``` + +### From Baggage (receiving side) + +```python +ctx = RunContext.from_baggage(baggage_dict) +if ctx is None: + # Required fields missing, create new context + ctx = RunContext.create(use_case="Unknown") +``` + +## Environment Variables + +| Variable | Description | Default | +|----------|-------------|---------| +| `BOTANU_ENVIRONMENT` | Default environment | `"production"` | +| `BOTANU_PROPAGATION_MODE` | `"lean"` or `"full"` | `"lean"` | + +## Best Practices + +1. **One run per business outcome**: Don't create runs for internal operations +2. **Use descriptive use_case names**: They appear in dashboards and queries +3. 
**Leverage tenant_id**: Essential for multi-tenant cost attribution +4. **Handle retries properly**: Always use `create_retry()` for retry attempts + +## See Also + +- [Context Propagation](context-propagation.md) - How context flows through your application +- [Outcomes](../tracking/outcomes.md) - Recording business outcomes diff --git a/docs/getting-started/configuration.md b/docs/getting-started/configuration.md new file mode 100644 index 0000000..48c8c1d --- /dev/null +++ b/docs/getting-started/configuration.md @@ -0,0 +1,271 @@ +# Configuration + +Botanu SDK can be configured through code, environment variables, or YAML files. + +## Configuration Precedence + +1. **Code arguments** (explicit values passed to `BotanuConfig`) +2. **Environment variables** (`BOTANU_*`, `OTEL_*`) +3. **YAML config file** (`botanu.yaml` or specified path) +4. **Built-in defaults** + +## Quick Configuration + +### Code-Based + +```python +from botanu import enable + +enable( + service_name="my-service", + otlp_endpoint="http://collector:4318/v1/traces", +) +``` + +### Environment Variables + +```bash +export OTEL_SERVICE_NAME=my-service +export OTEL_EXPORTER_OTLP_ENDPOINT=http://collector:4318 +export BOTANU_ENVIRONMENT=production +``` + +### YAML File + +```yaml +# botanu.yaml +service: + name: my-service + version: 1.0.0 + environment: production + +otlp: + endpoint: http://collector:4318/v1/traces + +propagation: + mode: lean +``` + +Load with: + +```python +from botanu.sdk.config import BotanuConfig + +config = BotanuConfig.from_yaml("botanu.yaml") +``` + +## Full Configuration Reference + +### BotanuConfig Fields + +```python +from dataclasses import dataclass + +@dataclass +class BotanuConfig: + # Service identification + service_name: str = None # OTEL_SERVICE_NAME + service_version: str = None # OTEL_SERVICE_VERSION + service_namespace: str = None # OTEL_SERVICE_NAMESPACE + deployment_environment: str = None # OTEL_DEPLOYMENT_ENVIRONMENT + + # Resource detection + 
auto_detect_resources: bool = True # BOTANU_AUTO_DETECT_RESOURCES + + # OTLP exporter + otlp_endpoint: str = None # OTEL_EXPORTER_OTLP_ENDPOINT + otlp_headers: dict = None # Custom headers for auth + + # Span export + max_export_batch_size: int = 512 + max_queue_size: int = 2048 + schedule_delay_millis: int = 5000 + + # Propagation mode + propagation_mode: str = "lean" # BOTANU_PROPAGATION_MODE + + # Auto-instrumentation + auto_instrument_packages: list = [...] +``` + +## Environment Variables + +### OpenTelemetry Standard Variables + +| Variable | Description | Default | +|----------|-------------|---------| +| `OTEL_SERVICE_NAME` | Service name | `unknown_service` | +| `OTEL_SERVICE_VERSION` | Service version | None | +| `OTEL_SERVICE_NAMESPACE` | Service namespace | None | +| `OTEL_DEPLOYMENT_ENVIRONMENT` | Environment name | `production` | +| `OTEL_EXPORTER_OTLP_ENDPOINT` | OTLP collector base URL | `http://localhost:4318` | +| `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` | OTLP traces endpoint (full URL) | None | + +### Botanu-Specific Variables + +| Variable | Description | Default | +|----------|-------------|---------| +| `BOTANU_ENVIRONMENT` | Fallback for environment | `production` | +| `BOTANU_PROPAGATION_MODE` | `lean` or `full` | `lean` | +| `BOTANU_AUTO_DETECT_RESOURCES` | Auto-detect cloud resources | `true` | +| `BOTANU_CONFIG_FILE` | Path to YAML config | None | + +## YAML Configuration + +### Full Example + +```yaml +# botanu.yaml - Full configuration example +service: + name: ${OTEL_SERVICE_NAME:-my-service} + version: ${APP_VERSION:-1.0.0} + namespace: production + environment: ${ENVIRONMENT:-production} + +resource: + auto_detect: true + +otlp: + endpoint: ${OTEL_EXPORTER_OTLP_ENDPOINT:-http://localhost:4318}/v1/traces + headers: + Authorization: Bearer ${OTLP_AUTH_TOKEN} + +export: + batch_size: 512 + queue_size: 2048 + delay_ms: 5000 + +propagation: + mode: lean + +auto_instrument_packages: + - requests + - httpx + - fastapi + - sqlalchemy + - 
openai_v2 +``` + +### Environment Variable Interpolation + +The YAML loader supports two interpolation patterns: + +```yaml +# Simple interpolation +endpoint: ${COLLECTOR_URL} + +# With default value +endpoint: ${COLLECTOR_URL:-http://localhost:4318} +``` + +### Loading Configuration + +```python +from botanu.sdk.config import BotanuConfig + +# Explicit path +config = BotanuConfig.from_yaml("config/botanu.yaml") + +# Auto-discover (searches botanu.yaml, config/botanu.yaml) +config = BotanuConfig.from_file_or_env() + +# Environment only +config = BotanuConfig() +``` + +## Propagation Modes + +### Lean Mode (Default) + +Propagates only essential fields to minimize header size: + +- `botanu.run_id` +- `botanu.use_case` + +Best for high-traffic systems where header size matters. + +### Full Mode + +Propagates all context fields: + +- `botanu.run_id` +- `botanu.use_case` +- `botanu.workflow` +- `botanu.environment` +- `botanu.tenant_id` +- `botanu.parent_run_id` + +Enable with: + +```bash +export BOTANU_PROPAGATION_MODE=full +``` + +Or: + +```python +enable(service_name="my-service", propagation_mode="full") +``` + +## Auto-Instrumentation + +### Default Packages + +By default, Botanu enables instrumentation for: + +```python +[ + # HTTP clients + "requests", "httpx", "urllib3", "aiohttp_client", + # Web frameworks + "fastapi", "flask", "django", "starlette", + # Databases + "sqlalchemy", "psycopg2", "asyncpg", "pymongo", "redis", + # Messaging + "celery", "kafka_python", + # gRPC + "grpc", + # GenAI + "openai_v2", "anthropic", "vertexai", "google_genai", "langchain", + # Runtime + "logging", +] +``` + +### Customizing Packages + +```python +from botanu import enable + +enable( + service_name="my-service", + auto_instrument_packages=["requests", "fastapi", "openai_v2"], +) +``` + +### Disabling Auto-Instrumentation + +```python +enable( + service_name="my-service", + auto_instrument_packages=[], # Empty list disables +) +``` + +## Exporting Configuration + +```python 
+config = BotanuConfig( + service_name="my-service", + deployment_environment="production", +) + +# Export as dictionary +print(config.to_dict()) +``` + +## See Also + +- [Architecture](../concepts/architecture.md) - SDK design principles +- [Collector Configuration](../integration/collector.md) - Collector setup +- [Existing OTel Setup](../integration/existing-otel.md) - Integration with existing OTel diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md new file mode 100644 index 0000000..f11f9b1 --- /dev/null +++ b/docs/getting-started/installation.md @@ -0,0 +1,141 @@ +# Installation + +This guide covers installing Botanu SDK and its optional dependencies. + +## Requirements + +- Python 3.9 or later +- OpenTelemetry Collector (for span processing) + +## Basic Installation + +Install the core SDK with pip: + +```bash +pip install botanu +``` + +The core package has minimal dependencies: +- `opentelemetry-api >= 1.20.0` + +This is all you need if you already have OpenTelemetry configured in your application. 
+ +## Installation with Extras + +### Full SDK (Recommended for Standalone) + +If you don't have an existing OpenTelemetry setup: + +```bash +pip install "botanu[sdk]" +``` + +This adds: +- `opentelemetry-sdk` - The OTel SDK implementation +- `opentelemetry-exporter-otlp-proto-http` - OTLP HTTP exporter + +### Auto-Instrumentation + +For automatic instrumentation of common libraries: + +```bash +pip install "botanu[instruments]" +``` + +Includes instrumentation for: +- **HTTP clients**: requests, httpx, urllib3, aiohttp +- **Web frameworks**: FastAPI, Flask, Django, Starlette +- **Databases**: SQLAlchemy, psycopg2, asyncpg, pymongo, redis +- **Messaging**: Celery, Kafka +- **Other**: gRPC, logging + +### GenAI Instrumentation + +For automatic LLM provider instrumentation: + +```bash +pip install "botanu[genai]" +``` + +Includes instrumentation for: +- OpenAI +- Anthropic +- Google Vertex AI +- Google GenAI +- LangChain + +### Everything + +To install all optional dependencies: + +```bash +pip install "botanu[all]" +``` + +### Development + +For development and testing: + +```bash +pip install "botanu[dev]" +``` + +## Verify Installation + +```python +import botanu +print(botanu.__version__) +``` + +## Docker + +In a Dockerfile: + +```dockerfile +FROM python:3.11-slim + +WORKDIR /app + +# Install Botanu with SDK extras +RUN pip install "botanu[sdk]" + +COPY . . + +CMD ["python", "app.py"] +``` + +## Poetry + +```toml +[tool.poetry.dependencies] +botanu = { version = "^0.1.0", extras = ["sdk"] } +``` + +## pip-tools / requirements.txt + +```text +# requirements.in +botanu[sdk]>=0.1.0 +``` + +Generate with: +```bash +pip-compile requirements.in +``` + +## Collector Setup + +Botanu SDK sends traces to an OpenTelemetry Collector. You'll need one running to receive spans. + +Quick start with Docker: + +```bash +docker run -p 4318:4318 otel/opentelemetry-collector:latest +``` + +See [Collector Configuration](../integration/collector.md) for detailed setup. 
+ +## Next Steps + +- [Quickstart](quickstart.md) - Your first instrumented application +- [Configuration](configuration.md) - Customize SDK behavior diff --git a/docs/getting-started/quickstart.md b/docs/getting-started/quickstart.md new file mode 100644 index 0000000..b2fd386 --- /dev/null +++ b/docs/getting-started/quickstart.md @@ -0,0 +1,71 @@ +# Quickstart + +Get run-level cost attribution working in minutes. + +## Prerequisites + +- Python 3.9+ +- OpenTelemetry Collector (see [Collector Configuration](../integration/collector.md)) + +## Step 1: Install + +```bash +pip install "botanu[all]" +``` + +## Step 2: Enable + +```python +from botanu import enable + +enable(service_name="my-service") +``` + +## Step 3: Define Entry Point + +```python +from botanu import botanu_use_case + +@botanu_use_case(name="my_workflow") +def my_function(): + data = db.query(...) + result = llm.complete(...) + return result +``` + +All LLM calls, database queries, and HTTP requests inside the function are automatically tracked with the same `run_id`. + +## Complete Example + +```python +from botanu import enable, botanu_use_case + +enable(service_name="my-service") + +@botanu_use_case(name="my_workflow") +def my_function(): + data = db.query(...) + result = openai.chat.completions.create( + model="gpt-4", + messages=[{"role": "user", "content": data}] + ) + return result +``` + +## What Gets Tracked + +| Attribute | Example | Description | +|-----------|---------|-------------| +| `botanu.run_id` | `019abc12-...` | Unique run identifier | +| `botanu.use_case` | `my_workflow` | Business use case | +| `gen_ai.usage.input_tokens` | `150` | LLM input tokens | +| `gen_ai.usage.output_tokens` | `200` | LLM output tokens | +| `db.system` | `postgresql` | Database system | + +All spans share the same `run_id`, enabling cost-per-transaction analytics. 
+ +## Next Steps + +- [Configuration](configuration.md) - Environment variables and YAML config +- [Kubernetes Deployment](../integration/kubernetes.md) - Zero-code instrumentation at scale +- [Context Propagation](../concepts/context-propagation.md) - Cross-service tracing diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000..1f77d25 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,65 @@ +# Botanu SDK Documentation + +Botanu SDK provides OpenTelemetry-native run-level cost attribution for AI workflows. + +## Overview + +Traditional observability tools trace individual requests. But AI workflows are different — a single business outcome (resolving a support ticket, processing an order) might span multiple LLM calls, retries, tool executions, and data operations across different vendors. + +Botanu introduces **run-level attribution**: a unique `run_id` that follows your entire workflow, enabling you to answer "How much did this outcome cost?" + +## Documentation + +### Getting Started + +- [Installation](getting-started/installation.md) - Install and configure the SDK +- [Quick Start](getting-started/quickstart.md) - Get up and running in 5 minutes +- [Configuration](getting-started/configuration.md) - Configuration options and environment variables + +### Core Concepts + +- [Run Context](concepts/run-context.md) - Understanding `run_id` and context propagation +- [Context Propagation](concepts/context-propagation.md) - How context flows through your application +- [Architecture](concepts/architecture.md) - SDK design and component overview + +### Tracking + +- [LLM Tracking](tracking/llm-tracking.md) - Track AI model calls and token usage +- [Data Tracking](tracking/data-tracking.md) - Track database, storage, and messaging operations +- [Outcomes](tracking/outcomes.md) - Record business outcomes for ROI calculation + +### Integration + +- [Auto-Instrumentation](integration/auto-instrumentation.md) - Automatic instrumentation for common 
libraries +- [Kubernetes Deployment](integration/kubernetes.md) - Zero-code instrumentation at scale +- [Existing OTel Setup](integration/existing-otel.md) - Integrate with existing OpenTelemetry deployments +- [Collector Configuration](integration/collector.md) - Configure the OpenTelemetry Collector + +### Patterns + +- [Best Practices](patterns/best-practices.md) - Recommended patterns for production use +- [Anti-Patterns](patterns/anti-patterns.md) - Common mistakes to avoid + +### API Reference + +- [Decorators](api/decorators.md) - `@botanu_use_case` and related decorators +- [Tracking API](api/tracking.md) - Manual tracking context managers +- [Configuration API](api/configuration.md) - `BotanuConfig` and initialization + +## Quick Example + +```python +from botanu import enable, botanu_use_case + +enable(service_name="my-service") + +@botanu_use_case(name="my_workflow") +def my_function(): + data = db.query(...) + result = llm.complete(...) + return result +``` + +## License + +Apache License 2.0. See [LICENSE](https://github.com/botanu-ai/botanu-sdk-python/blob/main/LICENSE). diff --git a/docs/integration/auto-instrumentation.md b/docs/integration/auto-instrumentation.md new file mode 100644 index 0000000..bec3e44 --- /dev/null +++ b/docs/integration/auto-instrumentation.md @@ -0,0 +1,130 @@ +# Auto-Instrumentation + +Automatically instrument common libraries without code changes. + +## Installation + +```bash +pip install "botanu[all]" +``` + +## Usage + +```python +from botanu import enable, botanu_use_case + +enable(service_name="my-service") + +@botanu_use_case(name="my_workflow") +def my_function(): + data = db.query(...) + result = openai.chat.completions.create( + model="gpt-4", + messages=[{"role": "user", "content": data}] + ) + return result +``` + +All operations inside are automatically traced. 
+ +## Supported Libraries + +### HTTP Clients + +| Library | Package | +|---------|---------| +| requests | `opentelemetry-instrumentation-requests` | +| httpx | `opentelemetry-instrumentation-httpx` | +| urllib3 | `opentelemetry-instrumentation-urllib3` | +| aiohttp | `opentelemetry-instrumentation-aiohttp-client` | + +### Web Frameworks + +| Framework | Package | +|-----------|---------| +| FastAPI | `opentelemetry-instrumentation-fastapi` | +| Flask | `opentelemetry-instrumentation-flask` | +| Django | `opentelemetry-instrumentation-django` | +| Starlette | `opentelemetry-instrumentation-starlette` | + +### Databases + +| Database | Package | +|----------|---------| +| SQLAlchemy | `opentelemetry-instrumentation-sqlalchemy` | +| psycopg2 | `opentelemetry-instrumentation-psycopg2` | +| asyncpg | `opentelemetry-instrumentation-asyncpg` | +| pymongo | `opentelemetry-instrumentation-pymongo` | +| redis | `opentelemetry-instrumentation-redis` | + +### Messaging + +| System | Package | +|--------|---------| +| Celery | `opentelemetry-instrumentation-celery` | +| Kafka | `opentelemetry-instrumentation-kafka-python` | + +### LLM Providers + +| Provider | Package | +|----------|---------| +| OpenAI | `opentelemetry-instrumentation-openai-v2` | +| Anthropic | `opentelemetry-instrumentation-anthropic` | +| Vertex AI | `opentelemetry-instrumentation-vertexai` | +| Google GenAI | `opentelemetry-instrumentation-google-genai` | +| LangChain | `opentelemetry-instrumentation-langchain` | + +## Context Propagation + +HTTP clients automatically propagate `run_id` via W3C Baggage headers: + +``` +traceparent: 00-{trace_id}-{span_id}-01 +baggage: botanu.run_id=019abc12... 
+``` + +## Span Attributes + +OpenAI calls produce: + +``` +gen_ai.operation.name: chat +gen_ai.provider.name: openai +gen_ai.request.model: gpt-4 +gen_ai.usage.input_tokens: 10 +gen_ai.usage.output_tokens: 25 +``` + +Database calls produce: + +``` +db.system: postgresql +db.operation: SELECT +db.statement: SELECT * FROM orders WHERE id = ? +``` + +## Troubleshooting + +### Spans Not Appearing + +Ensure `enable()` is called before library imports: + +```python +from botanu import enable +enable(service_name="my-service") + +import requests +import openai +``` + +### Check Instrumentation Status + +```python +from opentelemetry.instrumentation.requests import RequestsInstrumentor +print(RequestsInstrumentor().is_instrumented()) +``` + +## See Also + +- [Kubernetes Deployment](kubernetes.md) - Zero-code instrumentation at scale +- [Collector Configuration](collector.md) - Collector setup diff --git a/docs/integration/collector.md b/docs/integration/collector.md new file mode 100644 index 0000000..ed85df9 --- /dev/null +++ b/docs/integration/collector.md @@ -0,0 +1,422 @@ +# Collector Configuration + +Set up the OpenTelemetry Collector for cost attribution processing. + +## Overview + +Botanu follows a "thin SDK, smart collector" architecture. 
The SDK captures raw telemetry; the collector handles: + +- **PII redaction** - Remove sensitive data from prompts/responses +- **Cost calculation** - Convert tokens to dollars using pricing tables +- **Vendor normalization** - Standardize provider names +- **Cardinality management** - Limit high-cardinality attributes +- **Aggregation** - Pre-aggregate metrics for dashboards + +## Quick Start + +### Docker + +```bash +docker run -p 4318:4318 -p 4317:4317 \ + -v $(pwd)/otel-config.yaml:/etc/otelcol/config.yaml \ + otel/opentelemetry-collector-contrib:latest +``` + +### Docker Compose + +```yaml +services: + collector: + image: otel/opentelemetry-collector-contrib:latest + ports: + - "4318:4318" # OTLP HTTP + - "4317:4317" # OTLP gRPC + volumes: + - ./otel-config.yaml:/etc/otelcol/config.yaml +``` + +## Basic Configuration + +```yaml +# otel-config.yaml +receivers: + otlp: + protocols: + http: + endpoint: 0.0.0.0:4318 + grpc: + endpoint: 0.0.0.0:4317 + +processors: + batch: + send_batch_size: 1000 + timeout: 10s + +exporters: + debug: + verbosity: detailed + +service: + pipelines: + traces: + receivers: [otlp] + processors: [batch] + exporters: [debug] +``` + +## Cost Attribution Configuration + +### Full Pipeline + +```yaml +receivers: + otlp: + protocols: + http: + endpoint: 0.0.0.0:4318 + grpc: + endpoint: 0.0.0.0:4317 + +processors: + # Batch for efficiency + batch: + send_batch_size: 1000 + timeout: 10s + + # Normalize vendor names + transform/vendor: + trace_statements: + - context: span + statements: + # Normalize provider names to standard format + - set(attributes["botanu.vendor"], "openai") where attributes["gen_ai.provider.name"] == "openai" + - set(attributes["botanu.vendor"], "anthropic") where attributes["gen_ai.provider.name"] == "anthropic" + - set(attributes["botanu.vendor"], "azure.openai") where attributes["gen_ai.provider.name"] == "azure.openai" + - set(attributes["botanu.vendor"], "gcp.vertex_ai") where attributes["gen_ai.provider.name"] == 
"gcp.vertex_ai" + - set(attributes["botanu.vendor"], "aws.bedrock") where attributes["gen_ai.provider.name"] == "aws.bedrock" + + # Calculate costs from tokens + transform/cost: + trace_statements: + - context: span + statements: + # GPT-4 pricing (example: $30/$60 per 1M tokens) + - set(attributes["botanu.cost.input_usd"], + attributes["gen_ai.usage.input_tokens"] * 0.00003) + where attributes["gen_ai.request.model"] == "gpt-4" + - set(attributes["botanu.cost.output_usd"], + attributes["gen_ai.usage.output_tokens"] * 0.00006) + where attributes["gen_ai.request.model"] == "gpt-4" + + # GPT-4 Turbo pricing ($10/$30 per 1M tokens) + - set(attributes["botanu.cost.input_usd"], + attributes["gen_ai.usage.input_tokens"] * 0.00001) + where attributes["gen_ai.request.model"] == "gpt-4-turbo" + - set(attributes["botanu.cost.output_usd"], + attributes["gen_ai.usage.output_tokens"] * 0.00003) + where attributes["gen_ai.request.model"] == "gpt-4-turbo" + + # Claude 3 Opus pricing ($15/$75 per 1M tokens) + - set(attributes["botanu.cost.input_usd"], + attributes["gen_ai.usage.input_tokens"] * 0.000015) + where attributes["gen_ai.request.model"] == "claude-3-opus-20240229" + - set(attributes["botanu.cost.output_usd"], + attributes["gen_ai.usage.output_tokens"] * 0.000075) + where attributes["gen_ai.request.model"] == "claude-3-opus-20240229" + + # Calculate total + - set(attributes["botanu.cost.total_usd"], + attributes["botanu.cost.input_usd"] + attributes["botanu.cost.output_usd"]) + where attributes["botanu.cost.input_usd"] != nil + + # PII redaction for prompts/responses + redaction: + allow_all_keys: true + blocked_values: + # Email addresses + - "\\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Z|a-z]{2,}\\b" + # Phone numbers + - "\\b\\d{3}[-.]?\\d{3}[-.]?\\d{4}\\b" + # SSN + - "\\b\\d{3}-\\d{2}-\\d{4}\\b" + # Credit card numbers + - "\\b(?:4[0-9]{12}(?:[0-9]{3})?|5[1-5][0-9]{14}|3[47][0-9]{13})\\b" + + # Cardinality limits + attributes: + actions: + - key: botanu.run_id + action: 
hash + # Keep first 16 chars of hash to reduce cardinality if needed + - key: gen_ai.content.prompt + action: delete + # Remove raw prompts (keep tokens for cost) + +exporters: + # ClickHouse for analytics + clickhouse: + endpoint: tcp://clickhouse:9000 + database: botanu + ttl: 90d + create_schema: true + + # Also send to your APM + otlp/apm: + endpoint: https://your-apm.example.com + headers: + Authorization: Bearer ${APM_TOKEN} + +service: + pipelines: + traces: + receivers: [otlp] + processors: + - batch + - transform/vendor + - transform/cost + - redaction + - attributes + exporters: [clickhouse, otlp/apm] +``` + +## PII Redaction + +### Using Redaction Processor + +```yaml +processors: + redaction: + allow_all_keys: true + blocked_values: + # Redact common PII patterns + - "\\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}\\b" # Email + - "\\b\\d{3}[-.]?\\d{3}[-.]?\\d{4}\\b" # Phone + - "\\b\\d{3}-\\d{2}-\\d{4}\\b" # SSN + summary: debug # Log redaction summary +``` + +### Using Transform Processor + +```yaml +processors: + transform/pii: + trace_statements: + - context: span + statements: + # Remove prompt content entirely + - delete(attributes["gen_ai.content.prompt"]) + - delete(attributes["gen_ai.content.completion"]) + + # Or replace with placeholder + - replace_pattern(attributes["gen_ai.content.prompt"], + "\\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}\\b", + "[REDACTED_EMAIL]") +``` + +## Pricing Tables + +Maintain pricing in the collector config: + +```yaml +processors: + transform/cost: + trace_statements: + - context: span + statements: + # OpenAI pricing (as of 2024) + # GPT-4 + - set(attributes["botanu.cost.input_usd"], attributes["gen_ai.usage.input_tokens"] * 0.00003) + where attributes["gen_ai.request.model"] == "gpt-4" or attributes["gen_ai.request.model"] == "gpt-4-0613" + - set(attributes["botanu.cost.output_usd"], attributes["gen_ai.usage.output_tokens"] * 0.00006) + where attributes["gen_ai.request.model"] == "gpt-4" or 
attributes["gen_ai.request.model"] == "gpt-4-0613" + + # GPT-4 Turbo + - set(attributes["botanu.cost.input_usd"], attributes["gen_ai.usage.input_tokens"] * 0.00001) + where IsMatch(attributes["gen_ai.request.model"], "gpt-4-turbo.*") + - set(attributes["botanu.cost.output_usd"], attributes["gen_ai.usage.output_tokens"] * 0.00003) + where IsMatch(attributes["gen_ai.request.model"], "gpt-4-turbo.*") + + # GPT-4o + - set(attributes["botanu.cost.input_usd"], attributes["gen_ai.usage.input_tokens"] * 0.000005) + where IsMatch(attributes["gen_ai.request.model"], "gpt-4o.*") + - set(attributes["botanu.cost.output_usd"], attributes["gen_ai.usage.output_tokens"] * 0.000015) + where IsMatch(attributes["gen_ai.request.model"], "gpt-4o.*") + + # GPT-3.5 Turbo + - set(attributes["botanu.cost.input_usd"], attributes["gen_ai.usage.input_tokens"] * 0.0000005) + where IsMatch(attributes["gen_ai.request.model"], "gpt-3.5-turbo.*") + - set(attributes["botanu.cost.output_usd"], attributes["gen_ai.usage.output_tokens"] * 0.0000015) + where IsMatch(attributes["gen_ai.request.model"], "gpt-3.5-turbo.*") + + # Claude 3 Opus + - set(attributes["botanu.cost.input_usd"], attributes["gen_ai.usage.input_tokens"] * 0.000015) + where IsMatch(attributes["gen_ai.request.model"], "claude-3-opus.*") + - set(attributes["botanu.cost.output_usd"], attributes["gen_ai.usage.output_tokens"] * 0.000075) + where IsMatch(attributes["gen_ai.request.model"], "claude-3-opus.*") + + # Claude 3 Sonnet + - set(attributes["botanu.cost.input_usd"], attributes["gen_ai.usage.input_tokens"] * 0.000003) + where IsMatch(attributes["gen_ai.request.model"], "claude-3-sonnet.*") + - set(attributes["botanu.cost.output_usd"], attributes["gen_ai.usage.output_tokens"] * 0.000015) + where IsMatch(attributes["gen_ai.request.model"], "claude-3-sonnet.*") + + # Claude 3 Haiku + - set(attributes["botanu.cost.input_usd"], attributes["gen_ai.usage.input_tokens"] * 0.00000025) + where IsMatch(attributes["gen_ai.request.model"], 
"claude-3-haiku.*") + - set(attributes["botanu.cost.output_usd"], attributes["gen_ai.usage.output_tokens"] * 0.00000125) + where IsMatch(attributes["gen_ai.request.model"], "claude-3-haiku.*") + + # Total cost + - set(attributes["botanu.cost.total_usd"], + attributes["botanu.cost.input_usd"] + attributes["botanu.cost.output_usd"]) + where attributes["botanu.cost.input_usd"] != nil and attributes["botanu.cost.output_usd"] != nil +``` + +## Backend Exporters + +### ClickHouse + +```yaml +exporters: + clickhouse: + endpoint: tcp://clickhouse:9000 + database: botanu + username: default + password: ${CLICKHOUSE_PASSWORD} + ttl: 90d + create_schema: true + logs_table_name: otel_logs + traces_table_name: otel_traces + metrics_table_name: otel_metrics +``` + +### PostgreSQL (via OTLP) + +Use the collector to forward to a service that writes to PostgreSQL: + +```yaml +exporters: + otlp: + endpoint: http://postgres-writer:4317 +``` + +### Prometheus (Metrics) + +```yaml +exporters: + prometheus: + endpoint: 0.0.0.0:8889 + namespace: botanu +``` + +### Grafana Tempo + +```yaml +exporters: + otlp: + endpoint: tempo:4317 + tls: + insecure: true +``` + +## Sampling + +For cost attribution, avoid sampling. 
If you must sample: + +```yaml +processors: + probabilistic_sampler: + sampling_percentage: 100 # Keep 100% for cost attribution + + # Or sample only non-LLM spans + tail_sampling: + decision_wait: 10s + policies: + # Always keep LLM calls + - name: always-sample-llm + type: string_attribute + string_attribute: + key: gen_ai.operation.name + values: [chat, text_completion, embeddings] + + # Sample other spans at 10% + - name: sample-other + type: probabilistic + probabilistic: + sampling_percentage: 10 +``` + +## High Availability + +### Load Balancing + +```yaml +# collector-1.yaml +receivers: + otlp: + protocols: + http: + endpoint: 0.0.0.0:4318 + +exporters: + loadbalancing: + protocol: + otlp: + tls: + insecure: true + resolver: + dns: + hostname: collector-pool.svc.cluster.local + port: 4317 +``` + +### Kubernetes Deployment + +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: otel-collector +spec: + replicas: 3 + selector: + matchLabels: + app: otel-collector + template: + spec: + containers: + - name: collector + image: otel/opentelemetry-collector-contrib:latest + ports: + - containerPort: 4318 + - containerPort: 4317 + volumeMounts: + - name: config + mountPath: /etc/otelcol + volumes: + - name: config + configMap: + name: otel-collector-config +``` + +## Monitoring the Collector + +Enable internal telemetry: + +```yaml +service: + telemetry: + logs: + level: info + metrics: + level: detailed + address: 0.0.0.0:8888 +``` + +Access metrics at `http://collector:8888/metrics`. 
+ +## See Also + +- [Architecture](../concepts/architecture.md) - SDK architecture +- [Auto-Instrumentation](auto-instrumentation.md) - Library instrumentation +- [Best Practices](../patterns/best-practices.md) - Configuration patterns diff --git a/docs/integration/existing-otel.md b/docs/integration/existing-otel.md new file mode 100644 index 0000000..a008cdb --- /dev/null +++ b/docs/integration/existing-otel.md @@ -0,0 +1,295 @@ +# Existing OpenTelemetry Setup + +Integrate Botanu with your existing OpenTelemetry configuration. + +## Overview + +If you already have OpenTelemetry configured (via Datadog, Splunk, New Relic, or custom setup), Botanu integrates seamlessly. You only need to add the `RunContextEnricher` span processor. + +## Minimal Integration + +Add just the span processor to your existing provider: + +```python +from opentelemetry import trace +from botanu.processors.enricher import RunContextEnricher + +# Your existing TracerProvider +provider = trace.get_tracer_provider() + +# Add Botanu's enricher +provider.add_span_processor(RunContextEnricher()) +``` + +That's it. All spans will now receive `run_id` from baggage. 
+ +## With Existing Instrumentation + +Botanu works alongside any existing instrumentation: + +```python +from opentelemetry import trace +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import BatchSpanProcessor +from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter +from opentelemetry.instrumentation.requests import RequestsInstrumentor + +from botanu.processors.enricher import RunContextEnricher + +# Your existing setup +provider = TracerProvider() +provider.add_span_processor(BatchSpanProcessor(OTLPSpanExporter())) +trace.set_tracer_provider(provider) + +# Your existing instrumentation +RequestsInstrumentor().instrument() + +# Add Botanu enricher (order doesn't matter) +provider.add_span_processor(RunContextEnricher()) +``` + +## With Datadog + +```python +from ddtrace import tracer +from ddtrace.opentelemetry import TracerProvider +from opentelemetry import trace + +from botanu.processors.enricher import RunContextEnricher + +# Datadog's TracerProvider +provider = TracerProvider() +trace.set_tracer_provider(provider) + +# Add Botanu enricher +provider.add_span_processor(RunContextEnricher()) +``` + +## With Splunk + +```python +from splunk_otel.tracing import start_tracing +from opentelemetry import trace + +from botanu.processors.enricher import RunContextEnricher + +# Start Splunk tracing +start_tracing() + +# Add Botanu enricher +provider = trace.get_tracer_provider() +provider.add_span_processor(RunContextEnricher()) +``` + +## With New Relic + +```python +from opentelemetry import trace +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import BatchSpanProcessor +from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter + +from botanu.processors.enricher import RunContextEnricher + +# New Relic OTLP endpoint +provider = TracerProvider() +provider.add_span_processor( + BatchSpanProcessor( + OTLPSpanExporter( + 
endpoint="https://otlp.nr-data.net/v1/traces", + headers={"api-key": "YOUR_LICENSE_KEY"}, + ) + ) +) +trace.set_tracer_provider(provider) + +# Add Botanu enricher +provider.add_span_processor(RunContextEnricher()) +``` + +## With Jaeger + +```python +from opentelemetry import trace +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import BatchSpanProcessor +from opentelemetry.exporter.jaeger.thrift import JaegerExporter + +from botanu.processors.enricher import RunContextEnricher + +# Jaeger setup +provider = TracerProvider() +provider.add_span_processor( + BatchSpanProcessor( + JaegerExporter( + agent_host_name="localhost", + agent_port=6831, + ) + ) +) +trace.set_tracer_provider(provider) + +# Add Botanu enricher +provider.add_span_processor(RunContextEnricher()) +``` + +## Multiple Exporters + +Send to both your APM and a cost-attribution backend: + +```python +from opentelemetry import trace +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import BatchSpanProcessor +from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter + +from botanu.processors.enricher import RunContextEnricher + +provider = TracerProvider() + +# Your APM (e.g., Datadog) +provider.add_span_processor( + BatchSpanProcessor( + OTLPSpanExporter(endpoint="https://your-apm.example.com/v1/traces") + ) +) + +# Botanu collector for cost attribution +provider.add_span_processor( + BatchSpanProcessor( + OTLPSpanExporter(endpoint="http://botanu-collector:4318/v1/traces") + ) +) + +# Botanu enricher (adds run_id to all spans) +provider.add_span_processor(RunContextEnricher()) + +trace.set_tracer_provider(provider) +``` + +## How RunContextEnricher Works + +The enricher reads baggage and writes to span attributes: + +```python +class RunContextEnricher(SpanProcessor): + def on_start(self, span, parent_context): + # Read run_id from baggage + run_id = baggage.get_baggage("botanu.run_id", 
parent_context) + if run_id: + span.set_attribute("botanu.run_id", run_id) + + # Read use_case from baggage + use_case = baggage.get_baggage("botanu.use_case", parent_context) + if use_case: + span.set_attribute("botanu.use_case", use_case) +``` + +This means: +- Every span gets `run_id` if it exists in baggage +- Auto-instrumented spans are enriched automatically +- No code changes needed in your existing instrumentation + +## Using Botanu Decorators + +With the enricher in place, use Botanu decorators: + +```python +from botanu import botanu_use_case, emit_outcome + +@botanu_use_case("Customer Support") +async def handle_ticket(ticket_id: str): + # All spans created here (by any instrumentation) get run_id + context = requests.get(f"/api/tickets/{ticket_id}") + response = await openai_call(context) + await database.save(response) + + emit_outcome("success", value_type="tickets_resolved", value_amount=1) +``` + +## Without Botanu Bootstrap + +If you don't want to use `enable()`, manually set up propagation: + +```python +from opentelemetry import propagate +from opentelemetry.propagators.composite import CompositePropagator +from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator +from opentelemetry.baggage.propagation import W3CBaggagePropagator + +# Ensure baggage propagation is enabled +propagate.set_global_textmap( + CompositePropagator([ + TraceContextTextMapPropagator(), + W3CBaggagePropagator(), + ]) +) +``` + +## Verifying Integration + +Check that run_id appears on spans: + +```python +from opentelemetry import trace, baggage, context + +# Set baggage (normally done by @botanu_use_case) +ctx = baggage.set_baggage("botanu.run_id", "test-123") +token = context.attach(ctx) + +try: + tracer = trace.get_tracer("test") + with tracer.start_as_current_span("test-span") as span: + # Check attribute was set + print(span.attributes.get("botanu.run_id")) # Should print "test-123" +finally: + context.detach(token) +``` + +## Processor 
Order + +Span processors are called in order. The enricher should be added after your span exporters: + +```python +# 1. Exporters (send spans to backends) +provider.add_span_processor(BatchSpanProcessor(OTLPSpanExporter())) + +# 2. Enrichers (modify spans before export) +provider.add_span_processor(RunContextEnricher()) +``` + +However, `RunContextEnricher` uses `on_start()`, so it runs before export regardless. + +## Troubleshooting + +### run_id Not Appearing + +1. Check enricher is added: + ```python + provider = trace.get_tracer_provider() + # Verify RunContextEnricher is in the list + ``` + +2. Check baggage is set: + ```python + from opentelemetry import baggage + print(baggage.get_baggage("botanu.run_id")) + ``` + +3. Ensure `@botanu_use_case` is used at entry points + +### Baggage Not Propagating + +Check propagators are configured: +```python +from opentelemetry import propagate +print(propagate.get_global_textmap()) +``` + +Should include `W3CBaggagePropagator`. + +## See Also + +- [Auto-Instrumentation](auto-instrumentation.md) - Library instrumentation +- [Collector Configuration](collector.md) - Collector setup +- [Architecture](../concepts/architecture.md) - SDK design diff --git a/docs/integration/kubernetes.md b/docs/integration/kubernetes.md new file mode 100644 index 0000000..c71cf4e --- /dev/null +++ b/docs/integration/kubernetes.md @@ -0,0 +1,382 @@ +# Kubernetes Deployment + +Zero-code instrumentation for large-scale deployments. + +## Overview + +For organizations with thousands of applications, modifying code in every repo is impractical. This guide covers zero-code instrumentation using Kubernetes-native approaches. 
+ +## What Requires Code Changes + +| Service Type | Code Change | Config Change | +|--------------|-------------|---------------| +| **Entry point** | `@botanu_use_case` decorator (generates `run_id`) | K8s annotation | +| **Intermediate services** | None | K8s annotation only | + +**Entry point** = The service where the business transaction starts (API gateway, webhook handler, queue consumer). + +**Intermediate services** = All downstream services called by the entry point. + +## What Gets Auto-Instrumented + +With zero-code instrumentation, the following are automatically traced: + +- **HTTP clients** — requests, httpx, urllib3, aiohttp (including retries) +- **Frameworks** — FastAPI, Flask, Django, Starlette +- **Databases** — PostgreSQL, MySQL, MongoDB, Redis, SQLAlchemy +- **Messaging** — Celery, Kafka +- **LLM Providers** — OpenAI, Anthropic, Vertex AI + +**Retries are automatically captured.** Each HTTP call (including retries from libraries like `tenacity`, `urllib3.util.retry`, or `httpx` retry) creates a separate span. The `run_id` propagates via W3C Baggage headers on every request. + +## Architecture + +``` +┌────────────────────────────────────────────────────────────────┐ +│ Kubernetes Cluster │ +│ │ +│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ +│ │ App A │ │ App B │ │ App C │ │ +│ │ (entry) │ │ (no change) │ │ (no change) │ │ +│ │ @use_case │ │ │ │ │ │ +│ └──────┬──────┘ └──────┬──────┘ └──────┬──────┘ │ +│ │ │ │ │ +│ │ OTel auto-injected via Operator │ +│ │ │ │ │ +│ └────────────────┼────────────────┘ │ +│ ▼ │ +│ ┌───────────────────────┐ │ +│ │ OTel Collector │ │ +│ │ (DaemonSet) │ │ +│ └───────────┬───────────┘ │ +└──────────────────────────┼──────────────────────────────────────┘ + │ OTLP + ▼ + Observability Backend +``` + +## Option 1: OTel Operator (Recommended) + +The OpenTelemetry Operator automatically injects instrumentation into pods. 
+ +### Install Operator + +```bash +# Install cert-manager (required) +kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.14.0/cert-manager.yaml + +# Install OTel Operator +kubectl apply -f https://github.com/open-telemetry/opentelemetry-operator/releases/latest/download/opentelemetry-operator.yaml +``` + +### Create Instrumentation Resource + +```yaml +# instrumentation.yaml +apiVersion: opentelemetry.io/v1alpha1 +kind: Instrumentation +metadata: + name: botanu-instrumentation + namespace: default +spec: + exporter: + endpoint: http://otel-collector:4318 + propagators: + - tracecontext + - baggage + python: + image: ghcr.io/open-telemetry/opentelemetry-operator/autoinstrumentation-python:latest + env: + - name: OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED + value: "true" +``` + +```bash +kubectl apply -f instrumentation.yaml +``` + +### Annotate Deployments + +Add a single annotation to enable instrumentation: + +```yaml +# deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: my-service +spec: + template: + metadata: + annotations: + instrumentation.opentelemetry.io/inject-python: "true" + spec: + containers: + - name: app + image: my-service:latest + env: + - name: OTEL_SERVICE_NAME + value: "my-service" +``` + +No code changes required. The operator injects instrumentation at pod startup. 
+ +## Option 2: Environment Variables Only + +For apps without operator, use environment variables: + +```yaml +apiVersion: apps/v1 +kind: Deployment +spec: + template: + spec: + containers: + - name: app + image: my-service:latest + command: ["opentelemetry-instrument", "python", "app.py"] + env: + - name: OTEL_SERVICE_NAME + value: "my-service" + - name: OTEL_EXPORTER_OTLP_ENDPOINT + value: "http://otel-collector:4318" + - name: OTEL_EXPORTER_OTLP_PROTOCOL + value: "http/protobuf" + - name: OTEL_PROPAGATORS + value: "tracecontext,baggage" + - name: OTEL_TRACES_EXPORTER + value: "otlp" + - name: OTEL_METRICS_EXPORTER + value: "none" + - name: OTEL_LOGS_EXPORTER + value: "none" +``` + +Base image must include: +```dockerfile +RUN pip install opentelemetry-distro opentelemetry-exporter-otlp \ + opentelemetry-instrumentation-fastapi \ + opentelemetry-instrumentation-requests \ + opentelemetry-instrumentation-openai-v2 +``` + +## Option 3: Init Container + +Inject instrumentation via init container: + +```yaml +apiVersion: apps/v1 +kind: Deployment +spec: + template: + spec: + initContainers: + - name: otel-init + image: ghcr.io/open-telemetry/opentelemetry-operator/autoinstrumentation-python:latest + command: ["/bin/sh", "-c"] + args: + - cp -r /autoinstrumentation/. 
/otel-auto-instrumentation/ + volumeMounts: + - name: otel-auto-instrumentation + mountPath: /otel-auto-instrumentation + containers: + - name: app + image: my-service:latest + env: + - name: PYTHONPATH + value: "/otel-auto-instrumentation" + - name: OTEL_SERVICE_NAME + value: "my-service" + - name: OTEL_EXPORTER_OTLP_ENDPOINT + value: "http://otel-collector:4318" + volumeMounts: + - name: otel-auto-instrumentation + mountPath: /otel-auto-instrumentation + volumes: + - name: otel-auto-instrumentation + emptyDir: {} +``` + +## OTel Collector Setup + +Deploy collector as DaemonSet: + +```yaml +# collector.yaml +apiVersion: opentelemetry.io/v1alpha1 +kind: OpenTelemetryCollector +metadata: + name: otel-collector +spec: + mode: daemonset + config: | + receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + + processors: + batch: + timeout: 5s + send_batch_size: 1000 + + # Extract run_id from baggage for querying + attributes: + actions: + - key: botanu.run_id + from_context: baggage + action: upsert + + exporters: + otlp: + endpoint: "your-backend:4317" + tls: + insecure: false + + service: + pipelines: + traces: + receivers: [otlp] + processors: [batch, attributes] + exporters: [otlp] +``` + +## Entry Point Service (Code Change Required) + +The entry point service is the **only** service that needs a code change. It must use `@botanu_use_case` to generate the `run_id`: + +```python +from botanu import enable, botanu_use_case + +enable(service_name="entry-service") + +@botanu_use_case(name="my_workflow") +def my_function(): + data = db.query(...) + result = llm.complete(...) + downstream_service.call(result) + return result +``` + +The `@botanu_use_case` decorator generates a `run_id` and propagates it via W3C Baggage to all downstream calls. + +**Downstream services (B, C, D, etc.) need zero code changes** — they just need the K8s annotation. 
+ +## Helm Chart + +For production deployments, use the Botanu Helm chart: + +```bash +helm repo add botanu https://charts.botanu.ai +helm install botanu-collector botanu/collector \ + --set exporter.endpoint=your-backend:4317 +``` + +Values: + +```yaml +# values.yaml +collector: + mode: daemonset + resources: + limits: + cpu: 500m + memory: 512Mi + +instrumentation: + enabled: true + python: + enabled: true + propagators: + - tracecontext + - baggage + +exporter: + endpoint: "your-backend:4317" + tls: + enabled: true +``` + +## GitOps Integration + +Add annotations via Kustomize: + +```yaml +# kustomization.yaml +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +patches: + - patch: | + apiVersion: apps/v1 + kind: Deployment + metadata: + name: any + spec: + template: + metadata: + annotations: + instrumentation.opentelemetry.io/inject-python: "true" + target: + kind: Deployment + labelSelector: "instrumentation=enabled" +``` + +Label deployments to opt-in: + +```yaml +metadata: + labels: + instrumentation: enabled +``` + +## Environment Variables Reference + +| Variable | Description | Example | +|----------|-------------|---------| +| `OTEL_SERVICE_NAME` | Service name | `my-service` | +| `OTEL_EXPORTER_OTLP_ENDPOINT` | Collector endpoint | `http://collector:4318` | +| `OTEL_PROPAGATORS` | Context propagators | `tracecontext,baggage` | +| `OTEL_TRACES_EXPORTER` | Trace exporter | `otlp` | +| `OTEL_RESOURCE_ATTRIBUTES` | Additional attributes | `deployment.environment=prod` | + +## Rollout Strategy + +For 2000+ applications: + +1. **Phase 1**: Deploy OTel Collector (DaemonSet) +2. **Phase 2**: Install OTel Operator +3. **Phase 3**: Create Instrumentation resource +4. **Phase 4**: Add annotations via GitOps (batch by team/namespace) +5. **Phase 5**: Instrument entry points with `@botanu_use_case` + +Each phase is independent. Annotations can be rolled out gradually. 
+ +## Troubleshooting + +### Verify Injection + +```bash +kubectl describe pod my-pod | grep -A5 "Init Containers" +``` + +### Check Instrumentation Logs + +```bash +kubectl logs my-pod -c opentelemetry-auto-instrumentation +``` + +### Verify Collector Receiving + +```bash +kubectl logs -l app=otel-collector | grep "TracesExporter" +``` + +## See Also + +- [Collector Configuration](collector.md) +- [Auto-Instrumentation](auto-instrumentation.md) +- [Context Propagation](../concepts/context-propagation.md) diff --git a/docs/patterns/anti-patterns.md b/docs/patterns/anti-patterns.md new file mode 100644 index 0000000..1e09f23 --- /dev/null +++ b/docs/patterns/anti-patterns.md @@ -0,0 +1,490 @@ +# Anti-Patterns + +Common mistakes to avoid when using Botanu SDK. + +## Run Design Anti-Patterns + +### Creating Runs for Internal Operations + +**Don't** create runs for internal functions: + +```python +# BAD - Too many runs +@botanu_use_case("Fetch Context") # Don't do this +async def fetch_context(ticket_id): + return await db.query(...) + +@botanu_use_case("Generate Response") # Or this +async def generate_response(context): + return await llm.complete(...) 
+ +@botanu_use_case("Customer Support") +async def handle_ticket(ticket_id): + context = await fetch_context(ticket_id) + response = await generate_response(context) + return response +``` + +**Do** use a single run at the entry point: + +```python +# GOOD - One run for the business outcome +@botanu_use_case("Customer Support") +async def handle_ticket(ticket_id): + context = await fetch_context(ticket_id) # Not decorated + response = await generate_response(context) # Not decorated + emit_outcome("success", value_type="tickets_resolved", value_amount=1) + return response +``` + +### Nesting @botanu_use_case Decorators + +**Don't** nest use case decorators: + +```python +# BAD - Nested runs create confusion +@botanu_use_case("Outer") +async def outer(): + await inner() # Creates a second run + +@botanu_use_case("Inner") # Don't do this +async def inner(): + ... +``` + +**Do** use @botanu_use_case only at entry points: + +```python +# GOOD - Only entry point is decorated +@botanu_use_case("Main Workflow") +async def main(): + await step_one() # No decorator + await step_two() # No decorator +``` + +### Generic Use Case Names + +**Don't** use vague names: + +```python +# BAD - Meaningless in dashboards +@botanu_use_case("Process") +@botanu_use_case("Handle") +@botanu_use_case("Main") +@botanu_use_case("DoWork") +``` + +**Do** use descriptive business names: + +```python +# GOOD - Clear in reports +@botanu_use_case("Customer Support") +@botanu_use_case("Invoice Processing") +@botanu_use_case("Lead Qualification") +@botanu_use_case("Document Analysis") +``` + +## Outcome Anti-Patterns + +### Forgetting to Emit Outcomes + +**Don't** leave runs without outcomes: + +```python +# BAD - No outcome recorded +@botanu_use_case("Process Order") +async def process_order(order_id): + result = await process(order_id) + return result # Where's the outcome? 
+``` + +**Do** always emit an outcome: + +```python +# GOOD - Explicit outcome +@botanu_use_case("Process Order") +async def process_order(order_id): + try: + result = await process(order_id) + emit_outcome("success", value_type="orders_processed", value_amount=1) + return result + except Exception as e: + emit_outcome("failed", reason=type(e).__name__) + raise +``` + +### Multiple Outcomes Per Run + +**Don't** emit multiple outcomes: + +```python +# BAD - Multiple outcomes are confusing +@botanu_use_case("Batch Processing") +async def process_batch(items): + for item in items: + await process(item) + emit_outcome("success", value_type="item_processed") # Don't do this +``` + +**Do** emit one summary outcome: + +```python +# GOOD - One outcome at the end +@botanu_use_case("Batch Processing") +async def process_batch(items): + processed = 0 + for item in items: + await process(item) + processed += 1 + emit_outcome("success", value_type="items_processed", value_amount=processed) +``` + +### Missing Failure Reasons + +**Don't** emit failures without reasons: + +```python +# BAD - No context for debugging +except Exception: + emit_outcome("failed") # Why did it fail? + raise +``` + +**Do** include the failure reason: + +```python +# GOOD - Reason helps debugging +except ValidationError: + emit_outcome("failed", reason="validation_error") + raise +except RateLimitError: + emit_outcome("failed", reason="rate_limit_exceeded") + raise +except Exception as e: + emit_outcome("failed", reason=type(e).__name__) + raise +``` + +## LLM Tracking Anti-Patterns + +### Not Recording Tokens + +**Don't** skip token recording: + +```python +# BAD - No cost data +with track_llm_call(provider="openai", model="gpt-4"): + response = await client.chat.completions.create(...) 
+ # Token usage not recorded +``` + +**Do** always record tokens: + +```python +# GOOD - Tokens enable cost calculation +with track_llm_call(provider="openai", model="gpt-4") as tracker: + response = await client.chat.completions.create(...) + tracker.set_tokens( + input_tokens=response.usage.prompt_tokens, + output_tokens=response.usage.completion_tokens, + ) +``` + +### Ignoring Cached Tokens + +**Don't** forget cache tokens (they have different pricing): + +```python +# BAD - Missing cache data +tracker.set_tokens( + input_tokens=response.usage.prompt_tokens, + output_tokens=response.usage.completion_tokens, +) +``` + +**Do** include cache breakdown: + +```python +# GOOD - Full token breakdown +tracker.set_tokens( + input_tokens=response.usage.prompt_tokens, + output_tokens=response.usage.completion_tokens, + cache_read_tokens=response.usage.cache_read_tokens, + cache_write_tokens=response.usage.cache_write_tokens, +) +``` + +### Wrong Provider Names + +**Don't** use inconsistent provider names: + +```python +# BAD - Inconsistent naming +track_llm_call(provider="OpenAI", ...) # Mixed case +track_llm_call(provider="open-ai", ...) # Wrong format +track_llm_call(provider="gpt", ...) # Model as provider +``` + +**Do** use standard provider names (auto-normalized): + +```python +# GOOD - Standard names (or let SDK normalize) +track_llm_call(provider="openai", ...) +track_llm_call(provider="anthropic", ...) +track_llm_call(provider="azure_openai", ...) 
+```
+
+## Configuration Anti-Patterns
+
+### Sampling for Cost Attribution
+
+**Don't** enable trace sampling. Cost attribution requires 100% trace
+capture — sampled traces undercount spend and break per-run cost totals.
+Keep sampling disabled (the SDK default) and manage telemetry volume at
+the collector instead.
+
+### Hardcoding Configuration
+
+**Don't** hardcode production values:
+
+```python
+# BAD - Hardcoded
+enable(
+    service_name="my-service",
+    otlp_endpoint="http://prod-collector.internal:4318",
+)
+```
+
+**Do** use environment variables:
+
+```python
+# GOOD - Environment-based
+enable(service_name=os.environ["OTEL_SERVICE_NAME"])
+
+# Or use YAML with interpolation
+# botanu.yaml
+# otlp:
+#   endpoint: ${COLLECTOR_ENDPOINT}
+```
+
+### Disabling Auto-Instrumentation Unnecessarily
+
+**Don't** disable auto-instrumentation without reason:
+
+```python
+# BAD - Missing automatic tracing
+enable(
+    service_name="my-service",
+    auto_instrument_packages=[],  # Why?
+)
+```
+
+**Do** keep defaults or be selective:
+
+```python
+# GOOD - Default instrumentation
+enable(service_name="my-service")
+
+# Or selective
+enable(
+    service_name="my-service",
+    auto_instrument_packages=["fastapi", "openai_v2", "sqlalchemy"],
+)
+```
+
+## Context Propagation Anti-Patterns
+
+### Losing Context in Async Code
+
+**Don't** spawn work in executor threads without context:
+
+```python
+# BAD - Context lost in executor threads
+@botanu_use_case("Parallel Processing")
+async def process():
+    loop = asyncio.get_running_loop()
+    # Executor threads don't inherit the run context
+    await asyncio.gather(
+        loop.run_in_executor(None, task_one),
+        loop.run_in_executor(None, task_two),
+    )
+```
+
+**Do** use plain asyncio tasks, where context propagates:
+
+```python
+# GOOD - Context flows through asyncio
+@botanu_use_case("Parallel Processing")
+async def process():
+    # asyncio with contextvars works correctly
+    await asyncio.gather(
+        task_one(),  # Inherits context
+        task_two(),  # Inherits context
+    )
+```
+
+### Not Extracting Context in Consumers
+
+**Don't** ignore incoming context:
+
+```python
+# BAD - Context not extracted
+def process_message(message):
+    # run_id from producer is lost
+    handle_payload(message["payload"])
+```
+
+**Do** extract and use context:
+
+```python
+# GOOD - Context continues
+def process_message(message):
+    baggage = message.get("baggage", {})
+ ctx = RunContext.from_baggage(baggage) + if ctx: + with ctx.as_current(): + handle_payload(message["payload"]) +``` + +## Data Tracking Anti-Patterns + +### Not Tracking Data Operations + +**Don't** ignore database/storage costs: + +```python +# BAD - Only LLM tracked +@botanu_use_case("Analysis") +async def analyze(): + data = await snowflake.query(expensive_query) # Not tracked! + with track_llm_call(...) as tracker: + result = await llm.complete(data) + tracker.set_tokens(...) +``` + +**Do** track all cost-generating operations: + +```python +# GOOD - Complete cost picture +@botanu_use_case("Analysis") +async def analyze(): + with track_db_operation(system="snowflake", operation="SELECT") as db: + data = await snowflake.query(expensive_query) + db.set_bytes_scanned(data.bytes_scanned) + + with track_llm_call(...) as tracker: + result = await llm.complete(data) + tracker.set_tokens(...) +``` + +### Missing Bytes for Pay-Per-Scan + +**Don't** forget bytes for warehouses: + +```python +# BAD - Missing cost driver +with track_db_operation(system="bigquery", operation="SELECT") as db: + result = await bq.query(sql) + db.set_result(rows_returned=len(result)) # Rows don't determine cost! +``` + +**Do** include bytes scanned: + +```python +# GOOD - Bytes scanned is the cost driver +with track_db_operation(system="bigquery", operation="SELECT") as db: + result = await bq.query(sql) + db.set_bytes_scanned(result.bytes_processed) + db.set_result(rows_returned=len(result)) +``` + +## Error Handling Anti-Patterns + +### Swallowing Errors + +**Don't** hide errors: + +```python +# BAD - Error hidden +with track_llm_call(...) as tracker: + try: + response = await llm.complete(...) + except Exception: + pass # Silently fails - no error recorded +``` + +**Do** record and propagate errors: + +```python +# GOOD - Error tracked and raised +with track_llm_call(...) as tracker: + try: + response = await llm.complete(...) 
+ except Exception as e: + tracker.set_error(e) + emit_outcome("failed", reason=type(e).__name__) + raise +``` + +### Ignoring Partial Successes + +**Don't** mark all-or-nothing: + +```python +# BAD - All items fail if one fails +@botanu_use_case("Batch") +async def process_batch(items): + for item in items: + await process(item) # If one fails, no outcome + emit_outcome("success", value_amount=len(items)) +``` + +**Do** track partial success: + +```python +# GOOD - Partial success recorded +@botanu_use_case("Batch") +async def process_batch(items): + processed = 0 + failed = 0 + for item in items: + try: + await process(item) + processed += 1 + except Exception: + failed += 1 + + if failed == 0: + emit_outcome("success", value_type="items_processed", value_amount=processed) + elif processed > 0: + emit_outcome("partial", value_type="items_processed", value_amount=processed, + reason=f"failed_{failed}_of_{len(items)}") + else: + emit_outcome("failed", reason="all_items_failed") +``` + +## Testing Anti-Patterns + +### Testing with Real Exporters + +**Don't** send telemetry during tests: + +```python +# BAD - Tests hit real collector +def test_workflow(): + enable(service_name="test") # Sends to real endpoint! 
+    await my_workflow()
+```
+
+**Do** use NoOp or in-memory exporters:
+
+```python
+# GOOD - Tests are isolated
+from opentelemetry import trace
+
+def setup_test():
+    trace.set_tracer_provider(trace.NoOpTracerProvider())
+
+async def test_workflow():
+    await my_workflow()  # No external calls
+```
+
+## See Also
+
+- [Best Practices](best-practices.md) - What to do
+- [Quickstart](../getting-started/quickstart.md) - Getting started guide
+- [Outcomes](../tracking/outcomes.md) - Outcome recording details
diff --git a/docs/patterns/best-practices.md b/docs/patterns/best-practices.md
new file mode 100644
index 0000000..26372d1
--- /dev/null
+++ b/docs/patterns/best-practices.md
@@ -0,0 +1,416 @@
+# Best Practices
+
+Patterns for effective cost attribution with Botanu SDK.
+
+## Run Design
+
+### One Run Per Business Outcome
+
+A run should represent a complete business transaction:
+
+```python
+# GOOD - One run for one business outcome
+@botanu_use_case("Customer Support")
+async def resolve_ticket(ticket_id: str):
+    context = await fetch_context(ticket_id)
+    response = await generate_response(context)
+    await send_response(ticket_id, response)
+    emit_outcome("success", value_type="tickets_resolved", value_amount=1)
+```
+
+```python
+# BAD - Multiple runs for one outcome
+@botanu_use_case("Fetch Context")
+async def fetch_context(ticket_id: str):
+    ...
+
+@botanu_use_case("Generate Response")  # Don't do this
+async def generate_response(context):
+    ...
+```
+
+### Use Descriptive Use Case Names
+
+Use cases appear in dashboards and queries. 
Choose names carefully: + +```python +# GOOD - Clear, descriptive names +@botanu_use_case("Customer Support") +@botanu_use_case("Document Analysis") +@botanu_use_case("Lead Qualification") + +# BAD - Generic or technical names +@botanu_use_case("HandleRequest") +@botanu_use_case("Process") +@botanu_use_case("Main") +``` + +### Include Workflow Names + +Workflow names help distinguish different paths within a use case: + +```python +@botanu_use_case("Customer Support", workflow="ticket_resolution") +async def resolve_ticket(): + ... + +@botanu_use_case("Customer Support", workflow="escalation") +async def escalate_ticket(): + ... +``` + +## Outcome Recording + +### Always Record Outcomes + +Every run should have an explicit outcome: + +```python +@botanu_use_case("Data Processing") +async def process_data(data_id: str): + try: + result = await process(data_id) + emit_outcome("success", value_type="records_processed", value_amount=result.count) + return result + except ValidationError: + emit_outcome("failed", reason="validation_error") + raise + except TimeoutError: + emit_outcome("failed", reason="timeout") + raise +``` + +### Quantify Value When Possible + +Include value amounts for better ROI analysis: + +```python +# GOOD - Quantified outcomes +emit_outcome("success", value_type="emails_sent", value_amount=50) +emit_outcome("success", value_type="revenue_generated", value_amount=1299.99) +emit_outcome("success", value_type="documents_processed", value_amount=10) + +# LESS USEFUL - No quantity +emit_outcome("success") +``` + +### Use Consistent Value Types + +Standardize your value types across the organization: + +```python +# Define standard value types +class ValueTypes: + TICKETS_RESOLVED = "tickets_resolved" + DOCUMENTS_PROCESSED = "documents_processed" + LEADS_QUALIFIED = "leads_qualified" + EMAILS_SENT = "emails_sent" + REVENUE_GENERATED = "revenue_generated" + +# Use consistently +emit_outcome("success", value_type=ValueTypes.TICKETS_RESOLVED, 
value_amount=1) +``` + +### Include Reasons for Failures + +Always explain why something failed: + +```python +emit_outcome("failed", reason="rate_limit_exceeded") +emit_outcome("failed", reason="invalid_input") +emit_outcome("failed", reason="model_unavailable") +emit_outcome("failed", reason="context_too_long") +``` + +## LLM Tracking + +### Always Record Token Usage + +Tokens are the primary cost driver for LLMs: + +```python +with track_llm_call(provider="openai", model="gpt-4") as tracker: + response = await client.chat.completions.create(...) + # Always set tokens + tracker.set_tokens( + input_tokens=response.usage.prompt_tokens, + output_tokens=response.usage.completion_tokens, + ) +``` + +### Record Provider Request IDs + +Request IDs enable reconciliation with provider invoices: + +```python +tracker.set_request_id( + provider_request_id=response.id, # From provider + client_request_id=uuid.uuid4().hex, # Your internal ID +) +``` + +### Track Retries + +Record attempt numbers for accurate cost per success: + +```python +for attempt in range(max_retries): + with track_llm_call(provider="openai", model="gpt-4") as tracker: + tracker.set_attempt(attempt + 1) + try: + response = await client.chat.completions.create(...) + break + except RateLimitError: + if attempt == max_retries - 1: + raise + await asyncio.sleep(backoff) +``` + +### Use Correct Operation Types + +Specify the operation type for accurate categorization: + +```python +from botanu.tracking.llm import track_llm_call, ModelOperation + +# Chat completion +with track_llm_call(provider="openai", model="gpt-4", operation=ModelOperation.CHAT): + ... + +# Embeddings +with track_llm_call(provider="openai", model="text-embedding-3-small", operation=ModelOperation.EMBEDDINGS): + ... 
+``` + +## Data Tracking + +### Track All Cost-Generating Operations + +Include databases, storage, and messaging: + +```python +@botanu_use_case("ETL Pipeline") +async def run_etl(): + # Track warehouse query (billed by bytes scanned) + with track_db_operation(system="snowflake", operation="SELECT") as db: + db.set_bytes_scanned(result.bytes_scanned) + db.set_query_id(result.query_id) + + # Track storage operations (billed by requests + data) + with track_storage_operation(system="s3", operation="PUT") as storage: + storage.set_result(bytes_written=len(data)) + + # Track messaging (billed by message count) + with track_messaging_operation(system="sqs", operation="publish", destination="queue") as msg: + msg.set_result(message_count=batch_size) +``` + +### Include Bytes for Pay-Per-Scan Services + +For data warehouses billed by data scanned: + +```python +with track_db_operation(system="bigquery", operation="SELECT") as db: + result = await bq_client.query(sql) + db.set_bytes_scanned(result.total_bytes_processed) + db.set_result(rows_returned=result.num_rows) +``` + +## Context Propagation + +### Use Middleware for Web Services + +Extract context from incoming requests: + +```python +from fastapi import FastAPI +from botanu.sdk.middleware import BotanuMiddleware + +app = FastAPI() +app.add_middleware(BotanuMiddleware) +``` + +### Propagate Context in Message Queues + +Inject and extract context manually for async messaging: + +```python +# Producer +def publish_message(payload): + ctx = get_current_run_context() + message = { + "payload": payload, + "baggage": ctx.to_baggage_dict() if ctx else {} + } + queue.publish(message) + +# Consumer +def process_message(message): + baggage = message.get("baggage", {}) + ctx = RunContext.from_baggage(baggage) + with ctx.as_current(): + handle_payload(message["payload"]) +``` + +### Use Lean Mode for High-Traffic Systems + +Default lean mode minimizes header overhead: + +```python +# Lean mode: ~100 bytes of baggage +# 
Propagates: run_id, use_case + +# Full mode: ~300 bytes of baggage +# Propagates: run_id, use_case, workflow, environment, tenant_id, parent_run_id +``` + +## Configuration + +### Use Environment Variables in Production + +Keep configuration out of code: + +```bash +export OTEL_SERVICE_NAME=my-service +export OTEL_EXPORTER_OTLP_ENDPOINT=http://collector:4318 +export BOTANU_ENVIRONMENT=production +``` + +### Use YAML for Complex Configuration + +For multi-environment setups: + +```yaml +# config/production.yaml +service: + name: ${OTEL_SERVICE_NAME} + environment: production + +otlp: + endpoint: ${COLLECTOR_ENDPOINT} + +propagation: + mode: lean +``` + +## Multi-Tenant Systems + +### Always Include Tenant ID + +For accurate per-tenant cost attribution, record the tenant at runtime (decorator arguments are evaluated at import time, before any request exists): + +```python +@botanu_use_case("Customer Support") +async def handle_ticket(request): + set_business_context(tenant_id=request.tenant_id) + ... +``` + +### Use Business Context + +Add additional attribution dimensions: + +```python +set_business_context( + customer_id=request.customer_id, + team="engineering", + cost_center="R&D", + region="us-west-2", +) +``` + +## Error Handling + +### Record Errors Explicitly + +Don't lose error context: + +```python +with track_llm_call(provider="openai", model="gpt-4") as tracker: + try: + response = await client.chat.completions.create(...)
+ except openai.APIError as e: + tracker.set_error(e) # Records error type and message + raise +``` + +### Emit Outcomes for Errors + +Even failed runs should have outcomes: + +```python +@botanu_use_case("Data Processing") +async def process(data_id): + try: + await process_data(data_id) + emit_outcome("success", value_type="items_processed", value_amount=1) + except ValidationError: + emit_outcome("failed", reason="validation_error") + raise + except Exception as e: + emit_outcome("failed", reason=type(e).__name__) + raise +``` + +## Performance + +### Use Async Tracking + +For async applications, ensure tracking is non-blocking: + +```python +# The SDK uses span events, not separate API calls +# This is already non-blocking +with track_llm_call(provider="openai", model="gpt-4") as tracker: + response = await async_llm_call() + tracker.set_tokens(...) # Immediate, non-blocking +``` + +### Batch Database Tracking + +For batch operations, track at batch level: + +```python +# GOOD - Batch tracking +with track_db_operation(system="postgresql", operation="INSERT") as db: + await cursor.executemany(insert_sql, batch_of_1000_rows) + db.set_result(rows_affected=1000) + +# LESS EFFICIENT - Per-row tracking +for row in batch_of_1000_rows: + with track_db_operation(system="postgresql", operation="INSERT") as db: + await cursor.execute(insert_sql, row) + db.set_result(rows_affected=1) +``` + +## Testing + +### Mock Tracing in Tests + +Use the NoOp tracer for unit tests: + +```python +from opentelemetry import trace +from opentelemetry.trace import NoOpTracerProvider + +def setup_test_tracing(): + trace.set_tracer_provider(NoOpTracerProvider()) +``` + +### Test Outcome Recording + +Verify outcomes are emitted correctly (use an async test so the coroutine can be awaited, e.g. with pytest-asyncio): + +```python +from unittest.mock import patch + +async def test_successful_outcome(): + with patch("botanu.sdk.span_helpers.emit_outcome") as mock_emit: + result = await handle_ticket("123") + mock_emit.assert_called_with("success", value_type="tickets_resolved",
value_amount=1) +``` + +## See Also + +- [Anti-Patterns](anti-patterns.md) - What to avoid +- [Architecture](../concepts/architecture.md) - SDK design principles +- [Configuration](../getting-started/configuration.md) - Configuration options diff --git a/docs/tracking/data-tracking.md b/docs/tracking/data-tracking.md new file mode 100644 index 0000000..9c066a8 --- /dev/null +++ b/docs/tracking/data-tracking.md @@ -0,0 +1,412 @@ +# Data Tracking + +Track database, storage, and messaging operations for complete cost visibility. + +## Overview + +Data operations often contribute significantly to AI workflow costs. Botanu provides tracking for: + +- **Databases** - SQL, NoSQL, data warehouses +- **Object Storage** - S3, GCS, Azure Blob +- **Messaging** - SQS, Kafka, Pub/Sub + +## Database Tracking + +### Basic Usage + +```python +from botanu.tracking.data import track_db_operation + +with track_db_operation(system="postgresql", operation="SELECT") as db: + result = await cursor.execute("SELECT * FROM users WHERE active = true") + db.set_result(rows_returned=len(result)) +``` + +### DBTracker Methods + +#### set_result() + +Record query results: + +```python +db.set_result( + rows_returned=100, # For SELECT queries + rows_affected=5, # For INSERT/UPDATE/DELETE + bytes_read=10240, # Data read + bytes_written=2048, # Data written +) +``` + +#### set_table() + +Record table information: + +```python +db.set_table("users", schema="public") +``` + +#### set_query_id() + +For data warehouses with query IDs: + +```python +db.set_query_id("01abc-def-...") +``` + +#### set_bytes_scanned() + +For pay-per-query warehouses: + +```python +db.set_bytes_scanned(1073741824) # 1 GB +``` + +#### set_error() + +Record errors (automatically called on exceptions): + +```python +db.set_error(exception) +``` + +#### add_metadata() + +Add custom attributes: + +```python +db.add_metadata( + query_type="aggregation", + cache_hit=True, +) +``` + +### Database Operations + +Use `DBOperation` 
constants: + +```python +from botanu.tracking.data import track_db_operation, DBOperation + +with track_db_operation(system="postgresql", operation=DBOperation.SELECT): + ... + +with track_db_operation(system="postgresql", operation=DBOperation.INSERT): + ... +``` + +Available operations: + +| Constant | Description | +|----------|-------------| +| `SELECT` | Read queries | +| `INSERT` | Insert data | +| `UPDATE` | Update data | +| `DELETE` | Delete data | +| `UPSERT` | Insert or update | +| `MERGE` | Merge operations | +| `CREATE` | Create tables/indexes | +| `DROP` | Drop objects | +| `ALTER` | Alter schema | +| `INDEX` | Index operations | +| `TRANSACTION` | Transaction control | +| `BATCH` | Batch operations | + +### System Normalization + +Database systems are automatically normalized: + +| Input | Normalized | +|-------|------------| +| `postgresql`, `postgres`, `pg` | `postgresql` | +| `mysql` | `mysql` | +| `mongodb`, `mongo` | `mongodb` | +| `dynamodb` | `dynamodb` | +| `redis` | `redis` | +| `elasticsearch` | `elasticsearch` | +| `snowflake` | `snowflake` | +| `bigquery` | `bigquery` | +| `redshift` | `redshift` | + +## Storage Tracking + +### Basic Usage + +```python +from botanu.tracking.data import track_storage_operation + +with track_storage_operation(system="s3", operation="PUT") as storage: + await s3_client.put_object(Bucket="my-bucket", Key="file.txt", Body=data) + storage.set_result(bytes_written=len(data)) +``` + +### StorageTracker Methods + +#### set_result() + +Record operation results: + +```python +storage.set_result( + objects_count=10, # Number of objects + bytes_read=1048576, # Data downloaded + bytes_written=2097152, # Data uploaded +) +``` + +#### set_bucket() + +Record bucket name: + +```python +storage.set_bucket("my-data-bucket") +``` + +#### set_error() + +Record errors: + +```python +storage.set_error(exception) +``` + +#### add_metadata() + +Add custom attributes: + +```python +storage.add_metadata( + storage_class="GLACIER", + 
encryption="AES256", +) +``` + +### Storage Operations + +| Constant | Description | +|----------|-------------| +| `GET` | Download object | +| `PUT` | Upload object | +| `DELETE` | Delete object | +| `LIST` | List objects | +| `HEAD` | Get metadata | +| `COPY` | Copy object | +| `MULTIPART_UPLOAD` | Multipart upload | + +### System Normalization + +| Input | Normalized | +|-------|------------| +| `s3`, `aws_s3` | `s3` | +| `gcs`, `google_cloud_storage` | `gcs` | +| `blob`, `azure_blob` | `azure_blob` | +| `minio` | `minio` | + +## Messaging Tracking + +### Basic Usage + +```python +from botanu.tracking.data import track_messaging_operation + +with track_messaging_operation(system="sqs", operation="publish", destination="my-queue") as msg: + await sqs_client.send_message(QueueUrl=queue_url, MessageBody=message) + msg.set_result(message_count=1, bytes_transferred=len(message)) +``` + +### MessagingTracker Methods + +#### set_result() + +Record operation results: + +```python +msg.set_result( + message_count=10, + bytes_transferred=4096, +) +``` + +#### set_error() + +Record errors: + +```python +msg.set_error(exception) +``` + +#### add_metadata() + +Add custom attributes: + +```python +msg.add_metadata( + message_group_id="group-1", + deduplication_id="dedup-123", +) +``` + +### Messaging Operations + +| Constant | Description | +|----------|-------------| +| `publish` | Send message | +| `consume` | Receive and process message | +| `receive` | Receive message | +| `send` | Send message (alias for publish) | +| `subscribe` | Subscribe to topic | + +### System Normalization + +| Input | Normalized | +|-------|------------| +| `sqs`, `aws_sqs` | `sqs` | +| `sns` | `sns` | +| `kinesis` | `kinesis` | +| `pubsub`, `google_pubsub` | `pubsub` | +| `kafka` | `kafka` | +| `rabbitmq` | `rabbitmq` | +| `celery` | `celery` | + +## Standalone Helpers + +### set_data_metrics() + +Set data metrics on the current span: + +```python +from botanu.tracking.data import 
set_data_metrics + +set_data_metrics( + rows_returned=100, + rows_affected=5, + bytes_read=10240, + bytes_written=2048, + objects_count=10, +) +``` + +### set_warehouse_metrics() + +For data warehouse queries: + +```python +from botanu.tracking.data import set_warehouse_metrics + +set_warehouse_metrics( + query_id="01abc-def-...", + bytes_scanned=1073741824, + rows_returned=1000, + partitions_scanned=5, +) +``` + +## Example: Complete Data Pipeline + +```python +from botanu import botanu_use_case, emit_outcome +from botanu.tracking.data import ( + track_db_operation, + track_storage_operation, + track_messaging_operation, + DBOperation, +) +from botanu.tracking.llm import track_llm_call + +@botanu_use_case("ETL Pipeline") +async def process_batch(batch_id: str): + """Complete ETL pipeline with cost tracking.""" + + # 1. Read from data warehouse + with track_db_operation(system="snowflake", operation=DBOperation.SELECT) as db: + db.set_query_id(batch_id) + rows = await snowflake_client.execute( + "SELECT * FROM raw_data WHERE batch_id = %s", + batch_id + ) + db.set_result(rows_returned=len(rows)) + db.set_bytes_scanned(rows.bytes_scanned) + + # 2. Process with LLM + processed = [] + for row in rows: + with track_llm_call(provider="openai", model="gpt-4") as llm: + result = await analyze_row(row) + llm.set_tokens(input_tokens=result.input_tokens, output_tokens=result.output_tokens) + processed.append(result) + + # 3. Write to storage + with track_storage_operation(system="s3", operation="PUT") as storage: + storage.set_bucket("processed-data") + await s3_client.put_object( + Bucket="processed-data", + Key=f"batch/{batch_id}.json", + Body=json.dumps(processed) + ) + storage.set_result(bytes_written=len(json.dumps(processed))) + + # 4. 
Write to database + with track_db_operation(system="postgresql", operation=DBOperation.INSERT) as db: + await pg_client.executemany( + "INSERT INTO processed_data VALUES (%s, %s, %s)", + [(r.id, r.result, r.score) for r in processed] + ) + db.set_result(rows_affected=len(processed)) + + # 5. Publish completion event + with track_messaging_operation(system="sqs", operation="publish", destination="batch-complete") as msg: + await sqs_client.send_message( + QueueUrl=queue_url, + MessageBody=json.dumps({"batch_id": batch_id, "count": len(processed)}) + ) + msg.set_result(message_count=1) + + emit_outcome("success", value_type="batches_processed", value_amount=1) + return processed +``` + +## Span Attributes + +### Database Spans + +| Attribute | Description | +|-----------|-------------| +| `db.system` | Database system (normalized) | +| `db.operation` | Operation type | +| `db.name` | Database name | +| `db.collection.name` | Table/collection name | +| `botanu.vendor` | Vendor for cost attribution | +| `botanu.data.rows_returned` | Rows returned | +| `botanu.data.rows_affected` | Rows modified | +| `botanu.data.bytes_read` | Bytes read | +| `botanu.data.bytes_written` | Bytes written | +| `botanu.warehouse.query_id` | Warehouse query ID | +| `botanu.warehouse.bytes_scanned` | Bytes scanned | + +### Storage Spans + +| Attribute | Description | +|-----------|-------------| +| `botanu.storage.system` | Storage system | +| `botanu.storage.operation` | Operation type | +| `botanu.storage.bucket` | Bucket name | +| `botanu.vendor` | Vendor for cost attribution | +| `botanu.data.objects_count` | Objects processed | +| `botanu.data.bytes_read` | Bytes downloaded | +| `botanu.data.bytes_written` | Bytes uploaded | + +### Messaging Spans + +| Attribute | Description | +|-----------|-------------| +| `messaging.system` | Messaging system | +| `messaging.operation` | Operation type | +| `messaging.destination.name` | Queue/topic name | +| `botanu.vendor` | Vendor for cost 
attribution | +| `botanu.messaging.message_count` | Messages processed | +| `botanu.messaging.bytes_transferred` | Bytes transferred | + +## See Also + +- [LLM Tracking](llm-tracking.md) - AI model tracking +- [Outcomes](outcomes.md) - Recording business outcomes +- [Best Practices](../patterns/best-practices.md) - Tracking best practices diff --git a/docs/tracking/llm-tracking.md b/docs/tracking/llm-tracking.md new file mode 100644 index 0000000..138cd7f --- /dev/null +++ b/docs/tracking/llm-tracking.md @@ -0,0 +1,332 @@ +# LLM Tracking + +Track AI model usage for accurate cost attribution across providers. + +## Overview + +Botanu provides LLM tracking that aligns with [OpenTelemetry GenAI Semantic Conventions](https://opentelemetry.io/docs/specs/semconv/gen-ai/). This ensures compatibility with standard observability tooling while enabling detailed cost analysis. + +## Basic Usage + +### Context Manager (Recommended) + +```python +from botanu.tracking.llm import track_llm_call + +with track_llm_call(provider="openai", model="gpt-4") as tracker: + response = await openai.chat.completions.create( + model="gpt-4", + messages=[{"role": "user", "content": "Hello"}] + ) + tracker.set_tokens( + input_tokens=response.usage.prompt_tokens, + output_tokens=response.usage.completion_tokens, + ) + tracker.set_request_id(response.id) +``` + +### What Gets Recorded + +| Attribute | Example | Description | +|-----------|---------|-------------| +| `gen_ai.operation.name` | `chat` | Type of operation | +| `gen_ai.provider.name` | `openai` | Normalized provider name | +| `gen_ai.request.model` | `gpt-4` | Requested model | +| `gen_ai.response.model` | `gpt-4-0613` | Actual model used | +| `gen_ai.usage.input_tokens` | `150` | Input/prompt tokens | +| `gen_ai.usage.output_tokens` | `200` | Output/completion tokens | +| `gen_ai.response.id` | `chatcmpl-...` | Provider request ID | + +## LLMTracker Methods + +### set_tokens() + +Record token usage from the response: + +```python 
+tracker.set_tokens( + input_tokens=150, + output_tokens=200, + cached_tokens=50, # For providers with caching + cache_read_tokens=50, # Anthropic-style cache read + cache_write_tokens=100, # Anthropic-style cache write +) +``` + +### set_request_id() + +Record provider and client request IDs for billing reconciliation: + +```python +tracker.set_request_id( + provider_request_id=response.id, # From provider response + client_request_id="my-client-123", # Your tracking ID +) +``` + +### set_response_model() + +When the response uses a different model than requested: + +```python +tracker.set_response_model("gpt-4-0613") +``` + +### set_request_params() + +Record request parameters for analysis: + +```python +tracker.set_request_params( + temperature=0.7, + top_p=0.9, + max_tokens=1000, + stop_sequences=["END"], + frequency_penalty=0.5, + presence_penalty=0.3, +) +``` + +### set_streaming() + +Mark as a streaming request: + +```python +tracker.set_streaming(True) +``` + +### set_cache_hit() + +Mark as a cache hit (for semantic caching): + +```python +tracker.set_cache_hit(True) +``` + +### set_attempt() + +Track retry attempts: + +```python +tracker.set_attempt(2) # Second attempt +``` + +### set_finish_reason() + +Record the stop reason: + +```python +tracker.set_finish_reason("stop") # or "length", "content_filter", etc. +``` + +### set_error() + +Record errors (automatically called on exceptions): + +```python +try: + response = await client.chat(...) +except openai.RateLimitError as e: + tracker.set_error(e) + raise +``` + +### add_metadata() + +Add custom attributes: + +```python +tracker.add_metadata( + prompt_version="v2.1", + experiment_id="exp-123", +) +``` + +## Operation Types + +Use `ModelOperation` constants for the `operation` parameter: + +```python +from botanu.tracking.llm import track_llm_call, ModelOperation + +# Chat completion +with track_llm_call(provider="openai", model="gpt-4", operation=ModelOperation.CHAT): + ... 
+ +# Embeddings +with track_llm_call(provider="openai", model="text-embedding-3-small", operation=ModelOperation.EMBEDDINGS): + ... + +# Text completion (legacy) +with track_llm_call(provider="openai", model="davinci", operation=ModelOperation.TEXT_COMPLETION): + ... +``` + +Available operations: + +| Constant | Value | Use Case | +|----------|-------|----------| +| `CHAT` | `chat` | Chat completions (default) | +| `TEXT_COMPLETION` | `text_completion` | Legacy completions | +| `EMBEDDINGS` | `embeddings` | Embedding generation | +| `GENERATE_CONTENT` | `generate_content` | Generic content generation | +| `EXECUTE_TOOL` | `execute_tool` | Tool/function execution | +| `CREATE_AGENT` | `create_agent` | Agent creation | +| `INVOKE_AGENT` | `invoke_agent` | Agent invocation | +| `RERANK` | `rerank` | Reranking | +| `IMAGE_GENERATION` | `image_generation` | Image generation | +| `SPEECH_TO_TEXT` | `speech_to_text` | Transcription | +| `TEXT_TO_SPEECH` | `text_to_speech` | Speech synthesis | + +## Provider Normalization + +Provider names are automatically normalized: + +| Input | Normalized | +|-------|------------| +| `openai`, `OpenAI` | `openai` | +| `azure_openai`, `azure-openai` | `azure.openai` | +| `anthropic`, `claude` | `anthropic` | +| `bedrock`, `aws_bedrock` | `aws.bedrock` | +| `vertex`, `vertexai`, `gemini` | `gcp.vertex_ai` | +| `cohere` | `cohere` | +| `mistral`, `mistralai` | `mistral` | +| `together`, `togetherai` | `together` | +| `groq` | `groq` | + +## Tool/Function Tracking + +Track tool calls triggered by LLMs: + +```python +from botanu.tracking.llm import track_tool_call + +with track_tool_call(tool_name="search_database", tool_call_id="call_abc123") as tool: + results = await search_database(query) + tool.set_result( + success=True, + items_returned=len(results), + bytes_processed=1024, + ) +``` + +### ToolTracker Methods + +```python +# Set execution result +tool.set_result( + success=True, + items_returned=10, + bytes_processed=2048, +) + +# 
Set tool call ID from LLM response +tool.set_tool_call_id("call_abc123") + +# Record error +tool.set_error(exception) + +# Add custom metadata +tool.add_metadata(query_type="semantic") +``` + +## Standalone Helpers + +For cases where you can't use context managers: + +### set_llm_attributes() + +```python +from botanu.tracking.llm import set_llm_attributes + +set_llm_attributes( + provider="openai", + model="gpt-4", + operation="chat", + input_tokens=150, + output_tokens=200, + streaming=True, + provider_request_id="chatcmpl-...", +) +``` + +### set_token_usage() + +```python +from botanu.tracking.llm import set_token_usage + +set_token_usage( + input_tokens=150, + output_tokens=200, + cached_tokens=50, +) +``` + +## Decorator for Auto-Instrumentation + +For wrapping existing client methods: + +```python +from botanu.tracking.llm import llm_instrumented + +class MyOpenAIClient: + @llm_instrumented(provider="openai", tokens_from_response=True) + def chat(self, model: str, messages: list): + return openai.chat.completions.create(model=model, messages=messages) +``` + +## Metrics + +The SDK automatically records these metrics: + +| Metric | Type | Description | +|--------|------|-------------| +| `gen_ai.client.token.usage` | Histogram | Token counts by type | +| `gen_ai.client.operation.duration` | Histogram | Operation duration in seconds | +| `botanu.gen_ai.attempts` | Counter | Request attempts (including retries) | + +## Example: Multi-Provider Workflow + +```python +from botanu import botanu_use_case, emit_outcome +from botanu.tracking.llm import track_llm_call + +@botanu_use_case("Document Analysis") +async def analyze_with_fallback(document: str): + """Try Claude first, fall back to GPT-4.""" + + try: + with track_llm_call(provider="anthropic", model="claude-3-opus") as tracker: + tracker.set_attempt(1) + response = await anthropic_client.messages.create( + model="claude-3-opus-20240229", + messages=[{"role": "user", "content": document}] + ) + 
tracker.set_tokens( + input_tokens=response.usage.input_tokens, + output_tokens=response.usage.output_tokens, + ) + emit_outcome("success", value_type="analyses_completed", value_amount=1) + return response.content[0].text + + except anthropic.RateLimitError: + # Fallback to OpenAI + with track_llm_call(provider="openai", model="gpt-4") as tracker: + tracker.set_attempt(2) + response = await openai_client.chat.completions.create( + model="gpt-4", + messages=[{"role": "user", "content": document}] + ) + tracker.set_tokens( + input_tokens=response.usage.prompt_tokens, + output_tokens=response.usage.completion_tokens, + ) + emit_outcome("success", value_type="analyses_completed", value_amount=1) + return response.choices[0].message.content +``` + +## See Also + +- [Auto-Instrumentation](../integration/auto-instrumentation.md) - Automatic LLM tracking +- [Data Tracking](data-tracking.md) - Database and storage tracking +- [Outcomes](outcomes.md) - Recording business outcomes diff --git a/docs/tracking/outcomes.md b/docs/tracking/outcomes.md new file mode 100644 index 0000000..0e974ae --- /dev/null +++ b/docs/tracking/outcomes.md @@ -0,0 +1,363 @@ +# Outcomes + +Record business outcomes to enable cost-per-outcome analysis. + +## Overview + +Outcomes connect infrastructure costs to business value. By recording what was achieved (tickets resolved, documents processed, leads qualified), you can calculate the true ROI of your AI workflows. + +## Basic Usage + +```python +from botanu import botanu_use_case, emit_outcome + +@botanu_use_case("Customer Support") +async def handle_ticket(ticket_id: str): + # ... process ticket ... 
+ + # Record the business outcome + emit_outcome("success", value_type="tickets_resolved", value_amount=1) +``` + +## emit_outcome() Parameters + +```python +emit_outcome( + status: str, # Required: "success", "partial", "failed" + value_type: str = None, # What was achieved + value_amount: float = None, # How much + confidence: float = None, # Confidence score (0.0-1.0) + reason: str = None, # Why (especially for failures) +) +``` + +### status + +The outcome status: + +| Status | Description | Use Case | +|--------|-------------|----------| +| `success` | Fully achieved goal | Ticket resolved, document processed | +| `partial` | Partially achieved | 3 of 5 items processed | +| `failed` | Did not achieve goal | Error, timeout, rejection | + +### value_type + +A descriptive label for what was achieved: + +```python +emit_outcome("success", value_type="tickets_resolved", value_amount=1) +emit_outcome("success", value_type="documents_processed", value_amount=5) +emit_outcome("success", value_type="leads_qualified", value_amount=1) +emit_outcome("success", value_type="revenue_generated", value_amount=499.99) +``` + +### value_amount + +The quantified value: + +```python +# Count +emit_outcome("success", value_type="emails_sent", value_amount=100) + +# Revenue +emit_outcome("success", value_type="order_value", value_amount=1299.99) + +# Score +emit_outcome("success", value_type="satisfaction_score", value_amount=4.5) +``` + +### confidence + +For probabilistic outcomes: + +```python +emit_outcome( + "success", + value_type="intent_classified", + value_amount=1, + confidence=0.92, +) +``` + +### reason + +Explain the outcome (especially for failures): + +```python +emit_outcome("failed", reason="rate_limit_exceeded") +emit_outcome("failed", reason="invalid_input") +emit_outcome("partial", reason="timeout_partial_results", value_amount=3) +``` + +## Outcome Patterns + +### Success with Value + +```python +@botanu_use_case("Order Processing") +async def 
process_order(order_id: str): + order = await fetch_order(order_id) + await fulfill_order(order) + + emit_outcome( + "success", + value_type="orders_fulfilled", + value_amount=1, + ) +``` + +### Success with Revenue + +```python +@botanu_use_case("Sales Bot") +async def handle_inquiry(inquiry_id: str): + result = await process_sale(inquiry_id) + + if result.sale_completed: + emit_outcome( + "success", + value_type="revenue_generated", + value_amount=result.order_total, + ) + else: + emit_outcome( + "partial", + value_type="leads_qualified", + value_amount=1, + ) +``` + +### Partial Success + +```python +@botanu_use_case("Batch Processing") +async def process_batch(items: list): + processed = 0 + for item in items: + try: + await process_item(item) + processed += 1 + except Exception: + continue + + if processed == len(items): + emit_outcome("success", value_type="items_processed", value_amount=processed) + elif processed > 0: + emit_outcome( + "partial", + value_type="items_processed", + value_amount=processed, + reason=f"processed_{processed}_of_{len(items)}", + ) + else: + emit_outcome("failed", reason="no_items_processed") +``` + +### Failure with Reason + +```python +@botanu_use_case("Document Analysis") +async def analyze_document(doc_id: str): + try: + document = await fetch_document(doc_id) + if not document: + emit_outcome("failed", reason="document_not_found") + return None + + result = await analyze(document) + emit_outcome("success", value_type="documents_analyzed", value_amount=1) + return result + + except RateLimitError: + emit_outcome("failed", reason="rate_limit_exceeded") + raise + except TimeoutError: + emit_outcome("failed", reason="analysis_timeout") + raise +``` + +### Classification with Confidence + +```python +@botanu_use_case("Intent Classification") +async def classify_intent(message: str): + result = await classifier.predict(message) + + emit_outcome( + "success", + value_type="intents_classified", + value_amount=1, + 
confidence=result.confidence, + ) + + return result.intent +``` + +## Automatic Outcomes + +The `@botanu_use_case` decorator automatically emits outcomes: + +```python +@botanu_use_case("My Use Case", auto_outcome_on_success=True) # Default +async def my_function(): + # If no exception and no explicit emit_outcome, emits "success" + return result +``` + +If an exception is raised, it automatically emits `"failed"` with the exception class as the reason. + +To disable: + +```python +@botanu_use_case("My Use Case", auto_outcome_on_success=False) +async def my_function(): + # Must call emit_outcome explicitly + emit_outcome("success") +``` + +## @botanu_outcome Decorator + +For sub-functions within a use case: + +```python +from botanu import botanu_use_case, botanu_outcome + +@botanu_use_case("Data Pipeline") +async def run_pipeline(): + await step_one() + await step_two() + +@botanu_outcome() +async def step_one(): + # Emits "success" on completion, "failed" on exception + await process_data() + +@botanu_outcome(success="data_extracted", failed="extraction_failed") +async def step_two(): + # Custom outcome labels + await extract_data() +``` + +## Span Attributes + +Outcomes are recorded as span attributes: + +| Attribute | Description | +|-----------|-------------| +| `botanu.outcome` | Status (success/partial/failed) | +| `botanu.outcome.value_type` | What was achieved | +| `botanu.outcome.value_amount` | Quantified value | +| `botanu.outcome.confidence` | Confidence score | +| `botanu.outcome.reason` | Reason for outcome | + +## Span Events + +An event is also emitted for timeline visibility: + +```python +# Event: botanu.outcome_emitted +# Attributes: +# status: "success" +# value_type: "tickets_resolved" +# value_amount: 1 +``` + +## Cost-Per-Outcome Analysis + +With outcomes recorded, you can calculate: + +```sql +-- Cost per successful ticket resolution +SELECT + AVG(total_cost) as avg_cost_per_resolution +FROM runs +WHERE use_case = 'Customer Support' + AND 
outcome_status = 'success' + AND outcome_value_type = 'tickets_resolved'; + +-- ROI by use case +SELECT + use_case, + SUM(outcome_value_amount * value_per_unit) as total_value, + SUM(total_cost) as total_cost, + (SUM(outcome_value_amount * value_per_unit) - SUM(total_cost)) / SUM(total_cost) as roi +FROM runs +GROUP BY use_case; +``` + +## Best Practices + +### 1. Always Record Outcomes + +Every use case should emit an outcome: + +```python +@botanu_use_case("My Use Case") +async def my_function(): + try: + result = await do_work() + emit_outcome("success", value_type="items_processed", value_amount=result.count) + return result + except Exception as e: + emit_outcome("failed", reason=type(e).__name__) + raise +``` + +### 2. Use Consistent Value Types + +Define standard value types for your organization: + +```python +# Good - consistent naming +emit_outcome("success", value_type="tickets_resolved", value_amount=1) +emit_outcome("success", value_type="documents_processed", value_amount=1) + +# Bad - inconsistent +emit_outcome("success", value_type="ticket_done", value_amount=1) +emit_outcome("success", value_type="doc processed", value_amount=1) +``` + +### 3. Quantify When Possible + +Include amounts for better analysis: + +```python +# Good - quantified +emit_outcome("success", value_type="emails_sent", value_amount=50) + +# Less useful - no amount +emit_outcome("success") +``` + +### 4. Include Reasons for Failures + +Always explain why something failed: + +```python +emit_outcome("failed", reason="api_rate_limit") +emit_outcome("failed", reason="invalid_input_format") +emit_outcome("failed", reason="model_unavailable") +``` + +### 5. 
One Outcome Per Run + +Emit only one outcome per use case execution: + +```python +@botanu_use_case("Process Items") +async def process_items(items): + successful = 0 + for item in items: + if await process(item): + successful += 1 + + # One outcome at the end + emit_outcome("success", value_type="items_processed", value_amount=successful) +``` + +## See Also + +- [Run Context](../concepts/run-context.md) - Understanding runs +- [LLM Tracking](llm-tracking.md) - Tracking LLM costs +- [Best Practices](../patterns/best-practices.md) - More patterns diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..93be461 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,220 @@ +# SPDX-FileCopyrightText: 2026 The Botanu Authors +# SPDX-License-Identifier: Apache-2.0 + +[build-system] +requires = ["hatchling", "hatch-vcs"] +build-backend = "hatchling.build" + +# --------------------------------------------------------------------------- +# Project metadata (PEP 621) +# --------------------------------------------------------------------------- +[project] +name = "botanu" +dynamic = ["version"] +description = "OpenTelemetry-native run-level cost attribution for AI workflows" +readme = "README.md" +license = "Apache-2.0" +requires-python = ">=3.9" +authors = [ + { name = "The Botanu Authors", email = "oss@botanu.ai" }, +] +keywords = [ + "opentelemetry", + "tracing", + "observability", + "ai", + "llm", + "cost-attribution", + "mlops", +] +classifiers = [ + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Topic :: Software Development :: Libraries :: Python Modules", + "Topic :: System :: Monitoring", + "Typing :: Typed", +] 
+ +# Core dependency — opentelemetry-api only (~50 KB, zero transitive deps). +# Everything else is behind optional extras so adopters never pay for what +# they don't use. +dependencies = [ + "opentelemetry-api >= 1.20.0", +] + +[project.urls] +Homepage = "https://github.com/botanu-ai/botanu-sdk-python" +Documentation = "https://docs.botanu.ai" +Repository = "https://github.com/botanu-ai/botanu-sdk-python" +Changelog = "https://github.com/botanu-ai/botanu-sdk-python/blob/main/CHANGELOG.md" +Issues = "https://github.com/botanu-ai/botanu-sdk-python/issues" + +# --------------------------------------------------------------------------- +# Optional extras +# --------------------------------------------------------------------------- +[project.optional-dependencies] +# Full OTel SDK + OTLP exporter — needed only when running standalone +# (no pre-existing TracerProvider from Datadog / Splunk / etc.) +sdk = [ + "opentelemetry-sdk >= 1.20.0", + "opentelemetry-exporter-otlp-proto-http >= 1.20.0", +] + +# Auto-instrumentation libraries for common frameworks +instruments = [ + "opentelemetry-instrumentation >= 0.41b0", + "opentelemetry-instrumentation-fastapi >= 0.41b0", + "opentelemetry-instrumentation-requests >= 0.41b0", + "opentelemetry-instrumentation-httpx >= 0.41b0", + "opentelemetry-instrumentation-flask >= 0.41b0", + "opentelemetry-instrumentation-django >= 0.41b0", + "opentelemetry-instrumentation-urllib3 >= 0.41b0", + "opentelemetry-instrumentation-starlette >= 0.41b0", + "opentelemetry-instrumentation-sqlalchemy >= 0.41b0", + "opentelemetry-instrumentation-redis >= 0.41b0", + "opentelemetry-instrumentation-celery >= 0.41b0", + "opentelemetry-instrumentation-grpc >= 0.41b0", + "opentelemetry-instrumentation-logging >= 0.41b0", +] + +# GenAI / AI model auto-instrumentation +genai = [ + "opentelemetry-instrumentation-openai-v2 >= 2.0b0", + "opentelemetry-instrumentation-anthropic >= 0.1b0", + "opentelemetry-instrumentation-vertexai >= 0.1b0", + 
"opentelemetry-instrumentation-google-genai >= 0.1b0", + "opentelemetry-instrumentation-langchain >= 0.1b0", +] + +# Cross-service carrier propagation (SQS, Kafka, Celery, Redis) +carriers = [ + "celery >= 5.0.0", + "aiokafka >= 0.8.0", +] + +# Everything +all = [ + "botanu[sdk,instruments,genai,carriers]", +] + +# Development / CI +dev = [ + "botanu[all]", + "pytest >= 7.4.0", + "pytest-asyncio >= 0.21.0", + "pytest-cov >= 4.1.0", + "coverage[toml] >= 7.0", + "httpx >= 0.24.0", + "ruff >= 0.4.0", + "mypy >= 1.7.0", + "pre-commit >= 3.5.0", +] + +# --------------------------------------------------------------------------- +# Hatch — build targets & versioning +# --------------------------------------------------------------------------- +[tool.hatch.version] +source = "vcs" + +[tool.hatch.version.raw-options] +version_scheme = "guess-next-dev" +local_scheme = "no-local-version" + +[tool.hatch.build.targets.sdist] +include = ["src/botanu/**", "LICENSE", "NOTICE", "README.md"] + +[tool.hatch.build.targets.wheel] +packages = ["src/botanu"] + +# --------------------------------------------------------------------------- +# Ruff (linter + formatter) +# --------------------------------------------------------------------------- +[tool.ruff] +line-length = 120 +target-version = "py39" +src = ["src"] + +[tool.ruff.lint] +select = [ + "E", # pycodestyle errors + "W", # pycodestyle warnings + "F", # pyflakes + "I", # isort + "B", # flake8-bugbear + "UP", # pyupgrade + "S", # flake8-bandit (security) + "RUF", # ruff-specific +] +ignore = [ + "E501", # line too long — handled by formatter + "S101", # assert in tests is fine + "S110", # try-except-pass is intentional in resource detection + "UP006", # dict vs Dict — keep Dict[] for 3.9 compat + "UP007", # X | Y syntax — keep Optional[] for 3.9 compat + "UP035", # typing.Dict deprecated — keep for 3.9 compat + "UP045", # X | None vs Optional — keep Optional[] for 3.9 compat + "RUF002", # ambiguous dash — intentional in 
docstrings + "RUF022", # __all__ not sorted — grouped logically +] + +[tool.ruff.lint.per-file-ignores] +"tests/**" = ["S101", "S106"] + +[tool.ruff.format] +quote-style = "double" +indent-style = "space" +line-ending = "auto" + +# --------------------------------------------------------------------------- +# mypy +# --------------------------------------------------------------------------- +[tool.mypy] +python_version = "3.9" +warn_return_any = false +warn_unused_configs = true +ignore_missing_imports = true +strict = false +# OTel SDK types are not always precise; runtime behavior is correct +disable_error_code = ["arg-type", "attr-defined", "operator", "misc"] + +# --------------------------------------------------------------------------- +# pytest +# --------------------------------------------------------------------------- +[tool.pytest.ini_options] +asyncio_mode = "auto" +testpaths = ["tests"] +addopts = [ + "--strict-markers", + "--tb=short", +] +markers = [ + "integration: marks tests that require external services", +] + +# --------------------------------------------------------------------------- +# coverage +# --------------------------------------------------------------------------- +[tool.coverage.run] +source = ["botanu"] +branch = true + +[tool.coverage.report] +show_missing = true +fail_under = 70 +exclude_lines = [ + "pragma: no cover", + "if TYPE_CHECKING:", + "if __name__ == .__main__.", +] +# Exclude integration-heavy modules that require full OTel SDK setup +omit = [ + "src/botanu/sdk/bootstrap.py", + "src/botanu/sdk/middleware.py", +] diff --git a/src/botanu/__init__.py b/src/botanu/__init__.py new file mode 100644 index 0000000..2ccf3d8 --- /dev/null +++ b/src/botanu/__init__.py @@ -0,0 +1,76 @@ +# SPDX-FileCopyrightText: 2026 The Botanu Authors +# SPDX-License-Identifier: Apache-2.0 + +"""Botanu SDK - OpenTelemetry-native cost attribution for AI workflows. 
+ +Quick Start:: + + from botanu import enable, botanu_use_case, emit_outcome + + enable(service_name="my-app") + + @botanu_use_case(name="Customer Support") + async def handle_request(data): + result = await process(data) + emit_outcome("success", value_type="tickets_resolved", value_amount=1) + return result +""" + +from __future__ import annotations + +from botanu._version import __version__ + +# Run context model +from botanu.models.run_context import RunContext, RunOutcome, RunStatus + +# Bootstrap +from botanu.sdk.bootstrap import ( + disable, + enable, + is_enabled, +) + +# Configuration +from botanu.sdk.config import BotanuConfig + +# Context helpers (core — no SDK dependency) +from botanu.sdk.context import ( + get_baggage, + get_current_span, + get_run_id, + get_use_case, + set_baggage, +) + +# Decorators (primary integration point) +from botanu.sdk.decorators import botanu_outcome, botanu_use_case, use_case + +# Span helpers +from botanu.sdk.span_helpers import emit_outcome, set_business_context + +__all__ = [ + "__version__", + # Bootstrap + "enable", + "disable", + "is_enabled", + # Configuration + "BotanuConfig", + # Decorators + "botanu_use_case", + "use_case", + "botanu_outcome", + # Span helpers + "emit_outcome", + "set_business_context", + "get_current_span", + # Context + "get_run_id", + "get_use_case", + "set_baggage", + "get_baggage", + # Run context + "RunContext", + "RunStatus", + "RunOutcome", +] diff --git a/src/botanu/_version.py b/src/botanu/_version.py new file mode 100644 index 0000000..e7fea48 --- /dev/null +++ b/src/botanu/_version.py @@ -0,0 +1,13 @@ +# SPDX-FileCopyrightText: 2026 The Botanu Authors +# SPDX-License-Identifier: Apache-2.0 + +"""Dynamic version from package metadata (set by hatch-vcs at build time).""" + +from __future__ import annotations + +try: + from importlib.metadata import version + + __version__: str = version("botanu") +except Exception: + __version__ = "0.0.0.dev0" diff --git a/src/botanu/models/__init__.py 
def generate_run_id() -> str:
    """Generate a UUIDv7-style, time-sortable run ID.

    The first 48 bits encode the current Unix time in milliseconds, so IDs
    sort chronologically; the remaining bits are random. The version nibble
    is set to 7 and the variant bits to the RFC 4122 ``10`` pattern, so the
    result is a syntactically valid UUID string.
    """
    now_ms = int(time.time() * 1000)

    # 48-bit big-endian millisecond timestamp, then 80 bits of randomness.
    raw = bytearray(now_ms.to_bytes(6, "big"))
    rand = os.urandom(10)
    raw.append(0x70 | (rand[0] & 0x0F))  # version 7 in the high nibble
    raw.append(rand[1])
    raw.append(0x80 | (rand[2] & 0x3F))  # RFC 4122 variant bits "10"
    raw.extend(rand[3:])

    digits = raw.hex()
    return "-".join(
        (digits[:8], digits[8:12], digits[12:16], digits[16:20], digits[20:])
    )
+ """ + + run_id: str + use_case: str + environment: str + workflow: Optional[str] = None + workflow_version: Optional[str] = None + tenant_id: Optional[str] = None + parent_run_id: Optional[str] = None + root_run_id: Optional[str] = None + attempt: int = 1 + retry_of_run_id: Optional[str] = None + start_time: datetime = field(default_factory=lambda: datetime.now(timezone.utc)) + deadline: Optional[float] = None + cancelled: bool = False + cancelled_at: Optional[float] = None + outcome: Optional[RunOutcome] = None + + def __post_init__(self) -> None: + if self.root_run_id is None: + object.__setattr__(self, "root_run_id", self.run_id) + + # ------------------------------------------------------------------ + # Factory + # ------------------------------------------------------------------ + + @classmethod + def create( + cls, + use_case: str, + workflow: Optional[str] = None, + workflow_version: Optional[str] = None, + environment: Optional[str] = None, + tenant_id: Optional[str] = None, + parent_run_id: Optional[str] = None, + root_run_id: Optional[str] = None, + attempt: int = 1, + retry_of_run_id: Optional[str] = None, + deadline_seconds: Optional[float] = None, + ) -> RunContext: + """Create a new RunContext with auto-generated run_id.""" + env = environment or os.getenv("BOTANU_ENVIRONMENT") or os.getenv("DEPLOYMENT_ENVIRONMENT") or "production" + run_id = generate_run_id() + deadline = None + if deadline_seconds is not None: + deadline = time.time() + deadline_seconds + + return cls( + run_id=run_id, + use_case=use_case, + environment=env, + workflow=workflow, + workflow_version=workflow_version, + tenant_id=tenant_id, + parent_run_id=parent_run_id, + root_run_id=root_run_id or run_id, + attempt=attempt, + retry_of_run_id=retry_of_run_id, + deadline=deadline, + ) + + @classmethod + def create_retry(cls, previous: RunContext) -> RunContext: + """Create a new RunContext for a retry attempt.""" + return cls.create( + use_case=previous.use_case, + 
workflow=previous.workflow, + workflow_version=previous.workflow_version, + environment=previous.environment, + tenant_id=previous.tenant_id, + parent_run_id=previous.parent_run_id, + root_run_id=previous.root_run_id, + attempt=previous.attempt + 1, + retry_of_run_id=previous.run_id, + ) + + # ------------------------------------------------------------------ + # Lifecycle + # ------------------------------------------------------------------ + + def is_past_deadline(self) -> bool: + if self.deadline is None: + return False + return time.time() > self.deadline + + def is_cancelled(self) -> bool: + return self.cancelled or self.is_past_deadline() + + def request_cancellation(self, reason: str = "user") -> None: + self.cancelled = True + self.cancelled_at = time.time() + + def remaining_time_seconds(self) -> Optional[float]: + if self.deadline is None: + return None + return max(0.0, self.deadline - time.time()) + + def complete( + self, + status: RunStatus, + reason_code: Optional[str] = None, + error_class: Optional[str] = None, + value_type: Optional[str] = None, + value_amount: Optional[float] = None, + confidence: Optional[float] = None, + ) -> None: + self.outcome = RunOutcome( + status=status, + reason_code=reason_code, + error_class=error_class, + value_type=value_type, + value_amount=value_amount, + confidence=confidence, + ) + + @property + def duration_ms(self) -> Optional[float]: + if self.outcome is None: + return None + return (datetime.now(timezone.utc) - self.start_time).total_seconds() * 1000 + + # ------------------------------------------------------------------ + # Serialisation + # ------------------------------------------------------------------ + + def to_baggage_dict(self, lean_mode: Optional[bool] = None) -> Dict[str, str]: + """Convert to dict for W3C Baggage propagation.""" + if lean_mode is None: + env_mode = os.getenv("BOTANU_PROPAGATION_MODE", "lean") + lean_mode = env_mode != "full" + + baggage: Dict[str, str] = { + "botanu.run_id": 
self.run_id, + "botanu.use_case": self.use_case, + } + if lean_mode: + return baggage + + baggage["botanu.environment"] = self.environment + if self.workflow: + baggage["botanu.workflow"] = self.workflow + if self.tenant_id: + baggage["botanu.tenant_id"] = self.tenant_id + if self.parent_run_id: + baggage["botanu.parent_run_id"] = self.parent_run_id + if self.root_run_id and self.root_run_id != self.run_id: + baggage["botanu.root_run_id"] = self.root_run_id + if self.attempt > 1: + baggage["botanu.attempt"] = str(self.attempt) + if self.retry_of_run_id: + baggage["botanu.retry_of_run_id"] = self.retry_of_run_id + if self.deadline is not None: + baggage["botanu.deadline"] = str(int(self.deadline * 1000)) + if self.cancelled: + baggage["botanu.cancelled"] = "true" + return baggage + + def to_span_attributes(self) -> Dict[str, Union[str, float, int, bool]]: + """Convert to dict for span attributes.""" + attrs: Dict[str, Union[str, float, int, bool]] = { + "botanu.run_id": self.run_id, + "botanu.use_case": self.use_case, + "botanu.environment": self.environment, + "botanu.run.start_time": self.start_time.isoformat(), + } + if self.workflow: + attrs["botanu.workflow"] = self.workflow + if self.workflow_version: + attrs["botanu.workflow.version"] = self.workflow_version + if self.tenant_id: + attrs["botanu.tenant_id"] = self.tenant_id + if self.parent_run_id: + attrs["botanu.parent_run_id"] = self.parent_run_id + attrs["botanu.root_run_id"] = self.root_run_id or self.run_id + attrs["botanu.attempt"] = self.attempt + if self.retry_of_run_id: + attrs["botanu.retry_of_run_id"] = self.retry_of_run_id + if self.deadline is not None: + attrs["botanu.run.deadline_ts"] = self.deadline + if self.cancelled: + attrs["botanu.run.cancelled"] = True + if self.cancelled_at: + attrs["botanu.run.cancelled_at"] = self.cancelled_at + if self.outcome: + attrs["botanu.outcome.status"] = self.outcome.status.value + if self.outcome.reason_code: + attrs["botanu.outcome.reason_code"] = 
self.outcome.reason_code + if self.outcome.error_class: + attrs["botanu.outcome.error_class"] = self.outcome.error_class + if self.outcome.value_type: + attrs["botanu.outcome.value_type"] = self.outcome.value_type + if self.outcome.value_amount is not None: + attrs["botanu.outcome.value_amount"] = self.outcome.value_amount + if self.outcome.confidence is not None: + attrs["botanu.outcome.confidence"] = self.outcome.confidence + if self.duration_ms is not None: + attrs["botanu.run.duration_ms"] = self.duration_ms + return attrs + + @classmethod + def from_baggage(cls, baggage: Dict[str, str]) -> Optional[RunContext]: + """Reconstruct RunContext from baggage dict.""" + run_id = baggage.get("botanu.run_id") + use_case = baggage.get("botanu.use_case") + if not run_id or not use_case: + return None + + attempt_str = baggage.get("botanu.attempt", "1") + try: + attempt = int(attempt_str) + except ValueError: + attempt = 1 + + deadline: Optional[float] = None + deadline_str = baggage.get("botanu.deadline") + if deadline_str: + try: + deadline = float(deadline_str) / 1000.0 + except ValueError: + pass + + cancelled = baggage.get("botanu.cancelled", "").lower() == "true" + + return cls( + run_id=run_id, + use_case=use_case, + environment=baggage.get("botanu.environment", "unknown"), + workflow=baggage.get("botanu.workflow"), + tenant_id=baggage.get("botanu.tenant_id"), + parent_run_id=baggage.get("botanu.parent_run_id"), + root_run_id=baggage.get("botanu.root_run_id") or run_id, + attempt=attempt, + retry_of_run_id=baggage.get("botanu.retry_of_run_id"), + deadline=deadline, + cancelled=cancelled, + ) diff --git a/src/botanu/processors/__init__.py b/src/botanu/processors/__init__.py new file mode 100644 index 0000000..680a413 --- /dev/null +++ b/src/botanu/processors/__init__.py @@ -0,0 +1,12 @@ +# SPDX-FileCopyrightText: 2026 The Botanu Authors +# SPDX-License-Identifier: Apache-2.0 + +"""Botanu span processors. + +Only :class:`RunContextEnricher` is needed in the SDK. 
class RunContextEnricher(SpanProcessor):
    """Span processor that copies Botanu run context from baggage onto spans.

    Every span started in the process — including auto-instrumented ones —
    receives the ``botanu.run_id`` / ``botanu.use_case`` attributes, so cost
    attribution works beyond the root ``botanu.run`` span.

    Baggage is process-local and never leaves the SDK, which is why this
    enrichment cannot be done in the collector. In ``lean_mode`` (the
    default) only ``run_id`` and ``use_case`` are copied, keeping the
    per-span overhead minimal.
    """

    BAGGAGE_KEYS_FULL: ClassVar[List[str]] = [
        "botanu.run_id",
        "botanu.use_case",
        "botanu.workflow",
        "botanu.environment",
        "botanu.tenant_id",
        "botanu.parent_run_id",
    ]

    BAGGAGE_KEYS_LEAN: ClassVar[List[str]] = [
        "botanu.run_id",
        "botanu.use_case",
    ]

    def __init__(self, lean_mode: bool = True) -> None:
        self._lean_mode = lean_mode
        if lean_mode:
            self._baggage_keys = self.BAGGAGE_KEYS_LEAN
        else:
            self._baggage_keys = self.BAGGAGE_KEYS_FULL

    def on_start(
        self,
        span: Span,
        parent_context: Optional[context.Context] = None,
    ) -> None:
        """Copy the configured baggage entries onto a newly started span."""
        active_ctx = parent_context or context.get_current()
        for key in self._baggage_keys:
            entry = baggage.get_baggage(key, active_ctx)
            if not entry:
                continue
            # Don't clobber an attribute the instrumentation already set.
            if span.attributes and key in span.attributes:
                continue
            span.set_attribute(key, entry)

    def on_end(self, span: ReadableSpan) -> None:
        pass

    def shutdown(self) -> None:
        pass

    def force_flush(self, timeout_millis: int = 30000) -> bool:
        return True
+ +Detects attributes from: +- Kubernetes (``k8s.*``) +- Cloud providers (``cloud.*``, ``aws.*``, ``gcp.*``, ``azure.*``) +- Host / VM (``host.*``, ``os.*``) +- Container (``container.*``) +- Serverless / FaaS (``faas.*``) +- Process (``process.*``) +""" + +from __future__ import annotations + +import os +import platform +import socket +import sys +from functools import lru_cache +from typing import Any, Dict, Optional + +# ========================================================================= +# Environment Variable Mappings +# ========================================================================= + +K8S_ENV_MAPPINGS: Dict[str, Optional[str]] = { + "KUBERNETES_SERVICE_HOST": None, + "HOSTNAME": "k8s.pod.name", + "K8S_POD_NAME": "k8s.pod.name", + "K8S_POD_UID": "k8s.pod.uid", + "K8S_NAMESPACE": "k8s.namespace.name", + "K8S_NODE_NAME": "k8s.node.name", + "K8S_CLUSTER_NAME": "k8s.cluster.name", + "K8S_DEPLOYMENT_NAME": "k8s.deployment.name", + "K8S_STATEFULSET_NAME": "k8s.statefulset.name", + "K8S_CONTAINER_NAME": "k8s.container.name", +} + +AWS_ENV_MAPPINGS: Dict[str, Optional[str]] = { + "AWS_REGION": "cloud.region", + "AWS_DEFAULT_REGION": "cloud.region", + "AWS_ACCOUNT_ID": "cloud.account.id", + "ECS_CONTAINER_METADATA_URI": None, + "ECS_CLUSTER": "aws.ecs.cluster.name", + "ECS_TASK_ARN": "aws.ecs.task.arn", + "ECS_TASK_DEFINITION_FAMILY": "aws.ecs.task.family", + "AWS_LAMBDA_FUNCTION_NAME": "faas.name", + "AWS_LAMBDA_FUNCTION_VERSION": "faas.version", + "AWS_LAMBDA_LOG_GROUP_NAME": "aws.lambda.log_group", + "AWS_LAMBDA_FUNCTION_MEMORY_SIZE": "faas.max_memory", +} + +GCP_ENV_MAPPINGS: Dict[str, Optional[str]] = { + "GOOGLE_CLOUD_PROJECT": "cloud.account.id", + "GCLOUD_PROJECT": "cloud.account.id", + "GCP_PROJECT": "cloud.account.id", + "GOOGLE_CLOUD_REGION": "cloud.region", + "K_SERVICE": "faas.name", + "K_REVISION": "faas.version", + "K_CONFIGURATION": "gcp.cloud_run.configuration", + "FUNCTION_NAME": "faas.name", + "FUNCTION_TARGET": "faas.trigger", + 
"FUNCTION_SIGNATURE_TYPE": "gcp.function.signature_type", +} + +AZURE_ENV_MAPPINGS: Dict[str, Optional[str]] = { + "AZURE_SUBSCRIPTION_ID": "cloud.account.id", + "AZURE_RESOURCE_GROUP": "azure.resource_group", + "WEBSITE_SITE_NAME": "faas.name", + "FUNCTIONS_EXTENSION_VERSION": "azure.functions.version", + "WEBSITE_INSTANCE_ID": "faas.instance", + "REGION_NAME": "cloud.region", +} + + +# ========================================================================= +# Detection Functions +# ========================================================================= + + +def detect_kubernetes() -> Dict[str, Any]: + attrs: Dict[str, Any] = {} + if not os.environ.get("KUBERNETES_SERVICE_HOST"): + return attrs + + for env_var, attr_name in K8S_ENV_MAPPINGS.items(): + value = os.environ.get(env_var) + if attr_name and value: + attrs[attr_name] = value + + if "k8s.pod.name" not in attrs: + hostname = os.environ.get("HOSTNAME", socket.gethostname()) + if hostname: + attrs["k8s.pod.name"] = hostname + + namespace_file = "/var/run/secrets/kubernetes.io/serviceaccount/namespace" + if "k8s.namespace.name" not in attrs and os.path.exists(namespace_file): + try: + with open(namespace_file) as fh: + attrs["k8s.namespace.name"] = fh.read().strip() + except OSError: + pass + + return attrs + + +def detect_cloud_provider() -> Dict[str, Any]: + attrs: Dict[str, Any] = {} + + if _is_aws(): + attrs["cloud.provider"] = "aws" + for env_var, attr_name in AWS_ENV_MAPPINGS.items(): + value = os.environ.get(env_var) + if attr_name and value: + attrs[attr_name] = value + + if os.environ.get("AWS_LAMBDA_FUNCTION_NAME"): + attrs["faas.id"] = ( + f"arn:aws:lambda:{attrs.get('cloud.region', 'unknown')}:" + f"{attrs.get('cloud.account.id', 'unknown')}:" + f"function:{os.environ['AWS_LAMBDA_FUNCTION_NAME']}" + ) + + az = _get_aws_availability_zone() + if az: + attrs["cloud.availability_zone"] = az + if "cloud.region" not in attrs: + attrs["cloud.region"] = az[:-1] + + elif _is_gcp(): + 
attrs["cloud.provider"] = "gcp" + for env_var, attr_name in GCP_ENV_MAPPINGS.items(): + value = os.environ.get(env_var) + if attr_name and value: + attrs[attr_name] = value + if os.environ.get("K_SERVICE"): + attrs["faas.trigger"] = "http" + elif os.environ.get("FUNCTION_NAME"): + attrs["faas.trigger"] = os.environ.get("FUNCTION_TRIGGER_TYPE", "unknown") + + elif _is_azure(): + attrs["cloud.provider"] = "azure" + for env_var, attr_name in AZURE_ENV_MAPPINGS.items(): + value = os.environ.get(env_var) + if attr_name and value: + attrs[attr_name] = value + + return attrs + + +def _is_aws() -> bool: + indicators = [ + "AWS_REGION", + "AWS_DEFAULT_REGION", + "AWS_LAMBDA_FUNCTION_NAME", + "ECS_CONTAINER_METADATA_URI", + "AWS_EXECUTION_ENV", + ] + return any(os.environ.get(var) for var in indicators) + + +def _is_gcp() -> bool: + indicators = [ + "GOOGLE_CLOUD_PROJECT", + "GCLOUD_PROJECT", + "GCP_PROJECT", + "K_SERVICE", + "FUNCTION_NAME", + ] + return any(os.environ.get(var) for var in indicators) + + +def _is_azure() -> bool: + indicators = [ + "WEBSITE_SITE_NAME", + "AZURE_FUNCTIONS_ENVIRONMENT", + "AZURE_SUBSCRIPTION_ID", + ] + return any(os.environ.get(var) for var in indicators) + + +def _get_aws_availability_zone() -> Optional[str]: + """Get AWS availability zone from EC2 instance metadata. + + Uses IMDS (Instance Metadata Service) which is only accessible from within EC2. 
def detect_host() -> Dict[str, Any]:
    """Detect host-level resource attributes (``host.*``, ``os.*``).

    ``host.id`` prefers the ``HOST_ID`` / ``INSTANCE_ID`` env vars and
    falls back to the hostname when neither is set.
    """
    attrs: Dict[str, Any] = {}

    try:
        name = socket.gethostname()
    except Exception:
        name = ""
    if name:
        attrs["host.name"] = name

    explicit_id = os.environ.get("HOST_ID") or os.environ.get("INSTANCE_ID")
    if explicit_id:
        attrs["host.id"] = explicit_id
    elif name:
        attrs["host.id"] = name

    # NOTE(review): sys.platform yields values like "win32"; confirm this
    # matches the consumer's expectations for os.type.
    attrs["os.type"] = sys.platform
    attrs["host.arch"] = platform.machine()
    return attrs
def detect_serverless() -> Dict[str, Any]:
    """Detect FaaS attributes for AWS Lambda, Cloud Run, Cloud Functions
    and Azure Functions/App Service.

    Platforms are checked in that order; the first match wins, so the
    returned dict describes exactly one platform (or is empty).
    """
    attrs: Dict[str, Any] = {}
    env = os.environ

    lambda_name = env.get("AWS_LAMBDA_FUNCTION_NAME")
    cloud_run_service = env.get("K_SERVICE")
    gcf_name = env.get("FUNCTION_NAME")
    azure_site = env.get("WEBSITE_SITE_NAME")

    if lambda_name:
        attrs["faas.name"] = lambda_name
        if env.get("AWS_LAMBDA_FUNCTION_VERSION"):
            attrs["faas.version"] = env["AWS_LAMBDA_FUNCTION_VERSION"]
        if env.get("AWS_LAMBDA_FUNCTION_MEMORY_SIZE"):
            # Lambda reports MiB; convert to bytes.
            attrs["faas.max_memory"] = int(env["AWS_LAMBDA_FUNCTION_MEMORY_SIZE"]) * 1024 * 1024
    elif cloud_run_service:
        attrs["faas.name"] = cloud_run_service
        if env.get("K_REVISION"):
            attrs["faas.version"] = env["K_REVISION"]
    elif gcf_name:
        attrs["faas.name"] = gcf_name
        if env.get("FUNCTION_TARGET"):
            attrs["faas.trigger"] = env["FUNCTION_TARGET"]
    elif azure_site:
        attrs["faas.name"] = azure_site
        if env.get("WEBSITE_INSTANCE_ID"):
            attrs["faas.instance"] = env["WEBSITE_INSTANCE_ID"]

    return attrs
@lru_cache(maxsize=1)
def _detect_all_resources_frozen() -> tuple:
    """Run every detector once and cache the result as immutable items."""
    attrs: Dict[str, Any] = {}
    attrs.update(detect_host())
    attrs.update(detect_process())
    attrs.update(detect_container())
    attrs.update(detect_cloud_provider())
    attrs.update(detect_kubernetes())
    attrs.update(detect_serverless())

    # Derive a stable service.instance.id when nothing set one explicitly:
    # container id (short form) > pod name > host id.
    if "service.instance.id" not in attrs:
        container_id = attrs.get("container.id")
        if container_id:
            attrs["service.instance.id"] = container_id[:12]
        elif pod_name := attrs.get("k8s.pod.name"):
            attrs["service.instance.id"] = pod_name
        elif host_id := attrs.get("host.id"):
            attrs["service.instance.id"] = host_id

    return tuple(attrs.items())


def detect_all_resources() -> Dict[str, Any]:
    """Detect all environment resource attributes.

    Detection runs once per process (the environment does not change at
    runtime). A fresh dict is returned on every call so that callers that
    mutate the result cannot corrupt the cached copy — previously the
    ``lru_cache``d dict itself was returned, so any caller mutation leaked
    into every subsequent call.
    """
    return dict(_detect_all_resources_frozen())


# Compatibility shim: callers (e.g. tests) that relied on the function being
# lru_cache-decorated can still reset the cache.
detect_all_resources.cache_clear = _detect_all_resources_frozen.cache_clear  # type: ignore[attr-defined]
def enable(
    service_name: Optional[str] = None,
    otlp_endpoint: Optional[str] = None,
    environment: Optional[str] = None,
    auto_instrumentation: bool = True,
    propagators: Optional[List[str]] = None,
    log_level: str = "INFO",
    config: Optional[BotanuConfig] = None,
    config_file: Optional[str] = None,
) -> bool:
    """Enable Botanu SDK with OTEL auto-instrumentation.

    This is the ONE function customers need to call to get full observability.

    Args:
        service_name: Service name (overrides any configured value).
        otlp_endpoint: OTLP collector endpoint (overrides any configured value).
        environment: Deployment environment (overrides any configured value).
        auto_instrumentation: Enable OTEL auto-instrumentation (default: ``True``).
        propagators: List of propagators (default: ``["tracecontext", "baggage"]``).
            NOTE(review): this argument is currently unused — the composite
            propagator installed below is always tracecontext + baggage;
            confirm whether custom propagators were intended to be honored.
        log_level: Logging level name (default: ``"INFO"``).  Unknown names
            fall back to ``INFO`` instead of raising.
        config: Full :class:`BotanuConfig` (overrides individual params).
            NOTE: the supplied object is mutated in place when the
            ``service_name``/``otlp_endpoint``/``environment`` overrides are
            also passed.
        config_file: Path to YAML config file.

    Returns:
        ``True`` if successfully initialized; ``False`` if already
        initialized or if OTEL setup failed.
    """
    global _initialized, _current_config

    if _initialized:
        logger.warning("Botanu SDK already initialized")
        return False

    # FIX: getattr(logging, log_level.upper()) raised AttributeError for an
    # unknown level name (and could fetch a non-level attribute); resolve
    # defensively and fall back to INFO instead of crashing startup.
    level = getattr(logging, log_level.upper(), None)
    if not isinstance(level, int):
        level = logging.INFO
    logging.basicConfig(level=level)

    from botanu.sdk.config import BotanuConfig as ConfigClass

    # Config source precedence: explicit object > explicit file > env/defaults.
    if config is not None:
        cfg = config
    elif config_file is not None:
        cfg = ConfigClass.from_yaml(config_file)
    else:
        cfg = ConfigClass.from_file_or_env()

    # Explicit keyword arguments win over whatever the config carried.
    if service_name is not None:
        cfg.service_name = service_name
    if otlp_endpoint is not None:
        cfg.otlp_endpoint = otlp_endpoint
    if environment is not None:
        cfg.deployment_environment = environment

    _current_config = cfg

    # Normalize the endpoint to the OTLP/HTTP traces path exactly once.
    traces_endpoint = cfg.otlp_endpoint
    if traces_endpoint and not traces_endpoint.endswith("/v1/traces"):
        traces_endpoint = f"{traces_endpoint.rstrip('/')}/v1/traces"

    logger.info(
        "Initializing Botanu SDK: service=%s, env=%s, endpoint=%s",
        cfg.service_name,
        cfg.deployment_environment,
        traces_endpoint,
    )

    try:
        from opentelemetry import trace
        from opentelemetry.baggage.propagation import W3CBaggagePropagator
        from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
        from opentelemetry.propagate import set_global_textmap
        from opentelemetry.propagators.composite import CompositePropagator
        from opentelemetry.sdk.resources import Resource
        from opentelemetry.sdk.trace import TracerProvider
        from opentelemetry.sdk.trace.export import BatchSpanProcessor
        from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator

        from botanu._version import __version__
        from botanu.processors import RunContextEnricher
        from botanu.resources.detector import detect_all_resources

        # Build resource attributes; explicit config wins over detection.
        resource_attrs = {
            "service.name": cfg.service_name,
            "deployment.environment": cfg.deployment_environment,
            "telemetry.sdk.name": "botanu",
            "telemetry.sdk.version": __version__,
        }
        if cfg.service_version:
            resource_attrs["service.version"] = cfg.service_version
        if cfg.service_namespace:
            resource_attrs["service.namespace"] = cfg.service_namespace

        # Auto-detect resources (K8s, cloud, host, container, FaaS); detected
        # keys never overwrite explicitly configured ones.
        if cfg.auto_detect_resources:
            detected = detect_all_resources()
            for key, value in detected.items():
                if key not in resource_attrs:
                    resource_attrs[key] = value
            if detected:
                logger.debug("Auto-detected resources: %s", list(detected.keys()))

        resource = Resource.create(resource_attrs)
        provider = TracerProvider(resource=resource)

        # RunContextEnricher — the ONLY processor in SDK.
        # Reads run_id from baggage, stamps on all spans.
        lean_mode = cfg.propagation_mode == "lean"
        provider.add_span_processor(RunContextEnricher(lean_mode=lean_mode))

        # OTLP exporter with batched export.
        exporter = OTLPSpanExporter(
            endpoint=traces_endpoint,
            headers=cfg.otlp_headers or {},
        )
        provider.add_span_processor(
            BatchSpanProcessor(
                exporter,
                max_export_batch_size=cfg.max_export_batch_size,
                max_queue_size=cfg.max_queue_size,
                schedule_delay_millis=cfg.schedule_delay_millis,
            )
        )

        trace.set_tracer_provider(provider)

        # Propagators (W3C TraceContext + Baggage).
        set_global_textmap(
            CompositePropagator(
                [
                    TraceContextTextMapPropagator(),
                    W3CBaggagePropagator(),
                ]
            )
        )

        logger.info("Botanu SDK tracing initialized")

        if auto_instrumentation:
            _enable_auto_instrumentation()

        _initialized = True
        return True

    except Exception as exc:
        logger.error("Failed to initialize Botanu SDK: %s", exc, exc_info=True)
        return False
+ """ + enabled: List[str] = [] + failed: List[tuple[str, str]] = [] + + # HTTP clients + _try_instrument(enabled, failed, "httpx", "opentelemetry.instrumentation.httpx", "HTTPXClientInstrumentation") + _try_instrument(enabled, failed, "requests", "opentelemetry.instrumentation.requests", "RequestsInstrumentor") + _try_instrument(enabled, failed, "urllib3", "opentelemetry.instrumentation.urllib3", "URLLib3Instrumentor") + _try_instrument( + enabled, failed, "aiohttp", "opentelemetry.instrumentation.aiohttp_client", "AioHttpClientInstrumentor" + ) + + # Web frameworks + _try_instrument(enabled, failed, "fastapi", "opentelemetry.instrumentation.fastapi", "FastAPIInstrumentor") + _try_instrument(enabled, failed, "flask", "opentelemetry.instrumentation.flask", "FlaskInstrumentor") + _try_instrument(enabled, failed, "django", "opentelemetry.instrumentation.django", "DjangoInstrumentor") + _try_instrument(enabled, failed, "starlette", "opentelemetry.instrumentation.starlette", "StarletteInstrumentor") + + # Databases + _try_instrument(enabled, failed, "sqlalchemy", "opentelemetry.instrumentation.sqlalchemy", "SQLAlchemyInstrumentor") + _try_instrument(enabled, failed, "psycopg2", "opentelemetry.instrumentation.psycopg2", "Psycopg2Instrumentor") + _try_instrument(enabled, failed, "asyncpg", "opentelemetry.instrumentation.asyncpg", "AsyncPGInstrumentor") + _try_instrument(enabled, failed, "pymongo", "opentelemetry.instrumentation.pymongo", "PymongoInstrumentor") + _try_instrument(enabled, failed, "redis", "opentelemetry.instrumentation.redis", "RedisInstrumentor") + + # Messaging + _try_instrument(enabled, failed, "celery", "opentelemetry.instrumentation.celery", "CeleryInstrumentor") + _try_instrument(enabled, failed, "kafka", "opentelemetry.instrumentation.kafka", "KafkaInstrumentor") + + # gRPC + _try_instrument_grpc(enabled, failed) + + # GenAI / AI + _try_instrument(enabled, failed, "openai", "opentelemetry.instrumentation.openai_v2", "OpenAIInstrumentor") + 
_try_instrument(enabled, failed, "anthropic", "opentelemetry.instrumentation.anthropic", "AnthropicInstrumentor") + _try_instrument(enabled, failed, "vertexai", "opentelemetry.instrumentation.vertexai", "VertexAIInstrumentor") + _try_instrument( + enabled, failed, "google_genai", "opentelemetry.instrumentation.google_genai", "GoogleGenAiInstrumentor" + ) + _try_instrument(enabled, failed, "langchain", "opentelemetry.instrumentation.langchain", "LangchainInstrumentor") + + # Runtime + _try_instrument(enabled, failed, "logging", "opentelemetry.instrumentation.logging", "LoggingInstrumentor") + + if enabled: + logger.info("Auto-instrumentation enabled: %s", ", ".join(enabled)) + if failed: + for name, error in failed: + logger.warning("Auto-instrumentation failed for %s: %s", name, error) + + +def _try_instrument( + enabled: List[str], + failed: List[tuple[str, str]], + name: str, + module_path: str, + class_name: str, +) -> None: + """Try to import and instrument a single library.""" + try: + import importlib + + mod = importlib.import_module(module_path) + instrumentor_cls = getattr(mod, class_name) + instrumentor_cls().instrument() + enabled.append(name) + except ImportError: + pass + except Exception as exc: + failed.append((name, str(exc))) + + +def _try_instrument_grpc( + enabled: List[str], + failed: List[tuple[str, str]], +) -> None: + """Try to instrument gRPC (client + server).""" + try: + from opentelemetry.instrumentation.grpc import ( + GrpcInstrumentorClient, + GrpcInstrumentorServer, + ) + + GrpcInstrumentorClient().instrument() + GrpcInstrumentorServer().instrument() + enabled.append("grpc") + except ImportError: + pass + except Exception as exc: + failed.append(("grpc", str(exc))) + + +def is_enabled() -> bool: + """Check if Botanu SDK is initialized.""" + return _initialized + + +def get_config() -> Optional[BotanuConfig]: + """Get the current Botanu configuration.""" + return _current_config + + +def disable() -> None: + """Disable Botanu SDK and 
shutdown OTEL. + + Call on application shutdown for clean exit. + """ + global _initialized + + if not _initialized: + return + + try: + from opentelemetry import trace + + provider = trace.get_tracer_provider() + if hasattr(provider, "shutdown"): + provider.shutdown() + + _initialized = False + logger.info("Botanu SDK shutdown complete") + + except Exception as exc: + logger.error("Error during Botanu SDK shutdown: %s", exc) diff --git a/src/botanu/sdk/config.py b/src/botanu/sdk/config.py new file mode 100644 index 0000000..c52ffc5 --- /dev/null +++ b/src/botanu/sdk/config.py @@ -0,0 +1,294 @@ +# SPDX-FileCopyrightText: 2026 The Botanu Authors +# SPDX-License-Identifier: Apache-2.0 + +"""Configuration for Botanu SDK. + +The SDK is intentionally minimal on the hot path. Heavy processing happens in +the OpenTelemetry Collector, not in the application: + +- **SDK responsibility**: Generate run_id, propagate minimal context (run_id, use_case) +- **Collector responsibility**: PII redaction, vendor detection, attribute enrichment + +Configuration precedence (highest to lowest): +1. Code arguments (explicit values passed to BotanuConfig) +2. Environment variables (BOTANU_*, OTEL_*) +3. YAML config file (botanu.yaml or specified path) +4. Built-in defaults +""" + +from __future__ import annotations + +import logging +import os +import re +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any, Dict, List, Optional + +logger = logging.getLogger(__name__) + + +@dataclass +class BotanuConfig: + """Configuration for Botanu SDK and OpenTelemetry. + + The SDK is a thin wrapper on OpenTelemetry. PII redaction, cardinality + limits, and vendor enrichment are handled by the OTel Collector — not here. + + Example:: + + >>> config = BotanuConfig( + ... service_name="my-service", + ... otlp_endpoint="http://collector:4318/v1/traces", + ... 
) + + >>> # Or load from YAML + >>> config = BotanuConfig.from_yaml("config/botanu.yaml") + """ + + # Service identification + service_name: Optional[str] = None + service_version: Optional[str] = None + service_namespace: Optional[str] = None + deployment_environment: Optional[str] = None + + # Resource detection + auto_detect_resources: bool = True + + # OTLP exporter configuration + otlp_endpoint: Optional[str] = None + otlp_headers: Optional[Dict[str, str]] = None + + # Span export configuration + max_export_batch_size: int = 512 + max_queue_size: int = 2048 + schedule_delay_millis: int = 5000 + + # Propagation mode: "lean" (run_id + use_case only) or "full" (all context) + propagation_mode: str = "lean" + + # Auto-instrumentation packages to enable + auto_instrument_packages: List[str] = field( + default_factory=lambda: [ + # HTTP clients + "requests", + "httpx", + "urllib3", + "aiohttp_client", + # Web frameworks + "fastapi", + "flask", + "django", + "starlette", + # Databases + "sqlalchemy", + "psycopg2", + "asyncpg", + "pymongo", + "redis", + # Messaging + "celery", + "kafka_python", + # gRPC + "grpc", + # GenAI / AI + "openai_v2", + "anthropic", + "vertexai", + "google_genai", + "langchain", + # Runtime + "logging", + ] + ) + + # Config file path (for tracking where config was loaded from) + _config_file: Optional[str] = field(default=None, repr=False) + + def __post_init__(self) -> None: + """Apply environment variable defaults.""" + if self.service_name is None: + self.service_name = os.getenv("OTEL_SERVICE_NAME", "unknown_service") + + if self.service_version is None: + self.service_version = os.getenv("OTEL_SERVICE_VERSION") + + if self.service_namespace is None: + self.service_namespace = os.getenv("OTEL_SERVICE_NAMESPACE") + + env_auto_detect = os.getenv("BOTANU_AUTO_DETECT_RESOURCES") + if env_auto_detect is not None: + self.auto_detect_resources = env_auto_detect.lower() in ("true", "1", "yes") + + if self.deployment_environment is None: + 
self.deployment_environment = os.getenv( + "OTEL_DEPLOYMENT_ENVIRONMENT", + os.getenv("BOTANU_ENVIRONMENT", "production"), + ) + + if self.otlp_endpoint is None: + env_endpoint = os.getenv("OTEL_EXPORTER_OTLP_TRACES_ENDPOINT") + if env_endpoint: + self.otlp_endpoint = env_endpoint + else: + base = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4318") + self.otlp_endpoint = f"{base}/v1/traces" + + env_propagation_mode = os.getenv("BOTANU_PROPAGATION_MODE") + if env_propagation_mode and env_propagation_mode in ("lean", "full"): + self.propagation_mode = env_propagation_mode + + # ------------------------------------------------------------------ + # YAML loading + # ------------------------------------------------------------------ + + @classmethod + def from_yaml(cls, path: Optional[str] = None) -> BotanuConfig: + """Load configuration from a YAML file. + + Supports environment variable interpolation using ``${VAR_NAME}`` syntax. + + Args: + path: Path to YAML config file. + + Raises: + FileNotFoundError: If config file doesn't exist. + ValueError: If YAML is malformed. + """ + if path is None: + raise FileNotFoundError("No config file path provided") + + resolved = Path(path) + if not resolved.exists(): + raise FileNotFoundError(f"Config file not found: {resolved}") + + try: + import yaml # type: ignore[import-untyped] + except ImportError as err: + raise ImportError("PyYAML required for YAML config. Install with: pip install pyyaml") from err + + with open(resolved) as fh: + raw_content = fh.read() + + content = _interpolate_env_vars(raw_content) + + try: + data = yaml.safe_load(content) + except yaml.YAMLError as exc: + raise ValueError(f"Invalid YAML in {resolved}: {exc}") from exc + + if data is None: + data = {} + + return cls._from_dict(data, config_file=str(resolved)) + + @classmethod + def from_file_or_env(cls, path: Optional[str] = None) -> BotanuConfig: + """Load config from file if exists, otherwise use environment variables. 
+ + Search order: + 1. Explicit *path* argument + 2. ``BOTANU_CONFIG_FILE`` env var + 3. ``./botanu.yaml`` + 4. ``./config/botanu.yaml`` + 5. Falls back to env-only config + """ + search_paths: List[Path] = [] + + if path: + search_paths.append(Path(path)) + + env_path = os.getenv("BOTANU_CONFIG_FILE") + if env_path: + search_paths.append(Path(env_path)) + + search_paths.extend( + [ + Path("botanu.yaml"), + Path("botanu.yml"), + Path("config/botanu.yaml"), + Path("config/botanu.yml"), + ] + ) + + for candidate in search_paths: + if candidate.exists(): + logger.info("Loading config from: %s", candidate) + return cls.from_yaml(str(candidate)) + + logger.debug("No config file found, using environment variables only") + return cls() + + @classmethod + def _from_dict( + cls, + data: Dict[str, Any], + config_file: Optional[str] = None, + ) -> BotanuConfig: + """Create config from dictionary (parsed YAML).""" + service = data.get("service", {}) + otlp = data.get("otlp", {}) + export = data.get("export", {}) + propagation = data.get("propagation", {}) + resource = data.get("resource", {}) + auto_packages = data.get("auto_instrument_packages") + + return cls( + service_name=service.get("name"), + service_version=service.get("version"), + service_namespace=service.get("namespace"), + deployment_environment=service.get("environment"), + auto_detect_resources=resource.get("auto_detect", True), + otlp_endpoint=otlp.get("endpoint"), + otlp_headers=otlp.get("headers"), + max_export_batch_size=export.get("batch_size", 512), + max_queue_size=export.get("queue_size", 2048), + schedule_delay_millis=export.get("delay_ms", 5000), + propagation_mode=propagation.get("mode", "lean"), + auto_instrument_packages=(auto_packages if auto_packages else BotanuConfig().auto_instrument_packages), + _config_file=config_file, + ) + + def to_dict(self) -> Dict[str, Any]: + """Export configuration as dictionary.""" + return { + "service": { + "name": self.service_name, + "version": 
self.service_version, + "namespace": self.service_namespace, + "environment": self.deployment_environment, + }, + "resource": { + "auto_detect": self.auto_detect_resources, + }, + "otlp": { + "endpoint": self.otlp_endpoint, + "headers": self.otlp_headers, + }, + "export": { + "batch_size": self.max_export_batch_size, + "queue_size": self.max_queue_size, + "delay_ms": self.schedule_delay_millis, + }, + "propagation": { + "mode": self.propagation_mode, + }, + "auto_instrument_packages": self.auto_instrument_packages, + } + + +def _interpolate_env_vars(content: str) -> str: + """Interpolate ``${VAR_NAME}`` and ``${VAR_NAME:-default}`` in *content*.""" + pattern = re.compile(r"\$\{([A-Za-z_][A-Za-z0-9_]*)(?::-([^}]*))?\}") + + def _replace(match: re.Match) -> str: # type: ignore[type-arg] + var_name = match.group(1) + default = match.group(2) + value = os.getenv(var_name) + if value is not None: + return value + if default is not None: + return default + return match.group(0) + + return pattern.sub(_replace, content) diff --git a/src/botanu/sdk/context.py b/src/botanu/sdk/context.py new file mode 100644 index 0000000..1beaeaf --- /dev/null +++ b/src/botanu/sdk/context.py @@ -0,0 +1,68 @@ +# SPDX-FileCopyrightText: 2026 The Botanu Authors +# SPDX-License-Identifier: Apache-2.0 + +"""Context and baggage helpers for Botanu SDK. + +Uses OpenTelemetry Context and Baggage for propagation. +""" + +from __future__ import annotations + +from typing import Optional, cast + +from opentelemetry import baggage, trace +from opentelemetry.context import attach, get_current + + +def set_baggage(key: str, value: str) -> object: + """Set a baggage value and attach the new context. + + Baggage is automatically propagated across service boundaries via + W3C Baggage header. + + Args: + key: Baggage key (e.g., ``"botanu.run_id"``). + value: Baggage value. + + Returns: + Token for detaching the context later. 
def set_baggage(key: str, value: str) -> object:
    """Attach a context with *key* set in W3C Baggage.

    Baggage propagates across service boundaries via the W3C Baggage header.

    Args:
        key: Baggage key (e.g., ``"botanu.run_id"``).
        value: Baggage value.

    Returns:
        Token for detaching the context later.
    """
    updated = baggage.set_baggage(key, value, context=get_current())
    return attach(updated)


def get_baggage(key: str) -> Optional[str]:
    """Return the baggage entry for *key* in the current context, or ``None``.

    Args:
        key: Baggage key (e.g., ``"botanu.run_id"``).
    """
    return cast(Optional[str], baggage.get_baggage(key, context=get_current()))


def get_current_span() -> trace.Span:
    """Return the active span (non-recording when no span is active)."""
    return trace.get_current_span()


def get_run_id() -> Optional[str]:
    """Get the current ``run_id`` from baggage."""
    return get_baggage("botanu.run_id")


def get_use_case() -> Optional[str]:
    """Get the current ``use_case`` from baggage."""
    return get_baggage("botanu.use_case")


def get_workflow() -> Optional[str]:
    """Get the current ``workflow`` from baggage."""
    return get_baggage("botanu.workflow")
T = TypeVar("T")

tracer = trace.get_tracer("botanu_sdk")


def _compute_workflow_version(func: Callable[..., Any]) -> str:
    """Derive a stable version tag from the function's source text.

    Returns ``"v:<12-hex>"`` (SHA-256 prefix of the source) or
    ``"v:unknown"`` when the source is unavailable (e.g. REPL-defined or
    built-in callables).
    """
    try:
        source = inspect.getsource(func)
    except (OSError, TypeError):
        return "v:unknown"
    return f"v:{hashlib.sha256(source.encode()).hexdigest()[:12]}"


def _get_parent_run_id() -> Optional[str]:
    """Return the run_id of an enclosing run from baggage, if any."""
    return get_baggage("botanu.run_id")


def _begin_run(span: trace.Span, run_ctx: RunContext, workflow_name: str) -> None:
    """Stamp run attributes on *span*, emit ``run.started``, push baggage."""
    for key, value in run_ctx.to_span_attributes().items():
        span.set_attribute(key, value)

    span.add_event(
        "botanu.run.started",
        attributes={
            "run_id": run_ctx.run_id,
            "use_case": run_ctx.use_case,
            "workflow": workflow_name,
        },
    )

    for key, value in run_ctx.to_baggage_dict().items():
        set_baggage(key, value)


def _existing_outcome_status(span: trace.Span) -> Optional[str]:
    """Return a previously recorded ``botanu.outcome.status``, if any.

    FIX: the old check used ``isinstance(span_attrs, dict)``, but the OTel
    SDK exposes span attributes as a ``BoundedAttributes`` Mapping that is
    NOT a ``dict`` subclass — so a manually emitted outcome was never seen
    and auto-outcome always overwrote it.  Accept any Mapping instead.
    """
    from collections.abc import Mapping

    span_attrs = getattr(span, "attributes", None)
    if isinstance(span_attrs, Mapping):
        return span_attrs.get("botanu.outcome.status")
    return None


def _finish_success(span: trace.Span, run_ctx: RunContext, auto_outcome_on_success: bool) -> None:
    """Record the success path: optional auto-outcome, OK status, completion event."""
    if _existing_outcome_status(span) is None and auto_outcome_on_success:
        run_ctx.complete(RunStatus.SUCCESS)

    span.set_status(Status(StatusCode.OK))
    _emit_run_completed(span, run_ctx, RunStatus.SUCCESS)


def _finish_failure(span: trace.Span, run_ctx: RunContext, exc: BaseException) -> None:
    """Record the failure path: ERROR status, exception record, completion event."""
    span.set_status(Status(StatusCode.ERROR, str(exc)))
    span.record_exception(exc)
    run_ctx.complete(RunStatus.FAILURE, error_class=exc.__class__.__name__)
    _emit_run_completed(
        span,
        run_ctx,
        RunStatus.FAILURE,
        error_class=exc.__class__.__name__,
    )


def botanu_use_case(
    name: str,
    workflow: Optional[str] = None,
    *,
    environment: Optional[str] = None,
    tenant_id: Optional[str] = None,
    auto_outcome_on_success: bool = True,
    span_kind: SpanKind = SpanKind.SERVER,
) -> Callable[[Callable[..., T]], Callable[..., T]]:
    """Decorator to create a run span with automatic context propagation.

    This is the primary integration point. It:

    1. Creates a run context with a fresh ``run_id``
    2. Creates a ``botanu.run`` span as the root of the run
    3. Emits ``run.started`` event
    4. Propagates run context via W3C Baggage
    5. On completion: emits ``run.completed`` event with outcome

    Args:
        name: Use case name (low cardinality, e.g. ``"Customer Support"``).
        workflow: Workflow name (defaults to function qualified name).
        environment: Deployment environment.
        tenant_id: Tenant identifier for multi-tenant apps.
        auto_outcome_on_success: Emit ``"success"`` if no exception.
        span_kind: OpenTelemetry span kind (default: ``SERVER``).

    Example::

        @botanu_use_case("Customer Support")
        async def handle_ticket(ticket_id: str):
            result = await process_ticket(ticket_id)
            emit_outcome("success", value_type="tickets_resolved", value_amount=1)
            return result
    """

    def decorator(func: Callable[..., T]) -> Callable[..., T]:
        workflow_name = workflow or func.__qualname__
        workflow_version = _compute_workflow_version(func)

        def _new_run_context() -> RunContext:
            # One fresh context per invocation; parent linkage comes from
            # whatever run_id is already in baggage (nested runs).
            return RunContext.create(
                use_case=name,
                workflow=workflow_name,
                workflow_version=workflow_version,
                environment=environment,
                tenant_id=tenant_id,
                parent_run_id=_get_parent_run_id(),
            )

        @functools.wraps(func)
        async def async_wrapper(*args: Any, **kwargs: Any) -> T:
            run_ctx = _new_run_context()
            with tracer.start_as_current_span(
                name=f"botanu.run/{name}",
                kind=span_kind,
            ) as span:
                _begin_run(span, run_ctx, workflow_name)
                try:
                    result = await func(*args, **kwargs)
                    _finish_success(span, run_ctx, auto_outcome_on_success)
                    return result
                except Exception as exc:
                    _finish_failure(span, run_ctx, exc)
                    raise

        @functools.wraps(func)
        def sync_wrapper(*args: Any, **kwargs: Any) -> T:
            run_ctx = _new_run_context()
            with tracer.start_as_current_span(
                name=f"botanu.run/{name}",
                kind=span_kind,
            ) as span:
                _begin_run(span, run_ctx, workflow_name)
                try:
                    result = func(*args, **kwargs)
                    _finish_success(span, run_ctx, auto_outcome_on_success)
                    return result
                except Exception as exc:
                    _finish_failure(span, run_ctx, exc)
                    raise

        if inspect.iscoroutinefunction(func):
            return async_wrapper  # type: ignore[return-value]
        return sync_wrapper  # type: ignore[return-value]

    return decorator


def _emit_run_completed(
    span: trace.Span,
    run_ctx: RunContext,
    status: RunStatus,
    error_class: Optional[str] = None,
) -> None:
    """Emit the ``botanu.run.completed`` event and final outcome attributes.

    Duration is measured from the run context's start time to now (UTC).
    """
    duration_ms = (datetime.now(timezone.utc) - run_ctx.start_time).total_seconds() * 1000

    event_attrs: Dict[str, Union[str, float]] = {
        "run_id": run_ctx.run_id,
        "use_case": run_ctx.use_case,
        "status": status.value,
        "duration_ms": duration_ms,
    }
    if error_class:
        event_attrs["error_class"] = error_class
    if run_ctx.outcome and run_ctx.outcome.value_type:
        event_attrs["value_type"] = run_ctx.outcome.value_type
    if run_ctx.outcome and run_ctx.outcome.value_amount is not None:
        event_attrs["value_amount"] = run_ctx.outcome.value_amount

    span.add_event("botanu.run.completed", attributes=event_attrs)

    span.set_attribute("botanu.outcome.status", status.value)
    span.set_attribute("botanu.run.duration_ms", duration_ms)


# Alias
use_case = botanu_use_case


def botanu_outcome(
    success: Optional[str] = None,
    partial: Optional[str] = None,
    failed: Optional[str] = None,
) -> Callable[[Callable[..., T]], Callable[..., T]]:
    """Decorator to automatically emit outcomes based on function result.

    Convenience for sub-functions within a use case.  It does NOT create a
    new run — use ``@botanu_use_case`` for that.

    NOTE(review): the ``success``/``partial``/``failed`` parameters are
    currently unused — the emitted status is always ``"success"`` or
    ``"failed"``; confirm intended behavior.
    """
    from botanu.sdk.span_helpers import emit_outcome

    def _emit_success_if_unset() -> None:
        # FIX: API spans (e.g. NonRecordingSpan) may not expose `.attributes`
        # at all, so the old direct `span.attributes` access could raise
        # AttributeError.  Treat "no attributes" as "no outcome yet".
        span = trace.get_current_span()
        attrs = getattr(span, "attributes", None)
        if not attrs or "botanu.outcome.status" not in attrs:
            emit_outcome("success")

    def decorator(func: Callable[..., T]) -> Callable[..., T]:
        @functools.wraps(func)
        async def async_wrapper(*args: Any, **kwargs: Any) -> T:
            try:
                result = await func(*args, **kwargs)
                _emit_success_if_unset()
                return result
            except Exception as exc:
                emit_outcome("failed", reason=exc.__class__.__name__)
                raise

        @functools.wraps(func)
        def sync_wrapper(*args: Any, **kwargs: Any) -> T:
            try:
                result = func(*args, **kwargs)
                _emit_success_if_unset()
                return result
            except Exception as exc:
                emit_outcome("failed", reason=exc.__class__.__name__)
                raise

        if inspect.iscoroutinefunction(func):
            return async_wrapper  # type: ignore[return-value]
        return sync_wrapper  # type: ignore[return-value]

    return decorator
class BotanuMiddleware(BaseHTTPMiddleware):
    """Starlette/FastAPI middleware that stamps Botanu context onto requests.

    Use it **after** OpenTelemetry's ``FastAPIInstrumentor``.  For each
    request it resolves run context in this order — existing baggage, then
    ``x-botanu-*`` request headers, then the values configured here — and
    mirrors the result onto the current span, the baggage, and the response
    headers.

    Example::

        from fastapi import FastAPI
        from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
        from botanu.sdk.middleware import BotanuMiddleware

        app = FastAPI()
        FastAPIInstrumentor.instrument_app(app)
        app.add_middleware(
            BotanuMiddleware,
            use_case="customer_support",
            workflow="ticket_api",
        )
    """

    def __init__(
        self,
        app: object,
        *,
        use_case: str,
        workflow: Optional[str] = None,
        auto_generate_run_id: bool = True,
    ) -> None:
        super().__init__(app)  # type: ignore[arg-type]
        self.use_case = use_case
        self.workflow = workflow or use_case
        self.auto_generate_run_id = auto_generate_run_id

    async def dispatch(self, request: Request, call_next: object) -> Response:  # type: ignore[override]
        """Resolve Botanu context for *request*, enrich the span and response."""
        span = trace.get_current_span()

        # Resolution order: baggage -> x-botanu-* header -> generated/configured.
        run_id = baggage.get_baggage("botanu.run_id") or request.headers.get("x-botanu-run-id")
        if not run_id and self.auto_generate_run_id:
            run_id = str(uuid.uuid4())

        use_case = baggage.get_baggage("botanu.use_case") or request.headers.get("x-botanu-use-case") or self.use_case
        workflow = baggage.get_baggage("botanu.workflow") or request.headers.get("x-botanu-workflow") or self.workflow
        customer_id = baggage.get_baggage("botanu.customer_id") or request.headers.get("x-botanu-customer-id")

        # Mirror the resolved context onto the span and into baggage so that
        # downstream spans and services inherit it.
        if run_id:
            span.set_attribute("botanu.run_id", run_id)
            set_baggage("botanu.run_id", run_id)

        span.set_attribute("botanu.use_case", use_case)
        set_baggage("botanu.use_case", use_case)

        span.set_attribute("botanu.workflow", workflow)
        set_baggage("botanu.workflow", workflow)

        if customer_id:
            span.set_attribute("botanu.customer_id", customer_id)
            set_baggage("botanu.customer_id", customer_id)

        span.set_attribute("http.route", request.url.path)
        span.set_attribute("http.method", request.method)

        response = await call_next(request)  # type: ignore[misc]

        # Echo the run context back to the caller for correlation.
        if run_id:
            response.headers["x-botanu-run-id"] = run_id
            response.headers["x-botanu-use-case"] = use_case
            response.headers["x-botanu-workflow"] = workflow

        return response
+ + Example:: + + >>> emit_outcome("success", value_type="tickets_resolved", value_amount=1) + >>> emit_outcome("failed", reason="missing_context") + """ + span = trace.get_current_span() + + span.set_attribute("botanu.outcome", status) + + if value_type: + span.set_attribute("botanu.outcome.value_type", value_type) + + if value_amount is not None: + span.set_attribute("botanu.outcome.value_amount", value_amount) + + if confidence is not None: + span.set_attribute("botanu.outcome.confidence", confidence) + + if reason: + span.set_attribute("botanu.outcome.reason", reason) + + # Add span event for timeline visibility + event_attrs: dict[str, object] = {"status": status} + if value_type: + event_attrs["value_type"] = value_type + if value_amount is not None: + event_attrs["value_amount"] = value_amount + + span.add_event("botanu.outcome_emitted", event_attrs) + + +def set_business_context( + *, + customer_id: Optional[str] = None, + team: Optional[str] = None, + cost_center: Optional[str] = None, + region: Optional[str] = None, +) -> None: + """Set business context attributes on the current span. + + Args: + customer_id: Customer identifier for multi-tenant attribution. + team: Team or department. + cost_center: Cost centre for financial tracking. + region: Geographic region. + """ + span = trace.get_current_span() + + if customer_id: + span.set_attribute("botanu.customer_id", customer_id) + + if team: + span.set_attribute("botanu.team", team) + + if cost_center: + span.set_attribute("botanu.cost_center", cost_center) + + if region: + span.set_attribute("botanu.region", region) diff --git a/src/botanu/tracking/__init__.py b/src/botanu/tracking/__init__.py new file mode 100644 index 0000000..5933aa6 --- /dev/null +++ b/src/botanu/tracking/__init__.py @@ -0,0 +1,77 @@ +# SPDX-FileCopyrightText: 2026 The Botanu Authors +# SPDX-License-Identifier: Apache-2.0 + +"""Botanu tracking components. 
+ +Provides tracking for different operation types: +- LLM/GenAI model calls +- Database, storage, and messaging operations +- Attempt ledger for durable cost tracking +""" + +from __future__ import annotations + +from botanu.tracking.data import ( + DBOperation, + MessagingOperation, + StorageOperation, + set_data_metrics, + set_warehouse_metrics, + track_db_operation, + track_messaging_operation, + track_storage_operation, +) +from botanu.tracking.ledger import ( + AttemptLedger, + AttemptStatus, + LedgerEventType, + get_ledger, + record_attempt_ended, + record_attempt_started, + record_llm_attempted, + record_tool_attempted, + set_ledger, +) +from botanu.tracking.llm import ( + BotanuAttributes, + GenAIAttributes, + LLMTracker, + ModelOperation, + ToolTracker, + set_llm_attributes, + set_token_usage, + track_llm_call, + track_tool_call, +) + +__all__ = [ + # LLM tracking + "track_llm_call", + "track_tool_call", + "set_llm_attributes", + "set_token_usage", + "ModelOperation", + "GenAIAttributes", + "BotanuAttributes", + "LLMTracker", + "ToolTracker", + # Data tracking + "track_db_operation", + "track_storage_operation", + "track_messaging_operation", + "set_data_metrics", + "set_warehouse_metrics", + "DBOperation", + "StorageOperation", + "MessagingOperation", + # Attempt ledger + "AttemptLedger", + "get_ledger", + "set_ledger", + "record_attempt_started", + "record_attempt_ended", + "record_llm_attempted", + "record_tool_attempted", + "LedgerEventType", + "AttemptStatus", +] diff --git a/src/botanu/tracking/data.py b/src/botanu/tracking/data.py new file mode 100644 index 0000000..5a58f57 --- /dev/null +++ b/src/botanu/tracking/data.py @@ -0,0 +1,488 @@ +# SPDX-FileCopyrightText: 2026 The Botanu Authors +# SPDX-License-Identifier: Apache-2.0 + +"""Data Tracking — Track database, storage, and messaging operations. 
# ---------------------------------------------------------------------
# System normalization: any known alias maps to its canonical name.
# ---------------------------------------------------------------------


def _alias_table(canonical_aliases: Dict[str, tuple]) -> Dict[str, str]:
    """Expand ``{canonical: (aliases, ...)}`` into a flat alias->canonical map.

    The canonical name always maps to itself.
    """
    table: Dict[str, str] = {}
    for canonical, aliases in canonical_aliases.items():
        table[canonical] = canonical
        for alias in aliases:
            table[alias] = canonical
    return table


DB_SYSTEMS: Dict[str, str] = _alias_table({
    "postgresql": ("postgres", "pg"),
    "mysql": (),
    "mariadb": (),
    "mssql": ("sqlserver",),
    "oracle": (),
    "sqlite": (),
    "mongodb": ("mongo",),
    "dynamodb": (),
    "cassandra": (),
    "couchdb": (),
    "firestore": (),
    "cosmosdb": (),
    "redis": (),
    "memcached": (),
    "elasticache": (),
    "elasticsearch": (),
    "opensearch": (),
    "snowflake": (),
    "bigquery": (),
    "redshift": (),
    "databricks": (),
    "athena": (),
    "synapse": (),
    "influxdb": (),
    "timescaledb": (),
    "neo4j": (),
    "neptune": (),
})

STORAGE_SYSTEMS: Dict[str, str] = _alias_table({
    "s3": ("aws_s3",),
    "gcs": ("google_cloud_storage",),
    "azure_blob": ("blob",),
    "minio": (),
    "ceph": (),
    "nfs": (),
    "efs": (),
})

MESSAGING_SYSTEMS: Dict[str, str] = _alias_table({
    "sqs": ("aws_sqs",),
    "sns": (),
    "kinesis": (),
    "eventbridge": (),
    "pubsub": ("google_pubsub",),
    "servicebus": ("azure_servicebus",),
    "eventhub": (),
    "kafka": (),
    "rabbitmq": (),
    "nats": (),
    "redis_pubsub": (),
    "celery": (),
})


class DBOperation:
    """Canonical database operation names (uppercase, SQL-style)."""

    SELECT = "SELECT"
    INSERT = "INSERT"
    UPDATE = "UPDATE"
    DELETE = "DELETE"
    UPSERT = "UPSERT"
    MERGE = "MERGE"
    CREATE = "CREATE"
    DROP = "DROP"
    ALTER = "ALTER"
    INDEX = "INDEX"
    TRANSACTION = "TRANSACTION"
    BATCH = "BATCH"


class StorageOperation:
    """Canonical object-storage operation names."""

    GET = "GET"
    PUT = "PUT"
    DELETE = "DELETE"
    LIST = "LIST"
    HEAD = "HEAD"
    COPY = "COPY"
    MULTIPART_UPLOAD = "MULTIPART_UPLOAD"


class MessagingOperation:
    """Canonical messaging operation names (lowercase, per OTel semconv)."""

    PUBLISH = "publish"
    CONSUME = "consume"
    RECEIVE = "receive"
    SEND = "send"
    SUBSCRIBE = "subscribe"
@dataclass
class DBTracker:
    """Mutable result carrier for a database operation span.

    Instances are yielded by :func:`track_db_operation`; callers report row
    and byte counts back through the setters, which mirror the values onto
    the active span as ``botanu.data.*`` attributes.
    """

    system: str  # normalized database system name
    operation: str  # e.g. SELECT / INSERT
    span: Optional[Span] = field(default=None, repr=False)
    start_time: datetime = field(default_factory=lambda: datetime.now(timezone.utc))

    rows_returned: int = 0
    rows_affected: int = 0
    bytes_read: int = 0
    bytes_written: int = 0

    def set_result(
        self,
        rows_returned: int = 0,
        rows_affected: int = 0,
        bytes_read: int = 0,
        bytes_written: int = 0,
    ) -> DBTracker:
        """Record result metrics; zero values are kept off the span."""
        self.rows_returned = rows_returned
        self.rows_affected = rows_affected
        self.bytes_read = bytes_read
        self.bytes_written = bytes_written
        if self.span:
            if rows_returned > 0:
                self.span.set_attribute("botanu.data.rows_returned", rows_returned)
            if rows_affected > 0:
                self.span.set_attribute("botanu.data.rows_affected", rows_affected)
            if bytes_read > 0:
                self.span.set_attribute("botanu.data.bytes_read", bytes_read)
            if bytes_written > 0:
                self.span.set_attribute("botanu.data.bytes_written", bytes_written)
        return self

    def set_table(self, table_name: str, schema: Optional[str] = None) -> DBTracker:
        """Record the table (and optional schema) being accessed."""
        if self.span:
            self.span.set_attribute("db.collection.name", table_name)
            if schema:
                self.span.set_attribute("db.schema", schema)
        return self

    def set_query_id(self, query_id: str) -> DBTracker:
        """Record the warehouse-assigned query id."""
        if self.span:
            self.span.set_attribute("botanu.warehouse.query_id", query_id)
        return self

    def set_bytes_scanned(self, bytes_scanned: int) -> DBTracker:
        """Record bytes scanned (also mirrored into ``bytes_read``)."""
        self.bytes_read = bytes_scanned
        if self.span:
            self.span.set_attribute("botanu.warehouse.bytes_scanned", bytes_scanned)
        return self

    def set_error(self, error: Exception) -> DBTracker:
        """Mark the span as failed and record the exception."""
        if self.span:
            self.span.set_status(Status(StatusCode.ERROR, str(error)))
            self.span.set_attribute("botanu.data.error", type(error).__name__)
            self.span.record_exception(error)
        return self

    def add_metadata(self, **kwargs: Any) -> DBTracker:
        """Attach arbitrary metadata; un-prefixed keys get ``botanu.data.``."""
        if self.span:
            for key, value in kwargs.items():
                attr_key = key if key.startswith("botanu.") else f"botanu.data.{key}"
                self.span.set_attribute(attr_key, value)
        return self

    def _finalize(self) -> None:
        # Wall-clock duration; recorded even when the operation raised.
        if not self.span:
            return
        duration_ms = (datetime.now(timezone.utc) - self.start_time).total_seconds() * 1000
        self.span.set_attribute("botanu.data.duration_ms", duration_ms)


@contextmanager
def track_db_operation(
    system: str,
    operation: str,
    database: Optional[str] = None,
    **kwargs: Any,
) -> Generator[DBTracker, None, None]:
    """Track a database operation as a CLIENT span.

    Args:
        system: Database system (postgresql, mysql, mongodb, ...); aliases
            are normalized via ``DB_SYSTEMS``.
        operation: Type of operation (SELECT, INSERT, ...).
        database: Database name (optional).
        **kwargs: Extra attributes recorded on the span.

    Yields:
        A :class:`DBTracker` bound to the active span.
    """
    tracer = trace.get_tracer("botanu.data")
    normalized_system = DB_SYSTEMS.get(system.lower(), system.lower())

    with tracer.start_as_current_span(
        name=f"db.{normalized_system}.{operation.lower()}",
        kind=SpanKind.CLIENT,
    ) as span:
        span.set_attribute("db.system", normalized_system)
        span.set_attribute("db.operation", operation.upper())
        span.set_attribute("botanu.vendor", normalized_system)
        if database:
            span.set_attribute("db.name", database)
        for key, value in kwargs.items():
            # CONSISTENCY FIX: honor already-prefixed keys, matching
            # DBTracker.add_metadata, instead of double-prefixing them.
            attr_key = key if key.startswith("botanu.") else f"botanu.data.{key}"
            span.set_attribute(attr_key, value)

        tracker = DBTracker(system=normalized_system, operation=operation, span=span)
        try:
            yield tracker
        except Exception as exc:
            tracker.set_error(exc)
            raise
        finally:
            tracker._finalize()
self.span: + self.span.set_status(Status(StatusCode.ERROR, str(error))) + self.span.set_attribute("botanu.storage.error", type(error).__name__) + self.span.record_exception(error) + return self + + def add_metadata(self, **kwargs: Any) -> StorageTracker: + if self.span: + for key, value in kwargs.items(): + attr_key = key if key.startswith("botanu.") else f"botanu.storage.{key}" + self.span.set_attribute(attr_key, value) + return self + + def _finalize(self) -> None: + if not self.span: + return + duration_ms = (datetime.now(timezone.utc) - self.start_time).total_seconds() * 1000 + self.span.set_attribute("botanu.storage.duration_ms", duration_ms) + + +@contextmanager +def track_storage_operation( + system: str, + operation: str, + **kwargs: Any, +) -> Generator[StorageTracker, None, None]: + """Track a storage operation. + + Args: + system: Storage system (s3, gcs, azure_blob, …). + operation: Type of operation (GET, PUT, DELETE, …). + """ + tracer = trace.get_tracer("botanu.storage") + normalized_system = STORAGE_SYSTEMS.get(system.lower(), system.lower()) + + with tracer.start_as_current_span( + name=f"storage.{normalized_system}.{operation.lower()}", + kind=SpanKind.CLIENT, + ) as span: + span.set_attribute("botanu.storage.system", normalized_system) + span.set_attribute("botanu.storage.operation", operation.upper()) + span.set_attribute("botanu.vendor", normalized_system) + for key, value in kwargs.items(): + span.set_attribute(f"botanu.storage.{key}", value) + + tracker = StorageTracker(system=normalized_system, operation=operation, span=span) + try: + yield tracker + except Exception as exc: + tracker.set_error(exc) + raise + finally: + tracker._finalize() + + +# ========================================================================= +# Messaging Tracker +# ========================================================================= + + +@dataclass +class MessagingTracker: + """Tracks messaging operations.""" + + system: str + operation: str + destination: 
@dataclass
class MessagingTracker:
    """Mutable result carrier for a messaging operation span."""

    system: str  # normalized messaging system name
    operation: str  # e.g. publish / consume
    destination: str  # queue or topic name
    span: Optional[Span] = field(default=None, repr=False)
    start_time: datetime = field(default_factory=lambda: datetime.now(timezone.utc))

    message_count: int = 0
    bytes_transferred: int = 0

    def set_result(
        self,
        message_count: int = 0,
        bytes_transferred: int = 0,
    ) -> MessagingTracker:
        """Record message/byte counts; zero values are kept off the span."""
        self.message_count = message_count
        self.bytes_transferred = bytes_transferred
        if self.span:
            if message_count > 0:
                self.span.set_attribute("botanu.messaging.message_count", message_count)
            if bytes_transferred > 0:
                self.span.set_attribute("botanu.messaging.bytes_transferred", bytes_transferred)
        return self

    def set_error(self, error: Exception) -> MessagingTracker:
        """Mark the span as failed and record the exception."""
        if self.span:
            self.span.set_status(Status(StatusCode.ERROR, str(error)))
            self.span.set_attribute("botanu.messaging.error", type(error).__name__)
            self.span.record_exception(error)
        return self

    def add_metadata(self, **kwargs: Any) -> MessagingTracker:
        """Attach metadata; un-prefixed keys get ``botanu.messaging.``."""
        if self.span:
            for key, value in kwargs.items():
                attr_key = key if key.startswith("botanu.") else f"botanu.messaging.{key}"
                self.span.set_attribute(attr_key, value)
        return self

    def _finalize(self) -> None:
        # Wall-clock duration; recorded even when the operation raised.
        if not self.span:
            return
        duration_ms = (datetime.now(timezone.utc) - self.start_time).total_seconds() * 1000
        self.span.set_attribute("botanu.messaging.duration_ms", duration_ms)


@contextmanager
def track_messaging_operation(
    system: str,
    operation: str,
    destination: str,
    **kwargs: Any,
) -> Generator[MessagingTracker, None, None]:
    """Track a messaging operation as a PRODUCER or CONSUMER span.

    Args:
        system: Messaging system (sqs, kafka, pubsub, ...); aliases are
            normalized via ``MESSAGING_SYSTEMS``.
        operation: Type of operation (publish, consume, ...).
        destination: Queue/topic name.
        **kwargs: Extra attributes recorded on the span.

    Yields:
        A :class:`MessagingTracker` bound to the active span.
    """
    tracer = trace.get_tracer("botanu.messaging")
    normalized_system = MESSAGING_SYSTEMS.get(system.lower(), system.lower())

    # BUG FIX: the producer/consumer check previously compared the raw
    # operation string, so an uppercase "PUBLISH" was misclassified as a
    # consumer even though the span name/attributes are lowercased below.
    op = operation.lower()
    span_kind = SpanKind.PRODUCER if op in ("publish", "send") else SpanKind.CONSUMER

    with tracer.start_as_current_span(
        name=f"messaging.{normalized_system}.{op}",
        kind=span_kind,
    ) as span:
        span.set_attribute("messaging.system", normalized_system)
        span.set_attribute("messaging.operation", op)
        span.set_attribute("messaging.destination.name", destination)
        span.set_attribute("botanu.vendor", normalized_system)
        for key, value in kwargs.items():
            # CONSISTENCY FIX: honor already-prefixed keys, matching
            # MessagingTracker.add_metadata, instead of double-prefixing.
            attr_key = key if key.startswith("botanu.") else f"botanu.messaging.{key}"
            span.set_attribute(attr_key, value)

        tracker = MessagingTracker(
            system=normalized_system,
            operation=operation,
            destination=destination,
            span=span,
        )
        try:
            yield tracker
        except Exception as exc:
            tracker.set_error(exc)
            raise
        finally:
            tracker._finalize()


def set_data_metrics(
    rows_returned: int = 0,
    rows_affected: int = 0,
    bytes_read: int = 0,
    bytes_written: int = 0,
    objects_count: int = 0,
    span: Optional[Span] = None,
) -> None:
    """Set data-operation metrics on the given (or current) span.

    Zero values are skipped; a non-recording or missing span is a no-op.
    """
    target_span = span or trace.get_current_span()
    if not target_span or not target_span.is_recording():
        return

    if rows_returned > 0:
        target_span.set_attribute("botanu.data.rows_returned", rows_returned)
    if rows_affected > 0:
        target_span.set_attribute("botanu.data.rows_affected", rows_affected)
    if bytes_read > 0:
        target_span.set_attribute("botanu.data.bytes_read", bytes_read)
    if bytes_written > 0:
        target_span.set_attribute("botanu.data.bytes_written", bytes_written)
    if objects_count > 0:
        target_span.set_attribute("botanu.data.objects_count", objects_count)
+ bytes_scanned: int, + rows_returned: int = 0, + partitions_scanned: int = 0, + span: Optional[Span] = None, +) -> None: + """Set data warehouse query metrics on the current span.""" + target_span = span or trace.get_current_span() + if not target_span or not target_span.is_recording(): + return + + target_span.set_attribute("botanu.warehouse.query_id", query_id) + target_span.set_attribute("botanu.warehouse.bytes_scanned", bytes_scanned) + if rows_returned > 0: + target_span.set_attribute("botanu.data.rows_returned", rows_returned) + if partitions_scanned > 0: + target_span.set_attribute("botanu.warehouse.partitions_scanned", partitions_scanned) diff --git a/src/botanu/tracking/ledger.py b/src/botanu/tracking/ledger.py new file mode 100644 index 0000000..3fe982a --- /dev/null +++ b/src/botanu/tracking/ledger.py @@ -0,0 +1,420 @@ +# SPDX-FileCopyrightText: 2026 The Botanu Authors +# SPDX-License-Identifier: Apache-2.0 + +"""Attempt Ledger — durable event log for invisible cost tracking. + +An append-only event log that is NEVER sampled and survives crashes. +Uses OTel Logs API to emit structured events. 
+ +Event Types: +- ``attempt.started``: Run/attempt began +- ``llm.attempted``: LLM call attempt (with tokens, cost) +- ``tool.attempted``: Tool execution attempt +- ``attempt.ended``: Run/attempt completed +- ``cancellation.requested``: Cancellation was requested +- ``zombie.detected``: Work continued after timeout +""" + +from __future__ import annotations + +import logging +import os +import time +from dataclasses import dataclass, field +from enum import Enum +from functools import lru_cache +from typing import Any, Dict, Optional + +from opentelemetry import trace + +logger = logging.getLogger(__name__) + + +class LedgerEventType(str, Enum): + ATTEMPT_STARTED = "attempt.started" + ATTEMPT_ENDED = "attempt.ended" + LLM_ATTEMPTED = "llm.attempted" + TOOL_ATTEMPTED = "tool.attempted" + CANCEL_REQUESTED = "cancellation.requested" + CANCEL_ACKNOWLEDGED = "cancellation.acknowledged" + ZOMBIE_DETECTED = "zombie.detected" + REDELIVERY_DETECTED = "redelivery.detected" + + +class AttemptStatus(str, Enum): + SUCCESS = "success" + ERROR = "error" + TIMEOUT = "timeout" + CANCELLED = "cancelled" + RATE_LIMITED = "rate_limited" + + +@dataclass +class AttemptLedger: + """Durable event ledger for cost tracking. + + Emits structured log records that are never sampled, providing a + reliable source of truth for attempt counts, token costs, and zombie work. 
+ """ + + service_name: str = field( + default_factory=lambda: os.getenv("OTEL_SERVICE_NAME", "unknown"), + ) + otlp_endpoint: Optional[str] = field(default=None) + _logger: Any = field(default=None, init=False, repr=False) + _initialized: bool = field(default=False, init=False) + + def __post_init__(self) -> None: + self._initialize_logger() + + def _initialize_logger(self) -> None: + try: + from opentelemetry._logs import get_logger_provider, set_logger_provider + from opentelemetry.exporter.otlp.proto.http._log_exporter import OTLPLogExporter + from opentelemetry.sdk._logs import LoggerProvider + from opentelemetry.sdk._logs.export import BatchLogRecordProcessor + + provider = get_logger_provider() + + endpoint = self.otlp_endpoint + if not endpoint: + traces_endpoint = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT") + if traces_endpoint: + endpoint = f"{traces_endpoint.rstrip('/')}/v1/logs" + else: + endpoint = "http://localhost:4318/v1/logs" + + if provider is None or not hasattr(provider, "get_logger"): + new_provider = LoggerProvider() + exporter = OTLPLogExporter(endpoint=endpoint) + new_provider.add_log_record_processor(BatchLogRecordProcessor(exporter)) + set_logger_provider(new_provider) + provider = new_provider + + self._logger = provider.get_logger("botanu.attempt_ledger") + self._initialized = True + logger.debug("AttemptLedger initialized with endpoint: %s", endpoint) + + except Exception as exc: + logger.warning("Failed to initialize AttemptLedger: %s", exc) + self._initialized = False + + def _get_trace_context(self) -> Dict[str, str]: + span = trace.get_current_span() + ctx = span.get_span_context() if span else None + if ctx and ctx.is_valid: + return { + "trace_id": format(ctx.trace_id, "032x"), + "span_id": format(ctx.span_id, "016x"), + } + return {} + + def _emit( + self, + event_type: LedgerEventType, + severity: Any, + attributes: Dict[str, Any], + ) -> None: + if not self._initialized or not self._logger: + return + + try: + from 
opentelemetry.sdk._logs import LogRecord + + attrs = { + "event.name": event_type.value, + "service.name": self.service_name, + "timestamp_ms": int(time.time() * 1000), + **self._get_trace_context(), + **attributes, + } + + self._logger.emit( + LogRecord( + timestamp=int(time.time_ns()), + severity_number=severity, + severity_text=severity.name, + body=event_type.value, + attributes=attrs, + ) + ) + except Exception as exc: + logger.debug("Failed to emit ledger event: %s", exc) + + # ----------------------------------------------------------------- + # Attempt Lifecycle + # ----------------------------------------------------------------- + + def attempt_started( + self, + run_id: str, + use_case: str, + attempt: int = 1, + root_run_id: Optional[str] = None, + workflow: Optional[str] = None, + tenant_id: Optional[str] = None, + deadline_ts: Optional[float] = None, + ) -> None: + from opentelemetry._logs import SeverityNumber + + self._emit( + LedgerEventType.ATTEMPT_STARTED, + SeverityNumber.INFO, + { + "botanu.run_id": run_id, + "botanu.use_case": use_case, + "botanu.attempt": attempt, + "botanu.root_run_id": root_run_id or run_id, + "botanu.workflow": workflow, + "botanu.tenant_id": tenant_id, + "botanu.deadline_ts": deadline_ts, + }, + ) + + def attempt_ended( + self, + run_id: str, + status: str, + duration_ms: Optional[float] = None, + error_class: Optional[str] = None, + reason_code: Optional[str] = None, + ) -> None: + from opentelemetry._logs import SeverityNumber + + self._emit( + LedgerEventType.ATTEMPT_ENDED, + SeverityNumber.INFO if status == "success" else SeverityNumber.WARN, + { + "botanu.run_id": run_id, + "status": status, + "duration_ms": duration_ms, + "error_class": error_class, + "reason_code": reason_code, + }, + ) + + # ----------------------------------------------------------------- + # LLM Attempt Events + # ----------------------------------------------------------------- + + def llm_attempted( + self, + run_id: str, + provider: str, + 
model: str, + operation: str = "chat", + attempt_number: int = 1, + input_tokens: int = 0, + output_tokens: int = 0, + cached_tokens: int = 0, + duration_ms: Optional[float] = None, + status: str = "success", + error_class: Optional[str] = None, + provider_request_id: Optional[str] = None, + estimated_cost_usd: Optional[float] = None, + ) -> None: + from opentelemetry._logs import SeverityNumber + + self._emit( + LedgerEventType.LLM_ATTEMPTED, + SeverityNumber.INFO if status == "success" else SeverityNumber.WARN, + { + "botanu.run_id": run_id, + "gen_ai.provider.name": provider, + "gen_ai.request.model": model, + "gen_ai.operation.name": operation, + "botanu.attempt": attempt_number, + "gen_ai.usage.input_tokens": input_tokens, + "gen_ai.usage.output_tokens": output_tokens, + "botanu.usage.cached_tokens": cached_tokens, + "duration_ms": duration_ms, + "status": status, + "error_class": error_class, + "gen_ai.response.id": provider_request_id, + "botanu.cost.estimated_usd": estimated_cost_usd, + }, + ) + + def tool_attempted( + self, + run_id: str, + tool_name: str, + tool_call_id: Optional[str] = None, + attempt_number: int = 1, + duration_ms: Optional[float] = None, + status: str = "success", + error_class: Optional[str] = None, + items_returned: int = 0, + bytes_processed: int = 0, + ) -> None: + from opentelemetry._logs import SeverityNumber + + self._emit( + LedgerEventType.TOOL_ATTEMPTED, + SeverityNumber.INFO if status == "success" else SeverityNumber.WARN, + { + "botanu.run_id": run_id, + "gen_ai.tool.name": tool_name, + "gen_ai.tool.call.id": tool_call_id, + "botanu.attempt": attempt_number, + "duration_ms": duration_ms, + "status": status, + "error_class": error_class, + "items_returned": items_returned, + "bytes_processed": bytes_processed, + }, + ) + + # ----------------------------------------------------------------- + # Cancellation & Zombie Detection + # ----------------------------------------------------------------- + + def cancel_requested( + 
self, + run_id: str, + reason: str = "user", + requested_at_ms: Optional[float] = None, + ) -> None: + from opentelemetry._logs import SeverityNumber + + self._emit( + LedgerEventType.CANCEL_REQUESTED, + SeverityNumber.WARN, + { + "botanu.run_id": run_id, + "cancellation.reason": reason, + "cancellation.requested_at_ms": requested_at_ms or int(time.time() * 1000), + }, + ) + + def cancel_acknowledged( + self, + run_id: str, + acknowledged_by: str, + latency_ms: Optional[float] = None, + ) -> None: + from opentelemetry._logs import SeverityNumber + + self._emit( + LedgerEventType.CANCEL_ACKNOWLEDGED, + SeverityNumber.INFO, + { + "botanu.run_id": run_id, + "cancellation.acknowledged_by": acknowledged_by, + "cancellation.latency_ms": latency_ms, + }, + ) + + def zombie_detected( + self, + run_id: str, + deadline_ts: float, + actual_end_ts: float, + zombie_duration_ms: float, + component: str, + ) -> None: + from opentelemetry._logs import SeverityNumber + + self._emit( + LedgerEventType.ZOMBIE_DETECTED, + SeverityNumber.ERROR, + { + "botanu.run_id": run_id, + "deadline_ts": deadline_ts, + "actual_end_ts": actual_end_ts, + "zombie_duration_ms": zombie_duration_ms, + "zombie_component": component, + }, + ) + + def redelivery_detected( + self, + run_id: str, + queue_name: str, + delivery_count: int, + original_message_id: Optional[str] = None, + ) -> None: + from opentelemetry._logs import SeverityNumber + + self._emit( + LedgerEventType.REDELIVERY_DETECTED, + SeverityNumber.WARN, + { + "botanu.run_id": run_id, + "queue.name": queue_name, + "delivery_count": delivery_count, + "original_message_id": original_message_id, + }, + ) + + # ----------------------------------------------------------------- + # Lifecycle + # ----------------------------------------------------------------- + + def flush(self, timeout_ms: int = 5000) -> bool: + if not self._initialized: + return True + try: + from opentelemetry._logs import get_logger_provider + + provider = get_logger_provider() 
+ if hasattr(provider, "force_flush"): + return provider.force_flush(timeout_ms) + return True + except Exception as exc: + logger.debug("Failed to flush AttemptLedger: %s", exc) + return False + + def shutdown(self) -> None: + if not self._initialized: + return + try: + from opentelemetry._logs import get_logger_provider + + provider = get_logger_provider() + if hasattr(provider, "shutdown"): + provider.shutdown() + except Exception as exc: + logger.debug("Failed to shutdown AttemptLedger: %s", exc) + + +# ========================================================================= +# Global ledger +# ========================================================================= + +_global_ledger: Optional[AttemptLedger] = None + + +@lru_cache(maxsize=1) +def _create_default_ledger() -> AttemptLedger: + """Create default ledger instance (thread-safe via lru_cache).""" + return AttemptLedger() + + +def get_ledger() -> AttemptLedger: + """Get the global attempt ledger instance (thread-safe).""" + if _global_ledger is not None: + return _global_ledger + return _create_default_ledger() + + +def set_ledger(ledger: AttemptLedger) -> None: + """Set the global attempt ledger instance.""" + global _global_ledger + _global_ledger = ledger + + +def record_attempt_started(**kwargs: Any) -> None: + get_ledger().attempt_started(**kwargs) + + +def record_attempt_ended(**kwargs: Any) -> None: + get_ledger().attempt_ended(**kwargs) + + +def record_llm_attempted(**kwargs: Any) -> None: + get_ledger().llm_attempted(**kwargs) + + +def record_tool_attempted(**kwargs: Any) -> None: + get_ledger().tool_attempted(**kwargs) diff --git a/src/botanu/tracking/llm.py b/src/botanu/tracking/llm.py new file mode 100644 index 0000000..9ddccc4 --- /dev/null +++ b/src/botanu/tracking/llm.py @@ -0,0 +1,688 @@ +# SPDX-FileCopyrightText: 2026 The Botanu Authors +# SPDX-License-Identifier: Apache-2.0 + +"""LLM/Model Tracking — Track AI model usage for cost attribution. 
Aligned with OpenTelemetry GenAI Semantic Conventions:
https://opentelemetry.io/docs/specs/semconv/gen-ai/gen-ai-spans/

Usage::

    from botanu.tracking.llm import track_llm_call, track_tool_call

    with track_llm_call(provider="openai", model="gpt-4") as tracker:
        response = openai.chat.completions.create(...)
        tracker.set_tokens(
            input_tokens=response.usage.prompt_tokens,
            output_tokens=response.usage.completion_tokens,
        )
        tracker.set_request_id(response.id)
"""

from __future__ import annotations

import functools
from contextlib import contextmanager
from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import Any, Dict, Generator, List, Optional

from opentelemetry import metrics, trace
from opentelemetry.trace import Span, SpanKind, Status, StatusCode

# =========================================================================
# OTel GenAI Semantic Convention Attribute Names
# =========================================================================


class GenAIAttributes:
    """OpenTelemetry GenAI Semantic Convention attribute names.

    Constant holder only — never instantiated. Values mirror the attribute
    keys defined by the OTel ``gen_ai.*`` semantic conventions so that spans
    emitted here interoperate with any GenAI-aware backend.
    """

    OPERATION_NAME = "gen_ai.operation.name"
    PROVIDER_NAME = "gen_ai.provider.name"
    REQUEST_MODEL = "gen_ai.request.model"
    RESPONSE_MODEL = "gen_ai.response.model"
    USAGE_INPUT_TOKENS = "gen_ai.usage.input_tokens"
    USAGE_OUTPUT_TOKENS = "gen_ai.usage.output_tokens"
    REQUEST_TEMPERATURE = "gen_ai.request.temperature"
    REQUEST_TOP_P = "gen_ai.request.top_p"
    REQUEST_MAX_TOKENS = "gen_ai.request.max_tokens"
    REQUEST_STOP_SEQUENCES = "gen_ai.request.stop_sequences"
    REQUEST_FREQUENCY_PENALTY = "gen_ai.request.frequency_penalty"
    REQUEST_PRESENCE_PENALTY = "gen_ai.request.presence_penalty"
    RESPONSE_ID = "gen_ai.response.id"
    RESPONSE_FINISH_REASONS = "gen_ai.response.finish_reasons"
    TOOL_NAME = "gen_ai.tool.name"
    TOOL_CALL_ID = "gen_ai.tool.call.id"
    # error.type is a general (non-gen_ai) OTel attribute, kept here for
    # convenience since every tracker in this module sets it on failure.
    ERROR_TYPE = "error.type"


class BotanuAttributes:
    """Botanu-specific attributes for cost attribution.

    Vendor-prefixed (``botanu.*``) keys covering data the OTel GenAI
    conventions do not model: cache token splits, retry attempts, streaming,
    and tool-result sizing used downstream for cost reconstruction.
    """

    PROVIDER_REQUEST_ID = "botanu.provider.request_id"
    CLIENT_REQUEST_ID = "botanu.provider.client_request_id"
    TOKENS_CACHED = "botanu.usage.cached_tokens"
    TOKENS_CACHED_READ = "botanu.usage.cache_read_tokens"
    TOKENS_CACHED_WRITE = "botanu.usage.cache_write_tokens"
    STREAMING = "botanu.request.streaming"
    CACHE_HIT = "botanu.request.cache_hit"
    ATTEMPT_NUMBER = "botanu.request.attempt"
    TOOL_SUCCESS = "botanu.tool.success"
    TOOL_ITEMS_RETURNED = "botanu.tool.items_returned"
    TOOL_BYTES_PROCESSED = "botanu.tool.bytes_processed"
    TOOL_DURATION_MS = "botanu.tool.duration_ms"
    VENDOR = "botanu.vendor"


# =========================================================================
# Provider name mapping
# =========================================================================

# Maps caller-supplied provider aliases (lowercased) to the canonical
# gen_ai.provider.name value. Unknown providers pass through lowercased.
LLM_PROVIDERS: Dict[str, str] = {
    "openai": "openai",
    "azure_openai": "azure.openai",
    "azure-openai": "azure.openai",
    "azureopenai": "azure.openai",
    "anthropic": "anthropic",
    "claude": "anthropic",
    "bedrock": "aws.bedrock",
    "aws_bedrock": "aws.bedrock",
    "amazon_bedrock": "aws.bedrock",
    "vertex": "gcp.vertex_ai",
    "vertexai": "gcp.vertex_ai",
    "vertex_ai": "gcp.vertex_ai",
    "gcp_vertex": "gcp.vertex_ai",
    "gemini": "gcp.vertex_ai",
    "google": "gcp.vertex_ai",
    "cohere": "cohere",
    "mistral": "mistral",
    "mistralai": "mistral",
    "together": "together",
    "togetherai": "together",
    "groq": "groq",
    "replicate": "replicate",
    "ollama": "ollama",
    "huggingface": "huggingface",
    "hf": "huggingface",
    "fireworks": "fireworks",
    "perplexity": "perplexity",
}


class ModelOperation:
    """GenAI operation types per OTel semconv.

    String constants used both as ``gen_ai.operation.name`` and as the
    first half of the span name (``"{operation} {model}"``).
    """

    CHAT = "chat"
    TEXT_COMPLETION = "text_completion"
    EMBEDDINGS = "embeddings"
    GENERATE_CONTENT = "generate_content"
    EXECUTE_TOOL = "execute_tool"
    CREATE_AGENT = "create_agent"
    INVOKE_AGENT = "invoke_agent"
    RERANK = "rerank"
    IMAGE_GENERATION = "image_generation"
    IMAGE_EDIT = "image_edit"
    SPEECH_TO_TEXT = "speech_to_text"
    TEXT_TO_SPEECH = "text_to_speech"
    MODERATION = "moderation"

    # Aliases — alternate spellings mapping onto the canonical values above.
    COMPLETION = "text_completion"
    EMBEDDING = "embeddings"
    FUNCTION_CALL = "execute_tool"
    TOOL_USE = "execute_tool"


# =========================================================================
# GenAI Metrics
# =========================================================================

# Module-level instruments: created once at import; recorded from
# _record_token_metrics/_record_duration_metric and LLMTracker._finalize.
_meter = metrics.get_meter("botanu.gen_ai")

_token_usage_histogram = _meter.create_histogram(
    name="gen_ai.client.token.usage",
    description="Number of input and output tokens used",
    unit="{token}",
)

_operation_duration_histogram = _meter.create_histogram(
    name="gen_ai.client.operation.duration",
    description="GenAI operation duration",
    unit="s",
)

_attempt_counter = _meter.create_counter(
    name="botanu.gen_ai.attempts",
    description="Number of request attempts (including retries)",
    unit="{attempt}",
)


def _record_token_metrics(
    provider: str,
    model: str,
    operation: str,
    input_tokens: int,
    output_tokens: int,
    error_type: Optional[str] = None,
) -> None:
    # Records input/output token counts as two histogram points that differ
    # only in gen_ai.token.type. Zero counts are skipped so that calls which
    # never learned token usage do not pollute the distribution.
    base_attrs: Dict[str, str] = {
        GenAIAttributes.OPERATION_NAME: operation,
        GenAIAttributes.PROVIDER_NAME: provider,
        GenAIAttributes.REQUEST_MODEL: model,
    }
    if error_type:
        base_attrs[GenAIAttributes.ERROR_TYPE] = error_type

    if input_tokens > 0:
        _token_usage_histogram.record(
            input_tokens,
            {**base_attrs, "gen_ai.token.type": "input"},
        )
    if output_tokens > 0:
        _token_usage_histogram.record(
            output_tokens,
            {**base_attrs, "gen_ai.token.type": "output"},
        )


def _record_duration_metric(
    provider: str,
    model: str,
    operation: str,
    duration_seconds: float,
    error_type: Optional[str] = None,
) -> None:
    # Records end-to-end operation duration (seconds), tagged with the same
    # identity attributes as the token histogram, plus error.type on failure.
    attrs: Dict[str, str] = {
        GenAIAttributes.OPERATION_NAME: operation,
        GenAIAttributes.PROVIDER_NAME: provider,
        GenAIAttributes.REQUEST_MODEL: model,
    }
    if error_type:
        attrs[GenAIAttributes.ERROR_TYPE] = error_type

    _operation_duration_histogram.record(duration_seconds, attrs)


# =========================================================================
# LLM Tracker
# =========================================================================


@dataclass
class LLMTracker:
    """Context manager for tracking LLM calls with OTel GenAI semconv.

    Handed to callers by :func:`track_llm_call`. Setters both cache the
    value on the dataclass and (when a span is attached) mirror it onto the
    span as an attribute; all setters return ``self`` for chaining.
    """

    # Identity — normalized provider, requested model, operation kind.
    provider: str
    model: str
    operation: str = ModelOperation.CHAT
    # Span to annotate; None means "record fields only, no span attributes".
    span: Optional[Span] = field(default=None, repr=False)
    # Wall-clock start; _finalize() derives duration from it.
    start_time: datetime = field(default_factory=lambda: datetime.now(timezone.utc))

    # Token usage, filled in via set_tokens().
    input_tokens: int = 0
    output_tokens: int = 0
    cached_tokens: int = 0
    cache_read_tokens: int = 0
    cache_write_tokens: int = 0

    # Response/bookkeeping fields, filled in via the setters below.
    provider_request_id: Optional[str] = None
    client_request_id: Optional[str] = None
    response_model: Optional[str] = None
    finish_reason: Optional[str] = None
    is_streaming: bool = False
    cache_hit: bool = False
    attempt_number: int = 1
    error_type: Optional[str] = None

    def set_tokens(
        self,
        input_tokens: int = 0,
        output_tokens: int = 0,
        cached_tokens: int = 0,
        cache_read_tokens: int = 0,
        cache_write_tokens: int = 0,
    ) -> LLMTracker:
        """Set token counts from model response."""
        self.input_tokens = input_tokens
        self.output_tokens = output_tokens
        # cached_tokens falls back to cache_read_tokens when only the
        # read-side split is provided (e.g. Anthropic-style usage blocks).
        self.cached_tokens = cached_tokens or cache_read_tokens
        self.cache_read_tokens = cache_read_tokens
        self.cache_write_tokens = cache_write_tokens

        if self.span:
            self.span.set_attribute(GenAIAttributes.USAGE_INPUT_TOKENS, input_tokens)
            self.span.set_attribute(GenAIAttributes.USAGE_OUTPUT_TOKENS, output_tokens)
            # Cache-related attributes are only set when non-zero.
            if self.cached_tokens > 0:
                self.span.set_attribute(BotanuAttributes.TOKENS_CACHED, self.cached_tokens)
            if cache_read_tokens > 0:
                self.span.set_attribute(BotanuAttributes.TOKENS_CACHED_READ, cache_read_tokens)
            if cache_write_tokens > 0:
                self.span.set_attribute(BotanuAttributes.TOKENS_CACHED_WRITE, cache_write_tokens)
        return self

    def set_request_id(
        self,
        provider_request_id: Optional[str] = None,
        client_request_id: Optional[str] = None,
    ) -> LLMTracker:
        """Set provider request IDs for billing reconciliation."""
        if provider_request_id:
            self.provider_request_id = provider_request_id
            if self.span:
                # Written under both the semconv key and the botanu.* key so
                # either attribute family can be joined against billing data.
                self.span.set_attribute(GenAIAttributes.RESPONSE_ID, provider_request_id)
                self.span.set_attribute(BotanuAttributes.PROVIDER_REQUEST_ID, provider_request_id)
        if client_request_id:
            self.client_request_id = client_request_id
            if self.span:
                self.span.set_attribute(BotanuAttributes.CLIENT_REQUEST_ID, client_request_id)
        return self

    def set_response_model(self, model: str) -> LLMTracker:
        """Set the actual model used in the response."""
        self.response_model = model
        if self.span:
            self.span.set_attribute(GenAIAttributes.RESPONSE_MODEL, model)
        return self

    def set_finish_reason(self, reason: str) -> LLMTracker:
        """Set the finish/stop reason from the response."""
        self.finish_reason = reason
        if self.span:
            # Semconv defines finish_reasons as an array; wrap the single value.
            self.span.set_attribute(GenAIAttributes.RESPONSE_FINISH_REASONS, [reason])
        return self

    def set_streaming(self, is_streaming: bool = True) -> LLMTracker:
        """Mark request as streaming."""
        self.is_streaming = is_streaming
        if self.span:
            self.span.set_attribute(BotanuAttributes.STREAMING, is_streaming)
        return self

    def set_cache_hit(self, cache_hit: bool = True) -> LLMTracker:
        """Mark as cache hit."""
        self.cache_hit = cache_hit
        if self.span:
            self.span.set_attribute(BotanuAttributes.CACHE_HIT, cache_hit)
        return self

    def set_attempt(self, attempt_number: int) -> LLMTracker:
        """Set the attempt number (for retry tracking)."""
        self.attempt_number = attempt_number
        if self.span:
            self.span.set_attribute(BotanuAttributes.ATTEMPT_NUMBER, attempt_number)
        return self

    def set_request_params(
        self,
        temperature: Optional[float] = None,
        top_p: Optional[float] = None,
        max_tokens: Optional[int] = None,
        stop_sequences: Optional[List[str]] = None,
        frequency_penalty: Optional[float] = None,
        presence_penalty: Optional[float] = None,
    ) -> LLMTracker:
        """Set request parameters per OTel GenAI semconv.

        Only parameters explicitly passed (non-None) are recorded; values
        are span-only and not cached on the dataclass.
        """
        if self.span:
            if temperature is not None:
                self.span.set_attribute(GenAIAttributes.REQUEST_TEMPERATURE, temperature)
            if top_p is not None:
                self.span.set_attribute(GenAIAttributes.REQUEST_TOP_P, top_p)
            if max_tokens is not None:
                self.span.set_attribute(GenAIAttributes.REQUEST_MAX_TOKENS, max_tokens)
            if stop_sequences is not None:
                self.span.set_attribute(GenAIAttributes.REQUEST_STOP_SEQUENCES, stop_sequences)
            if frequency_penalty is not None:
                self.span.set_attribute(GenAIAttributes.REQUEST_FREQUENCY_PENALTY, frequency_penalty)
            if presence_penalty is not None:
                self.span.set_attribute(GenAIAttributes.REQUEST_PRESENCE_PENALTY, presence_penalty)
        return self

    def set_error(self, error: Exception) -> LLMTracker:
        """Record an error from the LLM call."""
        self.error_type = type(error).__name__
        if self.span:
            self.span.set_status(Status(StatusCode.ERROR, str(error)))
            self.span.set_attribute(GenAIAttributes.ERROR_TYPE, self.error_type)
            # NOTE(review): start_as_current_span also records the exception by
            # default, so a raised exception may appear twice as a span event —
            # confirm whether that duplication is intended.
            self.span.record_exception(error)
        return self

    def add_metadata(self, **kwargs: Any) -> LLMTracker:
        """Add custom metadata to the span."""
        if self.span:
            for key, value in kwargs.items():
                # Keys outside the botanu./gen_ai. namespaces get a botanu. prefix.
                attr_key = key if key.startswith(("botanu.", "gen_ai.")) else f"botanu.{key}"
                self.span.set_attribute(attr_key, value)
        return self

    def _finalize(self) -> None:
        # Emits the token/duration/attempt metrics exactly once, from the
        # finally-block of track_llm_call(). No-op for span-less trackers.
        if not self.span:
            return

        duration_seconds = (datetime.now(timezone.utc) - self.start_time).total_seconds()

        _record_token_metrics(
            provider=self.provider,
            model=self.model,
            operation=self.operation,
            input_tokens=self.input_tokens,
            output_tokens=self.output_tokens,
            error_type=self.error_type,
        )
        _record_duration_metric(
            provider=self.provider,
            model=self.model,
            operation=self.operation,
            duration_seconds=duration_seconds,
            error_type=self.error_type,
        )
        # One counter increment per call; status reflects set_error() having run.
        _attempt_counter.add(
            1,
            {
                GenAIAttributes.PROVIDER_NAME: self.provider,
                GenAIAttributes.REQUEST_MODEL: self.model,
                GenAIAttributes.OPERATION_NAME: self.operation,
                "status": "error" if self.error_type else "success",
            },
        )


@contextmanager
def track_llm_call(
    provider: str,
    model: str,
    operation: str = ModelOperation.CHAT,
    client_request_id: Optional[str] = None,
    **kwargs: Any,
) -> Generator[LLMTracker, None, None]:
    """Context manager for tracking LLM/model calls with OTel GenAI semconv.

    Opens a CLIENT span named ``"{operation} {model}"``, pre-populates the
    identity attributes, and yields an :class:`LLMTracker` bound to that
    span. Exceptions from the body are recorded via ``set_error`` and
    re-raised; metrics are always flushed via ``_finalize``.

    Args:
        provider: LLM provider (openai, anthropic, bedrock, vertex, …).
        model: Model name/ID (gpt-4, claude-3-opus, …).
        operation: Type of operation (chat, embeddings, text_completion, …).
        client_request_id: Optional client-generated request ID.
        **kwargs: Additional span attributes.

    Yields:
        :class:`LLMTracker` instance.
    """
    tracer = trace.get_tracer("botanu.gen_ai")
    # Aliases (e.g. "claude") collapse to canonical names; unknowns lowercase.
    normalized_provider = LLM_PROVIDERS.get(provider.lower(), provider.lower())
    span_name = f"{operation} {model}"

    with tracer.start_as_current_span(name=span_name, kind=SpanKind.CLIENT) as span:
        span.set_attribute(GenAIAttributes.OPERATION_NAME, operation)
        span.set_attribute(GenAIAttributes.PROVIDER_NAME, normalized_provider)
        span.set_attribute(GenAIAttributes.REQUEST_MODEL, model)
        span.set_attribute(BotanuAttributes.VENDOR, normalized_provider)

        # Extra kwargs become span attributes, namespaced like add_metadata().
        for key, value in kwargs.items():
            attr_key = key if key.startswith(("botanu.", "gen_ai.")) else f"botanu.{key}"
            span.set_attribute(attr_key, value)

        tracker = LLMTracker(
            provider=normalized_provider,
            model=model,
            operation=operation,
            span=span,
        )
        if client_request_id:
            tracker.set_request_id(client_request_id=client_request_id)

        try:
            yield tracker
        except Exception as exc:
            tracker.set_error(exc)
            raise
        finally:
            tracker._finalize()


# =========================================================================
# Tool/Function Call Tracker
#
# =========================================================================

# Tool instruments live beside the GenAI ones on the shared module meter.
_tool_duration_histogram = _meter.create_histogram(
    name="botanu.tool.duration",
    description="Tool execution duration",
    unit="s",
)

_tool_counter = _meter.create_counter(
    name="botanu.tool.executions",
    description="Number of tool executions",
    unit="{execution}",
)


@dataclass
class ToolTracker:
    """Context manager for tracking tool/function calls.

    Yielded by :func:`track_tool_call`; mirrors the LLMTracker pattern —
    setters cache on the dataclass, mirror onto the span, and return
    ``self`` for chaining.
    """

    # Identity of the executed tool.
    tool_name: str
    tool_call_id: Optional[str] = None
    provider: Optional[str] = None
    # Span to annotate; None disables span attributes and metrics.
    span: Optional[Span] = field(default=None, repr=False)
    start_time: datetime = field(default_factory=lambda: datetime.now(timezone.utc))

    # Outcome, filled in via set_result()/set_error().
    success: bool = True
    items_returned: int = 0
    bytes_processed: int = 0
    error_type: Optional[str] = None

    def set_result(
        self,
        success: bool = True,
        items_returned: int = 0,
        bytes_processed: int = 0,
    ) -> ToolTracker:
        """Set tool execution result."""
        self.success = success
        self.items_returned = items_returned
        self.bytes_processed = bytes_processed
        if self.span:
            self.span.set_attribute(BotanuAttributes.TOOL_SUCCESS, success)
            # Zero counts are omitted to keep span attributes sparse.
            if items_returned > 0:
                self.span.set_attribute(BotanuAttributes.TOOL_ITEMS_RETURNED, items_returned)
            if bytes_processed > 0:
                self.span.set_attribute(BotanuAttributes.TOOL_BYTES_PROCESSED, bytes_processed)
        return self

    def set_tool_call_id(self, tool_call_id: str) -> ToolTracker:
        """Set the tool call ID from the LLM response."""
        self.tool_call_id = tool_call_id
        if self.span:
            self.span.set_attribute(GenAIAttributes.TOOL_CALL_ID, tool_call_id)
        return self

    def set_error(self, error: Exception) -> ToolTracker:
        """Record tool execution error."""
        self.success = False
        self.error_type = type(error).__name__
        if self.span:
            self.span.set_status(Status(StatusCode.ERROR, str(error)))
            self.span.set_attribute(GenAIAttributes.ERROR_TYPE, self.error_type)
            self.span.record_exception(error)
        return self

    def add_metadata(self, **kwargs: Any) -> ToolTracker:
        """Add custom metadata to the span.

        Unlike LLMTracker.add_metadata, un-namespaced keys are prefixed
        with ``botanu.tool.`` rather than ``botanu.``.
        """
        if self.span:
            for key, value in kwargs.items():
                attr_key = key if key.startswith(("botanu.", "gen_ai.")) else f"botanu.tool.{key}"
                self.span.set_attribute(attr_key, value)
        return self

    def _finalize(self) -> None:
        # Called from track_tool_call()'s finally-block: stamps the duration
        # attribute (ms) and records duration (s) + execution-count metrics.
        if not self.span:
            return
        duration_seconds = (datetime.now(timezone.utc) - self.start_time).total_seconds()
        self.span.set_attribute(BotanuAttributes.TOOL_DURATION_MS, duration_seconds * 1000)

        attrs: Dict[str, str] = {
            GenAIAttributes.TOOL_NAME: self.tool_name,
            "status": "error" if self.error_type else "success",
        }
        if self.provider:
            attrs[GenAIAttributes.PROVIDER_NAME] = self.provider

        _tool_duration_histogram.record(duration_seconds, attrs)
        _tool_counter.add(1, attrs)


@contextmanager
def track_tool_call(
    tool_name: str,
    tool_call_id: Optional[str] = None,
    provider: Optional[str] = None,
    **kwargs: Any,
) -> Generator[ToolTracker, None, None]:
    """Context manager for tracking tool/function calls.

    Opens an INTERNAL span named ``"execute_tool {tool_name}"`` and yields
    a :class:`ToolTracker` bound to it; errors are recorded and re-raised,
    and metrics are flushed in the finally-block.

    Args:
        tool_name: Name of the tool/function.
        tool_call_id: Tool call ID from the LLM response.
        provider: Tool provider if external (e.g., ``"tavily"``).
        **kwargs: Additional span attributes.

    Yields:
        :class:`ToolTracker` instance.
    """
    tracer = trace.get_tracer("botanu.gen_ai")
    span_name = f"execute_tool {tool_name}"

    with tracer.start_as_current_span(name=span_name, kind=SpanKind.INTERNAL) as span:
        span.set_attribute(GenAIAttributes.OPERATION_NAME, ModelOperation.EXECUTE_TOOL)
        span.set_attribute(GenAIAttributes.TOOL_NAME, tool_name)

        if tool_call_id:
            span.set_attribute(GenAIAttributes.TOOL_CALL_ID, tool_call_id)
        if provider:
            # Same alias normalization as track_llm_call.
            normalized = LLM_PROVIDERS.get(provider.lower(), provider.lower())
            span.set_attribute(GenAIAttributes.PROVIDER_NAME, normalized)
            span.set_attribute(BotanuAttributes.VENDOR, normalized)

        for key, value in kwargs.items():
            attr_key = key if key.startswith(("botanu.", "gen_ai.")) else f"botanu.tool.{key}"
            span.set_attribute(attr_key, value)

        # NOTE: tracker.provider holds the caller-supplied (un-normalized)
        # value; only the span attribute uses the normalized form.
        tracker = ToolTracker(
            tool_name=tool_name,
            tool_call_id=tool_call_id,
            provider=provider,
            span=span,
        )

        try:
            yield tracker
        except Exception as exc:
            tracker.set_error(exc)
            raise
        finally:
            tracker._finalize()


# =========================================================================
# Standalone Helpers
# =========================================================================


def set_llm_attributes(
    provider: str,
    model: str,
    operation: str = ModelOperation.CHAT,
    input_tokens: int = 0,
    output_tokens: int = 0,
    cached_tokens: int = 0,
    streaming: bool = False,
    provider_request_id: Optional[str] = None,
    span: Optional[Span] = None,
) -> None:
    """Set LLM attributes on the current span using OTel GenAI semconv.

    One-shot alternative to track_llm_call for callers that already have a
    span: annotates the given (or current) recording span and records token
    metrics. Silently no-ops when there is no recording span.
    """
    target_span = span or trace.get_current_span()
    if not target_span or not target_span.is_recording():
        return

    normalized_provider = LLM_PROVIDERS.get(provider.lower(), provider.lower())

    target_span.set_attribute(GenAIAttributes.OPERATION_NAME, operation)
    target_span.set_attribute(GenAIAttributes.PROVIDER_NAME, normalized_provider)
    target_span.set_attribute(GenAIAttributes.REQUEST_MODEL, model)
    target_span.set_attribute(BotanuAttributes.VENDOR, normalized_provider)

    if input_tokens > 0:
        target_span.set_attribute(GenAIAttributes.USAGE_INPUT_TOKENS, input_tokens)
    if output_tokens > 0:
        target_span.set_attribute(GenAIAttributes.USAGE_OUTPUT_TOKENS, output_tokens)
    if cached_tokens > 0:
        target_span.set_attribute(BotanuAttributes.TOKENS_CACHED, cached_tokens)
    if streaming:
        target_span.set_attribute(BotanuAttributes.STREAMING, True)
    if provider_request_id:
        target_span.set_attribute(GenAIAttributes.RESPONSE_ID, provider_request_id)
        target_span.set_attribute(BotanuAttributes.PROVIDER_REQUEST_ID, provider_request_id)

    # Token metrics are recorded here; duration is not (no start time known).
    _record_token_metrics(
        provider=normalized_provider,
        model=model,
        operation=operation,
        input_tokens=input_tokens,
        output_tokens=output_tokens,
    )


def set_token_usage(
    input_tokens: int,
    output_tokens: int,
    cached_tokens: int = 0,
    span: Optional[Span] = None,
) -> None:
    """Set token usage on the current span using OTel GenAI semconv.

    Span-attribute-only helper; unlike set_llm_attributes it records no
    metrics. No-ops when there is no recording span.
    """
    target_span = span or trace.get_current_span()
    if not target_span or not target_span.is_recording():
        return

    target_span.set_attribute(GenAIAttributes.USAGE_INPUT_TOKENS, input_tokens)
    target_span.set_attribute(GenAIAttributes.USAGE_OUTPUT_TOKENS, output_tokens)

    if cached_tokens > 0:
        target_span.set_attribute(BotanuAttributes.TOKENS_CACHED, cached_tokens)


def llm_instrumented(
    provider: str,
    model_param: str = "model",
    tokens_from_response: bool = True,
) -> Any:
    """Decorator to auto-instrument LLM client methods.

    Wraps a sync callable in track_llm_call, marking streaming from a
    ``stream`` kwarg and reading token counts from ``response.usage``.

    Args:
        provider: LLM provider name.
        model_param: Name of the parameter containing the model name.
        tokens_from_response: Whether to extract tokens from ``response.usage``.
    """

    def decorator(func: Any) -> Any:
        @functools.wraps(func)
        def wrapper(*args: Any, **kwargs: Any) -> Any:
            # NOTE(review): falls back to args[1] for the model, which assumes
            # a (self, model, ...) positional shape — confirm against the
            # clients this decorator is applied to.
            model = kwargs.get(model_param) or (args[1] if len(args) > 1 else "unknown")

            with track_llm_call(provider, model) as tracker:
                if kwargs.get("stream"):
                    tracker.set_streaming(True)

                response = func(*args, **kwargs)

                if tokens_from_response and hasattr(response, "usage"):
                    usage = response.usage
                    # Accepts both OpenAI-style (prompt/completion_tokens) and
                    # Anthropic-style (input/output_tokens) usage objects; the
                    # `or` also covers attributes present but set to None/0.
                    tracker.set_tokens(
                        input_tokens=getattr(usage, "prompt_tokens", 0) or getattr(usage, "input_tokens", 0),
                        output_tokens=getattr(usage, "completion_tokens", 0) or getattr(usage, "output_tokens", 0),
                    )

                return response

        return wrapper

    return decorator


# =========================================================================
# [patch boundary] diff --git a/tests/conftest.py b/tests/conftest.py
# new file: tests/conftest.py
# =========================================================================

# SPDX-FileCopyrightText: 2026 The Botanu Authors
# SPDX-License-Identifier: Apache-2.0

"""Shared test fixtures for Botanu SDK tests."""

from __future__ import annotations

import pytest
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import SimpleSpanProcessor
from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter

# Module-level provider and exporter to avoid "cannot override" warnings:
# the OTel API only honors the first set_tracer_provider() call, so the test
# suite installs exactly one provider for the whole session.
_provider: TracerProvider | None = None
_exporter: InMemorySpanExporter | None = None


def _get_or_create_provider() -> tuple[TracerProvider, InMemorySpanExporter]:
    """Get or create the global test provider (lazy, process-wide singleton)."""
    global _provider, _exporter

    if _provider is None:
        _provider = TracerProvider()
        _exporter = InMemorySpanExporter()
        # SimpleSpanProcessor exports synchronously, so finished spans are
        # visible to assertions immediately.
        _provider.add_span_processor(SimpleSpanProcessor(_exporter))
        trace.set_tracer_provider(_provider)

    return _provider, _exporter


@pytest.fixture(autouse=True)
def reset_tracing():
    """Reset tracing state before each test."""
    _, exporter = _get_or_create_provider()
    exporter.clear()
    yield
    exporter.clear()


@pytest.fixture
def tracer_provider():
    """Get the test TracerProvider."""
    provider, _ = _get_or_create_provider()
    return provider


@pytest.fixture
def memory_exporter():
    """Get the in-memory span exporter for testing."""
    _, exporter = _get_or_create_provider()
    return exporter


@pytest.fixture
def tracer(tracer_provider):
    """Get a tracer instance."""
    return trace.get_tracer("test-tracer")


# =========================================================================
# [patch boundary] new files: tests/integration/__init__.py and
# tests/unit/__init__.py — each contains only the two SPDX header lines:
#   # SPDX-FileCopyrightText: 2026 The Botanu Authors
#   # SPDX-License-Identifier: Apache-2.0
# =========================================================================

# =========================================================================
# [patch boundary] diff --git a/tests/unit/test_config.py
# new file: tests/unit/test_config.py
# =========================================================================

# SPDX-FileCopyrightText: 2026 The Botanu Authors
# SPDX-License-Identifier: Apache-2.0

"""Tests for BotanuConfig."""

from __future__ import annotations

import os
from unittest import mock

import pytest

from botanu.sdk.config import BotanuConfig, _interpolate_env_vars


class TestInterpolateEnvVars:
    """Tests for environment variable interpolation."""

    def test_interpolates_env_vars(self):
        with mock.patch.dict(os.environ, {"MY_VAR": "my_value"}):
            result = _interpolate_env_vars("endpoint: ${MY_VAR}")
            assert result == "endpoint: my_value"

    def test_preserves_unset_vars(self):
        # Unset vars are left verbatim, not replaced with empty strings.
        result = _interpolate_env_vars("endpoint: ${UNSET_VAR}")
        assert result == "endpoint: ${UNSET_VAR}"

    def test_no_interpolation_needed(self):
        result = _interpolate_env_vars("endpoint: http://localhost")
        assert result == "endpoint: http://localhost"

    def test_default_value_when_unset(self):
        # Bash-style ${VAR:-default} fallback syntax.
        result = _interpolate_env_vars("endpoint: ${UNSET_VAR:-default_value}")
        assert result == "endpoint: default_value"

    def test_default_value_ignored_when_set(self):
        with mock.patch.dict(os.environ, {"MY_VAR": "actual_value"}):
            result = _interpolate_env_vars("endpoint: ${MY_VAR:-default_value}")
            assert result == "endpoint: actual_value"


class TestBotanuConfigDefaults:
    """Tests for BotanuConfig defaults."""

    def test_default_values(self):
        with mock.patch.dict(os.environ, {}, clear=True):
            # Clear relevant env vars
            for key in ["OTEL_SERVICE_NAME", "BOTANU_ENVIRONMENT", "OTEL_EXPORTER_OTLP_ENDPOINT"]:
                os.environ.pop(key, None)

            config = BotanuConfig()

            assert config.service_name == "unknown_service"
            assert config.deployment_environment == "production"
            assert config.propagation_mode == "lean"
            assert config.auto_detect_resources is True

    def test_env_var_service_name(self):
        with mock.patch.dict(os.environ, {"OTEL_SERVICE_NAME": "my-service"}):
            config = BotanuConfig()
            assert config.service_name == "my-service"

    def test_env_var_environment(self):
        with mock.patch.dict(os.environ, {"BOTANU_ENVIRONMENT": "staging"}):
            config = BotanuConfig()
            assert config.deployment_environment == "staging"

    def test_env_var_otlp_endpoint_base(self):
        """OTEL_EXPORTER_OTLP_ENDPOINT gets /v1/traces appended."""
        with mock.patch.dict(os.environ, {"OTEL_EXPORTER_OTLP_ENDPOINT": "http://collector:4318"}):
            config = BotanuConfig()
            # Base endpoint gets /v1/traces appended
            assert config.otlp_endpoint == "http://collector:4318/v1/traces"

    def test_env_var_otlp_traces_endpoint_direct(self):
        """OTEL_EXPORTER_OTLP_TRACES_ENDPOINT is used directly without appending."""
        with mock.patch.dict(os.environ, {"OTEL_EXPORTER_OTLP_TRACES_ENDPOINT": "http://collector:4318/v1/traces"}):
            config = BotanuConfig()
            # Direct traces endpoint is used as-is
            assert config.otlp_endpoint == "http://collector:4318/v1/traces"

    def test_explicit_values_override_env(self):
        with mock.patch.dict(os.environ, {"OTEL_SERVICE_NAME": "env-service"}):
            config = BotanuConfig(service_name="explicit-service")
            assert config.service_name == "explicit-service"

    def test_env_var_propagation_mode(self):
        with mock.patch.dict(os.environ, {"BOTANU_PROPAGATION_MODE": "full"}):
            config = BotanuConfig()
            assert config.propagation_mode == "full"


class TestBotanuConfigFromYaml:
    """Tests for loading config from YAML."""

    def test_from_yaml_basic(self, tmp_path):
        yaml_content = """
service:
  name: yaml-service
  environment: production
"""
        yaml_file = tmp_path / "config.yaml"
        yaml_file.write_text(yaml_content)

        config = BotanuConfig.from_yaml(str(yaml_file))
        assert config.service_name == "yaml-service"
        assert config.deployment_environment == "production"

    def test_from_yaml_with_otlp(self, tmp_path):
        yaml_content = """
service:
  name: test-service
otlp:
  endpoint: http://localhost:4318
  headers:
    Authorization: Bearer token123
"""
        yaml_file = tmp_path / "config.yaml"
        yaml_file.write_text(yaml_content)

        config = BotanuConfig.from_yaml(str(yaml_file))
        assert config.otlp_endpoint == "http://localhost:4318"
        assert config.otlp_headers == {"Authorization": "Bearer token123"}

    def test_from_yaml_file_not_found(self):
        with pytest.raises(FileNotFoundError):
            BotanuConfig.from_yaml("/nonexistent/path/config.yaml")

    def test_from_yaml_empty_file(self, tmp_path):
        yaml_file = tmp_path / "empty.yaml"
        yaml_file.write_text("")

        config = BotanuConfig.from_yaml(str(yaml_file))
        # Should use defaults
        assert config.service_name is not None

    def test_from_yaml_env_interpolation(self, tmp_path):
        yaml_content = """
service:
  name: ${TEST_SERVICE_NAME}
"""
        yaml_file = tmp_path / "config.yaml"
        yaml_file.write_text(yaml_content)

        with mock.patch.dict(os.environ, {"TEST_SERVICE_NAME": "interpolated-service"}):
            config = BotanuConfig.from_yaml(str(yaml_file))
            assert config.service_name == "interpolated-service"


class TestBotanuConfigFromFileOrEnv:
    """Tests for from_file_or_env method."""

    def test_uses_env_when_no_file(self):
        with mock.patch.dict(
            os.environ,
            {"OTEL_SERVICE_NAME": "env-only-service"},
            clear=False,
        ):
            # Ensure no config files exist in current directory
            config = BotanuConfig.from_file_or_env()
            # Should use env vars
            assert config.service_name == "env-only-service"

    def test_uses_specified_path(self, tmp_path):
        yaml_content = """
service:
  name: file-service
"""
        yaml_file = tmp_path / "config.yaml"
        yaml_file.write_text(yaml_content)

        config = BotanuConfig.from_file_or_env(path=str(yaml_file))
        assert config.service_name == "file-service"


class TestBotanuConfigToDict:
    """Tests for config serialization."""

    def test_to_dict(self):
        config = BotanuConfig(
            service_name="test-service",
            deployment_environment="staging",
            otlp_endpoint="http://localhost:4318",
        )
        d = config.to_dict()

        assert d["service"]["name"] == "test-service"
        assert d["service"]["environment"] == "staging"
        assert d["otlp"]["endpoint"] == "http://localhost:4318"


class TestBotanuConfigAutoInstrument:
    """Tests for auto-instrumentation configuration."""

    def test_default_packages(self):
        config = BotanuConfig()
        packages = config.auto_instrument_packages

        assert "requests" in packages
        assert "httpx" in packages
        assert "fastapi" in packages
        assert "openai_v2" in packages
        assert "anthropic" in packages


# =========================================================================
# [patch boundary] diff --git a/tests/unit/test_context.py
# new file: tests/unit/test_context.py
# =========================================================================

# SPDX-FileCopyrightText: 2026 The Botanu Authors
# SPDX-License-Identifier: Apache-2.0

"""Tests for context and baggage helpers."""

from __future__ import annotations

from opentelemetry import trace

from botanu.sdk.context import (
    get_baggage,
    get_current_span,
    get_run_id,
    get_use_case,
    set_baggage,
)


class TestBaggageHelpers:
    """Tests for baggage helper functions."""

    def test_set_and_get_baggage(self):
        token = set_baggage("test.key", "test-value")
        assert token is not None

        value = get_baggage("test.key")
        assert value == "test-value"

    def test_get_baggage_missing_key(self):
        value = get_baggage("nonexistent.key")
        assert value is None

    def test_get_run_id(self):
        set_baggage("botanu.run_id", "run-12345")
        assert get_run_id() == "run-12345"

    def test_get_run_id_not_set(self):
        # In a fresh context, run_id might not be set
        # This tests the function doesn't crash
        result = get_run_id()
        # Result could be None or a previously set value
        assert result is None or isinstance(result, str)

    def test_get_use_case(self):
        set_baggage("botanu.use_case", "Customer Support")
        assert get_use_case() == "Customer Support"


class TestSpanHelpers:
    """Tests for span helper functions."""

    def test_get_current_span_with_active_span(self, memory_exporter):
        tracer = trace.get_tracer("test")
        with tracer.start_as_current_span("test-span") as expected_span:
            current = get_current_span()
            assert current == expected_span

    def test_get_current_span_no_active_span(self):
        # When no span is active, should return a non-recording span
        span = get_current_span()
        assert span is not None
        # Non-recording spans have is_recording() == False
        assert not span.is_recording()


# =========================================================================
# [patch boundary] diff --git a/tests/unit/test_data_tracking.py
# new file: tests/unit/test_data_tracking.py
# =========================================================================

# SPDX-FileCopyrightText: 2026 The Botanu Authors
# SPDX-License-Identifier: Apache-2.0

"""Tests for data tracking (DB, storage, messaging)."""

from __future__ import annotations

import pytest

from botanu.tracking.data import (
    DBOperation,
    MessagingOperation,
    StorageOperation,
    track_db_operation,
    track_messaging_operation,
    track_storage_operation,
)


class TestTrackDBOperation:
    """Tests for track_db_operation context manager."""

    def test_creates_span_with_operation(self, memory_exporter):
        with track_db_operation(
            system="postgresql",
            operation=DBOperation.SELECT,
            database="mydb",
        ) as tracker:
            tracker.set_result(rows_returned=10)

        spans = memory_exporter.get_finished_spans()
        assert len(spans) == 1
        assert "db" in spans[0].name.lower() or "select" in spans[0].name.lower()

    def test_records_db_attributes(self, memory_exporter):
        with track_db_operation(
            system="postgresql",
            operation=DBOperation.INSERT,
            database="users_db",
        ) as tracker:
            tracker.set_result(rows_affected=1)

        spans = memory_exporter.get_finished_spans()
        attrs = dict(spans[0].attributes)
        assert attrs.get("db.system") == "postgresql"
        assert attrs.get("db.name") == "users_db"

    def test_records_error_on_exception(self, memory_exporter):
        with pytest.raises(ValueError):
            with track_db_operation(
                system="mysql",
                operation=DBOperation.SELECT,
            ):
                raise ValueError("Connection failed")

        spans = memory_exporter.get_finished_spans()
        attrs = dict(spans[0].attributes)
        assert attrs.get("botanu.data.error") == "ValueError"

    def test_set_table(self, memory_exporter):
        with track_db_operation(
            system="postgresql",
            operation=DBOperation.SELECT,
        ) as tracker:
            tracker.set_table("users", schema="public")

        spans = memory_exporter.get_finished_spans()
        attrs = dict(spans[0].attributes)
        assert attrs.get("db.collection.name") == "users"
        assert attrs.get("db.schema") == "public"

    def test_set_query_id(self, memory_exporter):
        # Warehouse-style attributes (query id, bytes scanned) for cost joins.
        with track_db_operation(
            system="snowflake",
            operation=DBOperation.SELECT,
        ) as tracker:
            tracker.set_query_id("01abc123-def4-5678")
            tracker.set_bytes_scanned(1024000)

        spans = memory_exporter.get_finished_spans()
        attrs = dict(spans[0].attributes)
        assert attrs.get("botanu.warehouse.query_id") == "01abc123-def4-5678"
        assert attrs.get("botanu.warehouse.bytes_scanned") == 1024000


class TestTrackStorageOperation:
    """Tests for track_storage_operation context manager."""

    def test_creates_span_for_read(self, memory_exporter):
        with track_storage_operation(
            system="s3",
            operation=StorageOperation.GET,
        ) as tracker:
            tracker.set_result(bytes_read=1024)

        spans = memory_exporter.get_finished_spans()
        assert len(spans) == 1

    def test_records_storage_attributes(self, memory_exporter):
        with track_storage_operation(
            system="gcs",
            operation=StorageOperation.PUT,
        ) as tracker:
            tracker.set_bucket("data-bucket")
            tracker.set_result(bytes_written=2048)

        spans = memory_exporter.get_finished_spans()
        attrs = dict(spans[0].attributes)
        assert attrs.get("botanu.storage.system") == "gcs"
        assert attrs.get("botanu.storage.bucket") == "data-bucket"

    def test_records_error(self, memory_exporter):
        with pytest.raises(IOError):
            with track_storage_operation(
                system="s3",
                operation=StorageOperation.GET,
            ):
                raise OSError("Access denied")

        spans = memory_exporter.get_finished_spans()
        attrs = dict(spans[0].attributes)
        assert attrs.get("botanu.storage.error") == "OSError"  # IOError is alias for OSError

    def test_objects_count(self, memory_exporter):
        with track_storage_operation(
            system="s3",
            operation=StorageOperation.LIST,
        ) as tracker:
            tracker.set_result(objects_count=50)

        spans = memory_exporter.get_finished_spans()
        attrs = dict(spans[0].attributes)
        assert attrs.get("botanu.data.objects_count") == 50


class TestTrackMessagingOperation:
    """Tests for track_messaging_operation context manager."""

    def test_creates_span_for_publish(self, memory_exporter):
        with track_messaging_operation(
            system="kafka",
            operation=MessagingOperation.PUBLISH,
            destination="orders-topic",
        ) as tracker:
            tracker.set_result(message_count=1)

        spans = memory_exporter.get_finished_spans()
        assert len(spans) == 1

    def test_records_messaging_attributes(self, memory_exporter):
        with track_messaging_operation(
            system="sqs",
            operation=MessagingOperation.RECEIVE,
            destination="my-queue",
        ) as tracker:
            tracker.set_result(message_count=5)

        spans = memory_exporter.get_finished_spans()
        attrs = dict(spans[0].attributes)
        assert attrs.get("messaging.system") == "sqs"
        assert attrs.get("messaging.destination.name") == "my-queue"

    def test_records_error(self, memory_exporter):
        with pytest.raises(TimeoutError):
            with track_messaging_operation(
                system="rabbitmq",
                operation=MessagingOperation.PUBLISH,
                destination="events",
            ):
                raise TimeoutError("Queue full")

        spans = memory_exporter.get_finished_spans()
        attrs = dict(spans[0].attributes)
        assert attrs.get("botanu.messaging.error") == "TimeoutError"

    def test_consume_operation(self, memory_exporter):
        with track_messaging_operation(
            system="kafka",
            operation=MessagingOperation.CONSUME,
            destination="events-topic",
        ) as tracker:
            tracker.set_result(message_count=10, bytes_transferred=4096)

        spans = memory_exporter.get_finished_spans()
        attrs = dict(spans[0].attributes)
        assert attrs.get("messaging.operation") == "consume"
        assert attrs.get("botanu.messaging.message_count") == 10
        assert attrs.get("botanu.messaging.bytes_transferred") == 4096


class TestOperationEnums:
    """Tests for operation type enums."""

    def test_db_operations(self):
        assert DBOperation.SELECT == "SELECT"
        assert DBOperation.INSERT == "INSERT"
        assert DBOperation.UPDATE == "UPDATE"
        assert DBOperation.DELETE == "DELETE"

    def test_storage_operations(self):
        assert StorageOperation.GET == "GET"
        assert StorageOperation.PUT == "PUT"
        assert StorageOperation.DELETE == "DELETE"
        assert StorageOperation.LIST == "LIST"

    # NOTE(review): test_messaging_operations is truncated mid-assert at the
    # end of this patch chunk; its body is not reproduced here — restore it
    # from the full patch.
MessagingOperation.PUBLISH == "publish" + assert MessagingOperation.RECEIVE == "receive" + assert MessagingOperation.CONSUME == "consume" diff --git a/tests/unit/test_decorators.py b/tests/unit/test_decorators.py new file mode 100644 index 0000000..e7b7dc6 --- /dev/null +++ b/tests/unit/test_decorators.py @@ -0,0 +1,124 @@ +# SPDX-FileCopyrightText: 2026 The Botanu Authors +# SPDX-License-Identifier: Apache-2.0 + +"""Tests for SDK decorators.""" + +from __future__ import annotations + +import pytest + +from botanu.sdk.decorators import botanu_use_case + + +class TestBotanuUseCaseDecorator: + """Tests for @botanu_use_case decorator.""" + + def test_sync_function_creates_span(self, memory_exporter): + @botanu_use_case("Test Use Case") + def my_function(): + return "result" + + result = my_function() + + assert result == "result" + spans = memory_exporter.get_finished_spans() + assert len(spans) == 1 + assert spans[0].name == "botanu.run/Test Use Case" + + def test_span_has_run_attributes(self, memory_exporter): + @botanu_use_case("Customer Support", workflow="handle_ticket") + def my_function(): + return "done" + + my_function() + + spans = memory_exporter.get_finished_spans() + attrs = dict(spans[0].attributes) + + assert "botanu.run_id" in attrs + assert attrs["botanu.use_case"] == "Customer Support" + assert attrs["botanu.workflow"] == "handle_ticket" + + def test_emits_started_event(self, memory_exporter): + @botanu_use_case("Test") + def my_function(): + pass + + my_function() + + spans = memory_exporter.get_finished_spans() + events = spans[0].events + + started_events = [e for e in events if e.name == "botanu.run.started"] + assert len(started_events) == 1 + + def test_emits_completed_event(self, memory_exporter): + @botanu_use_case("Test") + def my_function(): + return "done" + + my_function() + + spans = memory_exporter.get_finished_spans() + events = spans[0].events + + completed_events = [e for e in events if e.name == "botanu.run.completed"] + assert 
len(completed_events) == 1 + assert completed_events[0].attributes["status"] == "success" + + def test_records_exception_on_failure(self, memory_exporter): + @botanu_use_case("Test") + def failing_function(): + raise ValueError("test error") + + with pytest.raises(ValueError): + failing_function() + + spans = memory_exporter.get_finished_spans() + assert len(spans) == 1 + + events = spans[0].events + completed_events = [e for e in events if e.name == "botanu.run.completed"] + assert len(completed_events) == 1 + assert completed_events[0].attributes["status"] == "failure" + assert completed_events[0].attributes["error_class"] == "ValueError" + + @pytest.mark.asyncio + async def test_async_function_creates_span(self, memory_exporter): + @botanu_use_case("Async Test") + async def async_function(): + return "async result" + + result = await async_function() + + assert result == "async result" + spans = memory_exporter.get_finished_spans() + assert len(spans) == 1 + assert spans[0].name == "botanu.run/Async Test" + + @pytest.mark.asyncio + async def test_async_exception_handling(self, memory_exporter): + @botanu_use_case("Async Test") + async def failing_async(): + raise RuntimeError("async error") + + with pytest.raises(RuntimeError): + await failing_async() + + spans = memory_exporter.get_finished_spans() + events = spans[0].events + completed_events = [e for e in events if e.name == "botanu.run.completed"] + assert completed_events[0].attributes["status"] == "failure" + + def test_workflow_version_computed(self, memory_exporter): + @botanu_use_case("Test") + def versioned_function(): + return "versioned" + + versioned_function() + + spans = memory_exporter.get_finished_spans() + attrs = dict(spans[0].attributes) + + assert "botanu.workflow.version" in attrs + assert attrs["botanu.workflow.version"].startswith("v:") diff --git a/tests/unit/test_enricher.py b/tests/unit/test_enricher.py new file mode 100644 index 0000000..a08cfbb --- /dev/null +++ 
b/tests/unit/test_enricher.py @@ -0,0 +1,160 @@ +# SPDX-FileCopyrightText: 2026 The Botanu Authors +# SPDX-License-Identifier: Apache-2.0 + +"""Tests for RunContextEnricher processor.""" + +from __future__ import annotations + +from unittest import mock + +from opentelemetry import baggage, context, trace +from opentelemetry.sdk.trace import ReadableSpan + +from botanu.processors.enricher import RunContextEnricher + + +class TestRunContextEnricher: + """Tests for RunContextEnricher processor.""" + + def test_init_lean_mode_default(self): + """Default should be lean mode.""" + enricher = RunContextEnricher() + assert enricher._lean_mode is True + assert enricher._baggage_keys == RunContextEnricher.BAGGAGE_KEYS_LEAN + + def test_init_lean_mode_false(self): + """Can enable full mode.""" + enricher = RunContextEnricher(lean_mode=False) + assert enricher._lean_mode is False + assert enricher._baggage_keys == RunContextEnricher.BAGGAGE_KEYS_FULL + + def test_on_start_reads_baggage(self, memory_exporter): + """on_start should read baggage and set span attributes.""" + enricher = RunContextEnricher(lean_mode=True) + + # Set up baggage context - start from a clean context + ctx = context.Context() + ctx = baggage.set_baggage("botanu.run_id", "test-run-123", context=ctx) + ctx = baggage.set_baggage("botanu.use_case", "Test Case", context=ctx) + + # Create a span with the baggage context + tracer = trace.get_tracer("test") + token = context.attach(ctx) + try: + with tracer.start_as_current_span("test-span") as span: + # Manually call on_start to simulate processor behavior + enricher.on_start(span, ctx) + finally: + context.detach(token) + + spans = memory_exporter.get_finished_spans() + assert len(spans) == 1 + attrs = dict(spans[0].attributes) + assert attrs.get("botanu.run_id") == "test-run-123" + assert attrs.get("botanu.use_case") == "Test Case" + + def test_on_start_full_mode(self, memory_exporter): + """Full mode should read all baggage keys.""" + enricher = 
RunContextEnricher(lean_mode=False) + + # Set up baggage context with all keys - start from a clean context + ctx = context.Context() + ctx = baggage.set_baggage("botanu.run_id", "run-456", context=ctx) + ctx = baggage.set_baggage("botanu.use_case", "Full Test", context=ctx) + ctx = baggage.set_baggage("botanu.workflow", "my_workflow", context=ctx) + ctx = baggage.set_baggage("botanu.environment", "staging", context=ctx) + ctx = baggage.set_baggage("botanu.tenant_id", "tenant-789", context=ctx) + + tracer = trace.get_tracer("test") + token = context.attach(ctx) + try: + with tracer.start_as_current_span("test-span") as span: + enricher.on_start(span, ctx) + finally: + context.detach(token) + + spans = memory_exporter.get_finished_spans() + attrs = dict(spans[0].attributes) + assert attrs.get("botanu.run_id") == "run-456" + assert attrs.get("botanu.use_case") == "Full Test" + assert attrs.get("botanu.workflow") == "my_workflow" + assert attrs.get("botanu.environment") == "staging" + assert attrs.get("botanu.tenant_id") == "tenant-789" + + def test_on_start_missing_baggage(self, memory_exporter): + """Should handle missing baggage gracefully.""" + enricher = RunContextEnricher() + + # Create a clean context with no baggage + clean_ctx = context.Context() + + tracer = trace.get_tracer("test") + token = context.attach(clean_ctx) + try: + with tracer.start_as_current_span("test-span") as span: + # Pass the clean context with no baggage + enricher.on_start(span, clean_ctx) + finally: + context.detach(token) + + spans = memory_exporter.get_finished_spans() + attrs = dict(spans[0].attributes) + # No botanu attributes should be set + assert "botanu.run_id" not in attrs + + def test_on_start_does_not_override_existing(self, memory_exporter): + """Should not override existing span attributes.""" + enricher = RunContextEnricher() + + # Set up baggage context + ctx = context.Context() + ctx = baggage.set_baggage("botanu.run_id", "baggage-id", context=ctx) + ctx = 
baggage.set_baggage("botanu.use_case", "Baggage Case", context=ctx) + + tracer = trace.get_tracer("test") + token = context.attach(ctx) + try: + with tracer.start_as_current_span("test-span") as span: + # Set attribute before enricher runs + span.set_attribute("botanu.run_id", "existing-id") + # Now run enricher - should not override + enricher.on_start(span, ctx) + finally: + context.detach(token) + + spans = memory_exporter.get_finished_spans() + attrs = dict(spans[0].attributes) + # Should keep existing value + assert attrs.get("botanu.run_id") == "existing-id" + # But should set use_case since it wasn't set before + assert attrs.get("botanu.use_case") == "Baggage Case" + + def test_on_end_noop(self): + """on_end should be a no-op.""" + enricher = RunContextEnricher() + mock_span = mock.MagicMock(spec=ReadableSpan) + # Should not raise + enricher.on_end(mock_span) + + def test_shutdown_noop(self): + """shutdown should be a no-op.""" + enricher = RunContextEnricher() + # Should not raise + enricher.shutdown() + + def test_force_flush_returns_true(self): + """force_flush should return True.""" + enricher = RunContextEnricher() + assert enricher.force_flush() is True + assert enricher.force_flush(timeout_millis=1000) is True + + def test_baggage_keys_constants(self): + """Verify baggage key constants.""" + assert "botanu.run_id" in RunContextEnricher.BAGGAGE_KEYS_LEAN + assert "botanu.use_case" in RunContextEnricher.BAGGAGE_KEYS_LEAN + assert len(RunContextEnricher.BAGGAGE_KEYS_LEAN) == 2 + + assert "botanu.run_id" in RunContextEnricher.BAGGAGE_KEYS_FULL + assert "botanu.workflow" in RunContextEnricher.BAGGAGE_KEYS_FULL + assert "botanu.environment" in RunContextEnricher.BAGGAGE_KEYS_FULL + assert len(RunContextEnricher.BAGGAGE_KEYS_FULL) == 6 diff --git a/tests/unit/test_ledger.py b/tests/unit/test_ledger.py new file mode 100644 index 0000000..c4ea3e3 --- /dev/null +++ b/tests/unit/test_ledger.py @@ -0,0 +1,277 @@ +# SPDX-FileCopyrightText: 2026 The Botanu Authors 
+# SPDX-License-Identifier: Apache-2.0 + +"""Tests for Attempt Ledger.""" + +from __future__ import annotations + +import os +from unittest import mock + +from opentelemetry import trace + +from botanu.tracking.ledger import ( + AttemptLedger, + AttemptStatus, + LedgerEventType, + get_ledger, + record_attempt_ended, + record_attempt_started, + record_llm_attempted, + record_tool_attempted, + set_ledger, +) + + +class TestLedgerEventType: + """Tests for LedgerEventType enum.""" + + def test_event_types_are_strings(self): + assert LedgerEventType.ATTEMPT_STARTED == "attempt.started" + assert LedgerEventType.ATTEMPT_ENDED == "attempt.ended" + assert LedgerEventType.LLM_ATTEMPTED == "llm.attempted" + assert LedgerEventType.TOOL_ATTEMPTED == "tool.attempted" + assert LedgerEventType.CANCEL_REQUESTED == "cancellation.requested" + assert LedgerEventType.CANCEL_ACKNOWLEDGED == "cancellation.acknowledged" + assert LedgerEventType.ZOMBIE_DETECTED == "zombie.detected" + assert LedgerEventType.REDELIVERY_DETECTED == "redelivery.detected" + + +class TestAttemptStatus: + """Tests for AttemptStatus enum.""" + + def test_status_values(self): + assert AttemptStatus.SUCCESS == "success" + assert AttemptStatus.ERROR == "error" + assert AttemptStatus.TIMEOUT == "timeout" + assert AttemptStatus.CANCELLED == "cancelled" + assert AttemptStatus.RATE_LIMITED == "rate_limited" + + +class TestAttemptLedger: + """Tests for AttemptLedger class.""" + + def test_default_service_name(self): + """Should use environment variable for default service name.""" + with mock.patch.dict(os.environ, {"OTEL_SERVICE_NAME": "test-service"}): + ledger = AttemptLedger.__new__(AttemptLedger) + ledger.service_name = os.getenv("OTEL_SERVICE_NAME", "unknown") + ledger._initialized = False + assert ledger.service_name == "test-service" + + def test_get_trace_context_no_span(self): + """Should return empty dict when no active span.""" + ledger = AttemptLedger.__new__(AttemptLedger) + ledger._initialized = False + 
ledger._logger = None + + # No span context - should return empty + ctx = ledger._get_trace_context() + assert ctx == {} or "trace_id" in ctx # May have context from other tests + + def test_get_trace_context_with_span(self, memory_exporter): + """Should return trace context when span is active.""" + ledger = AttemptLedger.__new__(AttemptLedger) + ledger._initialized = False + ledger._logger = None + + tracer = trace.get_tracer("test") + with tracer.start_as_current_span("test-span") as span: + span_ctx = span.get_span_context() + ctx = ledger._get_trace_context() + + assert "trace_id" in ctx + assert "span_id" in ctx + assert ctx["trace_id"] == format(span_ctx.trace_id, "032x") + assert ctx["span_id"] == format(span_ctx.span_id, "016x") + + def test_emit_when_not_initialized(self): + """Should not raise when emitting without initialization.""" + ledger = AttemptLedger.__new__(AttemptLedger) + ledger._initialized = False + ledger._logger = None + + # Should not raise + ledger._emit(LedgerEventType.ATTEMPT_STARTED, None, {"test": "value"}) + + def test_attempt_started_not_initialized(self): + """Should not raise when calling methods without initialization.""" + ledger = AttemptLedger.__new__(AttemptLedger) + ledger._initialized = False + ledger._logger = None + ledger.service_name = "test" + + # Should not raise + ledger.attempt_started( + run_id="run-123", + use_case="Test Case", + attempt=1, + ) + + def test_attempt_ended_not_initialized(self): + """Should not raise when calling methods without initialization.""" + ledger = AttemptLedger.__new__(AttemptLedger) + ledger._initialized = False + ledger._logger = None + ledger.service_name = "test" + + # Should not raise + ledger.attempt_ended( + run_id="run-123", + status="success", + duration_ms=1000.0, + ) + + def test_llm_attempted_not_initialized(self): + """Should not raise when calling methods without initialization.""" + ledger = AttemptLedger.__new__(AttemptLedger) + ledger._initialized = False + 
ledger._logger = None + ledger.service_name = "test" + + # Should not raise + ledger.llm_attempted( + run_id="run-123", + provider="openai", + model="gpt-4", + input_tokens=100, + output_tokens=50, + ) + + def test_tool_attempted_not_initialized(self): + """Should not raise when calling methods without initialization.""" + ledger = AttemptLedger.__new__(AttemptLedger) + ledger._initialized = False + ledger._logger = None + ledger.service_name = "test" + + # Should not raise + ledger.tool_attempted( + run_id="run-123", + tool_name="search", + ) + + def test_cancel_requested_not_initialized(self): + """Should not raise when calling methods without initialization.""" + ledger = AttemptLedger.__new__(AttemptLedger) + ledger._initialized = False + ledger._logger = None + ledger.service_name = "test" + + # Should not raise + ledger.cancel_requested(run_id="run-123", reason="user") + + def test_cancel_acknowledged_not_initialized(self): + """Should not raise when calling methods without initialization.""" + ledger = AttemptLedger.__new__(AttemptLedger) + ledger._initialized = False + ledger._logger = None + ledger.service_name = "test" + + # Should not raise + ledger.cancel_acknowledged(run_id="run-123", acknowledged_by="handler") + + def test_zombie_detected_not_initialized(self): + """Should not raise when calling methods without initialization.""" + ledger = AttemptLedger.__new__(AttemptLedger) + ledger._initialized = False + ledger._logger = None + ledger.service_name = "test" + + # Should not raise + ledger.zombie_detected( + run_id="run-123", + deadline_ts=1000.0, + actual_end_ts=2000.0, + zombie_duration_ms=1000.0, + component="handler", + ) + + def test_redelivery_detected_not_initialized(self): + """Should not raise when calling methods without initialization.""" + ledger = AttemptLedger.__new__(AttemptLedger) + ledger._initialized = False + ledger._logger = None + ledger.service_name = "test" + + # Should not raise + ledger.redelivery_detected( + 
run_id="run-123", + queue_name="my-queue", + delivery_count=3, + ) + + def test_flush_when_not_initialized(self): + """Should return True when flushing without initialization.""" + ledger = AttemptLedger.__new__(AttemptLedger) + ledger._initialized = False + + result = ledger.flush() + assert result is True + + def test_shutdown_when_not_initialized(self): + """Should not raise when shutting down without initialization.""" + ledger = AttemptLedger.__new__(AttemptLedger) + ledger._initialized = False + + # Should not raise + ledger.shutdown() + + +class TestGlobalLedger: + """Tests for global ledger functions.""" + + def test_get_ledger_creates_instance(self): + """get_ledger should create a ledger if none exists.""" + # Reset global + import botanu.tracking.ledger as ledger_module + + ledger_module._global_ledger = None + + ledger = get_ledger() + assert isinstance(ledger, AttemptLedger) + + def test_set_ledger(self): + """set_ledger should update the global instance.""" + custom_ledger = AttemptLedger.__new__(AttemptLedger) + custom_ledger._initialized = False + custom_ledger.service_name = "custom-service" + + set_ledger(custom_ledger) + assert get_ledger() is custom_ledger + + def test_record_attempt_started(self): + """record_attempt_started should call the global ledger.""" + mock_ledger = mock.MagicMock(spec=AttemptLedger) + set_ledger(mock_ledger) + + record_attempt_started(run_id="run-123", use_case="Test") + + mock_ledger.attempt_started.assert_called_once_with(run_id="run-123", use_case="Test") + + def test_record_attempt_ended(self): + """record_attempt_ended should call the global ledger.""" + mock_ledger = mock.MagicMock(spec=AttemptLedger) + set_ledger(mock_ledger) + + record_attempt_ended(run_id="run-123", status="success") + + mock_ledger.attempt_ended.assert_called_once_with(run_id="run-123", status="success") + + def test_record_llm_attempted(self): + """record_llm_attempted should call the global ledger.""" + mock_ledger = 
mock.MagicMock(spec=AttemptLedger) + set_ledger(mock_ledger) + + record_llm_attempted(run_id="run-123", provider="openai", model="gpt-4") + + mock_ledger.llm_attempted.assert_called_once_with(run_id="run-123", provider="openai", model="gpt-4") + + def test_record_tool_attempted(self): + """record_tool_attempted should call the global ledger.""" + mock_ledger = mock.MagicMock(spec=AttemptLedger) + set_ledger(mock_ledger) + + record_tool_attempted(run_id="run-123", tool_name="search") + + mock_ledger.tool_attempted.assert_called_once_with(run_id="run-123", tool_name="search") diff --git a/tests/unit/test_llm_tracking.py b/tests/unit/test_llm_tracking.py new file mode 100644 index 0000000..c9b7b58 --- /dev/null +++ b/tests/unit/test_llm_tracking.py @@ -0,0 +1,307 @@ +# SPDX-FileCopyrightText: 2026 The Botanu Authors +# SPDX-License-Identifier: Apache-2.0 + +"""Tests for LLM tracking.""" + +from __future__ import annotations + +import pytest + +from botanu.tracking.llm import ( + GenAIAttributes, + ModelOperation, + track_llm_call, +) + + +class TestTrackLLMCall: + """Tests for track_llm_call context manager.""" + + def test_creates_span_with_model_name(self, memory_exporter): + with track_llm_call(model="gpt-4", provider="openai") as tracker: + tracker.set_tokens(input_tokens=100, output_tokens=50) + + spans = memory_exporter.get_finished_spans() + assert len(spans) == 1 + # Span name format: "{operation} {model}" + assert spans[0].name == "chat gpt-4" + + def test_records_token_usage(self, memory_exporter): + with track_llm_call(model="claude-3-opus", provider="anthropic") as tracker: + tracker.set_tokens(input_tokens=500, output_tokens=200) + + spans = memory_exporter.get_finished_spans() + attrs = dict(spans[0].attributes) + + assert attrs[GenAIAttributes.USAGE_INPUT_TOKENS] == 500 + assert attrs[GenAIAttributes.USAGE_OUTPUT_TOKENS] == 200 + + def test_records_error_on_exception(self, memory_exporter): + with pytest.raises(ValueError): + with 
track_llm_call(model="gpt-4", provider="openai") as _tracker: + raise ValueError("API error") + + spans = memory_exporter.get_finished_spans() + attrs = dict(spans[0].attributes) + assert attrs.get(GenAIAttributes.ERROR_TYPE) == "ValueError" + + def test_operation_type_attribute(self, memory_exporter): + with track_llm_call( + model="gpt-4", + provider="openai", + operation=ModelOperation.EMBEDDINGS, + ): + pass + + spans = memory_exporter.get_finished_spans() + attrs = dict(spans[0].attributes) + assert attrs[GenAIAttributes.OPERATION_NAME] == "embeddings" + + def test_request_params(self, memory_exporter): + with track_llm_call( + model="gpt-4", + provider="openai", + ) as tracker: + tracker.set_request_params(temperature=0.7, max_tokens=1000) + + spans = memory_exporter.get_finished_spans() + attrs = dict(spans[0].attributes) + assert attrs[GenAIAttributes.REQUEST_TEMPERATURE] == 0.7 + assert attrs[GenAIAttributes.REQUEST_MAX_TOKENS] == 1000 + + +class TestLLMTracker: + """Tests for LLMTracker helper methods.""" + + def test_set_request_id(self, memory_exporter): + with track_llm_call(model="gpt-4", provider="openai") as tracker: + tracker.set_request_id(provider_request_id="resp_123") + + spans = memory_exporter.get_finished_spans() + attrs = dict(spans[0].attributes) + assert attrs[GenAIAttributes.RESPONSE_ID] == "resp_123" + + def test_set_finish_reason(self, memory_exporter): + with track_llm_call(model="gpt-4", provider="openai") as tracker: + tracker.set_finish_reason("stop") + + spans = memory_exporter.get_finished_spans() + attrs = dict(spans[0].attributes) + # OTel converts lists to tuples for span attributes + assert attrs[GenAIAttributes.RESPONSE_FINISH_REASONS] == ("stop",) + + +class TestProviderNormalization: + """Tests for provider name normalization.""" + + def test_openai_normalized(self, memory_exporter): + with track_llm_call(model="gpt-4", provider="OpenAI"): + pass + + spans = memory_exporter.get_finished_spans() + attrs = 
dict(spans[0].attributes) + assert attrs[GenAIAttributes.PROVIDER_NAME] == "openai" + + def test_anthropic_normalized(self, memory_exporter): + with track_llm_call(model="claude-3", provider="Anthropic"): + pass + + spans = memory_exporter.get_finished_spans() + attrs = dict(spans[0].attributes) + assert attrs[GenAIAttributes.PROVIDER_NAME] == "anthropic" + + def test_bedrock_normalized(self, memory_exporter): + with track_llm_call(model="claude-v2", provider="bedrock"): + pass + + spans = memory_exporter.get_finished_spans() + attrs = dict(spans[0].attributes) + assert attrs[GenAIAttributes.PROVIDER_NAME] == "aws.bedrock" + + def test_vertex_normalized(self, memory_exporter): + with track_llm_call(model="gemini-pro", provider="vertex_ai"): + pass + + spans = memory_exporter.get_finished_spans() + attrs = dict(spans[0].attributes) + assert attrs[GenAIAttributes.PROVIDER_NAME] == "gcp.vertex_ai" + + def test_azure_openai_normalized(self, memory_exporter): + with track_llm_call(model="gpt-4", provider="azure_openai"): + pass + + spans = memory_exporter.get_finished_spans() + attrs = dict(spans[0].attributes) + assert attrs[GenAIAttributes.PROVIDER_NAME] == "azure.openai" + + def test_unknown_provider_passthrough(self, memory_exporter): + """Unknown provider names should be normalized to lowercase.""" + with track_llm_call(model="custom-model", provider="CustomProvider"): + pass + + spans = memory_exporter.get_finished_spans() + attrs = dict(spans[0].attributes) + assert attrs[GenAIAttributes.PROVIDER_NAME] == "customprovider" + + +class TestLLMTrackerExtended: + """Extended tests for LLMTracker methods.""" + + def test_set_streaming(self, memory_exporter): + from botanu.tracking.llm import BotanuAttributes + + with track_llm_call(model="gpt-4", provider="openai") as tracker: + tracker.set_streaming(True) + + spans = memory_exporter.get_finished_spans() + attrs = dict(spans[0].attributes) + assert attrs[BotanuAttributes.STREAMING] is True + + def 
test_set_cache_hit(self, memory_exporter): + from botanu.tracking.llm import BotanuAttributes + + with track_llm_call(model="gpt-4", provider="openai") as tracker: + tracker.set_cache_hit(True) + + spans = memory_exporter.get_finished_spans() + attrs = dict(spans[0].attributes) + assert attrs[BotanuAttributes.CACHE_HIT] is True + + def test_set_attempt(self, memory_exporter): + from botanu.tracking.llm import BotanuAttributes + + with track_llm_call(model="gpt-4", provider="openai") as tracker: + tracker.set_attempt(3) + + spans = memory_exporter.get_finished_spans() + attrs = dict(spans[0].attributes) + assert attrs[BotanuAttributes.ATTEMPT_NUMBER] == 3 + + def test_set_response_model(self, memory_exporter): + with track_llm_call(model="gpt-4", provider="openai") as tracker: + tracker.set_response_model("gpt-4-0125-preview") + + spans = memory_exporter.get_finished_spans() + attrs = dict(spans[0].attributes) + assert attrs[GenAIAttributes.RESPONSE_MODEL] == "gpt-4-0125-preview" + + def test_set_tokens_with_cache(self, memory_exporter): + from botanu.tracking.llm import BotanuAttributes + + with track_llm_call(model="claude-3", provider="anthropic") as tracker: + tracker.set_tokens( + input_tokens=100, + output_tokens=50, + cache_read_tokens=80, + cache_write_tokens=20, + ) + + spans = memory_exporter.get_finished_spans() + attrs = dict(spans[0].attributes) + assert attrs[GenAIAttributes.USAGE_INPUT_TOKENS] == 100 + assert attrs[GenAIAttributes.USAGE_OUTPUT_TOKENS] == 50 + assert attrs[BotanuAttributes.TOKENS_CACHED_READ] == 80 + assert attrs[BotanuAttributes.TOKENS_CACHED_WRITE] == 20 + + def test_set_request_id_with_client_id(self, memory_exporter): + from botanu.tracking.llm import BotanuAttributes + + with track_llm_call(model="gpt-4", provider="openai") as tracker: + tracker.set_request_id( + provider_request_id="resp_123", + client_request_id="client_456", + ) + + spans = memory_exporter.get_finished_spans() + attrs = dict(spans[0].attributes) + assert 
attrs[GenAIAttributes.RESPONSE_ID] == "resp_123" + assert attrs[BotanuAttributes.CLIENT_REQUEST_ID] == "client_456" + + def test_set_request_params_extended(self, memory_exporter): + with track_llm_call(model="gpt-4", provider="openai") as tracker: + tracker.set_request_params( + temperature=0.8, + top_p=0.95, + max_tokens=2000, + stop_sequences=["END", "STOP"], + frequency_penalty=0.5, + presence_penalty=0.3, + ) + + spans = memory_exporter.get_finished_spans() + attrs = dict(spans[0].attributes) + assert attrs[GenAIAttributes.REQUEST_TEMPERATURE] == 0.8 + assert attrs[GenAIAttributes.REQUEST_TOP_P] == 0.95 + assert attrs[GenAIAttributes.REQUEST_MAX_TOKENS] == 2000 + # OTel converts lists to tuples + assert attrs[GenAIAttributes.REQUEST_STOP_SEQUENCES] == ("END", "STOP") + assert attrs[GenAIAttributes.REQUEST_FREQUENCY_PENALTY] == 0.5 + assert attrs[GenAIAttributes.REQUEST_PRESENCE_PENALTY] == 0.3 + + def test_add_metadata(self, memory_exporter): + with track_llm_call(model="gpt-4", provider="openai") as tracker: + tracker.add_metadata(custom_field="value", another_field=123) + + spans = memory_exporter.get_finished_spans() + attrs = dict(spans[0].attributes) + assert attrs["botanu.custom_field"] == "value" + assert attrs["botanu.another_field"] == 123 + + def test_add_metadata_preserves_prefix(self, memory_exporter): + with track_llm_call(model="gpt-4", provider="openai") as tracker: + tracker.add_metadata(**{"botanu.explicit": "prefixed"}) + + spans = memory_exporter.get_finished_spans() + attrs = dict(spans[0].attributes) + assert attrs["botanu.explicit"] == "prefixed" + + def test_set_error_manually(self, memory_exporter): + with track_llm_call(model="gpt-4", provider="openai") as tracker: + error = RuntimeError("Rate limit exceeded") + tracker.set_error(error) + + spans = memory_exporter.get_finished_spans() + attrs = dict(spans[0].attributes) + assert attrs[GenAIAttributes.ERROR_TYPE] == "RuntimeError" + + +class TestModelOperationConstants: + """Tests for 
ModelOperation constants.""" + + def test_operation_types(self): + assert ModelOperation.CHAT == "chat" + assert ModelOperation.TEXT_COMPLETION == "text_completion" + assert ModelOperation.EMBEDDINGS == "embeddings" + assert ModelOperation.GENERATE_CONTENT == "generate_content" + assert ModelOperation.EXECUTE_TOOL == "execute_tool" + assert ModelOperation.IMAGE_GENERATION == "image_generation" + assert ModelOperation.SPEECH_TO_TEXT == "speech_to_text" + assert ModelOperation.TEXT_TO_SPEECH == "text_to_speech" + + def test_operation_aliases(self): + """Aliases should match their canonical forms.""" + assert ModelOperation.COMPLETION == ModelOperation.TEXT_COMPLETION + assert ModelOperation.EMBEDDING == ModelOperation.EMBEDDINGS + assert ModelOperation.FUNCTION_CALL == ModelOperation.EXECUTE_TOOL + assert ModelOperation.TOOL_USE == ModelOperation.EXECUTE_TOOL + + +class TestGenAIAttributeConstants: + """Tests for GenAIAttributes and BotanuAttributes constants.""" + + def test_genai_attributes(self): + assert GenAIAttributes.OPERATION_NAME == "gen_ai.operation.name" + assert GenAIAttributes.PROVIDER_NAME == "gen_ai.provider.name" + assert GenAIAttributes.REQUEST_MODEL == "gen_ai.request.model" + assert GenAIAttributes.RESPONSE_MODEL == "gen_ai.response.model" + assert GenAIAttributes.USAGE_INPUT_TOKENS == "gen_ai.usage.input_tokens" + assert GenAIAttributes.USAGE_OUTPUT_TOKENS == "gen_ai.usage.output_tokens" + + def test_botanu_attributes(self): + from botanu.tracking.llm import BotanuAttributes + + assert BotanuAttributes.TOKENS_CACHED == "botanu.usage.cached_tokens" + assert BotanuAttributes.STREAMING == "botanu.request.streaming" + assert BotanuAttributes.CACHE_HIT == "botanu.request.cache_hit" + assert BotanuAttributes.ATTEMPT_NUMBER == "botanu.request.attempt" + assert BotanuAttributes.VENDOR == "botanu.vendor" diff --git a/tests/unit/test_resource_detector.py b/tests/unit/test_resource_detector.py new file mode 100644 index 0000000..7ec32b8 --- /dev/null +++ 
b/tests/unit/test_resource_detector.py @@ -0,0 +1,269 @@ +# SPDX-FileCopyrightText: 2026 The Botanu Authors +# SPDX-License-Identifier: Apache-2.0 + +"""Tests for resource detection.""" + +from __future__ import annotations + +import os +import sys +from unittest import mock + +from botanu.resources.detector import ( + detect_all_resources, + detect_cloud_provider, + detect_container, + detect_host, + detect_kubernetes, + detect_process, + detect_serverless, + get_resource_attributes, +) + + +class TestDetectHost: + """Tests for host detection.""" + + def test_detects_hostname(self): + attrs = detect_host() + assert "host.name" in attrs + assert isinstance(attrs["host.name"], str) + + def test_detects_os_type(self): + attrs = detect_host() + assert attrs["os.type"] == sys.platform + + def test_detects_host_arch(self): + attrs = detect_host() + assert "host.arch" in attrs + + +class TestDetectProcess: + """Tests for process detection.""" + + def test_detects_pid(self): + attrs = detect_process() + assert attrs["process.pid"] == os.getpid() + + def test_detects_runtime(self): + attrs = detect_process() + assert attrs["process.runtime.name"] == "python" + assert "process.runtime.version" in attrs + + +class TestDetectKubernetes: + """Tests for Kubernetes detection.""" + + def test_no_k8s_when_not_in_cluster(self): + with mock.patch.dict(os.environ, {}, clear=True): + os.environ.pop("KUBERNETES_SERVICE_HOST", None) + attrs = detect_kubernetes() + assert attrs == {} + + def test_detects_k8s_pod_name(self): + with mock.patch.dict( + os.environ, + { + "KUBERNETES_SERVICE_HOST": "10.0.0.1", + "HOSTNAME": "my-pod-abc123", + "K8S_NAMESPACE": "default", + }, + ): + attrs = detect_kubernetes() + assert attrs.get("k8s.pod.name") == "my-pod-abc123" + assert attrs.get("k8s.namespace.name") == "default" + + def test_detects_k8s_from_env_vars(self): + with mock.patch.dict( + os.environ, + { + "KUBERNETES_SERVICE_HOST": "10.0.0.1", + "K8S_POD_NAME": "explicit-pod", + "K8S_POD_UID": 
"uid-12345", + "K8S_CLUSTER_NAME": "prod-cluster", + }, + ): + attrs = detect_kubernetes() + assert attrs.get("k8s.pod.name") == "explicit-pod" + assert attrs.get("k8s.pod.uid") == "uid-12345" + assert attrs.get("k8s.cluster.name") == "prod-cluster" + + +class TestDetectCloudProvider: + """Tests for cloud provider detection.""" + + def test_no_cloud_when_not_in_cloud(self): + with mock.patch.dict(os.environ, {}, clear=True): + # Clear all cloud env vars + for key in list(os.environ.keys()): + if any( + prefix in key + for prefix in ["AWS_", "GOOGLE_", "GCLOUD_", "GCP_", "AZURE_", "K_", "FUNCTION_", "WEBSITE_"] + ): + os.environ.pop(key, None) + attrs = detect_cloud_provider() + assert "cloud.provider" not in attrs + + def test_detects_aws(self): + with mock.patch.dict( + os.environ, + { + "AWS_REGION": "us-east-1", + "AWS_ACCOUNT_ID": "123456789012", + }, + clear=False, + ): + attrs = detect_cloud_provider() + assert attrs.get("cloud.provider") == "aws" + assert attrs.get("cloud.region") == "us-east-1" + + def test_detects_aws_lambda(self): + with mock.patch.dict( + os.environ, + { + "AWS_LAMBDA_FUNCTION_NAME": "my-function", + "AWS_LAMBDA_FUNCTION_VERSION": "$LATEST", + "AWS_REGION": "us-west-2", + }, + clear=False, + ): + attrs = detect_cloud_provider() + assert attrs.get("cloud.provider") == "aws" + assert attrs.get("faas.name") == "my-function" + + def test_detects_gcp(self): + with mock.patch.dict( + os.environ, + {"GOOGLE_CLOUD_PROJECT": "my-project", "GOOGLE_CLOUD_REGION": "us-central1"}, + clear=False, + ): + # Clear AWS vars + os.environ.pop("AWS_REGION", None) + os.environ.pop("AWS_DEFAULT_REGION", None) + attrs = detect_cloud_provider() + assert attrs.get("cloud.provider") == "gcp" + assert attrs.get("cloud.account.id") == "my-project" + + def test_detects_gcp_cloud_run(self): + with mock.patch.dict( + os.environ, + { + "K_SERVICE": "my-service", + "K_REVISION": "my-service-00001", + "GOOGLE_CLOUD_PROJECT": "my-project", + }, + clear=False, + ): + 
os.environ.pop("AWS_REGION", None) + attrs = detect_cloud_provider() + assert attrs.get("cloud.provider") == "gcp" + assert attrs.get("faas.name") == "my-service" + + def test_detects_azure(self): + with mock.patch.dict( + os.environ, + { + "WEBSITE_SITE_NAME": "my-app", + "AZURE_SUBSCRIPTION_ID": "sub-12345", + "REGION_NAME": "eastus", + }, + clear=False, + ): + # Clear other cloud vars + os.environ.pop("AWS_REGION", None) + os.environ.pop("GOOGLE_CLOUD_PROJECT", None) + attrs = detect_cloud_provider() + assert attrs.get("cloud.provider") == "azure" + assert attrs.get("faas.name") == "my-app" + + +class TestDetectContainer: + """Tests for container detection.""" + + def test_detects_container_id_from_env(self): + with mock.patch.dict(os.environ, {"CONTAINER_ID": "abc123def456"}): + attrs = detect_container() + # Container ID detection depends on cgroup files + # In test environment, may or may not detect + assert isinstance(attrs, dict) + + +class TestDetectServerless: + """Tests for serverless/FaaS detection.""" + + def test_detects_lambda(self): + with mock.patch.dict( + os.environ, + { + "AWS_LAMBDA_FUNCTION_NAME": "my-lambda", + "AWS_LAMBDA_FUNCTION_VERSION": "1", + "AWS_LAMBDA_FUNCTION_MEMORY_SIZE": "512", + }, + ): + attrs = detect_serverless() + assert attrs.get("faas.name") == "my-lambda" + assert attrs.get("faas.version") == "1" + assert attrs.get("faas.max_memory") == 512 * 1024 * 1024 + + def test_detects_cloud_run(self): + with mock.patch.dict( + os.environ, + { + "K_SERVICE": "cloud-run-service", + "K_REVISION": "rev-001", + }, + ): + # Clear Lambda vars + os.environ.pop("AWS_LAMBDA_FUNCTION_NAME", None) + attrs = detect_serverless() + assert attrs.get("faas.name") == "cloud-run-service" + assert attrs.get("faas.version") == "rev-001" + + +class TestDetectAllResources: + """Tests for combined resource detection.""" + + def test_returns_dict(self): + attrs = detect_all_resources() + assert isinstance(attrs, dict) + + def test_includes_host_info(self): 
+ # Clear cache to ensure fresh detection + detect_all_resources.cache_clear() + attrs = detect_all_resources() + assert "host.name" in attrs + assert "process.pid" in attrs + + def test_caches_results(self): + detect_all_resources.cache_clear() + result1 = detect_all_resources() + result2 = detect_all_resources() + assert result1 is result2 # Same object due to caching + + +class TestGetResourceAttributes: + """Tests for selective resource detection.""" + + def test_include_host_only(self): + attrs = get_resource_attributes( + include_host=True, + include_process=False, + include_container=False, + include_cloud=False, + include_k8s=False, + include_faas=False, + ) + assert "host.name" in attrs + assert "process.pid" not in attrs + + def test_include_process_only(self): + attrs = get_resource_attributes( + include_host=False, + include_process=True, + include_container=False, + include_cloud=False, + include_k8s=False, + include_faas=False, + ) + assert "process.pid" in attrs + assert "host.name" not in attrs diff --git a/tests/unit/test_run_context.py b/tests/unit/test_run_context.py new file mode 100644 index 0000000..0869676 --- /dev/null +++ b/tests/unit/test_run_context.py @@ -0,0 +1,204 @@ +# SPDX-FileCopyrightText: 2026 The Botanu Authors +# SPDX-License-Identifier: Apache-2.0 + +"""Tests for RunContext model.""" + +from __future__ import annotations + +import os +import re +import time +from unittest import mock + +from botanu.models.run_context import ( + RunContext, + RunStatus, + generate_run_id, +) + + +class TestGenerateRunId: + """Tests for UUIDv7 generation.""" + + def test_format_is_uuid(self): + """run_id should be valid UUID format.""" + run_id = generate_run_id() + uuid_pattern = r"^[0-9a-f]{8}-[0-9a-f]{4}-7[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$" + assert re.match(uuid_pattern, run_id), f"Invalid UUID format: {run_id}" + + def test_uniqueness(self): + """Generated IDs should be unique.""" + ids = [generate_run_id() for _ in range(1000)] + 
class TestGenerateRunId:
    """Tests for UUIDv7 generation."""

    def test_format_is_uuid(self):
        """run_id should be a well-formed version-7 UUID string."""
        run_id = generate_run_id()
        # version nibble must be 7, variant nibble must be 8/9/a/b
        pattern = (
            r"^[0-9a-f]{8}-[0-9a-f]{4}-7[0-9a-f]{3}"
            r"-[89ab][0-9a-f]{3}-[0-9a-f]{12}$"
        )
        assert re.match(pattern, run_id), f"Invalid UUID format: {run_id}"

    def test_uniqueness(self):
        """A batch of generated IDs contains no duplicates."""
        generated = {generate_run_id() for _ in range(1000)}
        assert len(generated) == 1000

    def test_sortable_by_time(self):
        """Lexicographic order follows generation time."""
        earlier = generate_run_id()
        time.sleep(0.002)  # cross a millisecond boundary
        later = generate_run_id()
        assert earlier < later


class TestRunContextCreate:
    """Tests for the RunContext.create factory."""

    def test_creates_with_required_fields(self):
        context = RunContext.create(use_case="Customer Support")
        assert context.run_id is not None
        assert context.use_case == "Customer Support"
        # "production" is the default environment
        assert context.environment == "production"
        assert context.attempt == 1

    def test_root_run_id_defaults_to_run_id(self):
        context = RunContext.create(use_case="test")
        assert context.root_run_id == context.run_id

    def test_accepts_custom_root_run_id(self):
        context = RunContext.create(use_case="test", root_run_id="custom-root")
        assert context.root_run_id == "custom-root"

    def test_environment_from_env_var(self):
        with mock.patch.dict(os.environ, {"BOTANU_ENVIRONMENT": "staging"}):
            assert RunContext.create(use_case="test").environment == "staging"

    def test_explicit_environment_overrides_env_var(self):
        with mock.patch.dict(os.environ, {"BOTANU_ENVIRONMENT": "staging"}):
            context = RunContext.create(use_case="test", environment="production")
            assert context.environment == "production"


class TestRunContextRetry:
    """Tests for retry handling."""

    def test_create_retry_increments_attempt(self):
        first_attempt = RunContext.create(use_case="test")
        retry = RunContext.create_retry(first_attempt)

        assert retry.attempt == 2
        assert retry.retry_of_run_id == first_attempt.run_id
        assert retry.root_run_id == first_attempt.root_run_id
        assert retry.run_id != first_attempt.run_id

    def test_multiple_retries_preserve_root(self):
        first_attempt = RunContext.create(use_case="test")
        second_attempt = RunContext.create_retry(first_attempt)
        third_attempt = RunContext.create_retry(second_attempt)

        assert third_attempt.attempt == 3
        # Retries of retries still point at the original run.
        assert third_attempt.root_run_id == first_attempt.run_id


class TestRunContextDeadline:
    """Tests for deadline handling."""

    def test_deadline_seconds(self):
        context = RunContext.create(use_case="test", deadline_seconds=10.0)
        assert context.deadline is not None
        assert context.deadline > time.time()

    def test_is_past_deadline(self):
        context = RunContext.create(use_case="test", deadline_seconds=0.001)
        time.sleep(0.01)
        assert context.is_past_deadline() is True

    def test_remaining_time_seconds(self):
        context = RunContext.create(use_case="test", deadline_seconds=10.0)
        remaining = context.remaining_time_seconds()
        assert remaining is not None
        # Only a sliver of the 10s budget should have elapsed.
        assert 9.0 < remaining <= 10.0


class TestRunContextCancellation:
    """Tests for cancellation handling."""

    def test_request_cancellation(self):
        context = RunContext.create(use_case="test")
        assert context.is_cancelled() is False

        context.request_cancellation("user")
        assert context.is_cancelled() is True
        assert context.cancelled_at is not None


class TestRunContextOutcome:
    """Tests for outcome recording."""

    def test_complete_sets_outcome(self):
        context = RunContext.create(use_case="test")
        context.complete(
            status=RunStatus.SUCCESS,
            value_type="tickets_resolved",
            value_amount=1.0,
        )

        outcome = context.outcome
        assert outcome is not None
        assert outcome.status == RunStatus.SUCCESS
        assert outcome.value_type == "tickets_resolved"
        assert outcome.value_amount == 1.0
ctx = RunContext.create( + use_case="Customer Support", + workflow="handle_ticket", + tenant_id="tenant-123", + ) + baggage = ctx.to_baggage_dict() + + assert baggage["botanu.workflow"] == "handle_ticket" + assert baggage["botanu.tenant_id"] == "tenant-123" + + def test_to_span_attributes(self): + ctx = RunContext.create( + use_case="Customer Support", + workflow="handle_ticket", + tenant_id="tenant-123", + ) + attrs = ctx.to_span_attributes() + + assert attrs["botanu.run_id"] == ctx.run_id + assert attrs["botanu.use_case"] == "Customer Support" + assert attrs["botanu.workflow"] == "handle_ticket" + assert attrs["botanu.tenant_id"] == "tenant-123" + + def test_from_baggage_roundtrip(self): + original = RunContext.create( + use_case="test", + workflow="my_workflow", + tenant_id="tenant-abc", + ) + baggage = original.to_baggage_dict(lean_mode=False) + restored = RunContext.from_baggage(baggage) + + assert restored is not None + assert restored.run_id == original.run_id + assert restored.use_case == original.use_case + assert restored.workflow == original.workflow + assert restored.tenant_id == original.tenant_id + + def test_from_baggage_returns_none_for_missing_fields(self): + result = RunContext.from_baggage({}) + assert result is None + + result = RunContext.from_baggage({"botanu.run_id": "some-id"}) + assert result is None diff --git a/tests/unit/test_span_helpers.py b/tests/unit/test_span_helpers.py new file mode 100644 index 0000000..799bcf4 --- /dev/null +++ b/tests/unit/test_span_helpers.py @@ -0,0 +1,124 @@ +# SPDX-FileCopyrightText: 2026 The Botanu Authors +# SPDX-License-Identifier: Apache-2.0 + +"""Tests for span helper functions.""" + +from __future__ import annotations + +from opentelemetry import trace + +from botanu.sdk.span_helpers import emit_outcome, set_business_context + + +class TestEmitOutcome: + """Tests for emit_outcome function.""" + + def test_emit_success_outcome(self, memory_exporter): + tracer = trace.get_tracer("test") + with 
tracer.start_as_current_span("test-span"): + emit_outcome("success") + + spans = memory_exporter.get_finished_spans() + attrs = dict(spans[0].attributes) + assert attrs.get("botanu.outcome") == "success" + + def test_emit_failure_outcome(self, memory_exporter): + tracer = trace.get_tracer("test") + with tracer.start_as_current_span("test-span"): + emit_outcome("failed", reason="timeout") + + spans = memory_exporter.get_finished_spans() + attrs = dict(spans[0].attributes) + assert attrs.get("botanu.outcome") == "failed" + assert attrs.get("botanu.outcome.reason") == "timeout" + + def test_emit_outcome_with_value(self, memory_exporter): + tracer = trace.get_tracer("test") + with tracer.start_as_current_span("test-span"): + emit_outcome( + "success", + value_type="tickets_resolved", + value_amount=5.0, + ) + + spans = memory_exporter.get_finished_spans() + attrs = dict(spans[0].attributes) + assert attrs.get("botanu.outcome") == "success" + assert attrs.get("botanu.outcome.value_type") == "tickets_resolved" + assert attrs.get("botanu.outcome.value_amount") == 5.0 + + def test_emit_outcome_with_confidence(self, memory_exporter): + tracer = trace.get_tracer("test") + with tracer.start_as_current_span("test-span"): + emit_outcome("success", confidence=0.95) + + spans = memory_exporter.get_finished_spans() + attrs = dict(spans[0].attributes) + assert attrs.get("botanu.outcome.confidence") == 0.95 + + def test_emit_outcome_adds_event(self, memory_exporter): + tracer = trace.get_tracer("test") + with tracer.start_as_current_span("test-span"): + emit_outcome("success", value_type="orders", value_amount=1) + + spans = memory_exporter.get_finished_spans() + events = [e for e in spans[0].events if e.name == "botanu.outcome_emitted"] + assert len(events) == 1 + assert events[0].attributes["status"] == "success" + + +class TestSetBusinessContext: + """Tests for set_business_context function.""" + + def test_set_customer_id(self, memory_exporter): + tracer = 
trace.get_tracer("test") + with tracer.start_as_current_span("test-span"): + set_business_context(customer_id="cust-123") + + spans = memory_exporter.get_finished_spans() + attrs = dict(spans[0].attributes) + assert attrs.get("botanu.customer_id") == "cust-123" + + def test_set_team(self, memory_exporter): + tracer = trace.get_tracer("test") + with tracer.start_as_current_span("test-span"): + set_business_context(team="platform-team") + + spans = memory_exporter.get_finished_spans() + attrs = dict(spans[0].attributes) + assert attrs.get("botanu.team") == "platform-team" + + def test_set_cost_center(self, memory_exporter): + tracer = trace.get_tracer("test") + with tracer.start_as_current_span("test-span"): + set_business_context(cost_center="CC-456") + + spans = memory_exporter.get_finished_spans() + attrs = dict(spans[0].attributes) + assert attrs.get("botanu.cost_center") == "CC-456" + + def test_set_region(self, memory_exporter): + tracer = trace.get_tracer("test") + with tracer.start_as_current_span("test-span"): + set_business_context(region="us-west-2") + + spans = memory_exporter.get_finished_spans() + attrs = dict(spans[0].attributes) + assert attrs.get("botanu.region") == "us-west-2" + + def test_set_multiple_contexts(self, memory_exporter): + tracer = trace.get_tracer("test") + with tracer.start_as_current_span("test-span"): + set_business_context( + customer_id="cust-123", + team="support", + cost_center="CC-456", + region="eu-central-1", + ) + + spans = memory_exporter.get_finished_spans() + attrs = dict(spans[0].attributes) + assert attrs.get("botanu.customer_id") == "cust-123" + assert attrs.get("botanu.team") == "support" + assert attrs.get("botanu.cost_center") == "CC-456" + assert attrs.get("botanu.region") == "eu-central-1" From 7d65915052841d069dc30b5733b29ce64d24eb74 Mon Sep 17 00:00:00 2001 From: Deborah Jacob Date: Fri, 6 Feb 2026 11:43:08 -0500 Subject: [PATCH 2/2] Add GitHub Actions workflow for PyPI publishing Supports: - TestPyPI 
# SPDX-FileCopyrightText: 2026 The Botanu Authors
# SPDX-License-Identifier: Apache-2.0

# Publishes the package via PyPI "trusted publisher" (OIDC) — no API
# tokens are stored. TestPyPI is reachable through workflow_dispatch;
# real releases publish to PyPI automatically.
name: Publish to PyPI

on:
  release:
    types: [published]
  workflow_dispatch:
    inputs:
      target:
        description: 'Target repository'
        required: true
        default: 'testpypi'
        type: choice
        options:
          - testpypi
          - pypi

# Default to read-only; the publish jobs opt in to id-token below.
permissions:
  contents: read

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0  # full history so tag-based versioning resolves
          persist-credentials: false  # hardening: build never pushes

      - uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install build dependencies
        run: pip install build

      - name: Build package
        run: python -m build

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
          name: dist
          path: dist/

  publish-testpypi:
    needs: build
    runs-on: ubuntu-latest
    # Only manual dispatches targeting TestPyPI run this job.
    if: github.event_name == 'workflow_dispatch' && inputs.target == 'testpypi'
    environment: testpypi
    permissions:
      id-token: write  # OIDC trusted publishing
    steps:
      - name: Download artifacts
        uses: actions/download-artifact@v4
        with:
          name: dist
          path: dist/

      - name: Publish to TestPyPI
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          repository-url: https://test.pypi.org/legacy/

  publish-pypi:
    needs: build
    runs-on: ubuntu-latest
    # Releases always publish; manual dispatch may also target PyPI.
    if: github.event_name == 'release' || (github.event_name == 'workflow_dispatch' && inputs.target == 'pypi')
    environment: pypi
    permissions:
      id-token: write  # OIDC trusted publishing
    steps:
      - name: Download artifacts
        uses: actions/download-artifact@v4
        with:
          name: dist
          path: dist/

      - name: Publish to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1