Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/auto_merge_codex_pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ jobs:
auto-merge:
if: github.event.workflow_run.conclusion == 'success' && startsWith(github.event.workflow_run.head_branch, 'codex/monthly-review-issue-')
runs-on: ubuntu-latest
timeout-minutes: 15
permissions:
contents: write
pull-requests: write
Expand Down
4 changes: 4 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,13 @@ on:
branches: [main]
pull_request:

permissions:
contents: read

jobs:
test:
runs-on: ubuntu-latest
timeout-minutes: 20
steps:
- uses: actions/checkout@v6
- uses: actions/setup-python@v6
Expand Down
2 changes: 2 additions & 0 deletions .github/workflows/codex_pr_feedback.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ jobs:
ci-feedback:
if: github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'failure' && startsWith(github.event.workflow_run.head_branch, 'codex/monthly-review-issue-')
runs-on: ubuntu-latest
timeout-minutes: 15
permissions:
issues: write
pull-requests: read
Expand Down Expand Up @@ -114,6 +115,7 @@ jobs:
review-feedback:
if: github.event_name == 'pull_request_review' && github.event.review.state == 'changes_requested' && startsWith(github.event.pull_request.head.ref, 'codex/monthly-review-issue-')
runs-on: ubuntu-latest
timeout-minutes: 15
permissions:
issues: write

Expand Down
7 changes: 6 additions & 1 deletion .github/workflows/monthly_review.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,15 @@ name: Monthly Snapshot Review
required: false
default: ""

concurrency:
group: ${{ github.workflow }}-${{ github.ref_name }}
cancel-in-progress: false

jobs:
monthly-review:
if: github.event_name == 'workflow_dispatch' || (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success' && contains(fromJSON('["schedule","workflow_run"]'), github.event.workflow_run.event))
runs-on: ubuntu-latest
timeout-minutes: 60
concurrency:
group: monthly-snapshot-review-${{ github.ref_name }}
cancel-in-progress: false
Expand Down Expand Up @@ -162,7 +167,7 @@ jobs:
"User-Agent": "us-equity-snapshot-monthly-review",
},
)
with urllib.request.urlopen(request) as response:
with urllib.request.urlopen(request, timeout=30) as response:
return response.status

def dispatch_codex() -> None:
Expand Down
6 changes: 6 additions & 0 deletions .github/workflows/publish-snapshot-artifacts.yml
Original file line number Diff line number Diff line change
Expand Up @@ -69,10 +69,15 @@ on:
required: false
type: string

concurrency:
group: ${{ github.workflow }}-${{ github.ref_name }}
cancel-in-progress: false
Comment on lines +72 to +74

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Include the dispatched profile in the concurrency group

When a Publish Snapshot Artifacts run is already in progress, the manual refresh path in update-source-input-data.yml dispatches this workflow twice back-to-back for the Russell and mega-cap profiles; with this workflow-level group both dispatched runs share the same key. GitHub’s default concurrency behavior allows only one pending run and cancels/replaces an existing pending run, so the first profile publish can be dropped before it ever runs (GitHub docs). Include inputs.profile in the group or use queue: max so both requested publishes are preserved.

Useful? React with 👍 / 👎.


jobs:
build-and-publish:
if: github.event_name != 'workflow_run' || (github.event.workflow_run.conclusion == 'success' && github.event.workflow_run.event == 'schedule')
runs-on: ubuntu-latest
timeout-minutes: 60
strategy:
fail-fast: false
matrix:
Expand Down Expand Up @@ -456,6 +461,7 @@ jobs:
needs: build-and-publish
if: ${{ failure() && github.event_name == 'schedule' && fromJSON(github.run_attempt) < 2 }}
runs-on: ubuntu-latest
timeout-minutes: 60
permissions:
actions: write
contents: read
Expand Down
7 changes: 7 additions & 0 deletions .github/workflows/publish-strategy-plugins.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,14 @@ on:
required: false
type: string

concurrency:
group: ${{ github.workflow }}-${{ github.ref_name }}
cancel-in-progress: false

jobs:
market-regime-control:
runs-on: ubuntu-latest
timeout-minutes: 60
strategy:
fail-fast: false
matrix:
Expand Down Expand Up @@ -361,6 +366,7 @@ jobs:

crisis-response-shadow:
runs-on: ubuntu-latest
timeout-minutes: 60
strategy:
fail-fast: false
matrix:
Expand Down Expand Up @@ -572,6 +578,7 @@ jobs:

taco-rebound-shadow:
runs-on: ubuntu-latest
timeout-minutes: 60
permissions:
contents: read
id-token: write
Expand Down
5 changes: 5 additions & 0 deletions .github/workflows/snapshot-artifact-health.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,14 @@ on:
default: '1'
type: string

concurrency:
group: ${{ github.workflow }}-${{ github.ref_name }}
cancel-in-progress: false

jobs:
check:
runs-on: ubuntu-latest
timeout-minutes: 20
strategy:
fail-fast: false
matrix:
Expand Down
6 changes: 6 additions & 0 deletions .github/workflows/update-source-input-data.yml
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,14 @@ on:
default: '100'
type: string

concurrency:
group: ${{ github.workflow }}-${{ github.ref_name }}
cancel-in-progress: false

jobs:
update-inputs:
runs-on: ubuntu-latest
timeout-minutes: 20
permissions:
actions: write
contents: read
Expand Down Expand Up @@ -285,6 +290,7 @@ jobs:
needs: update-inputs
if: ${{ failure() && github.event_name == 'schedule' && fromJSON(github.run_attempt) < 2 }}
runs-on: ubuntu-latest
timeout-minutes: 20
permissions:
actions: write
contents: read
Expand Down
12 changes: 10 additions & 2 deletions scripts/post_monthly_ai_review_issue.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,17 @@

DEFAULT_API_URL = "https://api.github.com"
DEFAULT_LABEL = "monthly-review"
DEFAULT_TIMEOUT_SECONDS = 30


def github_request(method: str, url: str, token: str, payload: dict[str, Any] | None = None) -> Any:
def github_request(
method: str,
url: str,
token: str,
payload: dict[str, Any] | None = None,
*,
timeout: float = DEFAULT_TIMEOUT_SECONDS,
) -> Any:
data = None
headers = {
"Accept": "application/vnd.github+json",
Expand All @@ -27,7 +35,7 @@ def github_request(method: str, url: str, token: str, payload: dict[str, Any] |
data = json.dumps(payload).encode("utf-8")
headers["Content-Type"] = "application/json"
request = urllib.request.Request(url, data=data, headers=headers, method=method)
with urllib.request.urlopen(request) as response:
with urllib.request.urlopen(request, timeout=timeout) as response:
charset = response.headers.get_content_charset("utf-8")
raw = response.read().decode(charset)
return json.loads(raw) if raw else None
Expand Down
31 changes: 29 additions & 2 deletions src/us_equity_snapshot_pipelines/input_sources.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,30 @@
from __future__ import annotations

import argparse
import shutil
import shlex
import subprocess
from collections.abc import Callable
from dataclasses import dataclass
from pathlib import Path
from urllib.parse import urlparse
from urllib.request import urlretrieve
from urllib.request import Request, urlopen

SUPPORTED_TABLE_SUFFIXES = frozenset({".csv", ".json", ".jsonl", ".parquet"})
SUPPORTED_CONFIG_SUFFIXES = frozenset({".json"})
DEFAULT_REMOTE_COPY_TIMEOUT_SECONDS = 60
SENSITIVE_REMOTE_SOURCE_QUERY_MARKERS = (
"access_token=",
"api_key=",
"auth=",
"jwt=",
"password=",
"secret=",
"signature=",
"token=",
"x-amz-signature=",
"x-goog-signature=",
)

CopyFn = Callable[[str, Path], None]

Expand All @@ -37,12 +51,23 @@ def source_needs_gcloud(source: str | None) -> bool:
return is_gcs_uri(str(source or ""))


def _reject_sensitive_remote_source(source: str) -> None:
normalized = str(source or "").strip().lower()
if not (is_gcs_uri(normalized) or is_http_uri(normalized)):
return
for marker in SENSITIVE_REMOTE_SOURCE_QUERY_MARKERS:
if marker in normalized:
raise ValueError("remote input URI must not contain token/password/signature-like query parameters")


def _default_gcs_copy(source: str, target: Path) -> None:
subprocess.run(["gcloud", "storage", "cp", source, str(target)], check=True)


def _default_http_copy(source: str, target: Path) -> None:
urlretrieve(source, target) # noqa: S310 - operator-supplied data source URL.
request = Request(source, headers={"User-Agent": "us-equity-snapshot-pipelines"})
with urlopen(request, timeout=DEFAULT_REMOTE_COPY_TIMEOUT_SECONDS) as response, target.open("wb") as output: # noqa: S310 - operator-supplied data source URL.
shutil.copyfileobj(response, output)


def _source_suffix(source: str, *, allowed_suffixes: frozenset[str], default_suffix: str) -> str:
Expand All @@ -69,6 +94,8 @@ def resolve_input_source(
output_root = Path(output_dir)
output_root.mkdir(parents=True, exist_ok=True)

_reject_sensitive_remote_source(source_text)

if is_gcs_uri(source_text):
target = output_root / f"{stem}{_source_suffix(source_text, allowed_suffixes=allowed_suffixes, default_suffix=default_suffix)}"
(gcs_copy or _default_gcs_copy)(source_text, target)
Expand Down
47 changes: 46 additions & 1 deletion tests/test_input_sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,13 @@

import pytest

from us_equity_snapshot_pipelines.input_sources import resolve_input_source, resolve_input_sources, source_needs_gcloud
from us_equity_snapshot_pipelines.input_sources import (
DEFAULT_REMOTE_COPY_TIMEOUT_SECONDS,
_default_http_copy,
resolve_input_source,
resolve_input_sources,
source_needs_gcloud,
)


def test_resolves_local_input_without_copying(tmp_path) -> None:
Expand Down Expand Up @@ -59,3 +65,42 @@ def test_source_needs_gcloud_only_for_gcs() -> None:
assert source_needs_gcloud("gs://bucket/path.csv") is True
assert source_needs_gcloud("https://example.com/path.csv") is False
assert source_needs_gcloud("data/prices.csv") is False


def test_resolve_input_source_rejects_secret_like_remote_uri(tmp_path) -> None:
with pytest.raises(ValueError, match="must not contain token"):
resolve_input_source(
"https://example.com/prices.csv?token=abc",
output_dir=tmp_path / "resolved",
stem="prices",
)


def test_default_http_copy_uses_timeout_and_streams_response(monkeypatch, tmp_path) -> None:
calls: list[tuple[str, float]] = []

class FakeResponse:
def __init__(self) -> None:
self._payload = b"symbol,close\nQQQ,100\n"

def __enter__(self) -> "FakeResponse":
return self

def __exit__(self, *args: object) -> None:
return None

def read(self, size: int = -1) -> bytes:
payload, self._payload = self._payload, b""
return payload

def fake_urlopen(request, *, timeout: float):
calls.append((request.full_url, timeout))
return FakeResponse()

monkeypatch.setattr("us_equity_snapshot_pipelines.input_sources.urlopen", fake_urlopen)
target = tmp_path / "prices.csv"

_default_http_copy("https://example.com/prices.csv", target)

assert target.read_text(encoding="utf-8") == "symbol,close\nQQQ,100\n"
assert calls == [("https://example.com/prices.csv", DEFAULT_REMOTE_COPY_TIMEOUT_SECONDS)]
50 changes: 50 additions & 0 deletions tests/test_post_monthly_ai_review_issue.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
from __future__ import annotations

import importlib.util
from pathlib import Path


SCRIPT = Path(__file__).resolve().parents[1] / "scripts" / "post_monthly_ai_review_issue.py"


def _load_script_module():
spec = importlib.util.spec_from_file_location("post_monthly_ai_review_issue", SCRIPT)
assert spec is not None
assert spec.loader is not None
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
return module


def test_github_request_uses_default_timeout(monkeypatch) -> None:
module = _load_script_module()
calls: list[tuple[str, str, float]] = []

class FakeHeaders:
def get_content_charset(self, default: str) -> str:
return default

class FakeResponse:
headers = FakeHeaders()

def __enter__(self) -> "FakeResponse":
return self

def __exit__(self, *args: object) -> None:
return None

def read(self) -> bytes:
return b'{"ok": true}'

def fake_urlopen(request, *, timeout: float):
calls.append((request.get_method(), request.full_url, timeout))
return FakeResponse()

monkeypatch.setattr(module.urllib.request, "urlopen", fake_urlopen)

result = module.github_request("GET", "https://api.github.com/repos/example/repo/issues", "token")

assert result == {"ok": True}
assert calls == [
("GET", "https://api.github.com/repos/example/repo/issues", module.DEFAULT_TIMEOUT_SECONDS),
]