diff --git a/.github/workflows/auto_merge_codex_pr.yml b/.github/workflows/auto_merge_codex_pr.yml index dc994fa..7dd1a0e 100644 --- a/.github/workflows/auto_merge_codex_pr.yml +++ b/.github/workflows/auto_merge_codex_pr.yml @@ -9,6 +9,7 @@ jobs: auto-merge: if: github.event.workflow_run.conclusion == 'success' && startsWith(github.event.workflow_run.head_branch, 'codex/monthly-review-issue-') runs-on: ubuntu-latest + timeout-minutes: 15 permissions: contents: write pull-requests: write diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1424bc4..6fc82b0 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -5,9 +5,13 @@ on: branches: [main] pull_request: +permissions: + contents: read + jobs: test: runs-on: ubuntu-latest + timeout-minutes: 20 steps: - uses: actions/checkout@v6 - uses: actions/setup-python@v6 diff --git a/.github/workflows/codex_pr_feedback.yml b/.github/workflows/codex_pr_feedback.yml index ea61cf7..28681b1 100644 --- a/.github/workflows/codex_pr_feedback.yml +++ b/.github/workflows/codex_pr_feedback.yml @@ -11,6 +11,7 @@ jobs: ci-feedback: if: github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'failure' && startsWith(github.event.workflow_run.head_branch, 'codex/monthly-review-issue-') runs-on: ubuntu-latest + timeout-minutes: 15 permissions: issues: write pull-requests: read @@ -114,6 +115,7 @@ jobs: review-feedback: if: github.event_name == 'pull_request_review' && github.event.review.state == 'changes_requested' && startsWith(github.event.pull_request.head.ref, 'codex/monthly-review-issue-') runs-on: ubuntu-latest + timeout-minutes: 15 permissions: issues: write diff --git a/.github/workflows/monthly_review.yml b/.github/workflows/monthly_review.yml index 882e444..e0c8725 100644 --- a/.github/workflows/monthly_review.yml +++ b/.github/workflows/monthly_review.yml @@ -15,10 +15,15 @@ name: Monthly Snapshot Review required: false default: "" +concurrency: + group: ${{ github.workflow }}-${{ github.ref_name }} + cancel-in-progress: false + jobs: monthly-review: if: github.event_name == 'workflow_dispatch' || (github.event_name == 'workflow_run' && github.event.workflow_run.conclusion == 'success' && contains(fromJSON('["schedule","workflow_run"]'), github.event.workflow_run.event)) runs-on: ubuntu-latest + timeout-minutes: 60 concurrency: group: monthly-snapshot-review-${{ github.ref_name }} cancel-in-progress: false @@ -162,7 +167,7 @@ jobs: "User-Agent": "us-equity-snapshot-monthly-review", }, ) - with urllib.request.urlopen(request) as response: + with urllib.request.urlopen(request, timeout=30) as response: return response.status def dispatch_codex() -> None: diff --git a/.github/workflows/publish-snapshot-artifacts.yml b/.github/workflows/publish-snapshot-artifacts.yml index 07578a6..e566c1d 100644 --- a/.github/workflows/publish-snapshot-artifacts.yml +++ b/.github/workflows/publish-snapshot-artifacts.yml @@ -69,10 +69,15 @@ on: required: false type: string +concurrency: + group: ${{ github.workflow }}-${{ github.ref_name }} + cancel-in-progress: false + jobs: build-and-publish: if: github.event_name != 'workflow_run' || (github.event.workflow_run.conclusion == 'success' && github.event.workflow_run.event == 'schedule') runs-on: ubuntu-latest + timeout-minutes: 60 strategy: fail-fast: false matrix: @@ -456,6 +461,7 @@ jobs: needs: build-and-publish if: ${{ failure() && github.event_name == 'schedule' && fromJSON(github.run_attempt) < 2 }} runs-on: ubuntu-latest + timeout-minutes: 60 permissions: actions: write contents: read diff --git a/.github/workflows/publish-strategy-plugins.yml b/.github/workflows/publish-strategy-plugins.yml index 676b247..c3678bd 100644 --- a/.github/workflows/publish-strategy-plugins.yml +++ b/.github/workflows/publish-strategy-plugins.yml @@ -37,9 +37,14 @@ on: required: false type: string +concurrency: + group: ${{ github.workflow }}-${{ github.ref_name }} + cancel-in-progress: false + jobs: market-regime-control: runs-on: ubuntu-latest + timeout-minutes: 60 strategy: fail-fast: false matrix: @@ -361,6 +366,7 @@ jobs: crisis-response-shadow: runs-on: ubuntu-latest + timeout-minutes: 60 strategy: fail-fast: false matrix: @@ -572,6 +578,7 @@ jobs: taco-rebound-shadow: runs-on: ubuntu-latest + timeout-minutes: 60 permissions: contents: read id-token: write diff --git a/.github/workflows/snapshot-artifact-health.yml b/.github/workflows/snapshot-artifact-health.yml index fe5f102..cfaf5fb 100644 --- a/.github/workflows/snapshot-artifact-health.yml +++ b/.github/workflows/snapshot-artifact-health.yml @@ -20,9 +20,14 @@ on: default: '1' type: string +concurrency: + group: ${{ github.workflow }}-${{ github.ref_name }} + cancel-in-progress: false + jobs: check: runs-on: ubuntu-latest + timeout-minutes: 20 strategy: fail-fast: false matrix: diff --git a/.github/workflows/update-source-input-data.yml b/.github/workflows/update-source-input-data.yml index d0ebd9a..30832e6 100644 --- a/.github/workflows/update-source-input-data.yml +++ b/.github/workflows/update-source-input-data.yml @@ -65,9 +65,14 @@ on: default: '100' type: string +concurrency: + group: ${{ github.workflow }}-${{ github.ref_name }} + cancel-in-progress: false + jobs: update-inputs: runs-on: ubuntu-latest + timeout-minutes: 20 permissions: actions: write contents: read @@ -285,6 +290,7 @@ jobs: needs: update-inputs if: ${{ failure() && github.event_name == 'schedule' && fromJSON(github.run_attempt) < 2 }} runs-on: ubuntu-latest + timeout-minutes: 20 permissions: actions: write contents: read diff --git a/scripts/post_monthly_ai_review_issue.py b/scripts/post_monthly_ai_review_issue.py index 7fe5ef5..20d6281 100644 --- a/scripts/post_monthly_ai_review_issue.py +++ b/scripts/post_monthly_ai_review_issue.py @@ -13,9 +13,17 @@ DEFAULT_API_URL = "https://api.github.com" DEFAULT_LABEL = "monthly-review" +DEFAULT_TIMEOUT_SECONDS = 30 -def github_request(method: str, url: str, token: str, payload: dict[str, Any] | None = None) -> Any: +def github_request( + method: str, + url: str, + token: str, + payload: dict[str, Any] | None = None, + *, + timeout: float = DEFAULT_TIMEOUT_SECONDS, +) -> Any: data = None headers = { "Accept": "application/vnd.github+json", @@ -27,7 +35,7 @@ def github_request(method: str, url: str, token: str, payload: dict[str, Any] | data = json.dumps(payload).encode("utf-8") headers["Content-Type"] = "application/json" request = urllib.request.Request(url, data=data, headers=headers, method=method) - with urllib.request.urlopen(request) as response: + with urllib.request.urlopen(request, timeout=timeout) as response: charset = response.headers.get_content_charset("utf-8") raw = response.read().decode(charset) return json.loads(raw) if raw else None diff --git a/src/us_equity_snapshot_pipelines/input_sources.py b/src/us_equity_snapshot_pipelines/input_sources.py index addcae0..4ca3b62 100644 --- a/src/us_equity_snapshot_pipelines/input_sources.py +++ b/src/us_equity_snapshot_pipelines/input_sources.py @@ -1,16 +1,30 @@ from __future__ import annotations import argparse +import shutil import shlex import subprocess from collections.abc import Callable from dataclasses import dataclass from pathlib import Path from urllib.parse import urlparse -from urllib.request import urlretrieve +from urllib.request import Request, urlopen SUPPORTED_TABLE_SUFFIXES = frozenset({".csv", ".json", ".jsonl", ".parquet"}) SUPPORTED_CONFIG_SUFFIXES = frozenset({".json"}) +DEFAULT_REMOTE_COPY_TIMEOUT_SECONDS = 60 +SENSITIVE_REMOTE_SOURCE_QUERY_MARKERS = ( + "access_token=", + "api_key=", + "auth=", + "jwt=", + "password=", + "secret=", + "signature=", + "token=", + "x-amz-signature=", + "x-goog-signature=", +) CopyFn = Callable[[str, Path], None] @@ -37,12 +51,23 @@ def source_needs_gcloud(source: str | None) -> bool: return is_gcs_uri(str(source or "")) +def _reject_sensitive_remote_source(source: str) -> None: + normalized = str(source or "").strip().lower() + if not (is_gcs_uri(normalized) or is_http_uri(normalized)): + return + for marker in SENSITIVE_REMOTE_SOURCE_QUERY_MARKERS: + if marker in normalized: + raise ValueError("remote input URI must not contain token/password/signature-like query parameters") + + def _default_gcs_copy(source: str, target: Path) -> None: subprocess.run(["gcloud", "storage", "cp", source, str(target)], check=True) def _default_http_copy(source: str, target: Path) -> None: - urlretrieve(source, target) # noqa: S310 - operator-supplied data source URL. + request = Request(source, headers={"User-Agent": "us-equity-snapshot-pipelines"}) + with urlopen(request, timeout=DEFAULT_REMOTE_COPY_TIMEOUT_SECONDS) as response, target.open("wb") as output: # noqa: S310 - operator-supplied data source URL. + shutil.copyfileobj(response, output) def _source_suffix(source: str, *, allowed_suffixes: frozenset[str], default_suffix: str) -> str: @@ -69,6 +94,8 @@ def resolve_input_source( output_root = Path(output_dir) output_root.mkdir(parents=True, exist_ok=True) + _reject_sensitive_remote_source(source_text) + if is_gcs_uri(source_text): target = output_root / f"{stem}{_source_suffix(source_text, allowed_suffixes=allowed_suffixes, default_suffix=default_suffix)}" (gcs_copy or _default_gcs_copy)(source_text, target) diff --git a/tests/test_input_sources.py b/tests/test_input_sources.py index 91e6245..b87b6e1 100644 --- a/tests/test_input_sources.py +++ b/tests/test_input_sources.py @@ -4,7 +4,13 @@ import pytest -from us_equity_snapshot_pipelines.input_sources import resolve_input_source, resolve_input_sources, source_needs_gcloud +from us_equity_snapshot_pipelines.input_sources import ( + DEFAULT_REMOTE_COPY_TIMEOUT_SECONDS, + _default_http_copy, + resolve_input_source, + resolve_input_sources, + source_needs_gcloud, +) def test_resolves_local_input_without_copying(tmp_path) -> None: @@ -59,3 +65,42 @@ def test_source_needs_gcloud_only_for_gcs() -> None: assert source_needs_gcloud("gs://bucket/path.csv") is True assert source_needs_gcloud("https://example.com/path.csv") is False assert source_needs_gcloud("data/prices.csv") is False + + +def test_resolve_input_source_rejects_secret_like_remote_uri(tmp_path) -> None: + with pytest.raises(ValueError, match="must not contain token"): + resolve_input_source( + "https://example.com/prices.csv?token=abc", + output_dir=tmp_path / "resolved", + stem="prices", + ) + + +def test_default_http_copy_uses_timeout_and_streams_response(monkeypatch, tmp_path) -> None: + calls: list[tuple[str, float]] = [] + + class FakeResponse: + def __init__(self) -> None: + self._payload = b"symbol,close\nQQQ,100\n" + + def __enter__(self) -> "FakeResponse": + return self + + def __exit__(self, *args: object) -> None: + return None + + def read(self, size: int = -1) -> bytes: + payload, self._payload = self._payload, b"" + return payload + + def fake_urlopen(request, *, timeout: float): + calls.append((request.full_url, timeout)) + return FakeResponse() + + monkeypatch.setattr("us_equity_snapshot_pipelines.input_sources.urlopen", fake_urlopen) + target = tmp_path / "prices.csv" + + _default_http_copy("https://example.com/prices.csv", target) + + assert target.read_text(encoding="utf-8") == "symbol,close\nQQQ,100\n" + assert calls == [("https://example.com/prices.csv", DEFAULT_REMOTE_COPY_TIMEOUT_SECONDS)] diff --git a/tests/test_post_monthly_ai_review_issue.py b/tests/test_post_monthly_ai_review_issue.py new file mode 100644 index 0000000..ebe700c --- /dev/null +++ b/tests/test_post_monthly_ai_review_issue.py @@ -0,0 +1,50 @@ +from __future__ import annotations + +import importlib.util +from pathlib import Path + + +SCRIPT = Path(__file__).resolve().parents[1] / "scripts" / "post_monthly_ai_review_issue.py" + + +def _load_script_module(): + spec = importlib.util.spec_from_file_location("post_monthly_ai_review_issue", SCRIPT) + assert spec is not None + assert spec.loader is not None + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +def test_github_request_uses_default_timeout(monkeypatch) -> None: + module = _load_script_module() + calls: list[tuple[str, str, float]] = [] + + class FakeHeaders: + def get_content_charset(self, default: str) -> str: + return default + + class FakeResponse: + headers = FakeHeaders() + + def __enter__(self) -> "FakeResponse": + return self + + def __exit__(self, *args: object) -> None: + return None + + def read(self) -> bytes: + return b'{"ok": true}' + + def fake_urlopen(request, *, timeout: float): + calls.append((request.get_method(), request.full_url, timeout)) + return FakeResponse() + + monkeypatch.setattr(module.urllib.request, "urlopen", fake_urlopen) + + result = module.github_request("GET", "https://api.github.com/repos/example/repo/issues", "token") + + assert result == {"ok": True} + assert calls == [ + ("GET", "https://api.github.com/repos/example/repo/issues", module.DEFAULT_TIMEOUT_SECONDS), + ]