From 838bb117f2830824f0f0737311f12ec84c5d9bf7 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 25 Jun 2026 07:37:19 +0000 Subject: [PATCH 1/8] Add vally eval specs and PR workflow --- .github/workflows/vally-pr-evals.yml | 30 +++++++++ .vally.yaml | 9 +++ evals/linting/eval.yaml | 62 +++++++++++++++++++ .../linting/fixtures/check-spelling/input.md | 2 + evals/linting/fixtures/lint-markdown/input.md | 4 ++ evals/security/eval.yaml | 35 +++++++++++ .../fixtures/pin-github-actions/workflow.yml | 14 +++++ 7 files changed, 156 insertions(+) create mode 100644 .github/workflows/vally-pr-evals.yml create mode 100644 .vally.yaml create mode 100644 evals/linting/eval.yaml create mode 100644 evals/linting/fixtures/check-spelling/input.md create mode 100644 evals/linting/fixtures/lint-markdown/input.md create mode 100644 evals/security/eval.yaml create mode 100644 evals/security/fixtures/pin-github-actions/workflow.yml diff --git a/.github/workflows/vally-pr-evals.yml b/.github/workflows/vally-pr-evals.yml new file mode 100644 index 0000000..08a4ac4 --- /dev/null +++ b/.github/workflows/vally-pr-evals.yml @@ -0,0 +1,30 @@ +name: Vally PR Evals + +on: + pull_request: + +jobs: + run-evals: + runs-on: ubuntu-latest + permissions: + contents: read + steps: + - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.2.2 + - uses: actions/setup-node@49933ea5288caeca8642e84d8d5f7a3f7c7bd7e8 # v4.4.0 + with: + node-version: 22 + + - name: Lint eval specs + run: npx -y @microsoft/vally-cli@0.6.0 lint --eval-spec evals + + - name: Run evals + env: + GITHUB_TOKEN: ${{ github.token }} + run: npx -y @microsoft/vally-cli@0.6.0 eval --suite pr --output-dir vally-results --junit + + - name: Upload eval results + if: always() + uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4.3.3 + with: + name: vally-results + path: vally-results diff --git a/.vally.yaml b/.vally.yaml new file mode 100644 
index 0000000..afc1f9d --- /dev/null +++ b/.vally.yaml @@ -0,0 +1,9 @@ +paths: + evals: evals + +suites: + pr: + description: Run all plugin skill evals for pull requests + evals: + - evals/linting/eval.yaml + - evals/security/eval.yaml diff --git a/evals/linting/eval.yaml b/evals/linting/eval.yaml new file mode 100644 index 0000000..2e07101 --- /dev/null +++ b/evals/linting/eval.yaml @@ -0,0 +1,62 @@ +name: linting skills evals +version: 1 +description: Evaluates the linting plugin skills against representative file fixes + +defaults: + runs: 1 + timeout: 5m + +stimuli: + - name: check-spelling-fixes-typos-and-updates-dictionary + prompt: | + Use the check-spelling skill to fix spelling issues in changed files. + Ensure the obvious typo "teh" is corrected to "the" in docs/spelling.md. + Create cspell.json in the repository root and add "copilotcliword" under words so it is treated as a valid project term. + environment: + skills: + - ../../plugins/linting/skills/check-spelling + files: + - src: ./fixtures/check-spelling/input.md + dest: docs/spelling.md + constraints: + expect_skills: + - check-spelling + graders: + - type: file-exists + config: + path: cspell.json + - type: file-matches + config: + path: cspell.json + pattern: '"words"\\s*:\\s*\\[[^\\]]*"copilotcliword"' + - type: file-not-matches + config: + path: docs/spelling.md + pattern: '(?i)\bteh\b' + - type: file-matches + config: + path: docs/spelling.md + pattern: '(?i)\\bthe\\b' + + - name: lint-markdown-fixes-bullet-style + prompt: | + Use the lint-markdown skill to fix markdownlint issues in changed markdown files. + In docs/bullets.md, fix lint issues caused by inconsistent bullet characters. 
+ environment: + skills: + - ../../plugins/linting/skills/lint-markdown + files: + - src: ./fixtures/lint-markdown/input.md + dest: docs/bullets.md + constraints: + expect_skills: + - lint-markdown + graders: + - type: file-not-matches + config: + path: docs/bullets.md + pattern: '(?m)^\* ' + - type: file-matches + config: + path: docs/bullets.md + pattern: '(?m)^- ' diff --git a/evals/linting/fixtures/check-spelling/input.md b/evals/linting/fixtures/check-spelling/input.md new file mode 100644 index 0000000..de1b668 --- /dev/null +++ b/evals/linting/fixtures/check-spelling/input.md @@ -0,0 +1,2 @@ +This sentence contains teh typo that should be corrected. +The project uses copilotcliword in docs and it should be treated as valid. diff --git a/evals/linting/fixtures/lint-markdown/input.md b/evals/linting/fixtures/lint-markdown/input.md new file mode 100644 index 0000000..aa46978 --- /dev/null +++ b/evals/linting/fixtures/lint-markdown/input.md @@ -0,0 +1,4 @@ +# Bullet styles + +- first item +* second item that uses the wrong bullet style diff --git a/evals/security/eval.yaml b/evals/security/eval.yaml new file mode 100644 index 0000000..ac03cbe --- /dev/null +++ b/evals/security/eval.yaml @@ -0,0 +1,35 @@ +name: security skills evals +version: 1 +description: Evaluates the security plugin skills against representative workflow hardening tasks + +defaults: + runs: 1 + timeout: 5m + +stimuli: + - name: pin-github-actions-to-shas + prompt: | + Use the pin-github-actions skill to update .github/workflows/ci.yml. + Pin each versioned GitHub Action to a commit SHA and append the resolved version tag as a trailing comment. 
+ environment: + skills: + - ../../plugins/security/skills/pin-github-actions + files: + - src: ./fixtures/pin-github-actions/workflow.yml + dest: .github/workflows/ci.yml + constraints: + expect_skills: + - pin-github-actions + graders: + - type: file-matches + config: + path: .github/workflows/ci.yml + pattern: '(?m)actions/checkout@[0-9a-f]{40} # v[0-9]+\\.[0-9]+\\.[0-9]+' + - type: file-matches + config: + path: .github/workflows/ci.yml + pattern: '(?m)actions/setup-node@[0-9a-f]{40} # v[0-9]+\\.[0-9]+\\.[0-9]+' + - type: file-not-matches + config: + path: .github/workflows/ci.yml + pattern: '(?m)uses:\s+[^\n]+@v[0-9]+' diff --git a/evals/security/fixtures/pin-github-actions/workflow.yml b/evals/security/fixtures/pin-github-actions/workflow.yml new file mode 100644 index 0000000..792c940 --- /dev/null +++ b/evals/security/fixtures/pin-github-actions/workflow.yml @@ -0,0 +1,14 @@ +name: CI + +on: + pull_request: + +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-node@v4 + with: + node-version: 22 + - run: npm test From 9b0c1b87d378c26347f154aa23ee112e3ef1a210 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 25 Jun 2026 07:49:31 +0000 Subject: [PATCH 2/8] Use vague prompts in Vally eval stimuli --- evals/linting/eval.yaml | 4 ++-- evals/security/eval.yaml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/evals/linting/eval.yaml b/evals/linting/eval.yaml index 2e07101..9525aaa 100644 --- a/evals/linting/eval.yaml +++ b/evals/linting/eval.yaml @@ -9,7 +9,7 @@ defaults: stimuli: - name: check-spelling-fixes-typos-and-updates-dictionary prompt: | - Use the check-spelling skill to fix spelling issues in changed files. + Check spelling in changed files and fix any issues you find. Ensure the obvious typo "teh" is corrected to "the" in docs/spelling.md. 
Create cspell.json in the repository root and add "copilotcliword" under words so it is treated as a valid project term. environment: @@ -40,7 +40,7 @@ stimuli: - name: lint-markdown-fixes-bullet-style prompt: | - Use the lint-markdown skill to fix markdownlint issues in changed markdown files. + Lint changed markdown files and fix markdownlint issues. In docs/bullets.md, fix lint issues caused by inconsistent bullet characters. environment: skills: diff --git a/evals/security/eval.yaml b/evals/security/eval.yaml index ac03cbe..47a6193 100644 --- a/evals/security/eval.yaml +++ b/evals/security/eval.yaml @@ -9,7 +9,7 @@ defaults: stimuli: - name: pin-github-actions-to-shas prompt: | - Use the pin-github-actions skill to update .github/workflows/ci.yml. + Update .github/workflows/ci.yml to harden GitHub Actions usage. Pin each versioned GitHub Action to a commit SHA and append the resolved version tag as a trailing comment. environment: skills: From 3a2b9d9db34aa3932258906967a0ec03266ff9ca Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 25 Jun 2026 08:03:33 +0000 Subject: [PATCH 3/8] Use vaguer Vally eval prompts --- evals/linting/eval.yaml | 7 ++----- evals/security/eval.yaml | 3 +-- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/evals/linting/eval.yaml b/evals/linting/eval.yaml index 9525aaa..9291bcc 100644 --- a/evals/linting/eval.yaml +++ b/evals/linting/eval.yaml @@ -9,9 +9,7 @@ defaults: stimuli: - name: check-spelling-fixes-typos-and-updates-dictionary prompt: | - Check spelling in changed files and fix any issues you find. - Ensure the obvious typo "teh" is corrected to "the" in docs/spelling.md. - Create cspell.json in the repository root and add "copilotcliword" under words so it is treated as a valid project term. + Check spelling in docs/spelling.md and fix any issues you find. 
environment: skills: - ../../plugins/linting/skills/check-spelling @@ -40,8 +38,7 @@ stimuli: - name: lint-markdown-fixes-bullet-style prompt: | - Lint changed markdown files and fix markdownlint issues. - In docs/bullets.md, fix lint issues caused by inconsistent bullet characters. + Lint docs/bullets.md and fix any markdown issues you find. environment: skills: - ../../plugins/linting/skills/lint-markdown diff --git a/evals/security/eval.yaml b/evals/security/eval.yaml index 47a6193..142a73b 100644 --- a/evals/security/eval.yaml +++ b/evals/security/eval.yaml @@ -9,8 +9,7 @@ defaults: stimuli: - name: pin-github-actions-to-shas prompt: | - Update .github/workflows/ci.yml to harden GitHub Actions usage. - Pin each versioned GitHub Action to a commit SHA and append the resolved version tag as a trailing comment. + Review .github/workflows/ci.yml and harden any insecure GitHub Actions references you find. environment: skills: - ../../plugins/security/skills/pin-github-actions From a5127a4d98dcff90240bd15f0cb66d08b6d8a075 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 25 Jun 2026 08:14:21 +0000 Subject: [PATCH 4/8] Tighten Vally eval graders --- evals/linting/eval.yaml | 14 +++++++++----- evals/security/eval.yaml | 4 ++-- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/evals/linting/eval.yaml b/evals/linting/eval.yaml index 9291bcc..08b91bf 100644 --- a/evals/linting/eval.yaml +++ b/evals/linting/eval.yaml @@ -22,11 +22,11 @@ stimuli: graders: - type: file-exists config: - path: cspell.json + path: .cspell.json - type: file-matches config: - path: cspell.json - pattern: '"words"\\s*:\\s*\\[[^\\]]*"copilotcliword"' + path: .cspell.json + pattern: 'copilotcliword' - type: file-not-matches config: path: docs/spelling.md @@ -34,7 +34,7 @@ stimuli: - type: file-matches config: path: docs/spelling.md - pattern: '(?i)\\bthe\\b' + pattern: 'contains the typo that should be corrected[.]' - name: 
lint-markdown-fixes-bullet-style prompt: | @@ -56,4 +56,8 @@ stimuli: - type: file-matches config: path: docs/bullets.md - pattern: '(?m)^- ' + pattern: '(?m)^- first item$' + - type: file-matches + config: + path: docs/bullets.md + pattern: '(?m)^- second item that uses the wrong bullet style$' diff --git a/evals/security/eval.yaml b/evals/security/eval.yaml index 142a73b..a951727 100644 --- a/evals/security/eval.yaml +++ b/evals/security/eval.yaml @@ -23,11 +23,11 @@ stimuli: - type: file-matches config: path: .github/workflows/ci.yml - pattern: '(?m)actions/checkout@[0-9a-f]{40} # v[0-9]+\\.[0-9]+\\.[0-9]+' + pattern: '(?m)^ *- uses: actions/checkout@[0-9a-f]{40} # v[0-9]+[.][0-9]+[.][0-9]+$' - type: file-matches config: path: .github/workflows/ci.yml - pattern: '(?m)actions/setup-node@[0-9a-f]{40} # v[0-9]+\\.[0-9]+\\.[0-9]+' + pattern: '(?m)^ *- uses: actions/setup-node@[0-9a-f]{40} # v[0-9]+[.][0-9]+[.][0-9]+$' - type: file-not-matches config: path: .github/workflows/ci.yml From 4a6d841f9684c922db8a2cb71e29d2a62e240e2a Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 25 Jun 2026 08:26:00 +0000 Subject: [PATCH 5/8] Fix eval workflow action pins --- .github/workflows/{vally-pr-evals.yml => eval.yml} | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) rename .github/workflows/{vally-pr-evals.yml => eval.yml} (78%) diff --git a/.github/workflows/vally-pr-evals.yml b/.github/workflows/eval.yml similarity index 78% rename from .github/workflows/vally-pr-evals.yml rename to .github/workflows/eval.yml index 08a4ac4..092b608 100644 --- a/.github/workflows/vally-pr-evals.yml +++ b/.github/workflows/eval.yml @@ -1,4 +1,4 @@ -name: Vally PR Evals +name: eval on: pull_request: @@ -9,8 +9,8 @@ jobs: permissions: contents: read steps: - - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.2.2 - - uses: actions/setup-node@49933ea5288caeca8642e84d8d5f7a3f7c7bd7e8 # v4.4.0 + - uses: 
actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1 + - uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4.4.0 with: node-version: 22 From e9cfcc230b3fe7016ed80db8be4af76e1336dcad Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 25 Jun 2026 08:34:38 +0000 Subject: [PATCH 6/8] Pass GitHub token to eval workflow --- .github/workflows/eval.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/eval.yml b/.github/workflows/eval.yml index 092b608..b5a0c2f 100644 --- a/.github/workflows/eval.yml +++ b/.github/workflows/eval.yml @@ -4,10 +4,8 @@ on: pull_request: jobs: - run-evals: + eval: runs-on: ubuntu-latest - permissions: - contents: read steps: - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1 - uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4.4.0 @@ -19,7 +17,7 @@ jobs: - name: Run evals env: - GITHUB_TOKEN: ${{ github.token }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: npx -y @microsoft/vally-cli@0.6.0 eval --suite pr --output-dir vally-results --junit - name: Upload eval results From 64662b5ff60ba96bc7399ac4b24cb0aa8d8d3598 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 25 Jun 2026 18:55:34 +0000 Subject: [PATCH 7/8] Update eval workflow with latest actions and global tool installs --- .github/workflows/eval.yml | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/.github/workflows/eval.yml b/.github/workflows/eval.yml index b5a0c2f..ffefb2e 100644 --- a/.github/workflows/eval.yml +++ b/.github/workflows/eval.yml @@ -7,22 +7,27 @@ jobs: eval: runs-on: ubuntu-latest steps: - - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1 - - uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4.4.0 + - uses: 
actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0 + - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 with: node-version: 22 + - name: Install tools + run: | + npm install -g @microsoft/vally-cli@0.6.0 + npm install -g @github/copilot@1.0.65 + - name: Lint eval specs - run: npx -y @microsoft/vally-cli@0.6.0 lint --eval-spec evals + run: vally lint --eval-spec evals - name: Run evals env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: npx -y @microsoft/vally-cli@0.6.0 eval --suite pr --output-dir vally-results --junit + COPILOT_GITHUB_TOKEN: ${{ secrets.COPILOT_GITHUB_TOKEN }} + run: vally eval --suite pr --output-dir vally-results --junit - name: Upload eval results if: always() - uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4.3.3 + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: vally-results path: vally-results From 68ec8c45fcd932f3cde80ce37f48e8e671915008 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 25 Jun 2026 19:04:28 +0000 Subject: [PATCH 8/8] Split tool install into separate steps --- .github/workflows/eval.yml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/eval.yml b/.github/workflows/eval.yml index ffefb2e..3915feb 100644 --- a/.github/workflows/eval.yml +++ b/.github/workflows/eval.yml @@ -12,10 +12,11 @@ jobs: with: node-version: 22 - - name: Install tools - run: | - npm install -g @microsoft/vally-cli@0.6.0 - npm install -g @github/copilot@1.0.65 + - name: Install vally + run: npm install -g @microsoft/vally-cli@0.6.0 + + - name: Install copilot CLI + run: npm install -g @github/copilot@1.0.65 - name: Lint eval specs run: vally lint --eval-spec evals