Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions .github/workflows/eval.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Lints the vally eval specs and runs the `pr` eval suite on every pull request,
# then uploads the results directory as a build artifact.
name: eval

on:
  pull_request:

# Least-privilege GITHUB_TOKEN: the job only needs to read the repository
# contents for checkout; nothing here writes back to the repo or the PR.
permissions:
  contents: read

jobs:
  eval:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
      - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
        with:
          node-version: 22

      # Both CLIs are installed globally at exact, pinned versions.
      - name: Install vally
        run: npm install -g @microsoft/vally-cli@0.6.0

      - name: Install copilot CLI
        run: npm install -g @github/copilot@1.0.65

      # Validate the specs under `evals` before spending time on a real run.
      - name: Lint eval specs
        run: vally lint --eval-spec evals

      # Runs the `pr` suite and writes results (including JUnit output) to
      # `vally-results`.
      # NOTE: repository secrets are not passed to `pull_request` runs that
      # originate from forks, so COPILOT_GITHUB_TOKEN will be empty there.
      - name: Run evals
        env:
          COPILOT_GITHUB_TOKEN: ${{ secrets.COPILOT_GITHUB_TOKEN }}
        run: vally eval --suite pr --output-dir vally-results --junit

      # `always()` keeps the results available even when an earlier step failed.
      - name: Upload eval results
        if: always()
        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
        with:
          name: vally-results
          path: vally-results
9 changes: 9 additions & 0 deletions .vally.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# vally project configuration.
paths:
  # presumably the directory that holds the eval specs — confirm against the
  # vally configuration reference
  evals: evals

suites:
  # Suite named `pr`; lists the eval spec files that belong to it.
  pr:
    description: 'Run all plugin skill evals for pull requests'
    evals:
      - evals/linting/eval.yaml
      - evals/security/eval.yaml
63 changes: 63 additions & 0 deletions evals/linting/eval.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Eval spec for the linting plugin skills (check-spelling and lint-markdown).
# NOTE(review): key nesting follows the usual stimulus/environment/graders
# layout — confirm against the vally eval-spec schema.
#
# Regex patterns are written as single-quoted YAML scalars, where a backslash
# is a literal character. Regex escapes therefore use ONE backslash (`\b`,
# `\*`); doubling them would make the pattern look for a literal backslash.
name: linting skills evals
version: 1
description: Evaluates the linting plugin skills against representative file fixes

defaults:
  runs: 1
  timeout: 5m

stimuli:
  # Stimulus 1: the fixture contains the typo "teh" and the project-specific
  # word "copilotcliword".
  - name: check-spelling-fixes-typos-and-updates-dictionary
    prompt: |
      Check spelling in docs/spelling.md and fix any issues you find.
    environment:
      skills:
        - ../../plugins/linting/skills/check-spelling
      files:
        - src: ./fixtures/check-spelling/input.md
          dest: docs/spelling.md
    constraints:
      expect_skills:
        - check-spelling
    graders:
      # A cspell configuration file must exist after the run ...
      - type: file-exists
        config:
          path: .cspell.json
      # ... and it must mention the project-specific word.
      - type: file-matches
        config:
          path: .cspell.json
          pattern: 'copilotcliword'
      # The typo must be gone: "teh" as a whole word, in any letter case.
      - type: file-not-matches
        config:
          path: docs/spelling.md
          pattern: '(?i)\bteh\b'
      # The sentence must now read "... contains the typo ...".
      - type: file-matches
        config:
          path: docs/spelling.md
          pattern: 'contains the typo that should be corrected[.]'

  # Stimulus 2: the fixture mixes `-` and `*` bullets in one list.
  - name: lint-markdown-fixes-bullet-style
    prompt: |
      Lint docs/bullets.md and fix any markdown issues you find.
    environment:
      skills:
        - ../../plugins/linting/skills/lint-markdown
      files:
        - src: ./fixtures/lint-markdown/input.md
          dest: docs/bullets.md
    constraints:
      expect_skills:
        - lint-markdown
    graders:
      # No line may still start with an asterisk bullet (`* `).
      - type: file-not-matches
        config:
          path: docs/bullets.md
          pattern: '(?m)^\* '
      # Both items must be present as dash bullets.
      - type: file-matches
        config:
          path: docs/bullets.md
          pattern: '(?m)^- first item$'
      - type: file-matches
        config:
          path: docs/bullets.md
          pattern: '(?m)^- second item that uses the wrong bullet style$'
2 changes: 2 additions & 0 deletions evals/linting/fixtures/check-spelling/input.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
This sentence contains teh typo that should be corrected.
The project uses copilotcliword in docs and it should be treated as valid.
4 changes: 4 additions & 0 deletions evals/linting/fixtures/lint-markdown/input.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Bullet styles

- first item
* second item that uses the wrong bullet style
34 changes: 34 additions & 0 deletions evals/security/eval.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Eval spec for the security plugin skills (pin-github-actions).
# NOTE(review): key nesting follows the usual stimulus/environment/graders
# layout — confirm against the vally eval-spec schema.
#
# Regex patterns are written as single-quoted YAML scalars, where a backslash
# is a literal character. Regex escapes therefore use ONE backslash (`\s`,
# `\n`); doubling them would make the pattern look for a literal backslash.
name: security skills evals
version: 1
description: Evaluates the security plugin skills against representative workflow hardening tasks

defaults:
  runs: 1
  timeout: 5m

stimuli:
  # The fixture workflow references actions/checkout and actions/setup-node by
  # version tag (`@v4`).
  - name: pin-github-actions-to-shas
    prompt: |
      Review .github/workflows/ci.yml and harden any insecure GitHub Actions references you find.
    environment:
      skills:
        - ../../plugins/security/skills/pin-github-actions
      files:
        - src: ./fixtures/pin-github-actions/workflow.yml
          dest: .github/workflows/ci.yml
    constraints:
      expect_skills:
        - pin-github-actions
    graders:
      # Each action must be pinned to a 40-hex-character commit SHA followed
      # by a `# vX.Y.Z` version comment.
      - type: file-matches
        config:
          path: .github/workflows/ci.yml
          pattern: '(?m)^ *- uses: actions/checkout@[0-9a-f]{40} # v[0-9]+[.][0-9]+[.][0-9]+$'
      - type: file-matches
        config:
          path: .github/workflows/ci.yml
          pattern: '(?m)^ *- uses: actions/setup-node@[0-9a-f]{40} # v[0-9]+[.][0-9]+[.][0-9]+$'
      # No `uses:` reference may still point at a `@v<number>` tag.
      - type: file-not-matches
        config:
          path: .github/workflows/ci.yml
          pattern: '(?m)uses:\s+[^\n]+@v[0-9]+'
14 changes: 14 additions & 0 deletions evals/security/fixtures/pin-github-actions/workflow.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Minimal CI workflow: check out the repository, set up Node.js 22, run the tests.
name: CI

on:
  pull_request:

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-node@v4
        with:
          node-version: 22
      - run: npm test