diff --git a/.github/workflows/eval.yml b/.github/workflows/eval.yml
new file mode 100644
index 0000000..3915feb
--- /dev/null
+++ b/.github/workflows/eval.yml
@@ -0,0 +1,40 @@
+# Runs the "pr" eval suite on every pull request and uploads the results.
+name: eval
+
+on:
+  pull_request:
+
+# Least privilege: the job only needs to read the repository contents.
+permissions:
+  contents: read
+
+jobs:
+  eval:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
+      - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
+        with:
+          node-version: 22
+
+      - name: Install vally
+        run: npm install -g @microsoft/vally-cli@0.6.0
+
+      - name: Install copilot CLI
+        run: npm install -g @github/copilot@1.0.65
+
+      - name: Lint eval specs
+        run: vally lint --eval-spec evals
+
+      - name: Run evals
+        env:
+          COPILOT_GITHUB_TOKEN: ${{ secrets.COPILOT_GITHUB_TOKEN }}
+        run: vally eval --suite pr --output-dir vally-results --junit
+
+      # Upload even when an earlier step failed so the results can be inspected.
+      - name: Upload eval results
+        if: always()
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+        with:
+          name: vally-results
+          path: vally-results
diff --git a/.vally.yaml b/.vally.yaml
new file mode 100644
index 0000000..afc1f9d
--- /dev/null
+++ b/.vally.yaml
@@ -0,0 +1,9 @@
+paths:
+  evals: evals
+
+suites:
+  pr:
+    description: Run all plugin skill evals for pull requests
+    evals:
+      - evals/linting/eval.yaml
+      - evals/security/eval.yaml
diff --git a/evals/linting/eval.yaml b/evals/linting/eval.yaml
new file mode 100644
index 0000000..08b91bf
--- /dev/null
+++ b/evals/linting/eval.yaml
@@ -0,0 +1,67 @@
+name: linting skills evals
+version: 1
+description: Evaluates the linting plugin skills against representative file fixes
+
+defaults:
+  runs: 1
+  timeout: 5m
+
+# Grader patterns are single-quoted YAML scalars, so backslashes are passed
+# through literally: write a regex escape once (\b), never doubled (\\b).
+stimuli:
+  - name: check-spelling-fixes-typos-and-updates-dictionary
+    prompt: |
+      Check spelling in docs/spelling.md and fix any issues you find.
+    environment:
+      skills:
+        - ../../plugins/linting/skills/check-spelling
+      files:
+        - src: ./fixtures/check-spelling/input.md
+          dest: docs/spelling.md
+    constraints:
+      expect_skills:
+        - check-spelling
+    graders:
+      - type: file-exists
+        config:
+          path: .cspell.json
+      - type: file-matches
+        config:
+          path: .cspell.json
+          pattern: 'copilotcliword'
+      # The misspelled word "teh" must no longer appear as a whole word.
+      - type: file-not-matches
+        config:
+          path: docs/spelling.md
+          pattern: '(?i)\bteh\b'
+      - type: file-matches
+        config:
+          path: docs/spelling.md
+          pattern: 'contains the typo that should be corrected[.]'
+
+  - name: lint-markdown-fixes-bullet-style
+    prompt: |
+      Lint docs/bullets.md and fix any markdown issues you find.
+    environment:
+      skills:
+        - ../../plugins/linting/skills/lint-markdown
+      files:
+        - src: ./fixtures/lint-markdown/input.md
+          dest: docs/bullets.md
+    constraints:
+      expect_skills:
+        - lint-markdown
+    graders:
+      # No line may still start with an asterisk bullet.
+      - type: file-not-matches
+        config:
+          path: docs/bullets.md
+          pattern: '(?m)^\* '
+      - type: file-matches
+        config:
+          path: docs/bullets.md
+          pattern: '(?m)^- first item$'
+      - type: file-matches
+        config:
+          path: docs/bullets.md
+          pattern: '(?m)^- second item that uses the wrong bullet style$'
diff --git a/evals/linting/fixtures/check-spelling/input.md b/evals/linting/fixtures/check-spelling/input.md
new file mode 100644
index 0000000..de1b668
--- /dev/null
+++ b/evals/linting/fixtures/check-spelling/input.md
@@ -0,0 +1,2 @@
+This sentence contains teh typo that should be corrected.
+The project uses copilotcliword in docs and it should be treated as valid.
diff --git a/evals/linting/fixtures/lint-markdown/input.md b/evals/linting/fixtures/lint-markdown/input.md
new file mode 100644
index 0000000..aa46978
--- /dev/null
+++ b/evals/linting/fixtures/lint-markdown/input.md
@@ -0,0 +1,4 @@
+# Bullet styles
+
+- first item
+* second item that uses the wrong bullet style
diff --git a/evals/security/eval.yaml b/evals/security/eval.yaml
new file mode 100644
index 0000000..a951727
--- /dev/null
+++ b/evals/security/eval.yaml
@@ -0,0 +1,37 @@
+name: security skills evals
+version: 1
+description: Evaluates the security plugin skills against representative workflow hardening tasks
+
+defaults:
+  runs: 1
+  timeout: 5m
+
+# Grader patterns are single-quoted YAML scalars, so backslashes are passed
+# through literally: write a regex escape once (\s), never doubled (\\s).
+stimuli:
+  - name: pin-github-actions-to-shas
+    prompt: |
+      Review .github/workflows/ci.yml and harden any insecure GitHub Actions references you find.
+    environment:
+      skills:
+        - ../../plugins/security/skills/pin-github-actions
+      files:
+        - src: ./fixtures/pin-github-actions/workflow.yml
+          dest: .github/workflows/ci.yml
+    constraints:
+      expect_skills:
+        - pin-github-actions
+    graders:
+      - type: file-matches
+        config:
+          path: .github/workflows/ci.yml
+          pattern: '(?m)^ *- uses: actions/checkout@[0-9a-f]{40} # v[0-9]+[.][0-9]+[.][0-9]+$'
+      - type: file-matches
+        config:
+          path: .github/workflows/ci.yml
+          pattern: '(?m)^ *- uses: actions/setup-node@[0-9a-f]{40} # v[0-9]+[.][0-9]+[.][0-9]+$'
+      # No action reference may remain pinned to a mutable version tag (@vN).
+      - type: file-not-matches
+        config:
+          path: .github/workflows/ci.yml
+          pattern: '(?m)uses:\s+[^\n]+@v[0-9]+'
diff --git a/evals/security/fixtures/pin-github-actions/workflow.yml b/evals/security/fixtures/pin-github-actions/workflow.yml
new file mode 100644
index 0000000..792c940
--- /dev/null
+++ b/evals/security/fixtures/pin-github-actions/workflow.yml
@@ -0,0 +1,14 @@
+name: CI
+
+on:
+  pull_request:
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-node@v4
+        with:
+          node-version: 22
+      - run: npm test