diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json
index dc326cff..3a337c31 100644
--- a/.claude-plugin/marketplace.json
+++ b/.claude-plugin/marketplace.json
@@ -146,9 +146,9 @@
       ]
     },
     {
-      "name": "devflow-frontend-design",
-      "source": "./plugins/devflow-frontend-design",
-      "description": "Frontend design patterns - typography, color systems, spacing, motion, responsive design",
+      "name": "devflow-ui-design",
+      "source": "./plugins/devflow-ui-design",
+      "description": "UI design patterns - typography, color systems, spacing, motion, responsive design",
       "version": "1.8.3",
       "keywords": [
         "design",
diff --git a/.gitignore b/.gitignore
index 890ea114..0b043f4f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -20,7 +20,8 @@ plugins/*/agents/simplifier.md
 plugins/*/agents/coder.md
 plugins/*/agents/reviewer.md
 plugins/*/agents/resolver.md
-plugins/*/agents/shepherd.md
+plugins/*/agents/evaluator.md
+plugins/*/agents/tester.md
 plugins/*/agents/scrutinizer.md
 plugins/*/agents/validator.md
 npm-debug.log*
diff --git a/CLAUDE.md b/CLAUDE.md
index da23594c..4adc871c 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -28,7 +28,7 @@ Plugin marketplace with 17 plugins (8 core + 9 optional language/ecosystem), eac
 | `devflow-typescript` | TypeScript language patterns (optional) | No |
 | `devflow-react` | React framework patterns (optional) | No |
 | `devflow-accessibility` | Web accessibility patterns (optional) | No |
-| `devflow-frontend-design` | Frontend design patterns (optional) | No |
+| `devflow-ui-design` | UI design patterns (optional) | No |
 | `devflow-go` | Go language patterns (optional) | No |
 | `devflow-python` | Python language patterns (optional) | No |
 | `devflow-java` | Java language patterns (optional) | No |
@@ -50,8 +50,8 @@ Commands with Teams Variant ship as `{name}.md` (parallel subagents) and `{name}
 
 ```
 devflow/
-├── shared/skills/          # 37 skills (single source of truth)
-├── shared/agents/          # 10 shared agents (single source of truth)
+├── shared/skills/          # 38 skills (single source of truth)
+├── shared/agents/          # 11 shared agents (single source of truth)
 ├── plugins/devflow-*/      # 17 plugins (8 core + 9 optional language/ecosystem)
 ├── docs/reference/         # Detailed reference documentation
 ├── scripts/                # Helper scripts (statusline, docs-helpers)
@@ -130,20 +130,20 @@ Working memory files live in a dedicated `.memory/` directory:
 
 **Universal Skill Installation**: All skills from all plugins are always installed, regardless of plugin selection. Skills are tiny markdown files installed as `~/.claude/skills/devflow:{name}/` (namespaced to avoid collisions with other plugin ecosystems). Source directories in `shared/skills/` stay unprefixed — the `devflow:` prefix is applied at install-time only. Shadow overrides live at `~/.devflow/skills/{name}/` (unprefixed); when shadowed, the installer copies the user's version to the prefixed install target. Only commands and agents remain plugin-specific.
 
-**Model Strategy**: Explicit model assignments in agent frontmatter override the user's session model. Opus for analysis agents (reviewer, scrutinizer, shepherd), Sonnet for execution agents (coder, simplifier, resolver, skimmer), Haiku for I/O agents (git, synthesizer, validator).
+**Model Strategy**: Explicit model assignments in agent frontmatter override the user's session model. Opus for analysis agents (reviewer, scrutinizer, evaluator), Sonnet for execution agents (coder, simplifier, resolver, skimmer, tester), Haiku for I/O agents (git, synthesizer, validator).
 
 ## Agent & Command Roster
 
 **Orchestration commands** (spawn agents, never do agent work in main session):
 - `/specify` — Skimmer + Explore + Synthesizer + Plan + Synthesizer → GitHub issue
-- `/implement` — Git + Skimmer + Explore + Synthesizer + Plan + Synthesizer + Coder + Simplifier + Scrutinizer + Shepherd → PR
+- `/implement` — Git + Skimmer + Explore + Synthesizer + Plan + Synthesizer + Coder + Validator + Simplifier + Scrutinizer + Evaluator + Tester → PR
 - `/code-review` — 7-11 Reviewer agents + Git + Synthesizer
 - `/resolve` — N Resolver agents + Git
 - `/debug` — Agent Teams competing hypotheses
 - `/self-review` — Simplifier then Scrutinizer (sequential)
 - `/audit-claude` — CLAUDE.md audit (optional plugin)
 
-**Shared agents** (10): git, synthesizer, skimmer, simplifier, coder, reviewer, resolver, shepherd, scrutinizer, validator
+**Shared agents** (11): git, synthesizer, skimmer, simplifier, coder, reviewer, resolver, evaluator, tester, scrutinizer, validator
 
 **Plugin-specific agents** (1): claude-md-auditor
 
diff --git a/README.md b/README.md
index 384b40f0..79ec8bcf 100644
--- a/README.md
+++ b/README.md
@@ -29,7 +29,8 @@ DevFlow: Ambient: IMPLEMENT/ORCHESTRATED
          → Validator: build ✓ typecheck ✓ lint ✓ tests ✓
          → Simplifier: cleaned up 3 files
          → Scrutinizer: 9-pillar quality check passed
-         → Shepherd: implementation matches request ✓
+         → Evaluator: implementation matches request ✓
+         → Tester: 5/5 QA scenarios passed ✓
 ```
 
 ```
@@ -47,7 +48,7 @@ DevFlow: Ambient: IMPLEMENT/ORCHESTRATED
 
 **18 parallel code reviewers.** Security, architecture, performance, complexity, consistency, regression, testing, and more. Each produces findings with severity, confidence scoring, and concrete fixes. Conditional reviewers activate when relevant (TypeScript for `.ts` files, database for schema changes). Every finding gets validated and resolved automatically.
 
-**34 skills grounded in expert material.** Every skill is backed by peer-reviewed papers, canonical books, and industry standards — security (OWASP, Shostack), architecture (Parnas, Evans, Fowler), performance (Brendan Gregg), testing (Beck, Meszaros), design (Wlaschin, Hickey). 200+ sources total.
+**38 skills grounded in expert material.** Every skill is backed by peer-reviewed papers, canonical books, and industry standards — security (OWASP, Shostack), architecture (Parnas, Evans, Fowler), performance (Brendan Gregg), testing (Beck, Meszaros), design (Wlaschin, Hickey). 200+ sources total.
 
 **Skill shadowing.** Override any built-in skill with your own version. Drop a file into `~/.devflow/skills/{name}/` and the installer uses yours instead of the default — same activation, your rules.
 
@@ -60,7 +61,7 @@ DevFlow: Ambient: IMPLEMENT/ORCHESTRATED
 ```
 devflow · feat/auth-middleware* · 3↑ · v1.8.3 +5 · 12 files · +234 -56
 Current Session ████░░░░ 42% · Session 5h ██░░░░░░ 18% · 7d █░░░░░░░ 8%
-Opus 4.6 [1m] · 23m · $1.24 · 2 CLAUDE.md · 4 MCPs · 8 hooks · 34 skills
+Opus 4.6 [1m] · 23m · $1.24 · 2 CLAUDE.md · 4 MCPs · 8 hooks · 38 skills
 ```
 
 **Security.** Deny lists block dangerous tool patterns out of the box — configurable during init.
@@ -88,7 +89,7 @@ See [docs/commands.md](docs/commands.md) for detailed usage.
 
 ## Language Support
 
-Optional plugins add language-specific patterns for TypeScript, React, Go, Python, Java, Rust, accessibility, and frontend design.
+Optional plugins add language- and ecosystem-specific patterns for TypeScript, React, Go, Python, Java, Rust, accessibility, and UI design.
 
 ```bash
 npx devflow-kit init --plugin=typescript,react
diff --git a/docs/cli-reference.md b/docs/cli-reference.md
index 39f5cb95..1b57b91e 100644
--- a/docs/cli-reference.md
+++ b/docs/cli-reference.md
@@ -54,7 +54,7 @@ npx devflow-kit init --plugin=implement,code-review  # Install multiple
 | `devflow-typescript` | Language | TypeScript patterns |
 | `devflow-react` | Language | React patterns |
 | `devflow-accessibility` | Language | Web accessibility patterns |
-| `devflow-frontend-design` | Language | Frontend design patterns |
+| `devflow-ui-design` | Language | UI design patterns |
 | `devflow-go` | Language | Go patterns |
 | `devflow-python` | Language | Python patterns |
 | `devflow-java` | Language | Java patterns |
diff --git a/docs/commands.md b/docs/commands.md
index 47d28a58..8c2b6d10 100644
--- a/docs/commands.md
+++ b/docs/commands.md
@@ -26,7 +26,8 @@ Executes a single task through the complete development lifecycle:
 4. **Implementation** — Write code on the feature branch
 5. **Validation** — Build, typecheck, lint, and test
 6. **Refinement** — Simplifier (code clarity) + Scrutinizer (9-pillar quality)
-7. **Alignment** — Shepherd verifies implementation matches the original request
+7. **Alignment** — Evaluator verifies implementation matches the original request
+8. **QA Testing** — Tester executes scenario-based acceptance tests
 
 Creates a PR when complete.
 
diff --git a/docs/reference/file-organization.md b/docs/reference/file-organization.md
index e98fc9ba..a0a90d27 100644
--- a/docs/reference/file-organization.md
+++ b/docs/reference/file-organization.md
@@ -9,13 +9,13 @@ devflow/
 ├── .claude-plugin/                   # Marketplace registry (repo root)
 │   └── marketplace.json
 ├── shared/
-│   ├── skills/                       # SINGLE SOURCE OF TRUTH (37 skills)
+│   ├── skills/                       # SINGLE SOURCE OF TRUTH (38 skills)
 │   │   ├── git/
 │   │   │   ├── SKILL.md
 │   │   │   └── references/
 │   │   ├── software-design/
 │   │   └── ...
-│   └── agents/                       # SINGLE SOURCE OF TRUTH (10 shared agents)
+│   └── agents/                       # SINGLE SOURCE OF TRUTH (11 shared agents)
 │       ├── git.md
 │       ├── synthesizer.md
 │       ├── coder.md
@@ -135,7 +135,7 @@ Skills and agents are **not duplicated** in git. Instead:
 
 ### Shared vs Plugin-Specific Agents
 
-- **Shared** (10): `git`, `synthesizer`, `skimmer`, `simplifier`, `coder`, `reviewer`, `resolver`, `shepherd`, `scrutinizer`, `validator`
+- **Shared** (11): `git`, `synthesizer`, `skimmer`, `simplifier`, `coder`, `reviewer`, `resolver`, `evaluator`, `tester`, `scrutinizer`, `validator`
 - **Plugin-specific** (1): `claude-md-auditor` — committed directly in its plugin
 
 ## Settings Override
diff --git a/docs/reference/skills-architecture.md b/docs/reference/skills-architecture.md
index bc31adb5..a5b63b79 100644
--- a/docs/reference/skills-architecture.md
+++ b/docs/reference/skills-architecture.md
@@ -12,7 +12,7 @@ Shared patterns used by multiple agents.
 
 | Skill | Purpose | Used By |
 |-------|---------|---------|
-| `software-design` | Engineering patterns (Result types, DI, immutability, workaround labeling) | Coder, Scrutinizer, Resolver, Shepherd |
+| `software-design` | Engineering patterns (Result types, DI, immutability, workaround labeling) | Coder, Scrutinizer, Resolver, Evaluator |
 | `review-methodology` | 6-step review process, 3-category issue classification | Reviewer, Synthesizer |
 | `self-review` | 9-pillar self-review framework | Scrutinizer |
 | `docs-framework` | Documentation conventions (.docs/ structure, naming, templates) | Synthesizer |
@@ -21,6 +21,7 @@ Shared patterns used by multiple agents.
 | `agent-teams` | Agent Teams patterns for peer-to-peer collaboration, debate, consensus | /code-review, /implement, /debug |
 | `ambient-router` | Intent classification and proportional skill loading for ambient mode (unrestricted tools — orchestrator) | Ambient UserPromptSubmit hook |
 | `knowledge-persistence` | Record/load architectural decisions and pitfalls to `.memory/knowledge/` | /implement, /code-review, /resolve, /debug, /specify, /self-review |
+| `qa` | Scenario-based acceptance testing methodology, evidence collection | Tester |
 
 ### Tier 1b: Pattern Skills
 
diff --git a/plugins/devflow-ambient/.claude-plugin/plugin.json b/plugins/devflow-ambient/.claude-plugin/plugin.json
index 333810c9..31484c55 100644
--- a/plugins/devflow-ambient/.claude-plugin/plugin.json
+++ b/plugins/devflow-ambient/.claude-plugin/plugin.json
@@ -20,7 +20,8 @@
     "validator",
     "simplifier",
     "scrutinizer",
-    "shepherd",
+    "evaluator",
+    "tester",
     "skimmer",
     "reviewer",
     "git",
@@ -48,6 +49,7 @@
     "documentation",
     "implementation-patterns",
     "knowledge-persistence",
+    "qa",
     "worktree-support"
   ]
 }
diff --git a/plugins/devflow-ambient/README.md b/plugins/devflow-ambient/README.md
index 6abe3d52..3f803f20 100644
--- a/plugins/devflow-ambient/README.md
+++ b/plugins/devflow-ambient/README.md
@@ -53,7 +53,7 @@ Skills are loaded via the Skill tool and work happens in the main session:
 
 | Intent | Pipeline |
 |--------|----------|
-| IMPLEMENT | Pre-flight → Coder → Validator → Simplifier → Scrutinizer → Shepherd |
+| IMPLEMENT | Pre-flight → Coder → Validator → Simplifier → Scrutinizer → Evaluator → Tester |
 | DEBUG | Hypotheses → parallel Explores → convergence → report → offer fix |
 | PLAN | Skimmer → Explores → Plan agent → gap validation |
 
diff --git a/plugins/devflow-implement/.claude-plugin/plugin.json b/plugins/devflow-implement/.claude-plugin/plugin.json
index 40127c22..f1d57156 100644
--- a/plugins/devflow-implement/.claude-plugin/plugin.json
+++ b/plugins/devflow-implement/.claude-plugin/plugin.json
@@ -23,13 +23,15 @@
     "coder",
     "simplifier",
     "scrutinizer",
-    "shepherd",
+    "evaluator",
+    "tester",
     "validator"
   ],
   "skills": [
     "agent-teams",
     "implementation-patterns",
     "knowledge-persistence",
+    "qa",
     "self-review",
     "worktree-support"
   ]
diff --git a/plugins/devflow-implement/README.md b/plugins/devflow-implement/README.md
index 624715a8..fcd508ec 100644
--- a/plugins/devflow-implement/README.md
+++ b/plugins/devflow-implement/README.md
@@ -26,10 +26,11 @@ npx devflow-kit init --plugin=implement
 2. **Planning** - Plan agents design implementation approach
 3. **Implementation** - Coder agent implements on feature branch
 4. **Validation** - Validator runs build/test/lint checks
-5. **Self-Review** - Scrutinizer evaluates against 9-pillar framework
-6. **Alignment Check** - Shepherd validates against original request
-7. **Simplification** - Simplifier refines code clarity
-8. **PR Creation** - Git agent creates pull request
+5. **Simplification** - Simplifier refines code clarity
+6. **Self-Review** - Scrutinizer evaluates against 9-pillar framework
+7. **Alignment Check** - Evaluator validates against original request
+8. **QA Testing** - Tester executes scenario-based acceptance tests
+9. **PR Creation** - Git agent creates pull request
 
 ## Components
 
@@ -43,19 +44,17 @@ npx devflow-kit init --plugin=implement
 - `coder` - Autonomous implementation
 - `simplifier` - Code refinement
 - `scrutinizer` - Self-review (9-pillar framework)
-- `shepherd` - Alignment validation
+- `evaluator` - Alignment validation
+- `tester` - Scenario-based QA testing
 - `validator` - Build/test validation
 
-### Skills (9)
-- `software-design` - Result types, DI, immutability, workaround labeling
-- `git` - Git safety, atomic commits, PR descriptions
+### Skills (6)
+- `agent-teams` - Agent Teams orchestration patterns
 - `implementation-patterns` - CRUD, API, events
-- `testing` - Test quality, coverage
-- `boundary-validation` - Boundary validation
+- `knowledge-persistence` - Architectural decision recording
+- `qa` - Scenario-based acceptance testing
 - `self-review` - 9-pillar framework
-- `typescript` - TypeScript patterns
-- `react` - React patterns
-- `accessibility` - Keyboard, ARIA, focus management
+- `worktree-support` - Worktree-aware path resolution
 
 ## Output
 
diff --git a/plugins/devflow-implement/commands/implement-teams.md b/plugins/devflow-implement/commands/implement-teams.md
index c8fe4679..5811acdf 100644
--- a/plugins/devflow-implement/commands/implement-teams.md
+++ b/plugins/devflow-implement/commands/implement-teams.md
@@ -430,7 +430,7 @@ Verify Scrutinizer's fixes didn't break anything."
 
 **If PASS:** Continue to Phase 12
 
-### Phase 12: Shepherd↔Coder Dialogue
+### Phase 12: Evaluator↔Coder Dialogue
 
 After Scrutinizer passes (and re-validation if needed), check alignment using direct dialogue:
 
@@ -441,7 +441,7 @@ Create a team named "align-{task-id}" for alignment check.
 
 Spawn teammates with self-contained prompts:
 
-- Name: "shepherd"
+- Name: "evaluator"
   Prompt: |
     You are validating that the implementation aligns with the original request.
     ORIGINAL_REQUEST: {task description or issue content}
@@ -463,18 +463,18 @@ Spawn teammates with self-contained prompts:
 
 - Name: "alignment-coder"
   Prompt: |
-    You are fixing alignment issues identified by the Shepherd.
+    You are fixing alignment issues identified by the Evaluator.
     TASK_ID: {task-id}
     ORIGINAL_REQUEST: {task description or issue content}
     FILES_CHANGED: {list of files from Coder output}
 
     Steps:
-    1. Wait for Shepherd's findings via message
+    1. Wait for Evaluator's findings via message
     2. For each misalignment: fix the code or explain why it's correct
-    3. Reply to Shepherd:
-       SendMessage(type: "message", recipient: "shepherd",
+    3. Reply to Evaluator:
+       SendMessage(type: "message", recipient: "evaluator",
          summary: "Fixes applied: {n} issues")
-    4. SCOPE: Fix only misalignments identified by Shepherd — no other changes
+    4. SCOPE: Fix only misalignments identified by Evaluator — no other changes
     5. Max 2 exchanges. Then report to lead:
        SendMessage(type: "message", recipient: "team-lead",
          summary: "Alignment fixes complete")
@@ -484,7 +484,7 @@ Spawn teammates with self-contained prompts:
 
 ```
 Step 1: Shutdown each teammate
-  SendMessage(type: "shutdown_request", recipient: "shepherd", content: "Alignment complete")
+  SendMessage(type: "shutdown_request", recipient: "evaluator", content: "Alignment complete")
   SendMessage(type: "shutdown_request", recipient: "alignment-coder", content: "Alignment complete")
   Wait for each shutdown_response (approve: true)
 
@@ -498,7 +498,7 @@ Step 3: GATE — Verify TeamDelete succeeded
 **If ALIGNED:** Continue to Phase 13
 
 **If MISALIGNED:**
-1. Extract misalignment details from Shepherd output
+1. Extract misalignment details from Evaluator output
 2. Increment `alignment_fix_count`
 3. If `alignment_fix_count <= 2`:
    - Spawn Coder to fix misalignments:
@@ -507,7 +507,7 @@ Step 3: GATE — Verify TeamDelete succeeded
    "TASK_ID: {task-id}
    TASK_DESCRIPTION: Fix alignment issues
    OPERATION: alignment-fix
-   MISALIGNMENTS: {structured misalignments from Shepherd}
+   MISALIGNMENTS: {structured misalignments from Evaluator}
    SCOPE: Fix only the listed misalignments, no other changes
    CREATE_PR: false"
    ```
@@ -521,17 +521,56 @@ Step 3: GATE — Verify TeamDelete succeeded
    - If Validator PASS: Loop back to Phase 12 (re-check alignment)
 4. If `alignment_fix_count > 2`: Report misalignments to user for decision
 
-### Phase 13: Create PR
+### Phase 13: QA Testing
+
+After Evaluator passes, spawn Tester for scenario-based acceptance testing (standalone agent, not a teammate — testing is a sequential pass, not a debate):
+
+```
+Task(subagent_type="Tester"):
+"ORIGINAL_REQUEST: {task description or issue content}
+EXECUTION_PLAN: {synthesized plan from Phase 6}
+FILES_CHANGED: {list of files from Coder output}
+ACCEPTANCE_CRITERIA: {extracted criteria if available}
+Design and execute scenario-based acceptance tests. Report PASS or FAIL with evidence."
+```
+
+**If PASS:** Continue to Phase 14
+
+**If FAIL:**
+1. Extract failure details from Tester output
+2. Increment `qa_retry_count`
+3. If `qa_retry_count <= 2`:
+   - Spawn Coder to fix QA failures:
+   ```
+   Task(subagent_type="Coder"):
+   "TASK_ID: {task-id}
+   TASK_DESCRIPTION: Fix QA test failures
+   OPERATION: qa-fix
+   QA_FAILURES: {structured failures from Tester}
+   SCOPE: Fix only the listed failures, no other changes
+   CREATE_PR: false"
+   ```
+   - Spawn Validator to verify fix didn't break tests:
+   ```
+   Task(subagent_type="Validator", model="haiku"):
+   "FILES_CHANGED: {files modified by fix Coder}
+   VALIDATION_SCOPE: changed-only"
+   ```
+   - If Validator FAIL: Report to user
+   - If Validator PASS: Loop back to Phase 13 (re-run Tester, passing PREVIOUS_FAILURES: {structured failures from the failed Tester run})
+4. If `qa_retry_count > 2`: Report QA failures to user for decision
+
+### Phase 14: Create PR
 
 **For SEQUENTIAL_CODERS or PARALLEL_CODERS**: The last sequential Coder (with CREATE_PR: true) handles PR creation. For parallel coders, create unified PR using `devflow:git` skill patterns. Push branch and run `gh pr create` with comprehensive description, targeting `BASE_BRANCH`.
 
 **For SINGLE_CODER**: PR is created by the Coder agent (CREATE_PR: true).
 
-### Phase 14: Report
+### Phase 15: Report
 
 Display completion summary with phase status, PR info, and next steps.
 
-### Phase 15: Record Decisions (if any)
+### Phase 16: Record Decisions (if any)
 
 If the Coder's report includes Key Decisions with architectural significance:
 1. Read `~/.claude/skills/devflow:knowledge-persistence/SKILL.md` and follow its extraction procedure to record decisions to `.memory/knowledge/decisions.md`
@@ -586,17 +625,21 @@ If the Coder's report includes Key Decisions with architectural significance:
 ├─ Phase 11: Re-Validate (if Scrutinizer made changes)
 │  └─ Validator agent (verify Scrutinizer fixes)
 │
-├─ Phase 12: Shepherd↔Coder Dialogue (Agent Teams)
-│  └─ Direct Shepherd↔Coder messaging (max 2 exchanges)
+├─ Phase 12: Evaluator↔Coder Dialogue (Agent Teams)
+│  └─ Direct Evaluator↔Coder messaging (max 2 exchanges)
+│
+├─ Phase 13: QA Testing
+│  └─ Tester agent (scenario-based acceptance tests)
+│  └─ If FAIL: Coder fix loop (max 2 retries) → Validator → re-test
 │
-├─ Phase 13: Create PR (if needed)
+├─ Phase 14: Create PR (if needed)
 │  └─ SINGLE_CODER: handled by Coder
 │  └─ SEQUENTIAL: handled by last Coder
 │  └─ PARALLEL: orchestrator creates unified PR
 │
-├─ Phase 14: Display agent outputs
+├─ Phase 15: Report (display agent outputs)
 │
-└─ Phase 15: Record Decisions (inline, if any)
+└─ Phase 16: Record Decisions (inline, if any)
 ```
 
 ## Principles
diff --git a/plugins/devflow-implement/commands/implement.md b/plugins/devflow-implement/commands/implement.md
index d5d2b8e1..79a1634f 100644
--- a/plugins/devflow-implement/commands/implement.md
+++ b/plugins/devflow-implement/commands/implement.md
@@ -297,10 +297,10 @@ Verify Scrutinizer's fixes didn't break anything."
 
 ### Phase 12: Alignment Check
 
-After Scrutinizer passes (and re-validation if needed), spawn Shepherd to validate alignment:
+After Scrutinizer passes (and re-validation if needed), spawn Evaluator to validate alignment:
 
 ```
-Task(subagent_type="Shepherd"):
+Task(subagent_type="Evaluator"):
 "ORIGINAL_REQUEST: {task description or issue content}
 EXECUTION_PLAN: {synthesized plan from Phase 6}
 FILES_CHANGED: {list of files from Coder output}
@@ -311,7 +311,7 @@ Validate alignment with request and plan. Report ALIGNED or MISALIGNED with deta
 **If ALIGNED:** Continue to Phase 13
 
 **If MISALIGNED:**
-1. Extract misalignment details from Shepherd output
+1. Extract misalignment details from Evaluator output
 2. Increment `alignment_fix_count`
 3. If `alignment_fix_count <= 2`:
    - Spawn Coder to fix misalignments:
@@ -320,7 +320,7 @@ Validate alignment with request and plan. Report ALIGNED or MISALIGNED with deta
    "TASK_ID: {task-id}
    TASK_DESCRIPTION: Fix alignment issues
    OPERATION: alignment-fix
-   MISALIGNMENTS: {structured misalignments from Shepherd}
+   MISALIGNMENTS: {structured misalignments from Evaluator}
    SCOPE: Fix only the listed misalignments, no other changes
    CREATE_PR: false"
    ```
@@ -334,17 +334,56 @@ Validate alignment with request and plan. Report ALIGNED or MISALIGNED with deta
    - If Validator PASS: Loop back to Phase 12 (re-check alignment)
 4. If `alignment_fix_count > 2`: Report misalignments to user for decision
 
-### Phase 13: Create PR
+### Phase 13: QA Testing
+
+After Evaluator passes, spawn Tester for scenario-based acceptance testing:
+
+```
+Task(subagent_type="Tester"):
+"ORIGINAL_REQUEST: {task description or issue content}
+EXECUTION_PLAN: {synthesized plan from Phase 6}
+FILES_CHANGED: {list of files from Coder output}
+ACCEPTANCE_CRITERIA: {extracted criteria if available}
+Design and execute scenario-based acceptance tests. Report PASS or FAIL with evidence."
+```
+
+**If PASS:** Continue to Phase 14
+
+**If FAIL:**
+1. Extract failure details from Tester output
+2. Increment `qa_retry_count`
+3. If `qa_retry_count <= 2`:
+   - Spawn Coder to fix QA failures:
+   ```
+   Task(subagent_type="Coder"):
+   "TASK_ID: {task-id}
+   TASK_DESCRIPTION: Fix QA test failures
+   OPERATION: qa-fix
+   QA_FAILURES: {structured failures from Tester}
+   SCOPE: Fix only the listed failures, no other changes
+   CREATE_PR: false"
+   ```
+   - Spawn Validator to verify fix didn't break tests:
+   ```
+   Task(subagent_type="Validator", model="haiku"):
+   "FILES_CHANGED: {files modified by fix Coder}
+   VALIDATION_SCOPE: changed-only"
+   ```
+   - If Validator FAIL: Report to user
+   - If Validator PASS: Loop back to Phase 13 (re-run Tester, passing PREVIOUS_FAILURES: {structured failures from the failed Tester run})
+4. If `qa_retry_count > 2`: Report QA failures to user for decision
+
+### Phase 14: Create PR
 
 **For SEQUENTIAL_CODERS or PARALLEL_CODERS**: The last sequential Coder (with CREATE_PR: true) handles PR creation. For parallel coders, create unified PR using `devflow:git` skill patterns. Push branch and run `gh pr create` with comprehensive description, targeting `BASE_BRANCH`.
 
 **For SINGLE_CODER**: PR is created by the Coder agent (CREATE_PR: true).
 
-### Phase 14: Report
+### Phase 15: Report
 
 Display completion summary with phase status, PR info, and next steps.
 
-### Phase 15: Record Decisions (if any)
+### Phase 16: Record Decisions (if any)
 
 If the Coder's report includes Key Decisions with architectural significance:
 1. Read `~/.claude/skills/devflow:knowledge-persistence/SKILL.md` and follow its extraction procedure to record decisions to `.memory/knowledge/decisions.md`
@@ -398,17 +437,21 @@ If the Coder's report includes Key Decisions with architectural significance:
 │  └─ Validator agent (verify Scrutinizer fixes)
 │
 ├─ Phase 12: Alignment Check
-│  └─ Shepherd agent (validates alignment - reports only, no fixes)
+│  └─ Evaluator agent (validates alignment - reports only, no fixes)
 │  └─ If MISALIGNED: Coder fix loop (max 2 iterations) → Validator → re-check
 │
-├─ Phase 13: Create PR (if needed)
+├─ Phase 13: QA Testing
+│  └─ Tester agent (scenario-based acceptance tests)
+│  └─ If FAIL: Coder fix loop (max 2 retries) → Validator → re-test
+│
+├─ Phase 14: Create PR (if needed)
 │  └─ SINGLE_CODER: handled by Coder
 │  └─ SEQUENTIAL: handled by last Coder
 │  └─ PARALLEL: orchestrator creates unified PR
 │
-├─ Phase 14: Display agent outputs
+├─ Phase 15: Report (display agent outputs)
 │
-└─ Phase 15: Record Decisions (inline, if any)
+└─ Phase 16: Record Decisions (inline, if any)
 ```
 
 ## Principles
diff --git a/plugins/devflow-frontend-design/.claude-plugin/plugin.json b/plugins/devflow-ui-design/.claude-plugin/plugin.json
similarity index 68%
rename from plugins/devflow-frontend-design/.claude-plugin/plugin.json
rename to plugins/devflow-ui-design/.claude-plugin/plugin.json
index 2d584269..669f444d 100644
--- a/plugins/devflow-frontend-design/.claude-plugin/plugin.json
+++ b/plugins/devflow-ui-design/.claude-plugin/plugin.json
@@ -1,6 +1,6 @@
 {
-  "name": "devflow-frontend-design",
-  "description": "Frontend design patterns - typography, color systems, spacing, motion, responsive design",
+  "name": "devflow-ui-design",
+  "description": "UI design patterns - typography, color systems, spacing, motion, responsive design",
   "author": {
     "name": "Dean0x"
   },
diff --git a/scripts/hooks/background-learning b/scripts/hooks/background-learning
index 800b4761..95d78a9e 100755
--- a/scripts/hooks/background-learning
+++ b/scripts/hooks/background-learning
@@ -399,9 +399,9 @@ run_sonnet_analysis() {
   # Read response, strip markdown fences, validate, write back for Node operations
   local RESPONSE
   RESPONSE=$(cat "$RESPONSE_FILE")
-  RESPONSE=$(echo "$RESPONSE" | sed '1s/^```json$//' | sed '1s/^```$//' | sed '$s/^```$//')
+  RESPONSE=$(printf '%s\n' "$RESPONSE" | sed '1s/^```json$//' | sed '1s/^```$//' | sed '$s/^```$//')
 
-  if ! echo "$RESPONSE" | json_valid; then
+  if ! printf '%s\n' "$RESPONSE" | json_valid; then
     log "Invalid JSON response from model — skipping"
     log "--- Raw response ---"
     log "$RESPONSE"
@@ -410,7 +410,7 @@ run_sonnet_analysis() {
     return 1
   fi
 
-  echo "$RESPONSE" > "$RESPONSE_FILE"
+  printf '%s\n' "$RESPONSE" > "$RESPONSE_FILE"
   return 0
 }
 
diff --git a/scripts/hooks/background-memory-update b/scripts/hooks/background-memory-update
index d884916e..22e6cc2f 100755
--- a/scripts/hooks/background-memory-update
+++ b/scripts/hooks/background-memory-update
@@ -99,12 +99,14 @@ extract_last_turn() {
 
   last_user=$(grep '"type":"user"' "$transcript" 2>/dev/null \
     | tail -3 \
-    | while IFS= read -r line; do echo "$line" | json_extract_messages; done \
+    | while IFS= read -r line; do printf '%s\n' "$line" | head -c 100000 | json_extract_messages; done \
+    | awk 'NF' \
     | tail -1)
 
   last_assistant=$(grep '"type":"assistant"' "$transcript" 2>/dev/null \
     | tail -3 \
-    | while IFS= read -r line; do echo "$line" | json_extract_messages; done \
+    | while IFS= read -r line; do printf '%s\n' "$line" | head -c 100000 | json_extract_messages; done \
+    | awk 'NF' \
     | tail -1)
 
   # Truncate to ~4000 chars total to keep token cost low
diff --git a/scripts/hooks/json-parse b/scripts/hooks/json-parse
index 6ca91c34..2c109fe8 100755
--- a/scripts/hooks/json-parse
+++ b/scripts/hooks/json-parse
@@ -159,7 +159,7 @@ json_array_item() {
 
 # --- Transcript extraction ---
 
-# Extract text messages from Claude message JSON. Usage: echo '{"message":...}' | json_extract_messages
+# Extract text messages from Claude message JSON. Usage: printf '%s\n' '{"message":...}' | json_extract_messages
 json_extract_messages() {
   if [ "$_HAS_JQ" = "true" ]; then
     jq -r 'if .message.content then
diff --git a/shared/agents/shepherd.md b/shared/agents/evaluator.md
similarity index 99%
rename from shared/agents/shepherd.md
rename to shared/agents/evaluator.md
index cd2dfc97..5ce596fb 100644
--- a/shared/agents/shepherd.md
+++ b/shared/agents/evaluator.md
@@ -1,11 +1,11 @@
 ---
-name: Shepherd
+name: Evaluator
 description: Validates implementation aligns with original request and plan. Catches missed requirements, scope creep, and intent drift. Reports misalignments for Coder to fix.
 model: opus
 skills: devflow:software-design, devflow:worktree-support
 ---
 
-# Shepherd Agent
+# Evaluator Agent
 
 You are an alignment validation specialist. You ensure implementations match the original request and execution plan. You catch missed requirements, scope creep, and intent drift. You report misalignments with structured details for the Coder agent to fix - you never fix code yourself.
 
diff --git a/shared/agents/tester.md b/shared/agents/tester.md
new file mode 100644
index 00000000..569347bc
--- /dev/null
+++ b/shared/agents/tester.md
@@ -0,0 +1,139 @@
+---
+name: Tester
+description: Scenario-based QA agent. Designs and executes acceptance tests from criteria and implementation. Reports pass/fail with evidence — never fixes code.
+model: sonnet
+tools: ["Read", "Grep", "Glob", "Bash", "mcp__claude-in-chrome__tabs_context_mcp", "mcp__claude-in-chrome__tabs_create_mcp", "mcp__claude-in-chrome__navigate", "mcp__claude-in-chrome__get_page_text", "mcp__claude-in-chrome__read_page", "mcp__claude-in-chrome__find", "mcp__claude-in-chrome__form_input", "mcp__claude-in-chrome__javascript_tool", "mcp__claude-in-chrome__read_console_messages"]
+skills: devflow:qa, devflow:testing, devflow:worktree-support
+---
+
+# Tester Agent
+
+You are a scenario-based QA specialist. You design and execute acceptance tests that verify implementation behavior from the user's perspective. You test what was asked for, not implementation details. You report results with evidence — you never fix code yourself.
+
+## Input Context
+
+You receive from orchestrator:
+- **ORIGINAL_REQUEST**: Task description or GitHub issue content
+- **EXECUTION_PLAN**: Synthesized plan from planning phase
+- **FILES_CHANGED**: List of modified files from Coder output
+- **ACCEPTANCE_CRITERIA**: Extracted acceptance criteria (if any)
+- **PREVIOUS_FAILURES**: Structured failures from prior Tester run (if retry)
+
+**Worktree Support**: If `WORKTREE_PATH` is provided, follow the `devflow:worktree-support` skill for path resolution. If omitted, use cwd.
+
+## Responsibilities
+
+1. **Assess testability**: If FILES_CHANGED contains only documentation, configuration, or non-executable files, report PASS with "No testable behavior changes — QA scenarios not applicable."
+2. **Detect web-facing changes**: Scan FILES_CHANGED for web indicators:
+   - File extensions: `.tsx`, `.jsx`, `.html`, `.css`, `.scss`
+   - Path patterns: `routes/`, `pages/`, `components/`, `views/`, `app/`
+   If web files detected → execute browser scenarios alongside standard scenarios (follow Dev Server Lifecycle and Browser Execution procedures in `devflow:qa/references/browser-testing.md`).
+   If only backend/CLI files → standard Bash execution only.
+3. **Assess local testability**: Before designing scenarios, determine what CAN be tested locally:
+   - Check for package.json / requirements.txt / go.mod to identify project type
+   - Identify required infrastructure: database, Redis, external APIs, OAuth providers
+   - Check if dependencies are available (e.g., `docker ps`, `pg_isready`, env vars for API keys)
+   - Scenarios requiring unavailable infrastructure are marked SKIPPED with reason
+   - Report all untestable scenarios alongside tested ones in the QA report
+4. **Extract criteria**: Derive acceptance criteria from ORIGINAL_REQUEST and EXECUTION_PLAN. If ACCEPTANCE_CRITERIA is provided, use it as the primary source.
+5. **Design scenarios**: Create 5-8 concrete test scenarios across these types:
+   - **Happy path**: Core functionality works as described
+   - **Boundary/edge**: Limits, empty inputs, maximum values
+   - **Negative path**: Invalid inputs, missing permissions, error conditions
+   - **Integration**: Components work together correctly
+   - **Regression**: Existing behavior preserved (if applicable)
+6. **Execute scenarios**: Run each via Bash (or browser for web scenarios) — capture stdout/stderr, exit codes, file state. Follow Bash execution constraints (see `devflow:qa/references/browser-testing.md`).
+7. **Evaluate results**: Compare actual vs expected behavior for each scenario
+8. **Produce report**: Structured QA report with pass/fail status and evidence
+
+## Scenario Design
+
+For each scenario, define:
+- **ID**: Sequential (S1, S2, ...)
+- **Type**: happy | boundary | negative | integration | regression
+- **Description**: What is being tested, in plain language
+- **Given**: Setup preconditions
+- **When**: Action to perform (Bash command, API call, file operation)
+- **Then**: Expected observable outcome
+- **Severity if fails**: BLOCKING (acceptance criteria violated) | WARNING (edge case concern)
+
+## Execution
+
+For each scenario:
+1. Set up preconditions (create files, set state)
+2. Execute the action via Bash (or browser tools for web scenarios)
+3. Capture stdout, stderr, exit code
+4. Compare against expected outcome
+5. Record PASS or FAIL with evidence
+
+If a previous run failed (PREVIOUS_FAILURES provided), prioritize re-testing those scenarios first.
+
+## Output
+
+Return structured QA report:
+
+```markdown
+## QA Report
+
+### Status: PASS | FAIL
+
+### Summary
+- Scenarios designed: {total}
+- Passed: {count}
+- Failed: {count}
+- Skipped: {count}
+
+### Acceptance Criteria Coverage
+| Criterion | Scenarios | Status |
+|-----------|-----------|--------|
+| {criterion} | S1, S3 | COVERED/UNCOVERED |
+
+### Scenario Results
+
+| ID | Type | Description | Mode | Status | Severity |
+|----|------|-------------|------|--------|----------|
+| S1 | happy | {description} | bash/browser | PASS/FAIL/SKIPPED | — /BLOCKING/WARNING |
+
+### Skipped Scenarios (if any)
+
+| ID | Description | Reason |
+|----|-------------|--------|
+| S6 | Database persistence check | No local database available |
+| S8 | Form submission renders correctly | Chrome MCP tools not available |
+
+### Failed Scenarios (if any)
+
+#### S{n}: {description}
+- **Given**: {preconditions}
+- **When**: {action executed}
+- **Expected**: {what should happen}
+- **Actual**: {what actually happened}
+- **Evidence**: {stdout/stderr/exit code}
+- **Remediation**: {what Coder should fix}
+
+### Evidence Log
+{Raw command outputs for traceability}
+```
+
+## Principles
+
+1. **User perspective** - Test what the user asked for, not implementation internals
+2. **Report, don't fix** - Document failures for Coder to fix; never modify code yourself
+3. **Evidence-based** - Every result backed by captured stdout/stderr/exit codes
+4. **Severity-aware** - BLOCKING for acceptance criteria violations, WARNING for edge cases
+5. **Deterministic** - Scenarios must produce consistent results across runs
+
+## Boundaries
+
+**Report as PASS:**
+- All BLOCKING scenarios pass
+- WARNING-only failures are acceptable
+
+**Report as FAIL:**
+- Any BLOCKING scenario fails
+
+**Never:**
+- Modify code or create commits
+- Fix failures yourself
+- Skip scenarios because "they'll probably pass"
+- Test implementation details (internal function signatures, variable names)
diff --git a/shared/skills/ambient-router/SKILL.md b/shared/skills/ambient-router/SKILL.md
index fe17c6eb..6343bf85 100644
--- a/shared/skills/ambient-router/SKILL.md
+++ b/shared/skills/ambient-router/SKILL.md
@@ -89,7 +89,7 @@ Based on classified intent and depth, invoke each selected skill using the Skill
 | **RESOLVE** | devflow:resolve-orchestration, devflow:software-design | — |
 | **PIPELINE** | devflow:pipeline-orchestration, devflow:implementation-patterns | — |
 
-**Excluded from ambient loading** (loaded by agents internally): devflow:review-methodology, devflow:complexity, devflow:consistency, devflow:database, devflow:dependencies, devflow:documentation, devflow:regression, devflow:architecture, devflow:accessibility, devflow:performance. These skills are always installed (universal skill installation) but loaded by Reviewer agents at runtime, not by the router.
+**Excluded from ambient loading** (loaded by agents internally): devflow:review-methodology, devflow:complexity, devflow:consistency, devflow:database, devflow:dependencies, devflow:documentation, devflow:regression, devflow:architecture, devflow:accessibility, devflow:performance, devflow:qa. These skills are always installed (universal skill installation) but loaded by Reviewer/Tester agents at runtime, not by the router.
 
 See `references/skill-catalog.md` for the full skill-to-intent mapping with file pattern triggers.
 
diff --git a/shared/skills/ambient-router/references/skill-catalog.md b/shared/skills/ambient-router/references/skill-catalog.md
index 46443f32..db680ea2 100644
--- a/shared/skills/ambient-router/references/skill-catalog.md
+++ b/shared/skills/ambient-router/references/skill-catalog.md
@@ -84,6 +84,7 @@ These skills are always installed (universal skill installation) but loaded by a
 - devflow:architecture — SOLID analysis, coupling detection, layering issues
 - devflow:accessibility — WCAG compliance, ARIA roles, keyboard navigation
 - devflow:performance — N+1 queries, memory leaks, caching opportunities
+- devflow:qa — Scenario-based acceptance testing, evidence collection
 
 ## Multi-Worktree Detection
 
diff --git a/shared/skills/implementation-orchestration/SKILL.md b/shared/skills/implementation-orchestration/SKILL.md
index d64fa95f..59431fd6 100644
--- a/shared/skills/implementation-orchestration/SKILL.md
+++ b/shared/skills/implementation-orchestration/SKILL.md
@@ -15,7 +15,7 @@ This is a lightweight variant of `/implement` for ambient ORCHESTRATED mode. Exc
 
 > **QUALITY GATES ARE NON-NEGOTIABLE**
 >
-> Every Coder output passes through Validator → Simplifier → Scrutinizer → re-Validate → Shepherd.
+> Every Coder output passes through Validator → Simplifier → Scrutinizer → re-Validate → Evaluator → Tester.
 > Skipping a gate because "it looks fine" is never acceptable. The pipeline runs to completion
 > or halts on failure — there is no shortcut.
 
@@ -90,7 +90,8 @@ Run sequentially — each gate must pass before the next:
 2. `Task(subagent_type="Simplifier")` — code clarity and maintainability pass on FILES_CHANGED
 3. `Task(subagent_type="Scrutinizer")` — 9-pillar quality evaluation on FILES_CHANGED
 4. `Task(subagent_type="Validator")` (re-validate after Simplifier/Scrutinizer changes)
-5. `Task(subagent_type="Shepherd")` — verify implementation matches original request — retry up to 2× if misalignment found
+5. `Task(subagent_type="Evaluator")` — verify implementation matches original request — retry up to 2× if misalignment found
+6. `Task(subagent_type="Tester")` — scenario-based acceptance testing from user's perspective — retry up to 2× if QA fails
 
 If any gate exhausts retries, halt pipeline and report what passed and what failed.
 
@@ -108,4 +109,5 @@ Report results:
 
 - **Coder BLOCKED**: Halt immediately, report blocker to user
 - **Validator fails after retries**: Report specific failures, halt pipeline
-- **Shepherd misalignment after retries**: Report misalignment details, let user decide next steps
+- **Evaluator misalignment after retries**: Report misalignment details, let user decide next steps
+- **Tester QA failures after retries**: Report QA failure details, let user decide next steps
diff --git a/shared/skills/qa/SKILL.md b/shared/skills/qa/SKILL.md
new file mode 100644
index 00000000..f7d5eb11
--- /dev/null
+++ b/shared/skills/qa/SKILL.md
@@ -0,0 +1,136 @@
+---
+name: qa
+description: This skill should be used when performing scenario-based acceptance testing,
+  designing QA test plans, or validating that implementation behavior matches acceptance
+  criteria beyond unit tests.
+user-invocable: false
+allowed-tools: Read, Grep, Glob, Bash
+---
+
+# QA Patterns
+
+Scenario-based acceptance testing methodology. Ensures implementations satisfy user-observable requirements beyond what unit tests cover.
+
+## Iron Law
+
+> **VERIFY BEHAVIOR FROM THE USER'S PERSPECTIVE** [1][2]
+>
+> Test what the user asked for, not implementation details. Every acceptance criterion
+> gets at least one scenario. Every scenario produces observable evidence. If you can't
+> demonstrate it works from the outside, it doesn't work. [3][8]
+
+---
+
+## Scenario Types [1][2][6]
+
+Five categories ensure comprehensive coverage:
+
+| Type | Purpose | Example |
+|------|---------|---------|
+| **Happy path** | Core functionality works as described | "Add item → item appears in list" |
+| **Boundary/edge** | Limits, empty, maximum, minimum values | "Add item with 1000-char name → truncated or rejected" |
+| **Negative path** | Invalid inputs, missing permissions, errors | "Add item without auth → 401 returned" |
+| **Integration** | Components work together correctly | "Add item → appears in search results" |
+| **Regression** | Existing behavior preserved after changes | "Old items still load after schema migration" |
+
+**Minimum coverage**: At least one scenario per acceptance criterion. At least one boundary and one negative scenario per feature. [1][6]
+
+## Scenario Design from Acceptance Criteria [4][8][9]
+
+Extract testable claims using Given/When/Then:
+
+```
+Acceptance criterion: "Users can upload files up to 10MB"
+
+S1 (happy):    Given auth user, When upload 5MB file, Then 200 + file accessible
+S2 (boundary): Given auth user, When upload 10MB file, Then 200 (exact limit)
+S3 (boundary): Given auth user, When upload 10.1MB file, Then 413 rejected
+S4 (negative): Given no auth, When upload 5MB file, Then 401
+S5 (negative): Given auth user, When upload empty file, Then 400
+```
+
+**Extracting criteria** [4]: If no explicit acceptance criteria, derive from the request:
+1. What new behavior was requested? → happy path scenarios
+2. What inputs does it accept? → boundary scenarios [7][10]
+3. What should it reject? → negative scenarios
+4. What existing behavior must survive? → regression scenarios
+
+## Equivalence Partitioning & Boundary Analysis [1][6][7]
+
+Reduce infinite inputs to representative cases:
+
+- **Valid partition**: One representative value from each valid class
+- **Invalid partition**: One representative from each invalid class
+- **Boundary values**: On, just below, just above each boundary [10]
+
+```
+Input: age (integer, 0-120 allowed)
+├── Valid: 25 (mid-range), 0 (minimum), 120 (maximum)
+├── Invalid: -1 (below min), 121 (above max), "abc" (wrong type)
+└── Boundary: -1, 0, 1, 119, 120, 121
+```
+
+## Exploratory Testing Heuristics [2][3][11]
+
+When acceptance criteria are vague, use structured exploration:
+
+- **CRUD tour** [11]: Create, Read, Update, Delete — does each operation work end-to-end?
+- **Configuration tour** [11]: Change every configurable value — does the system adapt?
+- **Error tour** [3]: Force every error path — are messages helpful, is state clean?
+- **Boundary tour** [3][11]: Push every input to its limits
+
+## Evidence Collection
+
+Every scenario must produce verifiable evidence:
+
+| Evidence Type | How to Capture | When to Use |
+|--------------|----------------|-------------|
+| Exit codes | `echo $?` after command | CLI tools, scripts |
+| Stdout/stderr | Redirect to capture | All command execution |
+| File state | `ls -la`, `diff`, `cat` | File creation/modification |
+| HTTP status | Response code from curl/fetch | API endpoints |
+| Log output | Grep logs after action | Background processes |
+
+## Browser-Based Scenarios [8][12]
+
+When implementation includes web-facing changes (.tsx, .jsx, .html, routes, pages):
+
+| Scenario Type | Browser Action | Evidence |
+|--------------|---------------|----------|
+| Page renders | Navigate → read page text | Page content, no console errors |
+| Form works | Find fields → input → submit | Redirect or DOM state change |
+| Validation fires | Input invalid data → check | Error messages visible in page |
+| Console clean | Navigate → read console | No errors/warnings logged |
+| Navigation works | Click links → verify URL change | Correct page loaded |
+
+**Dev server lifecycle**: Check for running server first. If none, auto-start from package.json scripts, poll for readiness, kill after testing. Never kill pre-existing servers.
+**Testability assessment**: Before designing scenarios, assess what local infrastructure is available (DB, Redis, external APIs). Mark scenarios requiring unavailable infrastructure as SKIPPED.
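+**Graceful degradation**: If browser tools are unavailable, fall back to curl/API testing against the running dev server. If the dev server itself cannot be started, mark browser scenarios as SKIPPED. Always report what was skipped and why alongside what was tested.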
+
+## Severity Classification
+
+| Level | Meaning | Action |
+|-------|---------|--------|
+| **BLOCKING** | Acceptance criterion violated — feature does not work as requested | Report FAIL — Coder must fix |
+| **WARNING** | Edge case concern — feature works but edge behavior is unexpected | Report PASS with warnings — note for improvement |
+
+---
+
+## Extended References
+
+For additional scenario templates and anti-patterns:
+- `references/sources.md` — Full bibliography with access details
+- `references/patterns.md` — Correct scenario design patterns
+- `references/violations.md` — QA anti-patterns to avoid
+- `references/scenario-templates.md` — Templates for common feature types
+
+---
+
+## Success Criteria
+
+- [ ] Every acceptance criterion has at least one scenario
+- [ ] At least one boundary and one negative scenario per feature
+- [ ] Every scenario has Given/When/Then structure
+- [ ] Every result has captured evidence (stdout, exit code, file state)
+- [ ] Severity correctly assigned: BLOCKING for criteria violations, WARNING for edge cases
+- [ ] Non-testable changes (docs, config) correctly identified and skipped
diff --git a/shared/skills/qa/references/browser-testing.md b/shared/skills/qa/references/browser-testing.md
new file mode 100644
index 00000000..7dba306e
--- /dev/null
+++ b/shared/skills/qa/references/browser-testing.md
@@ -0,0 +1,97 @@
+# Browser Testing Procedures
+
+Detailed procedures for dev server lifecycle and browser scenario execution in QA.
+
+---
+
+## Dev Server Lifecycle
+
+When web-facing changes detected, manage a dev server for browser testing:
+
+### 1. Check for Already Running Server
+
+Before starting anything, check if a dev server is already running:
+- `lsof -i :3000 -i :5173 -i :8080 -i :4200 -i :8000 -t 2>/dev/null`
+- If a server is already running on the project's expected port (see step 3, Detect Port): USE IT (do not start another)
+- Record whether server was pre-existing (skip cleanup if so)
+
+### 2. Discover Server Command (if no running server)
+
+Read `package.json` (or equivalent) to find the dev server command:
+- Check `scripts.dev`, `scripts.start`, `scripts.serve` (in that order)
+- For Python: look for `manage.py`, Flask/FastAPI entry points
+- For Go: check `Makefile` or `go run` targets
+- If no dev script found: skip browser scenarios, report "No dev server script — browser scenarios skipped"
+
+### 3. Detect Port
+
+Determine port from (in order):
+- Framework config: `vite.config.ts` (server.port), `next.config.js`
+- `.env` file: extract only `PORT` — `grep ^PORT= .env | cut -d= -f2` (do NOT read entire .env)
+- Script definition: parse `--port` flags in the dev script
+- Defaults by framework: Next.js→3000, Vite→5173, CRA→3000, Django→8000, Go→8080
+
+### 4. Start Server (if not already running)
+
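+- Create a unique log file: `SERVER_LOG=$(mktemp /tmp/devflow-tester-XXXXXX)` (keep the `X`s at the end of the template — BSD/macOS `mktemp` does not substitute them when a suffix such as `.log` follows)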
+- Run in background: `npm run dev > "$SERVER_LOG" 2>&1 &`
+- Record PID: `DEV_SERVER_PID=$!`
+- Poll for readiness: `curl -s -o /dev/null -w "%{http_code}" http://localhost:{port}/`
+- Retry up to 15 times, 2s intervals (30s max)
+- If timeout: kill server, skip browser scenarios, report "Dev server did not become ready within 30s"
+
+### 5. Run Browser Scenarios
+
+(See Browser Execution section below)
+
+### 6. Cleanup (only for servers WE started)
+
+- Kill dev server: `kill $DEV_SERVER_PID 2>/dev/null`
+- Kill process group: `kill -- -$DEV_SERVER_PID 2>/dev/null || true`
+- Remove log file: `rm -f "$SERVER_LOG"`
+- NEVER kill a pre-existing server
+
+---
+
+## Browser Execution
+
+Requires: Chrome MCP tools available + dev server running.
+
+1. Check Chrome availability: attempt `mcp__claude-in-chrome__tabs_context_mcp`
+   - If unavailable: skip browser scenarios, note "Chrome MCP tools not available"
+   - If available: create a new tab with `mcp__claude-in-chrome__tabs_create_mcp`
+
+2. For each browser scenario:
+   a. Navigate: `mcp__claude-in-chrome__navigate` to the relevant page
+   b. Read content: `mcp__claude-in-chrome__get_page_text` or `mcp__claude-in-chrome__read_page`
+   c. Find elements: `mcp__claude-in-chrome__find` for buttons, forms, text
+   d. Interact: `mcp__claude-in-chrome__form_input` for form fields
+   e. Assert via JS: `mcp__claude-in-chrome__javascript_tool` for state checks
+   f. Check console: `mcp__claude-in-chrome__read_console_messages` for errors
+   g. Record evidence from each step
+
+3. After all browser scenarios: close the tab created in step 1
+
+---
+
+## Bash Execution Constraints
+
+When executing scenarios via Bash, these constraints are mandatory:
+
+**NEVER execute:**
+- Destructive filesystem commands: `rm -rf`, `rmdir`, `truncate`, `shred`
+- Privilege escalation: `sudo`, `su`, `chown`, `chmod` on system directories
+- Code injection vectors: `eval`, `exec`, shell substitution on untrusted input
+- Package management: `npm install -g`, `pip install`, `brew install`, `apt-get`
+- Network exfiltration: outbound curl/wget to external hosts not under test
+
+**ONLY run:**
+- Test runners: `npm test`, `pytest`, `go test`, `cargo test`, `jest`, `mocha`
+- Build commands: `npm run build`, `go build`, `cargo build`
+- Read-only file inspection: `cat`, `ls`, `diff`, `head`, `tail`, `grep`
+- Readiness checks: `curl` to `localhost` only, `lsof`, `pg_isready`, `redis-cli ping`
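+- Dev server lifecycle for servers WE started: the project's dev script (e.g., `npm run dev`), `kill $DEV_SERVER_PID`, and `rm -f "$SERVER_LOG"` for the log file we created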
+
+**Filesystem writes restricted to:**
+- Temporary files created via `mktemp` in `/tmp/devflow-tester-*`
+- Project test directories (e.g., `__tests__/fixtures/`, `test/data/`)
diff --git a/shared/skills/qa/references/patterns.md b/shared/skills/qa/references/patterns.md
new file mode 100644
index 00000000..ab2ad0b0
--- /dev/null
+++ b/shared/skills/qa/references/patterns.md
@@ -0,0 +1,118 @@
+# QA Skill — Correct Patterns
+
+Scenario design patterns that produce reliable, maintainable acceptance tests.
+
+## Pattern 1: Criterion-Driven Scenario Design [4][9]
+
+Derive scenarios directly from acceptance criteria using Given/When/Then:
+
+```
+Criterion: "Admin users can delete any comment"
+
+S1 (happy):    Given admin user + existing comment
+               When DELETE /comments/{id}
+               Then 200 + comment removed from database
+
+S2 (negative): Given regular user + existing comment
+               When DELETE /comments/{id}
+               Then 403 + comment still exists
+
+S3 (negative): Given admin user + non-existent comment
+               When DELETE /comments/{id}
+               Then 404
+
+S4 (boundary): Given admin user + already-deleted comment
+               When DELETE /comments/{id}
+               Then 404 (idempotent)
+```
+
+**Why**: Direct criterion mapping ensures nothing is missed [4]. Every criterion generates at least one scenario.
+
+## Pattern 2: Boundary Triplet Testing [1][6][10]
+
+For every boundary, test three values: below, on, and above:
+
+```
+Constraint: "Username must be 3-20 characters"
+
+Below minimum: "ab" (2 chars) → rejected
+At minimum:    "abc" (3 chars) → accepted
+Above minimum: "abcd" (4 chars) → accepted
+Below maximum: 19 chars → accepted
+At maximum:    20 chars → accepted
+Above maximum: 21 chars → rejected
+```
+
+**Why**: Most defects cluster at boundaries [1][7]. The triplet pattern catches off-by-one errors systematically.
+
+## Pattern 3: State-Based Scenario Chains [6][7]
+
+Test state transitions that matter to the user:
+
+```
+Feature: Order lifecycle
+
+S1: Created → Paid (happy path)
+S2: Created → Cancelled (valid transition)
+S3: Paid → Shipped (happy path)
+S4: Shipped → Cancelled (should this be allowed? → negative)
+S5: Cancelled → Paid (invalid transition → error)
+```
+
+**Why**: State machines reveal invalid transitions that happy-path testing misses [6].
+
+## Pattern 4: Evidence-First Execution [8][12]
+
+Capture evidence before asserting:
+
+```bash
+# Execute
+OUTPUT=$(some-command 2>&1)
+EXIT_CODE=$?
+
+# Evidence captured — now assert
+echo "Exit code: $EXIT_CODE"
+echo "Output: $OUTPUT"
+
+# Check expectations
+if [ $EXIT_CODE -ne 0 ]; then
+  echo "FAIL: Expected exit code 0, got $EXIT_CODE"
+fi
+```
+
+**Why**: Raw evidence enables debugging when scenarios fail. Without it, you only know "it failed" but not why [8].
+
+## Pattern 5: Regression Guard Scenarios [5][12]
+
+After modifying existing code, verify unchanged behavior:
+
+```
+Feature: New search filter added to existing search
+
+S1 (regression): Existing search without filter still works
+S2 (regression): Existing search results unchanged
+S3 (happy):      New filter produces expected results
+S4 (integration): Filter + existing sort work together
+```
+
+**Why**: Most production bugs are regressions — features that used to work and stopped [12].
+
+## Pattern 6: Exploratory Charters [3][11]
+
+When criteria are vague, use structured exploration:
+
+```
+Charter: "Explore the file upload feature with hostile inputs"
+Time-box: 15 minutes
+
+Tour plan:
+1. Upload file with special characters in name
+2. Upload file with zero bytes
+3. Upload file with executable extension
+4. Simultaneous uploads of same file
+5. Upload during network interruption (kill mid-transfer)
+
+Record: What happened, what was surprising, what broke
+```
+
+**Why**: Scripted tests find expected bugs. Exploratory testing finds unexpected ones [3][11].
diff --git a/shared/skills/qa/references/scenario-templates.md b/shared/skills/qa/references/scenario-templates.md
new file mode 100644
index 00000000..6778d556
--- /dev/null
+++ b/shared/skills/qa/references/scenario-templates.md
@@ -0,0 +1,102 @@
+# QA Skill — Scenario Templates
+
+Ready-to-adapt templates for common feature types.
+
+## Template 1: CLI Command
+
+```
+Feature: New CLI command `devflow foo --bar`
+
+S1 (happy):     Run `devflow foo --bar value` → expected output, exit 0
+S2 (happy):     Run `devflow foo` with defaults → sensible default behavior
+S3 (boundary):  Run `devflow foo --bar ""` → validation error, exit 1
+S4 (boundary):  Run `devflow foo --bar` (missing value) → usage help, exit 1
+S5 (negative):  Run `devflow foo --unknown` → "unknown flag" error, exit 1
+S6 (negative):  Run in directory without required config → helpful error message
+S7 (regression): Existing commands still work unchanged
+```
+
+## Template 2: API Endpoint
+
+```
+Feature: POST /api/widgets
+
+S1 (happy):      Valid body → 201, widget in response, persisted in DB
+S2 (happy):      Minimal valid body → 201, defaults applied
+S3 (boundary):   Body at max allowed size → 201 or 413
+S4 (boundary):   Empty required field → 400 with field-specific error
+S5 (negative):   No auth header → 401
+S6 (negative):   Invalid JSON body → 400
+S7 (negative):   Duplicate unique field → 409
+S8 (integration): Created widget appears in GET /api/widgets
+```
+
+## Template 3: Configuration Change
+
+```
+Feature: New config option `maxRetries` in config.json
+
+S1 (happy):     Set maxRetries=3 → system retries 3 times on failure
+S2 (boundary):  Set maxRetries=0 → no retries (immediate fail)
+S3 (boundary):  Set maxRetries=100 → accepted (or capped with warning)
+S4 (negative):  Set maxRetries=-1 → validation error on startup
+S5 (boundary):  Omit maxRetries → sensible default applied
+S6 (negative):  Set maxRetries="abc" → type error on startup
+S7 (regression): Existing config options still work with new option added
+```
+
+## Template 4: File Processing
+
+```
+Feature: Import CSV data
+
+S1 (happy):      Valid CSV with 10 rows → 10 records imported
+S2 (happy):      CSV with headers → headers correctly mapped
+S3 (boundary):   Empty CSV (headers only) → 0 records, no error
+S4 (boundary):   Large CSV (10K rows) → all imported within timeout
+S5 (negative):   Malformed CSV (unmatched quotes) → error with line number
+S6 (negative):   Missing required column → error naming the column
+S7 (negative):   File not found → clear error message
+S8 (integration): Imported records queryable via existing API
+```
+
+## Template 5: Refactoring / Internal Change
+
+```
+Feature: Refactored auth middleware (no behavior change intended)
+
+S1 (regression): Login with valid credentials → same response as before
+S2 (regression): Login with invalid credentials → same error as before
+S3 (regression): Protected endpoint with valid token → accessible
+S4 (regression): Protected endpoint without token → 401
+S5 (regression): Token expiry behavior unchanged
+S6 (regression): Rate limiting behavior unchanged
+```
+
+## Template 6: Build System / Tooling Change
+
+```
+Feature: Added new build step
+
+S1 (happy):      Full build succeeds → all artifacts present
+S2 (happy):      Incremental build after change → only affected files rebuilt
+S3 (negative):   Build with missing dependency → clear error message
+S4 (regression): Existing build outputs unchanged
+S5 (regression): Build time not significantly degraded
+S6 (integration): Built artifacts installable/runnable
+```
+
+## Template 7: Web Component / Route
+
+```
+Feature: Login page
+
+S1 (happy):      Navigate to /login → form visible, all fields present
+S2 (happy):      Enter valid credentials → submit → redirect to /dashboard
+S3 (boundary):   Email with 255 chars → accepted or validation message shown
+S4 (negative):   Submit empty form → validation errors for required fields
+S5 (negative):   Wrong password → "Invalid credentials" message displayed
+S6 (integration): Successful login → user name visible on dashboard
+S7 (regression):  Existing pages still render correctly after changes
+S8 (negative):   Navigate to /login when already logged in → redirect to dashboard
+```
diff --git a/shared/skills/qa/references/sources.md b/shared/skills/qa/references/sources.md
new file mode 100644
index 00000000..1a149a9b
--- /dev/null
+++ b/shared/skills/qa/references/sources.md
@@ -0,0 +1,30 @@
+# QA Skill — Sources
+
+Canonical references for scenario-based acceptance testing methodology.
+
+## Bibliography
+
+| # | Author(s) | Title | Year | Topics | Access |
+|---|-----------|-------|------|--------|--------|
+| 1 | Myers, Sandler, Badgett | The Art of Software Testing, 3rd ed. | 2011 | Boundary value analysis, equivalence partitioning, test case design | Wiley |
+| 2 | Kaner, Bach, Pettichord | Lessons Learned in Software Testing | 2001 | Scenario design, QA philosophy, exploratory testing foundations | Wiley |
+| 3 | Hendrickson, E. | Explore It! Reduce Risk and Increase Confidence with Exploratory Testing | 2013 | Exploratory testing charters, heuristics, risk-based exploration | Pragmatic |
+| 4 | Adzic, G. | Specification by Example | 2011 | Acceptance criteria → executable specifications, living documentation | Manning |
+| 5 | Crispin, Gregory | Agile Testing: A Practical Guide | 2009 | Testing quadrants (Q1-Q4), acceptance testing in agile, whole-team quality | Addison-Wesley |
+| 6 | Copeland, L. | A Practitioner's Guide to Software Test Design | 2004 | Equivalence classes, decision tables, state transition testing, pairwise | Artech House |
+| 7 | Beizer, B. | Software Testing Techniques, 2nd ed. | 1990 | Domain testing, cause-effect graphing, syntax testing, graph-based methods | Van Nostrand Reinhold |
+| 8 | Freeman, Pryce | Growing Object-Oriented Software, Guided by Tests (GOOS) | 2009 | Acceptance test-driven development, walking skeleton, outside-in TDD | Addison-Wesley |
+| 9 | Smart, J.F. | BDD in Action: Behavior-Driven Development for the Whole Software Lifecycle | 2014 | Given/When/Then scenarios, living documentation, specification workshops | Manning |
+| 10 | Ammann, Offutt | Introduction to Software Testing, 2nd ed. | 2016 | Input space partitioning, graph coverage, logic coverage criteria | Cambridge |
+| 11 | Whittaker, J. | Exploratory Software Testing | 2009 | Test tours (feature, complexity, claims, configuration), risk-based exploration | Addison-Wesley |
+| 12 | Google Testing Blog | Various: Acceptance Testing practices | 2007-2024 | Test pyramids, acceptance test patterns, flaky test reduction | testing.googleblog.com |
+
+## How Sources Map to SKILL.md Sections
+
+| Section | Primary Sources | Supporting Sources |
+|---------|----------------|-------------------|
+| Scenario Types | [1] Myers, [2] Kaner, [6] Copeland | [5] Crispin Q2/Q3 quadrants |
+| Scenario Design from Criteria | [4] Adzic, [8] Freeman, [9] Smart | [5] Crispin acceptance testing |
+| Equivalence Partitioning | [1] Myers, [6] Copeland, [7] Beizer | [10] Ammann ISP |
+| Exploratory Heuristics | [2] Kaner, [3] Hendrickson, [11] Whittaker | [5] Crispin session-based |
+| Evidence Collection | [8] Freeman (observable behavior), [12] Google | [4] Adzic (living docs) |
diff --git a/shared/skills/qa/references/violations.md b/shared/skills/qa/references/violations.md
new file mode 100644
index 00000000..bdae9173
--- /dev/null
+++ b/shared/skills/qa/references/violations.md
@@ -0,0 +1,98 @@
+# QA Skill — Anti-Patterns
+
+Common QA anti-patterns that produce false confidence.
+
+## Violation 1: Happy-Path-Only Testing [1][2]
+
+```
+❌ Only testing the success case:
+   "User logs in with correct credentials → success"
+   No negative, boundary, or error scenarios
+
+✅ Complete coverage:
+   S1: Correct credentials → success (happy)
+   S2: Wrong password → error message (negative)
+   S3: Empty password → validation error (boundary)
+   S4: Locked account → account locked message (negative)
+   S5: SQL injection in username → sanitized, no breach (negative)
+```
+
+**Why**: Happy-path tests verify ~20% of behavior. Most defects live in error handling and boundaries [1].
+
+## Violation 2: Testing Implementation, Not Behavior [2][8]
+
+```
+❌ Testing internals:
+   "Verify that addToCart() calls inventoryService.reserve()"
+   "Check that the Redux store has items array with length 1"
+
+✅ Testing user-observable behavior:
+   "Add item to cart → cart count shows 1"
+   "Add item to cart → item appears in checkout"
+```
+
+**Why**: Implementation tests break on refactoring. Behavior tests break when features break [8].
+
+## Violation 3: Missing Evidence [8][12]
+
+```
+❌ No evidence captured:
+   "Ran the command and it seemed to work"
+   PASS (based on what?)
+
+✅ Evidence captured:
+   $ some-command --flag
+   Exit code: 0
+   Output: "Success: 3 items processed"
+   File created: output.json (142 bytes)
+   PASS (evidence: exit code 0, expected output string present)
+```
+
+**Why**: Without evidence, failures are unreproducible and pass results are unverifiable [8].
+
+## Violation 4: Skipping Boundary Analysis [1][6][7]
+
+```
+❌ Testing one valid value:
+   "Input: age=25 → accepted"
+   (Ignores: age=0, age=-1, age=121, age=NaN)
+
+✅ Boundary triplets:
+   age=-1 → rejected (below minimum)
+   age=0  → accepted (minimum)
+   age=1  → accepted (above minimum)
+   age=119 → accepted (below maximum)
+   age=120 → accepted (maximum)
+   age=121 → rejected (above maximum)
+```
+
+**Why**: ~65% of input-related defects occur at boundaries [7]. Single-value testing misses them all.
+
+## Violation 5: Untraceable Scenarios [4][9]
+
+```
+❌ Vague scenarios:
+   "Test that search works"
+   "Make sure the form validates"
+
+✅ Structured Given/When/Then:
+   Given: Database contains users "alice", "bob", "charlie"
+   When: Search for "ali"
+   Then: Results contain "alice" only, displayed within 200ms
+```
+
+**Why**: Vague scenarios are non-reproducible. Structured scenarios are executable specifications [4][9].
+
+## Violation 6: Ignoring Non-Functional Acceptance [5][12]
+
+```
+❌ Only functional testing:
+   "Upload works" (but takes 30 seconds for a 1MB file)
+
+✅ Including observable quality:
+   S1: Upload 1MB file → completes within 2 seconds (performance)
+   S2: Upload shows progress indicator (usability)
+   S3: Upload failure shows retry option (error recovery)
+```
+
+**Why**: Users experience performance, usability, and error recovery — not just correct outputs [5].
diff --git a/src/cli/commands/init.ts b/src/cli/commands/init.ts
index 2179e299..98e2eb3e 100644
--- a/src/cli/commands/init.ts
+++ b/src/cli/commands/init.ts
@@ -20,7 +20,7 @@ import {
   migrateMemoryFiles,
   type SecurityMode,
 } from '../utils/post-install.js';
-import { DEVFLOW_PLUGINS, LEGACY_SKILL_NAMES, LEGACY_COMMAND_NAMES, SHADOW_RENAMES, buildAssetMaps, buildFullSkillsMap, type PluginDefinition } from '../plugins.js';
+import { DEVFLOW_PLUGINS, LEGACY_PLUGIN_NAMES, LEGACY_SKILL_NAMES, LEGACY_COMMAND_NAMES, SHADOW_RENAMES, buildAssetMaps, buildFullSkillsMap, type PluginDefinition } from '../plugins.js';
 import { detectPlatform, detectShell, getProfilePath, getSafeDeleteInfo, hasSafeDelete } from '../utils/safe-delete.js';
 import { generateSafeDeleteBlock, installToProfile, removeFromProfile, getInstalledVersion, SAFE_DELETE_BLOCK_VERSION } from '../utils/safe-delete-install.js';
 import { addAmbientHook } from './ambient.js';
@@ -125,7 +125,8 @@ export function parsePluginSelection(
 ): { selected: string[]; invalid: string[] } {
   const selected = input.split(',').map(p => {
     const trimmed = p.trim();
-    return trimmed.startsWith('devflow-') ? trimmed : `devflow-${trimmed}`;
+    const normalized = trimmed.startsWith('devflow-') ? trimmed : `devflow-${trimmed}`;
+    return LEGACY_PLUGIN_NAMES[normalized] ?? normalized;
   });
 
   const validNames = validPlugins.map(p => p.name);
@@ -309,7 +310,7 @@ export const initCommand = new Command('init')
         'devflow-typescript': 'TypeScript patterns',
         'devflow-react': 'React patterns',
         'devflow-accessibility': 'WCAG compliance',
-        'devflow-frontend-design': 'typography, color, spacing',
+        'devflow-ui-design': 'typography, color, spacing',
         'devflow-go': 'Go patterns',
         'devflow-java': 'Java patterns',
         'devflow-python': 'Python patterns',
diff --git a/src/cli/plugins.ts b/src/cli/plugins.ts
index 5398a3c2..04f28f7c 100644
--- a/src/cli/plugins.ts
+++ b/src/cli/plugins.ts
@@ -60,8 +60,8 @@ export const DEVFLOW_PLUGINS: PluginDefinition[] = [
     name: 'devflow-implement',
     description: 'Complete task implementation workflow with exploration, planning, and coding',
     commands: ['/implement'],
-    agents: ['git', 'skimmer', 'synthesizer', 'coder', 'simplifier', 'scrutinizer', 'shepherd', 'validator'],
-    skills: ['agent-teams', 'implementation-patterns', 'knowledge-persistence', 'self-review', 'worktree-support'],
+    agents: ['git', 'skimmer', 'synthesizer', 'coder', 'simplifier', 'scrutinizer', 'evaluator', 'tester', 'validator'],
+    skills: ['agent-teams', 'implementation-patterns', 'knowledge-persistence', 'qa', 'self-review', 'worktree-support'],
   },
   {
     name: 'devflow-code-review',
@@ -95,7 +95,7 @@ export const DEVFLOW_PLUGINS: PluginDefinition[] = [
     name: 'devflow-ambient',
     description: 'Ambient mode — intent classification with proportional agent orchestration',
     commands: ['/ambient'],
-    agents: ['coder', 'validator', 'simplifier', 'scrutinizer', 'shepherd', 'skimmer', 'reviewer', 'git', 'synthesizer', 'resolver'],
+    agents: ['coder', 'validator', 'simplifier', 'scrutinizer', 'evaluator', 'tester', 'skimmer', 'reviewer', 'git', 'synthesizer', 'resolver'],
     skills: [
       'ambient-router',
       'implementation-orchestration',
@@ -117,6 +117,7 @@ export const DEVFLOW_PLUGINS: PluginDefinition[] = [
       'documentation',
       'implementation-patterns',
       'knowledge-persistence',
+      'qa',
       'worktree-support',
     ],
   },
@@ -153,8 +154,8 @@ export const DEVFLOW_PLUGINS: PluginDefinition[] = [
     optional: true,
   },
   {
-    name: 'devflow-frontend-design',
-    description: 'Frontend design patterns - typography, color systems, spacing, motion, responsive design',
+    name: 'devflow-ui-design',
+    description: 'UI design patterns - typography, color systems, spacing, motion, responsive design',
     commands: [],
     agents: [],
     skills: ['ui-design'],
@@ -194,6 +195,14 @@ export const DEVFLOW_PLUGINS: PluginDefinition[] = [
   },
 ];
 
+/**
+ * Deprecated plugin names from old installations.
+ * Maps old name → new name; applied by parsePluginSelection (user input) and resolvePluginList (existing manifest entries).
+ */
+export const LEGACY_PLUGIN_NAMES: Record<string, string> = {
+  'devflow-frontend-design': 'devflow-ui-design',
+};
+
 /**
  * Deprecated command names from old installations.
  * Used during init to clean up stale command files on upgrade.
@@ -202,6 +211,14 @@ export const LEGACY_COMMAND_NAMES: string[] = [
   'review',
 ];
 
+/**
+ * Deprecated agent names from old installations.
+ * Used to clean up stale agent files on upgrade: installViaFileCopy removes `<name>.md` for each entry.
+ */
+export const LEGACY_AGENT_NAMES: string[] = [
+  'shepherd',
+];
+
 /**
  * Deprecated skill names from old installations (prefixed with devflow-).
  * Used during uninstall to clean up legacy installs.
@@ -309,6 +326,8 @@ export const LEGACY_SKILL_NAMES: string[] = [
   'database',
   'dependencies',
   'documentation',
+  // v2.0.0 new skills: bare names for pre-namespace installs
+  'qa',
   // v2.0.0 git consolidation: prefixed old names for cleanup
   'devflow:git-safety',
   'devflow:git-workflow',
diff --git a/src/cli/utils/installer.ts b/src/cli/utils/installer.ts
index 28090bf3..9c1a5703 100644
--- a/src/cli/utils/installer.ts
+++ b/src/cli/utils/installer.ts
@@ -2,7 +2,7 @@ import { promises as fs } from 'fs';
 import * as path from 'path';
 import { execSync } from 'child_process';
 import type { PluginDefinition } from '../plugins.js';
-import { DEVFLOW_PLUGINS, prefixSkillName } from '../plugins.js';
+import { DEVFLOW_PLUGINS, LEGACY_AGENT_NAMES, prefixSkillName } from '../plugins.js';
 
 /**
  * Minimal spinner interface matching @clack/prompts spinner().
@@ -166,6 +166,7 @@ export async function installViaFileCopy(options: FileCopyOptions): Promise<void
 
   // Install commands and agents from selected plugins (with deduplication)
   spinner.message('Installing commands and agents...');
+  const agentsTarget = path.join(claudeDir, 'agents', 'devflow');
   for (const plugin of plugins) {
     const pluginSourceDir = path.join(pluginsDir, plugin.name);
 
@@ -193,7 +194,6 @@ export async function installViaFileCopy(options: FileCopyOptions): Promise<void
 
     // Install agents (deduplicated)
     const agentsSource = path.join(pluginSourceDir, 'agents');
-    const agentsTarget = path.join(claudeDir, 'agents', 'devflow');
     try {
       const files = await fs.readdir(agentsSource);
       if (files.length > 0) {
@@ -211,6 +211,13 @@ export async function installViaFileCopy(options: FileCopyOptions): Promise<void
     } catch { /* no agents directory */ }
   }
 
+  // Clean up legacy agent files (renamed or removed agents from prior versions)
+  for (const legacyAgent of LEGACY_AGENT_NAMES) {
+    try {
+      await fs.rm(path.join(agentsTarget, `${legacyAgent}.md`), { force: true });
+    } catch { /* ignore */ }
+  }
+
   // Install skills from ALL plugins (skillsMap covers all plugins, not just selected).
   // Skills are tiny markdown files — universal install ensures orchestration skills
   // can spawn agents that depend on skills from other plugins.
diff --git a/src/cli/utils/manifest.ts b/src/cli/utils/manifest.ts
index 0aaafef2..50f2414c 100644
--- a/src/cli/utils/manifest.ts
+++ b/src/cli/utils/manifest.ts
@@ -1,5 +1,6 @@
 import { promises as fs } from 'fs';
 import * as path from 'path';
+import { LEGACY_PLUGIN_NAMES } from '../plugins.js';
 
 /**
  * Manifest data tracked for each DevFlow installation.
@@ -133,7 +134,8 @@ export function resolvePluginList(
   isPartialInstall: boolean,
 ): string[] {
   if (existingManifest && isPartialInstall) {
-    return mergeManifestPlugins(existingManifest.plugins, installedPluginNames);
+    const cleaned = existingManifest.plugins.map(p => LEGACY_PLUGIN_NAMES[p] ?? p);
+    return mergeManifestPlugins(cleaned, installedPluginNames);
   }
   return installedPluginNames;
 }
diff --git a/tests/init-logic.test.ts b/tests/init-logic.test.ts
index c1fa5eca..a52d2868 100644
--- a/tests/init-logic.test.ts
+++ b/tests/init-logic.test.ts
@@ -58,6 +58,18 @@ describe('parsePluginSelection', () => {
     expect(selected).toEqual(['devflow-implement']);
     expect(invalid).toEqual([]);
   });
+
+  it('remaps legacy plugin names', () => {
+    const { selected, invalid } = parsePluginSelection('frontend-design', DEVFLOW_PLUGINS);
+    expect(selected).toEqual(['devflow-ui-design']);
+    expect(invalid).toEqual([]);
+  });
+
+  it('remaps legacy plugin names with prefix', () => {
+    const { selected, invalid } = parsePluginSelection('devflow-frontend-design', DEVFLOW_PLUGINS);
+    expect(selected).toEqual(['devflow-ui-design']);
+    expect(invalid).toEqual([]);
+  });
 });
 
 describe('substituteSettingsTemplate', () => {
diff --git a/tests/manifest.test.ts b/tests/manifest.test.ts
index 7db914ae..8c315bb6 100644
--- a/tests/manifest.test.ts
+++ b/tests/manifest.test.ts
@@ -346,4 +346,17 @@ describe('resolvePluginList', () => {
     );
     expect(result).toEqual(['devflow-code-review']);
   });
+
+  it('remaps legacy plugin names in existing manifest on partial install', () => {
+    const legacyManifest: ManifestData = {
+      ...existingManifest,
+      plugins: ['devflow-core-skills', 'devflow-frontend-design'],
+    };
+    const result = resolvePluginList(
+      ['devflow-code-review'],
+      legacyManifest,
+      true,
+    );
+    expect(result).toEqual(['devflow-core-skills', 'devflow-ui-design', 'devflow-code-review']);
+  });
 });
diff --git a/tests/plugins.test.ts b/tests/plugins.test.ts
index f9e939c9..36c4b849 100644
--- a/tests/plugins.test.ts
+++ b/tests/plugins.test.ts
@@ -7,6 +7,7 @@ import {
   buildFullSkillsMap,
   SHADOW_RENAMES,
   LEGACY_SKILL_NAMES,
+  LEGACY_AGENT_NAMES,
   type PluginDefinition,
 } from '../src/cli/plugins.js';
 
@@ -157,7 +158,7 @@ describe('optional plugin flag', () => {
     'devflow-typescript',
     'devflow-react',
     'devflow-accessibility',
-    'devflow-frontend-design',
+    'devflow-ui-design',
     'devflow-go',
     'devflow-java',
     'devflow-python',
@@ -218,6 +219,26 @@ describe('optional plugin flag', () => {
     expect(ambient!.agents).toContain('resolver');
   });
 
+  it('devflow-implement declares evaluator and tester agents and qa skill', () => {
+    const implement = DEVFLOW_PLUGINS.find(p => p.name === 'devflow-implement');
+    expect(implement).toBeDefined();
+    // evaluator and tester are declared so uninstalling ambient doesn't break implement
+    expect(implement!.agents).toContain('evaluator');
+    expect(implement!.agents).toContain('tester');
+    // qa skill is required for the tester agent
+    expect(implement!.skills).toContain('qa');
+  });
+
+  it('devflow-ambient declares evaluator, tester agents and qa skill', () => {
+    const ambient = DEVFLOW_PLUGINS.find(p => p.name === 'devflow-ambient');
+    expect(ambient).toBeDefined();
+    // Ambient orchestrates the full implement pipeline, so evaluator and tester must be declared
+    expect(ambient!.agents).toContain('evaluator');
+    expect(ambient!.agents).toContain('tester');
+    // qa skill is required for the tester agent
+    expect(ambient!.skills).toContain('qa');
+  });
+
   it('devflow-core-skills does not contain language/ecosystem skills', () => {
     const coreSkills = DEVFLOW_PLUGINS.find(p => p.name === 'devflow-core-skills');
     expect(coreSkills).toBeDefined();
@@ -252,3 +273,15 @@ describe('SHADOW_RENAMES consistency', () => {
     }
   });
 });
+
+describe('LEGACY_AGENT_NAMES consistency', () => {
+  it('no legacy agent name appears in any current plugin agents array', () => {
+    const currentAgents = getAllAgentNames();
+    for (const legacyName of LEGACY_AGENT_NAMES) {
+      expect(
+        currentAgents,
+        `LEGACY_AGENT_NAMES entry '${legacyName}' must not appear in getAllAgentNames() — remove it from LEGACY_AGENT_NAMES or update the plugin registry`,
+      ).not.toContain(legacyName);
+    }
+  });
+});