From a1b10a969e2d8172be5717a04955206c15564a23 Mon Sep 17 00:00:00 2001 From: Molly He Date: Tue, 24 Feb 2026 17:59:11 -0800 Subject: [PATCH 1/5] Add cleanup mechanism for MC dataset integ test --- sagemaker-train/src/sagemaker/ai_registry/dataset.py | 1 - sagemaker-train/tests/integ/ai_registry/conftest.py | 9 ++------- .../tests/integ/ai_registry/test_dataset.py | 11 +++++++---- 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/sagemaker-train/src/sagemaker/ai_registry/dataset.py b/sagemaker-train/src/sagemaker/ai_registry/dataset.py index 92aeef23f3..eb1bd3505a 100644 --- a/sagemaker-train/src/sagemaker/ai_registry/dataset.py +++ b/sagemaker-train/src/sagemaker/ai_registry/dataset.py @@ -389,7 +389,6 @@ def get_versions(self) -> List["DataSet"]: return datasets - @classmethod @classmethod @_telemetry_emitter(feature=Feature.MODEL_CUSTOMIZATION, func_name="DataSet.get_all") def get_all(cls, max_results: Optional[int] = None, sagemaker_session=None): diff --git a/sagemaker-train/tests/integ/ai_registry/conftest.py b/sagemaker-train/tests/integ/ai_registry/conftest.py index 9cbbc18145..9f7c2069f1 100644 --- a/sagemaker-train/tests/integ/ai_registry/conftest.py +++ b/sagemaker-train/tests/integ/ai_registry/conftest.py @@ -106,13 +106,8 @@ def cleanup_list(): """Track resources for cleanup.""" resources = [] yield resources - for evaluator in resources: + for resource in resources: try: - from sagemaker.ai_registry.air_hub import AIRHub - AIRHub.delete_hub_content( - hub_content_type=evaluator.hub_content_type, - hub_content_name=evaluator.name, - hub_content_version=evaluator.version - ) + resource.delete() except Exception: pass diff --git a/sagemaker-train/tests/integ/ai_registry/test_dataset.py b/sagemaker-train/tests/integ/ai_registry/test_dataset.py index f29c9e09e0..cfefcdb85e 100644 --- a/sagemaker-train/tests/integ/ai_registry/test_dataset.py +++ b/sagemaker-train/tests/integ/ai_registry/test_dataset.py @@ -129,9 +129,10 @@ def 
test_create_dataset_from_s3_nova_eval(self, unique_name, test_bucket, cleanu cleanup_list.append(dataset) assert dataset.name == unique_name - def test_get_dataset(self, unique_name, sample_jsonl_file): + def test_get_dataset(self, unique_name, sample_jsonl_file, cleanup_list): """Test retrieving dataset by name.""" created = DataSet.create(name=unique_name, source=sample_jsonl_file, wait=False) + cleanup_list.append(created) retrieved = DataSet.get(unique_name) assert retrieved.name == created.name assert retrieved.arn == created.arn @@ -141,16 +142,18 @@ def test_get_all_datasets(self): datasets = list(DataSet.get_all(max_results=5)) assert isinstance(datasets, list) - def test_dataset_refresh(self, unique_name, sample_jsonl_file): + def test_dataset_refresh(self, unique_name, sample_jsonl_file, cleanup_list): """Test refreshing dataset status.""" dataset = DataSet.create(name=unique_name, source=sample_jsonl_file, wait=False) + cleanup_list.append(dataset) dataset.refresh() time.sleep(3) assert dataset.status in [HubContentStatus.IMPORTING.value, HubContentStatus.AVAILABLE.value] - def test_dataset_get_versions(self, unique_name, sample_jsonl_file): + def test_dataset_get_versions(self, unique_name, sample_jsonl_file, cleanup_list): """Test getting dataset versions.""" dataset = DataSet.create(name=unique_name, source=sample_jsonl_file, wait=False) + cleanup_list.append(dataset) versions = dataset.get_versions() assert len(versions) >= 1 assert all(isinstance(v, DataSet) for v in versions) @@ -178,7 +181,7 @@ def test_create_dataset_version(self, unique_name, sample_jsonl_file, cleanup_li """Test creating new dataset version.""" dataset = DataSet.create(name=unique_name, source=sample_jsonl_file, wait=False) result = dataset.create_version(sample_jsonl_file) - cleanup_list.append(cleanup_list) + cleanup_list.append(dataset) assert result is True def test_dataset_validation_invalid_extension(self, unique_name): From 67f3163723fd62e0a2d5585cf626eb4321092c8a Mon 
Sep 17 00:00:00 2001 From: Molly He Date: Wed, 25 Feb 2026 14:01:34 -0800 Subject: [PATCH 2/5] Fix integ test for test_llm_as_judge_base_model_fix --- .../train/test_llm_as_judge_base_model_fix.py | 42 +++++++++++++------ 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py b/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py index f2d28e1ac2..cbb715796b 100644 --- a/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py +++ b/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py @@ -144,14 +144,23 @@ def test_base_model_evaluation_uses_correct_weights(self): # Check that we have both base and custom inference steps step_names = [step.name for step in execution.status.step_details] if execution.status.step_details else [] - logger.info(f"Pipeline steps: {step_names}") + logger.info(f"Pipeline steps ({len(step_names)}): {step_names}") - # Verify both inference steps exist - has_base_step = any("BaseInference" in name for name in step_names) - has_custom_step = any("CustomInference" in name for name in step_names) + # If no steps yet, wait a bit for pipeline to initialize + if not step_names: + logger.info("No steps found yet, waiting for pipeline initialization...") + import time + time.sleep(10) + execution.refresh() + step_names = [step.name for step in execution.status.step_details] if execution.status.step_details else [] + logger.info(f"Pipeline steps after wait ({len(step_names)}): {step_names}") - assert has_base_step, "Pipeline should have EvaluateBaseInferenceModel step" - assert has_custom_step, "Pipeline should have EvaluateCustomInferenceModel step" + # Verify both inference steps exist (case-insensitive, flexible matching) + has_base_step = any("base" in name.lower() and "inference" in name.lower() for name in step_names) + has_custom_step = any("custom" in name.lower() and "inference" in name.lower() for name in step_names) 
+ + assert has_base_step, f"Pipeline should have base inference step. Found steps: {step_names}" + assert has_custom_step, f"Pipeline should have custom inference step. Found steps: {step_names}" logger.info(f"✓ Pipeline has both base and custom inference steps") logger.info(f" Base model step: {'Found' if has_base_step else 'Missing'}") @@ -259,14 +268,23 @@ def test_base_model_false_still_works(self): execution.refresh() step_names = [step.name for step in execution.status.step_details] if execution.status.step_details else [] - logger.info(f"Pipeline steps: {step_names}") + logger.info(f"Pipeline steps ({len(step_names)}): {step_names}") + + # If no steps yet, wait a bit for pipeline to initialize + if not step_names: + logger.info("No steps found yet, waiting for pipeline initialization...") + import time + time.sleep(10) + execution.refresh() + step_names = [step.name for step in execution.status.step_details] if execution.status.step_details else [] + logger.info(f"Pipeline steps after wait ({len(step_names)}): {step_names}") - # Should NOT have base inference step - has_base_step = any("BaseInference" in name for name in step_names) - has_custom_step = any("CustomInference" in name for name in step_names) + # Should NOT have base inference step (case-insensitive, flexible matching) + has_base_step = any("base" in name.lower() and "inference" in name.lower() for name in step_names) + has_custom_step = any("custom" in name.lower() and "inference" in name.lower() for name in step_names) - assert not has_base_step, "Pipeline should NOT have EvaluateBaseInferenceModel step when evaluate_base_model=False" - assert has_custom_step, "Pipeline should have EvaluateCustomInferenceModel step" + assert not has_base_step, f"Pipeline should NOT have base inference step when evaluate_base_model=False. Found steps: {step_names}" + assert has_custom_step, f"Pipeline should have custom inference step. 
Found steps: {step_names}" logger.info(f"✓ Pipeline structure correct for evaluate_base_model=False") logger.info(f" Base model step: {'Found (ERROR!)' if has_base_step else 'Not present (correct)'}") From 2b6fc645524c3b7ebe26cbdcbf6b035878b38000 Mon Sep 17 00:00:00 2001 From: Molly He Date: Thu, 26 Feb 2026 11:40:17 -0800 Subject: [PATCH 3/5] Further fix for the same LLM as judge integ test failure --- .../src/sagemaker/train/evaluate/base_evaluator.py | 10 +++++++++- .../src/sagemaker/train/evaluate/pipeline_templates.py | 6 ++++-- .../integ/train/test_llm_as_judge_base_model_fix.py | 2 ++ 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/sagemaker-train/src/sagemaker/train/evaluate/base_evaluator.py b/sagemaker-train/src/sagemaker/train/evaluate/base_evaluator.py index c1f0406c9f..22c30c512a 100644 --- a/sagemaker-train/src/sagemaker/train/evaluate/base_evaluator.py +++ b/sagemaker-train/src/sagemaker/train/evaluate/base_evaluator.py @@ -701,10 +701,18 @@ def _get_base_template_context( Returns: dict: Base template context dictionary """ + # Generate default mlflow_experiment_name if not provided + # This is required by AWS when ModelPackageGroupArn is not provided in training jobs + mlflow_experiment_name = self.mlflow_experiment_name + if not mlflow_experiment_name and self.mlflow_resource_arn: + # Use pipeline_name as default experiment name + mlflow_experiment_name = '{{ pipeline_name }}' + _logger.info("No mlflow_experiment_name provided, using pipeline_name as default") + return { 'role_arn': role_arn, 'mlflow_resource_arn': self.mlflow_resource_arn, - 'mlflow_experiment_name': self.mlflow_experiment_name, + 'mlflow_experiment_name': mlflow_experiment_name, 'mlflow_run_name': self.mlflow_run_name, 'model_package_group_arn': model_package_group_arn, 'source_model_package_arn': self._source_model_package_arn, diff --git a/sagemaker-train/src/sagemaker/train/evaluate/pipeline_templates.py 
b/sagemaker-train/src/sagemaker/train/evaluate/pipeline_templates.py index 79eb565fa6..12daa12f0b 100644 --- a/sagemaker-train/src/sagemaker/train/evaluate/pipeline_templates.py +++ b/sagemaker-train/src/sagemaker/train/evaluate/pipeline_templates.py @@ -337,7 +337,8 @@ "Name": "EvaluateBaseInferenceModel", "Type": "Training", "Arguments": { - "TrainingJobName": "BaseInference", + "TrainingJobName": "BaseInference",{% if mlflow_experiment_name %} + "MlflowExperimentName": "{{ mlflow_experiment_name }}",{% endif %} "RoleArn": "{{ role_arn }}", "ServerlessJobConfig": { "BaseModelArn": "{{ base_model_arn }}", @@ -1007,7 +1008,8 @@ "Name": "EvaluateBaseInferenceModel", "Type": "Training", "Arguments": { - "TrainingJobName": "BaseInference", + "TrainingJobName": "BaseInference",{% if mlflow_experiment_name %} + "MlflowExperimentName": "{{ mlflow_experiment_name }}",{% endif %} "RoleArn": "{{ role_arn }}", "ServerlessJobConfig": { "BaseModelArn": "{{ base_model_arn }}", diff --git a/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py b/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py index cbb715796b..a671ada252 100644 --- a/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py +++ b/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py @@ -112,6 +112,7 @@ def test_base_model_evaluation_uses_correct_weights(self): builtin_metrics=TEST_CONFIG["builtin_metrics"], custom_metrics=TEST_CONFIG["custom_metrics_json"], s3_output_path=TEST_CONFIG["s3_output_path"], + mlflow_resource_arn=TEST_CONFIG["mlflow_tracking_server_arn"], evaluate_base_model=TEST_CONFIG["evaluate_base_model"], ) @@ -247,6 +248,7 @@ def test_base_model_false_still_works(self): dataset=TEST_CONFIG["dataset_s3_uri"], builtin_metrics=TEST_CONFIG["builtin_metrics"], s3_output_path=TEST_CONFIG["s3_output_path"], + mlflow_resource_arn=TEST_CONFIG["mlflow_tracking_server_arn"], evaluate_base_model=False, # Only evaluate custom model ) From 
564819ea8dedf99d411087a030752262248f633d Mon Sep 17 00:00:00 2001 From: Molly He Date: Thu, 26 Feb 2026 14:09:06 -0800 Subject: [PATCH 4/5] Rollback fix in src code --- .../train/evaluate/pipeline_templates.py | 6 ++---- .../train/test_llm_as_judge_base_model_fix.py | 19 +++++++++++-------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/sagemaker-train/src/sagemaker/train/evaluate/pipeline_templates.py b/sagemaker-train/src/sagemaker/train/evaluate/pipeline_templates.py index 12daa12f0b..79eb565fa6 100644 --- a/sagemaker-train/src/sagemaker/train/evaluate/pipeline_templates.py +++ b/sagemaker-train/src/sagemaker/train/evaluate/pipeline_templates.py @@ -337,8 +337,7 @@ "Name": "EvaluateBaseInferenceModel", "Type": "Training", "Arguments": { - "TrainingJobName": "BaseInference",{% if mlflow_experiment_name %} - "MlflowExperimentName": "{{ mlflow_experiment_name }}",{% endif %} + "TrainingJobName": "BaseInference", "RoleArn": "{{ role_arn }}", "ServerlessJobConfig": { "BaseModelArn": "{{ base_model_arn }}", @@ -1008,8 +1007,7 @@ "Name": "EvaluateBaseInferenceModel", "Type": "Training", "Arguments": { - "TrainingJobName": "BaseInference",{% if mlflow_experiment_name %} - "MlflowExperimentName": "{{ mlflow_experiment_name }}",{% endif %} + "TrainingJobName": "BaseInference", "RoleArn": "{{ role_arn }}", "ServerlessJobConfig": { "BaseModelArn": "{{ base_model_arn }}", diff --git a/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py b/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py index a671ada252..29d84113ed 100644 --- a/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py +++ b/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py @@ -112,7 +112,6 @@ def test_base_model_evaluation_uses_correct_weights(self): builtin_metrics=TEST_CONFIG["builtin_metrics"], custom_metrics=TEST_CONFIG["custom_metrics_json"], s3_output_path=TEST_CONFIG["s3_output_path"], - 
mlflow_resource_arn=TEST_CONFIG["mlflow_tracking_server_arn"], evaluate_base_model=TEST_CONFIG["evaluate_base_model"], ) @@ -216,14 +215,19 @@ def test_base_model_evaluation_uses_correct_weights(self): if execution.status.failure_reason: logger.error(f" Failure reason: {execution.status.failure_reason}") - # Log step failures + # Log step failures with detailed information if execution.status.step_details: - logger.error("\nFailed steps:") + logger.error("\n" + "=" * 80) + logger.error("DETAILED STEP FAILURE INFORMATION:") + logger.error("=" * 80) for step in execution.status.step_details: - if "failed" in step.status.lower(): - logger.error(f" {step.name}: {step.status}") - if step.failure_reason: - logger.error(f" Reason: {step.failure_reason}") + logger.error(f"\nStep: {step.name}") + logger.error(f" Status: {step.status}") + logger.error(f" Start Time: {step.start_time}") + logger.error(f" End Time: {step.end_time}") + if step.failure_reason: + logger.error(f" ❌ FAILURE REASON: {step.failure_reason}") + logger.error("=" * 80) # Re-raise to fail the test raise @@ -248,7 +252,6 @@ def test_base_model_false_still_works(self): dataset=TEST_CONFIG["dataset_s3_uri"], builtin_metrics=TEST_CONFIG["builtin_metrics"], s3_output_path=TEST_CONFIG["s3_output_path"], - mlflow_resource_arn=TEST_CONFIG["mlflow_tracking_server_arn"], evaluate_base_model=False, # Only evaluate custom model ) From 79a41119f8ca5d79d3a1ec5fc02aa48b5df5ed50 Mon Sep 17 00:00:00 2001 From: Molly He Date: Fri, 27 Feb 2026 11:19:35 -0800 Subject: [PATCH 5/5] Update error handling for sagemaker-train show results util --- .../src/sagemaker/train/common_utils/show_results_utils.py | 2 ++ .../tests/integ/train/test_llm_as_judge_base_model_fix.py | 6 +++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/sagemaker-train/src/sagemaker/train/common_utils/show_results_utils.py b/sagemaker-train/src/sagemaker/train/common_utils/show_results_utils.py index cb0494f6fc..f7ab97e1f6 100644 ---
a/sagemaker-train/src/sagemaker/train/common_utils/show_results_utils.py +++ b/sagemaker-train/src/sagemaker/train/common_utils/show_results_utils.py @@ -341,6 +341,8 @@ def _parse_response(response_str: str) -> str: def _format_score(score: float) -> str: """Format score as percentage: 0.8333 -> '83.3%' """ + if score is None: + return "N/A" return f"{score * 100:.1f}%" diff --git a/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py b/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py index 29d84113ed..dcef7d4881 100644 --- a/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py +++ b/sagemaker-train/tests/integ/train/test_llm_as_judge_base_model_fix.py @@ -184,7 +184,11 @@ def test_base_model_evaluation_uses_correct_weights(self): # Display results logger.info(" Fetching results (first 10 rows)...") - execution.show_results(limit=10, offset=0, show_explanations=False) + try: + execution.show_results(limit=10, offset=0, show_explanations=False) + except (TypeError, ValueError) as e: + logger.warning(f" Could not display results due to formatting issue: {e}") + logger.info(" Results are available but display utility has a bug with None scores") # Verify S3 output path assert execution.s3_output_path is not None