diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_json_utils.py b/tests/test_json_utils.py
new file mode 100644
index 0000000..d9a4b06
--- /dev/null
+++ b/tests/test_json_utils.py
@@ -0,0 +1,112 @@
+"""Tests for skillopt.utils.json_utils."""
+from __future__ import annotations
+
+import pytest
+
+from skillopt.utils.json_utils import extract_json, extract_json_array
+
+
+class TestExtractJson:
+    """extract_json — extract a JSON object from LLM response text."""
+
+    def test_code_fence_json(self) -> None:
+        text = 'Some text\n```json\n{"key": "value", "num": 42}\n```\nmore text'
+        assert extract_json(text) == {"key": "value", "num": 42}
+
+    def test_bare_json_object(self) -> None:
+        text = 'The result is {"answer": "yes", "score": 0.95}.'
+        assert extract_json(text) == {"answer": "yes", "score": 0.95}
+
+    def test_code_fence_takes_precedence(self) -> None:
+        """If fence content parses successfully it should be preferred over bare."""
+        text = (
+            '```json\n{"source": "fence"}\n```\n'
+            'Then also {"source": "bare"}'
+        )
+        assert extract_json(text) == {"source": "fence"}
+
+    def test_broken_fence_falls_back_to_bare(self) -> None:
+        """When fence content is invalid JSON, fall back to bare {...} match."""
+        # Use invalid fence content that has no braces so the greedy bare
+        # regex doesn't swallow the valid object.
+        text = (
+            '```json\nnot json at all\n```\n'
+            'Answer: {"fallback": "yes"}'
+        )
+        assert extract_json(text) == {"fallback": "yes"}
+
+    def test_nested_json(self) -> None:
+        text = '```json\n{"outer": {"inner": [1, 2, 3]}}\n```'
+        assert extract_json(text) == {"outer": {"inner": [1, 2, 3]}}
+
+    def test_no_json_returns_none(self) -> None:
+        assert extract_json("Just plain text without JSON.") is None
+
+    def test_empty_string_returns_none(self) -> None:
+        assert extract_json("") is None
+
+    def test_malformed_json_returns_none(self) -> None:
+        assert extract_json("{broken") is None
+
+    def test_empty_json_object(self) -> None:
+        assert extract_json('{"empty": {}}') == {"empty": {}}
+
+    def test_json_with_escaped_chars(self) -> None:
+        text = '{"message": "hello\\nworld"}'
+        assert extract_json(text) == {"message": "hello\nworld"}
+
+    def test_only_fence_with_no_json_syntax(self) -> None:
+        """Code fences without valid JSON content should not match."""
+        text = "```\nplain code block\n```"
+        assert extract_json(text) is None
+
+
+class TestExtractJsonArray:
+    """extract_json_array — extract a JSON array from LLM response text."""
+
+    def test_code_fence_array(self) -> None:
+        text = '```json\n["a", "b", "c"]\n```'
+        assert extract_json_array(text) == ["a", "b", "c"]
+
+    def test_bare_array(self) -> None:
+        text = "The items are [1, 2, 3]."
+        assert extract_json_array(text) == [1, 2, 3]
+
+    def test_code_fence_takes_precedence(self) -> None:
+        text = (
+            '```json\n["from_fence"]\n```\n'
+            'also ["from_bare"]'
+        )
+        assert extract_json_array(text) == ["from_fence"]
+
+    def test_broken_fence_falls_back_to_bare(self) -> None:
+        text = (
+            '```json\nnot json at all\n```\n'
+            'values: [42]'
+        )
+        assert extract_json_array(text) == [42]
+
+    def test_nested_array(self) -> None:
+        text = '```json\n[[1, 2], [3, 4]]\n```'
+        assert extract_json_array(text) == [[1, 2], [3, 4]]
+
+    def test_no_array_returns_none(self) -> None:
+        assert extract_json_array("no brackets here") is None
+
+    def test_empty_string_returns_none(self) -> None:
+        assert extract_json_array("") is None
+
+    def test_malformed_array_returns_none(self) -> None:
+        assert extract_json_array("[1, 2, ") is None
+
+    def test_empty_json_array(self) -> None:
+        assert extract_json_array("[]") == []
+
+    def test_array_of_objects(self) -> None:
+        text = '[{"x": 1}, {"x": 2}]'
+        assert extract_json_array(text) == [{"x": 1}, {"x": 2}]
+
+    def test_object_not_confused_with_array(self) -> None:
+        """extract_json_array should not match a bare JSON object."""
+        text = '{"this is an object": true}'
+        assert extract_json_array(text) is None
diff --git a/tests/test_scoring.py b/tests/test_scoring.py
new file mode 100644
index 0000000..281c6b8
--- /dev/null
+++ b/tests/test_scoring.py
@@ -0,0 +1,115 @@
+"""Tests for skillopt.utils.scoring."""
+from __future__ import annotations
+
+import pytest
+
+from skillopt.utils.scoring import compute_score, skill_hash
+
+
+class _ResultObject:
+    """Minimal object with hard/soft attrs (duck-typing path)."""
+
+    def __init__(self, hard: float, soft: float) -> None:
+        self.hard = hard
+        self.soft = soft
+
+
+class TestComputeScore:
+    """compute_score — hard/soft accuracy from a list of episode results.
+
+    Averages over several results are compared with pytest.approx so the
+    assertions do not depend on floating-point summation details.
+    """
+
+    def test_empty_list_returns_zeros(self) -> None:
+        assert compute_score([]) == (0.0, 0.0)
+
+    def test_dict_results_happy_path(self) -> None:
+        results = [
+            {"hard": 1, "soft": 0.8},
+            {"hard": 0, "soft": 0.5},
+            {"hard": 1, "soft": 0.9},
+        ]
+        hard, soft = compute_score(results)
+        assert hard == pytest.approx(2 / 3)
+        assert soft == pytest.approx((0.8 + 0.5 + 0.9) / 3)
+
+    def test_object_results(self) -> None:
+        results = [
+            _ResultObject(1.0, 0.75),
+            _ResultObject(0.0, 0.25),
+        ]
+        hard, soft = compute_score(results)
+        assert hard == pytest.approx(0.5)
+        assert soft == pytest.approx(0.5)
+
+    def test_mixed_dict_and_object_results(self) -> None:
+        results = [
+            {"hard": 1, "soft": 1.0},
+            _ResultObject(0, 0.0),
+        ]
+        hard, soft = compute_score(results)
+        assert hard == pytest.approx(0.5)
+        assert soft == pytest.approx(0.5)
+
+    def test_missing_keys_default_to_zero(self) -> None:
+        results = [
+            {"hard": 1},
+            {},
+        ]
+        hard, soft = compute_score(results)
+        assert hard == pytest.approx(0.5)
+        assert soft == pytest.approx(0.0)
+
+    def test_single_result(self) -> None:
+        results = [{"hard": 1, "soft": 0.95}]
+        assert compute_score(results) == (1.0, 0.95)
+
+    def test_continuous_hard_values(self) -> None:
+        """Hard may be continuous 0.0-1.0 when using smoothed reward."""
+        results = [
+            {"hard": 0.75, "soft": 0.6},
+            {"hard": 0.25, "soft": 0.4},
+        ]
+        hard, soft = compute_score(results)
+        assert hard == pytest.approx(0.5)
+        assert soft == pytest.approx(0.5)
+
+
+class TestSkillHash:
+    """skill_hash — a short, deterministic hash of skill content."""
+
+    def test_deterministic(self) -> None:
+        assert skill_hash("hello") == skill_hash("hello")
+
+    def test_different_input_produces_different_hash(self) -> None:
+        assert skill_hash("hello") != skill_hash("world")
+
+    def test_empty_string(self) -> None:
+        h = skill_hash("")
+        assert isinstance(h, str)
+        assert len(h) == 16
+
+    def test_output_length(self) -> None:
+        h = skill_hash("some skill content here")
+        assert len(h) == 16
+
+    def test_hex_characters(self) -> None:
+        h = skill_hash("any content")
+        assert all(c in "0123456789abcdef" for c in h)
+
+    def test_unicode_content(self) -> None:
+        """Non-ASCII content must hash deterministically to 16 chars."""
+        # Escapes pin the exact code points (U+00E9, U+2615) regardless of
+        # how an editor normalizes or re-encodes this file.
+        content = "caf\u00e9 \u2615"
+        h1 = skill_hash(content)
+        h2 = skill_hash(content)
+        assert h1 == h2
+        assert len(h1) == 16
+
+    def test_multiline_content(self) -> None:
+        content = "line1\nline2\nline3"
+        h = skill_hash(content)
+        assert len(h) == 16
+        assert isinstance(h, str)
diff --git a/tests/test_types.py b/tests/test_types.py
new file mode 100644
index 0000000..f39c8f6
--- /dev/null
+++ b/tests/test_types.py
@@ -0,0 +1,249 @@
+"""Tests for skillopt.types — Edit and Patch dataclass serialization."""
+from __future__ import annotations
+
+import pytest
+
+from skillopt.types import Edit, Patch
+
+
+# ── Edit ────────────────────────────────────────────────────────────────────
+
+
+class TestEditCreation:
+    """Edit dataclass construction."""
+
+    def test_minimal_edit(self) -> None:
+        e = Edit(op="append")
+        assert e.op == "append"
+        assert e.content == ""
+        assert e.target == ""
+        assert e.support_count is None
+        assert e.source_type is None
+        assert e.merge_level is None
+        assert e.update_origin == ""
+        assert e.update_target == ""
+
+    def test_full_edit(self) -> None:
+        e = Edit(
+            op="replace",
+            content="new content",
+            target="old content",
+            support_count=5,
+            source_type="failure",
+            merge_level=2,
+            update_origin="reflect",
+            update_target="skill",
+        )
+        assert e.op == "replace"
+        assert e.content == "new content"
+        assert e.target == "old content"
+        assert e.support_count == 5
+        assert e.source_type == "failure"
+        assert e.merge_level == 2
+        assert e.update_origin == "reflect"
+        assert e.update_target == "skill"
+
+    def test_insert_after_op(self) -> None:
+        e = Edit(op="insert_after", content="insertion", target="anchor")
+        assert e.op == "insert_after"
+        assert e.content == "insertion"
+        assert e.target == "anchor"
+
+    def test_delete_op(self) -> None:
+        e = Edit(op="delete", target="thing_to_remove")
+        assert e.op == "delete"
+        assert e.target == "thing_to_remove"
+
+
+class TestEditRoundTrip:
+    """Edit.to_dict() / Edit.from_dict() round-trip."""
+
+    def test_round_trip_minimal(self) -> None:
+        e = Edit(op="append")
+        d = e.to_dict()
+        restored = Edit.from_dict(d)
+        assert restored == e
+
+    def test_round_trip_full(self) -> None:
+        e = Edit(
+            op="replace",
+            content="new content",
+            target="old content",
+            support_count=3,
+            source_type="success",
+            merge_level=1,
+            update_origin="meta_reflect",
+            update_target="system_prompt",
+        )
+        d = e.to_dict()
+        restored = Edit.from_dict(d)
+        assert restored == e
+
+    def test_round_trip_delete_without_content(self) -> None:
+        e = Edit(op="delete", target="obsolete_line")
+        d = e.to_dict()
+        restored = Edit.from_dict(d)
+        assert restored == e
+
+    def test_optional_fields_omitted_when_default(self) -> None:
+        e = Edit(op="append")
+        d = e.to_dict()
+        assert d == {"op": "append", "content": ""}
+        # support_count, source_type, etc. should be absent
+        assert "support_count" not in d
+        assert "source_type" not in d
+        assert "merge_level" not in d
+        assert "target" not in d
+        assert "update_origin" not in d
+        assert "update_target" not in d
+
+    def test_from_dict_with_defaults(self) -> None:
+        d = {"op": "replace", "content": "abc"}
+        e = Edit.from_dict(d)
+        assert e.op == "replace"
+        assert e.content == "abc"
+        assert e.target == ""
+        assert e.support_count is None
+        assert e.source_type is None
+
+    def test_from_dict_with_extra_keys(self) -> None:
+        """Extra keys in dict should be ignored."""
+        d = {"op": "append", "content": "", "unknown_field": 42}
+        e = Edit.from_dict(d)
+        assert e.op == "append"
+        assert not hasattr(e, "unknown_field")
+
+
+class TestEditEdgeCases:
+    """Edge cases around Edit."""
+
+    def test_support_count_zero(self) -> None:
+        """0 is a valid support_count and should be serialized."""
+        e = Edit(op="append", support_count=0)
+        d = e.to_dict()
+        assert d["support_count"] == 0
+        restored = Edit.from_dict(d)
+        assert restored.support_count == 0
+
+    def test_merge_level_zero(self) -> None:
+        e = Edit(op="replace", merge_level=0)
+        d = e.to_dict()
+        assert d["merge_level"] == 0
+        restored = Edit.from_dict(d)
+        assert restored.merge_level == 0
+
+    def test_empty_target_stays_empty(self) -> None:
+        e = Edit(op="append", target="")
+        d = e.to_dict()
+        assert "target" not in d
+
+
+# ── Patch ───────────────────────────────────────────────────────────────────
+
+
+class TestPatchCreation:
+    """Patch dataclass construction."""
+
+    def test_empty_patch(self) -> None:
+        p = Patch()
+        assert p.edits == []
+        assert p.reasoning == ""
+        assert p.ranking_details is None
+
+    def test_patch_with_edits(self) -> None:
+        edits = [
+            Edit(op="append", content="step 1"),
+            Edit(op="append", content="step 2"),
+        ]
+        p = Patch(edits=edits, reasoning="Added two steps")
+        assert len(p.edits) == 2
+        assert p.reasoning == "Added two steps"
+
+    def test_patch_with_ranking_details(self) -> None:
+        p = Patch(ranking_details={"score": 0.95, "rank": 1})
+        assert p.ranking_details == {"score": 0.95, "rank": 1}
+
+
+class TestPatchRoundTrip:
+    """Patch.to_dict() / Patch.from_dict() round-trip."""
+
+    def test_round_trip_empty(self) -> None:
+        p = Patch()
+        d = p.to_dict()
+        restored = Patch.from_dict(d)
+        assert restored.edits == []
+        assert restored.reasoning == ""
+        assert restored.ranking_details is None
+
+    def test_round_trip_with_edits(self) -> None:
+        edits = [
+            Edit(op="insert_after", content="new step", target="existing step"),
+            Edit(op="replace", content="updated", target="old"),
+        ]
+        p = Patch(edits=edits, reasoning="Batch update")
+        d = p.to_dict()
+        restored = Patch.from_dict(d)
+        assert len(restored.edits) == 2
+        for original, restored_edit in zip(p.edits, restored.edits):
+            assert isinstance(restored_edit, Edit)
+            assert original == restored_edit
+        assert restored.reasoning == "Batch update"
+        assert restored.ranking_details is None
+
+    def test_round_trip_with_ranking_details(self) -> None:
+        details = {"strategy": "rouge", "scores": [0.9, 0.8, 0.7]}
+        p = Patch(
+            edits=[Edit(op="append", content="a")],
+            reasoning="selected best",
+            ranking_details=details,
+        )
+        d = p.to_dict()
+        restored = Patch.from_dict(d)
+        assert restored.ranking_details == details
+
+    def test_to_dict_contains_reasoning_and_edits(self) -> None:
+        p = Patch(edits=[Edit(op="append", content="test")], reasoning="reason")
+        d = p.to_dict()
+        assert "reasoning" in d
+        assert "edits" in d
+        assert isinstance(d["edits"], list)
+
+    def test_from_dict_preserves_edit_order(self) -> None:
+        edits = [
+            Edit(op="append", content="first"),
+            Edit(op="insert_after", content="second", target="first"),
+            Edit(op="append", content="third"),
+        ]
+        p = Patch(edits=edits, reasoning="ordered")
+        d = p.to_dict()
+        restored = Patch.from_dict(d)
+        assert restored.edits[0].content == "first"
+        assert restored.edits[1].content == "second"
+        assert restored.edits[2].content == "third"
+
+
+class TestPatchEdgeCases:
+    """Edge cases around Patch."""
+
+    def test_reasoning_empty_string(self) -> None:
+        p = Patch(reasoning="")
+        d = p.to_dict()
+        assert d["reasoning"] == ""
+
+    def test_zero_edits(self) -> None:
+        """Patch with explicitly empty edit list."""
+        p = Patch(edits=[])
+        d = p.to_dict()
+        assert d["edits"] == []
+
+    def test_nested_edit_from_dict_handles_dicts(self) -> None:
+        """from_dict should accept dicts in the 'edits' list."""
+        d = {
+            "reasoning": "test",
+            "edits": [{"op": "append", "content": "hello"}],
+        }
+        p = Patch.from_dict(d)
+        assert len(p.edits) == 1
+        assert isinstance(p.edits[0], Edit)
+        assert p.edits[0].op == "append"
+        assert p.edits[0].content == "hello"