Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions requirements-runners.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
ARS_Test_Runner==0.2.4
# benchmarks-runner==0.1.3
# ui-test-runner==0.0.2
graph-validation-test-runners==0.1.5
# we need to manually pin the right reasoner-validator version
reasoner-validator==4.2.5
# graph-validation-test-runners==0.1.5
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ pydantic==2.7.1
setproctitle==1.3.3
slack_sdk==3.27.2
tqdm==4.66.4
translator-testing-model==0.3.2
translator-testing-model==0.4.1
reasoner-validator==4.2.5
8 changes: 6 additions & 2 deletions test_harness/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,18 @@
from typing import List, Union, Dict
import zipfile

from translator_testing_model.datamodel.pydanticmodel import TestCase, TestSuite
from translator_testing_model.datamodel.pydanticmodel import (
TestCase,
PathfinderTestCase,
TestSuite,
)


def download_tests(
suite: Union[str, List[str]],
url: Path,
logger: logging.Logger,
) -> Dict[str, TestCase]:
) -> Dict[str, Union[TestCase, PathfinderTestCase]]:
"""Download tests from specified location."""
assert Path(url).suffix == ".zip"
logger.info(f"Downloading tests from {url}...")
Expand Down
44 changes: 44 additions & 0 deletions test_harness/pathfinder_test_runner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from typing import Dict, Union, List


async def pathfinder_pass_fail_analysis(
report: Dict[str, any],
agent: str,
message: Dict[str, any],
path_nodes: List[List[str]],
minimum_required_path_nodes: int,
) -> Dict[str, any]:
found_path_nodes = set()
unmatched_paths = set()
for analysis in message["results"][0]["analyses"]:
for path_bindings in analysis["path_bindings"].values():
for path_binding in path_bindings:
path_id = path_binding["id"]
matching_path_nodes = set()
for edge_id in message["auxiliary_graphs"][path_id]["edges"]:
edge = message["knowledge_graph"]["edges"][edge_id]
for node_curies in path_nodes:
unhit_node = True
for curie in node_curies:
if curie in matching_path_nodes:
unhit_node = False
if unhit_node:
if edge["subject"] in node_curies:
matching_path_nodes.add(edge["subject"])
if edge["object"] in node_curies:
matching_path_nodes.add(edge["object"])
if len(matching_path_nodes) >= minimum_required_path_nodes:
found_path_nodes.add(",".join(matching_path_nodes))
elif len(matching_path_nodes) > 0:
unmatched_paths.add(",".join(matching_path_nodes))

if len(found_path_nodes) > 0:
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A couple things here:

  • I know we talked about including the expected intermediate nodes in the test labels, and I think it's right that they aren't included there, but it would be nice if they showed up somewhere, probably in the report. Otherwise, the test doesn't show what they are anywhere. Creative queries are easier because the title includes the expected answer nodes.
  • In the same vein, maybe also including the minimum number of required path nodes would be useful
  • I'm not sure it helps anyone to include the expected_path_nodes if someone passed the test? Let's rethink what all should be included in the report so that someone could get just about all the information they need just by looking at the test output.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, that's true. So I guess the best thing to do would be to include the minimum required path node number, as well as the expected intermediate nodes, regardless of whether or not the test is passed?

I do think providing the expected_path_nodes is useful even if the test is passed. That way the test output makes clear exactly how the test is passed, by showing what pairs are actually found. Maybe the name should be changed? expected_paths_found?

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This all sounds good. I would maybe even include the expected_nodes_found(note the updated name) for failed tests, to show how close/far they were from passing.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of including the minimum number and expected intermediate nodes for every ARA, you could include it as a higher field in the whole report. Here's an example of the full report output:

{
    "pks": {
        "parent_pk": "cc8f8ac6-4582-4959-b96a-bb44e92fbf0e",
        "improving-agent": "0aeadcb7-3407-456c-8d2a-721fba2f8860",
        "molepro": "0b582315-48ac-4092-b666-d9bf2a57ddf6",
        "multiomics-drugapprovals": "19bc999b-1041-4b78-95c3-c18528d25fc9",
        "connections-hypothesis": "212f904b-ae03-46ea-9779-6f69aa5ac8d3",
        "unsecret-agent": "27f4f2b8-11bd-4d24-8aa7-cd73959438f5",
        "cohd": "2c53bc0f-a5e5-41ac-b278-3e2d765b3ee5",
        "genetics-data-provider": "301d50d0-587f-4c6f-ac55-4682501f0c37",
        "aragorn": "3e0e9faf-4a2a-479e-9e6f-ef7327164c98",
        "biothings-explorer": "aee66e2d-9c88-4f42-80a2-c11a30f4f651",
        "automat-cam-kp": "bc727dcf-9487-40a8-8b33-65805094a051",
        "text-mining-provider-targeted": "c3abf709-a8d3-4dc6-895e-24c3715ec2f4",
        "arax": "dc64651d-08f2-4f6a-90a1-83f5127bca42",
        "multiomics-clinicaltrials": "ddd4c246-424a-4bb7-a4c8-e0bc38c9578b",
        "openpredict": "f0e177c2-b878-4b39-be50-6ea57cb61595",
        "ars": "9c6fb07c-c9cb-49c8-8333-016193b66bb4"
    },
    "result": {
        "improving-agent": {
            "trapi_validation": "NA",
            "status": "FAILED",
            "message": "Status code: 400"
        },
        "molepro": {
            "trapi_validation": "NA",
            "status": "FAILED",
            "message": "Status code: 400"
        },
        "multiomics-drugapprovals": {
            "trapi_validation": "NA",
            "status": "FAILED",
            "message": "Status code: 400"
        },
        "connections-hypothesis": {
            "trapi_validation": "NA",
            "status": "FAILED",
            "message": "Status code: 400"
        },
        "unsecret-agent": {
            "trapi_validation": "NA",
            "status": "FAILED",
            "message": "Status code: 503"
        },
        "cohd": {
            "trapi_validation": "NA",
            "status": "FAILED",
            "message": "Status code: 400"
        },
        "genetics-data-provider": {
            "trapi_validation": "NA",
            "status": "FAILED",
            "message": "Status code: 400"
        },
        "aragorn": {
            "trapi_validation": "NA",
            "status": "FAILED"
        },
        "biothings-explorer": {
            "trapi_validation": "NA",
            "status": "FAILED",
            "message": "Status code: 400"
        },
        "automat-cam-kp": {
            "trapi_validation": "NA",
            "status": "FAILED",
            "message": "Status code: 422"
        },
        "text-mining-provider-targeted": {
            "trapi_validation": "NA",
            "status": "FAILED",
            "message": "Status code: 400"
        },
        "arax": {
            "trapi_validation": "NA",
            "status": "PASSED",
            "expected_path_nodes": "NCBIGene:3815,CL:0000097"
        },
        "multiomics-clinicaltrials": {
            "trapi_validation": "NA",
            "status": "FAILED",
            "message": "Status code: 400"
        },
        "openpredict": {
            "trapi_validation": "NA",
            "status": "FAILED",
            "message": "Status code: 502"
        },
        "ars": {
            "trapi_validation": "NA",
            "status": "PASSED",
            "expected_path_nodes": "NCBIGene:3815,CL:0000097"
        }
    }
}

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've added both of these outside of the individual agent reports, and also added more details to the agent reports as well.

report[agent]["status"] = "PASSED"
report[agent]["expected_nodes_found"] = "; ".join(found_path_nodes)
elif len(unmatched_paths) > 0:
report[agent]["status"] = "FAILED"
report[agent]["expected_nodes_found"] = "; ".join(unmatched_paths)
else:
report[agent]["status"] = "FAILED"

return report
78 changes: 55 additions & 23 deletions test_harness/reporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,14 @@
import httpx
import logging
import os
from typing import List
from typing import List, Union

from translator_testing_model.datamodel.pydanticmodel import TestCase, TestAsset
from translator_testing_model.datamodel.pydanticmodel import (
TestCase,
PathfinderTestCase,
TestAsset,
PathfinderTestAsset,
)


class Reporter:
Expand Down Expand Up @@ -63,25 +68,36 @@ async def create_test_run(self, test_env, suite_name):
self.test_run_id = res_json["id"]
return self.test_run_id

async def create_test(self, test: TestCase, asset: TestAsset):
async def create_test(
self,
test: Union[TestCase, PathfinderTestCase],
asset: Union[TestAsset, PathfinderTestAsset],
):
"""Create a test in the IR."""
name = asset.name if asset.name else asset.description
res = await self.authenticated_client.post(
url=f"{self.base_path}/api/reporting/v1/test-runs/{self.test_run_id}/tests",
json={
"name": name,
"className": test.name,
"methodName": asset.name,
"startedAt": datetime.now().astimezone().isoformat(),
"labels": [
{
"key": "TestCase",
"value": test.id,
},
{
"key": "TestAsset",
"value": asset.id,
},
test_json = {
"name": name,
"className": test.name,
"methodName": asset.name,
"startedAt": datetime.now().astimezone().isoformat(),
"labels": [
{
"key": "TestCase",
"value": test.id,
},
{
"key": "TestAsset",
"value": asset.id,
},
{
"key": "ExpectedOutput",
"value": asset.expected_output,
},
],
}
if isinstance(test, TestCase) and isinstance(asset, TestAsset):
test_json["labels"].extend(
[
{
"key": "InputCurie",
"value": asset.input_id,
Expand All @@ -90,12 +106,28 @@ async def create_test(self, test: TestCase, asset: TestAsset):
"key": "OutputCurie",
"value": asset.output_id,
},
]
)
elif isinstance(test, PathfinderTestCase) and isinstance(
asset, PathfinderTestAsset
):
test_json["labels"].extend(
[
{
"key": "SourceInputCurie",
"value": asset.source_input_id,
},
{
"key": "ExpectedOutput",
"value": asset.expected_output,
"key": "TargetInputCurie",
"value": asset.target_input_id,
},
],
},
]
)
else:
raise Exception
res = await self.authenticated_client.post(
url=f"{self.base_path}/api/reporting/v1/test-runs/{self.test_run_id}/tests",
json=test_json,
)
res.raise_for_status()
res_json = res.json()
Expand Down
11 changes: 8 additions & 3 deletions test_harness/result_collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,12 @@

import logging
from typing import Union
from translator_testing_model.datamodel.pydanticmodel import TestAsset, TestCase
from translator_testing_model.datamodel.pydanticmodel import (
TestAsset,
PathfinderTestAsset,
TestCase,
PathfinderTestCase,
)

from test_harness.utils import get_tag

Expand Down Expand Up @@ -43,8 +48,8 @@ def __init__(self, logger: logging.Logger):

def collect_result(
self,
test: TestCase,
asset: TestAsset,
test: Union[TestCase, PathfinderTestCase],
asset: Union[TestAsset, PathfinderTestAsset],
report: dict,
parent_pk: Union[str, None],
url: str,
Expand Down
105 changes: 70 additions & 35 deletions test_harness/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,26 +5,31 @@
import time
from tqdm import tqdm
import traceback
from typing import Dict
from typing import Dict, Union

from ARS_Test_Runner.semantic_test import pass_fail_analysis
from standards_validation_test_runner import StandardsValidationTest

# from standards_validation_test_runner import StandardsValidationTest

# from benchmarks_runner import run_benchmarks

from translator_testing_model.datamodel.pydanticmodel import TestCase
from translator_testing_model.datamodel.pydanticmodel import (
TestCase,
PathfinderTestCase,
)

from test_harness.runner.query_runner import QueryRunner
from test_harness.reporter import Reporter
from test_harness.slacker import Slacker
from test_harness.result_collector import ResultCollector
from test_harness.utils import get_tag, hash_test_asset
from test_harness.pathfinder_test_runner import pathfinder_pass_fail_analysis


async def run_tests(
reporter: Reporter,
slacker: Slacker,
tests: Dict[str, TestCase],
tests: Dict[str, Union[TestCase, PathfinderTestCase]],
logger: logging.Logger = logging.getLogger(__name__),
args: Dict[str, any] = {},
) -> Dict:
Expand Down Expand Up @@ -71,7 +76,7 @@ async def run_tests(
try:
test_id = await reporter.create_test(test, asset)
test_ids.append(test_id)
except Exception:
except Exception as e:
logger.error(f"Failed to create test: {test.id}")
continue

Expand All @@ -91,9 +96,24 @@ async def run_tests(
"pks": test_query["pks"],
"result": {},
}
if isinstance(test, PathfinderTestCase):
report["test_details"] = {
"minimum_required_path_nodes": asset.minimum_required_path_nodes,
"expected_path_nodes": "; ".join(
[
",".join(
[
normalized_curies[path_node_id]
for path_node_id in path_node.ids
]
)
for path_node in asset.path_nodes
]
),
}
for agent, response in test_query["responses"].items():
report["result"][agent] = {
"trapi_validation": "NA",
# "trapi_validation": "NA",
}
agent_report = report["result"][agent]
try:
Expand All @@ -117,28 +137,28 @@ async def run_tests(
logger.warning(
f"Failed to parse basic response fields from {agent}: {e}"
)
try:
svt = StandardsValidationTest(
test_asset=asset,
environment=test.test_env,
component=agent,
trapi_version=args["trapi_version"],
biolink_version="suppress",
runner_settings="Inferred",
)
results = svt.test_case_processor(
trapi_response=response["response"]
)
agent_report["trapi_validation"] = results[
next(iter(results.keys()))
][agent]["status"]
if agent_report["trapi_validation"] == "FAILED":
agent_report["status"] = "FAILED"
agent_report["message"] = "TRAPI Validation Error"
continue
except Exception as e:
logger.warning(f"Failed to run TRAPI validation with {e}")
agent_report["trapi_validation"] = "ERROR"
# try:
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

# svt = StandardsValidationTest(
# test_asset=asset,
# environment=test.test_env,
# component=agent,
# trapi_version=args["trapi_version"],
# biolink_version="suppress",
# runner_settings="Inferred",
# )
# results = svt.test_case_processor(
# trapi_response=response["response"]
# )
# agent_report["trapi_validation"] = results[
# next(iter(results.keys()))
# ][agent]["status"]
# if agent_report["trapi_validation"] == "FAILED":
# agent_report["status"] = "FAILED"
# agent_report["message"] = "TRAPI Validation Error"
# continue
# except Exception as e:
# logger.warning(f"Failed to run TRAPI validation with {e}")
# agent_report["trapi_validation"] = "ERROR"
try:
if (
response["response"]["message"].get("results") is None
Expand All @@ -147,13 +167,28 @@ async def run_tests(
agent_report["status"] = "DONE"
agent_report["message"] = "No results"
continue
await pass_fail_analysis(
report["result"],
agent,
response["response"]["message"]["results"],
normalized_curies[asset.output_id],
asset.expected_output,
)
if isinstance(test, PathfinderTestCase):
await pathfinder_pass_fail_analysis(
report["result"],
agent,
response["response"]["message"],
[
[
normalized_curies[path_node_id]
for path_node_id in path_node.ids
]
for path_node in asset.path_nodes
],
asset.minimum_required_path_nodes,
)
else:
await pass_fail_analysis(
report["result"],
agent,
response["response"]["message"]["results"],
normalized_curies[asset.output_id],
asset.expected_output,
)
except Exception as e:
logger.error(
f"Failed to run acceptance test analysis on {agent}: {e}"
Expand Down
Loading