diff --git a/hed/schema/schema_attribute_validator_hed_id.py b/hed/schema/schema_attribute_validator_hed_id.py index fb1c99e3..a55b909e 100644 --- a/hed/schema/schema_attribute_validator_hed_id.py +++ b/hed/schema/schema_attribute_validator_hed_id.py @@ -1,5 +1,4 @@ from hed.schema.hed_cache import get_library_data -from hed.schema.schema_io.df_util import remove_prefix from semantic_version import Version from hed.schema.hed_schema_io import load_schema_version from hed.schema.hed_cache import get_hed_versions @@ -87,13 +86,13 @@ def verify_tag_id(self, hed_schema, tag_entry, attribute_name): if old_id: try: - old_id = int(remove_prefix(old_id, "HED_")) + old_id = int(old_id.removeprefix("HED_")) except ValueError: # Just silently ignore invalid old_id values(this shouldn't happen) pass if new_id: try: - new_id = int(remove_prefix(new_id, "HED_")) + new_id = int(new_id.removeprefix("HED_")) except ValueError: return ErrorHandler.format_error(SchemaAttributeErrors.SCHEMA_HED_ID_INVALID, tag_entry.name, new_id) # Nothing to verify diff --git a/hed/schema/schema_io/df_util.py b/hed/schema/schema_io/df_util.py index aca56c07..d8398f88 100644 --- a/hed/schema/schema_io/df_util.py +++ b/hed/schema/schema_io/df_util.py @@ -233,13 +233,6 @@ def get_library_name_and_id(schema): return name.capitalize(), starting_id -# todo: Replace this once we no longer support < python 3.10 -def remove_prefix(text, prefix): - if text and text.startswith(prefix): - return text[len(prefix) :] - return text - - def calculate_attribute_type(attribute_entry): """Returns the type of this attribute(annotation, object, data) diff --git a/hed/schema/schema_io/hed_id_util.py b/hed/schema/schema_io/hed_id_util.py index b362926b..f85a6bac 100644 --- a/hed/schema/schema_io/hed_id_util.py +++ b/hed/schema/schema_io/hed_id_util.py @@ -9,7 +9,6 @@ from hed.schema.schema_io import schema_util from hed.errors.exceptions import HedFileError from hed.schema.hed_schema_constants import HedKey -from hed.schema.schema_io.df_util import remove_prefix from hed.schema.hed_cache import get_library_data from hed.schema.schema_io import df_constants as constants @@ -67,7 +66,7 @@ def get_all_ids(df): Union[Set, None]: None if this has no HED column, otherwise all unique numbers as a set. """ if constants.hed_id in df.columns: - modified_df = df[constants.hed_id].apply(lambda x: remove_prefix(x, "HED_")) + modified_df = df[constants.hed_id].apply(lambda x: x.removeprefix("HED_") if isinstance(x, str) else x) modified_df = pd.to_numeric(modified_df, errors="coerce").dropna().astype(int) return set(modified_df.unique()) return None @@ -171,7 +170,7 @@ def _verify_hedid_matches(section, df, unused_tag_ids): row_number, row, f"'{label}' has an improperly formatted hedID in dataframe." ) continue - id_value = remove_prefix(df_id, "HED_") + id_value = df_id.removeprefix("HED_") try: id_int = int(id_value) if id_int not in unused_tag_ids: diff --git a/hed/schema/schema_io/schema2df.py b/hed/schema/schema_io/schema2df.py index 56b841aa..a99e7cef 100644 --- a/hed/schema/schema_io/schema2df.py +++ b/hed/schema/schema_io/schema2df.py @@ -4,7 +4,6 @@ from hed.schema.schema_io.df_util import ( create_empty_dataframes, get_library_name_and_id, - remove_prefix, calculate_attribute_type, ) from hed.schema.schema_io.schema2base import Schema2Base @@ -49,7 +48,7 @@ def _get_object_name_and_id(self, object_name, include_prefix=False): - The full formatted hed_id. """ prefix, obj_id = get_library_name_and_id(self._schema) - name = f"{prefix}{remove_prefix(object_name, 'Hed')}" + name = f"{prefix}{object_name.removeprefix('Hed')}" full_hed_id = self._get_object_id(object_name, obj_id, include_prefix) return name, full_hed_id diff --git a/hed/tools/analysis/sequence_map.py b/hed/tools/analysis/sequence_map.py deleted file mode 100644 index 231b5451..00000000 --- a/hed/tools/analysis/sequence_map.py +++ /dev/null @@ -1,197 +0,0 @@ -"""A map of containing the number of times a particular sequence of values in a column of a columnar file.""" - -import pandas as pd -from hed.tools.util import data_util - - -class SequenceMap: - # TODO: This class is partially implemented. - """A map of unique sequences of column values of a particular length appear in a columnar file. - - Attributes: - - name (str): An optional name of this remap for identification purposes. - - Notes: This mapping converts all columns in the mapping to strings. - The remapping does not support other types of columns. - - """ - - def __init__(self, codes=None, name=""): - """Information for setting up the maps. - - Parameters: - codes (list or None): If None use all codes, otherwise only include listed codes in the map. - name (str): Name associated with this remap (usually a pathname of the events file). - - """ - - self.codes = codes - self.name = name - self.node_counts = {} - self.edges = {} # map of keys to n-element sequences - self.edge_counts = {} # Keeps a running count of the number of times a key appears in the data - - @property - def __str__(self): - """Return a version of this sequence map serialized to a string.""" - node_counts = [f"{value}({str(count)})" for value, count in self.node_counts.items()] - node_str = " ".join(node_counts) - return node_str - # temp_list = [f"{self.name} counts for key [{str(self.key_cols)}]:"] - # for index, row in self.col_map.iterrows(): - # key_hash = get_row_hash(row, self.columns) - # temp_list.append(f"{str(list(row.values))}:\t{self.count_dict[key_hash]}") - # return "\n".join(temp_list) - - def dot_str(self, group_spec=None): - """Produce a DOT string representing this sequence map. - - Parameters: - group_spec (dict or None): Specification for grouping nodes. If None, defaults to empty dict. - - Returns: - str: DOT format string representation of the sequence map. - """ - if group_spec is None: - group_spec = {} - base = "digraph g { \n" - if self.codes: - node_list = [f"{node};" for node in self.codes if node not in self.node_counts] - if node_list: - base = base + 'subgraph cluster_unused {\n bgcolor="#cAcAcA";\n' + "\n".join(node_list) + "\n}\n" - if group_spec: - for group, spec in group_spec.items(): - group_list = [f"{node};" for node in self.node_counts if node in spec["nodes"]] - if group_list: - spec_color = spec["color"] - if spec_color[0] == "#": - spec_color = f'"{spec_color}"' - base = ( - base - + "subgraph cluster_" - + group - + "{\n" - + f"bgcolor={spec_color};\n" - + "\n".join(group_list) - + "\n}\n" - ) - edge_list = self.get_edge_list(sort=True) - - dot_str = base + "\n".join(edge_list) + "}\n" - return dot_str - - def edge_to_str(self, key): - """Convert a graph edge to a DOT string. - - Parameters: - key(str): Hashcode string representing a graph edge. - - """ - value = self.edges.get(key, []) - if value: - return f"{value[0]} -> {value[1]} " - else: - return "" - - def get_edge_list(self, sort=True): - """Return a DOT format edge list with the option of sorting by edge counts. - - Parameters: - sort (bool): If True (the default), the edge list is sorted by edge counts. - - Returns: - list: list of DOT strings representing the edges labeled by counts. - - """ - - df = pd.DataFrame(list(self.edge_counts.items()), columns=["Key", "Counts"]) - if sort: - df = df.sort_values(by="Counts", ascending=False) - edge_list = [ - f"{self.edge_to_str(row['Key'])} [label={str(self.edge_counts[row['Key']])}];" for index, row in df.iterrows() - ] - return edge_list - - def filter_edges(self): - pass - - def update(self, data): - """Update the existing map with information from data. - - Parameters: - data (Series): DataFrame or filename of an events file or event map. - allow_missing (bool): If True allow missing keys and add as n/a columns. - - Raises: - HedFileError: If there are missing keys and allow_missing is False. - - """ - filtered = self.prep(data) - if self.codes: - mask = filtered.isin(self.codes) - filtered = filtered[mask] - for index, value in filtered.items(): - if value not in self.node_counts: - self.node_counts[value] = 1 - else: - self.node_counts[value] = self.node_counts[value] + 1 - if index + 1 >= len(filtered): - break - key_list = filtered[index : index + 2].tolist() - key = data_util.get_key_hash(key_list) - if key in self.edges: - self.edge_counts[key] = self.edge_counts[key] + 1 - else: - self.edges[key] = key_list - self.edge_counts[key] = 1 - - # def update(self, data): - # """ Update the existing map with information from data. - # - # Parameters: - # data (Series): DataFrame or filename of an events file or event map. - # allow_missing (bool): If true allow missing keys and add as n/a columns. - # - # :raises HedFileError: - # - If there are missing keys and allow_missing is False. - # - # """ - # filtered = self.prep(data) - # if self.codes: - # mask = filtered.isin(self.codes) - # filtered = filtered[mask] - # for index, value in filtered.items(): - # if value not in self.node_counts: - # self.node_counts[value] = 1 - # else: - # self.node_counts[value] = self.node_counts[value] + 1 - # if index + 1 >= len(filtered): - # break - # key_list = filtered[index:index + 2].tolist() - # key = get_key_hash(key_list) - # if key in self.edges: - # self.edge_counts[key] = self.edge_counts[key] + 1 - # else: - # self.edges[key] = key_list - # self.edge_counts[key] = 1 - - @staticmethod - def prep(data): - """Remove quotes from the specified columns and convert to string. - - Parameters: - data (Series): Dataframe to process by removing quotes. - - Returns: - Series - - Notes: - - Replacement is done in place. - """ - - filtered = data.astype(str) - filtered.fillna("n/a").astype(str) - filtered = filtered.str.replace('"', "") - filtered = filtered.str.replace("'", "") - return filtered diff --git a/spec_tests/hed-examples b/spec_tests/hed-examples index 336a4fcc..a788cd7c 160000 --- a/spec_tests/hed-examples +++ b/spec_tests/hed-examples @@ -1 +1 @@ -Subproject commit 336a4fccaec59b4924c8f37c50967fa480538ccf +Subproject commit a788cd7ce7c88e83119009af6a0b242df96c174f diff --git a/tests/schema/test_hed_id_util.py b/tests/schema/test_hed_id_util.py index 5b9f5236..087b451f 100644 --- a/tests/schema/test_hed_id_util.py +++ b/tests/schema/test_hed_id_util.py @@ -111,6 +111,32 @@ def test_get_all_ids_mixed_invalid(self): result = get_all_ids(df) self.assertEqual(result, {1, 3}) # Should ignore non-numeric and malformed IDs + def test_get_all_ids_with_nan(self): + # Test when hedId column contains NaN values (pandas null) + df = pd.DataFrame({"hedId": ["HED_0000001", pd.NA, "HED_0000003", None]}) + result = get_all_ids(df) + self.assertEqual(result, {1, 3}) # Should handle NaN/None gracefully + + def test_get_all_ids_with_numeric_types(self): + # Test when hedId column contains numeric types (edge case) + # pd.to_numeric will convert these numeric values as-is + df = pd.DataFrame({"hedId": ["HED_0000001", 123, "HED_0000003", 456]}) + result = get_all_ids(df) + # Should extract from valid string entries with HED_ prefix AND numeric values + self.assertEqual(result, {1, 3, 123, 456}) + + def test_get_all_ids_empty_strings(self): + # Test when hedId column contains empty strings + df = pd.DataFrame({"hedId": ["HED_0000001", "", "HED_0000003", ""]}) + result = get_all_ids(df) + self.assertEqual(result, {1, 3}) + + def test_get_all_ids_with_none(self): + # Test when hedId column contains None values + df = pd.DataFrame({"hedId": ["HED_0000001", None, "HED_0000003", None]}) + result = get_all_ids(df) + self.assertEqual(result, {1, 3}) + def test_assign_hed_ids_section(self): df = pd.DataFrame( { diff --git a/tests/tools/analysis/test_sequence_map.py b/tests/tools/analysis/test_sequence_map.py deleted file mode 100644 index 771891a8..00000000 --- a/tests/tools/analysis/test_sequence_map.py +++ /dev/null @@ -1,63 +0,0 @@ -import unittest -import os -from hed.tools.analysis.sequence_map import SequenceMap - - -class Test(unittest.TestCase): - @classmethod - def setUpClass(cls): - # curation_base_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../data/other_tests') - base_path = "" - cls.events_path = os.path.realpath( - base_path + "/sub-01/ses-01/eeg/sub-01_ses-01_task-DriveRandomSound_run-1_events.tsv" - ) - - def test_constructor(self): - codes1 = [ - "1111", - "1112", - "1121", - "1122", - "1131", - "1132", - "1141", - "1142", - "1311", - "1312", - "1321", - "1322", - "4210", - "4220", - "4230", - "4311", - "4312", - ] - - smap1 = SequenceMap(codes=codes1) - self.assertIsInstance(smap1, SequenceMap) - # df = get_new_dataframe(self.events_path) - # data = df['value'] - # smap1.update(data) - # #print(f"{smap1.__str__}") - # print("to here") - - def test_update(self): - # codes1 = ['1111', '1121', '1131', '1141', '1311', '1321', - # '4210', '4220', '4230', '4311'] - codes1 = ["1111", "1121", "1131", "1141", "1311", "4311"] - # codes1 = ['1111', '1121', '1131', '1141', '1311'] - smap1 = SequenceMap(codes=codes1) - self.assertIsInstance(smap1, SequenceMap) - # df = get_new_dataframe(self.events_path) - # data = df['value'] - # smap1.update(data) - # print(f"{smap1.dot_str()}") - # group_spec = {"stimulus": {"color": "#FFAAAA", "nodes": ["1111", "1121", "1131", "1141", "1311"]}} - # print(f"{smap1.dot_str(group_spec=group_spec)}") - - def test_str(self): - pass - - -if __name__ == "__main__": - unittest.main()