From 1e77e684fcda4450d686ea2251e484ba7499534e Mon Sep 17 00:00:00 2001 From: Matthew Thompson Date: Thu, 21 May 2026 17:35:06 -0400 Subject: [PATCH 1/5] Add common-name rank and higher-rank fallback control Record the rank used for each common-name hit so downstream consumers can distinguish specific names from higher-rank fallbacks. Preserve the existing fallback behavior by default while allowing callers to opt out. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/taxonopy/cli.py | 18 +- src/taxonopy/config.py | 6 + src/taxonopy/resolve_common_names.py | 191 +++++++++++++++----- tests/test_resolve_common_names.py | 251 ++++++++++++++++++++++++++- 4 files changed, 409 insertions(+), 57 deletions(-) diff --git a/src/taxonopy/cli.py b/src/taxonopy/cli.py index f582a8c..ee4a690 100644 --- a/src/taxonopy/cli.py +++ b/src/taxonopy/cli.py @@ -135,6 +135,18 @@ def create_parser() -> argparse.ArgumentParser: parser_common = subparsers.add_parser("common-names", help="Merge vernacular names (post-process) into resolved outputs") parser_common.add_argument("--resolved-dir", dest="annotation_dir", required=True,help="Directory containing your *.resolved.parquet files") parser_common.add_argument("--output-dir", required=True, help="Directory to write annotated .parquet files") + parser_common.add_argument( + "--higher-rank-fallback", + dest="higher_rank_fallback", + action=argparse.BooleanOptionalAction, + default=True, + help=( + "When set (default), climb species->genus->family->...->kingdom until " + "a vernacular is found. With --no-higher-rank-fallback, query the GBIF " + "VernacularName table only at the finest non-null rank in the row's " + "lineage; no climbing." + ), + ) return parser @@ -431,7 +443,11 @@ def main(args: Optional[List[str]] = None) -> int: count = clear_cache() print(f"\nCleared {count} cache entries") return 0 - return cn_main(parsed_args.annotation_dir, parsed_args.output_dir) + return cn_main( + parsed_args.annotation_dir, + parsed_args.output_dir, + higher_rank_fallback=parsed_args.higher_rank_fallback, + ) else: parser.error(f"Unknown command: {parsed_args.command}") return 1 diff --git a/src/taxonopy/config.py b/src/taxonopy/config.py index 1978d57..40f1dc6 100644 --- a/src/taxonopy/config.py +++ b/src/taxonopy/config.py @@ -43,6 +43,9 @@ def __init__(self): # Output settings self.output_format = "parquet" + + # Common-names settings + self.higher_rank_fallback = True def update_from_args(self, args: Dict[str, Any]) -> None: """Update configuration from command-line arguments. @@ -85,6 +88,9 @@ def update_from_args(self, args: Dict[str, Any]) -> None: if 'species_group' in args: self.species_group = args['species_group'] + + if 'higher_rank_fallback' in args: + self.higher_rank_fallback = args['higher_rank_fallback'] def ensure_directories(self) -> None: """Create any required directories.""" diff --git a/src/taxonopy/resolve_common_names.py b/src/taxonopy/resolve_common_names.py index 38e5e6d..7686494 100644 --- a/src/taxonopy/resolve_common_names.py +++ b/src/taxonopy/resolve_common_names.py @@ -268,71 +268,148 @@ def prioritize_vernacular(vernacular_df: pl.DataFrame) -> pl.DataFrame: return result.select(["taxonID", "vernacularName"]) -def apply_hierarchical_common_name_lookup(anno_df: pl.DataFrame, common_lookup: pl.DataFrame) -> pl.DataFrame: +def apply_hierarchical_common_name_lookup( + anno_df: pl.DataFrame, + common_lookup: pl.DataFrame, + higher_rank_fallback: bool = True, +) -> pl.DataFrame: """ - Apply hierarchical common name lookup from most specific to least specific rank. - + Apply common name lookup, with optional higher-rank fallback. + + When ``higher_rank_fallback`` is True (default), iterate ranks from most to + least specific (species -> kingdom) and take the first non-null vernacular. + When False, query only the finest non-null rank present in the row's lineage + ("most-granular-resolved"): no climbing. + + Always emits a ``common_name_rank`` column recording the rank at which the + name was found, or null when no name was available. + :param anno_df: Annotation dataframe with taxonID_* columns :param common_lookup: Common name lookup table with (taxonID, common_name) columns - :return: DataFrame with common_name column populated using hierarchical fallback + :param higher_rank_fallback: Whether to climb to higher ranks on miss + :return: DataFrame with ``common_name`` and ``common_name_rank`` columns populated """ # Define hierarchical order of taxonomic ranks (map class_ to class) rank_columns = [r.rstrip('_') for r in TAXONOMIC_RANKS_BY_SPECIFICITY] - - # Initialize common_name column - result_df = anno_df.with_columns(pl.lit(None).cast(pl.Utf8).alias("common_name")) - - # Apply hierarchical lookup - check each rank in priority order + + # Initialize common_name and common_name_rank columns + result_df = anno_df.with_columns([ + pl.lit(None).cast(pl.Utf8).alias("common_name"), + pl.lit(None).cast(pl.Utf8).alias("common_name_rank"), + ]) + + if higher_rank_fallback: + # Apply hierarchical lookup - check each rank in priority order + for rank in rank_columns: + taxonid_col = f"taxonID_{rank}" + if taxonid_col not in result_df.columns: + continue + + # Join common names for this rank + temp_df = result_df.join( + common_lookup.select([ + "taxonID", + pl.col("common_name").alias(f"temp_common_{rank}") + ]), + left_on=taxonid_col, + right_on="taxonID", + how="left" + ) + + # Record the rank ONLY where this is the first hit (existing + # common_name still null and this rank produced a value). Update the + # rank field BEFORE coalescing the name so the predicate observes + # the pre-update common_name. + result_df = ( + temp_df + .with_columns([ + pl.when( + pl.col("common_name").is_null() + & pl.col(f"temp_common_{rank}").is_not_null() + ) + .then(pl.lit(rank)) + .otherwise(pl.col("common_name_rank")) + .alias("common_name_rank"), + pl.coalesce([ + pl.col("common_name"), + pl.col(f"temp_common_{rank}") + ]).alias("common_name"), + ]) + .drop(f"temp_common_{rank}") + ) + + # Drop taxonID column if it exists (may not exist if no matches) + if "taxonID" in result_df.columns: + result_df = result_df.drop("taxonID") + + return result_df + + # Fallback OFF: most-granular-resolved. + # 1) Per row, identify the finest non-null rank in the lineage. + finest_rank_expr = pl.lit(None).cast(pl.Utf8) + for rank in rank_columns: # species -> kingdom + if rank in result_df.columns: + finest_rank_expr = ( + pl.when(finest_rank_expr.is_null() & pl.col(rank).is_not_null()) + .then(pl.lit(rank)) + .otherwise(finest_rank_expr) + ) + result_df = result_df.with_columns(finest_rank_expr.alias("_finest_rank")) + + # 2) Project that rank's taxonID into a unified column. + finest_tid_expr = pl.lit(None).cast(pl.Utf8) for rank in rank_columns: - taxonid_col = f"taxonID_{rank}" - if taxonid_col not in result_df.columns: - continue - - # Join common names for this rank - temp_df = result_df.join( - common_lookup.select([ - "taxonID", - pl.col("common_name").alias(f"temp_common_{rank}") - ]), - left_on=taxonid_col, - right_on="taxonID", - how="left" - ) - - # Update common_name where it's null and this rank has a name - result_df = ( - temp_df - # pick up the new common_name, drop the temp join field - .with_columns([ - pl.coalesce([ - pl.col("common_name"), - pl.col(f"temp_common_{rank}") - ]).alias("common_name") - ]) - .drop(f"temp_common_{rank}") - ) - - # Drop taxonID column if it exists (may not exist if no matches) - if "taxonID" in result_df.columns: - result_df = result_df.drop("taxonID") - + tcol = f"taxonID_{rank}" + if tcol in result_df.columns: + finest_tid_expr = ( + pl.when(pl.col("_finest_rank") == rank) + .then(pl.col(tcol).cast(pl.Utf8)) + .otherwise(finest_tid_expr) + ) + result_df = result_df.with_columns(finest_tid_expr.alias("_finest_taxonid")) + + # 3) Single join at the finest rank's taxonID; assign common_name and + # common_name_rank in one pass. + result_df = result_df.join( + common_lookup.select([ + pl.col("taxonID").cast(pl.Utf8).alias("_finest_taxonid"), + pl.col("common_name").alias("_cn_finest"), + ]), + on="_finest_taxonid", + how="left", + ) + result_df = result_df.with_columns([ + pl.col("_cn_finest").alias("common_name"), + pl.when(pl.col("_cn_finest").is_not_null()) + .then(pl.col("_finest_rank")) + .otherwise(pl.lit(None).cast(pl.Utf8)) + .alias("common_name_rank"), + ]).drop(["_cn_finest", "_finest_rank", "_finest_taxonid"]) + return result_df -def override_input_common_name(df: pl.DataFrame, common_lookup: pl.DataFrame) -> pl.DataFrame: +def override_input_common_name( + df: pl.DataFrame, + common_lookup: pl.DataFrame, + higher_rank_fallback: bool = True, +) -> pl.DataFrame: """ Override any existing common_name column with backbone-derived common names. - + :param df: DataFrame that may have a pre-existing common_name column :param common_lookup: Common name lookup table with hierarchical fallback applied + :param higher_rank_fallback: Whether to climb to higher ranks on miss :return: DataFrame with backbone-derived common_name (input common_name completely replaced) """ # Drop any existing common_name column and apply the backbone lookup df_clean = df.drop("common_name") if "common_name" in df.columns else df - return apply_hierarchical_common_name_lookup(df_clean, common_lookup) + return apply_hierarchical_common_name_lookup( + df_clean, common_lookup, higher_rank_fallback=higher_rank_fallback + ) -def merge_common_name(anno_df, common_name_df, taxon_df): +def merge_common_name(anno_df, common_name_df, taxon_df, higher_rank_fallback: bool = True): """ This function merges common names with annotation dataframe using hierarchical lookup. Common names are always derived from backbone lookup data for consistent mapping. @@ -393,7 +470,9 @@ def merge_common_name(anno_df, common_name_df, taxon_df): new_anno_df = new_anno_df.rename({temp_taxonid_col: taxonid_col}) # Override any input common_name with backbone data - new_anno_df = override_input_common_name(new_anno_df, common_lookup) + new_anno_df = override_input_common_name( + new_anno_df, common_lookup, higher_rank_fallback=higher_rank_fallback + ) # Clean up temporary taxonID columns (keep original taxonomic classification columns) cleanup_cols = [f"taxonID_{rank}" for rank in rank_columns] @@ -408,7 +487,7 @@ def merge_common_name(anno_df, common_name_df, taxon_df): return new_anno_df -def main(annotation_dir=None, output_dir=None): +def main(annotation_dir=None, output_dir=None, higher_rank_fallback: bool = True): """ Merge common names into resolved output files. """ @@ -428,6 +507,18 @@ def main(annotation_dir=None, output_dir=None): required=True, help="Where to write the new, annotated .parquet files" ) + parser.add_argument( + "--higher-rank-fallback", + dest="higher_rank_fallback", + action=argparse.BooleanOptionalAction, + default=True, + help=( + "When set (default), climb species->genus->family->...->kingdom until " + "a vernacular is found. With --no-higher-rank-fallback, query the GBIF " + "VernacularName table only at the finest non-null rank in the row's " + "lineage; no climbing." + ), + ) args = parser.parse_args() # Update config if cache-dir was provided @@ -438,6 +529,7 @@ def main(annotation_dir=None, output_dir=None): annotation_dir = args.annotation_dir output_dir = args.output_dir + higher_rank_fallback = args.higher_rank_fallback # Use global config's cache_dir from taxonopy.config import config @@ -482,7 +574,10 @@ def main(annotation_dir=None, output_dir=None): anno_df = pl.read_parquet(annotation_path) new_df = merge_taxon_id(anno_df, taxon_df) - new_df = merge_common_name(new_df, common_name_df, taxon_df) + new_df = merge_common_name( + new_df, common_name_df, taxon_df, + higher_rank_fallback=higher_rank_fallback, + ) new_df = new_df.with_columns([ pl.col("scientific_name").cast(pl.Utf8) ]) diff --git a/tests/test_resolve_common_names.py b/tests/test_resolve_common_names.py index da75185..afda943 100644 --- a/tests/test_resolve_common_names.py +++ b/tests/test_resolve_common_names.py @@ -127,10 +127,11 @@ def test_merge_common_name_species_priority(self): }) result = merge_common_name(anno_df, common_name_df, taxon_df) - + assert len(result) == 1 assert result["common_name"].to_list()[0] == "Gray Wolf", "Should prefer species over genus name" - + assert result["common_name_rank"].to_list()[0] == "species", "Rank should record the species-level hit" + def test_merge_common_name_cleans_up_intermediate_columns(self): """Test that all intermediate columns are removed after processing""" anno_df = pl.DataFrame({ @@ -172,7 +173,8 @@ def test_merge_common_name_cleans_up_intermediate_columns(self): # Lock in the exact final column set expected_columns = { - "uuid", "species", "genus", "family", "order", "class", "phylum", "kingdom", "common_name" + "uuid", "species", "genus", "family", "order", "class", "phylum", "kingdom", + "common_name", "common_name_rank" } assert set(result.columns) == expected_columns, f"Final columns should be exactly {expected_columns}" @@ -234,7 +236,8 @@ def test_merge_common_name_overrides_existing_common_name(self): # Verify exact final column set (original input had common_name, should be overridden) expected_columns = { - "uuid", "species", "genus", "family", "order", "class", "phylum", "kingdom", "common_name" + "uuid", "species", "genus", "family", "order", "class", "phylum", "kingdom", + "common_name", "common_name_rank" } assert set(result.columns) == expected_columns, f"Final columns should be exactly {expected_columns}" @@ -325,8 +328,11 @@ def test_merge_common_name_hierarchical_fallback_levels(self, available_rank, ex }) result = merge_common_name(anno_df, common_name_df, taxon_df) - + assert result["common_name"].to_list()[0] == expected_name, f"Should fallback to {available_rank} level name" + assert result["common_name_rank"].to_list()[0] == available_rank, ( + f"common_name_rank should record the {available_rank} fallback level" + ) class TestNormalizeTaxonomicColumns: @@ -421,9 +427,10 @@ def test_hierarchical_fallback_levels(self, available_rank, expected_name): }) result = apply_hierarchical_common_name_lookup(anno_df, common_lookup) - + assert result["common_name"].to_list()[0] == expected_name - + assert result["common_name_rank"].to_list()[0] == available_rank + def test_species_takes_priority_over_genus(self): """Test that species-level names take priority over genus""" anno_df = pl.DataFrame({ @@ -438,8 +445,236 @@ def test_species_takes_priority_over_genus(self): }) result = apply_hierarchical_common_name_lookup(anno_df, common_lookup) - + assert result["common_name"].to_list()[0] == "Gray Wolf" + assert result["common_name_rank"].to_list()[0] == "species" + + +class TestHierarchicalCommonNameLookupNoFallback: + """Unit tests for apply_hierarchical_common_name_lookup with fallback disabled""" + + @pytest.mark.parametrize("finest_rank,expected_name", [ + ("species", "Species Name"), + ("genus", "Genus Name"), + ("family", "Family Name"), + ("order", "Order Name"), + ("class", "Class Name"), + ("phylum", "Phylum Name"), + ("kingdom", "Kingdom Name"), + ]) + def test_no_fallback_returns_name_at_finest_rank(self, finest_rank, expected_name): + """With fallback off, only the finest non-null rank's vernacular is returned.""" + # Build a lineage that is non-null only at finest_rank and any higher (less-specific) ranks. + rank_order = ["species", "genus", "family", "order", "class", "phylum", "kingdom"] + finest_idx = rank_order.index(finest_rank) + lineage = {r: [None] for r in rank_order} + for r in rank_order[finest_idx:]: + lineage[r] = [f"Test {r}"] + + taxonid_cols = {f"taxonID_{r}": [None] for r in rank_order} + taxonid_cols[f"taxonID_{finest_rank}"] = [100] + + anno_df = pl.DataFrame({"uuid": ["test1"], **lineage, **taxonid_cols}) + + common_lookup = pl.DataFrame({ + "taxonID": [100], + "common_name": [expected_name], + }) + + result = apply_hierarchical_common_name_lookup( + anno_df, common_lookup, higher_rank_fallback=False + ) + + assert result["common_name"].to_list()[0] == expected_name + assert result["common_name_rank"].to_list()[0] == finest_rank + + def test_no_fallback_no_climb_when_species_has_no_vernacular(self): + """Species present in lineage but lacking a vernacular -> null, no climb to genus.""" + anno_df = pl.DataFrame({ + "uuid": ["test1"], + "species": ["Canis lupus"], + "genus": ["Canis"], + "family": ["Canidae"], + "order": ["Carnivora"], + "class": ["Mammalia"], + "phylum": ["Chordata"], + "kingdom": ["Animalia"], + "taxonID_species": [100], # no vernacular for 100 + "taxonID_genus": [200], # genus has a vernacular but should NOT be used + "taxonID_family": [None], + "taxonID_order": [None], + "taxonID_class": [None], + "taxonID_phylum": [None], + "taxonID_kingdom": [None], + }) + + common_lookup = pl.DataFrame({ + "taxonID": [200], + "common_name": ["Dog Genus"], + }) + + result = apply_hierarchical_common_name_lookup( + anno_df, common_lookup, higher_rank_fallback=False + ) + + assert result["common_name"].to_list()[0] is None + assert result["common_name_rank"].to_list()[0] is None + + def test_no_fallback_finest_is_genus_when_species_is_null(self): + """Species null but genus populated -> genus is queried only.""" + anno_df = pl.DataFrame({ + "uuid": ["test1"], + "species": [None], + "genus": ["Canis"], + "family": ["Canidae"], + "order": ["Carnivora"], + "class": ["Mammalia"], + "phylum": ["Chordata"], + "kingdom": ["Animalia"], + "taxonID_species": [None], + "taxonID_genus": [200], + "taxonID_family": [500], + "taxonID_order": [None], + "taxonID_class": [None], + "taxonID_phylum": [None], + "taxonID_kingdom": [None], + }) + + common_lookup = pl.DataFrame({ + "taxonID": [200, 500], + "common_name": ["Dog Genus", "Dog Family"], + }) + + result = apply_hierarchical_common_name_lookup( + anno_df, common_lookup, higher_rank_fallback=False + ) + + assert result["common_name"].to_list()[0] == "Dog Genus" + assert result["common_name_rank"].to_list()[0] == "genus" + + def test_no_fallback_all_null_lineage(self): + """All-null lineage row -> both output columns null.""" + anno_df = pl.DataFrame({ + "uuid": ["test1"], + "species": [None], + "genus": [None], + "family": [None], + "order": [None], + "class": [None], + "phylum": [None], + "kingdom": [None], + "taxonID_species": [None], + "taxonID_genus": [None], + "taxonID_family": [None], + "taxonID_order": [None], + "taxonID_class": [None], + "taxonID_phylum": [None], + "taxonID_kingdom": [None], + }) + + common_lookup = pl.DataFrame({ + "taxonID": [100], + "common_name": ["Anything"], + }) + + result = apply_hierarchical_common_name_lookup( + anno_df, common_lookup, higher_rank_fallback=False + ) + + assert result["common_name"].to_list()[0] is None + assert result["common_name_rank"].to_list()[0] is None + + +class TestMergeCommonNameNoFallback: + """Integration tests for merge_common_name with fallback disabled.""" + + def test_no_fallback_schema_includes_rank_column(self): + """common_name_rank is emitted regardless of mode.""" + anno_df = pl.DataFrame({ + "uuid": ["test1"], + "species": ["Canis lupus"], + "genus": ["Canis"], + "family": ["Canidae"], + "order": ["Carnivora"], + "class": ["Mammalia"], + "phylum": ["Chordata"], + "kingdom": ["Animalia"], + "taxonID_species": [100], + "taxonID_genus": [300], + }) + + common_name_df = pl.DataFrame({ + "taxonID": [100], + "vernacularName": ["Gray Wolf"], + }) + + taxon_df = pl.DataFrame({ + "taxonID": [100, 300], + "canonicalName": ["Canis lupus", "Canis"], + "taxonRank": ["species", "genus"], + "kingdom": ["Animalia"] * 2, + "phylum": ["Chordata"] * 2, + "class": ["Mammalia"] * 2, + "order": ["Carnivora"] * 2, + "family": ["Canidae"] * 2, + "genus": ["Canis"] * 2, + }) + + result = merge_common_name( + anno_df, common_name_df, taxon_df, higher_rank_fallback=False + ) + + assert "common_name_rank" in result.columns + assert result["common_name"].to_list()[0] == "Gray Wolf" + assert result["common_name_rank"].to_list()[0] == "species" + + def test_no_fallback_differs_from_fallback_on_genus_only_hit(self): + """When only the genus has a vernacular, ON gets it; OFF returns null.""" + anno_df = pl.DataFrame({ + "uuid": ["test1"], + "species": ["Canis lupus"], + "genus": ["Canis"], + "family": ["Canidae"], + "order": ["Carnivora"], + "class": ["Mammalia"], + "phylum": ["Chordata"], + "kingdom": ["Animalia"], + "taxonID_species": [100], + "taxonID_genus": [300], + }) + + # Only genus has a vernacular; species (taxonID 100) does not. + common_name_df = pl.DataFrame({ + "taxonID": [300], + "vernacularName": ["Dog Genus"], + }) + + taxon_df = pl.DataFrame({ + "taxonID": [100, 300], + "canonicalName": ["Canis lupus", "Canis"], + "taxonRank": ["species", "genus"], + "kingdom": ["Animalia"] * 2, + "phylum": ["Chordata"] * 2, + "class": ["Mammalia"] * 2, + "order": ["Carnivora"] * 2, + "family": ["Canidae"] * 2, + "genus": ["Canis"] * 2, + }) + + on_result = merge_common_name( + anno_df, common_name_df, taxon_df, higher_rank_fallback=True + ) + off_result = merge_common_name( + anno_df, common_name_df, taxon_df, higher_rank_fallback=False + ) + + # ON: climbs to genus + assert on_result["common_name"].to_list()[0] == "Dog Genus" + assert on_result["common_name_rank"].to_list()[0] == "genus" + + # OFF: species is the finest non-null rank; no climb -> null + assert off_result["common_name"].to_list()[0] is None + assert off_result["common_name_rank"].to_list()[0] is None class TestOverrideInputCommonName: From 7b46ecfad1c78ab5559d1da84ab727a5b490fa2b Mon Sep 17 00:00:00 2001 From: Matthew Thompson Date: Thu, 21 May 2026 17:35:21 -0400 Subject: [PATCH 2/5] Document common-name rank and fallback control Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/user-guide/quick-reference.md | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/docs/user-guide/quick-reference.md b/docs/user-guide/quick-reference.md index a3f86c6..c34e6d9 100644 --- a/docs/user-guide/quick-reference.md +++ b/docs/user-guide/quick-reference.md @@ -88,19 +88,26 @@ taxonopy common-names \ --output-dir examples/resolved/common ``` -This command uses GBIF Backbone data only and applies deterministic fallback: species to kingdom, with English names preferred at each rank. It also writes a `taxonopy_common_names_manifest.json` to the output directory. - -_**Sample common-name output (`examples/resolved/common/sample.resolved.parquet`)**; the last two rows (both Laelia rosea) fall back to family-level common names—none available at species or genus rank._ +- **Common Name Data Source**: GBIF Backbone data only. +- **Behavior**: The `common-names` command can be set to only retrieve data available for the most specific available taxonomic rank available or to find a relevant common name at a higher rank. + - **Default**: fallback from species to kingdom, with English names preferred at each rank (`--higher-rank-fallback` optionally specified). + - **Strict mode**: no fallback, only the finest non-null rank in the row's lineage is queried, and the column is left empty if no name is found (`--no-higher-rank-fallback`). Useful when you would rather emit no common name than a less-specific one. +- **Output columns**: the `common-names` command adds two columns to the resolved output: + - **`common_name` column**: the vernacular name found, or `null` when no name was available. + - **`common_name_rank` column**: records the rank at which the vernacular was found, or `null` when no name was available. +- **Manifest**: writes `taxonopy_common_names_manifest.json` to the output directory. + +_**Sample common-name output (`examples/resolved/common/sample.resolved.parquet`)**. Note `common_name_rank = family` on the last two rows (both Laelia rosea). No species- or genus-level vernacular was available, so the climb fell back to family._
-| uuid | common_name | kingdom | phylum | class | order | family | genus | species | -| --- | --- | --- | --- | --- | --- | --- | --- | --- | -| bc2a3f9f-c1f9-48df-9b01-d045475b9d5f | Human | Animalia | Chordata | Mammalia | Primates | Hominidae | Homo | Homo sapiens | -| 21ed76d8-9a3b-406e-a1a3-ef244422bf8e | Eastern White Oak | Plantae | Tracheophyta | Magnoliopsida | Fagales | Fagaceae | Quercus | Quercus alba | -| 4d166a61-b6e5-4709-91ba-b623111014e9 | Drone-Bee | Animalia | Arthropoda | Insecta | Hymenoptera | Apidae | Apis | Apis mellifera | -| 85b96dc2-70ab-446e-afb5-6a4b92b0a450 | Fly Agaric | Fungi | Basidiomycota | Agaricomycetes | Agaricales | Amanitaceae | Amanita | Amanita muscaria | -| 38327554-ffbf-4180-b4cf-63c311a26f4e | Underwing, Tiger, Tussock, And Allied Moths | Animalia | Arthropoda | Insecta | Lepidoptera | Erebidae | Laelia | Laelia rosea | -| 8f688a17-1f7a-42b2-b3dc-bd4c8fc0eee3 | Orchid | Plantae | Tracheophyta | Liliopsida | Asparagales | Orchidaceae | Laelia | Laelia rosea | +| uuid | common_name | common_name_rank | kingdom | phylum | class | order | family | genus | species | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| bc2a3f9f-c1f9-48df-9b01-d045475b9d5f | Human | species | Animalia | Chordata | Mammalia | Primates | Hominidae | Homo | Homo sapiens | +| 21ed76d8-9a3b-406e-a1a3-ef244422bf8e | Eastern White Oak | species | Plantae | Tracheophyta | Magnoliopsida | Fagales | Fagaceae | Quercus | Quercus alba | +| 4d166a61-b6e5-4709-91ba-b623111014e9 | Drone-Bee | species | Animalia | Arthropoda | Insecta | Hymenoptera | Apidae | Apis | Apis mellifera | +| 85b96dc2-70ab-446e-afb5-6a4b92b0a450 | Fly Agaric | species | Fungi | Basidiomycota | Agaricomycetes | Agaricales | Amanitaceae | Amanita | Amanita muscaria | +| 38327554-ffbf-4180-b4cf-63c311a26f4e | Underwing, Tiger, Tussock, And Allied Moths | family | Animalia | Arthropoda | Insecta | Lepidoptera | Erebidae | Laelia | Laelia rosea | +| 8f688a17-1f7a-42b2-b3dc-bd4c8fc0eee3 | Orchid | family | Plantae | Tracheophyta | Liliopsida | Asparagales | Orchidaceae | Laelia | Laelia rosea |
From f69c4e9b168a5aa9637c3d724458abbb6eb7ac15 Mon Sep 17 00:00:00 2001 From: Matthew Thompson Date: Thu, 21 May 2026 17:35:27 -0400 Subject: [PATCH 3/5] Record common-name rank and fallback behavior in the agent guide Co-Authored-By: Claude Opus 4.7 (1M context) --- AGENTS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/AGENTS.md b/AGENTS.md index 2fae568..fe4fc95 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -62,6 +62,8 @@ taxonopy common-names \ --output-dir out_test_cn ``` - Runs `resolve_common_names.py`; expect long runtimes and large temporary files under the configured cache directory. +- Output adds two columns: `common_name` and `common_name_rank` (the taxonomic rank at which the vernacular was found, or null when none was available). +- The default `--higher-rank-fallback` preserves the prior species → kingdom climb. Pass `--no-higher-rank-fallback` to query only the finest non-null rank in the row's lineage and skip climbing on a miss. ### Cache Management - Cache default root: `~/.cache/taxonopy`, with command/version/input fingerprints stored as subdirectories (e.g., `resolve_v0.1.0b0_ab12cd34ef56`). `diskcache` manages the store; point `TAXONOPY_CACHE_DIR` (or `--cache-dir`) at the root and let the CLI derive namespaces via `set_cache_namespace`. From 9c7441b7a740a85bfdbebabdd14efd0a8c68d51a Mon Sep 17 00:00:00 2001 From: Matt Thompson <31709066+thompsonmj@users.noreply.github.com> Date: Thu, 21 May 2026 18:18:03 -0400 Subject: [PATCH 4/5] Use config for default setting of `higher_rank_fallback` Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- src/taxonopy/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/taxonopy/cli.py b/src/taxonopy/cli.py index ee4a690..f35d45f 100644 --- a/src/taxonopy/cli.py +++ b/src/taxonopy/cli.py @@ -139,7 +139,7 @@ def create_parser() -> argparse.ArgumentParser: "--higher-rank-fallback", dest="higher_rank_fallback", action=argparse.BooleanOptionalAction, - default=True, + default=config.higher_rank_fallback, help=( "When set (default), climb species->genus->family->...->kingdom until " "a vernacular is found. With --no-higher-rank-fallback, query the GBIF " From f5f9fdaede7748df29a7f185afff0319c5dc327b Mon Sep 17 00:00:00 2001 From: Matt Thompson <31709066+thompsonmj@users.noreply.github.com> Date: Thu, 21 May 2026 18:20:23 -0400 Subject: [PATCH 5/5] Improve wording describing common name command controls Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- docs/user-guide/quick-reference.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/user-guide/quick-reference.md b/docs/user-guide/quick-reference.md index c34e6d9..cefef06 100644 --- a/docs/user-guide/quick-reference.md +++ b/docs/user-guide/quick-reference.md @@ -89,7 +89,7 @@ taxonopy common-names \ ``` - **Common Name Data Source**: GBIF Backbone data only. -- **Behavior**: The `common-names` command can be set to only retrieve data available for the most specific available taxonomic rank available or to find a relevant common name at a higher rank. +- **Behavior**: The `common-names` command can be set to retrieve data only for the most specific available taxonomic rank or to find a relevant common name at a higher rank. - **Default**: fallback from species to kingdom, with English names preferred at each rank (`--higher-rank-fallback` optionally specified). - **Strict mode**: no fallback, only the finest non-null rank in the row's lineage is queried, and the column is left empty if no name is found (`--no-higher-rank-fallback`). Useful when you would rather emit no common name than a less-specific one. - **Output columns**: the `common-names` command adds two columns to the resolved output: