diff --git a/AGENTS.md b/AGENTS.md
index 2fae568..fe4fc95 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -62,6 +62,8 @@ taxonopy common-names \
--output-dir out_test_cn
```
- Runs `resolve_common_names.py`; expect long runtimes and large temporary files under the configured cache directory.
+- Output adds two columns: `common_name` and `common_name_rank` (the taxonomic rank at which the vernacular was found, or null when none was available).
+- The default `--higher-rank-fallback` preserves the prior species → kingdom climb. Pass `--no-higher-rank-fallback` to query only the finest non-null rank in the row's lineage and skip climbing on a miss.
### Cache Management
- Cache default root: `~/.cache/taxonopy`, with command/version/input fingerprints stored as subdirectories (e.g., `resolve_v0.1.0b0_ab12cd34ef56`). `diskcache` manages the store; point `TAXONOPY_CACHE_DIR` (or `--cache-dir`) at the root and let the CLI derive namespaces via `set_cache_namespace`.
diff --git a/docs/user-guide/quick-reference.md b/docs/user-guide/quick-reference.md
index a3f86c6..cefef06 100644
--- a/docs/user-guide/quick-reference.md
+++ b/docs/user-guide/quick-reference.md
@@ -88,19 +88,26 @@ taxonopy common-names \
--output-dir examples/resolved/common
```
-This command uses GBIF Backbone data only and applies deterministic fallback: species to kingdom, with English names preferred at each rank. It also writes a `taxonopy_common_names_manifest.json` to the output directory.
-
-_**Sample common-name output (`examples/resolved/common/sample.resolved.parquet`)**; the last two rows (both Laelia rosea) fall back to family-level common names—none available at species or genus rank._
+- **Common Name Data Source**: GBIF Backbone data only.
+- **Behavior**: The `common-names` command either climbs to higher taxonomic ranks until a common name is found, or queries only the most specific rank available for each row.
+ - **Default**: fallback from species to kingdom, with English names preferred at each rank (the same behavior as passing `--higher-rank-fallback` explicitly).
+ - **Strict mode**: no fallback; only the finest non-null rank in the row's lineage is queried, and both `common_name` and `common_name_rank` are left null if no name is found there (`--no-higher-rank-fallback`). Useful when you would rather emit no common name than a less-specific one.
+- **Output columns**: the `common-names` command adds two columns to the resolved output:
+ - **`common_name` column**: the vernacular name found, or `null` when no name was available.
+ - **`common_name_rank` column**: records the rank at which the vernacular was found, or `null` when no name was available.
+- **Manifest**: writes `taxonopy_common_names_manifest.json` to the output directory.
+
+_**Sample common-name output (`examples/resolved/common/sample.resolved.parquet`)**. Note `common_name_rank = family` on the last two rows (both Laelia rosea). No species- or genus-level vernacular was available, so the climb fell back to family._
-| uuid | common_name | kingdom | phylum | class | order | family | genus | species |
-| --- | --- | --- | --- | --- | --- | --- | --- | --- |
-| bc2a3f9f-c1f9-48df-9b01-d045475b9d5f | Human | Animalia | Chordata | Mammalia | Primates | Hominidae | Homo | Homo sapiens |
-| 21ed76d8-9a3b-406e-a1a3-ef244422bf8e | Eastern White Oak | Plantae | Tracheophyta | Magnoliopsida | Fagales | Fagaceae | Quercus | Quercus alba |
-| 4d166a61-b6e5-4709-91ba-b623111014e9 | Drone-Bee | Animalia | Arthropoda | Insecta | Hymenoptera | Apidae | Apis | Apis mellifera |
-| 85b96dc2-70ab-446e-afb5-6a4b92b0a450 | Fly Agaric | Fungi | Basidiomycota | Agaricomycetes | Agaricales | Amanitaceae | Amanita | Amanita muscaria |
-| 38327554-ffbf-4180-b4cf-63c311a26f4e | Underwing, Tiger, Tussock, And Allied Moths | Animalia | Arthropoda | Insecta | Lepidoptera | Erebidae | Laelia | Laelia rosea |
-| 8f688a17-1f7a-42b2-b3dc-bd4c8fc0eee3 | Orchid | Plantae | Tracheophyta | Liliopsida | Asparagales | Orchidaceae | Laelia | Laelia rosea |
+| uuid | common_name | common_name_rank | kingdom | phylum | class | order | family | genus | species |
+| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
+| bc2a3f9f-c1f9-48df-9b01-d045475b9d5f | Human | species | Animalia | Chordata | Mammalia | Primates | Hominidae | Homo | Homo sapiens |
+| 21ed76d8-9a3b-406e-a1a3-ef244422bf8e | Eastern White Oak | species | Plantae | Tracheophyta | Magnoliopsida | Fagales | Fagaceae | Quercus | Quercus alba |
+| 4d166a61-b6e5-4709-91ba-b623111014e9 | Drone-Bee | species | Animalia | Arthropoda | Insecta | Hymenoptera | Apidae | Apis | Apis mellifera |
+| 85b96dc2-70ab-446e-afb5-6a4b92b0a450 | Fly Agaric | species | Fungi | Basidiomycota | Agaricomycetes | Agaricales | Amanitaceae | Amanita | Amanita muscaria |
+| 38327554-ffbf-4180-b4cf-63c311a26f4e | Underwing, Tiger, Tussock, And Allied Moths | family | Animalia | Arthropoda | Insecta | Lepidoptera | Erebidae | Laelia | Laelia rosea |
+| 8f688a17-1f7a-42b2-b3dc-bd4c8fc0eee3 | Orchid | family | Plantae | Tracheophyta | Liliopsida | Asparagales | Orchidaceae | Laelia | Laelia rosea |
diff --git a/src/taxonopy/cli.py b/src/taxonopy/cli.py
index f582a8c..f35d45f 100644
--- a/src/taxonopy/cli.py
+++ b/src/taxonopy/cli.py
@@ -135,6 +135,18 @@ def create_parser() -> argparse.ArgumentParser:
parser_common = subparsers.add_parser("common-names", help="Merge vernacular names (post-process) into resolved outputs")
parser_common.add_argument("--resolved-dir", dest="annotation_dir", required=True,help="Directory containing your *.resolved.parquet files")
parser_common.add_argument("--output-dir", required=True, help="Directory to write annotated .parquet files")
+ parser_common.add_argument(
+ "--higher-rank-fallback",
+ dest="higher_rank_fallback",
+ action=argparse.BooleanOptionalAction,
+ default=config.higher_rank_fallback,
+ help=(
+ "When set (default), climb species->genus->family->...->kingdom until "
+ "a vernacular is found. With --no-higher-rank-fallback, query the GBIF "
+ "VernacularName table only at the finest non-null rank in the row's "
+ "lineage; no climbing."
+ ),
+ )
return parser
@@ -431,7 +443,11 @@ def main(args: Optional[List[str]] = None) -> int:
count = clear_cache()
print(f"\nCleared {count} cache entries")
return 0
- return cn_main(parsed_args.annotation_dir, parsed_args.output_dir)
+ return cn_main(
+ parsed_args.annotation_dir,
+ parsed_args.output_dir,
+ higher_rank_fallback=parsed_args.higher_rank_fallback,
+ )
else:
parser.error(f"Unknown command: {parsed_args.command}")
return 1
diff --git a/src/taxonopy/config.py b/src/taxonopy/config.py
index 1978d57..40f1dc6 100644
--- a/src/taxonopy/config.py
+++ b/src/taxonopy/config.py
@@ -43,6 +43,9 @@ def __init__(self):
# Output settings
self.output_format = "parquet"
+
+ # Common-names settings
+ self.higher_rank_fallback = True
def update_from_args(self, args: Dict[str, Any]) -> None:
"""Update configuration from command-line arguments.
@@ -85,6 +88,9 @@ def update_from_args(self, args: Dict[str, Any]) -> None:
if 'species_group' in args:
self.species_group = args['species_group']
+
+ if 'higher_rank_fallback' in args:
+ self.higher_rank_fallback = args['higher_rank_fallback']
def ensure_directories(self) -> None:
"""Create any required directories."""
diff --git a/src/taxonopy/resolve_common_names.py b/src/taxonopy/resolve_common_names.py
index 38e5e6d..7686494 100644
--- a/src/taxonopy/resolve_common_names.py
+++ b/src/taxonopy/resolve_common_names.py
@@ -268,71 +268,148 @@ def prioritize_vernacular(vernacular_df: pl.DataFrame) -> pl.DataFrame:
return result.select(["taxonID", "vernacularName"])
-def apply_hierarchical_common_name_lookup(anno_df: pl.DataFrame, common_lookup: pl.DataFrame) -> pl.DataFrame:
+def apply_hierarchical_common_name_lookup(
+ anno_df: pl.DataFrame,
+ common_lookup: pl.DataFrame,
+ higher_rank_fallback: bool = True,
+) -> pl.DataFrame:
"""
- Apply hierarchical common name lookup from most specific to least specific rank.
-
+ Apply common name lookup, with optional higher-rank fallback.
+
+ When ``higher_rank_fallback`` is True (default), iterate ranks from most to
+ least specific (species -> kingdom) and take the first non-null vernacular.
+    When False, query only the finest rank whose lineage column is non-null
+    ("most-granular-resolved"), using that rank's ``taxonID_*`` value: no climbing.
+
+ Always emits a ``common_name_rank`` column recording the rank at which the
+ name was found, or null when no name was available.
+
:param anno_df: Annotation dataframe with taxonID_* columns
:param common_lookup: Common name lookup table with (taxonID, common_name) columns
- :return: DataFrame with common_name column populated using hierarchical fallback
+ :param higher_rank_fallback: Whether to climb to higher ranks on miss
+ :return: DataFrame with ``common_name`` and ``common_name_rank`` columns populated
"""
# Define hierarchical order of taxonomic ranks (map class_ to class)
rank_columns = [r.rstrip('_') for r in TAXONOMIC_RANKS_BY_SPECIFICITY]
-
- # Initialize common_name column
- result_df = anno_df.with_columns(pl.lit(None).cast(pl.Utf8).alias("common_name"))
-
- # Apply hierarchical lookup - check each rank in priority order
+
+ # Initialize common_name and common_name_rank columns
+ result_df = anno_df.with_columns([
+ pl.lit(None).cast(pl.Utf8).alias("common_name"),
+ pl.lit(None).cast(pl.Utf8).alias("common_name_rank"),
+ ])
+
+ if higher_rank_fallback:
+ # Apply hierarchical lookup - check each rank in priority order
+ for rank in rank_columns:
+ taxonid_col = f"taxonID_{rank}"
+ if taxonid_col not in result_df.columns:
+ continue
+
+ # Join common names for this rank
+ temp_df = result_df.join(
+ common_lookup.select([
+ "taxonID",
+ pl.col("common_name").alias(f"temp_common_{rank}")
+ ]),
+ left_on=taxonid_col,
+ right_on="taxonID",
+ how="left"
+ )
+
+            # Record the rank ONLY where this is the first hit (existing
+            # common_name still null and this rank produced a value). Both
+            # expressions in this with_columns call are evaluated against the
+            # incoming frame, so the predicate observes the pre-update common_name.
+ result_df = (
+ temp_df
+ .with_columns([
+ pl.when(
+ pl.col("common_name").is_null()
+ & pl.col(f"temp_common_{rank}").is_not_null()
+ )
+ .then(pl.lit(rank))
+ .otherwise(pl.col("common_name_rank"))
+ .alias("common_name_rank"),
+ pl.coalesce([
+ pl.col("common_name"),
+ pl.col(f"temp_common_{rank}")
+ ]).alias("common_name"),
+ ])
+ .drop(f"temp_common_{rank}")
+ )
+
+ # Drop taxonID column if it exists (may not exist if no matches)
+ if "taxonID" in result_df.columns:
+ result_df = result_df.drop("taxonID")
+
+ return result_df
+
+ # Fallback OFF: most-granular-resolved.
+ # 1) Per row, identify the finest non-null rank in the lineage.
+ finest_rank_expr = pl.lit(None).cast(pl.Utf8)
+ for rank in rank_columns: # species -> kingdom
+ if rank in result_df.columns:
+ finest_rank_expr = (
+ pl.when(finest_rank_expr.is_null() & pl.col(rank).is_not_null())
+ .then(pl.lit(rank))
+ .otherwise(finest_rank_expr)
+ )
+ result_df = result_df.with_columns(finest_rank_expr.alias("_finest_rank"))
+
+ # 2) Project that rank's taxonID into a unified column.
+ finest_tid_expr = pl.lit(None).cast(pl.Utf8)
for rank in rank_columns:
- taxonid_col = f"taxonID_{rank}"
- if taxonid_col not in result_df.columns:
- continue
-
- # Join common names for this rank
- temp_df = result_df.join(
- common_lookup.select([
- "taxonID",
- pl.col("common_name").alias(f"temp_common_{rank}")
- ]),
- left_on=taxonid_col,
- right_on="taxonID",
- how="left"
- )
-
- # Update common_name where it's null and this rank has a name
- result_df = (
- temp_df
- # pick up the new common_name, drop the temp join field
- .with_columns([
- pl.coalesce([
- pl.col("common_name"),
- pl.col(f"temp_common_{rank}")
- ]).alias("common_name")
- ])
- .drop(f"temp_common_{rank}")
- )
-
- # Drop taxonID column if it exists (may not exist if no matches)
- if "taxonID" in result_df.columns:
- result_df = result_df.drop("taxonID")
-
+ tcol = f"taxonID_{rank}"
+ if tcol in result_df.columns:
+ finest_tid_expr = (
+ pl.when(pl.col("_finest_rank") == rank)
+ .then(pl.col(tcol).cast(pl.Utf8))
+ .otherwise(finest_tid_expr)
+ )
+ result_df = result_df.with_columns(finest_tid_expr.alias("_finest_taxonid"))
+
+ # 3) Single join at the finest rank's taxonID; assign common_name and
+ # common_name_rank in one pass.
+ result_df = result_df.join(
+ common_lookup.select([
+ pl.col("taxonID").cast(pl.Utf8).alias("_finest_taxonid"),
+ pl.col("common_name").alias("_cn_finest"),
+ ]),
+ on="_finest_taxonid",
+ how="left",
+ )
+ result_df = result_df.with_columns([
+ pl.col("_cn_finest").alias("common_name"),
+ pl.when(pl.col("_cn_finest").is_not_null())
+ .then(pl.col("_finest_rank"))
+ .otherwise(pl.lit(None).cast(pl.Utf8))
+ .alias("common_name_rank"),
+ ]).drop(["_cn_finest", "_finest_rank", "_finest_taxonid"])
+
return result_df
-def override_input_common_name(df: pl.DataFrame, common_lookup: pl.DataFrame) -> pl.DataFrame:
+def override_input_common_name(
+ df: pl.DataFrame,
+ common_lookup: pl.DataFrame,
+ higher_rank_fallback: bool = True,
+) -> pl.DataFrame:
"""
Override any existing common_name column with backbone-derived common names.
-
+
:param df: DataFrame that may have a pre-existing common_name column
:param common_lookup: Common name lookup table with hierarchical fallback applied
+ :param higher_rank_fallback: Whether to climb to higher ranks on miss
:return: DataFrame with backbone-derived common_name (input common_name completely replaced)
"""
# Drop any existing common_name column and apply the backbone lookup
df_clean = df.drop("common_name") if "common_name" in df.columns else df
- return apply_hierarchical_common_name_lookup(df_clean, common_lookup)
+ return apply_hierarchical_common_name_lookup(
+ df_clean, common_lookup, higher_rank_fallback=higher_rank_fallback
+ )
-def merge_common_name(anno_df, common_name_df, taxon_df):
+def merge_common_name(anno_df, common_name_df, taxon_df, higher_rank_fallback: bool = True):
"""
This function merges common names with annotation dataframe using hierarchical lookup.
Common names are always derived from backbone lookup data for consistent mapping.
@@ -393,7 +470,9 @@ def merge_common_name(anno_df, common_name_df, taxon_df):
new_anno_df = new_anno_df.rename({temp_taxonid_col: taxonid_col})
# Override any input common_name with backbone data
- new_anno_df = override_input_common_name(new_anno_df, common_lookup)
+ new_anno_df = override_input_common_name(
+ new_anno_df, common_lookup, higher_rank_fallback=higher_rank_fallback
+ )
# Clean up temporary taxonID columns (keep original taxonomic classification columns)
cleanup_cols = [f"taxonID_{rank}" for rank in rank_columns]
@@ -408,7 +487,7 @@ def merge_common_name(anno_df, common_name_df, taxon_df):
return new_anno_df
-def main(annotation_dir=None, output_dir=None):
+def main(annotation_dir=None, output_dir=None, higher_rank_fallback: bool = True):
"""
Merge common names into resolved output files.
"""
@@ -428,6 +507,18 @@ def main(annotation_dir=None, output_dir=None):
required=True,
help="Where to write the new, annotated .parquet files"
)
+ parser.add_argument(
+ "--higher-rank-fallback",
+ dest="higher_rank_fallback",
+ action=argparse.BooleanOptionalAction,
+ default=True,
+ help=(
+ "When set (default), climb species->genus->family->...->kingdom until "
+ "a vernacular is found. With --no-higher-rank-fallback, query the GBIF "
+ "VernacularName table only at the finest non-null rank in the row's "
+ "lineage; no climbing."
+ ),
+ )
args = parser.parse_args()
# Update config if cache-dir was provided
@@ -438,6 +529,7 @@ def main(annotation_dir=None, output_dir=None):
annotation_dir = args.annotation_dir
output_dir = args.output_dir
+ higher_rank_fallback = args.higher_rank_fallback
# Use global config's cache_dir
from taxonopy.config import config
@@ -482,7 +574,10 @@ def main(annotation_dir=None, output_dir=None):
anno_df = pl.read_parquet(annotation_path)
new_df = merge_taxon_id(anno_df, taxon_df)
- new_df = merge_common_name(new_df, common_name_df, taxon_df)
+ new_df = merge_common_name(
+ new_df, common_name_df, taxon_df,
+ higher_rank_fallback=higher_rank_fallback,
+ )
new_df = new_df.with_columns([
pl.col("scientific_name").cast(pl.Utf8)
])
diff --git a/tests/test_resolve_common_names.py b/tests/test_resolve_common_names.py
index da75185..afda943 100644
--- a/tests/test_resolve_common_names.py
+++ b/tests/test_resolve_common_names.py
@@ -127,10 +127,11 @@ def test_merge_common_name_species_priority(self):
})
result = merge_common_name(anno_df, common_name_df, taxon_df)
-
+
assert len(result) == 1
assert result["common_name"].to_list()[0] == "Gray Wolf", "Should prefer species over genus name"
-
+ assert result["common_name_rank"].to_list()[0] == "species", "Rank should record the species-level hit"
+
def test_merge_common_name_cleans_up_intermediate_columns(self):
"""Test that all intermediate columns are removed after processing"""
anno_df = pl.DataFrame({
@@ -172,7 +173,8 @@ def test_merge_common_name_cleans_up_intermediate_columns(self):
# Lock in the exact final column set
expected_columns = {
- "uuid", "species", "genus", "family", "order", "class", "phylum", "kingdom", "common_name"
+ "uuid", "species", "genus", "family", "order", "class", "phylum", "kingdom",
+ "common_name", "common_name_rank"
}
assert set(result.columns) == expected_columns, f"Final columns should be exactly {expected_columns}"
@@ -234,7 +236,8 @@ def test_merge_common_name_overrides_existing_common_name(self):
# Verify exact final column set (original input had common_name, should be overridden)
expected_columns = {
- "uuid", "species", "genus", "family", "order", "class", "phylum", "kingdom", "common_name"
+ "uuid", "species", "genus", "family", "order", "class", "phylum", "kingdom",
+ "common_name", "common_name_rank"
}
assert set(result.columns) == expected_columns, f"Final columns should be exactly {expected_columns}"
@@ -325,8 +328,11 @@ def test_merge_common_name_hierarchical_fallback_levels(self, available_rank, ex
})
result = merge_common_name(anno_df, common_name_df, taxon_df)
-
+
assert result["common_name"].to_list()[0] == expected_name, f"Should fallback to {available_rank} level name"
+ assert result["common_name_rank"].to_list()[0] == available_rank, (
+ f"common_name_rank should record the {available_rank} fallback level"
+ )
class TestNormalizeTaxonomicColumns:
@@ -421,9 +427,10 @@ def test_hierarchical_fallback_levels(self, available_rank, expected_name):
})
result = apply_hierarchical_common_name_lookup(anno_df, common_lookup)
-
+
assert result["common_name"].to_list()[0] == expected_name
-
+ assert result["common_name_rank"].to_list()[0] == available_rank
+
def test_species_takes_priority_over_genus(self):
"""Test that species-level names take priority over genus"""
anno_df = pl.DataFrame({
@@ -438,8 +445,236 @@ def test_species_takes_priority_over_genus(self):
})
result = apply_hierarchical_common_name_lookup(anno_df, common_lookup)
-
+
assert result["common_name"].to_list()[0] == "Gray Wolf"
+ assert result["common_name_rank"].to_list()[0] == "species"
+
+
+class TestHierarchicalCommonNameLookupNoFallback:
+ """Unit tests for apply_hierarchical_common_name_lookup with fallback disabled"""
+
+ @pytest.mark.parametrize("finest_rank,expected_name", [
+ ("species", "Species Name"),
+ ("genus", "Genus Name"),
+ ("family", "Family Name"),
+ ("order", "Order Name"),
+ ("class", "Class Name"),
+ ("phylum", "Phylum Name"),
+ ("kingdom", "Kingdom Name"),
+ ])
+ def test_no_fallback_returns_name_at_finest_rank(self, finest_rank, expected_name):
+ """With fallback off, only the finest non-null rank's vernacular is returned."""
+ # Build a lineage that is non-null only at finest_rank and any higher (less-specific) ranks.
+ rank_order = ["species", "genus", "family", "order", "class", "phylum", "kingdom"]
+ finest_idx = rank_order.index(finest_rank)
+ lineage = {r: [None] for r in rank_order}
+ for r in rank_order[finest_idx:]:
+ lineage[r] = [f"Test {r}"]
+
+ taxonid_cols = {f"taxonID_{r}": [None] for r in rank_order}
+ taxonid_cols[f"taxonID_{finest_rank}"] = [100]
+
+ anno_df = pl.DataFrame({"uuid": ["test1"], **lineage, **taxonid_cols})
+
+ common_lookup = pl.DataFrame({
+ "taxonID": [100],
+ "common_name": [expected_name],
+ })
+
+ result = apply_hierarchical_common_name_lookup(
+ anno_df, common_lookup, higher_rank_fallback=False
+ )
+
+ assert result["common_name"].to_list()[0] == expected_name
+ assert result["common_name_rank"].to_list()[0] == finest_rank
+
+ def test_no_fallback_no_climb_when_species_has_no_vernacular(self):
+ """Species present in lineage but lacking a vernacular -> null, no climb to genus."""
+ anno_df = pl.DataFrame({
+ "uuid": ["test1"],
+ "species": ["Canis lupus"],
+ "genus": ["Canis"],
+ "family": ["Canidae"],
+ "order": ["Carnivora"],
+ "class": ["Mammalia"],
+ "phylum": ["Chordata"],
+ "kingdom": ["Animalia"],
+ "taxonID_species": [100], # no vernacular for 100
+ "taxonID_genus": [200], # genus has a vernacular but should NOT be used
+ "taxonID_family": [None],
+ "taxonID_order": [None],
+ "taxonID_class": [None],
+ "taxonID_phylum": [None],
+ "taxonID_kingdom": [None],
+ })
+
+ common_lookup = pl.DataFrame({
+ "taxonID": [200],
+ "common_name": ["Dog Genus"],
+ })
+
+ result = apply_hierarchical_common_name_lookup(
+ anno_df, common_lookup, higher_rank_fallback=False
+ )
+
+ assert result["common_name"].to_list()[0] is None
+ assert result["common_name_rank"].to_list()[0] is None
+
+ def test_no_fallback_finest_is_genus_when_species_is_null(self):
+ """Species null but genus populated -> genus is queried only."""
+ anno_df = pl.DataFrame({
+ "uuid": ["test1"],
+ "species": [None],
+ "genus": ["Canis"],
+ "family": ["Canidae"],
+ "order": ["Carnivora"],
+ "class": ["Mammalia"],
+ "phylum": ["Chordata"],
+ "kingdom": ["Animalia"],
+ "taxonID_species": [None],
+ "taxonID_genus": [200],
+ "taxonID_family": [500],
+ "taxonID_order": [None],
+ "taxonID_class": [None],
+ "taxonID_phylum": [None],
+ "taxonID_kingdom": [None],
+ })
+
+ common_lookup = pl.DataFrame({
+ "taxonID": [200, 500],
+ "common_name": ["Dog Genus", "Dog Family"],
+ })
+
+ result = apply_hierarchical_common_name_lookup(
+ anno_df, common_lookup, higher_rank_fallback=False
+ )
+
+ assert result["common_name"].to_list()[0] == "Dog Genus"
+ assert result["common_name_rank"].to_list()[0] == "genus"
+
+ def test_no_fallback_all_null_lineage(self):
+ """All-null lineage row -> both output columns null."""
+ anno_df = pl.DataFrame({
+ "uuid": ["test1"],
+ "species": [None],
+ "genus": [None],
+ "family": [None],
+ "order": [None],
+ "class": [None],
+ "phylum": [None],
+ "kingdom": [None],
+ "taxonID_species": [None],
+ "taxonID_genus": [None],
+ "taxonID_family": [None],
+ "taxonID_order": [None],
+ "taxonID_class": [None],
+ "taxonID_phylum": [None],
+ "taxonID_kingdom": [None],
+ })
+
+ common_lookup = pl.DataFrame({
+ "taxonID": [100],
+ "common_name": ["Anything"],
+ })
+
+ result = apply_hierarchical_common_name_lookup(
+ anno_df, common_lookup, higher_rank_fallback=False
+ )
+
+ assert result["common_name"].to_list()[0] is None
+ assert result["common_name_rank"].to_list()[0] is None
+
+
+class TestMergeCommonNameNoFallback:
+ """Integration tests for merge_common_name with fallback disabled."""
+
+ def test_no_fallback_schema_includes_rank_column(self):
+ """common_name_rank is emitted regardless of mode."""
+ anno_df = pl.DataFrame({
+ "uuid": ["test1"],
+ "species": ["Canis lupus"],
+ "genus": ["Canis"],
+ "family": ["Canidae"],
+ "order": ["Carnivora"],
+ "class": ["Mammalia"],
+ "phylum": ["Chordata"],
+ "kingdom": ["Animalia"],
+ "taxonID_species": [100],
+ "taxonID_genus": [300],
+ })
+
+ common_name_df = pl.DataFrame({
+ "taxonID": [100],
+ "vernacularName": ["Gray Wolf"],
+ })
+
+ taxon_df = pl.DataFrame({
+ "taxonID": [100, 300],
+ "canonicalName": ["Canis lupus", "Canis"],
+ "taxonRank": ["species", "genus"],
+ "kingdom": ["Animalia"] * 2,
+ "phylum": ["Chordata"] * 2,
+ "class": ["Mammalia"] * 2,
+ "order": ["Carnivora"] * 2,
+ "family": ["Canidae"] * 2,
+ "genus": ["Canis"] * 2,
+ })
+
+ result = merge_common_name(
+ anno_df, common_name_df, taxon_df, higher_rank_fallback=False
+ )
+
+ assert "common_name_rank" in result.columns
+ assert result["common_name"].to_list()[0] == "Gray Wolf"
+ assert result["common_name_rank"].to_list()[0] == "species"
+
+ def test_no_fallback_differs_from_fallback_on_genus_only_hit(self):
+ """When only the genus has a vernacular, ON gets it; OFF returns null."""
+ anno_df = pl.DataFrame({
+ "uuid": ["test1"],
+ "species": ["Canis lupus"],
+ "genus": ["Canis"],
+ "family": ["Canidae"],
+ "order": ["Carnivora"],
+ "class": ["Mammalia"],
+ "phylum": ["Chordata"],
+ "kingdom": ["Animalia"],
+ "taxonID_species": [100],
+ "taxonID_genus": [300],
+ })
+
+ # Only genus has a vernacular; species (taxonID 100) does not.
+ common_name_df = pl.DataFrame({
+ "taxonID": [300],
+ "vernacularName": ["Dog Genus"],
+ })
+
+ taxon_df = pl.DataFrame({
+ "taxonID": [100, 300],
+ "canonicalName": ["Canis lupus", "Canis"],
+ "taxonRank": ["species", "genus"],
+ "kingdom": ["Animalia"] * 2,
+ "phylum": ["Chordata"] * 2,
+ "class": ["Mammalia"] * 2,
+ "order": ["Carnivora"] * 2,
+ "family": ["Canidae"] * 2,
+ "genus": ["Canis"] * 2,
+ })
+
+ on_result = merge_common_name(
+ anno_df, common_name_df, taxon_df, higher_rank_fallback=True
+ )
+ off_result = merge_common_name(
+ anno_df, common_name_df, taxon_df, higher_rank_fallback=False
+ )
+
+ # ON: climbs to genus
+ assert on_result["common_name"].to_list()[0] == "Dog Genus"
+ assert on_result["common_name_rank"].to_list()[0] == "genus"
+
+ # OFF: species is the finest non-null rank; no climb -> null
+ assert off_result["common_name"].to_list()[0] is None
+ assert off_result["common_name_rank"].to_list()[0] is None
class TestOverrideInputCommonName: