Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion dtcli/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from click_aliasing import ClickAliasedGroup
from rich import console, pretty

from dtcli import clear, config, ls, ps, pull, scout
from dtcli import clear, config, ls, ps, pull, scout, unregistered
from dtcli.utilities import utilities

pretty.install()
Expand Down Expand Up @@ -46,6 +46,7 @@ def version():
cli.add_command(ps.ps)
cli.add_command(pull.pull)
cli.add_command(scout.scout)
cli.add_command(unregistered.unregistered)


def check_version() -> None:
Expand Down
71 changes: 71 additions & 0 deletions dtcli/src/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@

import logging
import os
import re
import shutil
import time
from collections import defaultdict
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

Expand Down Expand Up @@ -481,3 +483,72 @@ def get_unregistered_dataset(dataset: str, scope: str) -> Optional[Dict[str, Any
return None
else:
return response[0]


def signature(msg: str) -> str:
"""Create a signature for a reason unregistered message.

Args:
msg: Reason message for unregistered dataset.

Returns:
str: Signature for error message.
"""
ATTACH_RE = re.compile(
r"Could not attach datasets: \d+ and (pulsar\.[^\.]+).*?, (\w+)\.event\.baseband\.raw not found" # noqa: E501
)

CREATE_RE = re.compile(
r"Could not create dataset: \d+, scope: (\w+\.event\.baseband\.raw).*UniqueViolation" # noqa: E501
)

msg = msg.strip()

# Attach-dataset errors
m = ATTACH_RE.search(msg)
if m:
pulsar, backend = m.groups()
return f"ATTACH_MISSING:{pulsar}:{backend}"

# Create-dataset unique violation
m = CREATE_RE.search(msg)
if m:
scope = m.group(1)
return f"CREATE_DUPLICATE:{scope}"

# Short status / token messages
if len(msg) < 80 and "\n" not in msg:
return f"STATUS:{msg}"

# Fallback: normalized text
msg = re.sub(r"\d+", "<ID>", msg)
msg = re.sub(r"\s+", " ", msg)
return f"OTHER:{msg[:120]}"


def get_all_unregistered_datasets() -> List[Dict[str, Any]]:
"""Get all unregistered datasets from Workflow Results.

Returns:
List[Dict[str, Any]]: List of unregistered dataset information.
"""
return view_results(
pipeline="datatrail-unregistered-datasets", query={}, projection={}, limit=10000
)


def summarise_unregistered_datasets() -> Dict[str, int]:
"""Create a summary of unregistered datasets by grouping similar error messages.

Returns:
Dict[str, int]: Dictionary of error message signatures and their counts.
"""
response = get_all_unregistered_datasets()
reason_groups: Dict[str, int] = defaultdict(int)
messages = [str(r["results"]["reason"]) for r in response]

for msg in messages:
sig = signature(msg)
reason_groups[sig] += 1

return reason_groups
57 changes: 57 additions & 0 deletions dtcli/unregistered.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
"""Datatrail Unregistered datasets commands."""

import logging

import click
from rich.console import Console
from rich.table import Table

from dtcli.src import functions
from dtcli.utilities.utilities import set_log_level

logger = logging.getLogger(__name__)

console = Console()
error_console = Console(stderr=True, style="bold red")


@click.group(help="Commands related to unregistered datasets.")
def unregistered():
"""Group of commands related to unregistered datasets."""
pass


@unregistered.command(help="Summarise the reasons for unregistered datasets.")
@click.option("-v", "--verbose", count=True, help="Verbosity: v=INFO, vv=DEBUG.")
@click.option("-q", "--quiet", is_flag=True, help="Only errors shown in logs.")
@click.pass_context
def summary(
ctx: click.Context,
verbose: int = 0,
quiet: bool = False,
):
"""Show a summary of the unregistered datasets.

Args:
ctx (click.Context): Click context.
verbose (int): Verbosity: v=INFO, vv=DEBUG.
quiet (bool): Only errors shown in logs.
"""
# Set logging level.
set_log_level(logger, verbose, quiet)
logger.debug("`summary` called with:")
logger.debug(f"verbose: {verbose} [{type(verbose)}]")
logger.debug(f"quiet: {quiet} [{type(quiet)}]")

results = functions.summarise_unregistered_datasets()

table = Table(
title="Summary of reasons", header_style="magenta", title_style="bold magenta"
)
table.add_column("Reason")
table.add_column("Number of Datasets")

for key, value in results.items():
table.add_row(key, str(value))

console.print(table)
14 changes: 1 addition & 13 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,21 +43,9 @@ def test_cli_help(runner: CliRunner) -> None:
expected_response = """Usage: cli [OPTIONS] COMMAND [ARGS]...

Datatrail Command Line Interface.

Options:
--help Show this message and exit.

Commands:
clear Clear a dataset.
config Datatrail CLI Configuration.
list (ls) List scopes & datasets
ps Details of a dataset.
pull Download a dataset.
scout Scout a dataset.
version Show versions.
"""
assert result.exit_code == 0
assert result.output == expected_response
assert expected_response in result.output


def test_cli_config_help(runner: CliRunner) -> None:
Expand Down