Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
254 changes: 253 additions & 1 deletion src/fromager/commands/graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,12 @@
import typing

import click
import rich
import rich.box
from packaging.requirements import Requirement
from packaging.utils import canonicalize_name
from packaging.utils import NormalizedName, canonicalize_name
from packaging.version import Version
from rich.table import Table

from fromager import clickext, context
from fromager.commands import bootstrap
Expand Down Expand Up @@ -784,3 +787,252 @@ def n2s(nodes: typing.Iterable[DependencyNode]) -> str:
topo.done(*nodes_to_build)

print(f"\nBuilding {len(graph)} packages in {rounds} rounds.")


def get_dependency_closure(node: DependencyNode) -> set[NormalizedName]:
    """Compute the full transitive dependency closure for a node.

    Every edge type recorded in the graph is followed. The result contains
    the node's own canonical name plus the names of all reachable
    dependencies, with the ROOT sentinel filtered out.

    Args:
        node: The starting node to compute the closure for.

    Returns:
        Set of canonicalized package names in the transitive closure.
    """
    closure: set[NormalizedName] = {
        dep.canonicalized_name
        for dep in node.iter_all_dependencies()
        if dep.canonicalized_name != ROOT
    }
    if node.canonicalized_name != ROOT:
        closure.add(node.canonicalized_name)
    return closure


def get_package_names(graph: DependencyGraph) -> set[NormalizedName]:
    """Extract all unique canonical package names from a graph.

    Args:
        graph: The dependency graph to extract names from.

    Returns:
        Set of canonicalized package names, excluding the ROOT node.
    """
    names: set[NormalizedName] = set()
    for graph_node in graph.get_all_nodes():
        if graph_node.key != ROOT:
            names.add(graph_node.canonicalized_name)
    return names


def extract_collection_name(graph_path: str) -> str:
    """Derive a collection name from a graph file path.

    The name is simply the file's base name with its final extension
    removed, so users are free to choose file names that are meaningful
    to them.

    Args:
        graph_path: Filesystem path to a graph JSON file.

    Returns:
        The filename without the extension.
    """
    path = pathlib.PurePath(graph_path)
    return path.stem


class _CollectionScore(typing.NamedTuple):
    """Overlap score between a package's dependency closure and a collection."""

    # Name of the collection being scored (derived from its graph filename).
    collection: str
    # Closure packages the collection does not already contain.
    new_packages: int
    # Closure packages already present in the collection.
    existing_packages: int
    # existing_packages as a percentage of the closure size (0.0-100.0).
    coverage_percentage: float


def _analyze_suggestions(
    toplevel_nodes: list[DependencyNode],
    collection_packages: dict[str, set[NormalizedName]],
) -> list[dict[str, typing.Any]]:
    """Score each onboarding top-level package against every collection.

    Args:
        toplevel_nodes: Top-level nodes from the onboarding graph.
        collection_packages: Mapping of collection name to its package name set.

    Returns:
        List of result dicts, one per top-level package, sorted by package name.
    """
    analyzed: list[dict[str, typing.Any]] = []

    for node in sorted(toplevel_nodes, key=lambda n: n.canonicalized_name):
        closure = get_dependency_closure(node)
        closure_size = len(closure)

        ranked: list[_CollectionScore] = []
        for name, members in collection_packages.items():
            overlap = len(closure & members)
            missing = closure_size - overlap
            pct = (overlap / closure_size * 100) if closure_size else 0.0
            ranked.append(_CollectionScore(name, missing, overlap, pct))

        # Best fit: fewest new packages, then highest coverage, then the
        # collection name as a deterministic tiebreaker.
        ranked.sort(
            key=lambda entry: (
                entry.new_packages,
                -entry.coverage_percentage,
                entry.collection,
            )
        )
        best = ranked[0] if ranked else None

        logger.debug(
            "%s: %d deps, best fit '%s' (%d new, %.1f%% coverage)",
            node.canonicalized_name,
            closure_size,
            best.collection if best else "none",
            best.new_packages if best else 0,
            best.coverage_percentage if best else 0.0,
        )

        analyzed.append(
            {
                "package": str(node.canonicalized_name),
                "version": str(node.version),
                "total_dependencies": closure_size,
                "best_fit": best.collection if best else "none",
                "new_packages": best.new_packages if best else 0,
                "existing_packages": best.existing_packages if best else 0,
                "coverage_percentage": (
                    round(best.coverage_percentage, 1) if best else 0.0
                ),
                "all_collections": [
                    {
                        "collection": entry.collection,
                        "new_packages": entry.new_packages,
                        "existing_packages": entry.existing_packages,
                        "coverage_percentage": round(entry.coverage_percentage, 1),
                    }
                    for entry in ranked
                ],
            }
        )

    return analyzed


def _print_suggest_collection_table(
    results: list[dict[str, typing.Any]],
) -> None:
    """Render suggest-collection results as a Rich table."""
    table = Table(
        title="Collection Suggestions for Onboarding Packages",
        box=rich.box.MARKDOWN,
        title_justify="left",
    )
    # (header, justification) pairs in display order; every column no-wrap.
    for header, justify in (
        ("Package", "left"),
        ("Version", "left"),
        ("Total Deps", "right"),
        ("Best Fit", "left"),
        ("New Pkgs", "right"),
        ("Existing", "right"),
        ("Coverage", "right"),
    ):
        table.add_column(header, justify=justify, no_wrap=True)

    for row in results:
        table.add_row(
            row["package"],
            row["version"],
            str(row["total_dependencies"]),
            row["best_fit"],
            str(row["new_packages"]),
            str(row["existing_packages"]),
            f"{row['coverage_percentage']:.1f}%",
        )

    rich.get_console().print(table)


@graph.command(name="suggest-collection")
@click.option(
    "--format",
    "output_format",
    type=click.Choice(["table", "json"], case_sensitive=False),
    default="table",
    help="Output format (default: table)",
)
@click.argument("onboarding-graph", type=str)
@click.argument("collection-graphs", nargs=-1, required=True, type=str)
def suggest_collection(
    output_format: str,
    onboarding_graph: str,
    collection_graphs: tuple[str, ...],
) -> None:
    """Suggest the best-fit collection for each onboarding package.

    Analyzes dependency overlap between top-level packages in ONBOARDING_GRAPH
    and the existing COLLECTION_GRAPHS to recommend where each onboarding
    package should be placed.

    For each top-level package in the onboarding graph, computes the full
    transitive dependency closure and compares it against every collection.
    Collections are ranked by fewest new packages required, then by highest
    dependency coverage.

    \b
    ONBOARDING_GRAPH Path to the onboarding collection graph.json.
    COLLECTION_GRAPHS One or more paths to existing collection graph.json files.
    """
    # Load the graph describing the packages being onboarded; surface load
    # failures as a CLI error instead of a traceback.
    try:
        onboarding = DependencyGraph.from_file(onboarding_graph)
    except Exception as err:
        raise click.ClickException(
            f"Failed to load onboarding graph {onboarding_graph}: {err}"
        ) from err

    root = onboarding.get_root_node()

    # Only direct top-level requirements of the root are placement
    # candidates; their transitive deps are covered by the closure later.
    toplevel_nodes: list[DependencyNode] = [
        edge.destination_node
        for edge in root.children
        if edge.req_type == RequirementType.TOP_LEVEL
    ]

    if not toplevel_nodes:
        # Warn but continue; the command then produces empty output.
        click.echo("No top-level packages found in onboarding graph.", err=True)

    logger.info(
        "Loaded onboarding graph with %d top-level packages", len(toplevel_nodes)
    )

    # Map each collection (named after its graph file) to the set of
    # package names it already contains.
    collection_packages: dict[str, set[NormalizedName]] = {}
    for graph_path in collection_graphs:
        collection_name = extract_collection_name(graph_path)
        # Two graph files with the same stem would silently shadow each
        # other, so treat a duplicate name as a user error.
        if collection_name in collection_packages:
            raise click.ClickException(
                f"Duplicate collection name '{collection_name}' from {graph_path}. "
                "Rename one of the graph files to avoid ambiguity."
            )
        try:
            collection_graph = DependencyGraph.from_file(graph_path)
        except Exception as err:
            raise click.ClickException(
                f"Failed to load collection graph {graph_path}: {err}"
            ) from err
        collection_packages[collection_name] = get_package_names(collection_graph)
        logger.debug(
            "Collection '%s': %d packages",
            collection_name,
            len(collection_packages[collection_name]),
        )

    results = _analyze_suggestions(toplevel_nodes, collection_packages)

    if output_format == "json":
        click.echo(json.dumps(results, indent=2))
    else:
        _print_suggest_collection_table(results)
18 changes: 18 additions & 0 deletions src/fromager/dependency_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,24 @@ def iter_build_requirements(self) -> typing.Iterable[DependencyNode]:
):
yield install_edge.destination_node

def iter_all_dependencies(self) -> typing.Iterable[DependencyNode]:
"""Get all unique, recursive dependencies following every edge type.

Yields every reachable node exactly once using iterative DFS.
Follows install, build, and toplevel edges.
"""
visited: set[str] = {self.key}
stack: list[DependencyNode] = [self]
while stack:
current = stack.pop()
for edge in current.children:
child_node = edge.destination_node
if child_node.key in visited:
continue
visited.add(child_node.key)
yield child_node
stack.append(child_node)

def iter_install_requirements(self) -> typing.Iterable[DependencyNode]:
"""Get all unique, recursive install requirements"""
visited: set[str] = set()
Expand Down
Loading
Loading