diff --git a/src/somef/__main__.py b/src/somef/__main__.py index 0b666830..6b6dd902 100644 --- a/src/somef/__main__.py +++ b/src/somef/__main__.py @@ -195,7 +195,13 @@ def configure(auto, base_uri): "--tag", type=str, default=None, - help="Tag of the repository to analyze. Incompatible with --branch" + help="Tag of the repository to analyze. Incompatible with --branch and --commit" +) +@click.option( + "--commit", + type=str, + default=None, + help="Commit SHA of the repository to analyze. Incompatible with --branch and --tag" ) def describe(requirements_v, requirements_all, **kwargs): # import so missing packages get installed when appropriate diff --git a/src/somef/process_repository.py b/src/somef/process_repository.py index 6cbd6e06..76f1ada1 100644 --- a/src/somef/process_repository.py +++ b/src/somef/process_repository.py @@ -152,7 +152,8 @@ def rate_limit_get(*args, backoff_rate=2, initial_backoff=1, size_limit_mb=const return response, date -def load_gitlab_repository_metadata(repo_metadata: Result, repository_url): +def load_gitlab_repository_metadata(repo_metadata: Result, repository_url, + ignore_api_metadata=False, commit=None): """ Function uses the repository_url provided to load required information from gitlab. Information kept from the repository is written in keep_keys. @@ -160,6 +161,8 @@ def load_gitlab_repository_metadata(repo_metadata: Result, repository_url): ---------- @param repo_metadata: Result object with the metadata found in the repository so far @param repository_url: URL of the Gitlab repository to analyze + @param ignore_api_metadata: true if you do not want to do an additional request to the target API + @param commit: commit SHA of the repository to analyze Returns ------- @@ -308,6 +311,9 @@ def load_gitlab_repository_metadata(repo_metadata: Result, repository_url): if default_branch is None: default_branch = general_resp['defaultBranch'] + if commit: + default_branch = commit + project_path = "/".join(path_components) # {constants.PROP_VALUE: f"https://{url.netloc}/{owner}/{repo_name}/", @@ -405,6 +411,12 @@ def load_gitlab_repository_metadata(repo_metadata: Result, repository_url): constants.PROP_TYPE: constants.URL }, 1, constants.TECHNIQUE_GITLAB_API) + if not ignore_api_metadata and commit: + repo_metadata = fetch_commit_metadata( + repo_metadata, constants.RepositoryType.GITLAB, commit, + headers=header_template(), project_api_url=project_api_url + ) + logging.info("Repository information successfully loaded. \n") # return repo_metadata, owner, repo_name, default_branch return repo_metadata, owner, repo_name, default_branch, project_path @@ -510,7 +522,7 @@ def download_readme(owner, repo_name, default_branch, repo_type, authorization, def load_online_repository_metadata(repository_metadata: Result, repository_url, ignore_api_metadata=False, repo_type=constants.RepositoryType.GITHUB, authorization=None, reconcile_authors=False, - branch=None,tag=None): + branch=None,tag=None,commit=None): """ Function uses the repository_url provided to load required information from GitHub or Gitlab. Information kept from the repository is written in keep_keys. @@ -524,13 +536,16 @@ def load_online_repository_metadata(repository_metadata: Result, repository_url, @param reconcile_authors: flag to indicate if additional should be extracted from certain files as codeowners. More request. @param branch: branch of the repository to analyze. Overrides the default branch detected from the repository metadata. @param tag: tag of the repository to analyze. Cannot be used together with the branch parameter. + @param commit: commit SHA of the repository to analyze. Cannot be used together with the branch or tag parameters. Returns ------- @return: Result object with the available metadata from online APIs plus its owner, repo name and default branch """ if repo_type == constants.RepositoryType.GITLAB: - return load_gitlab_repository_metadata(repository_metadata, repository_url) + return load_gitlab_repository_metadata(repository_metadata, repository_url, + ignore_api_metadata=ignore_api_metadata, + commit=commit) elif repo_type == constants.RepositoryType.LOCAL: logging.warning("Trying to download metadata from a local repository") return None @@ -602,6 +617,8 @@ def load_online_repository_metadata(repository_metadata: Result, repository_url, default_branch = branch if tag: default_branch = tag + if commit: + default_branch = commit # filter the general response with only the fields we are interested in, mapping them to our keys filtered_resp = {} @@ -743,10 +760,220 @@ def load_online_repository_metadata(repository_metadata: Result, repository_url, repository_metadata.add_result(constants.CAT_RELEASES, release_obj, 1, constants.TECHNIQUE_GITHUB_API) + if not ignore_api_metadata and commit: + repository_metadata = fetch_commit_metadata( + repository_metadata, constants.RepositoryType.GITHUB, commit, header, + repo_api_base_url=repo_api_base_url + ) logging.info("Repository information successfully loaded.\n") return repository_metadata, owner, repo_name, default_branch, None +def fetch_commit_metadata(repository_metadata, repo_type, commit_sha, headers, + repo_api_base_url=None, project_api_url=None): + """ + Fetches metadata for a specific commit from the GitHub or GitLab API and + adds it to the repository metadata. + Parameters + ---------- + @param repository_metadata: Result object to store the findings + @param repo_type: type of the repository (GITHUB or GITLAB) + @param commit_sha: The commit SHA to fetch metadata for + @param headers: HTTP headers to use for the request + @param repo_api_base_url: Base URL of the GitHub repository API (e.g. https://api.github.com/repos/owner/repo) + @param project_api_url: Base URL of the GitLab project API (e.g. https://gitlab.com/api/v4/projects/123) + Returns + ------- + @return: Result object enriched with commit metadata + """ + if repo_type == constants.RepositoryType.GITLAB: + if not project_api_url: + logging.warning("No project API URL provided for GitLab commit metadata fetch.") + return repository_metadata + commit_url = f"{project_api_url}/repository/commits/{commit_sha}" + is_gitlab = True + else: + commit_url = f"{repo_api_base_url}/commits/{commit_sha}" + is_gitlab = False + + logging.info(f"Fetching commit metadata from {commit_url}") + commit_resp, _ = rate_limit_get(commit_url, headers=headers) + if commit_resp is None: + logging.warning("Skipping commit metadata: rate_limit_get returned None (size limit or network error)") + return repository_metadata + if commit_resp.status_code != 200: + logging.warning(f"Could not fetch commit metadata: HTTP {commit_resp.status_code}") + return repository_metadata + commit_data = commit_resp.json() + + # Extract commit metadata since fields differ between GitHub and GitLab + author_name = None + commit_date_str = None + commit_html_url = None + if is_gitlab: + author_name = commit_data.get("author_name") + commit_date_str = commit_data.get("authored_date") or commit_data.get("committed_date") + commit_html_url = commit_data.get("web_url") + else: + commit_details = commit_data.get("commit", {}) + commit_author = commit_data.get("author", {}) + if commit_author and commit_author.get("login"): + author_name = commit_author["login"] + elif commit_details.get("author") and commit_details["author"].get("name"): + author_name = commit_details["author"]["name"] + if commit_details.get("author") and commit_details["author"].get("date"): + commit_date_str = commit_details["author"]["date"] + elif commit_details.get("committer") and commit_details["committer"].get("date"): + commit_date_str = commit_details["committer"]["date"] + commit_html_url = commit_data.get("html_url") + + if author_name: + author_result = { + constants.PROP_VALUE: author_name, + constants.PROP_TYPE: constants.AGENT + } + repository_metadata.add_result(constants.CAT_AUTHORS, author_result, 1, constants.TECHNIQUE_GITHUB_API) + + if commit_date_str: + date_result = { + constants.PROP_VALUE: commit_date_str, + constants.PROP_TYPE: constants.DATE + } + repository_metadata.add_result(constants.CAT_DATE_CREATED, date_result, 1, constants.TECHNIQUE_GITHUB_API) + + if commit_html_url: + url_result = { + constants.PROP_VALUE: commit_html_url, + constants.PROP_TYPE: constants.URL, + constants.PROP_COMMIT: commit_sha + } + repository_metadata.add_result(constants.CAT_CODE_REPOSITORY, url_result, 1, constants.TECHNIQUE_GITHUB_API) + + # Resolve release tags to commit SHAs and store them on each release entry + repository_metadata = resolve_release_commits( + repository_metadata, repo_type, headers, + repo_api_base_url=repo_api_base_url, project_api_url=project_api_url + ) + + # In here, we keep only releases whose date is at or before the commit date. This will guarantee that the output JSON + # contains the releases that existed up to the point in time of the requested commit, + # with the closest one sitting at the end of the list. + # In some edge cases, like the commit is after ALL releases, we would keep the list of releases as it is. + + if commit_date_str and constants.CAT_RELEASES in repository_metadata.results: + try: + commit_dt = datetime.fromisoformat(commit_date_str[:19]) + filtered = [] + closest_release_date = None + closest_release_tag = None + next_release_date = None + next_release_tag = None + found_closest = False + for release in repository_metadata.results[constants.CAT_RELEASES]: + release_result = release.get(constants.PROP_RESULT, {}) + release_date = release_result.get(constants.PROP_DATE_CREATED) + if release_date is None: + release_date = release_result.get(constants.PROP_DATE_PUBLISHED) + if release_date is None: + continue + try: + release_dt = datetime.fromisoformat(release_date[:19]) + except (ValueError, TypeError): + continue + if release_dt <= commit_dt: + filtered.append(release) + if not found_closest: + found_closest = True + closest_release_date = release_date + closest_release_tag = release_result.get(constants.PROP_TAG, release_result.get(constants.PROP_NAME, "unknown")) + else: + if next_release_date is None: + next_release_date = release_date + next_release_tag = release_result.get(constants.PROP_TAG, release_result.get(constants.PROP_NAME, "unknown")) + if filtered: + repository_metadata.results[constants.CAT_RELEASES] = filtered + if found_closest: + msg = f"Closest release behind commit date found. Commit date: {commit_date_str}. Matched release: {closest_release_tag} (date: {closest_release_date})." + if next_release_tag: + msg += f" Next release: {next_release_tag} (date: {next_release_date})." + logging.info(msg) + else: + logging.warning("All releases are after the commit date; keeping the unfiltered release list.") + except (ValueError, TypeError) as e: + logging.warning(f"Could not parse commit date for filtering releases: {e}") + + logging.info("Commit metadata successfully loaded.") + return repository_metadata + + +def resolve_release_commits(repository_metadata, repo_type, headers, + repo_api_base_url=None, project_api_url=None): + """ + Resolves the commit SHA for each release's tag using the GitHub or GitLab + tags API and stores the SHA directly on each release result dict. + + Parameters + ---------- + @param repository_metadata: Result object containing releases loaded from the API + @param repo_type: type of the repository (GITHUB or GITLAB) + @param headers: HTTP headers to use for the request + @param repo_api_base_url: Base URL of the GitHub repository API (e.g. https://api.github.com/repos/owner/repo) + @param project_api_url: Base URL of the GitLab project API (e.g. https://gitlab.com/api/v4/projects/123) + Returns + ------- + @return: Result object with release entries enriched with commit SHAs (when resolvable) + """ + if repo_type == constants.RepositoryType.GITLAB: + if not project_api_url: + logging.warning("No project API URL provided for GitLab tag resolution.") + return repository_metadata + tags_url = f"{project_api_url}/repository/tags" + is_gitlab = True + else: + tags_url = f"{repo_api_base_url}/tags" + is_gitlab = False + + # Retrieve all tags from the paginated /tags endpoint + logging.info(f"Resolving release tags to commit SHAs via {tags_url}") + all_tags = get_all_paginated_results(tags_url, headers=headers) + if not all_tags: + logging.warning("No tags found, cannot resolve release commits.") + return repository_metadata + + # Build a mapping from tag name to commit SHA + tag_to_sha = {} + for tag_entry in all_tags: + tag_name = tag_entry.get("name") + commit_info = tag_entry.get("commit") + if tag_name and commit_info: + if is_gitlab: + sha = commit_info.get("id") + else: + sha = commit_info.get("sha") + if sha: + tag_to_sha[tag_name] = sha + + if not tag_to_sha: + return repository_metadata + + # Walk through existing releases and add the commit SHA when the tag matches + if constants.CAT_RELEASES not in repository_metadata.results: + return repository_metadata + + for release_entry in repository_metadata.results[constants.CAT_RELEASES]: + release_result = release_entry.get(constants.PROP_RESULT) + if release_result is None: + continue + tag_name = release_result.get(constants.PROP_TAG) + if tag_name is None: + continue + commit_sha = tag_to_sha.get(tag_name) + if commit_sha is not None: + release_result[constants.PROP_COMMIT] = commit_sha + + return repository_metadata + + def get_path(obj, path): if isinstance(path, list) or isinstance(path, tuple): if len(path) == 1: diff --git a/src/somef/somef_cli.py b/src/somef/somef_cli.py index 371489bb..77c59441 100644 --- a/src/somef/somef_cli.py +++ b/src/somef/somef_cli.py @@ -21,7 +21,8 @@ def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, local_repo=None, ignore_github_metadata=False, readme_only=False, keep_tmp=None, authorization=None, - ignore_test_folder=True,requirements_mode='all', reconcile_authors=False, branch=None, tag=None) -> Result: + ignore_test_folder=True,requirements_mode='all', reconcile_authors=False, branch=None, tag=None, + commit=None) -> Result: """ Main function to get the data through the command line Parameters @@ -40,6 +41,7 @@ def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, loc @param reconcile_authors: flag to indicate if additional should be extracted from certain files as codeowners. Bear in mind that using this flags consumes more requests to the GitHub API. @param branch: branch of the repository to analyze. Overrides the default branch detected from the repository metadata. @param tag: tag of the repository to analyze. Cannot be used together with the branch parameter. + @param commit: commit SHA of the repository to analyze. Cannot be used together with the branch or tag parameters. Returns ------- @return: Dictionary with the results found by SOMEF, formatted as a Result object. @@ -60,6 +62,9 @@ def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, loc if branch and tag: logging.error("You cannot use --branch and --tag at the same time. Mutually exclusive") sys.exit() + if commit and (branch or tag): + logging.error("You cannot use --commit together with --branch or --tag. Mutually exclusive") + sys.exit() if repo_url is not None: try: @@ -98,7 +103,8 @@ def cli_get_data(threshold, ignore_classifiers, repo_url=None, doc_src=None, loc authorization, reconcile_authors, branch=branch, - tag=tag + tag=tag, + commit=commit ) # download files and obtain path to download folder @@ -266,7 +272,8 @@ def run_cli(*, requirements_mode="all", reconcile_authors=False, branch=None, - tag=None + tag=None, + commit=None ): """Function to run all the required components of the cli for a repository""" # check if it is a valid url @@ -301,7 +308,7 @@ def run_cli(*, repo_data = cli_get_data(threshold=threshold, ignore_classifiers=ignore_classifiers, repo_url=repo_url, ignore_github_metadata=ignore_github_metadata, readme_only=readme_only, keep_tmp=keep_tmp, ignore_test_folder=ignore_test_folder, requirements_mode=requirements_mode, reconcile_authors=reconcile_authors, - branch=branch, tag=tag) + branch=branch, tag=tag, commit=commit) if hasattr(repo_data, "get_json"): repo_data = repo_data.get_json() @@ -335,15 +342,15 @@ def run_cli(*, repo_data = cli_get_data(threshold=threshold, ignore_classifiers=ignore_classifiers, repo_url=repo_url, ignore_github_metadata=ignore_github_metadata, readme_only=readme_only, keep_tmp=keep_tmp, ignore_test_folder=ignore_test_folder, reconcile_authors=reconcile_authors, - branch=branch, tag=tag) + branch=branch, tag=tag, commit=commit) elif local_repo: repo_data = cli_get_data(threshold=threshold, ignore_classifiers=ignore_classifiers, local_repo=local_repo, keep_tmp=keep_tmp, ignore_test_folder=ignore_test_folder, reconcile_authors=reconcile_authors, - branch=branch, tag=tag) + branch=branch, tag=tag, commit=commit) else: repo_data = cli_get_data(threshold=threshold, ignore_classifiers=ignore_classifiers, doc_src=doc_src, keep_tmp=keep_tmp, ignore_test_folder=ignore_test_folder, reconcile_authors=reconcile_authors, - branch=branch, tag=tag) + branch=branch, tag=tag, commit=commit) if hasattr(repo_data, "get_json"): repo_data = repo_data.get_json() diff --git a/src/somef/test/test_process_repository.py b/src/somef/test/test_process_repository.py index 02b5c81b..39c8a825 100644 --- a/src/somef/test/test_process_repository.py +++ b/src/somef/test/test_process_repository.py @@ -338,7 +338,478 @@ def test_issue_905_tag(self): source = version[0].get("source", "") assert "Widoco/v1.4.25" in source, f"The downloaded tag does not match the requested one. Source: {source}" - os.remove(test_data_path + "test_905_tag.json") + os.remove(test_data_path + "test_905_tag.json") + + @unittest.skipIf(os.getenv("CI") == "true", "Skipped in CI because it is already verified locally") + def test_issue_905_commit(self): + """ + Checks whether SOMEF correctly downloads and analyzes a specific commit + when the user specifies --commit. The test also verifies that commit metadata + (SHA, message, author, date) is fetched from the GitHub API and included in + the output. + """ + commit_sha = "f567b46b593123e22db5880b4f7fd97c9fe9c94b" + + somef_cli.run_cli(threshold=0.8, + ignore_classifiers=False, + repo_url="https://github.com/dgarijo/Widoco/", + local_repo=None, + doc_src=None, + in_file=None, + output=test_data_path + "test_905_commit.json", + graph_out=None, + graph_format="turtle", + codemeta_out=None, + pretty=True, + missing=False, + readme_only=False, + commit=commit_sha) + + with open(test_data_path + "test_905_commit.json", "r") as text_file: + json_content = json.load(text_file) + + assert json_content is not None + assert os.path.exists(test_data_path + "test_905_commit.json") + + # Verify that commit metadata is present in the output + date_created = json_content.get(constants.CAT_DATE_CREATED, []) + assert len(date_created) > 0, "Expected commit date metadata to be present" + + os.remove(test_data_path + "test_905_commit.json") + + +class TestFetchCommitMetadata(unittest.TestCase): + """ + Tests for the fetch_commit_metadata function that retrieves commit information + from the GitHub API. + """ + + @patch("somef.process_repository.resolve_release_commits") + @patch("somef.process_repository.rate_limit_get") + def test_fetch_commit_metadata_adds_author(self, mock_rlg, mock_resolve): + """fetch_commit_metadata should add the commit author to CAT_AUTHORS.""" + mock_resolve.side_effect = lambda m, *a, **kw: m + mock_resp = _make_mock_response(200) + mock_resp.json = MagicMock(return_value={ + "sha": "abc123def456", + "commit": { + "author": { + "name": "Test User", + "email": "test@example.com", + "date": "2024-01-15T10:30:00Z" + } + }, + "author": { + "login": "testuser" + }, + "html_url": "https://github.com/testowner/testrepo/commit/abc123def456" + }) + mock_rlg.return_value = (mock_resp, "2024-01-15") + + repo_metadata = Result() + headers = {"Authorization": "token test"} + result = process_repository.fetch_commit_metadata( + repo_metadata, + constants.RepositoryType.GITHUB, + "abc123def456", + headers, + repo_api_base_url="https://api.github.com/repos/testowner/testrepo" + ) + + authors = result.results.get(constants.CAT_AUTHORS, []) + self.assertGreater(len(authors), 0, "Commit author should be present") + author_value = authors[0]["result"]["value"] + self.assertEqual(author_value, "testuser") + + @patch("somef.process_repository.resolve_release_commits") + @patch("somef.process_repository.rate_limit_get") + def test_fetch_commit_metadata_adds_date(self, mock_rlg, mock_resolve): + """fetch_commit_metadata should add the commit date to CAT_DATE_CREATED.""" + mock_resolve.side_effect = lambda m, *a, **kw: m + mock_resp = _make_mock_response(200) + mock_resp.json = MagicMock(return_value={ + "sha": "abc123def456", + "commit": { + "author": { + "name": "Test User", + "email": "test@example.com", + "date": "2024-01-15T10:30:00Z" + } + }, + "html_url": "https://github.com/testowner/testrepo/commit/abc123def456" + }) + mock_rlg.return_value = (mock_resp, "2024-01-15") + + repo_metadata = Result() + headers = {} + result = process_repository.fetch_commit_metadata( + repo_metadata, + constants.RepositoryType.GITHUB, + "abc123def456", + headers, + repo_api_base_url="https://api.github.com/repos/testowner/testrepo" + ) + + dates = result.results.get(constants.CAT_DATE_CREATED, []) + self.assertGreater(len(dates), 0, "Commit date should be present") + date_value = dates[0]["result"]["value"] + self.assertEqual(date_value, "2024-01-15T10:30:00Z") + + @patch("somef.process_repository.resolve_release_commits") + @patch("somef.process_repository.rate_limit_get") + def test_fetch_commit_metadata_handles_missing_commit(self, mock_rlg, mock_resolve): + """fetch_commit_metadata should not crash when the commit endpoint returns 404.""" + mock_resolve.side_effect = lambda m, *a, **kw: m + mock_resp = _make_mock_response(404) + mock_rlg.return_value = (mock_resp, "") + + repo_metadata = Result() + headers = {} + result = process_repository.fetch_commit_metadata( + repo_metadata, + constants.RepositoryType.GITHUB, + "nonexistent_sha", + headers, + repo_api_base_url="https://api.github.com/repos/testowner/testrepo" + ) + + # The function should return the metadata object unchanged (only PROVENANCE default key) + self.assertEqual(len(result.results), 1) + + @patch("somef.process_repository.resolve_release_commits") + @patch("somef.process_repository.rate_limit_get") + def test_fetch_commit_metadata_handles_none_response(self, mock_rlg, mock_resolve): + """fetch_commit_metadata should handle rate_limit_get returning None.""" + mock_resolve.side_effect = lambda m, *a, **kw: m + mock_rlg.return_value = (None, None) + + repo_metadata = Result() + headers = {} + result = process_repository.fetch_commit_metadata( + repo_metadata, + constants.RepositoryType.GITHUB, + "abc123", + headers, + repo_api_base_url="https://api.github.com/repos/testowner/testrepo" + ) + + self.assertEqual(len(result.results), 1) + + +class TestResolveReleaseCommits(unittest.TestCase): + """ + Tests for the resolve_release_commits function that resolves each release's + tag to a commit SHA using the GitHub /tags endpoint. + """ + + @patch("somef.process_repository.get_all_paginated_results") + def test_resolve_release_commits_adds_sha(self, mock_get_tags): + """ + When a release tag matches a tag from the /tags endpoint, the commit SHA + should be written into the release result dict. + """ + mock_get_tags.return_value = [ + {"name": "v1.0.0", "commit": {"sha": "aaa111", "url": ""}}, + {"name": "v2.0.0", "commit": {"sha": "bbb222", "url": ""}}, + ] + + repo_metadata = Result() + release_1 = { + constants.PROP_RESULT: { + constants.PROP_TYPE: constants.RELEASE, + constants.PROP_VALUE: "https://github.com/owner/repo/releases/tag/v1.0.0", + constants.PROP_TAG: "v1.0.0", + }, + constants.PROP_CONFIDENCE: 1, + constants.PROP_TECHNIQUE: constants.TECHNIQUE_GITHUB_API, + } + release_2 = { + constants.PROP_RESULT: { + constants.PROP_TYPE: constants.RELEASE, + constants.PROP_VALUE: "https://github.com/owner/repo/releases/tag/v2.0.0", + constants.PROP_TAG: "v2.0.0", + }, + constants.PROP_CONFIDENCE: 1, + constants.PROP_TECHNIQUE: constants.TECHNIQUE_GITHUB_API, + } + repo_metadata.results[constants.CAT_RELEASES] = [release_1, release_2] + + result = process_repository.resolve_release_commits( + repo_metadata, + constants.RepositoryType.GITHUB, + {}, + repo_api_base_url="https://api.github.com/repos/owner/repo" + ) + + releases = result.results[constants.CAT_RELEASES] + self.assertEqual(releases[0][constants.PROP_RESULT].get(constants.PROP_COMMIT), "aaa111") + self.assertEqual(releases[1][constants.PROP_RESULT].get(constants.PROP_COMMIT), "bbb222") + + @patch("somef.process_repository.get_all_paginated_results") + def test_resolve_release_commits_skips_unmatched_tag(self, mock_get_tags): + """ + Releases whose tag is not present in the /tags response should not get + a commit SHA. + """ + mock_get_tags.return_value = [ + {"name": "v1.0.0", "commit": {"sha": "aaa111", "url": ""}}, + ] + + repo_metadata = Result() + release = { + constants.PROP_RESULT: { + constants.PROP_TYPE: constants.RELEASE, + constants.PROP_VALUE: "https://github.com/owner/repo/releases/tags/unknown", + constants.PROP_TAG: "unknown", + }, + constants.PROP_CONFIDENCE: 1, + constants.PROP_TECHNIQUE: constants.TECHNIQUE_GITHUB_API, + } + repo_metadata.results[constants.CAT_RELEASES] = [release] + + result = process_repository.resolve_release_commits( + repo_metadata, + constants.RepositoryType.GITHUB, + {}, + repo_api_base_url="https://api.github.com/repos/owner/repo" + ) + + resolved = result.results[constants.CAT_RELEASES][0][constants.PROP_RESULT] + self.assertIsNone(resolved.get(constants.PROP_COMMIT), + "An unknown tag should not receive a commit SHA") + + @patch("somef.process_repository.get_all_paginated_results") + def test_resolve_release_commits_no_tags(self, mock_get_tags): + """ + When the /tags endpoint returns an empty list, releases should be + left untouched. + """ + mock_get_tags.return_value = [] + + repo_metadata = Result() + release = { + constants.PROP_RESULT: { + constants.PROP_TYPE: constants.RELEASE, + constants.PROP_VALUE: "https://github.com/owner/repo/releases/tag/v1.0.0", + constants.PROP_TAG: "v1.0.0", + }, + constants.PROP_CONFIDENCE: 1, + constants.PROP_TECHNIQUE: constants.TECHNIQUE_GITHUB_API, + } + repo_metadata.results[constants.CAT_RELEASES] = [release] + + result = process_repository.resolve_release_commits( + repo_metadata, + constants.RepositoryType.GITHUB, + {}, + repo_api_base_url="https://api.github.com/repos/owner/repo" + ) + + resolved = result.results[constants.CAT_RELEASES][0][constants.PROP_RESULT] + self.assertIsNone(resolved.get(constants.PROP_COMMIT)) + + @patch("somef.process_repository.get_all_paginated_results") + def test_resolve_release_commits_no_releases(self, mock_get_tags): + """ + When there are no releases in the metadata, resolve_release_commits + should not crash and should not query the /tags endpoint. + """ + mock_get_tags.return_value = [ + {"name": "v1.0.0", "commit": {"sha": "aaa111", "url": ""}}, + ] + + repo_metadata = Result() + result = process_repository.resolve_release_commits( + repo_metadata, + constants.RepositoryType.GITHUB, + {}, + repo_api_base_url="https://api.github.com/repos/owner/repo" + ) + + self.assertEqual(len(result.results), 1) + + +class TestFetchCommitMetadataGitLab(unittest.TestCase): + """ + Tests for fetch_commit_metadata with GitLab API response format. + GitLab returns flat fields (author_name, authored_date, web_url) instead of + nested objects like GitHub. + """ + + @patch("somef.process_repository.resolve_release_commits") + @patch("somef.process_repository.rate_limit_get") + def test_gitlab_fetch_commit_metadata_adds_author(self, mock_rlg, mock_resolve): + """GitLab commit author should be read from author_name.""" + mock_resolve.side_effect = lambda m, *a, **kw: m + mock_resp = _make_mock_response(200) + mock_resp.json = MagicMock(return_value={ + "id": "abc123def456", + "author_name": "GitLab User", + "authored_date": "2024-01-15T10:30:00.000Z", + "committed_date": "2024-01-15T10:35:00.000Z", + "web_url": "https://gitlab.com/testowner/testrepo/-/commit/abc123def456" + }) + mock_rlg.return_value = (mock_resp, "2024-01-15") + + repo_metadata = Result() + headers = {} + result = process_repository.fetch_commit_metadata( + repo_metadata, + constants.RepositoryType.GITLAB, + "abc123def456", + headers, + project_api_url="https://gitlab.com/api/v4/projects/123" + ) + + authors = result.results.get(constants.CAT_AUTHORS, []) + self.assertGreater(len(authors), 0, "GitLab commit author should be present") + author_value = authors[0]["result"]["value"] + self.assertEqual(author_value, "GitLab User") + + @patch("somef.process_repository.resolve_release_commits") + @patch("somef.process_repository.rate_limit_get") + def test_gitlab_fetch_commit_metadata_adds_date(self, mock_rlg, mock_resolve): + """GitLab commit date should prefer authored_date.""" + mock_resolve.side_effect = lambda m, *a, **kw: m + mock_resp = _make_mock_response(200) + mock_resp.json = MagicMock(return_value={ + "id": "abc123def456", + "author_name": "GitLab User", + "authored_date": "2024-01-15T10:30:00.000Z", + "web_url": "https://gitlab.com/testowner/testrepo/-/commit/abc123def456" + }) + mock_rlg.return_value = (mock_resp, "2024-01-15") + + repo_metadata = Result() + headers = {} + result = process_repository.fetch_commit_metadata( + repo_metadata, + constants.RepositoryType.GITLAB, + "abc123def456", + headers, + project_api_url="https://gitlab.com/api/v4/projects/123" + ) + + dates = result.results.get(constants.CAT_DATE_CREATED, []) + self.assertGreater(len(dates), 0, "GitLab commit date should be present") + date_value = dates[0]["result"]["value"] + self.assertEqual(date_value, "2024-01-15T10:30:00.000Z") + + @patch("somef.process_repository.resolve_release_commits") + @patch("somef.process_repository.rate_limit_get") + def test_gitlab_fetch_commit_metadata_adds_url(self, mock_rlg, mock_resolve): + """GitLab commit URL should be read from web_url.""" + mock_resolve.side_effect = lambda m, *a, **kw: m + mock_resp = _make_mock_response(200) + mock_resp.json = MagicMock(return_value={ + "id": "abc123def456", + "author_name": "GitLab User", + "authored_date": "2024-01-15T10:30:00.000Z", + "web_url": "https://gitlab.com/testowner/testrepo/-/commit/abc123def456" + }) + mock_rlg.return_value = (mock_resp, "2024-01-15") + + repo_metadata = Result() + headers = {} + result = process_repository.fetch_commit_metadata( + repo_metadata, + constants.RepositoryType.GITLAB, + "abc123def456", + headers, + project_api_url="https://gitlab.com/api/v4/projects/123" + ) + + urls = result.results.get(constants.CAT_CODE_REPOSITORY, []) + self.assertGreater(len(urls), 0, "GitLab commit URL should be present") + url_value = urls[0]["result"]["value"] + self.assertEqual(url_value, "https://gitlab.com/testowner/testrepo/-/commit/abc123def456") + + @patch("somef.process_repository.resolve_release_commits") + @patch("somef.process_repository.rate_limit_get") + def test_gitlab_fetch_commit_metadata_handles_404(self, mock_rlg, mock_resolve): + """GitLab 404 should not crash.""" + mock_resolve.side_effect = lambda m, *a, **kw: m + mock_resp = _make_mock_response(404) + mock_rlg.return_value = (mock_resp, "") + + repo_metadata = Result() + headers = {} + result = process_repository.fetch_commit_metadata( + repo_metadata, + constants.RepositoryType.GITLAB, + "nonexistent", + headers, + project_api_url="https://gitlab.com/api/v4/projects/123" + ) + + self.assertEqual(len(result.results), 1) + + @patch("somef.process_repository.resolve_release_commits") + @patch("somef.process_repository.rate_limit_get") + def test_gitlab_fetch_commit_metadata_no_project_api_url(self, mock_rlg, mock_resolve): + """When project_api_url is missing for GitLab, return early.""" + mock_resolve.side_effect = lambda m, *a, **kw: m + + repo_metadata = Result() + result = process_repository.fetch_commit_metadata( + repo_metadata, + constants.RepositoryType.GITLAB, + "abc123", + {} + ) + + self.assertEqual(len(result.results), 1) + mock_rlg.assert_not_called() + + +class TestResolveReleaseCommitsGitLab(unittest.TestCase): + """ + Tests for resolve_release_commits with GitLab tag API response format. + GitLab uses commit.id instead of commit.sha for the commit SHA. + """ + + @patch("somef.process_repository.get_all_paginated_results") + def test_gitlab_resolve_release_commits_adds_sha(self, mock_get_tags): + """GitLab tags use commit.id instead of commit.sha.""" + mock_get_tags.return_value = [ + {"name": "v1.0.0", "commit": {"id": "aaa111", "short_id": "aaa"}}, + {"name": "v2.0.0", "commit": {"id": "bbb222", "short_id": "bbb"}}, + ] + + repo_metadata = Result() + release = { + constants.PROP_RESULT: { + constants.PROP_TYPE: constants.RELEASE, + constants.PROP_VALUE: "https://gitlab.com/owner/repo/-/releases/v1.0.0", + constants.PROP_TAG: "v1.0.0", + }, + constants.PROP_CONFIDENCE: 1, + constants.PROP_TECHNIQUE: constants.TECHNIQUE_GITLAB_API, + } + repo_metadata.results[constants.CAT_RELEASES] = [release] + + result = process_repository.resolve_release_commits( + repo_metadata, + constants.RepositoryType.GITLAB, + {}, + project_api_url="https://gitlab.com/api/v4/projects/123" + ) + + releases = result.results[constants.CAT_RELEASES] + self.assertEqual(releases[0][constants.PROP_RESULT].get(constants.PROP_COMMIT), "aaa111") + + @patch("somef.process_repository.get_all_paginated_results") + def test_gitlab_resolve_release_commits_no_project_api_url(self, mock_get_tags): + """When project_api_url is missing for GitLab, return early without API call.""" + repo_metadata = Result() + result = process_repository.resolve_release_commits( + repo_metadata, + constants.RepositoryType.GITLAB, + {}, + ) + + self.assertEqual(len(result.results), 1) + mock_get_tags.assert_not_called() def _make_mock_response(status_code, content=b""): diff --git a/src/somef/utils/constants.py b/src/somef/utils/constants.py index bec49c19..2033057c 100644 --- a/src/somef/utils/constants.py +++ b/src/somef/utils/constants.py @@ -254,6 +254,7 @@ PROP_SIZE = "size" PROP_SPDX_ID = "spdx_id" PROP_TAG = "tag" +PROP_COMMIT = "commit" PROP_URL = "url" PROP_USERNAME = "username" PROP_VERSION = "version"