Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ homepage = "https://github.com/KnowledgeCaptureAndDiscovery/somef"
nltk = "^3.9.0"
numpy = "^1.26.3"
pandas = "^2.1.4"
rdflib = "^7.0.0"
rdflib = ">7.0.0"
textblob = "^0.17.1"
validators = "^0.22.0"
xgboost = "^2.0.3"
Expand All @@ -36,7 +36,7 @@ homepage = "https://github.com/KnowledgeCaptureAndDiscovery/somef"
chardet = "^5.2.0"
imbalanced-learn = "^0.12.0"
pytest = "^8.0.0"
morph-kgc = "^2.6.4"
morph-kgc = "^2.7.0"
bibtexparser = "^1.4.1"
nbformat = "^5.9.2"
markdown = "^3.5.2"
Expand Down
8 changes: 2 additions & 6 deletions src/somef/extract_software_type.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
from pathlib import Path
import nbformat
import logging
from nbformat.reader import NotJSONError
from chardet import detect
import re
Expand All @@ -9,8 +10,6 @@
from .utils import constants
from .extract_ontologies import is_file_ontology

import pdb


def check_repository_type(path_repo, title, metadata_result: Result):
""" Function that adds the metadata result in the JSON
Expand Down Expand Up @@ -200,18 +199,15 @@ def check_static_websites(path_repo, repo_metadata: Result):
return False
try:
languages = repo_metadata[constants.CAT_PROGRAMMING_LANGUAGES]
print(languages)
for language in languages:
language_name = language[constants.PROP_RESULT][constants.PROP_NAME]
print(language_name)
if language_name.lower() == "javascript":
js_size += language[constants.PROP_RESULT][constants.PROP_SIZE]
print(js_size)
elif language_name.lower() == "scss" or language_name.lower() == "css":
css_size += language[constants.PROP_RESULT][constants.PROP_SIZE]
total_size += language[constants.PROP_RESULT][constants.PROP_SIZE]
except Exception as e:
print(e)
logging.warning(f"Could not retrieve programming languages for static website check: {e}")
if html_file > 0:
if js_size > 0 and css_size == 0:
if js_size / total_size < 0.91:
Expand Down
8 changes: 4 additions & 4 deletions src/somef/header_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ def extract_header_content(text: str) -> Tuple[pd.DataFrame, str | None]:

# df['Content'].replace('', np.nan, inplace=True)
df['Content'] = df['Content'].replace('', np.nan)
df.dropna(subset=['Content'], inplace=True)
df = df.dropna(subset=['Content'])

return df, none_header_content

Expand Down Expand Up @@ -415,7 +415,7 @@ def extract_categories(repo_data: str, repository_metadata: Result) -> Tuple[Res
df['ParentGroup'] = df['ParentHeader'].fillna('').map(label_text)

df.loc[df['Group'].str.len() == 0, 'Group'] = df['ParentGroup']
df.drop(columns=['ParentGroup'], inplace=True)
df = df.drop(columns=['ParentGroup'])

if not df.iloc[0]['Group']:
df.loc[df.index[0], 'Group'] = ['unknown']
Expand All @@ -424,11 +424,11 @@ def extract_categories(repo_data: str, repository_metadata: Result) -> Tuple[Res
df.loc[df['Group'] == 'unknown', 'Group'] = np.nan

valid = df[df['Group'].notna()].copy()
valid.rename(columns={
valid = valid.rename(columns={
'Content': constants.PROP_VALUE,
'Header': constants.PROP_ORIGINAL_HEADER,
'ParentHeader': constants.PROP_PARENT_HEADER,
}, inplace=True)
})

source = None
if constants.CAT_README_URL in repository_metadata.results:
Expand Down
107 changes: 85 additions & 22 deletions src/somef/parser/codeowners_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def parse_codeowners_structured(dir_path, filename):
return {"codeowners": codeowners}


def parse_codeowners_file(file_path, metadata_result: Result, source, reconcile_authors=None) -> Result:
def parse_codeowners_file(file_path, metadata_result: Result, source, reconcile_authors=None, repo_type=None, server_url=None) -> Result:
try:
logging.info(f"Reconcile authors flag: {reconcile_authors}")
if Path(file_path).name.upper() == constants.CODEOWNERS_FILE:
Expand Down Expand Up @@ -51,14 +51,14 @@ def parse_codeowners_file(file_path, metadata_result: Result, source, reconcile_
added_maintainers.add(owner)

maintainer_data = {
"value": owner,
"username": owner,
"role": "Maintainer",
"type": "Person"
constants.PROP_VALUE: owner,
constants.PROP_USERNAME: owner,
constants.PROP_ROLE: "Maintainer",
constants.PROP_TYPE: "Person"
}

if reconcile_authors:
user_info = enrich_github_user(owner)
user_info = enrich_user(owner, repo_type, server_url)
if user_info:
if user_info.get(constants.PROP_CODEOWNERS_NAME):
maintainer_data[constants.PROP_NAME] = user_info.get(constants.PROP_CODEOWNERS_NAME)
Expand All @@ -81,26 +81,89 @@ def parse_codeowners_file(file_path, metadata_result: Result, source, reconcile_

return metadata_result

def enrich_github_user(username):
""" Enrich user metadata using the appropriate platform API.
Currently only GitHub is supported.
"""
try:
url = f"https://api.github.com/users/{username}"
response = requests.get(url, timeout=5)
# def enrich_github_user(username):
# """ Enrich user metadata using the appropriate platform API.
# Currently only GitHub is supported.
# """
# try:
# url = f"https://api.github.com/users/{username}"
# response = requests.get(url, timeout=5)

if response.status_code != 200:
return None
# if response.status_code != 200:
# return None

data = response.json()
# data = response.json()

return {
constants.PROP_CODEOWNERS_NAME: data.get("name"),
constants.PROP_CODEOWNERS_COMPANY: data.get("company"),
constants.PROP_CODEOWNERS_EMAIL: data.get("email"),
}
# return {
# constants.PROP_CODEOWNERS_NAME: data.get("name"),
# constants.PROP_CODEOWNERS_COMPANY: data.get("company"),
# constants.PROP_CODEOWNERS_EMAIL: data.get("email"),
# }


except Exception:
# except Exception:
# return None


def enrich_user(username, repo_type, server_url=None):
    """
    Enrich user metadata using the appropriate platform API.

    The lookup is best-effort: any HTTP, network or parsing failure is logged
    and reported to the caller as ``None`` instead of raising.

    Parameters
    ----------
    username : str
        Username to enrich.
    repo_type :
        Platform selector, compared against ``constants.RepositoryType.GITHUB``
        and ``constants.RepositoryType.GITLAB``.
    server_url : str, optional
        Base URL (or bare host name) of the GitLab instance. Only used when
        repo_type is GITLAB; defaults to "https://gitlab.com" when None.

    Returns
    -------
    dict or None
        Dictionary keyed by ``constants.PROP_CODEOWNERS_NAME``,
        ``constants.PROP_CODEOWNERS_COMPANY`` and
        ``constants.PROP_CODEOWNERS_EMAIL`` (values may be None), or None if
        the user could not be retrieved or the platform is not supported.
    """
    # Local import keeps this block self-contained within the module.
    from urllib.parse import quote

    # Nothing to look up; avoid a pointless (or misleading) API request.
    if not username:
        return None

    if repo_type == constants.RepositoryType.GITHUB:
        try:
            # Percent-encode the handle so unexpected characters cannot alter the request path.
            url = f"https://api.github.com/users/{quote(str(username), safe='')}"
            response = requests.get(url, timeout=5)
            if response.status_code != 200:
                logging.warning(f"GitHub API request failed for {username}: {response.status_code}")
                return None
            data = response.json()
            return {
                constants.PROP_CODEOWNERS_NAME: data.get("name"),
                constants.PROP_CODEOWNERS_COMPANY: data.get("company"),
                constants.PROP_CODEOWNERS_EMAIL: data.get("email"),
            }
        except Exception as e:
            logging.warning(f"Error enriching GitHub user {username}: {e}")
            return None

    elif repo_type == constants.RepositoryType.GITLAB:
        try:
            if server_url is None:
                server_url = "https://gitlab.com"
            # Test for a real scheme: a bare host such as "httpd.example.org"
            # also starts with "http" but still needs one prepended.
            if not server_url.startswith(("http://", "https://")):
                server_url = "https://" + server_url
            api_url = f"{server_url.rstrip('/')}/api/v4/users"
            # Passing the username through params lets requests URL-encode it.
            response = requests.get(api_url, params={"username": username}, timeout=5)
            if response.status_code != 200:
                logging.warning(f"GitLab API request failed for {username}: {response.status_code}")
                return None
            data = response.json()
            # The result is indexed as a list below; anything else (or an
            # empty list) means no usable match.
            if not isinstance(data, list) or not data:
                return None
            user_info = data[0]
            return {
                constants.PROP_CODEOWNERS_NAME: user_info.get("name"),
                constants.PROP_CODEOWNERS_COMPANY: user_info.get("organization"),
                constants.PROP_CODEOWNERS_EMAIL: user_info.get("public_email"),
            }
        except Exception as e:
            # Same severity as the GitHub branch: enrichment is optional.
            logging.warning(f"Error enriching GitLab user {username}: {e}")
            return None

    else:
        logging.warning(f"Unsupported repo_type {repo_type}")
        return None

8 changes: 4 additions & 4 deletions src/somef/process_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def process_repository_files(repo_dir, metadata_result: Result, repo_type, owner
-------
@return: text of the main readme and a JSON dictionary (filtered_resp) with the findings in files
"""

global domain_gitlab
if repo_type == constants.RepositoryType.GITLAB:
domain_gitlab = extract_gitlab_domain(metadata_result, repo_type)

Expand Down Expand Up @@ -244,12 +244,12 @@ def process_repository_files(repo_dir, metadata_result: Result, repo_type, owner
)
if filename.upper() == constants.CODEOWNERS_FILE:
# codeowners_json = parse_codeowners_structured(dir_path,filename)
print("Processing CODEOWNERS file...")
logging.info("Processing CODEOWNERS file...")
codeowner_file_url = get_file_link(repo_type, file_path, owner, repo_name, repo_default_branch,
repo_dir,
repo_relative_path, filename)
metadata_result = parse_codeowners_file(os.path.join(dir_path, filename), metadata_result, codeowner_file_url, reconcile_authors)

metadata_result = parse_codeowners_file(os.path.join(dir_path, filename), metadata_result, codeowner_file_url, reconcile_authors, repo_type, server_url=domain_gitlab)
parsed_build_files.add(filename.lower())

if filename.lower() == "codemeta.json":
Expand Down
39 changes: 32 additions & 7 deletions src/somef/process_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from . import configuration
from .process_results import Result
from .regular_expressions import detect_license_spdx
from .parser.codeowners_parser import enrich_github_user
from .parser.codeowners_parser import enrich_user

# Constructs a template HTTP header, which:
# - has a key for the authorization token if passed via the authorization argument, otherwise
Expand Down Expand Up @@ -58,15 +58,12 @@ def rate_limit_get(*args, backoff_rate=2, initial_backoff=1, size_limit_mb=const
content_length = head_response.headers.get("Content-Length")
if content_length is not None:
size_bytes = int(content_length)
logging.info(f"HEAD Content-Length: {size_bytes}")
if size_bytes > size_limit_bytes:
logging.warning(
f"Download size {size_bytes} bytes exceeds limit of {size_limit_bytes} bytes. Skipping download."
)
return None, None
else:
# logging.warning(f"Could not determine file size for {url}. Skipping download.")
# return None, None
logging.warning(f"No Content-Length header for {url}. Proceeding with download anyway (unable to estimate size).")
except Exception as e:
logging.warning(f"HEAD/stream request failed: {e}. Continuing with GET...")
Expand Down Expand Up @@ -587,7 +584,7 @@ def load_online_repository_metadata(repository_metadata: Result, repository_url,
if category == constants.CAT_OWNER:
if reconcile_authors:
logging.info("Enriching owner information from codeowners...")
user_info = enrich_github_user(owner)
user_info = enrich_user(owner,repo_type)
if user_info:
if user_info.get(constants.PROP_CODEOWNERS_NAME):
maintainer_data[constants.PROP_NAME] = user_info.get(constants.PROP_CODEOWNERS_NAME)
Expand Down Expand Up @@ -640,7 +637,12 @@ def load_online_repository_metadata(repository_metadata: Result, repository_url,
if not ignore_api_metadata:
languages_raw, date = rate_limit_get(filtered_resp['languages_url'], headers=header)

languages = languages_raw.json()
if languages_raw is None:
logging.warning("Skipping languages: rate_limit_get returned None (size limit or network error)")
languages = {}
else:
languages = languages_raw.json()

if "message" in languages:
logging.error("Error while retrieving languages: " + languages["message"])
else:
Expand Down Expand Up @@ -778,12 +780,29 @@ def download_github_files(directory, owner, repo_name, repo_ref, authorization):
# download the repo at the selected branch with the link
repo_archive_url = f"https://github.com/{owner}/{repo_name}/archive/{repo_ref}.zip"
logging.info(f"Downloading {repo_archive_url}")

repo_download, date = rate_limit_get(repo_archive_url, headers=header_template(authorization))

if repo_download is None:
logging.warning(f"Repository archive skipped due to size limit: {constants.SIZE_DOWNLOAD_LIMIT_MB} MB or not content lenght.")
return None

if repo_download.status_code == 300:
logging.warning(f"Ambiguous ref detected for {repo_ref}, trying tags/heads resolution")

for ref_type in ["tags", "heads"]:
repo_archive_url = f"https://github.com/{owner}/{repo_name}/archive/refs/{ref_type}/{repo_ref}.zip"
logging.info(f"Trying to download {repo_archive_url}")

repo_download, date = rate_limit_get(repo_archive_url, headers=header_template(authorization))

if repo_download is None:
logging.warning(f"Repository archive skipped due to size limit: {constants.SIZE_DOWNLOAD_LIMIT_MB} MB or not content length.")
return None

if repo_download.status_code == 200:
break

if repo_download.status_code == 404:
logging.error(f"Error: Archive request failed with HTTP {repo_download.status_code}")
repo_archive_url = f"https://github.com/{owner}/{repo_name}/archive/main.zip"
Expand All @@ -794,7 +813,8 @@ def download_github_files(directory, owner, repo_name, repo_ref, authorization):
return None

if repo_download.status_code != 200:
sys.exit(f"Error: Archive request failed with HTTP {repo_download.status_code}")
logging.error(f"Error: Archive request failed with HTTP {repo_download.status_code}")
return None

repo_zip = repo_download.content

Expand Down Expand Up @@ -973,6 +993,10 @@ def get_all_paginated_results(base_url, headers, per_page=100):
url = f"{base_url}?per_page={per_page}&page={page}"
response, _ = rate_limit_get(url, headers=headers)

if response is None:
logging.warning(f"Skipping page {page}: rate_limit_get returned None (size limit or network error)")
break

if response.status_code != 200:
logging.warning(f"GitHub API error on page {page}: {response.status_code}")
break
Expand All @@ -985,3 +1009,4 @@ def get_all_paginated_results(base_url, headers, per_page=100):
page += 1

return all_results

Loading