KnowledgeCaptureAndDiscovery · dgarijo · Mar 6, 2026 · Mar 6, 2026
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -26,7 +26,7 @@ homepage = "https://github.com/KnowledgeCaptureAndDiscovery/somef"
     nltk = "^3.9.0"
     numpy = "^1.26.3"
     pandas = "^2.1.4"
-    rdflib = "^7.0.0"
+    rdflib = ">7.0.0"
     textblob = "^0.17.1"
     validators = "^0.22.0"
     xgboost = "^2.0.3"
@@ -36,7 +36,7 @@ homepage = "https://github.com/KnowledgeCaptureAndDiscovery/somef"
     chardet = "^5.2.0"
     imbalanced-learn = "^0.12.0"
     pytest = "^8.0.0"
-    morph-kgc = "^2.6.4"
+    morph-kgc = "^2.7.0"
     bibtexparser = "^1.4.1"
     nbformat = "^5.9.2"
     markdown = "^3.5.2"

diff --git a/src/somef/extract_software_type.py b/src/somef/extract_software_type.py
@@ -1,6 +1,7 @@
 import os
 from pathlib import Path
 import nbformat
+import logging
 from nbformat.reader import NotJSONError
 from chardet import detect
 import re
@@ -9,8 +10,6 @@
 from .utils import constants
 from .extract_ontologies import is_file_ontology
 
-import pdb
-
 
 def check_repository_type(path_repo, title, metadata_result: Result):
     """ Function that adds the metadata result in the JSON 
@@ -200,18 +199,15 @@ def check_static_websites(path_repo, repo_metadata: Result):
                     return False
     try:
         languages = repo_metadata[constants.CAT_PROGRAMMING_LANGUAGES]
-        print(languages)
         for language in languages:
             language_name = language[constants.PROP_RESULT][constants.PROP_NAME]
-            print(language_name)
             if language_name.lower() == "javascript":
                 js_size += language[constants.PROP_RESULT][constants.PROP_SIZE]
-                print(js_size)
             elif language_name.lower() == "scss" or language_name.lower() == "css":
                 css_size += language[constants.PROP_RESULT][constants.PROP_SIZE]
             total_size += language[constants.PROP_RESULT][constants.PROP_SIZE]
     except Exception as e:
-        print(e)
+        logging.warning(f"Could not retrieve programming languages for static website check: {e}")
     if html_file > 0:
         if js_size > 0 and css_size == 0:
             if js_size / total_size < 0.91:

diff --git a/src/somef/header_analysis.py b/src/somef/header_analysis.py
@@ -147,7 +147,7 @@ def extract_header_content(text: str) -> Tuple[pd.DataFrame, str | None]:
 
     # df['Content'].replace('', np.nan, inplace=True)
     df['Content'] = df['Content'].replace('', np.nan)
-    df.dropna(subset=['Content'], inplace=True)
+    df = df.dropna(subset=['Content'])
 
     return df, none_header_content
 
@@ -415,7 +415,7 @@ def extract_categories(repo_data: str, repository_metadata: Result) -> Tuple[Res
         df['ParentGroup'] = df['ParentHeader'].fillna('').map(label_text)
 
         df.loc[df['Group'].str.len() == 0, 'Group'] = df['ParentGroup']
-        df.drop(columns=['ParentGroup'], inplace=True)
+        df = df.drop(columns=['ParentGroup'])
 
         if not df.iloc[0]['Group']:
             df.loc[df.index[0], 'Group'] = ['unknown']
@@ -424,11 +424,11 @@ def extract_categories(repo_data: str, repository_metadata: Result) -> Tuple[Res
         df.loc[df['Group'] == 'unknown', 'Group'] = np.nan
 
         valid = df[df['Group'].notna()].copy()
-        valid.rename(columns={
+        valid = valid.rename(columns={
             'Content': constants.PROP_VALUE,
             'Header': constants.PROP_ORIGINAL_HEADER,
             'ParentHeader': constants.PROP_PARENT_HEADER,
-        }, inplace=True)
+        })
 
         source = None
         if constants.CAT_README_URL in repository_metadata.results:

diff --git a/src/somef/parser/codeowners_parser.py b/src/somef/parser/codeowners_parser.py
@@ -21,7 +21,7 @@ def parse_codeowners_structured(dir_path, filename):
     return {"codeowners": codeowners}
 
 
-def parse_codeowners_file(file_path, metadata_result: Result, source, reconcile_authors=None) -> Result:
+def parse_codeowners_file(file_path, metadata_result: Result, source, reconcile_authors=None, repo_type=None, server_url=None) -> Result:
     try:
         logging.info(f"Reconcile authors flag: {reconcile_authors}")
         if Path(file_path).name.upper() == constants.CODEOWNERS_FILE:
@@ -51,14 +51,14 @@ def parse_codeowners_file(file_path, metadata_result: Result, source, reconcile_
                     added_maintainers.add(owner)
 
                     maintainer_data = {
-                        "value": owner,
-                        "username": owner,
-                        "role": "Maintainer",
-                        "type": "Person"
+                        constants.PROP_VALUE: owner,
+                        constants.PROP_USERNAME: owner,
+                        constants.PROP_ROLE: "Maintainer",
+                        constants.PROP_TYPE: "Person"
                     }
 
                     if reconcile_authors:
-                        user_info = enrich_github_user(owner)
+                        user_info = enrich_user(owner, repo_type, server_url)
                         if user_info:
                             if user_info.get(constants.PROP_CODEOWNERS_NAME):
                                 maintainer_data[constants.PROP_NAME] = user_info.get(constants.PROP_CODEOWNERS_NAME)
@@ -81,26 +81,89 @@ def parse_codeowners_file(file_path, metadata_result: Result, source, reconcile_
 
     return metadata_result
 
-def enrich_github_user(username):
-    """ Enrich user metadata using the appropriate platform API. 
-    Currently only GitHub is supported. 
-    """
-    try:
-        url = f"https://api.github.com/users/{username}"
-        response = requests.get(url, timeout=5)
+# def enrich_github_user(username):
+#     """ Enrich user metadata using the appropriate platform API. 
+#     Currently only GitHub is supported. 
+#     """
+#     try:
+#         url = f"https://api.github.com/users/{username}"
+#         response = requests.get(url, timeout=5)
 
-        if response.status_code != 200:
-            return None
+#         if response.status_code != 200:
+#             return None
 
-        data = response.json()
+#         data = response.json()
 
-        return {
-            constants.PROP_CODEOWNERS_NAME: data.get("name"),
-            constants.PROP_CODEOWNERS_COMPANY: data.get("company"),
-            constants.PROP_CODEOWNERS_EMAIL: data.get("email"),
-        }
+#         return {
+#             constants.PROP_CODEOWNERS_NAME: data.get("name"),
+#             constants.PROP_CODEOWNERS_COMPANY: data.get("company"),
+#             constants.PROP_CODEOWNERS_EMAIL: data.get("email"),
+#         }
 
 
-    except Exception:
+#     except Exception:
+#         return None
+
+
+def enrich_user(username, repo_type, server_url=None):
+    """
+    Enrich user metadata using the appropriate platform API (best-effort).
+
+    Parameters
+    ----------
+    username : str
+        Username to enrich; an empty value returns None without a request.
+    repo_type : constants.RepositoryType
+        GITHUB or GITLAB; any other value is logged as unsupported.
+    server_url : str, optional
+        Base URL of the GitLab instance (GITLAB only). Defaults to
+        "https://gitlab.com"; "https://" is prepended if no scheme is given.
+
+    Returns
+    -------
+    dict or None
+        Name, company and email (values may be None), or None on any failure.
+    """
+    from urllib.parse import quote
+    if not username:
+        return None
+    if repo_type == constants.RepositoryType.GITHUB:
+        platform = "GitHub"
+        # Percent-encode the name so it cannot alter the request path.
+        url = f"https://api.github.com/users/{quote(str(username), safe='')}"
+        params = None
+        company_key, email_key = "company", "email"
+    elif repo_type == constants.RepositoryType.GITLAB:
+        platform = "GitLab"
+        base_url = server_url or "https://gitlab.com"
+        if not base_url.startswith(("http://", "https://")):
+            base_url = "https://" + base_url
+        url = f"{base_url.rstrip('/')}/api/v4/users"
+        # Let requests build and percent-encode the query string.
+        params = {"username": username}
+        # NOTE(review): the user list may omit these fields; GET /users/:id may be needed - confirm.
+        company_key, email_key = "organization", "public_email"
+    else:
+        logging.warning(f"Unsupported repo_type {repo_type}")
+        return None
+
+    try:
+        response = requests.get(url, params=params, timeout=5)
+        if response.status_code != 200:
+            logging.warning(f"{platform} API request failed for {username}: {response.status_code}")
+            return None
+        data = response.json()
+        if repo_type == constants.RepositoryType.GITLAB:
+            # GitLab answers with a list of matches; an empty list means no such user.
+            if not data:
+                return None
+            data = data[0]
+        return {
+            constants.PROP_CODEOWNERS_NAME: data.get("name"),
+            constants.PROP_CODEOWNERS_COMPANY: data.get(company_key),
+            constants.PROP_CODEOWNERS_EMAIL: data.get(email_key),
+        }
+    except Exception as e:
+        logging.warning(f"Error enriching {platform} user {username}: {e}")
         return None
 
diff --git a/src/somef/process_files.py b/src/somef/process_files.py
@@ -49,7 +49,7 @@ def process_repository_files(repo_dir, metadata_result: Result, repo_type, owner
     -------
     @return: text of the main readme and a JSON dictionary (filtered_resp) with the findings in files
     """
-
+    global domain_gitlab
     if repo_type == constants.RepositoryType.GITLAB:      
         domain_gitlab = extract_gitlab_domain(metadata_result, repo_type)
 
@@ -244,12 +244,12 @@ def process_repository_files(repo_dir, metadata_result: Result, repo_type, owner
                                                    )
                 if filename.upper() == constants.CODEOWNERS_FILE:
                     # codeowners_json = parse_codeowners_structured(dir_path,filename)
-                    print("Processing CODEOWNERS file...")
+                    logging.info("Processing CODEOWNERS file...")
                     codeowner_file_url = get_file_link(repo_type, file_path, owner, repo_name, repo_default_branch,
                                                        repo_dir,
                                                        repo_relative_path, filename)
-                    
-                    metadata_result = parse_codeowners_file(os.path.join(dir_path, filename), metadata_result, codeowner_file_url, reconcile_authors)
+
+                    metadata_result = parse_codeowners_file(os.path.join(dir_path, filename), metadata_result, codeowner_file_url, reconcile_authors, repo_type, server_url=domain_gitlab)
                     parsed_build_files.add(filename.lower())
 
                 if filename.lower() == "codemeta.json":

diff --git a/src/somef/process_repository.py b/src/somef/process_repository.py
@@ -11,7 +11,7 @@
 from . import configuration
 from .process_results import Result
 from .regular_expressions import detect_license_spdx
-from .parser.codeowners_parser import enrich_github_user
+from .parser.codeowners_parser import enrich_user
 
 # Constructs a template HTTP header, which:
 # - has a key for the authorization token if passed via the authorization argument, otherwise
@@ -58,15 +58,12 @@ def rate_limit_get(*args, backoff_rate=2, initial_backoff=1, size_limit_mb=const
             content_length = head_response.headers.get("Content-Length")
             if content_length is not None:
                 size_bytes = int(content_length)
-                logging.info(f"HEAD Content-Length: {size_bytes}")
                 if size_bytes > size_limit_bytes:
                     logging.warning(
                         f"Download size {size_bytes} bytes exceeds limit of {size_limit_bytes} bytes. Skipping download."
                     )
                     return None, None
             else:
-                # logging.warning(f"Could not determine file size for {url}. Skipping download.")
-                # return None, None
                 logging.warning(f"No Content-Length header for {url}. Proceeding with download anyway (unable to estimate size).")
         except Exception as e:
             logging.warning(f"HEAD/stream request failed: {e}. Continuing with GET...")
@@ -587,7 +584,7 @@ def load_online_repository_metadata(repository_metadata: Result, repository_url,
             if category == constants.CAT_OWNER:
                 if reconcile_authors:
                     logging.info("Enriching owner information from codeowners...")
-                    user_info = enrich_github_user(owner)
+                    user_info = enrich_user(owner,repo_type)
                     if user_info:
                         if user_info.get(constants.PROP_CODEOWNERS_NAME):
                             maintainer_data[constants.PROP_NAME] = user_info.get(constants.PROP_CODEOWNERS_NAME)
@@ -640,7 +637,12 @@ def load_online_repository_metadata(repository_metadata: Result, repository_url,
     if not ignore_api_metadata:
         languages_raw, date = rate_limit_get(filtered_resp['languages_url'], headers=header)
 
-        languages = languages_raw.json()
+        if languages_raw is None:
+            logging.warning("Skipping languages: rate_limit_get returned None (size limit or network error)")
+            languages = {}
+        else:
+            languages = languages_raw.json()
+
         if "message" in languages:
             logging.error("Error while retrieving languages: " + languages["message"])
         else:
@@ -778,12 +780,29 @@ def download_github_files(directory, owner, repo_name, repo_ref, authorization):
     # download the repo at the selected branch with the link
     repo_archive_url = f"https://github.com/{owner}/{repo_name}/archive/{repo_ref}.zip"
     logging.info(f"Downloading {repo_archive_url}")
+
     repo_download, date = rate_limit_get(repo_archive_url, headers=header_template(authorization))
 
     if repo_download is None:
         logging.warning(f"Repository archive skipped due to size limit: {constants.SIZE_DOWNLOAD_LIMIT_MB} MB or not content lenght.")
         return None
 
+    if repo_download.status_code == 300:
+        logging.warning(f"Ambiguous ref detected for {repo_ref}, trying tags/heads resolution")
+
+        for ref_type in ["tags", "heads"]:
+            repo_archive_url = f"https://github.com/{owner}/{repo_name}/archive/refs/{ref_type}/{repo_ref}.zip"
+            logging.info(f"Trying to download {repo_archive_url}")
+
+            repo_download, date = rate_limit_get(repo_archive_url, headers=header_template(authorization))
+
+            if repo_download is None:
+                    logging.warning(f"Repository archive skipped due to size limit: {constants.SIZE_DOWNLOAD_LIMIT_MB} MB or not content length.")
+                    return None
+
+            if repo_download.status_code == 200:
+                break
+
     if repo_download.status_code == 404:
         logging.error(f"Error: Archive request failed with HTTP {repo_download.status_code}")
         repo_archive_url = f"https://github.com/{owner}/{repo_name}/archive/main.zip"
@@ -794,7 +813,8 @@ def download_github_files(directory, owner, repo_name, repo_ref, authorization):
             return None
 
     if repo_download.status_code != 200:
-        sys.exit(f"Error: Archive request failed with HTTP {repo_download.status_code}")
+        logging.error(f"Error: Archive request failed with HTTP {repo_download.status_code}")
+        return None
 
     repo_zip = repo_download.content
 
@@ -973,6 +993,10 @@ def get_all_paginated_results(base_url, headers, per_page=100):
         url = f"{base_url}?per_page={per_page}&page={page}"
         response, _ = rate_limit_get(url, headers=headers)
 
+        if response is None:
+            logging.warning(f"Skipping page {page}: rate_limit_get returned None (size limit or network error)")
+            break
+
         if response.status_code != 200:
             logging.warning(f"GitHub API error on page {page}: {response.status_code}")
             break
@@ -985,3 +1009,4 @@ def get_all_paginated_results(base_url, headers, per_page=100):
         page += 1
 
     return all_results
+