diff --git a/github_scripts/get_git_sources.py b/github_scripts/get_git_sources.py index ca233975..da5b3271 100644 --- a/github_scripts/get_git_sources.py +++ b/github_scripts/get_git_sources.py @@ -3,27 +3,26 @@ # The file LICENCE, distributed with this code, contains details of the terms # under which the code may be used. # ----------------------------------------------------------------------------- - """ -Clone sources for a rose-stem run for use with git bdiff module in scripts +Helper functions for cloning git sources in command line builds """ import re import subprocess +from datetime import datetime from typing import Optional from pathlib import Path from shutil import rmtree import shlex +import sys import logging logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO, stream=sys.stdout) def run_command( - command: str, - check: bool = True, - capture: bool = True, - timeout: int = 600 + command: str, check: bool = True, capture: bool = True, timeout: int = 600 ) -> Optional[subprocess.CompletedProcess]: """ Run a subprocess command and return the result object @@ -36,15 +35,13 @@ def run_command( args = shlex.split(command) try: - # Note: text=True and capture_output=True have high overhead - # for large buffers. Use capture=False for fire-and-forget tasks. result = subprocess.run( args, capture_output=capture, text=capture, timeout=timeout, shell=False, - check=False + check=False, ) if check and result.returncode != 0: err_msg = (result.stderr or "").strip() @@ -59,29 +56,198 @@ def run_command( raise +def validate_dependencies(dependencies: dict) -> None: + """ + Check that the dependencies file dictionary matches format expectations. + Each dictionary value should be a list of dictionaries (or a single dictionary) + Those dictionaries should have a "source" and a "ref" key + """ + for item, values in dependencies.items(): + failed = False + if isinstance(values, dict): + values = [values] + if not isinstance(values, list): + failed = True + else: + for entry in values: + if not isinstance(entry, dict) or ( + "source" not in entry or "ref" not in entry + ): + failed = True + if failed: + raise ValueError( + f"The dependency {item} does not contain a list of dictionaries (or a " + "single dictionary) with keys of 'source' and 'ref'.\nPlease edit your " + "dependencies.yaml file to satisfy this." + ) + + +def datetime_str() -> str: + """ + Create and return a datetime string at the current time + """ + return datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + +def get_source( + source: str, + ref: str, + dest: Path, + repo: str, + use_mirrors: bool = False, + mirror_loc: Path = Path(""), +) -> None: + """ + Call functions to clone or rsync git source + """ + + if ".git" in source: + if use_mirrors: + logger.info( + f"[{datetime_str()}] Cloning {repo} from {mirror_loc} at ref {ref}" + ) + mirror_loc = Path(mirror_loc) / "MetOffice" / repo + clone_repo_mirror(source, ref, mirror_loc, dest) + else: + logger.info(f"[{datetime_str()}] Cloning {repo} from {source} at ref {ref}") + clone_repo(source, ref, dest) + else: + logger.info(f"[{datetime_str()}] Syncing {repo} at ref {ref}") + sync_repo(source, ref, dest) + + +def merge_source( + source: str, + ref: str, + dest: Path, + repo: str, + use_mirrors: bool = False, + mirror_loc: Path = Path(""), +) -> None: + """ + Merge git source into a local git clone. Assumes dest is a git clone that this + source can be merged into. + """ + + logger.info( + f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Merging " + f"{source} at ref {ref} into {repo}" + ) + + if use_mirrors: + remote_path = Path(mirror_loc) / "MetOffice" / repo + else: + remote_path = source + run_command(f"git -C {dest} remote add local {remote_path}") + + if use_mirrors: + fetch = determine_mirror_fetch(source, ref) + else: + fetch = ref + + run_command(f"git -C {dest} fetch local {fetch}") + command = f"git -C {dest} merge --no-gpg-sign FETCH_HEAD" + result = run_command(command, check=False) + if result.returncode: + unmerged_files = get_unmerged(dest) + if unmerged_files: + handle_merge_conflicts(source, ref, dest, repo) + else: + raise subprocess.CalledProcessError( + result.returncode, command, result.stdout, result.stderr + ) + + # Remove the added remote + run_command(f"git -C {dest} remote remove local") + + +def handle_merge_conflicts(source: str, ref: str, loc: Path, dependency: str) -> None: + """ + If merge conflicts are in `rose-stem/` or `dependencies.yaml` then accept the + current changes and mark as resolved. + If others remain then raise an error + """ + + # For suites, merge conflicts in these files/directories are unimportant so accept + # the current changes + for filepath in ("dependencies.yaml", "rose-stem"): + logger.warning(f"Ignoring merge conflicts in {filepath}") + run_command(f"git -C {loc} checkout --ours -- {filepath}") + run_command(f"git -C {loc} add {filepath}") + + # Check if there are any remaining merge conflicts + unmerged = get_unmerged(loc) + if unmerged: + files = "\n".join(f for f in unmerged) + raise RuntimeError( + "\nA merge conflict has been identified while merging the following branch " + f"into the {dependency} source:\n\nsource: {source}\nref: {ref}\n\n" + f"with conflicting files:{files}" + "\n\nThese will need changing in the source branches to be useable together" + ) + + +def get_unmerged(loc: Path) -> list[str]: + """ + Return list of unmerged files in a git clone + """ + + files = run_command(f"git -C {loc} --no-pager diff --name-only --diff-filter=U") + return files.stdout.split() + + def clone_repo_mirror( - source: str, repo_ref: str, parent: str, mirror_loc: Path, loc: Path + repo_source: str, + repo_ref: str, + mirror_loc: Path, + loc: Path, ) -> None: """ Clone a repo source using a local git mirror. Assume the mirror is set up as per the Met Office + - repo_source: ssh url of the source repository + - repo_ref: git ref for the source. An empty string will get the default branch + - mirror_loc: path to the local git mirrors + - loc: path to clone the repository to """ - # Remove if this clone already exists + # If the repository exists and isn't a git repo, exit now as we don't want to + # overwrite it if loc.exists(): - rmtree(loc) + if not Path(loc / ".git").exists(): + raise RuntimeError( + f"The destination for the clone of {repo_source} already exists but " + "isn't a git directory. Exiting so as to not overwrite it." + ) - command = f"git clone {mirror_loc} {loc}" - run_command(command) + # Clone if the repo doesn't exist + else: + command = f"git clone {mirror_loc} {loc}" + run_command(command) - # If not provided a ref, return + # If not provided a ref, pull the latest repository and return if not repo_ref: + run_command(f"git -C {loc} pull") return - source = source.removeprefix("git@github.com:") - user = source.split("/")[0] + fetch = determine_mirror_fetch(repo_source, repo_ref) + commands = ( + f"git -C {loc} fetch origin {fetch}", + f"git -C {loc} checkout FETCH_HEAD", + ) + for command in commands: + run_command(command) + + +def determine_mirror_fetch(repo_source: str, repo_ref: str) -> str: + """ + Determine the fetch ref for the git mirrors + """ + + repo_source = repo_source.removeprefix("git@github.com:") + user = repo_source.split("/")[0] # Check that the user is different to the Upstream User - if user in parent.split("/")[0]: + if "MetOffice" in user: user = None # If the ref is a hash then we don't need the fork user as part of the fetch. @@ -90,36 +256,41 @@ def clone_repo_mirror( fetch = repo_ref else: fetch = f"{user}/{repo_ref}" - commands = ( - f"git -C {loc} fetch origin {fetch}", - f"git -C {loc} checkout FETCH_HEAD", - ) - for command in commands: - run_command(command) + + return fetch def clone_repo(repo_source: str, repo_ref: str, loc: Path) -> None: """ Clone the repo and checkout the provided ref Only if a remote source + - repo_source: ssh url of the source repository + - repo_ref: git ref for the source. An empty string will get the default branch + - loc: path to clone the repository to """ - # Remove if this clone already exists - if loc.exists(): - rmtree(loc) + if not loc.exists(): + # Create a clean clone location + loc.mkdir(parents=True) - # Create a clean clone location - loc.mkdir(parents=True) - - commands = ( - f"git -C {loc} init", - f"git -C {loc} remote add origin {repo_source}", - f"git -C {loc} fetch origin {repo_ref}", - f"git -C {loc} checkout FETCH_HEAD", - f"git -C {loc} fetch origin main:main", - ) - for command in commands: - run_command(command) + # This process is equivalent to doing a git clone + # It saves a small amount of space by not fetching all refs + commands = ( + f"git -C {loc} init", + f"git -C {loc} remote add origin {repo_source}", + f"git -C {loc} fetch origin {repo_ref}", + f"git -C {loc} checkout FETCH_HEAD", + f"git -C {loc} fetch origin main:main", + ) + for command in commands: + run_command(command) + else: + commands = ( + f"git -C {loc} fetch origin {repo_ref}", + f"git -C {loc} checkout FETCH_HEAD", + ) + for command in commands: + run_command(command) def sync_repo(repo_source: str, repo_ref: str, loc: Path) -> None: @@ -156,10 +327,31 @@ def sync_repo(repo_source: str, repo_ref: str, loc: Path) -> None: command = f"git -C {loc} fetch origin main:main" result = run_command(command, check=False) if result and result.returncode: - print("Warning - fetching main from origin resulted in an error") - print("This is likely due to the main branch already existing") - print(f"Error message:\n\n{result.stderr}") + logger.warning( + "Fetching main from origin resulted in an error." + "This is likely due to the main branch already existing" + f"\nError message:\n\n{result.stderr}" + ) if repo_ref: command = f"git -C {loc} checkout {repo_ref}" run_command(command) + + +def set_https(dependencies: dict) -> dict: + """ + Change sources in a dependencies dictionary to use https instead of ssh + """ + + logger.info("Modifying Dependencies to use https") + for dependency, opts in dependencies.items(): + if not isinstance(opts, list): + opts = [opts] + for values in opts: + if values["source"].startswith("git@github.com:"): + source = dependencies[dependency]["source"] + dependencies[dependency]["source"] = source.replace( + "git@github.com:", "https://github.com/" + ) + + return dependencies diff --git a/github_scripts/merge_sources.py b/github_scripts/merge_sources.py new file mode 100755 index 00000000..db47d5ba --- /dev/null +++ b/github_scripts/merge_sources.py @@ -0,0 +1,113 @@ +#!/usr/bin/env python3 +# ----------------------------------------------------------------------------- +# (C) Crown copyright Met Office. All rights reserved. +# The file LICENCE, distributed with this code, contains details of the terms +# under which the code may be used. +# ----------------------------------------------------------------------------- +""" +Script to clone and merge git sources +""" + +import argparse +import os +import yaml +from pathlib import Path +from get_git_sources import get_source, merge_source, set_https, validate_dependencies +import logging +import sys + + +def parse_args(): + """ + Parse arguments + """ + + parser = argparse.ArgumentParser(description="Extract and merge git sources") + parser.add_argument( + "-d", + "--dependencies", + default=Path(__file__).parent, + type=Path, + help="Path to the dependencies.yaml file", + ) + parser.add_argument( + "-p", + "--path", + default=None, + help="The path to extract the sources to. If part of a cylc suite, it will " + "default to $CYLC_WORKFLOW_SHARE_DIR/source, otherwise __file__/source", + ) + parser.add_argument( + "-m", + "--mirrors", + action="store_true", + help="If true, attempts to use local git mirrors", + ) + parser.add_argument( + "--mirror_loc", + default="/data/users/gitassist/git_mirrors", + help="Location of github mirrors", + ) + parser.add_argument( + "--tokens", + action="store_true", + help="If true, https github sources will be used, requiring github " + "authentication via Personal Access Tokens", + ) + args = parser.parse_args() + args.dependencies = args.dependencies.resolve() + if args.dependencies.is_dir(): + args.dependencies = args.dependencies / "dependencies.yaml" + + if not args.path: + args.path = Path(os.getenv("CYLC_WORKFLOW_SHARE_DIR", __file__)) / "source" + args.path = args.path.resolve() + + return args + + +def main(): + """ + Main Function + """ + + args = parse_args() + + logging.basicConfig(level=logging.INFO, stream=sys.stdout) + + dependencies = yaml.safe_load(args.dependencies.read_text()) + validate_dependencies(dependencies) + + if args.tokens: + dependencies = set_https(dependencies) + + for dependency, opts in dependencies.items(): + dest = args.path / dependency + + if not isinstance(opts, list): + opts = [opts] + + # Clone the first provided source + values = opts.pop(0) + get_source( + values["source"], + values["ref"], + dest, + dependency, + args.mirrors, + args.mirror_loc, + ) + # For all other sources, attempt to merge into the first + for values in opts: + merge_source( + values["source"], + values["ref"], + dest, + dependency, + args.mirrors, + args.mirror_loc, + ) + + +if __name__ == "__main__": + main() diff --git a/github_scripts/rose_stem_extract_source.py b/github_scripts/rose_stem_extract_source.py index 5a5e590d..4b42e79a 100755 --- a/github_scripts/rose_stem_extract_source.py +++ b/github_scripts/rose_stem_extract_source.py @@ -14,24 +14,9 @@ import os from pathlib import Path from ast import literal_eval -from get_git_sources import clone_repo, clone_repo_mirror, sync_repo -from datetime import datetime - - -def set_https(dependencies: dict) -> dict: - """ - Change sources in a dependencies dictions to use https instead of ssh - """ - - print("Modifying Dependencies") - for dependency, values in dependencies.items(): - if values["source"].startswith("git@github.com:"): - source = dependencies[dependency]["source"] - dependencies[dependency]["source"] = source.replace( - "git@github.com:", "https://github.com/" - ) - - return dependencies +from get_git_sources import get_source, merge_source, set_https, validate_dependencies +import logging +import sys def main() -> None: @@ -47,37 +32,44 @@ def main() -> None: 4. If USE_MIRRORS is True, clone from local mirrors at GIT_MIRROR_LOC """ + logging.basicConfig(level=logging.INFO, stream=sys.stdout) + clone_loc = Path(os.environ["SOURCE_DIRECTORY"]) dependencies: dict = literal_eval(os.environ["DEPENDENCIES"]) + validate_dependencies(dependencies) - if os.environ.get("USE_TOKENS", "False") == "True": + if os.environ.get("USE_TOKENS", "false").lower() == "true": dependencies = set_https(dependencies) - for dependency, values in dependencies.items(): + use_mirrors = os.environ.get("USE_MIRRORS", "false").lower() == "true" + mirror_loc = Path(os.getenv("GIT_MIRROR_LOC", "")) / "MetOffice" + + for dependency, opts in dependencies.items(): loc = clone_loc / dependency - if ".git" in values["source"]: - if os.environ.get("USE_MIRRORS", "False") == "True": - mirror_loc = Path(os.environ["GIT_MIRROR_LOC"]) / values["parent"] - print( - f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Cloning " - f"{dependency} from {mirror_loc} at ref {values['ref']}" - ) - clone_repo_mirror( - values["source"], values["ref"], values["parent"], mirror_loc, loc - ) - else: - print( - f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Cloning " - f"{dependency} from {values['source']} at ref {values['ref']}" - ) - clone_repo(values["source"], values["ref"], loc) - else: - print( - f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Syncing " - f"{dependency} at ref {values['ref']}" + if not isinstance(opts, list): + opts = [opts] + + # Clone the first provided source + values = opts.pop(0) + get_source( + values["source"], + values["ref"], + loc, + dependency, + use_mirrors, + mirror_loc, + ) + # For all other sources, attempt to merge into the first + for values in opts: + merge_source( + values["source"], + values["ref"], + loc, + dependency, + use_mirrors, + mirror_loc, ) - sync_repo(values["source"], values["ref"], loc) if __name__ == "__main__": diff --git a/github_scripts/suite_data.py b/github_scripts/suite_data.py index add597f9..e06b702d 100644 --- a/github_scripts/suite_data.py +++ b/github_scripts/suite_data.py @@ -323,7 +323,7 @@ def generate_cylc_url(self) -> str: Generate a markdown url to the cylc review page of a workflow """ suite_user = os.environ["USER"] - encoded_workflow_id = self.workflow_id.replace('/','%2F') + encoded_workflow_id = self.workflow_id.replace("/", "%2F") cylc_review = ( f"[{self.workflow_id}](https://cylchub/services/cylc-review/cycles" @@ -332,7 +332,6 @@ def generate_cylc_url(self) -> str: return cylc_review - def get_suite_starttime(self) -> str: """ Read the suite starttime from the suite database