From 5f1a8194dde1ac8b07afb0019ff773b686f78224 Mon Sep 17 00:00:00 2001 From: Marc LeBlanc Date: Thu, 14 Mar 2024 22:39:18 +0300 Subject: [PATCH] Set to always run in batches, improved parallelization, added retries --- config/test-repos-to-convert.yaml | 53 +-- repo-converter/build/run.py | 768 ++++++++++++++++++------------ 2 files changed, 477 insertions(+), 344 deletions(-) diff --git a/config/test-repos-to-convert.yaml b/config/test-repos-to-convert.yaml index 11ec6fe..f11cf5e 100644 --- a/config/test-repos-to-convert.yaml +++ b/config/test-repos-to-convert.yaml @@ -1,72 +1,64 @@ + allura: type: SVN svn-repo-code-root: https://svn.apache.org/repos/asf/allura code-host-name: svn.apache.org git-org-name: asf - fetch-batch-size: 100 ambari: type: SVN svn-repo-code-root: https://svn.apache.org/repos/asf/ambari code-host-name: svn.apache.org git-org-name: asf - fetch-batch-size: 100 + layout: standard ant: type: SVN svn-repo-code-root: https://svn.apache.org/repos/asf/ant code-host-name: svn.apache.org git-org-name: asf - fetch-batch-size: 100 apr: type: SVN svn-repo-code-root: https://svn.apache.org/repos/asf/apr code-host-name: svn.apache.org git-org-name: asf - fetch-batch-size: 100 beam: type: SVN svn-repo-code-root: https://svn.apache.org/repos/asf/beam code-host-name: svn.apache.org git-org-name: asf - fetch-batch-size: 100 board: type: SVN svn-repo-code-root: https://svn.apache.org/repos/asf/board code-host-name: svn.apache.org git-org-name: asf - fetch-batch-size: 100 bugs: type: SVN svn-repo-code-root: https://svn.apache.org/repos/asf/bugs code-host-name: svn.apache.org git-org-name: asf - fetch-batch-size: 100 cocoon: type: SVN svn-repo-code-root: https://svn.apache.org/repos/asf/cocoon code-host-name: svn.apache.org git-org-name: asf - fetch-batch-size: 100 comdev: type: SVN svn-repo-code-root: https://svn.apache.org/repos/asf/comdev code-host-name: svn.apache.org git-org-name: asf - fetch-batch-size: 100 commons: type: SVN svn-repo-code-root: 
https://svn.apache.org/repos/asf/commons code-host-name: svn.apache.org git-org-name: asf - fetch-batch-size: 100 crunch: type: SVN @@ -74,7 +66,6 @@ crunch: code-host-name: svn.apache.org git-org-name: asf layout: standard - fetch-batch-size: 100 curator: type: SVN @@ -82,231 +73,193 @@ curator: code-host-name: svn.apache.org git-org-name: asf layout: standard - fetch-batch-size: 100 datafu: type: SVN svn-repo-code-root: https://svn.apache.org/repos/asf/datafu/site code-host-name: svn.apache.org git-org-name: asf - fetch-batch-size: 100 db: type: SVN svn-repo-code-root: https://svn.apache.org/repos/asf/db code-host-name: svn.apache.org git-org-name: asf - fetch-batch-size: 100 + layout: standard eagle: type: SVN svn-repo-code-root: https://svn.apache.org/repos/asf/eagle code-host-name: svn.apache.org git-org-name: asf - fetch-batch-size: 100 falcon: type: SVN svn-repo-code-root: https://svn.apache.org/repos/asf/falcon code-host-name: svn.apache.org git-org-name: asf - fetch-batch-size: 100 fundraising: type: SVN svn-repo-code-root: https://svn.apache.org/repos/asf/fundraising code-host-name: svn.apache.org git-org-name: asf - fetch-batch-size: 100 httpcomponents: type: SVN svn-repo-code-root: https://svn.apache.org/repos/asf/httpcomponents code-host-name: svn.apache.org git-org-name: asf - fetch-batch-size: 100 httpd: type: SVN svn-repo-code-root: https://svn.apache.org/repos/asf/httpd code-host-name: svn.apache.org git-org-name: asf - fetch-batch-size: 100 infrastructure: type: SVN svn-repo-code-root: https://svn.apache.org/repos/asf/infrastructure code-host-name: svn.apache.org git-org-name: asf - fetch-batch-size: 100 jackrabbit: type: SVN svn-repo-code-root: https://svn.apache.org/repos/asf/jackrabbit code-host-name: svn.apache.org git-org-name: asf - fetch-batch-size: 100 johnzon: type: SVN svn-repo-code-root: https://svn.apache.org/repos/asf/johnzon code-host-name: svn.apache.org git-org-name: asf - fetch-batch-size: 100 karaf: type: SVN svn-repo-code-root: 
https://svn.apache.org/repos/asf/karaf code-host-name: svn.apache.org git-org-name: asf - fetch-batch-size: 100 knox: type: SVN svn-repo-code-root: https://svn.apache.org/repos/asf/knox code-host-name: svn.apache.org git-org-name: asf - fetch-batch-size: 100 kylin: type: SVN svn-repo-code-root: https://svn.apache.org/repos/asf/kylin code-host-name: svn.apache.org git-org-name: asf - fetch-batch-size: 100 lens: type: SVN svn-repo-code-root: https://svn.apache.org/repos/asf/lens code-host-name: svn.apache.org git-org-name: asf - fetch-batch-size: 100 lucene: type: SVN svn-repo-code-root: https://svn.apache.org/repos/asf/lucene code-host-name: svn.apache.org git-org-name: asf - fetch-batch-size: 100 manifoldcf: type: SVN svn-repo-code-root: https://svn.apache.org/repos/asf/manifoldcf code-host-name: svn.apache.org git-org-name: asf - fetch-batch-size: 100 maven: type: SVN svn-repo-code-root: https://svn.apache.org/repos/asf/maven code-host-name: svn.apache.org git-org-name: asf - fetch-batch-size: 100 openoffice: type: SVN svn-repo-code-root: https://svn.apache.org/repos/asf/openoffice code-host-name: svn.apache.org git-org-name: asf - fetch-batch-size: 100 parquet: type: SVN svn-repo-code-root: https://svn.apache.org/repos/asf/parquet/site code-host-name: svn.apache.org git-org-name: asf - fetch-batch-size: 100 pdfbox: type: SVN svn-repo-code-root: https://svn.apache.org/repos/asf/pdfbox code-host-name: svn.apache.org git-org-name: asf - fetch-batch-size: 100 perl: type: SVN svn-repo-code-root: https://svn.apache.org/repos/asf/perl code-host-name: svn.apache.org git-org-name: asf - fetch-batch-size: 100 phoenix: type: SVN svn-repo-code-root: https://svn.apache.org/repos/asf/phoenix code-host-name: svn.apache.org git-org-name: asf - fetch-batch-size: 100 poi: type: SVN svn-repo-code-root: https://svn.apache.org/repos/asf/poi code-host-name: svn.apache.org git-org-name: asf - fetch-batch-size: 100 reef: type: SVN svn-repo-code-root: 
https://svn.apache.org/repos/asf/reef/site code-host-name: svn.apache.org git-org-name: asf - fetch-batch-size: 100 spamassassin: type: SVN svn-repo-code-root: https://svn.apache.org/repos/asf/spamassassin code-host-name: svn.apache.org git-org-name: asf - fetch-batch-size: 100 subversion: type: SVN svn-repo-code-root: https://svn.apache.org/repos/asf/subversion code-host-name: svn.apache.org git-org-name: asf - fetch-batch-size: 100 tomcat: type: SVN svn-repo-code-root: https://svn.apache.org/repos/asf/tomcat code-host-name: svn.apache.org git-org-name: asf - fetch-batch-size: 100 twill: type: SVN svn-repo-code-root: https://svn.apache.org/repos/asf/twill code-host-name: svn.apache.org git-org-name: asf - fetch-batch-size: 100 uima: type: SVN svn-repo-code-root: https://svn.apache.org/repos/asf/uima code-host-name: svn.apache.org git-org-name: asf - fetch-batch-size: 100 - -usergrid: - type: SVN - svn-repo-code-root: https://svn.apache.org/repos/asf/usergrid/site - code-host-name: svn.apache.org - git-org-name: asf - fetch-batch-size: 100 xerces: type: SVN svn-repo-code-root: https://svn.apache.org/repos/asf/xerces code-host-name: svn.apache.org git-org-name: asf - fetch-batch-size: 100 xmlbeans: type: SVN svn-repo-code-root: https://svn.apache.org/repos/asf/xmlbeans code-host-name: svn.apache.org git-org-name: asf - fetch-batch-size: 100 zest: type: SVN diff --git a/repo-converter/build/run.py b/repo-converter/build/run.py index 22d90a1..265a01a 100644 --- a/repo-converter/build/run.py +++ b/repo-converter/build/run.py @@ -20,14 +20,21 @@ # SVN # Branches - # Sort out how to see all branches in Sourcegraph - # Atlassian's Java binary to tidy up branches and tags? + # Problem + # Sort out how to see all branches in Sourcegraph + # Approaches + # Can we accomplish the same with a bare clone, or do we need a working copy for git reasons? 
+ # for a bare repo: git symbolic-ref HEAD refs/heads/trunk + # Do we need a Python Git library to modify the repo metadata (may be safer, if the library uses lock files?), or can we do it as a file-based operation? + # Atlassian's Java binary to tidy up branches and tags? # Parallelism - # Fork processes earlier, so more of the slow serial stuff for each repo happens in its own thread - # ex. git svn log + # Add a max concurrent repos environment variable - #.gitignore files + # SVN commands hanging + # Add a timeout for hanging svn info and svn log commands, if data isn't transferring + + # .gitignore files # git svn create-ignore # git svn show-ignore # https://git-scm.com/docs/git-svn#Documentation/git-svn.txt-emcreate-ignoreem @@ -59,12 +66,11 @@ # Should be automatic? # Run the one from Atlassian's Jar file if needed - # git default branch - # Configure for the individual repo, before git init, so that it doesn't need to be set globally - # for a bare repo git symbolic-ref HEAD refs/heads/trunk + # git default branch for init / initial clone + # Find a way to configure it for each repo, before or after git init, so that it doesn't need to be set globally, risk collisions # git list all config - # git -C $repo_path config --list + # git -C $local_repo_path config --list # Find a python library for working with git repos programmatically instead of depending on git CLI # https://gitpython.readthedocs.io/en/stable/tutorial.html @@ -81,6 +87,7 @@ import logging # https://docs.python.org/3/library/logging.html import multiprocessing # https://docs.python.org/3/library/multiprocessing.html import os # https://docs.python.org/3/library/os.html +import random # https://docs.python.org/3/library/random.html import shutil # https://docs.python.org/3/library/shutil.html import signal # https://docs.python.org/3/library/signal.html import subprocess # https://docs.python.org/3/library/subprocess.html @@ -127,12 +134,12 @@ def load_config_from_environment_variables(): # 
Set defaults in case they're not defined # DEBUG INFO WARNING ERROR CRITICAL - environment_variables_dict['LOG_LEVEL'] = os.environ.get("LOG_LEVEL", "DEBUG") - environment_variables_dict['REPO_CONVERTER_INTERVAL_SECONDS'] = int(os.environ.get("REPO_CONVERTER_INTERVAL_SECONDS", 3600)) + environment_variables_dict["LOG_LEVEL"] = os.environ.get("LOG_LEVEL", "DEBUG") + environment_variables_dict["REPO_CONVERTER_INTERVAL_SECONDS"] = int(os.environ.get("REPO_CONVERTER_INTERVAL_SECONDS", 3600)) # Path inside the container to find this file, only change to match if the right side of the volume mapping changes - environment_variables_dict['REPOS_TO_CONVERT'] = os.environ.get("REPOS_TO_CONVERT", "/sourcegraph/repos-to-convert.yaml") + environment_variables_dict["REPOS_TO_CONVERT"] = os.environ.get("REPOS_TO_CONVERT", "/sourcegraph/repos-to-convert.yaml") # Path inside the container to find this directory, only change to match if the right side of the volume mapping changes - environment_variables_dict['SRC_SERVE_ROOT'] = os.environ.get("SRC_SERVE_ROOT", "/sourcegraph/src-serve-root") + environment_variables_dict["SRC_SERVE_ROOT"] = os.environ.get("SRC_SERVE_ROOT", "/sourcegraph/src-serve-root") def load_config_from_repos_to_convert_file(): @@ -152,14 +159,14 @@ def configure_logging(): datefmt = "%Y-%m-%d %H:%M:%S", encoding = "utf-8", format = f"%(asctime)s; {script_name}; %(levelname)s; %(message)s", - level = environment_variables_dict['LOG_LEVEL'] + level = environment_variables_dict["LOG_LEVEL"] ) def git_config_safe_directory(): - cmd_cfg_git_safe_directory = ["git", "config", "--system", "--replace-all", "safe.directory", "\"*\""] - subprocess_run(cmd_cfg_git_safe_directory) + cmd_git_safe_directory = ["git", "config", "--system", "--replace-all", "safe.directory", "\"*\""] + subprocess_run(cmd_git_safe_directory) def parse_repos_to_convert_file_into_repos_dict(): @@ -174,7 +181,7 @@ def parse_repos_to_convert_file_into_repos_dict(): try: # Open the file - with 
open(environment_variables_dict['REPOS_TO_CONVERT'], "r") as repos_to_convert_file: + with open(environment_variables_dict["REPOS_TO_CONVERT"], "r") as repos_to_convert_file: # This should return a dict repos_dict = yaml.safe_load(repos_to_convert_file) @@ -202,307 +209,460 @@ def parse_repos_to_convert_file_into_repos_dict(): def clone_svn_repos(): - # Loop through the repos_dict, find the type: SVN repos, then add them to the dict of SVN repos + # Loop through the repos_dict, find the type: SVN repos, then fork off a process to clone them for repo_key in repos_dict.keys(): - # If this repo isn't SVN, skip it - if repos_dict[repo_key].get('type','').lower() != 'svn': - continue - - # Get config parameters read from repos-to-clone.yaml, and set defaults if they're not provided - git_repo_name = repo_key - svn_repo_code_root = repos_dict[repo_key].get('svn-repo-code-root', None) - username = repos_dict[repo_key].get('username', None) - password = repos_dict[repo_key].get('password', None) - code_host_name = repos_dict[repo_key].get('code-host-name', None) - git_org_name = repos_dict[repo_key].get('git-org-name', None) - git_default_branch = repos_dict[repo_key].get('git-default-branch','main') - fetch_batch_size = repos_dict[repo_key].get('fetch-batch-size', None) - authors_file_path = repos_dict[repo_key].get('authors-file-path', None) - authors_prog_path = repos_dict[repo_key].get('authors-prog-path', None) - git_ignore_file_path = repos_dict[repo_key].get('git-ignore-file-path', None) - layout = repos_dict[repo_key].get('layout', None) - trunk = repos_dict[repo_key].get('trunk', None) - tags = repos_dict[repo_key].get('tags', None) - branches = repos_dict[repo_key].get('branches', None) - - ## Parse config parameters into command args - # TODO: Interpret code_host_name, git_org_name, and git_repo_name if not given - # ex. 
https://svn.apache.org/repos/asf/parquet/site - # code_host_name = svn.apache.org # can get by removing url scheme, if any, till the first / - # arbitrary path on server = repos # optional, can either be a directory, or may actually be the repo - # git_org_name = asf - # git_repo_name = parquet - # git repo root = site # arbitrary path inside the repo where contributors decided to start storing /trunk /branches /tags and other files to be included in the repo - repo_path = str(environment_variables_dict['SRC_SERVE_ROOT']+"/"+code_host_name+"/"+git_org_name+"/"+git_repo_name) - - ## Define common command args - arg_svn_non_interactive = [ "--non-interactive" ] # Do not prompt, just fail if the command doesn't work, only used for direct `svn` command - arg_svn_username = [ "--username", username ] - arg_svn_password = [ "--password", password ] # Only used for direct `svn` commands - arg_svn_echo_password = None - arg_svn_repo_code_root = [ svn_repo_code_root ] - arg_git = [ "git", "-C", repo_path ] - arg_git_cfg = arg_git + [ "config" ] - arg_git_svn = arg_git + [ "svn" ] - arg_batch_end_revision = [ f"{git_config_namespace}.batch-end-revision" ] - - ## Define commands - cmd_run_svn_info = [ "svn", "info" ] + arg_svn_repo_code_root + arg_svn_non_interactive - cmd_run_svn_log = [ "svn", "log", "--xml", "--with-no-revprops" ] + arg_svn_repo_code_root + arg_svn_non_interactive - cmd_cfg_git_default_branch = arg_git_cfg + [ "--global", "init.defaultBranch", git_default_branch ] # Possibility of collisions if multiple of these are run overlapping, make sure it's quick between reading and using this - cmd_run_git_svn_init = arg_git_svn + [ "init" ] + arg_svn_repo_code_root - cmd_cfg_git_bare_clone = arg_git_cfg + [ "core.bare", "true" ] - cmd_cfg_git_authors_file = arg_git_cfg + [ "svn.authorsfile", authors_file_path ] - cmd_cfg_git_authors_prog = arg_git_cfg + [ "svn.authorsProg", authors_prog_path ] - cmd_run_git_svn_fetch = arg_git_svn + [ "fetch" ] - 
cmd_cfg_git_get_batch_end_revision = arg_git_cfg + [ "--get" ] + arg_batch_end_revision - cmd_cfg_git_set_batch_end_revision = arg_git_cfg + [ "--replace-all" ] + arg_batch_end_revision - cmd_cfg_git_get_svn_url = arg_git_cfg + [ "--get", "svn-remote.svn.url" ] - - ## Modify commands based on config parameters - if username: - cmd_run_svn_info += arg_svn_username - cmd_run_svn_log += arg_svn_username - cmd_run_git_svn_init += arg_svn_username - cmd_run_git_svn_fetch += arg_svn_username - - if password: - arg_svn_echo_password = True - cmd_run_svn_info += arg_svn_password - cmd_run_svn_log += arg_svn_password - - # States - # Create: - # State: - # The directory doesn't already exist - # The repo doesn't already exist - # How did we get here: - # First time - Create new path / repo / fetch job - # First run of the script - # New repo was added to the repos-to-convert.yaml file - # Repo was deleted from disk - # Approach: - # Harder to test for the negative, so assume we're in the Create state, unless we find we're in the Running or Update states - repo_state = "create" - # Running: - # State: - # An svn fetch process is still running - # How did we get here: - # Fetch process is still running from a previous run of the script - # Approach: - # Check first if the process is running, then continue this outer loop - # Update: - # State: - # Repo already exists, with a valid configuration - # How did we get here: - # A fetch job was previously run, but is not currently running - # Approach: - # Check if we're in the update state, then set repo_state = "update" - # repo_state = "update" - - - ## Check if we're in the Running state - # Check if a fetch process is currently running for this repo - cmd_run_git_svn_fetch_string = ' '.join(cmd_run_git_svn_fetch) + repo_type = repos_dict[repo_key].get('type','').lower() + + if "svn" in repo_type or "subversion" in repo_type: + + multiprocessing.Process(target=clone_svn_repo, name=f"clone_svn_repo_{repo_key}", 
args=(repo_key,)).start() + + +def clone_svn_repo(repo_key): + + # Get config parameters read from repos-to-clone.yaml, and set defaults if they're not provided + git_repo_name = repo_key + authors_file_path = repos_dict[repo_key].get('authors-file-path', None) + authors_prog_path = repos_dict[repo_key].get('authors-prog-path', None) + branches = repos_dict[repo_key].get('branches', None) + code_host_name = repos_dict[repo_key].get('code-host-name', None) + fetch_batch_size = repos_dict[repo_key].get('fetch-batch-size', 100) + git_default_branch = repos_dict[repo_key].get('git-default-branch','main') + git_ignore_file_path = repos_dict[repo_key].get('git-ignore-file-path', None) + git_org_name = repos_dict[repo_key].get('git-org-name', None) + layout = repos_dict[repo_key].get('layout', None) + password = repos_dict[repo_key].get('password', None) + svn_remote_repo_code_root = repos_dict[repo_key].get('svn-repo-code-root', None) + tags = repos_dict[repo_key].get('tags', None) + trunk = repos_dict[repo_key].get('trunk', None) + username = repos_dict[repo_key].get('username', None) + + ## Parse config parameters into command args + # TODO: Interpret code_host_name, git_org_name, and git_repo_name if not given + # ex. 
https://svn.apache.org/repos/asf/parquet/site + # code_host_name = svn.apache.org # can get by removing url scheme, if any, till the first / + # arbitrary path on server = repos # optional, can either be a directory, or may actually be the repo + # git_org_name = asf + # git_repo_name = parquet + # git repo root = site # arbitrary path inside the repo where contributors decided to start storing /trunk /branches /tags and other files to be included in the repo + local_repo_path = str(environment_variables_dict["SRC_SERVE_ROOT"]+"/"+code_host_name+"/"+git_org_name+"/"+git_repo_name) + + ## Define common command args + arg_batch_end_revision = [ f"{git_config_namespace}.batch-end-revision" ] + arg_git = [ "git", "-C", local_repo_path ] + arg_git_cfg = arg_git + [ "config" ] + arg_git_svn = arg_git + [ "svn" ] + arg_svn_echo_password = None + arg_svn_non_interactive = [ "--non-interactive" ] # Do not prompt, just fail if the command doesn't work, only used for direct `svn` command + arg_svn_password = [ "--password", password ] # Only used for direct `svn` commands + arg_svn_remote_repo_code_root = [ svn_remote_repo_code_root ] + arg_svn_username = [ "--username", username ] + + ## Define commands + # One offs in the new array + # Reused one in their own arrays above, even if they're single element arrays + cmd_git_authors_file = arg_git_cfg + [ "svn.authorsfile", authors_file_path ] + cmd_git_authors_prog = arg_git_cfg + [ "svn.authorsProg", authors_prog_path ] + cmd_git_bare_clone = arg_git_cfg + [ "core.bare", "true" ] + cmd_git_default_branch = arg_git_cfg + [ "--global", "init.defaultBranch", git_default_branch ] # Possibility of collisions if multiple of these are run overlapping, make sure it's quick between reading and using this + cmd_git_get_batch_end_revision = arg_git_cfg + [ "--get" ] + arg_batch_end_revision + cmd_git_get_svn_url = arg_git_cfg + [ "--get", "svn-remote.svn.url" ] + cmd_git_set_batch_end_revision = arg_git_cfg + [ "--replace-all" ] + 
arg_batch_end_revision + cmd_git_svn_fetch = arg_git_svn + [ "fetch" ] + cmd_git_svn_init = arg_git_svn + [ "init" ] + arg_svn_remote_repo_code_root + cmd_svn_info = [ "svn", "info" ] + arg_svn_non_interactive + arg_svn_remote_repo_code_root + cmd_svn_log = [ "svn", "log", "--xml", "--with-no-revprops" ] + arg_svn_non_interactive + arg_svn_remote_repo_code_root + + ## Modify commands based on config parameters + if username: + cmd_svn_info += arg_svn_username + cmd_svn_log += arg_svn_username + cmd_git_svn_init += arg_svn_username + cmd_git_svn_fetch += arg_svn_username + + if password: + arg_svn_echo_password = True + cmd_svn_info += arg_svn_password + cmd_svn_log += arg_svn_password + + # States + # Create: + # State: + # The directory doesn't already exist + # The repo doesn't already exist + # How did we get here: + # First time - Create new path / repo / fetch job + # First run of the script + # New repo was added to the repos-to-convert.yaml file + # Repo was deleted from disk + # Approach: + # Harder to test for the negative, so assume we're in the Create state, unless we find we're in the Running or Update states + repo_state = "create" + # Running: + # State: + # An svn fetch process is still running + # How did we get here: + # Fetch process is still running from a previous run of the script + # Approach: + # Check first if the process is running, then continue this outer loop + # Update: + # State: + # Repo already exists, with a valid configuration + # How did we get here: + # A fetch job was previously run, but is not currently running + # Approach: + # Check if we're in the update state, then set repo_state = "update" + # repo_state = "update" + + + ## Check if we're in the Running state + + # Check if a fetch or log process is currently running for this repo + try: - try: + # Get running processes, both as a list and string + ps_command = ["ps", "--no-headers", "-e", "--format", "pid,args"] + running_processes = subprocess_run(ps_command)["output"] + 
running_processes_string = " ".join(running_processes) - ps_command = ["ps", "-e", "--format", "%a"] + # Define the list of strings we're looking for in the running processes' commands + cmd_git_svn_fetch_string = " ".join(cmd_git_svn_fetch) + cmd_svn_log_string = " ".join(cmd_svn_log) + process_name = f"clone_svn_repo_{repo_key}" + log_failure_message = "" - running_processes = subprocess_run(ps_command) + # In priority order + concurrency_error_strings_and_messages = [ + (process_name, "previous process still running" ), + (cmd_git_svn_fetch_string, "previous fetching process still running"), + (cmd_svn_log_string, "previous svn log process still running" ), + (local_repo_path, "local repo path in process running" ), + (svn_remote_repo_code_root, "repo url in process running" ), + ] - running_processes_string = " ".join(running_processes) + # Loop through the list of strings we're looking for, to check the running processes for each of them + for concurrency_error_string_and_message in concurrency_error_strings_and_messages: - if cmd_run_git_svn_fetch_string in running_processes_string: + # If this string we're looking for is found + if concurrency_error_string_and_message[0] in running_processes_string: - logging.info(f"{repo_key}; Fetching process already running") - continue + # Find which process it's in + for running_process in running_processes: + pid, args = running_process.lstrip().split(' ', 1) - except Exception as exception: - logging.warning(f"{repo_key}; Failed to check if fetching process is already running, will try to start it. 
Exception: {type(exception)}, {exception.args}, {exception}") + # If it's this process, and this process hasn't already matched one of the previous concurrency errors + if ( + concurrency_error_string_and_message[0] in args and + pid not in log_failure_message + ): - ## Check if we're in the Update state - # Check if the git repo already exists and has the correct settings in the config file - try: + # Add its message to the string + log_failure_message += f"{concurrency_error_string_and_message[1]} in pid {pid}, command {args}, " - svn_remote_url = subprocess_run(cmd_cfg_git_get_svn_url)[0] + # Could use something like this to try and find the process name + # multiprocessing.Process(target=clone_svn_repo, name=f"clone_svn_repo_{repo_key}", args=(repo_key,)).start() + # name=f"clone_svn_repo_{repo_key}" shows up in exception stack traces, need to figure out where to find it + # try: - if svn_remote_url in svn_repo_code_root: + # process_name = psutil.Process(int(pid)).name() - repo_state = "update" + # if process_name: + # log_failure_message += f"process name: {process_name}, " - except TypeError as exception: - # Get an error when trying to git config --get svn-remote.svn.url, when the directory doesn't exist on disk - # WARNING; karaf; failed to check git config --get svn-remote.svn.url. Exception: , ("'NoneType' object is not subscriptable",), 'NoneType' object is not subscriptable - pass + # except psutil.NoSuchProcess as exception: + # pass - except Exception as exception: - logging.warning(f"{repo_key}; failed to check git config --get svn-remote.svn.url. 
Exception: {type(exception)}, {exception.args}, {exception}") + if log_failure_message: + logging.info(f"{repo_key}; {log_failure_message}skipping") + return + except FileNotFoundError as exception: + # FileNotFoundError: [Errno 2] No such file or directory: '/proc/69/cwd' + # Running ps can fail trying to find process data in the proc table + # Safe to ignore in this case + pass - ## Run commands - # Run the svn info command to test logging in to the SVN server, for network connectivity and credentials - # Capture the output so we know the max revision in this repo's history - svn_info = subprocess_run(cmd_run_svn_info, password, arg_svn_echo_password) - svn_info_string = " ".join(svn_info) + except Exception as exception: - # Get last changed revision for this repo - last_changed_rev_string = "Last Changed Rev: " - if last_changed_rev_string in svn_info_string: - last_changed_rev = svn_info_string.split(last_changed_rev_string)[1].split(" ")[0] + # repo-converter | 2024-03-14 10:46:41; run.py; WARNING; ant; Failed to check if fetching process is already running, will try to start it. + # Exception: , (2, 'No such file or directory'), [Errno 2] No such file or directory: '/proc/16/cwd' + logging.warning(f"{repo_key}; Failed to check if fetching process is already running, will try to start it. 
Exception: {type(exception)}, {exception.args}, {exception}") - # Check if the previous batch end revision is the same as the last changed rev from svn info - # If yes, we're up to date, continue to the next repo, instead of forking the git svn process to do the same check - if repo_state == "update": + stack = traceback.extract_stack() + (filename, line, procname, text) = stack[-1] - # TypeError: 'NoneType' object is not subscriptable - try: - previous_batch_end_revision = subprocess_run(cmd_cfg_git_get_batch_end_revision)[0] - except Exception as exception: - previous_batch_end_revision = "1" + logging.warning(f"filename, line, procname, text: {filename, line, procname, text}") - if previous_batch_end_revision == last_changed_rev: + raise exception - logging.info(f"{repo_key}; local rev {previous_batch_end_revision}, remote rev {last_changed_rev}, local clone is up to date, skipping it") - continue + ## Check if we're in the Update state + # Check if the git repo already exists and has the correct settings in the config file + try: - else: + svn_remote_url = subprocess_run(cmd_git_get_svn_url, quiet=True)["output"][0] - cmd_run_svn_log_remaining_revs = cmd_run_svn_log + ["--revision", f"{previous_batch_end_revision}:HEAD"] - svn_log_remaining_revs = subprocess_run(cmd_run_svn_log_remaining_revs, password, arg_svn_echo_password) - svn_log_remaining_revs_string = " ".join(svn_log_remaining_revs) - remaining_revs = svn_log_remaining_revs_string.count("revision=") + if svn_remote_url in svn_remote_repo_code_root: - logging.info(f"{repo_key}; local rev {previous_batch_end_revision}, remote rev {last_changed_rev}, {remaining_revs} revs remaining to catch up, fetching next batch of commits") + repo_state = "update" + except TypeError as exception: + # Get an error when trying to git config --get svn-remote.svn.url, when the directory doesn't exist on disk + # WARNING; karaf; failed to check git config --get svn-remote.svn.url. 
Exception: , ("'NoneType' object is not subscriptable",), 'NoneType' object is not subscriptable + logging.warning(f"{repo_key}; failed to check git config --get svn-remote.svn.url. Exception: {type(exception)}, {exception.args}, {exception}") - if repo_state == "create": + except Exception as exception: + logging.warning(f"{repo_key}; failed to check git config --get svn-remote.svn.url. Exception: {type(exception)}, {exception.args}, {exception}") - logging.info(f"{repo_key}; didn't find a local clone, creating one") - # Create the repo path if it doesn't exist - if not os.path.exists(repo_path): - os.makedirs(repo_path) + ## Run commands + # Run the svn info command to test logging in to the SVN server, for network connectivity and credentials + # Capture the output so we know the max revision in this repo's history + svn_info = subprocess_run(cmd_svn_info, password, arg_svn_echo_password) + svn_info_output_string = " ".join(svn_info["output"]) - # Set the default branch before init - subprocess_run(cmd_cfg_git_default_branch) + if svn_info["returncode"] != 0: - if layout: - cmd_run_git_svn_init += ["--stdlayout"] + # The clone_svn_repo function runs in its own process, get this process' start time + # Check for current time < start time + REPO_CONVERTER_INTERVAL_SECONDS, to ensure that this retry is not extending beyond the interval + process_create_time = psutil.Process(os.getpid()).create_time() - # Warn the user if they provided an invalid value for the layout, only standard is supported - if "standard" not in layout and "std" not in layout: - logging.warning(f"{repo_key}; Layout shortcut provided with incorrect value {layout}, only standard is supported for the shortcut, continuing assuming standard, otherwise provide --trunk, --tags, and --branches") + # Let it retry up to 80% of the time remaining until the next REPO_CONVERTER_INTERVAL_SECONDS, so this process doesn't overrun the current interval + retry_time_limit = process_create_time +
int(environment_variables_dict["REPO_CONVERTER_INTERVAL_SECONDS"] * 0.8) - if trunk: - cmd_run_git_svn_init += ["--trunk", trunk] - if tags: - cmd_run_git_svn_init += ["--tags", tags] - if branches: - cmd_run_git_svn_init += ["--branches", branches] + # Set a maximum retry delay of one third of REPO_CONVERTER_INTERVAL_SECONDS, so it can retry 2-3 times, or 60 seconds, whichever is shorter + retry_delay_range = min(int(environment_variables_dict["REPO_CONVERTER_INTERVAL_SECONDS"] / 3), 60) - # Initialize the repo - subprocess_run(cmd_run_git_svn_init, password, arg_svn_echo_password) + # Set a maximum number of retries per script run + retry_attempts_max = 3 + retries_attempted = 0 - # Initialize this config with a 0 value - cmd_cfg_git_set_batch_end_revision.append(str(0)) - subprocess_run(cmd_cfg_git_set_batch_end_revision) + svn_connection_failure_message_to_check_for = "Unable to connect to a repository at" - # Configure the bare clone - # Testing without the bare clone to see if branching works easier - # and because I forget why a bare clone was needed - # subprocess_run(cmd_cfg_git_bare_clone) + while ( + svn_connection_failure_message_to_check_for in svn_info_output_string and + time.time() < retry_time_limit and + retries_attempted < retry_attempts_max + ): + retries_attempted += 1 + retry_delay_seconds = random.randrange(1, retry_delay_range) - ## Back to steps we do for both Create and Update states, so users can update the below parameters without having to restart the clone from scratch - # TODO: Check if these configs are already set the same before trying to set them + logging.warning(f"{repo_key}; Failed to connect to repo remote, retrying {retries_attempted} of max {retry_attempts_max} times, with a semi-random delay of {retry_delay_seconds} seconds") - # Configure the authors file, if provided - if authors_file_path: - if os.path.exists(authors_file_path): - subprocess_run(cmd_cfg_git_authors_file) - else: - logging.warning(f"{repo_key}; authors 
file not found at {authors_file_path}, skipping configuring it") + time.sleep(retry_delay_seconds) - # Configure the authors program, if provided - if authors_prog_path: - if os.path.exists(authors_prog_path): - subprocess_run(cmd_cfg_git_authors_prog) - else: - logging.warning(f"{repo_key}; authors prog not found at {authors_prog_path}, skipping configuring it") + svn_info = subprocess_run(cmd_svn_info, password, arg_svn_echo_password) + svn_info_output_string = " ".join(svn_info["output"]) - # Configure the .gitignore file, if provided - if git_ignore_file_path: - if os.path.exists(git_ignore_file_path): - shutil.copy2(git_ignore_file_path, repo_path) - else: - logging.warning(f"{repo_key}; .gitignore file not found at {git_ignore_file_path}, skipping configuring it") + if svn_info["returncode"] != 0: - # If the user has configured a batch size - if fetch_batch_size: + log_failure_message = "" - try: + if retries_attempted == retry_attempts_max: + log_failure_message = f"hit retry count limit {retry_attempts_max} for this run" + + elif not time.time() < retry_time_limit: + log_failure_message = f"hit retry time limit for this run" + + logging.error(f"{repo_key}; Failed to connect to repo remote, {log_failure_message}, skipping") + return + + else: + + logging.warning(f"{repo_key}; Successfully connected to repo remote after {retries_attempted} retries") + + # Get last changed revision for this repo + last_changed_rev = svn_info_output_string.split("Last Changed Rev: ")[1].split(" ")[0] - batch_start_revision = None - batch_end_revision = None + # Check if the previous batch end revision is the same as the last changed rev from svn info + # If yes, we're up to date, return to the next repo, instead of forking the git svn process to do the same check + if repo_state == "update": - # Get the revision number to start with - if repo_state == "update": + # TypeError: 'NoneType' object is not subscriptable + try: + previous_batch_end_revision = 
subprocess_run(cmd_git_get_batch_end_revision)["output"][0] + except Exception as exception: + previous_batch_end_revision = "1" + + if previous_batch_end_revision == last_changed_rev: + + logging.info(f"{repo_key}; up to date, skipping; local rev {previous_batch_end_revision}, remote rev {last_changed_rev}") + return + + else: + + cmd_svn_log_remaining_revs = cmd_svn_log + ["--revision", f"{previous_batch_end_revision}:HEAD"] + svn_log_remaining_revs = subprocess_run(cmd_svn_log_remaining_revs, password, arg_svn_echo_password)["output"] + svn_log_remaining_revs_string = " ".join(svn_log_remaining_revs) + remaining_revs = svn_log_remaining_revs_string.count("revision=") + logging.info(f"{repo_key}; out of date; local rev {previous_batch_end_revision}, remote rev {last_changed_rev}, {remaining_revs} revs remaining to catch up, fetching next batch of {min(remaining_revs,fetch_batch_size)} revisions") + + + if repo_state == "create": - # Try to retrieve repo-converter.batch-end-revision from git config - # previous_batch_end_revision = git config --get repo-converter.batch-end-revision - # Need to fail gracefully - previous_batch_end_revision = subprocess_run(cmd_cfg_git_get_batch_end_revision) + logging.info(f"{repo_key}; didn't find a local clone, creating one") - if previous_batch_end_revision: + # Create the repo path if it doesn't exist + if not os.path.exists(local_repo_path): + os.makedirs(local_repo_path) - batch_start_revision = int(" ".join(previous_batch_end_revision)) + 1 + # Set the default branch before init + subprocess_run(cmd_git_default_branch) - if repo_state == "create" or batch_start_revision == None: + if layout: + cmd_git_svn_init += ["--stdlayout"] - # If this is a new repo, get the first changed revision number for this repo from the svn server log - cmd_run_svn_log_batch_start_revision = cmd_run_svn_log + ["--limit", "1", "--revision", "1:HEAD"] - svn_log_batch_start_revision = subprocess_run(cmd_run_svn_log_batch_start_revision, password, 
arg_svn_echo_password) - batch_start_revision = int(" ".join(svn_log_batch_start_revision).split("revision=\"")[1].split("\"")[0]) + # Warn the user if they provided an invalid value for the layout, only standard is supported + if "standard" not in layout and "std" not in layout: + logging.warning(f"{repo_key}; Layout shortcut provided with incorrect value {layout}, only standard is supported for the shortcut, continuing assuming standard, otherwise provide --trunk, --tags, and --branches") - if batch_start_revision: + if trunk: + cmd_git_svn_init += ["--trunk", trunk] + if tags: + cmd_git_svn_init += ["--tags", tags] + if branches: + cmd_git_svn_init += ["--branches", branches] - # Get the batch size'th revision number for the rev to end this batch range - cmd_run_svn_log_batch_end_revision = cmd_run_svn_log + ["--limit", str(fetch_batch_size), "--revision", f"{batch_start_revision}:HEAD"] - cmd_run_svn_log_batch_end_revision_output = subprocess_run(cmd_run_svn_log_batch_end_revision, password, arg_svn_echo_password) + # Initialize the repo + subprocess_run(cmd_git_svn_init, password, arg_svn_echo_password) - try: + # Initialize this config with a 0 value + cmd_git_set_batch_end_revision.append(str(0)) + subprocess_run(cmd_git_set_batch_end_revision) - # While we're at it, update the batch starting rev to the first real rev number after the previous end rev +1 - batch_start_revision = int(" ".join(cmd_run_svn_log_batch_end_revision_output).split("revision=\"")[1].split("\"")[0]) + # Configure the bare clone + # Testing without the bare clone to see if branching works easier + # and because I forget why a bare clone was needed + # subprocess_run(cmd_git_bare_clone) - # Reverse the output so we can get the last revision number - cmd_run_svn_log_batch_end_revision_output.reverse() - batch_end_revision = int(" ".join(cmd_run_svn_log_batch_end_revision_output).split("revision=\"")[1].split("\"")[0]) - except IndexError as exception: - logging.warning(f"{repo_key}; 
IndexError when getting batch start or end revisions for batch size {fetch_batch_size}; running the fetch without the batch size limit; exception: {type(exception)}, {exception.args}, {exception}") + ## Back to steps we do for both Create and Update states, so users can update the below parameters without having to restart the clone from scratch + # TODO: Check if these configs are already set the same before trying to set them + # Configure the authors file, if provided + if authors_file_path: + if os.path.exists(authors_file_path): + subprocess_run(cmd_git_authors_file) + else: + logging.warning(f"{repo_key}; authors file not found at {authors_file_path}, skipping configuring it") + + # Configure the authors program, if provided + if authors_prog_path: + if os.path.exists(authors_prog_path): + subprocess_run(cmd_git_authors_prog) + else: + logging.warning(f"{repo_key}; authors prog not found at {authors_prog_path}, skipping configuring it") + + # Configure the .gitignore file, if provided + if git_ignore_file_path: + if os.path.exists(git_ignore_file_path): + shutil.copy2(git_ignore_file_path, local_repo_path) + else: + logging.warning(f"{repo_key}; .gitignore file not found at {git_ignore_file_path}, skipping configuring it") - if batch_start_revision and batch_end_revision: + # Batch processing + batch_start_revision = None + batch_end_revision = None + + try: + + # Get the revision number to start with + if repo_state == "update": - # If we were successful getting both starting and ending revision numbers, then use them - cmd_run_git_svn_fetch += ["--revision", f"{batch_start_revision}:{batch_end_revision}"] + # Try to retrieve repo-converter.batch-end-revision from git config + # previous_batch_end_revision = git config --get repo-converter.batch-end-revision + # Need to fail gracefully + previous_batch_end_revision = subprocess_run(cmd_git_get_batch_end_revision)["output"] - # Store the ending revision number, hoping that this batch completes successfully, 
as these revs won't be retried - cmd_cfg_git_set_batch_end_revision.append(str(batch_end_revision)) - subprocess_run(cmd_cfg_git_set_batch_end_revision) + if previous_batch_end_revision: - except Exception as exception: + batch_start_revision = int(" ".join(previous_batch_end_revision)) + 1 - # Log a warning if this fails, and run the fetch without the --revision arg - logging.warning(f"{repo_key}; failed to get batch start or end revision for batch size {fetch_batch_size}; running the fetch without the batch size limit; exception: {type(exception)}, {exception.args}, {exception}") + if repo_state == "create" or batch_start_revision == None: - # Start the fetch - cmd_run_git_svn_fetch_string_may_have_batch_range = ' '.join(cmd_run_git_svn_fetch) - logging.info(f"{repo_key}; fetching with {cmd_run_git_svn_fetch_string_may_have_batch_range}") - multiprocessing.Process(target=subprocess_run, name=f"subprocess_run({cmd_run_git_svn_fetch})", args=(cmd_run_git_svn_fetch, password, password)).start() + # If this is a new repo, get the first changed revision number for this repo from the svn server log + cmd_svn_log_batch_start_revision = cmd_svn_log + ["--limit", "1", "--revision", "1:HEAD"] + svn_log_batch_start_revision = subprocess_run(cmd_svn_log_batch_start_revision, password, arg_svn_echo_password)["output"] + batch_start_revision = int(" ".join(svn_log_batch_start_revision).split("revision=\"")[1].split("\"")[0]) + + # Get the revision number to end with + if batch_start_revision: + + # Get the batch size'th revision number for the rev to end this batch range + cmd_svn_log_batch_end_revision = cmd_svn_log + ["--limit", str(fetch_batch_size), "--revision", f"{batch_start_revision}:HEAD"] + cmd_svn_log_batch_end_revision_output = subprocess_run(cmd_svn_log_batch_end_revision, password, arg_svn_echo_password)["output"] + + try: + + # While we're at it, update the batch starting rev to the first real rev number after the previous end rev +1 + batch_start_revision = 
int(" ".join(cmd_svn_log_batch_end_revision_output).split("revision=\"")[1].split("\"")[0]) + + # Reverse the output so we can get the last revision number + cmd_svn_log_batch_end_revision_output.reverse() + batch_end_revision = int(" ".join(cmd_svn_log_batch_end_revision_output).split("revision=\"")[1].split("\"")[0]) + + except IndexError as exception: + logging.warning(f"{repo_key}; IndexError when getting batch start or end revisions for batch size {fetch_batch_size}; running the fetch without the batch size limit; exception: {type(exception)}, {exception.args}, {exception}") + # , ('list index out of range',), list index out of range + # Need to handle the issue where revisions seem to be out of order on the server + + + # If we were successful getting both starting and ending revision numbers + if batch_start_revision and batch_end_revision: + + # Use them + cmd_git_svn_fetch += ["--revision", f"{batch_start_revision}:{batch_end_revision}"] + + except Exception as exception: + + # Log a warning if this fails, and run the fetch without the --revision arg + logging.warning(f"{repo_key}; failed to get batch start or end revision for batch size {fetch_batch_size}; running the fetch without the batch size limit; exception: {type(exception)}, {exception.args}, {exception}") + + # Start the fetch + cmd_git_svn_fetch_string_may_have_batch_range = ' '.join(cmd_git_svn_fetch) + logging.info(f"{repo_key}; fetching with {cmd_git_svn_fetch_string_may_have_batch_range}") + git_svn_fetch_result = subprocess_run(cmd_git_svn_fetch, password, password) + + # If the fetch succeed, and if we have a batch_end_revision + if git_svn_fetch_result["returncode"] == 0 and batch_end_revision: + + # Store the ending revision number + cmd_git_set_batch_end_revision.append(str(batch_end_revision)) + subprocess_run(cmd_git_set_batch_end_revision) + + +def clean_remote_branches(): + + cmd_git_get_branches = ["git", "branch", "-ra"] + subprocess_run(cmd_git_get_branches) + +# # From 
https://github.com/cjwilburn/svn-migration-scripts/blob/5c50dddf7f2d7c0bf6971985f6b1f018821732b4/src/main/scala/Branches.scala#L28C1-L52C8 +# git.forEachRefFull("refs/remotes/") +# .filterNot(_ startsWith "refs/remotes/tags") +# .foreach { +# branch_ref => +# // delete the "refs/remotes/" prefix +# val branch = branch_ref stripPrefix "refs/remotes/" + +# // create a local branch ref only if it's not trunk (which is already mapped to master) +# // and if it is not an intermediate branch (ex: foo@42) +# if (branch != "trunk" && !git.isIntermediateRef(branch)) { +# println("Creating the local branch '%s' for Subversion branch '%s'.".format(branch, branch_ref)) +# if (options.shouldCreate) { +# git("git", "branch", "-f", branch, branch_ref).! +# if (branch.length > 120) { +# printerr("WARNING: Branch %s is too long and cannot be tracked" format (branch)) +# } else { +# // Since Git 1.8.4 you can't track non-remote refs +# // https://github.com/git/git/commit/41c21f22d0fc06f1f22489621980396aea9f7a62 +# // Manually add the tracking to be used by SyncRebase +# // Note that the branch and merge can be different due to us 'cleaning' the names in fixNames() +# git("git", "config", "branch." + branch + ".merge", branch_ref) ! 
+# } +# } +# } +# } def redact_password(args, password=None): @@ -548,11 +708,12 @@ def redact_password(args, password=None): return args_without_password -def subprocess_run(args, password=None, echo_password=None): +def subprocess_run(args, password=None, echo_password=None, quiet=False): + return_dict = {} + return_dict["returncode"] = 1 + return_dict["output"] = None subprocess_output_to_log = None - subprocess_stdout_to_return = None - subprocess_stderr_to_check = None try: @@ -584,8 +745,11 @@ def subprocess_run(args, password=None, echo_password=None): subprocess_output = subprocess_to_run.communicate() # Redact password from output for logging - subprocess_output_to_log = redact_password(subprocess_output[0].splitlines(), password) - subprocess_output_to_log_backup = subprocess_output_to_log.copy() + subprocess_output = subprocess_output[0].splitlines() + subprocess_output_to_log = redact_password(subprocess_output, password) + + # Set the output to return + return_dict["output"] = subprocess_output # If the output is longer than max_output_total_characters, it's probably just a list of all files converted, so truncate it max_output_total_characters = 1000 @@ -607,27 +771,34 @@ def subprocess_run(args, password=None, echo_password=None): # If the process exited successfully if subprocess_to_run.returncode == 0: - # Assign the function return value, so the function doesn't return None - # Create a separate list for the stdout content, so we can log a truncated version, without changing the content for the return value - subprocess_stdout_to_return = subprocess_output_to_log_backup.copy() + return_dict["returncode"] = 0 status_message = "succeeded" print_process_status(process_dict, status_message, subprocess_output_to_log) else: - subprocess_stderr_to_check = subprocess_output_to_log - status_message = "failed" - print_process_status(process_dict, status_message, subprocess_stderr_to_check, log_level = logging.ERROR) + + log_level = logging.ERROR + + if 
quiet: + log_level = logging.DEBUG + + print_process_status(process_dict, status_message, subprocess_output_to_log, log_level=log_level) except subprocess.CalledProcessError as exception: status_message = f"raised an exception: {type(exception)}, {exception.args}, {exception}" - print_process_status(process_dict, status_message, subprocess_output_to_log, log_level = logging.ERROR) + log_level = logging.ERROR - if subprocess_stderr_to_check: + if quiet: + log_level = logging.DEBUG + + print_process_status(process_dict, status_message, subprocess_output_to_log, log_level=log_level) + + if subprocess_to_run.returncode != 0: # May need to make this more generic Git for all repo conversions # Handle the case of abandoned git svn lock files blocking fetch processes @@ -636,7 +807,7 @@ def subprocess_run(args, password=None, echo_password=None): lock_file_error_strings = ["Unable to create", "index.lock", "File exists"] # Handle this as a string, - stderr_without_password_string = " ".join(subprocess_stderr_to_check) + stderr_without_password_string = " ".join(subprocess_output_to_log) lock_file_error_conditions = (lock_file_error_string in stderr_without_password_string for lock_file_error_string in lock_file_error_strings) if all(lock_file_error_conditions): @@ -656,7 +827,7 @@ def subprocess_run(args, password=None, echo_password=None): except ValueError as exception: logging.error(f"Failed to find git execution path in command args while trying to delete {lock_file_path} with exception: {type(exception)}, {exception.args}, {exception}") - return subprocess_stdout_to_return + return return_dict def clone_tfs_repos(): @@ -696,19 +867,27 @@ def status_update_and_cleanup_zombie_processes(): # Loop through for each processes for process in psutil.process_iter(): - # Get all upstream parent PIDs of the process - process_parents_pids = [process_parent.pid for process_parent in process.parents()] + # The process may finish in the time between .process_iter() and .parents() + 
try: - # If this pid is in the parents, then we know its a child / grandchild / great-grandchild / etc. process of this process - if os_this_pid in process_parents_pids: + # Get all upstream parent PIDs of the process + # Caught a process doesn't exist exception here, could see if it could be handled + process_parents_pids = [process_parent.pid for process_parent in process.parents()] - # Add the process' own PID to the set - process_pids_to_wait_for.add(process.pid) + # If this pid is in the parents, then we know its a child / grandchild / great-grandchild / etc. process of this process + if os_this_pid in process_parents_pids: - # Loop through the process' parents and add them to the set too - for process_parents_pid in process_parents_pids: + # Add the process' own PID to the set + process_pids_to_wait_for.add(process.pid) - process_pids_to_wait_for.add(process_parents_pid) + # Loop through the process' parents and add them to the set too + for process_parents_pid in process_parents_pids: + + process_pids_to_wait_for.add(process_parents_pid) + + except psutil.NoSuchProcess as exception: + + logging.debug(f"Caught an exception when listing parents of processes: {exception}") # Remove this script's PID so it's not waiting on itself process_pids_to_wait_for.discard(os_this_pid) @@ -721,6 +900,7 @@ def status_update_and_cleanup_zombie_processes(): # Raises an exception for process_pid_to_wait_for in process_pids_to_wait_for: + process_dict = {} status_message = "" process_to_wait_for = None @@ -730,13 +910,13 @@ def status_update_and_cleanup_zombie_processes(): # Raises psutil.NoSuchProcess if the PID has already finished process_to_wait_for = psutil.Process(process_pid_to_wait_for) + # Get the process attributes from the OS + process_dict = process_to_wait_for.as_dict() + # This rarely fires, ex. 
if cleaning up processes at the beginning of a script execution and the process finished during the interval if process_to_wait_for.status() == psutil.STATUS_ZOMBIE: status_message = "is a zombie" - # Get the process attributes from the OS - process_dict = process_to_wait_for.as_dict() - # Wait a short period, and capture the return status # Raises psutil.TimeoutExpired if the process is busy executing longer than the wait time return_status = process_to_wait_for.wait(0.1) @@ -751,9 +931,10 @@ def status_update_and_cleanup_zombie_processes(): except Exception as exception: status_message = f"raised an exception while waiting: {type(exception)}, {exception.args}, {exception}" - finally: + if "pid" not in process_dict.keys(): + process_dict["pid"] = process_pid_to_wait_for - print_process_status(process_dict, status_message) + print_process_status(process_dict, status_message) def print_process_status(process_dict = {}, status_message = "", std_out = "", log_level = logging.DEBUG): @@ -773,14 +954,13 @@ def print_process_status(process_dict = {}, status_message = "", std_out = "", l process_dict_to_log = {key: process_dict[key] for key in process_attributes_to_log if key in process_dict} - # Calculate the running clock time - process_clock_time_seconds = time.time() - process_dict['create_time'] - process_clock_time_formatted = time.strftime("%H:%M:%S", time.localtime(process_clock_time_seconds)) - # Formulate the log message log_message = f"pid {process_dict['pid']}; {status_message}" - if status_message != "started": + if status_message != "started" and "create_time" in process_dict.keys(): + + process_clock_time_seconds = time.time() - process_dict["create_time"] + process_clock_time_formatted = time.strftime("%H:%M:%S", time.localtime(process_clock_time_seconds)) log_message += f"; clock time {process_clock_time_formatted}" if std_out: @@ -791,10 +971,10 @@ def print_process_status(process_dict = {}, status_message = "", std_out = "", l except 
psutil.NoSuchProcess as exception: log_message = f"pid {process_dict['pid']}; finished on status check" - except Exception as exception: - log_level = logging.ERROR - exception_string = " ".join(traceback.format_exception(exception)).replace("\n", " ") - log_message = f"Exception raised while checking process status. Exception: {exception_string}" + # except Exception as exception: + # log_level = logging.ERROR + # exception_string = " ".join(traceback.format_exception(exception)).replace("\n", " ") + # log_message = f"Exception raised while checking process status. Exception: {exception_string}" finally: # Log the message @@ -832,7 +1012,7 @@ def main(): logging.info(f"Sleeping for REPO_CONVERTER_INTERVAL_SECONDS={environment_variables_dict['REPO_CONVERTER_INTERVAL_SECONDS']} seconds") script_run_number += 1 - time.sleep(environment_variables_dict['REPO_CONVERTER_INTERVAL_SECONDS']) + time.sleep(environment_variables_dict["REPO_CONVERTER_INTERVAL_SECONDS"]) if __name__ == "__main__":