diff --git a/accession-monitoring/__init__.py b/accession-monitoring/__init__.py new file mode 100644 index 00000000..1fb673ee --- /dev/null +++ b/accession-monitoring/__init__.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python3 + +# Copyright 2020 EMBL - European Bioinformatics Institute +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import configparser +import logging +import os +import subprocess +import sys + + +def init_logger(): + logging.basicConfig(stream=sys.stdout, level=logging.INFO, format='%(asctime)-15s %(levelname)s %(message)s') + result_logger = logging.getLogger(__name__) + return result_logger + + +def get_args_from_properties_file(properties_file): + parser = configparser.ConfigParser() + parser.optionxform = str + + with open(properties_file, "r") as properties_file_handle: + # Dummy section is needed because + # ConfigParser is not clever enough to read config files without section headers + properties_section_name = "pipeline_properties" + properties_string = '[{0}]\n{1}'.format(properties_section_name, properties_file_handle.read()) + parser.read_string(properties_string) + config = dict(parser.items(section=properties_section_name)) + return config + + +def get_mongo_connection_details_from_properties_file(properties_file): + properties_file_args = get_args_from_properties_file(properties_file) + mongo_connection_properties = {"mongo_host": properties_file_args["spring.data.mongodb.host"], + "mongo_port": properties_file_args["spring.data.mongodb.port"], + "mongo_db": properties_file_args["spring.data.mongodb.database"], + "mongo_username": properties_file_args["spring.data.mongodb.username"], + "mongo_password": properties_file_args["spring.data.mongodb.password"], + "mongo_auth_db": properties_file_args["spring.data.mongodb.authentication-database"]} + return mongo_connection_properties + + +def run_command_with_output(command_description, command, return_process_output=False): + process_output = "" + + logger.info("Starting process: " + command_description) + logger.info("Running command: " + command) + + with subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=1, universal_newlines=True, + shell=True) as process: + for line in iter(process.stdout.readline, ''): + line = str(line).rstrip() + logger.info(line) + if return_process_output: + process_output += line + for line in iter(process.stderr.readline, ''): + line = str(line).rstrip() + logger.error(line) + if process.returncode != 0: + logger.error(command_description + " failed! Refer to the error messages for details.") + raise subprocess.CalledProcessError(process.returncode, process.args) + else: + logger.info(command_description + " - completed successfully") + if return_process_output: + return process_output + + +logger = init_logger() diff --git a/accession-monitoring/monitor_duplicate_accessions.py b/accession-monitoring/monitor_duplicate_accessions.py new file mode 100644 index 00000000..0008b968 --- /dev/null +++ b/accession-monitoring/monitor_duplicate_accessions.py @@ -0,0 +1,114 @@ +#!/usr/bin/env python3 + +# Copyright 2020 EMBL - European Bioinformatics Institute +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import click +from datetime import datetime +import getpass +import smtplib +from __init__ import * +from urllib.parse import quote_plus + + +def get_mongo_uri(mongo_connection_properties): + return "mongodb://{0}:{1}@{2}/{3}?authSource={4}".format(mongo_connection_properties["mongo_username"], + quote_plus(mongo_connection_properties["mongo_password"]), + mongo_connection_properties["mongo_host"], + mongo_connection_properties["mongo_db"], + mongo_connection_properties["mongo_auth_db"]) + + +def export_mongo_accessions(mongo_connection_properties, collection_name, export_output_filename): + export_command = 'mongoexport --uri {0} ' \ + '--collection {1} --type=csv --fields accession ' \ + "--query '{{\"remappedFrom\": {{\"$exists\": false}}}}' " \ + '-o "{2}" 2>&1' \ + .format(get_mongo_uri(mongo_connection_properties), collection_name, export_output_filename) + run_command_with_output("Exporting accessions in the {0} collection in the {1} database at {2}..." + .format(collection_name, mongo_connection_properties["mongo_db"], + mongo_connection_properties["mongo_host"]), export_command) + + +def notify_by_email(mongo_connection_properties, collection_name, duplicates_output_filename, + number_of_duplicate_accessions, email_recipients): + error_message = "{0} DUPLICATE ACCESSIONS !!! in the {1} collection in the {2} database at {3}"\ + .format(number_of_duplicate_accessions, collection_name, mongo_connection_properties["mongo_db"], + mongo_connection_properties["mongo_host"]) + logger.error(error_message) + email_message = "Subject: {0}\n\n" \ + "Please see {1} for the list of duplicates.".format(error_message, duplicates_output_filename) + smtplib.SMTP('localhost').sendmail(getpass.getuser(), email_recipients, email_message) + + +def report_duplicates_in_exported_accessions_file(mongo_connection_properties, collection_name, export_output_filename, + duplicates_output_filename, email_recipients): + sorted_export_output_filename = export_output_filename.replace(".csv", "_sorted.csv") + run_command_with_output("Sorting {0}...".format(duplicates_output_filename), + 'sort -S 4G -T {0} -o "{1}" "{2}"' + .format(os.path.dirname(export_output_filename), sorted_export_output_filename, + export_output_filename)) + run_command_with_output("Exporting duplicates to {0}...".format(duplicates_output_filename), + 'uniq -d "{0}" > {1}'.format(sorted_export_output_filename, duplicates_output_filename)) + number_of_duplicate_accessions = run_command_with_output("Find duplicate accessions in the exported file...", + 'wc -l < "{0}"'.format(duplicates_output_filename), + return_process_output=True) + if int(number_of_duplicate_accessions) > 0: + notify_by_email(mongo_connection_properties, collection_name, duplicates_output_filename, + number_of_duplicate_accessions, email_recipients) + return 1 + else: + logger.info("NO duplicate accessions were found in the {0} collection in the {1} database at {2}..." + .format(collection_name, mongo_connection_properties["mongo_db"], + mongo_connection_properties["mongo_host"])) + return 0 + + +def report_duplicate_accessions_in_mongo(pipeline_properties_file, accessions_export_output_dir, + collection_name, email_recipients): + mongo_connection_properties = get_mongo_connection_details_from_properties_file(pipeline_properties_file) + export_output_filename = os.path.sep.join([accessions_export_output_dir, + "accessions_in_{0}_{1}_at_{2}_as_of_{3}.csv" + .format(mongo_connection_properties["mongo_db"], collection_name, + mongo_connection_properties["mongo_host"], + datetime.today().strftime('%Y%m%d%H%M%S'))]) + duplicates_output_filename = export_output_filename.replace("accessions_in", "duplicate_accessions_in") + + logger.info("Checking duplicate accessions in the {0} collection in the {1} database at {2}..." + .format(collection_name, mongo_connection_properties["mongo_db"], + mongo_connection_properties["mongo_host"])) + + export_mongo_accessions(mongo_connection_properties, collection_name, export_output_filename) + + return report_duplicates_in_exported_accessions_file(mongo_connection_properties, collection_name, + export_output_filename, duplicates_output_filename, + email_recipients) + + +@click.option("-p", "--pipeline-properties-file", required=True) +@click.option("-o", "--accessions-export-output-dir", required=True) +@click.option("-e", "--email-recipients", multiple=True, required=True) +@click.argument("collection-names", nargs=-1, required=True) +@click.command() +def main(pipeline_properties_file, accessions_export_output_dir, email_recipients, collection_names): + exit_code = 0 + for collection_name in collection_names: + exit_code = exit_code or \ + report_duplicate_accessions_in_mongo(pipeline_properties_file, accessions_export_output_dir, + collection_name, email_recipients) + sys.exit(exit_code) + + +if __name__ == '__main__': + main() diff --git a/accession-monitoring/requirements.txt b/accession-monitoring/requirements.txt new file mode 100644 index 00000000..b98f6609 --- /dev/null +++ b/accession-monitoring/requirements.txt @@ -0,0 +1 @@ +click \ No newline at end of file