Ncas image v1.1 #38

Open

wants to merge 111 commits into base: main

Commits (111)

Changes from all commits
fb87ce3
First commit of image reader
joshua-hampton May 17, 2023
4ea4456
this is a test
Aug 7, 2023
493ebb2
Merge pull request #31 from cedadev/main
joshua-hampton Aug 7, 2023
e9ed8dd
_attrs_dict splits the line once at the first :
Aug 7, 2023
f6b688a
Add image reader to parse_file_header
Aug 7, 2023
0415305
Undo test comment
Aug 7, 2023
f2ffc5c
New file for NCAS image
Aug 7, 2023
ad9acf4
New file for NCAS image global attrs
Aug 7, 2023
b7026c7
#24 moving to use global attributes file
Aug 7, 2023
ea79ab9
#23 rename from image to photo
Aug 7, 2023
c3a7d96
#24 new plot file
Aug 7, 2023
2783d2d
#24 new image checks
Aug 8, 2023
0aa4d13
#24 removed plot/photo specific checks
Aug 8, 2023
72009e9
#24 tidying up after chat with Graham and Davey
Aug 8, 2023
aaef23e
#24 new rules for NCAS image
Aug 8, 2023
b2b2f32
# 24 import image reader
Aug 8, 2023
28e09a2
#24 import image reader
Aug 8, 2023
1b982ca
Merge branch 'ncas-image' of https://github.com/cedadev/checksit into…
Aug 8, 2023
fe3a8e4
#24 adapt to key/value format
Aug 9, 2023
8de3455
#24 import datetime
Aug 9, 2023
f33da82
#24 regex edits
Aug 9, 2023
2a77e88
#24 regex brackets correction
Aug 9, 2023
3f20290
#24 regex brackets correction
Aug 9, 2023
a661571
#24 swapping -args for -j due to newline/.
Aug 10, 2023
19acb9c
#24 specifying relation uuid length
Aug 10, 2023
e4c42dd
changing folder name
Aug 10, 2023
f8852d7
#24 adding warnings code
Aug 10, 2023
4ffac7a
#24 tidying up
Aug 10, 2023
a94fb16
#24 warnings for vocab_attrs
Aug 10, 2023
0cf0e53
#24 name warning
Aug 10, 2023
83b01ba
#24 relation warning
Aug 21, 2023
1d456ea
#24 rights warning
Aug 21, 2023
835b052
#24 WebStatement warning
Aug 21, 2023
ca064ee
#24 credit warning
Aug 21, 2023
22d12bc
#24 location must have at least one comma & space
Aug 21, 2023
68e4730
#24 location warning
Aug 21, 2023
edbbf1e
#24 Headline warning function
Aug 21, 2023
067c668
#24 check url exists
Aug 21, 2023
6671191
#24 url check ContributerIdentifier
Aug 21, 2023
7a58a2e
#24 WebStatement valid URL check
Aug 22, 2023
c4fa3d8
#24 relation url valid check
Aug 22, 2023
e4022a6
#24 change url valid checks to warnings
Aug 22, 2023
a38e7f7
#24 space optional in name regex
Aug 22, 2023
d75f6e5
#24 change list of names to a warning
Shanrahan16 Aug 22, 2023
bc010a7
#24 remove WebStatement valid URL check - regex ok
Shanrahan16 Aug 22, 2023
47a70c3
#24 title_check
Shanrahan16 Aug 22, 2023
73dd4f4
#24 compare Title to actual file name
Shanrahan16 Aug 23, 2023
2b61a16
#24 tidying up
Shanrahan16 Aug 23, 2023
d3f6f2f
#24 latitude/longitude range checks
Shanrahan16 Aug 23, 2023
ee27e80
#24 adding possiblility of - within lat/long regex
Shanrahan16 Aug 23, 2023
f9ae14f
#24 tidying up
Shanrahan16 Aug 24, 2023
bf2846d
#24 allow muliple checks for each metadata key
Shanrahan16 Aug 24, 2023
00b9519
#24 changing the yaml file to allow multiple
Shanrahan16 Aug 24, 2023
fb904ee
#24 tidying up
Shanrahan16 Aug 24, 2023
a982333
#24 correcting headline capital letter check
Shanrahan16 Aug 24, 2023
5af1750
#24 fixing title check
Shanrahan16 Aug 24, 2023
f8d32f3
#24 tidying up
Shanrahan16 Aug 24, 2023
deb379e
#24 test images
Shanrahan16 Aug 25, 2023
8e54b9c
#24 title data product warning
Shanrahan16 Aug 30, 2023
0182c3a
#24 error/warning wording
Shanrahan16 Aug 31, 2023
ff96edd
#24 new test images
Shanrahan16 Aug 31, 2023
324df5c
#24 test images
Shanrahan16 Aug 31, 2023
3d83f7e
#35 data_product in title must be 'plot'/'photo'
Shanrahan16 Aug 31, 2023
f7cc17a
#35 instrument controlled vocab check
Shanrahan16 Sep 1, 2023
6f98d3f
#35 platform controlled vocab check
Shanrahan16 Sep 1, 2023
71f1937
#35 adding community instruments to vocab check
Shanrahan16 Sep 1, 2023
622fc13
#24 updating for errors & warnings being returned
Shanrahan16 Sep 4, 2023
d94b392
#24 tidying up
Shanrahan16 Sep 4, 2023
1cec480
#24 resolves list index - warnings for vocab_attrs
Shanrahan16 Sep 5, 2023
986e5d7
#24 addresses key error from inpt
Shanrahan16 Sep 5, 2023
49f92bf
#24 reducing traceback output when filepath wrong
Shanrahan16 Sep 5, 2023
14cd3da
#24 adding requests to requirements
Shanrahan16 Sep 6, 2023
911588f
#24 allow decimal altitudes-warning if not integer
Shanrahan16 Sep 6, 2023
54c3221
#24 location- allowing digits, hyphens, accents...
Shanrahan16 Sep 6, 2023
932903d
#24 ncas email warning
Shanrahan16 Sep 6, 2023
eb21897
#24 valid email error
Shanrahan16 Sep 6, 2023
bf0ef8a
#24 allow apostrophes, hyphens & accents in names
Shanrahan16 Sep 6, 2023
c3edfae
#24 all characters for names
Shanrahan16 Sep 6, 2023
036a815
#24 allow special characters in title
Shanrahan16 Sep 6, 2023
06671da
#24 combining if statements
Shanrahan16 Sep 7, 2023
9f4f201
#24 reorder so functn won't error out if <32 char
Shanrahan16 Sep 7, 2023
48462fe
#24 raise exception if no space in relation
Shanrahan16 Sep 7, 2023
156eafe
#24 changing Python error to a checksit error
Shanrahan16 Sep 7, 2023
f54cbec
#24 tidying up
Shanrahan16 Sep 7, 2023
bbc3f4f
#24 name characters warning
Shanrahan16 Sep 7, 2023
e3f1d3b
#24 renaming name-format regex
Shanrahan16 Sep 7, 2023
1470757
#24 missing comma
Shanrahan16 Sep 7, 2023
ca803c5
#24 name characters separate from format
Shanrahan16 Sep 7, 2023
d6ce055
#24 allow any metadata tags in image reader
Shanrahan16 Sep 7, 2023
4eb3a04
#24 removing tags dictionary as no longer used
Shanrahan16 Sep 7, 2023
4fe1e59
#24 test images
Shanrahan16 Sep 8, 2023
ae016de
#24 fixing relation url error output
Shanrahan16 Sep 8, 2023
c93b664
#24 tidying up
Shanrahan16 Sep 8, 2023
6373fdd
#24 more test images
Shanrahan16 Sep 8, 2023
d771550
#24 stopping empty warning being returned- url
Shanrahan16 Sep 8, 2023
0e8d1b5
#24 tidying up & stop url redirecting
Shanrahan16 Sep 8, 2023
cffa97d
#35 data_product in title must be 'plot'/'photo'
Shanrahan16 Aug 31, 2023
610c1de
rebasing
Shanrahan16 Sep 1, 2023
fba5eb3
#35 platform controlled vocab check
Shanrahan16 Sep 1, 2023
88137d1
#35 adding community instruments to vocab check
Shanrahan16 Sep 1, 2023
943ff0b
Merge branch ncas-image-v1.1 into ncas-image-v1.1
Shanrahan16 Sep 8, 2023
bf7d79a
#35 tidying up
Shanrahan16 Sep 8, 2023
58a9e45
#24 removing url requests.get()
Shanrahan16 Sep 21, 2023
5fe4f4d
#24 tidying up
Shanrahan16 Sep 21, 2023
e4b090d
#35 data_product in title must be 'plot'/'photo'
Shanrahan16 Aug 31, 2023
b052708
rebasing
Shanrahan16 Sep 1, 2023
204c661
#35 platform controlled vocab check
Shanrahan16 Sep 1, 2023
643de89
#35 adding community instruments to vocab check
Shanrahan16 Sep 1, 2023
dab2ad4
merging
Shanrahan16 Sep 1, 2023
4a04ec4
#35 tidying up
Shanrahan16 Sep 8, 2023
bb94386
Merge branch 'ncas-image-v1.1' of https://github.com/cedadev/checksit…
Shanrahan16 Sep 21, 2023
4 changes: 3 additions & 1 deletion checksit/check.py
@@ -10,7 +10,7 @@

from .cvs import vocabs, vocabs_prefix
from .rules import rules, rules_prefix
from .readers import pp, badc_csv, cdl, yml
from .readers import pp, badc_csv, cdl, yml, image
from .specs import SpecificationChecker
from .utils import get_file_base, extension, UNDEFINED
from .config import get_config
@@ -385,6 +385,8 @@ def parse_file_header(self, file_path, auto_cache=False, verbose=False):
            reader = badc_csv
        elif ext in ("yml"):
            reader = yml
        elif ext in ("png", "PNG", "jpg", "JPG", "jpeg", "JPEG"):
            reader = image
        else:
            raise Exception(f"No known reader for file with extension: {ext}")

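For orientation, the reader selection added to parse_file_header can be reproduced in isolation roughly as follows. This is a minimal standalone sketch; the helper name and the example filename are invented for illustration and are not part of the PR.

# Standalone sketch of the extension-based dispatch added above
# (hypothetical helper and filename; the real logic lives in parse_file_header in checksit/check.py).
def select_reader_name(file_path: str) -> str:
    ext = file_path.rsplit(".", 1)[-1]
    if ext in ("png", "PNG", "jpg", "JPG", "jpeg", "JPEG"):
        return "image"
    elif ext == "yml":
        return "yml"
    else:
        raise Exception(f"No known reader for file with extension: {ext}")

print(select_reader_name("example_photo.jpg"))  # prints "image"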
9 changes: 7 additions & 2 deletions checksit/generic.py
@@ -94,7 +94,10 @@ def check_global_attrs(dct, defined_attrs=None, vocab_attrs=None, regex_attrs=No
            errors.append(f"[global-attributes:**************:{attr}]: No value defined for attribute '{attr}'.")
        else:
            errors.extend(vocabs.check(vocab_attrs[attr], dct["global_attributes"].get(attr), label=f"[global-attributes:******:{attr}]***"))

            #vocab_check_output = vocabs.check(vocab_attrs[attr], dct["global_attributes"].get(attr), label=f"[global-attributes:******:{attr}]***")
            #warnings.extend(vocab_check_output[1])
            #errors.extend(vocab_check_output[0])

    for attr in regex_attrs:
        if attr not in dct['global_attributes']:
            errors.append(
@@ -118,7 +121,9 @@ def check_global_attrs(dct, defined_attrs=None, vocab_attrs=No
        elif is_undefined(dct['global_attributes'].get(attr)):
            errors.append(f"[global-attributes:**************:{attr}]: No value defined for attribute '{attr}'.")
        else:
            errors.extend(rules.check(rules_attrs[attr], dct['global_attributes'].get(attr), label=f"[global-attributes:******:{attr}]***"))
            rules_check_output = rules.check(rules_attrs[attr], dct['global_attributes'].get(attr), context=dct['inpt'], label=f"[global-attributes:******:{attr}]***")
            warnings.extend(rules_check_output[1])
            errors.extend(rules_check_output[0])


    return errors, warnings
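The change above assumes that rules.check now returns a pair of lists, (errors, warnings), rather than a single list of errors. A minimal sketch of that consumption pattern, using a stand-in rule function rather than the real checksit.rules module:

# Stand-in for rules.check, illustrating the (errors, warnings) return shape
# that check_global_attrs now unpacks; this is not the actual checksit implementation.
def fake_rules_check(rule, value, context=None, label=""):
    errors, warnings = [], []
    if value is None:
        errors.append(f"{label} no value supplied")
    elif len(str(value)) < 3:
        warnings.append(f"{label} '{value}' looks unusually short")
    return errors, warnings

errors, warnings = [], []
rules_check_output = fake_rules_check("rule-name:example", "ab", context="example.jpg", label="[global-attributes:******:Title]***")
warnings.extend(rules_check_output[1])
errors.extend(rules_check_output[0])
print(errors, warnings)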
7 changes: 5 additions & 2 deletions checksit/readers/cdl.py
@@ -2,6 +2,7 @@
import re
import yaml
import subprocess as sp
import sys

from ..cvs import vocabs, vocabs_prefix

@@ -40,7 +41,8 @@ def _parse(self, inpt):

        for s in self.CDL_SPLITTERS:
            if s not in cdl_lines:
                raise Exception(f"Invalid file or CDL contents provided: '{inpt[:100]}...'")
                print(f"Please check your command - invalid file or CDL contents provided: '{inpt[:100]}...'")
                sys.exit(1)

        sections = self._get_sections(cdl_lines, split_patterns=self.CDL_SPLITTERS, start_at=1)

@@ -188,7 +190,8 @@ def to_yaml(self):
    def to_dict(self):
        return {"dimensions": self.dimensions,
                "variables": self.variables,
                "global_attributes": self.global_attrs}
                "global_attributes": self.global_attrs,
                "inpt": self.inpt}


def read(fpath, verbose=False):
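The extra "inpt" key added to to_dict means downstream rule functions, such as title_check in rule_funcs.py below, can receive the original file path as context. A rough sketch of the shape of the returned dictionary, with invented values:

import os

# Keys mirror the to_dict() output in cdl.py; the values here are made up for illustration.
parsed = {
    "dimensions": {},
    "variables": {},
    "global_attributes": {"title": "example.nc"},
    "inpt": "/path/to/example.nc",
}
# A title_check-style comparison of a title against the file name taken from "inpt".
print(parsed["global_attributes"]["title"] == os.path.basename(parsed["inpt"]))  # True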
60 changes: 60 additions & 0 deletions checksit/readers/image.py
@@ -0,0 +1,60 @@
import subprocess as sp
import yaml

def get_output(cmd):
    subp = sp.Popen(cmd, shell=True, stdout=sp.PIPE, stderr=sp.PIPE)
    return subp.stdout.read().decode("charmap"), subp.stderr.read().decode("charmap")


class ImageParser:

    def __init__(self, inpt, verbose=False):
        self.inpt = inpt
        self.verbose = verbose
        self.base_exiftool_arguments = ["exiftool", "-G1", "-j", "-c", "%+.6f"]
        self._find_exiftool()
        self._parse(inpt)

    def _parse(self, inpt):
        if self.verbose: print(f"[INFO] Parsing input: {inpt[:100]}...")
        self.global_attrs = {}
        exiftool_arguments = self.base_exiftool_arguments + [inpt]
        exiftool_return_string = sp.check_output(exiftool_arguments)
        raw_global_attrs = yaml.load(exiftool_return_string, Loader=yaml.SafeLoader)[0]
        for tag_name in raw_global_attrs.keys():
            value_type = type(raw_global_attrs[tag_name])
            if value_type == list:
                self.global_attrs[tag_name] = str(raw_global_attrs[tag_name][0])
            else:
                self.global_attrs[tag_name] = str(raw_global_attrs[tag_name])

    def _find_exiftool(self):
        if self.verbose: print("[INFO] Searching for exiftool...")
        which_output, which_error = get_output("which exiftool")
        if which_error.startswith("which: no exiftool in"):
            msg = (
                f"'exiftool' required to read image file metadata but cannot be found.\n"
                f" Visit https://exiftool.org/ for information on 'exiftool'."
            )
            raise RuntimeError(msg)
        else:
            self.exiftool_location = which_output.strip()
            if self.verbose: print(f"[INFO] Found exiftool at {self.exiftool_location}.")

    def _attrs_dict(self, content_lines):
        attr_dict = {}
        for line in content_lines:
            if self.verbose: print(f"WORKING ON LINE: {line}")
            key_0 = line.split("=", 1)[0].strip()
            key = key_0[1:]  # removes first character - unwanted quotation marks
            value = line.split("=", 1)[1].strip()
            attr_dict[key] = value
        return attr_dict

    def to_dict(self):
        return {"global_attributes": self.global_attrs, "inpt": self.inpt}


def read(fpath, verbose=False):
    return ImageParser(fpath, verbose=verbose)
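A possible way to exercise the new reader end to end, assuming exiftool is installed and on PATH; the filename below is a placeholder, not a file shipped with the PR.

# Usage sketch for checksit.readers.image (requires exiftool; "example_photo.jpg" is illustrative).
from checksit.readers import image

parsed = image.read("example_photo.jpg", verbose=True).to_dict()
print(sorted(parsed))  # ['global_attributes', 'inpt']
for tag, value in parsed["global_attributes"].items():
    print(f"{tag}: {value}")  # every tag value is stored as a string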

221 changes: 221 additions & 0 deletions checksit/rules/rule_funcs.py
@@ -1,8 +1,16 @@
import os
import re
from datetime import datetime
import requests
import json
import pandas as pd
from urllib.request import urlopen
import json
import pandas as pd

from . import processors
from ..config import get_config
from pandas import json_normalize

conf = get_config()
rule_splitter = conf["settings"].get("rule_splitter", "|")
@@ -82,3 +90,216 @@ def string_of_length(value, context, extras=None, label=""):
        errors.append(f"{label} '{value}' must be exactly {min_length} characters")

    return errors


def validate_image_date_time(value, context, extras=None, label=""):
    """
    A function to identify if a date-time value is compatible with the NCAS image standard
    """
    errors = []

    try:
        if value != datetime.strptime(value, "%Y:%m:%d %H:%M:%S").strftime("%Y:%m:%d %H:%M:%S") and value != datetime.strptime(value, "%Y:%m:%d %H:%M:%S.%f").strftime("%Y:%m:%d %H:%M:%S.%f"):
            errors.append(f"{label} '{value}' needs to be of the format YYYY:MM:DD hh:mm:ss or YYYY:MM:DD hh:mm:ss.s")
    except ValueError:
        errors.append(f"{label} '{value}' needs to be of the format YYYY:MM:DD hh:mm:ss or YYYY:MM:DD hh:mm:ss.s")

    return errors


def validate_orcid_ID(value, context, extras=None, label=""):
    """
    A function to verify the format of an orcid ID
    """
    orcid_string = "https://orcid.org/" # required format of start of the string

    errors = []

    PI_orcid_digits = value[-19:]
    PI_orcid_digits_only = PI_orcid_digits.replace("-", "")

    # Check that the total length is correct
    if len(value) != 37:
        errors.append(f"{label} '{value}' needs to be of the format https://orcid.org/XXXX-XXXX-XXXX-XXXX")

    # Check the start of the string (first 18 characters)
    elif (value[0:18] != orcid_string or

          # Check that the "-" are in the correct places
          value[22] != "-" or
          value[27] != "-" or
          value[32] != "-" or

          # Check that the last characters contain only "-" and digits
          not PI_orcid_digits_only.isdigit()):

        errors.append(f"{label} '{value}' needs to be of the format https://orcid.org/XXXX-XXXX-XXXX-XXXX")

    return errors


def list_of_names(value, context, extras=None, label=""):
    """
    A function to verify the names of people when a list of names may be provided
    """
    name_pattern = r'(.)+, (.)+ ?((.)+|((.)\.))' # The format names should be written in
    character_name_pattern = r'[A-Za-z_À-ÿ\-\'\ \.\,]+'

    warnings = []

    if type(value) == list:
        for i in value:
            if not re.fullmatch(name_pattern, i):
                warnings.append(f"{label} '{value}' should be of the format <last name>, <first name> <middle initial(s)> or <last name>, <first name> <middle name(s)> where appropriate")
            if not re.fullmatch(character_name_pattern, i):
                warnings.append(f"{label} '{value}' - please use characters A-Z, a-z, À-ÿ where appropriate")

    if type(value) == str:
        if not re.fullmatch(name_pattern, value):
            warnings.append(f"{label} '{value}' should be of the format <last name>, <first name> <middle initial(s)> or <last name>, <first name> <middle name(s)> where appropriate")
        if not re.fullmatch(character_name_pattern, value):
            warnings.append(f"{label} '{value}' - please use characters A-Z, a-z, À-ÿ where appropriate")

    return warnings


def headline(value, context, extras=None, label=""):
    """
    A function to verify the format of the Headline
    """
    warnings = []

    if len(value) > 150:
        warnings.append(f"{label} '{value}' should contain no more than one sentence")

    if value.count(".") >= 2:
        warnings.append(f"{label} '{value}' should contain no more than one sentence")

    if not value[0].isupper():
        warnings.append(f"{label} '{value}' should start with a capital letter")

    if len(value) < 10:
        warnings.append(f"{label} '{value}' should be at least 10 characters")

    return warnings


def title_check(value, context, extras=None, label=""):
    """
    A function to check if the title matches the system filename
    """
    errors = []

    if value != os.path.basename(context):
        errors.append(f"{label} '{value}' must match the name of the file")

    return errors


def title_instrument(value, context, extras=None, label=""):
    """
    A function to check if the instrument in the title is contained in the controlled vocabulary lists
    """
    warnings = []

    instrument = value.partition("_")[0]

    # open JSON controlled vocab files:
    n = open('./checksit/vocabs/AMF_CVs/2.0.0/AMF_ncas_instrument.json', "r")
    c = open('./checksit/vocabs/AMF_CVs/2.0.0/AMF_community_instrument.json', "r")

    ## Reading from file:
    ncas_data = json.loads(n.read())
    community_data = json.loads(c.read())

    if instrument not in ncas_data['ncas_instrument'] and instrument not in community_data['community_instrument']:
        warnings.append(f"{label} '{instrument}' should be contained in one of the instrument controlled vocabulary lists")

    # Closing files
    n.close()
    c.close()

    return warnings

def title_platform(value, context, extras=None, label=""):
    """
    A function to check if the platform in the title is contained in the controlled vocabulary list
    """
    warnings = []

    platform = value.split("_")[1]

    # open JSON controlled vocab file:
    g = open('./checksit/vocabs/AMF_CVs/2.0.0/AMF_platform.json', "r")

    ## Reading from file:
    data = json.loads(g.read())

    if platform not in data['platform']:
        warnings.append(f"{label} '{platform}' should be contained in the platform controlled vocabulary list")

    # Closing file
    g.close()

    return warnings

def url_checker(value, context, extras=None, label=""):
    """
    A function to check if the url exists
    """
    warnings = []

    try: url = urlopen(value)
    except:
        warnings.append(f"{label} '{value}' is not a reachable url")
    else:
        if url.getcode() != 200: # (200 means it exists and is up and reachable)
            warnings.append(f"{label} '{value}' is not a reachable url")
    finally:
        return warnings


def relation_url_checker(value, context, extras=None, label=""):
    """
    A function to check if Relation field is in the correct format, and that the url exists
    """
    errors = []

    if " " not in value:
        errors.append(f"{label} '{value}' should contain a space before the url")
    else:
        relation_url = value.partition(" ")[2] # extract only the url part of the relation string
        if url_checker(relation_url, context, extras, label) != []:
            errors.append(url_checker(relation_url, context, extras, label)) # check the url exists using the url_checker() function defined above

    return errors


def latitude(value, context, extras=None, label=""):
    """
    A function to check if the latitude is within -90 and +90
    """
    errors = []

    latitude = re.findall(r'[0-9]+', value)[0]
    int_latitude = int(latitude)

    if int_latitude > 90:
        errors.append(f"{label} '{value}' must be within -90 and +90 ")

    return errors


def longitude(value, context, extras=None, label=""):
    """
    A function to check if the longitude is within -180 and +180
    """
    errors = []

    longitude = re.findall(r'[0-9]+', value)[0]
    int_longitude = int(longitude)

    if int_longitude > 180:
        errors.append(f"{label} '{value}' must be within -180 and +180 ")

    return errors
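All of the new rule functions follow the same convention of returning a list of error or warning strings, empty when the value passes. A quick illustration using the latitude check; the values and label are made up:

# Illustration of the error-list convention used by the new rule functions.
from checksit.rules.rule_funcs import latitude

label = "[global-attributes:******:GPSLatitude]***"
print(latitude("+91.000000", None, label=label))  # one error: value outside -90 to +90
print(latitude("+53.501000", None, label=label))  # []  (value passes)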