Skip to content

Commit

Permalink
Databank now supports 3 embargo states - Dark, Embargoed and Open.
Browse files Browse the repository at this point in the history
Added a profiler to the pylons middleware
  • Loading branch information
Anusha Ranganathan committed Sep 20, 2011
1 parent 1b3b927 commit af77eb7
Show file tree
Hide file tree
Showing 18 changed files with 1,777 additions and 689 deletions.
2 changes: 1 addition & 1 deletion development-jenkins.ini
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ debug = false
# Uncomment and replace with the address which should receive any error reports
#email_to = [email protected]
smtp_server = localhost
error_email_from = paste@localhost
error_email_from = paste@jenkins

[server:main]
use = egg:Paste#http
Expand Down
24 changes: 13 additions & 11 deletions development.ini
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# The %(here)s variable will be replaced with the parent directory of this file
#
[DEFAULT]
debug = false
debug = true
# Uncomment and replace with the address which should receive any error reports
#email_to = [email protected]
smtp_server = localhost
Expand All @@ -13,11 +13,11 @@ error_email_from = paste@localhost
[server:main]
use = egg:Paste#http
#Use these settings to run pylons using mod_wsgi and apache
host = 127.0.0.1
port = 5000
#host = 127.0.0.1
#port = 5000
#Use these settings to run pylons from the commandline
#host = 0.0.0.0
#port = 80
host = 0.0.0.0
port = 80

[app:main]
use = egg:rdfdatabank
Expand All @@ -30,16 +30,18 @@ beaker.session.secret = somesecret

who.config_file = %(here)s/who.ini
who.log_level = info
who.log_file = /var/log/databank/who.log
#who.log_file = stdout
#who.log_file = /var/log/databank/who.log
who.log_file = stdout
#who.log_file = %(here)s/logs/who.log

redis.host = localhost

granary.store = %(here)s/silos
#granary.uri_root = http://databank.bodleian.ox.ac.uk/datasets/
granary.uri_root = http://192.168.23.133/

profile.log_filename = %(here)s/logs/profile.log
profile.path = /__profile__

auth.file = %(here)s/passwd
auth.info = %(here)s/rdfdatabank/config/users.py

Expand Down Expand Up @@ -80,17 +82,17 @@ keys = generic

[logger_root]
level = INFO
handlers = logfile
handlers = console

[logger_routes]
level = INFO
handlers = logfile
handlers = console
qualname = routes.middleware
# "level = DEBUG" logs the route matched and routing variables.

[logger_rdfdatabank]
level = DEBUG
handlers = logfile
handlers = console
qualname = rdfdatabank

[handler_console]
Expand Down
4 changes: 1 addition & 3 deletions production.ini
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ debug = false
# Uncomment and replace with the address which should receive any error reports
email_to = [email protected]
smtp_server = localhost
error_email_from = paste@localhost
error_email_from = paste@databank

[server:main]
use = egg:Paste#http
Expand All @@ -35,10 +35,8 @@ who.log_file = /var/log/databank/who.log

redis.host = localhost

#granary.store = %(here)s/silos
granary.store = /silos
granary.uri_root = http://databank.ora.ox.ac.uk/
#granary.uri_root = http://163.1.127.173/

auth.file = %(here)s/passwd
auth.info = %(here)s/rdfdatabank/config/users.py
Expand Down
10 changes: 9 additions & 1 deletion rdfdatabank/config/middleware.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,13 +46,21 @@ def make_app(global_conf, full_stack=True, static_files=True, **app_conf):
app = PylonsApp()

#app = httpexceptions.make_middleware(app, global_conf)
if asbool(config['debug']):
from repoze.profile.profiler import AccumulatingProfileMiddleware
app = AccumulatingProfileMiddleware(
app,
log_filename=app_conf['profile.log_filename'],
discard_first_request=True,
flush_at_shutdown=True,
path=app_conf['profile.path']
)

# Routing/Session/Cache Middleware
app = RoutesMiddleware(app, config['routes.map'])
app = SessionMiddleware(app, config)
app = CacheMiddleware(app, config)


# CUSTOM MIDDLEWARE HERE (filtered by error handling middlewares)
if asbool(full_stack):
# Handle Python exceptions
Expand Down
54 changes: 37 additions & 17 deletions rdfdatabank/controllers/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,16 @@
import re, os, shutil, codecs
import simplejson
from datetime import datetime, timedelta
from dateutil.relativedelta import *
from dateutil.parser import parse
import time
from uuid import uuid4
from pylons import request, response, session, tmpl_context as c, url, app_globals as ag
from pylons.controllers.util import abort, redirect
from pylons.decorators import rest
from paste.fileapp import FileApp
from rdfdatabank.lib.base import BaseController, render
from rdfdatabank.lib.utils import create_new, is_embargoed, get_readme_text, test_rdf, munge_manifest, serialisable_stat, allowable_id2
from rdfdatabank.lib.utils import create_new, is_embargoed, get_readme_text, test_rdf, munge_manifest, serialisable_stat, allowable_id2, get_rdf_template
from rdfdatabank.lib.file_unpack import get_zipfiles_in_dataset
from rdfdatabank.lib.conneg import MimeType as MT, parse as conneg_parse

Expand Down Expand Up @@ -218,14 +220,16 @@ def datasetview(self, silo, id):
if ident['repoze.who.userid'] == creator or ident.get('role') in ["admin", "manager"]:
c.editor = True

if c.version and not c.version == currentversion:
c.editor = False


c.show_files = True
#Only the administrator, manager and creator can view embargoed files.
if embargoed and not c.editor:
c.show_files = False

#Display but do not edit previous versions of files, since previous versions are read only.
if c.version and not c.version == currentversion:
c.editor = False

# View options
if "view" in options and c.editor:
c.view = options['view']
Expand All @@ -240,7 +244,8 @@ def datasetview(self, silo, id):
c.embargos[id] = is_embargoed(c_silo, id)
c.parts = item.list_parts(detailed=True)
c.manifest_pretty = item.rdf_to_string(format="pretty-xml")
c.manifest = item.rdf_to_string()
#c.manifest = item.rdf_to_string()
c.manifest = get_rdf_template(item.uri, id)
c.zipfiles = get_zipfiles_in_dataset(item)
c.readme_text = None
#if item.isfile("README"):
Expand Down Expand Up @@ -366,20 +371,28 @@ def datasetview(self, silo, id):
abort(403)
item.increment_version_delta(clone_previous_version=True, copy_filenames=['manifest.rdf'])
#if params.has_key('embargoed'):
if (params.has_key('embargo_change') and params.has_key('embargoed')) or \
(params.has_key('embargoed') and params['embargoed'].lower() == 'true'):
if (params.has_key('embargo_change') and params.has_key('embargoed') and \
params['embargoed'].lower() in ['true', '1'] and params['embargo_change'].lower() in ['true', '1']) or \
(params.has_key('embargoed') and params['embargoed'].lower() in ['true', '1']):
embargoed_until_date = None
if params.has_key('embargoed_until') and params['embargoed_until']:
embargoed_until_date = params['embargoed_until']
elif params.has_key('embargo_days_from_now') and params['embargo_days_from_now']:
embargoed_until_date = (datetime.now() + timedelta(days=params['embargo_days_from_now'])).isoformat()
else:
embargoed_until_date = (datetime.now() + timedelta(days=365*70)).isoformat()
try:
embargoed_until_date = parse(params['embargoed_until']).isoformat()
except:
embargoed_until_date = (datetime.now() + relativedelta(years=+70)).isoformat()
elif params.has_key('embargo_days_from_now') and params['embargo_days_from_now'].isdigit():
embargoed_until_date = (datetime.now() + timedelta(days=int(params['embargo_days_from_now']))).isoformat()
#It is embargoed indefinitely by default
#else:
# embargoed_until_date = (datetime.now() + timedelta(days=365*70)).isoformat()
item.metadata['embargoed'] = True
item.metadata['embargoed_until'] = embargoed_until_date
item.metadata['embargoed_until'] = ''
item.del_triple(item.uri, u"oxds:isEmbargoed")
item.del_triple(item.uri, u"oxds:embargoedUntil")
item.add_triple(item.uri, u"oxds:isEmbargoed", 'True')
item.add_triple(item.uri, u"oxds:embargoedUntil", embargoed_until_date)
if embargoed_until_date:
item.metadata['embargoed_until'] = embargoed_until_date
item.add_triple(item.uri, u"oxds:embargoedUntil", embargoed_until_date)
else:
#if is_embargoed(c_silo, id)[0] == True:
item.metadata['embargoed'] = False
Expand Down Expand Up @@ -822,13 +835,15 @@ def itemview(self, silo, id, path):
if ident['repoze.who.userid'] == creator or ident.get('role') in ["admin", "manager"]:
c.editor = True

if c.version and not c.version == currentversion:
c.editor = False

c.show_files = True
#Only the administrator, manager and creator can view embargoed files.
if embargoed and not c.editor:
c.show_files = False

#Display but do not edit previous versions of files, since previous versions are read only.
if c.version and not c.version == currentversion:
c.editor = False

# View options
if "view" in options and c.editor:
c.view = options['view']
Expand Down Expand Up @@ -1063,6 +1078,11 @@ def itemview(self, silo, id, path):
response.status_int = 403
response.status = "403 Forbidden"
return "Forbidden - Cannot delete the manifest"
if '3=' in path or '4=' in path:
response.content_type = "text/plain"
response.status_int = 403
response.status = "403 Forbidden"
return "Forbidden - These files are generated by the system and connot be deleted"
item.increment_version_delta(clone_previous_version=True, copy_filenames=['manifest.rdf'])
item.del_stream(path)
item.del_triple(item.uri, u"dcterms:modified")
Expand Down
1 change: 1 addition & 0 deletions rdfdatabank/controllers/users.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
import logging
import simplejson
import codecs
from pylons import request, response, session, config, tmpl_context as c, url
from pylons.controllers.util import abort, redirect
from pylons.decorators import rest
Expand Down
31 changes: 24 additions & 7 deletions rdfdatabank/lib/file_unpack.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,12 @@ class BadZipfile(Exception):
"""Cannot open zipfile using commandline tool 'unzip' to target directory"""

def check_file_mimetype(real_filepath, mimetype):
if os.path.isdir(real_filepath):
return False
if os.path.islink(real_filepath):
real_filepath = os.readlink(real_filepath)
if not os.path.isfile(real_filepath):
return False
p = subprocess.Popen("file -ib '%s'" %(real_filepath), shell=True, stdout=subprocess.PIPE)
output_file = p.stdout
output_str = output_file.read()
Expand All @@ -29,11 +33,15 @@ def check_file_mimetype(real_filepath, mimetype):
else:
return False

def get_zipfiles_in_dataset_old(dataset):
def get_zipfiles_in_dataset(dataset):
derivative = dataset.list_rdf_objects("*", "ore:aggregates")
zipfiles = {}
if derivative and derivative.values() and derivative.values()[0]:
for file_uri in derivative.values()[0]:
#if derivative and derivative.values() and derivative.values()[0]:
if derivative:
#for file_uri in derivative.values()[0]:
for file_uri in derivative:
if not file_uri.lower().endswith('.zip'):
continue
filepath = file_uri[len(dataset.uri)+1:]
real_filepath = dataset.to_dirpath(filepath)
if os.path.islink(real_filepath):
Expand All @@ -43,7 +51,7 @@ def get_zipfiles_in_dataset_old(dataset):
zipfiles[filepath]="%s-%s"%(dataset.item_id, fn)
return zipfiles

def get_zipfiles_in_dataset(dataset):
def get_zipfiles_in_dataset_new(dataset):
p = subprocess.Popen("""file -iL `find %s -name '*.zip'` | grep "application/zip" | awk -F":" '{print $1}'""" %dataset.to_dirpath(), shell=True, stdout=subprocess.PIPE)
stdout_value = p.communicate()[0]
zipfiles = {}
Expand Down Expand Up @@ -148,6 +156,9 @@ def unpack_zip_item(target_dataset, current_dataset, zip_item, silo, ident):
if os.path.islink(filepath):
filepath = os.readlink(filepath)

emb = target_dataset.metadata.get('embargoed')
emb_until = target_dataset.metadata.get('embargoed_until')

# -- Step 1 -----------------------------
unpacked_dir = unzip_file(filepath)

Expand Down Expand Up @@ -181,9 +192,15 @@ def unpack_zip_item(target_dataset, current_dataset, zip_item, silo, ident):
target_dataset.add_triple(target_dataset.uri, u"rdf:type", "oxds:Grouping")
target_dataset.add_triple(target_dataset.uri, "dcterms:isVersionOf", file_uri)
#TODO: Adding the following metadata again as moving directory deletes all this information. Need to find a better way
embargoed_until_date = (datetime.now() + timedelta(days=365*70)).isoformat()
target_dataset.add_triple(target_dataset.uri, u"oxds:isEmbargoed", 'True')
target_dataset.add_triple(target_dataset.uri, u"oxds:embargoedUntil", embargoed_until_date)
if emb:
target_dataset.add_triple(target_dataset.uri, u"oxds:isEmbargoed", 'True')
if emb_until:
target_dataset.add_triple(target_dataset.uri, u"oxds:embargoedUntil", emb_until)
else:
target_dataset.add_triple(target_dataset.uri, u"oxds:isEmbargoed", 'False')
#The embargo
#embargoed_until_date = (datetime.now() + timedelta(days=365*70)).isoformat()
#target_dataset.add_triple(target_dataset.uri, u"oxds:embargoedUntil", embargoed_until_date)
target_dataset.add_triple(target_dataset.uri, u"dcterms:identifier", target_dataset.item_id)
target_dataset.add_triple(target_dataset.uri, u"dcterms:mediator", ident)
target_dataset.add_triple(target_dataset.uri, u"dcterms:publisher", ag.publisher)
Expand Down
39 changes: 29 additions & 10 deletions rdfdatabank/lib/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# -*- coding: utf-8 -*-
from datetime import datetime, timedelta
from dateutil.relativedelta import *
from dateutil.parser import parse
from time import sleep
from redis import Redis
from redis.exceptions import ConnectionError
Expand All @@ -14,6 +16,7 @@
#from rdflib.parser import StringInputSource
from rdflib import Namespace, RDF, RDFS, URIRef, Literal, BNode


from uuid import uuid4
import re

Expand Down Expand Up @@ -146,23 +149,31 @@ def is_embargoed_no_redis(silo, id, refresh=False):
def create_new(silo, id, creator, title=None, embargoed=True, embargoed_until=None, embargo_days_from_now=None, **kw):
item = silo.get_item(id, startversion="0")
item.metadata['createdby'] = creator
item.metadata['embargoed'] = embargoed
item.metadata['embargoed_until'] = ''
item.metadata['uuid'] = uuid4().hex
item.add_namespace('oxds', "http://vocab.ox.ac.uk/dataset/schema#")
item.add_triple(item.uri, u"rdf:type", "oxds:DataSet")

if embargoed:
if embargoed_until:
embargoed_until_date = embargoed_until
elif embargo_days_from_now:
embargoed_until_date = (datetime.now() + timedelta(days=embargo_days_from_now)).isoformat()
else:
embargoed_until_date = (datetime.now() + timedelta(days=365*70)).isoformat()
item.metadata['embargoed_until'] = embargoed_until_date
if embargoed==True or embargoed.lower() in ['true', '1'] :
item.metadata['embargoed'] = True
item.add_triple(item.uri, u"oxds:isEmbargoed", 'True')
item.add_triple(item.uri, u"oxds:embargoedUntil", embargoed_until_date)
embargoed_until_date = None
if embargoed_until:
try:
embargoed_until_date = parse(embargoed_until).isoformat()
except:
embargoed_until_date = (datetime.now() + relativedelta(years=+70)).isoformat()
elif embargo_days_from_now and embargo_days_from_now.isdigit():
embargoed_until_date = (datetime.now() + timedelta(days=int(embargo_days_from_now))).isoformat()
#TODO: Do we want the default embargo_until to be 70 years or indefinite. Going with indefinite
#else:
# embargoed_until_date = (datetime.now() + relativedelta(years=+70)).isoformat()
if embargoed_until_date:
item.metadata['embargoed_until'] = embargoed_until_date
item.add_triple(item.uri, u"oxds:embargoedUntil", embargoed_until_date)
else:
item.add_triple(item.uri, u"oxds:isEmbargoed", 'False')
item.metadata['embargoed'] = False
item.add_triple(item.uri, u"dcterms:identifier", id)
item.add_triple(item.uri, u"dcterms:mediator", creator)
item.add_triple(item.uri, u"dcterms:publisher", ag.publisher)
Expand All @@ -188,6 +199,14 @@ def get_readme_text(item, filename="README"):
text = fn.read().decode("utf-8")
return u"%s\n\n%s" % (filename, text)

def get_rdf_template(item_uri, item_id):
    """Return a minimal RDF/XML manifest template for a dataset.

    Builds a graph rooted at *item_uri* that carries a single
    ``dcterms:identifier`` triple holding *item_id*, binds the ``rdf``
    and ``dcterms`` prefixes for readable output, and returns the graph
    serialized as RDF/XML with a trailing newline appended.
    """
    graph = ConjunctiveGraph(identifier=item_uri)
    # Bind the prefixes used in the serialized manifest.
    for prefix, namespace in (
        ('rdf', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'),
        ('dcterms', 'http://purl.org/dc/terms/'),
    ):
        graph.bind(prefix, namespace)
    graph.add((
        URIRef(item_uri),
        URIRef('http://purl.org/dc/terms/identifier'),
        Literal(item_id),
    ))
    return graph.serialize(format='xml', encoding="utf-8") + '\n'

#def test_rdf(text):
def test_rdf(mfile):
g = ConjunctiveGraph()
Expand Down
2 changes: 1 addition & 1 deletion rdfdatabank/templates/datasetview.html
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ <h3>Information for version ${c.version} of the dataset</h3>
<p><small>Embargo date: Aim is for ISO8601 dates to provide embargo trigger events. Currently unused, unvalidated and unparsed.</small></p>
</dd>
<!-- Change RDF Manifest -->
<dt><b>Change RDF Manifest:</b></dt>
<dt><b>Add metadata to the RDF Manifest:</b></dt>
<dd><%include file="/rdf_manifest_form.html"/></dd>
</dl>
<!-- Upload File -->
Expand Down
Loading

0 comments on commit af77eb7

Please sign in to comment.