Skip to content
This repository has been archived by the owner on Mar 31, 2023. It is now read-only.

Commit

Permalink
Data packages are displayed with pagination to enable displaying silo…
Browse files Browse the repository at this point in the history
…s with a large number of data packages.

For this, I use a MySQL table to index the list of data packages in a silo, which enables fast lookup.
* List of silos displayed with more information
* The path /{silo_name} and /{silo_name}/datasets use the same controller

Search for the three different types now works.
The admin controller now broadcasts silo creation and updates, and the Solr worker indexes the silo metadata in Solr.
Extra output fields can be requested from search by passing a comma-separated list in the `fl` parameter.
File names are indexed together with their path in a new Solr field called `filename`, to enable searching on them.
The search controller uses the filename field to search for files and directories

Modified SWORD submissions so that new data packages are added to the table.
  • Loading branch information
Anusha Ranganathan committed Jun 11, 2012
1 parent 0444b06 commit a3fca25
Show file tree
Hide file tree
Showing 45 changed files with 868 additions and 397 deletions.
9 changes: 7 additions & 2 deletions development.ini
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ metadata.embargoed = False

solr.host = http://localhost:8080/solr
naming_rule = [^0-9a-zA-Z_\-\:]
naming_rule_humanized = Numbers, alphabets and -:
naming_rule_humanized = numbers, letters, '-' and ':', must be more than one character long and must not contain any spaces.
formats_served = text/html,text/xhtml,text/plain,application/json,application/rdf+xml,text/xml,text/rdf+n3,application/x-turtle,text/rdf+ntriples,text/rdf+nt
publisher = Bodleian Libraries, University of Oxford
rights = http://ora.ouls.ox.ac.uk/objects/uuid%3A1d00eebb-8fed-46ad-8e38-45dbdb4b224c
Expand All @@ -99,7 +99,7 @@ api.version = 0.3

# Logging configuration
[loggers]
keys = root, routes, rdfdatabank
keys = root, routes, rdfdatabank, sqlalchemy

[handlers]
keys = console, logfile
Expand All @@ -122,6 +122,11 @@ level = DEBUG
handlers = console
qualname = rdfdatabank

[logger_sqlalchemy]
level = INFO
handlers = console
qualname = sqlalchemy.engine

[handler_console]
class = StreamHandler
args = (sys.stderr,)
Expand Down
1 change: 1 addition & 0 deletions docs/solr_config/conf/schema.xml
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,7 @@
<field name="currentVersion" type="int" indexed="true" stored="true" />
<field name="doi" type="text" multiValued="true" indexed="true" stored="true" />
<field name="aggregatedResource" type="text" multiValued="true" indexed="true" stored="true" />
<field name="filename" type="text" multiValued="true" indexed="true" stored="true" />
<field name="publicationDate" type="text" indexed="true" stored="true" />
<field name="abstract" type="text" multiValued="true" indexed="true" stored="true" />
<field name="accessRights" type="text" multiValued="true" indexed="true" stored="true" />
Expand Down
19 changes: 16 additions & 3 deletions message_workers/solr_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
from rdflib import URIRef
import simplejson
from collections import defaultdict
from uuid import uuid4

from recordsilo import Granary
from solr import SolrConnection
Expand All @@ -58,6 +59,11 @@ def gather_document(silo_name, item):
for (_,p,o) in graph.triples((URIRef(item.uri), None, None)):
if str(p) in solr_fields_mapping:
field = solr_fields_mapping[str(p)]
if field == "aggregatedResource":
if '/datasets/' in o:
fn = unicode(o).split('/datasets/')
if len(fn) == 2 and fn[1]:
document['filename'].append(unicode(fn[1]).encode("utf-8"))
if field == "embargoedUntilDate":
ans = u"%sZ"%unicode(o).split('.')[0]
document[field].append(unicode(ans).encode("utf-8"))
Expand Down Expand Up @@ -115,13 +121,14 @@ def gather_document(silo_name, item):

toCommit = True
msg = simplejson.loads(line)
# solr switch
# get silo name
try:
silo_name = msg['silo']
except:
logger.error("Msg badly formed %s\n"%str(msg))
rq.task_complete()
continue
# Re-initialize granary
if silo_name not in g.silos and not msg['type'] == "d":
g = Granary(granary_root)
g.state.revert()
Expand All @@ -134,7 +141,7 @@ def gather_document(silo_name, item):
if msg['type'] == "c" or msg['type'] == "u" or msg['type'] == "embargo":
s = g.get_rdf_silo(silo_name)
# Creation, update or embargo change
itemid = msg.get('id')
itemid = msg.get('id', None)
logger.info("Got creation message on id:%s in silo:%s" % (itemid, silo_name))
if itemid and s.exists(itemid):
item = s.get_item(itemid)
Expand All @@ -149,10 +156,16 @@ def gather_document(silo_name, item):
pass
rq.task_failed()
continue
else:
silo_metadata = g.describe_silo(silo_name)
solr_doc = {'id':silo_name, 'silo':silo_name, 'type':'Silo', 'uuid':uuid4().hex}
solr_doc['title'] = silo_metadata['title']
solr_doc['description'] = silo_metadata['description']
solr.add(_commit=False, **solr_doc)
rq.task_complete()
elif msg['type'] == "d":
# Deletion
itemid = msg.get('id')
itemid = msg.get('id', None)
if itemid:
logger.info("Got deletion message on id:%s in silo:%s" % (itemid, silo_name))
query='silo:"%s" AND id:"%s"'%(silo_name, itemid)
Expand Down
11 changes: 8 additions & 3 deletions production.ini
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ metadata.embargoed = False

solr.host = http://localhost:8080/solr
naming_rule = [^0-9a-zA-Z_\-\:]
naming_rule_humanized = Numbers, alphabets and -:
naming_rule_humanized = numbers, letters, '-' and ':', must be more than one character long and must not contain any spaces.
formats_served = text/html,text/xhtml,text/plain,application/json,application/rdf+xml,text/xml,text/rdf+n3,application/x-turtle,text/rdf+ntriples,text/rdf+nt
publisher = Bodleian Libraries, University of Oxford
rights = http://ora.ouls.ox.ac.uk/objects/uuid%3A1d00eebb-8fed-46ad-8e38-45dbdb4b224c
Expand All @@ -92,10 +92,10 @@ api.version = 0.3
# Debug mode will enable the interactive debugging tool, allowing ANYONE to
# execute malicious code after an exception is raised.
#set debug = false
#

# Logging configuration
[loggers]
keys = root, routes, rdfdatabank
keys = root, routes, rdfdatabank, sqlalchemy

[handlers]
keys = console, logfile
Expand All @@ -118,6 +118,11 @@ level = INFO
handlers = logfile
qualname = rdfdatabank

[logger_sqlalchemy]
level = INFO
handlers = logfile
qualname = sqlalchemy.engine

[handler_console]
class = StreamHandler
args = (sys.stderr,)
Expand Down
3 changes: 2 additions & 1 deletion rdfdatabank/config/routing.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,9 @@ def make_map():
map.connect('/{silo}/admin', controller='admin', action='siloview')

map.connect('/silos', controller='silos', action='index')
map.connect('/{silo}', controller='silos', action='siloview')
#map.connect('/{silo}', controller='silos', action='siloview')

map.connect('/{silo}', controller='datasets', action='siloview')
map.connect('/{silo}/datasets', controller='datasets', action='siloview')
map.connect('/{silo}/datasets/{id}', controller='datasets', action='datasetview')
map.connect('/{silo}/datasets/{id}/{path:.*}', controller='datasets', action='itemview')
Expand Down
18 changes: 14 additions & 4 deletions rdfdatabank/controllers/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,12 @@ def index(self):

# Add silo to database
add_silo(silo)


try:
ag.b.silo_creation(silo, ident=ident['repoze.who.userid'])
except:
pass

#Add users belonging to the silo, to the database
all_silo_users = []

Expand All @@ -172,12 +177,12 @@ def index(self):
mimetype = accept_list.pop(0)
while(mimetype):
if str(mimetype).lower() in ["text/html", "text/xhtml"]:
redirect(url(controller="silos", action="siloview", silo=silo))
redirect(url(controller="datasets", action="siloview", silo=silo))
elif str(mimetype).lower() in ["text/plain", "application/json"]:
response.content_type = "text/plain"
response.status_int = 201
response.status = "201 Created"
response.headers['Content-Location'] = url(controller="silos", action="siloview", silo=silo)
response.headers['Content-Location'] = url(controller="datasets", action="siloview", silo=silo)
return "201 Created Silo %s" % silo
try:
mimetype = accept_list.pop(0)
Expand All @@ -187,7 +192,7 @@ def index(self):
response.content_type = "text/plain"
response.status_int = 201
response.status = "201 Created"
response.headers['Content-Location'] = url(controller="silos", action="siloview", silo=silo)
response.headers['Content-Location'] = url(controller="datasets", action="siloview", silo=silo)
return "201 Created Silo %s" % silo
else:
response.content_type = "text/plain"
Expand Down Expand Up @@ -327,6 +332,11 @@ def siloview(self, silo):
#Add new silo users into database
if new_silo_users:
add_group_users(silo, new_silo_users)
if updateMetadata:
try:
ag.b.silo_change(silo, ident=ident['repoze.who.userid'])
except:
pass

# conneg return
accept_list = None
Expand Down
68 changes: 64 additions & 4 deletions rdfdatabank/controllers/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
from rdfdatabank.lib.utils import is_embargoed, test_rdf, munge_manifest, get_embargo_values, get_rdf_template, extract_metadata
from rdfdatabank.lib.file_unpack import get_zipfiles_in_dataset
from rdfdatabank.lib.conneg import MimeType as MT, parse as conneg_parse
from rdfdatabank.lib.auth_entry import add_dataset, delete_dataset, get_datasets_count, get_datasets

JAILBREAK = re.compile("[\/]*\.\.[\/]*")

Expand Down Expand Up @@ -71,11 +72,37 @@ def siloview(self, silo):
silos = ag.authz(ident)
if silo in silos:
c.editor = True

options = request.GET
c.start = 0
if 'start' in options and options['start']:
try:
c.start = int(options['start'])
except ValueError:
c.start = 0
c.rows = 100
if 'rows' in options and options['rows']:
try:
c.rows = int(options['rows'])
except ValueError:
c.rows = 100

#TODO: Get this information from SOLR, not the granary OR Do not get the embargo information, as that is what fails
c_silo = ag.granary.get_rdf_silo(silo)
# Get title of silo
state_info = ag.granary.describe_silo(silo)
if 'title' in state_info and state_info['title']:
c.title = state_info['title']
# Get number of data packages in silo
numFound = get_datasets_count(silo)
try:
c.numFound = int(numFound)
except ValueError:
c.numFound = 0

#c.embargos = {'params':{'numFound':numFound, 'start':c.start, 'rows':c.rows}}
c.embargos = {}
for item in c_silo.list_items():
#for item in c_silo.list_items():
for item in get_datasets(silo, start=c.start, rows=c.rows):
try:
c.embargos[item] = is_embargoed(c_silo, item)
except:
Expand All @@ -93,6 +120,37 @@ def siloview(self, silo):
mimetype = accept_list.pop(0)
while(mimetype):
if str(mimetype).lower() in ["text/html", "text/xhtml"]:
#Calculate the pagination for display of data packages
c.permissible_offsets = []
c.pages_to_show = 5
print type(c.start), type(c.pages_to_show), type(c.rows), type(c.numFound)
print c.start, c.pages_to_show, c.rows, c.numFound
try:
remainder = c.numFound % c.rows
if remainder > 0:
c.lastPage = c.numFound - remainder
else:
c.lastPage = c.numFound - c.rows

if c.numFound > c.rows:
offset_start = c.start - ( (c.pages_to_show/2) * c.rows )
if offset_start < 0:
offset_start = 0

offset_end = offset_start + (c.pages_to_show * c.rows)
if offset_end > c.numFound:
offset_end = c.numFound
if remainder > 0:
offset_start = c.lastPage - (c.pages_to_show * c.rows)
else:
offset_start = c.lastPage - ((c.pages_to_show-1) * c.rows)

if offset_start < 0:
offset_start = 0

c.permissible_offsets = list( xrange( offset_start, offset_end, c.rows) )
except ValueError:
pass
return render('/siloview.html')
elif str(mimetype).lower() in ["text/plain", "application/json"]:
response.content_type = 'application/json; charset="UTF-8"'
Expand Down Expand Up @@ -134,11 +192,11 @@ def siloview(self, silo):
response.content_type = "text/plain"
response.status_int = 400
response.status = "400 Bad request. Data package name not valid"
return "Data package name can contain only the following characters - %s and has to be more than 1 character"%ag.naming_rule_humanized
return "Data package name can only contain %s"%ag.naming_rule_humanized

del params['id']
item = create_new(c_silo, id, ident['repoze.who.userid'], **params)
add_dataset(silo, id)
# Broadcast change as message
try:
ag.b.creation(silo, id, ident=ident['repoze.who.userid'])
Expand Down Expand Up @@ -364,6 +422,7 @@ def datasetview(self, silo, id):
return "Data package name can contain only the following characters - %s and has to be more than 1 character"%ag.naming_rule_humanized
params = {}
item = create_new(c_silo, id, ident['repoze.who.userid'], **params)
add_dataset(silo, id)
code = 201
response.status = "201 Created"
response.status_int = 201
Expand Down Expand Up @@ -638,6 +697,7 @@ def datasetview(self, silo, id):
pass

c_silo.del_item(id)
delete_dataset(silo, id)

response.content_type = "text/plain"
response.status_int = 200
Expand Down
14 changes: 12 additions & 2 deletions rdfdatabank/controllers/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def __before__(self):
c.field_names = term_list().get_search_field_dictionary()
c.facetable_fields = term_list().get_all_facet_fields()
c.types = term_list().get_type_field_dictionary()
c.search_fields = ['silo', 'id', 'uuid', 'embargoStatus', 'embargoedUntilDate', 'currentVersion', 'doi', 'publicationDate', 'abstract', 'description', 'creator', 'isVersionOf', 'isPartOf', 'subject']
c.search_fields = ['silo', 'id', 'title', 'uuid', 'embargoStatus', 'embargoedUntilDate', 'currentVersion', 'doi', 'publicationDate', 'abstract', 'description', 'creator', 'isVersionOf', 'isPartOf', 'subject', 'type']
c.sort_options = {'score desc':'Relevance', 'publicationDate desc':'Date (Latest to oldest)','publicationDate asc':'Date (Oldest to Latest)','silo asc':'Silo A to Z','silo desc':'Silo Z to A'}

def raw(self):
Expand Down Expand Up @@ -174,6 +174,7 @@ def detailed(self, query=None, additional_fields=[]):
start = request.params.get('start', None)
rows = request.params.get('rows', None)
sort = request.params.get('sort', None)
fields = request.params.get('fl', None)
res_format = request.params.get('format', None)
if not res_format:
accept_list = None
Expand Down Expand Up @@ -215,6 +216,14 @@ def detailed(self, query=None, additional_fields=[]):
c.chosen_fields = []
c.chosen_fields.extend(c.search_fields)

if fields:
fields = fields.split(',')
if fields and type(fields).__name__ == 'list':
fields = [x.strip() for x in fields]
for fld in fields:
if fld in c.all_fields and not fld in c.chosen_fields:
c.chosen_fields.append(fld)

for fld in additional_fields:
if not fld in c.chosen_fields:
c.chosen_fields.append(fld)
Expand Down Expand Up @@ -305,7 +314,8 @@ def detailed(self, query=None, additional_fields=[]):
elif c.typ and 'dataset' in c.typ:
solr_params['q'] = c.q.encode('utf-8')+query_filter+" AND type:dataset"
elif c.typ and 'item' in c.typ and c.q != "*:*":
solr_params['q'] = """aggregatedResource:"%s" %s"""%(c.q.encode('utf-8'),query_filter)
#solr_params['q'] = """aggregatedResource:"%s" %s"""%(c.q.encode('utf-8'),query_filter)
solr_params['q'] = """filename:"%s" %s"""%(c.q.encode('utf-8'),query_filter)
else:
solr_params['q'] = c.q.encode('utf-8')+query_filter

Expand Down
20 changes: 17 additions & 3 deletions rdfdatabank/controllers/silos.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@
from paste.fileapp import FileApp

from rdfdatabank.lib.base import BaseController, render
from rdfdatabank.lib.utils import is_embargoed
from rdfdatabank.lib.auth_entry import list_silos
from rdfdatabank.lib.utils import is_embargoed, getSiloModifiedDate
from rdfdatabank.lib.auth_entry import list_silos, get_datasets_count
from rdfdatabank.lib.conneg import MimeType as MT, parse as conneg_parse

JAILBREAK = re.compile("[\/]*\.\.[\/]*")
Expand All @@ -54,6 +54,17 @@ def index(self):
abort(401, "Not Authorised")
c.silos = ag.authz(ident)

c.silo_infos = {}
for silo in c.silos:
c.silo_infos[silo] = []
state_info = ag.granary.describe_silo(silo)
if 'title' in state_info and state_info['title']:
c.silo_infos[silo].append(state_info['title'])
else:
c.silo_infos[silo].append(silo)
c.silo_infos[silo].append(get_datasets_count(silo))
c.silo_infos[silo].append(getSiloModifiedDate(silo))

# conneg return
accept_list = None
if 'HTTP_ACCEPT' in request.environ:
Expand Down Expand Up @@ -103,7 +114,10 @@ def siloview(self, silo):
if silo in ['ww1archives', 'digitalbooks']:
abort(501, "The silo %s contains too many data packages to list"%silo)

rdfsilo = ag.granary.get_rdf_silo(silo)
rdfsilo = ag.granary.get_rdf_silo(silo)
state_info = ag.granary.describe_silo(silo)
if 'title' in state_info and state_info['title']:
c.title = state_info['title']
c.embargos = {}
c.items = []
for item in rdfsilo.list_items():
Expand Down
Loading

0 comments on commit a3fca25

Please sign in to comment.