diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 7ee712a..ea58b5d 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -61,6 +61,12 @@ jobs: run: | pip install -U pytest-rerunfailures if: ${{ matrix.ckan-version == '2.9' }} + - name: Setup other extensions + run: | + git clone https://github.com/OpenGov-OpenData/ckanext-harvest + pip install -e ckanext-harvest + pip install -r ckanext-harvest/requirements.txt + pip install -r ckanext-harvest/dev-requirements.txt - name: Setup extension run: | ckan -c test.ini db init diff --git a/ckanext/custom_harvest/configuration_processors.py b/ckanext/custom_harvest/configuration_processors.py index 3e702f9..266cf3e 100644 --- a/ckanext/custom_harvest/configuration_processors.py +++ b/ckanext/custom_harvest/configuration_processors.py @@ -369,8 +369,8 @@ def modify_package_dict(package_dict, config, source_dict): target_email = contact_point_mapping.get('target_email') # Get contact point name - contact_point_name = contact_point_mapping.get('default_name') - if source_name.startswith('extras.'): + contact_point_name = '' + if source_name and source_name.startswith('extras.'): source_extra = get_extra(source_name[7:], source_dict) if source_extra: contact_point_name = source_extra.get('value') @@ -395,8 +395,8 @@ def modify_package_dict(package_dict, config, source_dict): package_dict['extras'].remove(existing_extra) # Get contact point email - contact_point_email = contact_point_mapping.get('default_email') - if source_email.startswith('extras.'): + contact_point_email = '' + if source_email and source_email.startswith('extras.'): source_extra = get_extra(source_email[7:], source_dict) if source_extra: contact_point_email = source_extra.get('value') diff --git a/ckanext/custom_harvest/harvesters/base.py b/ckanext/custom_harvest/harvesters/base.py index f98c95f..1bdf553 100644 --- a/ckanext/custom_harvest/harvesters/base.py +++ b/ckanext/custom_harvest/harvesters/base.py @@ 
-1,16 +1,8 @@ -import os import logging -import six -import requests -import rdflib - from ckan import plugins as p from ckan import model -from ckantoolkit import config -import ckan.plugins.toolkit as toolkit - from ckanext.harvest.harvesters import HarvesterBase from ckanext.harvest.model import HarvestObject diff --git a/ckanext/custom_harvest/harvesters/package_search.py b/ckanext/custom_harvest/harvesters/package_search.py index fabd687..6028ada 100644 --- a/ckanext/custom_harvest/harvesters/package_search.py +++ b/ckanext/custom_harvest/harvesters/package_search.py @@ -29,7 +29,7 @@ class PackageSearchHarvester(CustomHarvester): def info(self): return { - 'name': 'package_search_harvester', + 'name': 'package_search_harvest', 'title': 'CKAN Package Search', 'description': 'Harvester for CKAN instances utilizing the package_search API', 'form_config_interface': 'Text' @@ -359,7 +359,7 @@ def import_stage(self, harvest_object): # Upload tabular resources to datastore upload_to_datastore = self.config.get('upload_to_datastore', True) - if upload_to_datastore: + if upload_to_datastore and p.get_plugin('xloader'): # Get package dict again in case there's new resource ids pkg_dict = p.toolkit.get_action('package_show')(context, {'id': package_id}) upload_resources_to_datastore(context, pkg_dict, source_dict, base_search_url) diff --git a/ckanext/custom_harvest/tests/conftest.py b/ckanext/custom_harvest/tests/conftest.py new file mode 100644 index 0000000..9419c16 --- /dev/null +++ b/ckanext/custom_harvest/tests/conftest.py @@ -0,0 +1,9 @@ +import pytest + +import ckan.plugins as p + +@pytest.fixture +def clean_db(reset_db, migrate_db_for): + reset_db() + if p.get_plugin('harvest'): + migrate_db_for('harvest') diff --git a/ckanext/custom_harvest/tests/harvesters/__init__.py b/ckanext/custom_harvest/tests/harvesters/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ckanext/custom_harvest/tests/harvesters/mock_ckan.py 
b/ckanext/custom_harvest/tests/harvesters/mock_ckan.py new file mode 100644 index 0000000..a26d7b3 --- /dev/null +++ b/ckanext/custom_harvest/tests/harvesters/mock_ckan.py @@ -0,0 +1,219 @@ +from __future__ import print_function + +import json +import re +import copy +from urllib.parse import unquote_plus + +from threading import Thread + +from http.server import SimpleHTTPRequestHandler +from socketserver import TCPServer + + +PORT = 8998 + + +class MockCkanHandler(SimpleHTTPRequestHandler): + def do_GET(self): + # test name is the first bit of the URL and makes CKAN behave + # differently in some way. + # Its value is recorded and then removed from the path + self.test_name = None + test_name_match = re.match('^/([^/]+)/', self.path) + if test_name_match: + self.test_name = test_name_match.groups()[0] + if self.test_name == 'api': + self.test_name = None + else: + self.path = re.sub('^/([^/]+)/', '/', self.path) + if self.test_name == 'site_down': + return self.respond('Site is down', status=500) + + # The API version is recorded and then removed from the path + api_version = None + version_match = re.match(r'^/api/(\d)', self.path) + if version_match: + api_version = int(version_match.groups()[0]) + self.path = re.sub(r'^/api/(\d)/', '/api/', self.path) + + if self.path == '/api/action/package_list': + dataset_names = [d['name'] for d in DATASETS] + return self.respond_action(dataset_names) + if self.path.startswith('/api/action/package_show'): + params = self.get_url_params() + dataset_ref = params['id'] + dataset = self.get_dataset(dataset_ref) + if dataset: + return self.respond_action(dataset) + # /api/3/action/package_search?fq=metadata_modified:[2015-10-23T14:51:13.282361Z TO *]&rows=1000 + if self.path.startswith('/api/action/package_search'): + params = self.get_url_params() + + # ignore sort param for now + if 'sort' in params: + del params['sort'] + if params['start'] != '0': + datasets = [] + elif set(params.keys()) == set(['rows', 'start']): + 
datasets = ['dataset1', DATASETS[1]['name']] + elif set(params.keys()) == set(['fq', 'rows', 'start']) and \ + params['fq'] == '-organization:org1': + datasets = [DATASETS[1]['name']] + elif set(params.keys()) == set(['fq', 'rows', 'start']) and \ + params['fq'] == 'organization:org1': + datasets = ['dataset1'] + elif set(params.keys()) == set(['fq', 'rows', 'start']) and \ + params['fq'] == '-groups:group1': + datasets = [DATASETS[1]['name']] + elif set(params.keys()) == set(['fq', 'rows', 'start']) and \ + params['fq'] == 'groups:group1': + datasets = ['dataset1'] + elif set(params.keys()) == set(['fq', 'rows', 'start']) and \ + 'metadata_modified' in params['fq']: + assert '+TO+' not in params['fq'], \ + 'Spaces should not be decoded by now - seeing + '\ + 'means they were double encoded and SOLR doesnt like '\ + 'that' + datasets = [DATASETS[1]['name']] + elif set(params.keys()) == set(['tags', 'rows', 'start']) and \ + params['tags'] == 'test-tag': + datasets = [DATASETS[0]['name'], DATASETS[1]['name']] + else: + return self.respond( + 'Not implemented search params %s' % params, + status=400) + + out = {'count': len(datasets), + 'results': [self.get_dataset(dataset_ref_) + for dataset_ref_ in datasets]} + return self.respond_action(out) + + # if we wanted to serve a file from disk, then we'd call this: + # return SimpleHTTPRequestHandler.do_GET(self) + + self.respond('Mock CKAN doesnt recognize that call', status=400) + + def get_dataset(self, dataset_ref): + for dataset in DATASETS: + if dataset['name'] == dataset_ref or \ + dataset['id'] == dataset_ref: + return dataset + + def get_url_params(self): + params_str = self.path.split('?')[-1] + params_unicode = unquote_plus(params_str) + params = params_unicode.split('&') + return dict([param.split('=', 1) for param in params]) + + def respond_action(self, result_dict, status=200): + response_dict = {'result': result_dict, 'success': True} + return self.respond_json(response_dict, status=status) 
+ + def respond_json(self, content_dict, status=200): + return self.respond(json.dumps(content_dict), status=status, + content_type='application/json') + + def respond(self, content, status=200, content_type='application/json'): + self.send_response(status) + self.send_header('Content-Type', content_type) + self.end_headers() + self.wfile.write(content.encode('utf-8')) + self.wfile.close() + + +def serve(port=PORT): + '''Runs a CKAN-alike app (over HTTP, on ``port``, in a daemon thread) that is used for harvesting tests''' + + # Choose the directory to serve files from + # os.chdir(os.path.join(os.path.dirname(os.path.abspath(__file__)), + # 'mock_ckan_files')) + + class TestServer(TCPServer): + allow_reuse_address = True + + httpd = TestServer(('', port), MockCkanHandler) + + print('Serving test HTTP server at port {}'.format(port)) + + httpd_thread = Thread(target=httpd.serve_forever) + httpd_thread.daemon = True + httpd_thread.start() + + +def convert_dataset_to_restful_form(dataset): + dataset = copy.deepcopy(dataset) + dataset['extras'] = dict([(e['key'], e['value']) for e in dataset['extras']]) + dataset['tags'] = [t['name'] for t in dataset.get('tags', [])] + return dataset + + +# Datasets are in the package_show form, rather than the RESTful form +DATASETS = [ + { + 'id': 'dataset1-id', + 'name': 'dataset1', + 'title': 'Test Dataset1', + 'organization': { + 'id': '0f8380d6-241a-47de-aa52-8bd91c763d97', + 'name': 'org1', + 'title': 'Test Org1' + }, + 'owner_org': '0f8380d6-241a-47de-aa52-8bd91c763d97', + 'tags': [ + { + 'name': 'test-tag' + } + ], + 'groups': [ + { + 'id': '10037fa4-e683-4a67-892a-efba815e24ad', + 'name': 'group1', + 'title': 'Test Group1' + } + ], + 'resources': [ + { + 'id': 'resource1-id', + 'name': 'Test Resource 1', + 'url': 'http://test.gov/test1.csv', + 'format': 'CSV', + 'position': 0 + } + ], + 'extras': [] + }, + { + 'id': 'dataset2-id', + 'name': 'dataset2', + 'title': 'Test Dataset2', + 'organization': { + 'id': 'aa1e068a-23da-4563-b9c2-2cad272b663e', + 
'name': 'org2', + 'title': 'Test Org2' + }, + 'owner_org': 'aa1e068a-23da-4563-b9c2-2cad272b663e', + 'tags': [ + { + 'name': 'test-tag' + } + ], + 'groups': [ + { + 'id': '9853c3e1-eebb-4e8c-9ae7-1668a01bf2ca', + 'name': 'group2', + 'title': 'Test Group2' + } + ], + 'resources': [ + { + 'id': 'resource2-id', + 'name': 'Test Resource 2', + 'url': 'http://test.gov/test2.csv', + 'format': 'CSV', + 'position': 0 + } + ], + 'extras': [] + } +] \ No newline at end of file diff --git a/ckanext/custom_harvest/tests/harvesters/test_package_search_harvester.py b/ckanext/custom_harvest/tests/harvesters/test_package_search_harvester.py new file mode 100644 index 0000000..3ca2065 --- /dev/null +++ b/ckanext/custom_harvest/tests/harvesters/test_package_search_harvester.py @@ -0,0 +1,170 @@ +from __future__ import absolute_import + +import json +import pytest + +from ckantoolkit.tests.factories import Organization + +from ckanext.harvest.tests.factories import (HarvestSourceObj, HarvestJobObj, + HarvestObjectObj) +from ckanext.harvest.tests.lib import run_harvest_job +import ckanext.harvest.model as harvest_model + +from ckanext.custom_harvest.harvesters.package_search import copy_across_resource_ids, PackageSearchHarvester +from ckanext.custom_harvest.tests.harvesters import mock_ckan + + +# Start CKAN-alike server we can test harvesting against it +mock_ckan.serve() + + +@pytest.mark.usefixtures('with_plugins', 'clean_db', 'clean_index') +class TestPackageSearchHarvester(object): + + def test_gather_normal(self): + source = HarvestSourceObj( + url='http://localhost:%s/api/action/package_search?tags=test-tag' % mock_ckan.PORT + ) + job = HarvestJobObj(source=source) + + harvester = PackageSearchHarvester() + obj_ids = harvester.gather_stage(job) + + assert job.gather_errors == [] + assert isinstance(obj_ids, list) + assert len(obj_ids) == len(mock_ckan.DATASETS) + harvest_object = harvest_model.HarvestObject.get(obj_ids[0]) + assert harvest_object.guid == 
mock_ckan.DATASETS[0]['name'] + assert json.loads(harvest_object.content) == mock_ckan.DATASETS[0] + + def test_fetch_normal(self): + source = HarvestSourceObj( + url='http://localhost:%s/api/action/package_search?tags=test-tag' % mock_ckan.PORT + ) + job = HarvestJobObj(source=source) + harvest_object = HarvestObjectObj( + guid=mock_ckan.DATASETS[0]['name'], + job=job, + content=json.dumps(mock_ckan.DATASETS[0])) + + harvester = PackageSearchHarvester() + result = harvester.fetch_stage(harvest_object) + + assert harvest_object.errors == [] + assert result is True + + def test_import_normal(self): + org = Organization() + harvest_object = HarvestObjectObj( + guid=mock_ckan.DATASETS[0]['name'], + content=json.dumps(mock_ckan.DATASETS[0]), + job__source__owner_org=org['id']) + + harvester = PackageSearchHarvester() + result = harvester.import_stage(harvest_object) + + assert harvest_object.errors == [] + assert result is True + assert harvest_object.guid + + def test_harvest(self): + source = HarvestSourceObj( + url='http://localhost:%s/api/action/package_search?tags=test-tag' % mock_ckan.PORT, + config='', + source_type='test' + ) + job = HarvestJobObj(source=source, run=False) + results_by_guid = run_harvest_job(job, PackageSearchHarvester()) + + result = results_by_guid[mock_ckan.DATASETS[0]['name']] + assert result['state'] == 'COMPLETE' + assert result['errors'] == [] + + result = results_by_guid[mock_ckan.DATASETS[1]['name']] + assert result['state'] == 'COMPLETE' + assert result['errors'] == [] + + +class TestCopyAcrossResourceIds(object): + def test_copied_because_same_name_url_format(self): + harvested_dataset = {'resources': [ + {'name': 'abc', 'url': 'http://abc', 'format': 'csv'}]} + copy_across_resource_ids({'resources': [ + {'name': 'abc', 'url': 'http://abc', 'format': 'csv', 'id': '1'}]}, + harvested_dataset, + ) + assert harvested_dataset['resources'][0].get('id') == '1' + assert harvested_dataset['resources'][0].get('url') == 'http://abc' + + def 
test_copied_because_same_url(self): + harvested_dataset = {'resources': [ + {'url': 'http://abc'}]} + copy_across_resource_ids({'resources': [ + {'url': 'http://abc', 'id': '1'}]}, + harvested_dataset, + ) + assert harvested_dataset['resources'][0].get('id') == '1' + + def test_copied_with_same_url_and_changed_name(self): + harvested_dataset = {'resources': [ + {'url': 'http://abc', 'name': 'link updated'}]} + copy_across_resource_ids({'resources': [ + {'url': 'http://abc', 'name': 'link', 'id': '1'}]}, + harvested_dataset, + ) + assert harvested_dataset['resources'][0].get('id') == '1' + + def test_copied_with_repeated_urls_but_unique_names(self): + harvested_dataset = {'resources': [ + {'url': 'http://abc', 'name': 'link1'}, + {'url': 'http://abc', 'name': 'link5'}, + {'url': 'http://abc', 'name': 'link3'}, + {'url': 'http://abc', 'name': 'link2'}, + {'url': 'http://abc', 'name': 'link4'}, + {'url': 'http://abc', 'name': 'link new'}, + ]} + copy_across_resource_ids({'resources': [ + {'url': 'http://abc', 'name': 'link1', 'id': '1'}, + {'url': 'http://abc', 'name': 'link2', 'id': '2'}, + {'url': 'http://abc', 'name': 'link3', 'id': '3'}, + {'url': 'http://abc', 'name': 'link4', 'id': '4'}, + {'url': 'http://abc', 'name': 'link5', 'id': '5'}, + ]}, + harvested_dataset, + ) + assert ([(r.get('id'), r['name']) for r in harvested_dataset['resources']] == + [('1', 'link1'), ('5', 'link5'), ('3', 'link3'), ('2', 'link2'), + ('4', 'link4'), (None, 'link new')]) + + def test_copied_with_keeping_existing_resources(self): + existing_dataset = {'resources': [ + {'url': 'http://abc1', 'name': 'link 1', 'id': '1'}, + {'url': 'http://abc2', 'name': 'link 2', 'id': '2'}, + {'url': 'http://abc3', 'name': 'link 3', 'id': '3'}, + {'url': 'http://abc4', 'name': 'link 4', 'id': '4'}, + {'url': 'http://abc5', 'name': 'link 5', 'id': '5'}, + ]} + harvested_dataset = {'resources': [ + {'url': 'http://abc1', 'name': 'link 1'}, + {'url': 'http://abc2', 'name': 'link 2'}, + {'url': 
'http://abc3', 'name': 'link 3'}, + {'url': 'http://abc4', 'name': 'link 4'}, + {'url': 'http://abc00', 'name': 'new link'}, + ]} + copy_across_resource_ids( + existing_dataset=existing_dataset, + harvested_dataset=harvested_dataset, + config={'keep_existing_resources': True} + ) + assert ([(r.get('id'), r['name']) for r in harvested_dataset['resources']] == + [('1', 'link 1'), ('2', 'link 2'), ('3', 'link 3'), ('4', 'link 4'), + (None, 'new link'), ('5', 'link 5')]) + + def test_not_copied_because_completely_different(self): + harvested_dataset = {'resources': [ + {'url': 'http://def', 'name': 'link other'}]} + copy_across_resource_ids({'resources': [ + {'url': 'http://abc', 'name': 'link', 'id': '1'}]}, + harvested_dataset, + ) + assert harvested_dataset['resources'][0].get('id') is None diff --git a/dev-requirements.txt b/dev-requirements.txt index eac82b4..fd7d9cb 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1 +1,2 @@ -pytest-ckan +mock +pytest-ckan \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index e69de29..0253f05 100644 --- a/requirements.txt +++ b/requirements.txt @@ -0,0 +1 @@ +ckantoolkit>=0.0.7 \ No newline at end of file diff --git a/test.ini b/test.ini index 91cf696..5fdb643 100644 --- a/test.ini +++ b/test.ini @@ -8,7 +8,7 @@ use = config:../ckan/test-core.ini # Insert any custom config settings to be used when running your extension's # tests here. These will override the one defined in CKAN core's test-core.ini -ckan.plugins = custom_harvest +ckan.plugins = harvest test_harvester custom_harvest # Logging configuration