diff --git a/jest.config.js b/jest.config.js index 21b892a8..ec3ecb53 100644 --- a/jest.config.js +++ b/jest.config.js @@ -10,10 +10,10 @@ export default { coverageReporters: ['json-summary', 'text'], coverageThreshold: { global: { - lines: 71.01, - statements: 70.97, - branches: 64.56, - functions: 68.75, + lines: 71.7, + statements: 71.73, + branches: 63.3, + functions: 69.72, }, }, transform: { diff --git a/shacl/register.ttl b/shacl/register.ttl index ec9fd5fc..7aa5ac64 100644 --- a/shacl/register.ttl +++ b/shacl/register.ttl @@ -320,15 +320,66 @@ dcat:DatasetShape sh:property [ sh:minCount 1 ; sh:path dc:title - ], [ + ], + [ sh:path dc:alternative - ], [ - sh:path dc:description - ], [ + ], + [ + sh:path dc:description ; + sh:minCount 1 ; + sh:severity sh:Info ; + sh:message "Dataset kan een beschrijving hebben"@nl, "Dataset should have a description"@en ; + ], + [ sh:minCount 1 ; sh:maxCount 1 ; sh:path dc:license ], + [ + sh:path dc:created ; + sh:minCount 1 ; + sh:maxCount 1 ; + sh:severity sh:Info ; + sh:message "Dataset kan een aanmaakdatum hebben"@nl, "Dataset should have a creation date"@en ; + ], + [ + sh:path dc:issued ; + sh:minCount 1 ; + sh:maxCount 1 ; + sh:severity sh:Info ; + sh:message "Dataset kan een uitgavedatum hebben"@nl, "Dataset should have an issued date"@en ; + ], + [ + sh:path dc:modified ; + sh:minCount 1 ; + sh:maxCount 1 ; + sh:severity sh:Info ; + sh:message "Dataset kan een laatste wijzigingsdatum hebben"@nl, "Dataset should have a last modified date"@en ; + ], + [ + sh:path dc:keyword ; + sh:minCount 1 ; + sh:severity sh:Info ; + sh:message "Dataset kan trefwoorden hebben"@nl, "Dataset should have keywords"@en ; + ], + [ + sh:path dc:spatial ; + sh:minCount 1 ; + sh:severity sh:Info ; + sh:message "Dataset kan een gebiedsaanduiding hebben"@nl, "Dataset should have spatial coverage"@en ; + ], + [ + sh:path dc:temporal ; + sh:minCount 1 ; + sh:severity sh:Info ; + sh:message "Dataset kan een tijdsaanduiding hebben"@nl, "Dataset 
should have temporal coverage"@en ; + ], + [ + sh:path dc:language ; + sh:minCount 1 ; + sh:severity sh:Info ; + sh:message "Dataset kan een taal hebben"@nl, "Dataset should have a language"@en ; + ], [ sh:path dc:publisher ; sh:minCount 1 ; @@ -346,6 +397,8 @@ dcat:DatasetShape ], [ sh:path dc:creator ; + sh:minCount 1 ; + sh:severity sh:Info ; sh:or ( [ sh:class foaf:Organization ; @@ -356,11 +409,15 @@ dcat:DatasetShape sh:node dcat:PersonShape ; ] ) ; + sh:message "Dataset kan een maker hebben"@nl, "Dataset should have a creator"@en ; ], [ sh:path dcat:distribution ; sh:class dcat:Distribution ; - sh:node dcat:DistributionShape + sh:node dcat:DistributionShape ; + sh:minCount 1 ; + sh:severity sh:Info ; + sh:message "Dataset moet een distributie hebben"@nl, "Dataset should have a distribution"@en ; ] ; sh:targetClass dcat:Dataset ; . diff --git a/src/crawler.ts b/src/crawler.ts index 30368726..b0dda2e8 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -1,15 +1,17 @@ import {RegistrationStore} from './registration.js'; -import {DatasetStore, extractIris} from './dataset.js'; +import {DatasetStore, extractIri, extractIris} from './dataset.js'; import {dereference, fetch, HttpError, NoDatasetFoundAtUrl} from './fetch.js'; import DatasetExt from 'rdf-ext/lib/Dataset'; import Pino from 'pino'; -import {Validator} from './validator.js'; +import {Valid, Validator} from './validator.js'; import {crawlCounter} from './instrumentation.js'; +import {rate, RatingStore} from './rate.js'; export class Crawler { constructor( private registrationStore: RegistrationStore, private datasetStore: DatasetStore, + private ratingStore: RatingStore, private validator: Validator, private logger: Pino.Logger ) {} @@ -28,10 +30,16 @@ export class Crawler { try { const data = await dereference(registration.url); - isValid = (await this.validator.validate(data)).state === 'valid'; + const validationResult = await this.validator.validate(data); + isValid = validationResult.state === 
'valid'; if (isValid) { datasets = await fetch(registration.url); await this.datasetStore.store(datasets); + await Promise.all(datasets.map(async dataset => { + const dcatValidationResult = await this.validator.validate(dataset); + const rating = rate(dcatValidationResult as Valid); + await this.ratingStore.store(extractIri(dataset), rating); + })); } } catch (e) { if (e instanceof HttpError) { diff --git a/src/dataset.ts b/src/dataset.ts index dd747475..7bea3c0d 100644 --- a/src/dataset.ts +++ b/src/dataset.ts @@ -7,6 +7,7 @@ import {Readable, Transform} from 'stream'; import {StreamParser} from 'n3'; import {JsonLdParser} from 'jsonld-streaming-parser'; import {StandardizeSchemaOrgPrefixToHttps} from './transform.js'; +import {DatasetCore} from 'rdf-js'; export interface DatasetStore { /** @@ -19,17 +20,20 @@ export interface DatasetStore { countOrganisations(): Promise<number>; } +export function extractIri(dataset: DatasetCore): URL { + const quad = [ + ...dataset.match( + null, + factory.namedNode('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), + datasetType + ), + ][0]; + return new URL(quad.subject.value); +} + export function extractIris(datasets: DatasetExt[]): Map<URL, DatasetExt> { return datasets.reduce((map, dataset) => { - const quad = [ - ...dataset.match( - null, - factory.namedNode('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), - datasetType - ), - ][0]; - const url = new URL(quad.subject.value); - map.set(url, dataset); + map.set(extractIri(dataset), dataset); return map; }, new Map()); } diff --git a/src/graphdb.ts b/src/graphdb.ts index d697afc5..172a9676 100644 --- a/src/graphdb.ts +++ b/src/graphdb.ts @@ -11,6 +11,7 @@ import { RegistrationStore, } from './registration.js'; import {DatasetStore, extractIris} from './dataset.js'; +import {Rating, RatingStore} from './rate.js'; export type SparqlResult = { results: { @@ -62,22 +63,24 @@ export class GraphDbClient { this.token = response.headers.get('Authorization')!; } - public async request( - method: string, - url: string, - body?: 
string, - accept?: string - ): Promise { + public async request(options: { + method: string; + url?: string; + body?: string; + accept?: string; + contentType?: string; + }): Promise { const headers = await this.getHeaders(); - headers.set('Content-Type', 'application/x-trig'); - if (accept) { - headers.set('Accept', accept); + headers.set('Content-Type', options.contentType ?? 'application/x-trig'); + if (options.accept) { + headers.set('Accept', options.accept); } - const repositoryUrl = this.url + '/repositories/' + this.repository + url; + const repositoryUrl = + this.url + '/repositories/' + this.repository + options.url; const response = await fetch(repositoryUrl, { - method: method, + method: options.method, headers: headers, - body: body, + body: options.body, }); if ( // 409 = `Auth token hash mismatch`, which occurs after GraphDB has restarted. @@ -87,12 +90,17 @@ export class GraphDbClient { ) { this.token = undefined; // Retry original request. - await this.request(method, url, body); + await this.request(options); } if (!response.ok) { console.error( - 'HTTP error ' + response.status + ' for ' + method + ' ' + repositoryUrl + 'HTTP error ' + + response.status + + ' for ' + + options.method + + ' ' + + repositoryUrl ); } @@ -100,16 +108,30 @@ export class GraphDbClient { } public async query(query: string): Promise { - const response = await this.request( - 'GET', - '?' + querystring.stringify({query}), - undefined, - 'application/sparql-results+json' - ); + const response = await this.request({ + method: 'GET', + url: '?' 
+ querystring.stringify({query}), + accept: 'application/sparql-results+json', + }); return (await response.json()) as SparqlResult; } + public async update(payload: string): Promise { + const response = await this.request({ + method: 'POST', + url: '/statements', + body: payload, + contentType: 'application/sparql-update', + }); + + if (!response.ok) { + console.error( + `${response.status} response for SPARQL update ${payload})` + ); + } + } + private async getHeaders(): Promise { if (this.username === undefined || this.password === undefined) { return new Headers(); @@ -223,15 +245,20 @@ export class GraphDbRegistrationStore implements RegistrationStore { return new Promise((resolve, reject) => { getWriter(quads).end(async (error, result) => { try { - await this.client.request( - 'DELETE', - '/statements?' + + await this.client.request({ + method: 'DELETE', + url: + '/statements?' + querystring.stringify({ subj: '<' + registration.url.toString() + '>', context: '<' + this.registrationsGraph + '>', - }) - ); - await this.client.request('POST', '/statements', result); + }), + }); + await this.client.request({ + method: 'POST', + url: '/statements', + body: result, + }); resolve(null); } catch (e) { reject(e); @@ -346,12 +373,13 @@ export class GraphDbDatasetStore implements DatasetStore { async (error, result) => { try { resolve( - await this.client.request( - 'PUT', - '/rdf-graphs/service?graph=' + + await this.client.request({ + method: 'PUT', + url: + '/rdf-graphs/service?graph=' + encodeURIComponent(graphIri.toString()), - result - ) + body: result, + }) ); } catch (e) { reject(e); @@ -368,3 +396,37 @@ function getWriter(quads: Quad[]): Writer { return writer; } + +export class GraphDbRatingStore implements RatingStore { + private readonly graph = + 'https://data.netwerkdigitaalerfgoed.nl/registry/ratings'; + + constructor(private readonly client: GraphDbClient) {} + async store(datasetUri: URL, rating: Rating): Promise { + await this.client.update(` + PREFIX 
schema: <https://schema.org/> + + WITH <${this.graph}> + DELETE { + ?dataset schema:contentRating ?rating . + ?rating ?p ?o . + } + WHERE { + BIND(<${datasetUri}> as ?dataset) + ?dataset schema:contentRating ?rating . + ?rating ?p ?o . + }; + + WITH <${this.graph}> + INSERT { + <${datasetUri}> schema:contentRating [ + schema:bestRating ${rating.bestRating} ; + schema:worstRating ${rating.worstRating} ; + schema:ratingValue ${rating.score} ; + schema:ratingExplanation "${rating.explanation}" ; + ] + } + WHERE {} + `); + } +} diff --git a/src/main.ts b/src/main.ts index e6b04812..60a28617 100644 --- a/src/main.ts +++ b/src/main.ts @@ -2,6 +2,7 @@ import { GraphDbAllowedRegistrationDomainStore, GraphDbClient, GraphDbDatasetStore, + GraphDbRatingStore, GraphDbRegistrationStore, } from './graphdb.js'; import {readUrl, ShaclValidator} from './validator.js'; @@ -26,6 +27,7 @@ const client = new GraphDbClient( const logger = Pino(); const datasetStore = new GraphDbDatasetStore(client); const registrationStore = new GraphDbRegistrationStore(client); + const ratingStore = new GraphDbRatingStore(client); const allowedRegistrationDomainStore = new GraphDbAllowedRegistrationDomainStore(client); await startInstrumentation(datasetStore); @@ -34,6 +36,7 @@ const client = new GraphDbClient( const crawler = new Crawler( registrationStore, datasetStore, + ratingStore, validator, logger ); diff --git a/src/rate.ts b/src/rate.ts new file mode 100644 index 00000000..1aaf183d --- /dev/null +++ b/src/rate.ts @@ -0,0 +1,80 @@ +import {shacl, Valid} from './validator.js'; +import {dcat, dct} from './query.js'; + +const penalties = new Map([ + [[dct('description')], 20], + [[dcat('distribution')], 20], + [[dct('creator')], 10], + [[dct('created'), dct('issued')], 10], + [[dct('modified')], 5], + [[dct('keyword'), dct('spatial'), dct('temporal')], 5], + [[dct('language')], 5], +]); + +const worstRating = [...penalties].reduce( + (score, [, penalty]) => score - penalty, + 100 +); + +export function 
rate(validationResult: Valid): Rating { + const violations: Map<string, string> = [...validationResult.errors].reduce( + (map, quad) => { + if (quad.predicate.equals(shacl('resultPath'))) { + map.set( + quad.object.value, + [ + ...validationResult.errors.match( + quad.subject, + shacl('resultMessage') + ), + ][0]?.value + ); + } + return map; + }, + new Map<string, string>() + ); + + const appliedPenalties = [...penalties].reduce( + (appliedPenalties, [properties, penalty]) => { + for (const property of properties) { + if (!violations.has(property.value)) { + return appliedPenalties; + } + } + + return [...appliedPenalties, new Penalty(properties[0].value, penalty)]; + }, + new Array<Penalty>() + ); + + return new Rating(appliedPenalties, worstRating); +} + +export class Penalty { + public constructor( + public readonly path: string, + public readonly score: number + ) {} +} + +export class Rating { + public readonly score: number; + public readonly explanation: string; + + public constructor( + private readonly penalties: Penalty[], + public readonly worstRating: number, + public readonly bestRating = 100 + ) { + this.score = penalties.reduce( + (score, penalty) => score - penalty.score, + 100 + ); + this.explanation = penalties.map(penalty => penalty.path).join(', '); + } +} + +export interface RatingStore { + store(datasetUri: URL, rating: Rating): Promise<void>; +} diff --git a/test/crawler.test.ts b/test/crawler.test.ts index 0fdba9e1..39b42021 100644 --- a/test/crawler.test.ts +++ b/test/crawler.test.ts @@ -1,19 +1,28 @@ import {Registration} from '../src/registration'; import {Crawler} from '../src/crawler'; import {URL} from 'url'; -import {file, MockDatasetStore, MockRegistrationStore} from './mock'; +import { + file, + MockDatasetStore, + MockRatingStore, + MockRegistrationStore, +} from './mock'; import nock from 'nock'; import Pino from 'pino'; import {Validator} from '../src/validator'; import factory from 'rdf-ext'; +import {DatasetCore} from 'rdf-js'; let registrationStore: MockRegistrationStore; 
let crawler: Crawler; -const validator = (isValid: boolean): Validator => ({ +const validator = ( + isValid: boolean, + errors: DatasetCore = factory.dataset() +): Validator => ({ validate: () => Promise.resolve({ state: isValid ? 'valid' : 'invalid', - errors: factory.dataset(), + errors, }), }); @@ -23,6 +32,7 @@ describe('Crawler', () => { crawler = new Crawler( registrationStore, new MockDatasetStore(), + new MockRatingStore(), validator(true), Pino({enabled: false}) ); @@ -46,6 +56,24 @@ describe('Crawler', () => { ]); }); + it('crawls valid URL with minimal description', async () => { + storeRegistrationFixture(new URL('https://example.com/minimal')); + + const response = await file('dataset-schema-org-valid-minimal.jsonld'); + nock('https://example.com') + .defaultReplyHeaders({'Content-Type': 'application/ld+json'}) + .get('/minimal') + .times(2) + .reply(200, response); + await crawler.crawl(new Date('3000-01-01')); + + const readRegistration = registrationStore.all()[0]; + expect(readRegistration.statusCode).toBe(200); + expect(readRegistration.datasets).toEqual([ + new URL('http://data.bibliotheken.nl/id/dataset/rise-alba'), + ]); + }); + it('stores error HTTP response status code', async () => { storeRegistrationFixture(new URL('https://example.com/registered-url')); @@ -74,6 +102,7 @@ describe('Crawler', () => { crawler = new Crawler( registrationStore, new MockDatasetStore(), + new MockRatingStore(), validator(false), Pino({enabled: false}) ); diff --git a/test/datasets/dataset-dcat-valid-minimal.jsonld b/test/datasets/dataset-dcat-valid-minimal.jsonld new file mode 100644 index 00000000..2aa3feef --- /dev/null +++ b/test/datasets/dataset-dcat-valid-minimal.jsonld @@ -0,0 +1,16 @@ +{ + "@context": { + "dcat": "http://www.w3.org/ns/dcat#", + "dct": "http://purl.org/dc/terms/", + "foaf": "http://xmlns.com/foaf/0.1/" + }, + "@type": "dcat:Dataset", + "@id": "http://data.bibliotheken.nl/id/dataset/rise-alba", + "dct:title": "Alba amicorum van de 
Koninklijke Bibliotheek", + "dct:license": "http://creativecommons.org/publicdomain/zero/1.0/", + "dct:publisher": { + "@id": "https://example.com/dataset-provider", + "@type": "foaf:Person", + "foaf:name": "Dataset Provider" + } +} diff --git a/test/datasets/dataset-dcat-valid.jsonld b/test/datasets/dataset-dcat-valid.jsonld index 46f058e6..35c83931 100644 --- a/test/datasets/dataset-dcat-valid.jsonld +++ b/test/datasets/dataset-dcat-valid.jsonld @@ -7,16 +7,25 @@ "@type": "dcat:Dataset", "@id": "http://data.bibliotheken.nl/id/dataset/rise-alba", "dct:title": "Alba amicorum van de Koninklijke Bibliotheek", - "dct:identifier": "http://data.bibliotheken.nl/id/dataset/rise-alba", + "dct:description": "Just some dataset", "dct:keyword": [ "alba amicorum" ], "dct:license": "http://creativecommons.org/publicdomain/zero/1.0/", + "dct:created": "2021-05-27", + "dct:issued": "2021-06-27", + "dct:modified": "2021-07-27", "dct:publisher": { "@id": "https://example.com/dataset-provider", "@type": "foaf:Person", "foaf:name": "Dataset Provider" }, + "dct:creator": { + "@type": "foaf:Organization", + "@id": "https://example.com/dataset-creator", + "foaf:name": "Dataset creator" + }, + "dct:language": "nl-NL", "dcat:distribution": [ { "@type": "dcat:Distribution", diff --git a/test/datasets/dataset-schema-org-valid-plus-organization.jsonld b/test/datasets/dataset-schema-org-valid-plus-organization.jsonld index 1386bec4..0a16ff38 100644 --- a/test/datasets/dataset-schema-org-valid-plus-organization.jsonld +++ b/test/datasets/dataset-schema-org-valid-plus-organization.jsonld @@ -55,4 +55,4 @@ "addressCountry": "NL" } } -] \ No newline at end of file +] diff --git a/test/mock.ts b/test/mock.ts index 871e6dfa..d6219d3d 100644 --- a/test/mock.ts +++ b/test/mock.ts @@ -7,6 +7,7 @@ import {URL} from 'url'; import {DatasetStore} from '../src/dataset'; import fs from 'fs'; import DatasetExt from 'rdf-ext/lib/Dataset'; +import {Rating, RatingStore} from '../src/rate'; export class 
MockRegistrationStore implements RegistrationStore { private readonly registrations: Map<URL, Registration> = new Map(); @@ -61,5 +62,12 @@ export class MockDatasetStore implements DatasetStore { } } +export class MockRatingStore implements RatingStore { + public readonly ratings: Rating[] = []; + async store(datasetUri: URL, rating: Rating): Promise<void> { + this.ratings.push(rating); + } +} + export const file = async (filename: string) => await fs.promises.readFile(`test/datasets/${filename}`, 'utf-8'); diff --git a/test/rate.test.ts b/test/rate.test.ts new file mode 100644 index 00000000..f789d0d1 --- /dev/null +++ b/test/rate.test.ts @@ -0,0 +1,22 @@ +import {rate} from '../src/rate'; +import {Valid} from '../src/validator'; +import {validate} from './validator.test'; + +describe('Rate', () => { + it('rates minimal dataset description', async () => { + const validationResult = (await validate( + 'dataset-dcat-valid-minimal.jsonld' + )) as Valid; + expect(rate(validationResult).worstRating).toBe(25); + expect(rate(validationResult).score).toBe(25); + }); + + it('rates complete dataset description', async () => { + const validationResult = (await validate( + 'dataset-dcat-valid.jsonld' + )) as Valid; + const rating = rate(validationResult); + expect(rating.score).toBe(100); + expect(rating.explanation).toBe(''); + }); +}); diff --git a/test/validator.test.ts b/test/validator.test.ts index cda39189..2122e4d5 100644 --- a/test/validator.test.ts +++ b/test/validator.test.ts @@ -1,26 +1,16 @@ import rdf from 'rdf-ext'; import {JsonLdParser} from 'jsonld-streaming-parser'; import * as fs from 'fs'; -import { - InvalidDataset, - shacl, - ShaclValidator, - Valid, - Validator, -} from '../src/validator'; +import {InvalidDataset, shacl, ShaclValidator, Valid} from '../src/validator'; import {StreamParser} from 'n3'; import {Transform} from 'stream'; import {StandardizeSchemaOrgPrefixToHttps} from '../src/transform'; import {MicrodataRdfParser} from 
'microdata-rdf-streaming-parser/lib/MicrodataRdfParser'; import {RdfaParser} from 'rdfa-streaming-parser/lib/RdfaParser'; -let validator: Validator; +const validator = await ShaclValidator.fromUrl('shacl/register.ttl'); describe('Validator', () => { - beforeAll(async () => { - validator = await ShaclValidator.fromUrl('shacl/register.ttl'); - }); - it('accepts minimal valid Schema.org dataset', async () => { const report = (await validate( 'dataset-schema-org-valid-minimal.jsonld' @@ -177,7 +167,7 @@ describe('Validator', () => { }); }); -const validate = async (filename: string, parser?: Transform) => +export const validate = async (filename: string, parser?: Transform) => validator.validate(await dataset(filename, parser)); const dataset = async (filename: string, parser?: Transform) => {