Skip to content

Commit

Permalink
feat: Rate datasets when crawling
Browse files Browse the repository at this point in the history
* Base ratings on DCAT SHACL
  • Loading branch information
ddeboer committed Nov 5, 2023
1 parent 6d932ff commit 768acba
Show file tree
Hide file tree
Showing 14 changed files with 358 additions and 70 deletions.
8 changes: 4 additions & 4 deletions jest.config.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@ export default {
coverageReporters: ['json-summary', 'text'],
coverageThreshold: {
global: {
lines: 71.01,
statements: 70.97,
branches: 64.56,
functions: 68.75,
lines: 71.7,
statements: 71.73,
branches: 63.3,
functions: 69.72,
},
},
transform: {
Expand Down
67 changes: 62 additions & 5 deletions shacl/register.ttl
Original file line number Diff line number Diff line change
Expand Up @@ -320,15 +320,66 @@ dcat:DatasetShape
sh:property [
sh:minCount 1 ;
sh:path dc:title
], [
],
[
sh:path dc:alternative
], [
sh:path dc:description
], [
],
[
sh:path dc:description ;
sh:minCount 1 ;
sh:severity sh:Info ;
sh:message "Dataset kan een beschrijving hebben"@nl, "Dataset should have a description"@en ;
],
[
sh:minCount 1 ;
sh:maxCount 1 ;
sh:path dc:license
],
[
sh:path dc:created ;
sh:minCount 1 ;
sh:maxCount 1 ;
sh:severity sh:Info ;
sh:message "Dataset kan een aanmaakdatum hebben"@nl, "Dataset should have a creation date"@en ;
],
[
sh:path dc:issued ;
sh:minCount 1 ;
sh:maxCount 1 ;
sh:severity sh:Info ;
sh:message "Dataset kan een uitgavedatum hebben"@nl, "Dataset should have an issued date"@en ;
],
[
sh:path dc:modified ;
sh:minCount 1 ;
sh:maxCount 1 ;
sh:severity sh:Info ;
sh:message "Dataset kan een laatste wijzigingsdatum hebben"@nl, "Dataset should have a last modified date"@en ;
],
[
sh:path dc:keyword ;
sh:minCount 1 ;
sh:severity sh:Info ;
sh:message "Dataset kan trefwoorden hebben"@nl, "Dataset should have keywords"@en ;
],
[
sh:path dc:spatial ;
sh:minCount 1 ;
sh:severity sh:Info ;
sh:message "Dataset kan een gebiedsaanduiding hebben"@nl, "Dataset should have spatial coverage"@en ;
],
[
sh:path dc:temporal ;
sh:minCount 1 ;
sh:severity sh:Info ;
sh:message "Dataset kan een tijdsaanduiding hebben"@nl, "Dataset should have temporal coverage"@en ;
],
[
sh:path dc:language ;
sh:minCount 1 ;
sh:severity sh:Info ;
sh:message "Dataset kan een taal hebben"@nl, "Dataset should have a language"@en ;
],
[
sh:path dc:publisher ;
sh:minCount 1 ;
Expand All @@ -346,6 +397,8 @@ dcat:DatasetShape
],
[
sh:path dc:creator ;
sh:minCount 1 ;
sh:severity sh:Info ;
sh:or (
[
sh:class foaf:Organization ;
Expand All @@ -356,11 +409,15 @@ dcat:DatasetShape
sh:node dcat:PersonShape ;
]
) ;
sh:message "Dataset kan een maker hebben"@nl, "Dataset should have a creator"@en ;
],
[
sh:path dcat:distribution ;
sh:class dcat:Distribution ;
sh:node dcat:DistributionShape
sh:node dcat:DistributionShape ;
sh:minCount 1 ;
sh:severity sh:Info ;
sh:message "Dataset moet een distributie hebben"@nl, "Dataset should have a distribution"@en ;
] ;
sh:targetClass dcat:Dataset ;
.
Expand Down
14 changes: 11 additions & 3 deletions src/crawler.ts
Original file line number Diff line number Diff line change
@@ -1,15 +1,17 @@
import {RegistrationStore} from './registration.js';
import {DatasetStore, extractIris} from './dataset.js';
import {DatasetStore, extractIri, extractIris} from './dataset.js';
import {dereference, fetch, HttpError, NoDatasetFoundAtUrl} from './fetch.js';
import DatasetExt from 'rdf-ext/lib/Dataset';
import Pino from 'pino';
import {Validator} from './validator.js';
import {Valid, Validator} from './validator.js';
import {crawlCounter} from './instrumentation.js';
import {rate, RatingStore} from './rate.js';

export class Crawler {
constructor(
private registrationStore: RegistrationStore,
private datasetStore: DatasetStore,
private ratingStore: RatingStore,
private validator: Validator,
private logger: Pino.Logger
) {}
Expand All @@ -28,10 +30,16 @@ export class Crawler {

try {
const data = await dereference(registration.url);
isValid = (await this.validator.validate(data)).state === 'valid';
const validationResult = await this.validator.validate(data);
isValid = validationResult.state === 'valid';
if (isValid) {
datasets = await fetch(registration.url);
await this.datasetStore.store(datasets);
datasets.map(async dataset => {
const dcatValidationResult = await this.validator.validate(dataset);
const rating = rate(dcatValidationResult as Valid);
this.ratingStore.store(extractIri(dataset), rating);
});
}
} catch (e) {
if (e instanceof HttpError) {
Expand Down
22 changes: 13 additions & 9 deletions src/dataset.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import {Readable, Transform} from 'stream';
import {StreamParser} from 'n3';
import {JsonLdParser} from 'jsonld-streaming-parser';
import {StandardizeSchemaOrgPrefixToHttps} from './transform.js';
import {DatasetCore} from 'rdf-js';

export interface DatasetStore {
/**
Expand All @@ -19,17 +20,20 @@ export interface DatasetStore {
countOrganisations(): Promise<number>;
}

/**
 * Extracts the IRI of the dataset described in an RDF dataset.
 *
 * Looks for quads whose predicate is `rdf:type` and whose object is
 * `datasetType`, and returns the subject of the first such quad.
 *
 * @param dataset The RDF dataset to search.
 * @returns The first matching subject's value, parsed as a URL.
 * @throws Error when the dataset contains no matching quad (previously this
 *   surfaced as an opaque TypeError on `undefined`).
 */
export function extractIri(dataset: DatasetCore): URL {
  // Destructuring pulls just the first match from the iterable instead of
  // copying every matching quad into an intermediate array.
  const [quad] = dataset.match(
    null,
    factory.namedNode('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
    datasetType
  );
  if (quad === undefined) {
    throw new Error(
      'Cannot extract IRI: no subject with the expected rdf:type found in dataset'
    );
  }
  return new URL(quad.subject.value);
}

export function extractIris(datasets: DatasetExt[]): Map<URL, DatasetExt> {
return datasets.reduce((map, dataset) => {
const quad = [
...dataset.match(
null,
factory.namedNode('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
datasetType
),
][0];
const url = new URL(quad.subject.value);
map.set(url, dataset);
map.set(extractIri(dataset), dataset);
return map;
}, new Map<URL, DatasetExt>());
}
Expand Down
124 changes: 93 additions & 31 deletions src/graphdb.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import {
RegistrationStore,
} from './registration.js';
import {DatasetStore, extractIris} from './dataset.js';
import {Rating, RatingStore} from './rate.js';

export type SparqlResult = {
results: {
Expand Down Expand Up @@ -62,22 +63,24 @@ export class GraphDbClient {
this.token = response.headers.get('Authorization')!;
}

public async request(
method: string,
url: string,
body?: string,
accept?: string
): Promise<Response> {
public async request(options: {
method: string;
url?: string;
body?: string;
accept?: string;
contentType?: string;
}): Promise<Response> {
const headers = await this.getHeaders();
headers.set('Content-Type', 'application/x-trig');
if (accept) {
headers.set('Accept', accept);
headers.set('Content-Type', options.contentType ?? 'application/x-trig');
if (options.accept) {
headers.set('Accept', options.accept);
}
const repositoryUrl = this.url + '/repositories/' + this.repository + url;
const repositoryUrl =
this.url + '/repositories/' + this.repository + options.url;
const response = await fetch(repositoryUrl, {
method: method,
method: options.method,
headers: headers,
body: body,
body: options.body,
});
if (
// 409 = `Auth token hash mismatch`, which occurs after GraphDB has restarted.
Expand All @@ -87,29 +90,48 @@ export class GraphDbClient {
) {
this.token = undefined;
// Retry original request.
await this.request(method, url, body);
await this.request(options);
}

if (!response.ok) {
console.error(
'HTTP error ' + response.status + ' for ' + method + ' ' + repositoryUrl
'HTTP error ' +
response.status +
' for ' +
options.method +
' ' +
repositoryUrl
);
}

return response;
}

public async query(query: string): Promise<SparqlResult> {
const response = await this.request(
'GET',
'?' + querystring.stringify({query}),
undefined,
'application/sparql-results+json'
);
const response = await this.request({
method: 'GET',
url: '?' + querystring.stringify({query}),
accept: 'application/sparql-results+json',
});

return (await response.json()) as SparqlResult;
}

/**
 * Sends a SPARQL update to the repository's `/statements` endpoint.
 *
 * The payload is POSTed as the request body with content type
 * `application/sparql-update`. A non-OK response is not thrown: it is only
 * logged to the console, together with the status code and the payload.
 *
 * @param payload The SPARQL update to execute.
 */
public async update(payload: string): Promise<void> {
  const response = await this.request({
    method: 'POST',
    url: '/statements',
    body: payload,
    contentType: 'application/sparql-update',
  });

  if (!response.ok) {
    console.error(
      `${response.status} response for SPARQL update ${payload}`
    );
  }
}

private async getHeaders(): Promise<Headers> {
if (this.username === undefined || this.password === undefined) {
return new Headers();
Expand Down Expand Up @@ -223,15 +245,20 @@ export class GraphDbRegistrationStore implements RegistrationStore {
return new Promise((resolve, reject) => {
getWriter(quads).end(async (error, result) => {
try {
await this.client.request(
'DELETE',
'/statements?' +
await this.client.request({
method: 'DELETE',
url:
'/statements?' +
querystring.stringify({
subj: '<' + registration.url.toString() + '>',
context: '<' + this.registrationsGraph + '>',
})
);
await this.client.request('POST', '/statements', result);
}),
});
await this.client.request({
method: 'POST',
url: '/statements',
body: result,
});
resolve(null);
} catch (e) {
reject(e);
Expand Down Expand Up @@ -346,12 +373,13 @@ export class GraphDbDatasetStore implements DatasetStore {
async (error, result) => {
try {
resolve(
await this.client.request(
'PUT',
'/rdf-graphs/service?graph=' +
await this.client.request({
method: 'PUT',
url:
'/rdf-graphs/service?graph=' +
encodeURIComponent(graphIri.toString()),
result
)
body: result,
})
);
} catch (e) {
reject(e);
Expand All @@ -368,3 +396,37 @@ function getWriter(quads: Quad[]): Writer {

return writer;
}

/**
 * Rating store that persists dataset ratings in a dedicated named graph by
 * sending SPARQL updates through a {@link GraphDbClient}.
 */
export class GraphDbRatingStore implements RatingStore {
  private readonly graph =
    'https://data.netwerkdigitaalerfgoed.nl/registry/ratings';

  constructor(private readonly client: GraphDbClient) {}

  /**
   * Replaces the stored rating of a dataset.
   *
   * Issues a single SPARQL update with two operations on the ratings graph:
   * first any existing `schema:contentRating` node of the dataset (and all of
   * that node's properties) is deleted, then a fresh blank-node rating with
   * the given values is inserted.
   *
   * @param datasetUri IRI of the dataset the rating belongs to.
   * @param rating The rating whose best/worst/score values and explanation
   *   are written.
   */
  async store(datasetUri: URL, rating: Rating): Promise<void> {
    // The explanation ends up inside a double-quoted SPARQL literal, so it
    // must be escaped; otherwise a quote, backslash or line break in it would
    // produce a malformed (or altered) update. String() mirrors the coercion
    // that plain template interpolation would have applied.
    const explanation = GraphDbRatingStore.escapeStringLiteral(
      String(rating.explanation)
    );

    await this.client.update(`
      PREFIX schema: <http://schema.org/>
      WITH <${this.graph}>
      DELETE {
        ?dataset schema:contentRating ?rating .
        ?rating ?p ?o .
      }
      WHERE {
        BIND(<${datasetUri}> as ?dataset)
        ?dataset schema:contentRating ?rating .
        ?rating ?p ?o .
      };
      WITH <${this.graph}>
      INSERT {
        <${datasetUri}> schema:contentRating [
          schema:bestRating ${rating.bestRating} ;
          schema:worstRating ${rating.worstRating} ;
          schema:ratingValue ${rating.score} ;
          schema:ratingExplanation "${explanation}" ;
        ]
      }
      WHERE {}
    `);
  }

  /**
   * Escapes a value for use between double quotes in a SPARQL string literal.
   * The backslash is handled first so the escapes added afterwards are not
   * escaped a second time.
   */
  private static escapeStringLiteral(value: string): string {
    return value
      .replace(/\\/g, '\\\\')
      .replace(/"/g, '\\"')
      .replace(/\n/g, '\\n')
      .replace(/\r/g, '\\r')
      .replace(/\t/g, '\\t');
  }
}
3 changes: 3 additions & 0 deletions src/main.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import {
GraphDbAllowedRegistrationDomainStore,
GraphDbClient,
GraphDbDatasetStore,
GraphDbRatingStore,
GraphDbRegistrationStore,
} from './graphdb.js';
import {readUrl, ShaclValidator} from './validator.js';
Expand All @@ -26,6 +27,7 @@ const client = new GraphDbClient(
const logger = Pino();
const datasetStore = new GraphDbDatasetStore(client);
const registrationStore = new GraphDbRegistrationStore(client);
const ratingStore = new GraphDbRatingStore(client);
const allowedRegistrationDomainStore =
new GraphDbAllowedRegistrationDomainStore(client);
await startInstrumentation(datasetStore);
Expand All @@ -34,6 +36,7 @@ const client = new GraphDbClient(
const crawler = new Crawler(
registrationStore,
datasetStore,
ratingStore,
validator,
logger
);
Expand Down
Loading

0 comments on commit 768acba

Please sign in to comment.