diff --git a/ckanext/dcat/profiles/__init__.py b/ckanext/dcat/profiles/__init__.py index 6d30a244..668de499 100644 --- a/ckanext/dcat/profiles/__init__.py +++ b/ckanext/dcat/profiles/__init__.py @@ -25,4 +25,5 @@ from .euro_dcat_ap_3 import EuropeanDCATAP3Profile from .dcat_us_3 import DCATUS3Profile from .euro_dcat_ap_scheming import EuropeanDCATAPSchemingProfile +from .euro_health_dcat_ap import EuropeanHealthDCATAPProfile from .schemaorg import SchemaOrgProfile diff --git a/ckanext/dcat/profiles/base.py b/ckanext/dcat/profiles/base.py index 30b989e7..a93eeb5c 100644 --- a/ckanext/dcat/profiles/base.py +++ b/ckanext/dcat/profiles/base.py @@ -2,16 +2,16 @@ import json from urllib.parse import quote +from ckan.lib.helpers import resource_formats +from ckan.model.license import LicenseRegister +from ckantoolkit import ObjectNotFound, asbool, aslist, config, get_action, url_for from dateutil.parser import parse as parse_date -from rdflib import term, URIRef, BNode, Literal -from rdflib.namespace import Namespace, RDF, XSD, SKOS, RDFS, ORG -from geomet import wkt, InvalidGeoJSONException +from geomet import InvalidGeoJSONException, wkt +from rdflib import BNode, Literal, URIRef, term +from rdflib.namespace import ORG, RDF, RDFS, SKOS, XSD, Namespace -from ckantoolkit import config, url_for, asbool, aslist, get_action, ObjectNotFound -from ckan.model.license import LicenseRegister -from ckan.lib.helpers import resource_formats from ckanext.dcat.utils import DCAT_EXPOSE_SUBCATALOGS -from ckanext.dcat.validators import is_year, is_year_month, is_date +from ckanext.dcat.validators import is_date, is_year, is_year_month CNT = Namespace("http://www.w3.org/2011/content#") DCT = Namespace("http://purl.org/dc/terms/") diff --git a/ckanext/dcat/profiles/euro_dcat_ap_scheming.py b/ckanext/dcat/profiles/euro_dcat_ap_scheming.py index 3a2742a1..bea68935 100644 --- a/ckanext/dcat/profiles/euro_dcat_ap_scheming.py +++ b/ckanext/dcat/profiles/euro_dcat_ap_scheming.py @@ -1,6 +1,6 @@ 
import json -from rdflib import URIRef, BNode, Literal +from rdflib import URIRef, BNode, Literal, term from .base import RDFProfile, CleanedURIRef, URIRefOrLiteral from .base import ( RDF, @@ -10,6 +10,7 @@ FOAF, SKOS, LOCN, + RDFS, ) @@ -118,6 +119,11 @@ def _parse_list_value(data_dict, field_name): if agents: dataset_dict[key] = agents + # Add any qualifiedRelations + qual_relations = self._relationship_details(dataset_ref, DCAT.qualifiedRelation) + if qual_relations: + dataset_dict["qualified_relation"] = qual_relations + # Repeating subfields: resources for schema_field in self._dataset_schema["resource_fields"]: if "repeating_subfields" in schema_field: @@ -227,6 +233,10 @@ def _graph_from_dataset_v2_scheming(self, dataset_dict, dataset_ref): spatial_ref, field[1], item[field[0]] ) + self._add_relationship( + dataset_ref, dataset_dict, "qualified_relation", DCAT.qualifiedRelation + ) + resources = dataset_dict.get("resources", []) for resource in resources: if resource.get("access_services"): @@ -292,6 +302,80 @@ def _add_agents( _type=URIRefOrLiteral, ) + def _relationship_details(self, subject, predicate): + """ + Returns a list of dicts with details about a dcat:Relationship property, e.g. + dcat:qualifiedRelation + + Both subject and predicate must be rdflib URIRef or BNode objects + + Returns keys for uri, role, and relation with the values set to + an empty string if they could not be found. + """ + + relations = [] + for relation in self.g.objects(subject, predicate): + relation_details = {} + relation_details["uri"] = ( + str(relation) if isinstance(relation, term.URIRef) else "" + ) + relation_details["role"] = self._object_value(relation, DCAT.hadRole) + relation_details["relation"] = self._object_value(relation, DCT.relation) + relations.append(relation_details) + + return relations + + def _add_relationship( + self, + dataset_ref, + dataset_dict, + relation_key, + rdf_predicate, + ): + """ + Adds one or more Relationships to the RDF graph. 
+ + :param dataset_ref: The RDF reference of the dataset + :param dataset_dict: The dataset dictionary containing relationship information + :param relation_key: field name in the CKAN dict (e.g. "qualified_relation") + :param rdf_predicate: The RDF predicate (DCAT.qualifiedRelation) + """ + relation = dataset_dict.get(relation_key) + if ( + isinstance(relation, list) + and len(relation) + and self._not_empty_dict(relation[0]) + ): + relations = relation + + for relation in relations: + + agent_uri = relation.get("uri") + if agent_uri: + agent_ref = CleanedURIRef(agent_uri) + else: + agent_ref = BNode() + + self.g.add((agent_ref, RDF.type, DCAT.Relationship)) + self.g.add((dataset_ref, rdf_predicate, agent_ref)) + + self._add_triple_from_dict( + relation, + agent_ref, + DCT.relation, + "relation", + _type=URIRefOrLiteral, + _class=RDFS.Resource, + ) + self._add_triple_from_dict( + relation, + agent_ref, + DCAT.hadRole, + "role", + _type=URIRefOrLiteral, + _class=DCAT.Role, + ) + @staticmethod def _not_empty_dict(data_dict): return any(data_dict.values()) diff --git a/ckanext/dcat/profiles/euro_health_dcat_ap.py b/ckanext/dcat/profiles/euro_health_dcat_ap.py new file mode 100644 index 00000000..7e3702f1 --- /dev/null +++ b/ckanext/dcat/profiles/euro_health_dcat_ap.py @@ -0,0 +1,152 @@ +from rdflib import XSD, Literal, URIRef +from rdflib.namespace import Namespace + +from ckanext.dcat.profiles.base import URIRefOrLiteral +from ckanext.dcat.profiles.euro_dcat_ap_3 import EuropeanDCATAP3Profile + +# HealthDCAT-AP namespace. 
Note: not finalized yet +HEALTHDCATAP = Namespace("http://healthdataportal.eu/ns/health#") + +# Data Privacy Vocabulary namespace +DPV = Namespace("https://w3id.org/dpv#") + +namespaces = { + "healthdcatap": HEALTHDCATAP, + "dpv": DPV, +} + + +class EuropeanHealthDCATAPProfile(EuropeanDCATAP3Profile): + """ + A profile implementing HealthDCAT-AP, a health-related extension of the DCAT + application profile for sharing information about Catalogues containing Datasets + and Data Services descriptions in Europe. + """ + + def parse_dataset(self, dataset_dict, dataset_ref): + # Call super method for DCAT-AP 3 properties + dataset_dict = super(EuropeanHealthDCATAPProfile, self).parse_dataset( + dataset_dict, dataset_ref + ) + + dataset_dict = self._parse_health_fields(dataset_dict, dataset_ref) + + return dataset_dict + + def _parse_health_fields(self, dataset_dict, dataset_ref): + self.__parse_healthdcat_stringvalues(dataset_dict, dataset_ref) + + self.__parse_healthdcat_intvalues(dataset_dict, dataset_ref) + + # Add the HDAB. 
There should only ever be one but you never know + agents = self._agents_details(dataset_ref, HEALTHDCATAP.hdab) + if agents: + dataset_dict["hdab"] = agents + + # Retention period + retention_start, retention_end = self._time_interval( + dataset_ref, HEALTHDCATAP.retentionPeriod, dcat_ap_version=2 + ) + retention_dict = {} + if retention_start is not None: + retention_dict["start"] = retention_start + if retention_end is not None: + retention_dict["end"] = retention_end + if retention_dict: + dataset_dict["retention_period"] = [retention_dict] + + return dataset_dict + + def __parse_healthdcat_intvalues(self, dataset_dict, dataset_ref): + for key, predicate in ( + ("min_typical_age", HEALTHDCATAP.minTypicalAge), + ("max_typical_age", HEALTHDCATAP.maxTypicalAge), + ("number_of_records", HEALTHDCATAP.numberOfRecords), + ("number_of_unique_individuals", HEALTHDCATAP.numberOfUniqueIndividuals), + ): + value = self._object_value_int(dataset_ref, predicate) + # A zero value evaluates as False but is definitely not a None + if value is not None: + dataset_dict[key] = value + + def __parse_healthdcat_stringvalues(self, dataset_dict, dataset_ref): + for (key, predicate,) in ( + ("analytics", HEALTHDCATAP.analytics), + ("code_values", HEALTHDCATAP.hasCodeValues), + ("coding_system", HEALTHDCATAP.hasCodingSystem), + ("health_category", HEALTHDCATAP.healthCategory), + ("health_theme", HEALTHDCATAP.healthTheme), + ("legal_basis", DPV.hasLegalBasis), + ("personal_data", DPV.hasPersonalData), + ("population_coverage", HEALTHDCATAP.populationCoverage), + ("publisher_note", HEALTHDCATAP.publisherNote), + ("publisher_type", HEALTHDCATAP.publisherType), + ("purpose", DPV.hasPurpose), + ): + values = self._object_value_list(dataset_ref, predicate) + if values: + dataset_dict[key] = values + + def graph_from_dataset(self, dataset_dict, dataset_ref): + super().graph_from_dataset(dataset_dict, dataset_ref) + for prefix, namespace in namespaces.items(): + self.g.bind(prefix, namespace) + 
+ # key, predicate, fallbacks, _type, _class + items = [ + ("analytics", HEALTHDCATAP.analytics, None, URIRefOrLiteral), + ("code_values", HEALTHDCATAP.hasCodeValues, None, URIRefOrLiteral), + ("coding_system", HEALTHDCATAP.hasCodingSystem, None, URIRefOrLiteral), + ("health_category", HEALTHDCATAP.healthCategory, None, URIRefOrLiteral), + ("health_theme", HEALTHDCATAP.healthTheme, None, URIRefOrLiteral), + ("legal_basis", DPV.hasLegalBasis, None, URIRefOrLiteral), + ( + "population_coverage", + HEALTHDCATAP.populationCoverage, + None, + URIRefOrLiteral, + ), + ("personal_data", DPV.hasPersonalData, None, URIRef), + ("publisher_note", HEALTHDCATAP.publisherNote, None, URIRefOrLiteral), + ("publisher_type", HEALTHDCATAP.publisherType, None, URIRefOrLiteral), + ("purpose", DPV.hasPurpose, None, URIRefOrLiteral), + ] + self._add_list_triples_from_dict(dataset_dict, dataset_ref, items) + + items = [ + ("min_typical_age", HEALTHDCATAP.minTypicalAge), + ("max_typical_age", HEALTHDCATAP.maxTypicalAge), + ("number_of_records", HEALTHDCATAP.numberOfRecords), + ("number_of_unique_individuals", HEALTHDCATAP.numberOfUniqueIndividuals), + ] + for key, predicate in items: + self._add_nonneg_integer_triple(dataset_dict, dataset_ref, key, predicate) + + self._add_agents(dataset_ref, dataset_dict, "hdab", HEALTHDCATAP.hdab) + + def _add_nonneg_integer_triple(self, dataset_dict, dataset_ref, key, predicate): + """ + Adds non-negative integers to the Dataset graph (xsd:nonNegativeInteger) + + Values that are negative or not integer-like are added as Literal(value), + i.e. without the xsd:nonNegativeInteger datatype. Missing or empty values + add nothing; a value of 0 is kept. + + dataset_dict: CKAN dataset dict the value is looked up in + dataset_ref: subject of Graph + key: scheming key in CKAN + predicate: predicate to use + """ + value = self._get_dict_value(dataset_dict, key) + + # 0 is falsy but a valid xsd:nonNegativeInteger; only skip missing/empty values + if value or value == 0: + try: + if int(value) < 0: + raise ValueError("Not a non-negative integer") + self.g.add( + ( + dataset_ref, + predicate, + Literal(int(value), datatype=XSD.nonNegativeInteger), + ) + ) + except (ValueError, TypeError): + self.g.add((dataset_ref, predicate, Literal(value))) + + def graph_from_catalog(self, catalog_dict, catalog_ref): + 
super().graph_from_catalog(catalog_dict, catalog_ref) diff --git a/ckanext/dcat/schemas/dcat_ap_full.yaml b/ckanext/dcat/schemas/dcat_ap_full.yaml index aee4fffd..0dcd1b49 100644 --- a/ckanext/dcat/schemas/dcat_ap_full.yaml +++ b/ckanext/dcat/schemas/dcat_ap_full.yaml @@ -268,6 +268,23 @@ dataset_fields: help_inline: true help_text: This property refers to a related Dataset that is a version, edition, or adaptation of the described Dataset. +- field_name: qualified_relation + label: Qualified relation + repeating_label: Relationship + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: relation + label: Relation + help_text: The resource related to the source resource. + + - field_name: role + label: Role + help_text: The function of an entity or agent with respect to another entity or resource. + help_text: A description of a relationship with another resource. + #- field_name: hvd_category # label: HVD Category # preset: multiple_text diff --git a/ckanext/dcat/schemas/dcat_us_full.yaml b/ckanext/dcat/schemas/dcat_us_full.yaml index 6f55903f..31478bb9 100644 --- a/ckanext/dcat/schemas/dcat_us_full.yaml +++ b/ckanext/dcat/schemas/dcat_us_full.yaml @@ -331,6 +331,23 @@ dataset_fields: - field_name: license label: License +- field_name: qualified_relation + label: Qualified relation + repeating_label: Relationship + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: relation + label: Relation + help_text: The resource related to the source resource. + + - field_name: role + label: Role + help_text: The function of an entity or agent with respect to another entity or resource. + help_text: A description of a relationship with another resource. 
+ # Note: if not provided, this will be autogenerated - field_name: uri label: URI diff --git a/ckanext/dcat/schemas/health_dcat_ap.yaml b/ckanext/dcat/schemas/health_dcat_ap.yaml new file mode 100644 index 00000000..6245756d --- /dev/null +++ b/ckanext/dcat/schemas/health_dcat_ap.yaml @@ -0,0 +1,598 @@ +scheming_version: 2 +dataset_type: dataset +about: Schema for HealthDCAT-AP +about_url: http://github.com/ckan/ckanext-dcat + +dataset_fields: + +- field_name: title + label: Title + preset: title + required: true + help_text: A descriptive title for the dataset. + +- field_name: name + label: URL + preset: dataset_slug + form_placeholder: eg. my-dataset + +- field_name: notes + label: Description + required: true + form_snippet: markdown.html + help_text: A free-text account of the dataset. + +- field_name: tag_string + label: Keywords + preset: tag_string_autocomplete + form_placeholder: eg. economy, mental health, government + help_text: Keywords or tags describing the dataset. Use commas to separate multiple values. + +- field_name: contact + label: Contact points + repeating_label: Contact point + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: name + label: Name + + - field_name: email + label: Email + display_snippet: email.html + + - field_name: identifier + label: Identifier + help_text: Unique identifier for the contact point. Such as a ROR ID. + + help_text: Contact information for enquiries about the dataset. + +- field_name: publisher + label: Publisher + repeating_label: Publisher + repeating_once: true + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: name + label: Name + + - field_name: email + label: Email + display_snippet: email.html + + - field_name: url + label: URL + display_snippet: link.html + + - field_name: type + label: Type + + - field_name: identifier + label: Identifier + help_text: Unique identifier for the publisher, such as a ROR ID. 
+ help_text: Entity responsible for making the dataset available. + +- field_name: creator + label: Creator + repeating_label: Creator + repeating_once: true + repeating_subfields: + + - field_name: uri + label: URI + help_text: URI of the creator, if available. + + - field_name: name + label: Name + help_text: Name of the entity or person who created the dataset. + + - field_name: email + label: Email + display_snippet: email.html + help_text: Contact email of the creator. + + - field_name: url + label: URL + display_snippet: link.html + help_text: URL for more information about the creator. + + - field_name: type + label: Type + help_text: Type of creator (e.g., Organization, Person). + + - field_name: identifier + label: Identifier + help_text: Unique identifier for the creator, such as an ORCID or ROR ID. + +- field_name: license_id + label: License + form_snippet: license.html + help_text: License definitions and additional information can be found at http://opendefinition.org/. + +- field_name: owner_org + label: Organization + preset: dataset_organization + help_text: The CKAN organization the dataset belongs to. + +- field_name: url + label: Landing page + form_placeholder: http://example.com/dataset.json + display_snippet: link.html + help_text: Web page that can be navigated to gain access to the dataset, its distributions and/or additional information. + + # Note: this will fall back to metadata_created if not present +- field_name: issued + label: Release date + preset: dcat_date + help_text: Date of publication of the dataset. + + # Note: this will fall back to metadata_modified if not present +- field_name: modified + label: Modification date + preset: dcat_date + help_text: Most recent date on which the dataset was changed, updated or modified. + +- field_name: version + label: Version + validators: ignore_missing unicode_safe package_version_validator + help_text: Version number or other version designation of the dataset. 
+ +- field_name: version_notes + label: Version notes + validators: ignore_missing unicode_safe + form_snippet: markdown.html + display_snippet: markdown.html + help_text: A description of the differences between this version and a previous version of the dataset. + + # Note: CKAN will generate a unique identifier for each dataset +- field_name: identifier + label: Identifier + help_text: A unique identifier of the dataset. + +- field_name: frequency + label: Frequency + help_text: The frequency at which dataset is published. + +- field_name: provenance + label: Provenance + form_snippet: markdown.html + display_snippet: markdown.html + help_text: A statement about the lineage of the dataset. + +- field_name: dcat_type + label: Type + help_text: The type of the dataset. + # TODO: controlled vocabulary? + +- field_name: temporal_coverage + label: Temporal coverage + repeating_subfields: + + - field_name: start + label: Start + preset: dcat_date + + - field_name: end + label: End + preset: dcat_date + help_text: The temporal period or periods the dataset covers. + +- field_name: temporal_resolution + label: Temporal resolution + help_text: Minimum time period resolvable in the dataset. + +- field_name: spatial_coverage + label: Spatial coverage + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: text + label: Label + + - field_name: geom + label: Geometry + + - field_name: bbox + label: Bounding Box + + - field_name: centroid + label: Centroid + help_text: A geographic region that is covered by the dataset. + +- field_name: spatial_resolution_in_meters + label: Spatial resolution in meters + help_text: Minimum spatial separation resolvable in a dataset, measured in meters. + +- field_name: access_rights + label: Access rights + validators: ignore_missing unicode_safe + help_text: Information that indicates whether the dataset is Open Data, has access restrictions or is not public. 
+ +- field_name: alternate_identifier + label: Other identifier + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: This property refers to a secondary identifier of the dataset, such as MAST/ADS, DataCite, DOI, etc. + +- field_name: theme + label: Theme + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: A category of the dataset. A Dataset may be associated with multiple themes. + +- field_name: language + label: Language + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: Language or languages of the dataset. + # TODO: language form snippet / validator / graph + +- field_name: documentation + label: Documentation + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: A page or document about this dataset. + +- field_name: conforms_to + label: Conforms to + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: An implementing rule or other specification that the dataset follows. + +- field_name: is_referenced_by + label: Is referenced by + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: A related resource, such as a publication, that references, cites, or otherwise points to the dataset. + +- field_name: analytics + label: Analytics + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: > + An analytics distribution of the dataset. + Publishers are encouraged to provide URLs pointing to API endpoints or document + repositories where users can access or request associated resources such as + technical reports of the dataset, quality measurements, usability indicators,... + or analytics services. 
+ +- field_name: applicable_legislation + label: Applicable legislation + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: The legislation that mandates the creation or management of the dataset. + +- field_name: code_values + label: Code values + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: Health classifications and their codes associated with the dataset. + +- field_name: coding_system + label: Coding system + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: > + Coding systems in use (e.g. ICD-10-CM, DGRs, SNOMED CT, ...). + To comply with HealthDCAT-AP, Wikidata URIs MUST be used. + +- field_name: purpose + label: Purpose + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: A free text statement of the purpose of the processing of data or personal data. + +- field_name: health_category + label: Health category + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: > + The health category to which this dataset belongs as described in the Commission Regulation on + the European Health Data Space laying down a list of categories of electronic data for + secondary use, Art.33. + +- field_name: health_theme + label: Health theme + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: > + A category of the Dataset or tag describing the Dataset. + +- field_name: legal_basis + label: Legal basis + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: The legal basis used to justify processing of personal data. + +- field_name: min_typical_age + label: Minimum typical age + validators: ignore_missing int_validator + form_snippet: number.html + help_text: Minimum typical age of the population within the dataset. 
+ +- field_name: max_typical_age + label: Maximum typical age + validators: ignore_missing int_validator + form_snippet: number.html + help_text: Maximum typical age of the population within the dataset. + +- field_name: number_of_records + label: Number of records + validators: ignore_missing int_validator + form_snippet: number.html + help_text: Size of the dataset in terms of the number of records. + +- field_name: number_of_unique_individuals + label: Number of unique individuals + validators: ignore_missing int_validator + form_snippet: number.html + help_text: Number of records for unique individuals. + +- field_name: personal_data + label: Personal data + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: Key elements that represent an individual in the dataset. + +- field_name: publisher_note + label: Publisher note + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: > + A description of the publisher activities. + +- field_name: publisher_type + label: Publisher type + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: > + A type of organisation that makes the Dataset available. + +- field_name: population_coverage + label: Population coverage + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: > + A definition of the population within the dataset. + +- field_name: retention_period + label: Retention period + repeating_subfields: + + - field_name: start + label: Start + preset: dcat_date + + - field_name: end + label: End + preset: dcat_date + + help_text: A temporal period during which the dataset is available for secondary use. 
+ + +# Officially there can only be one HDAB for now, but keep it repeating subfield just in case +- field_name: hdab + label: Health data access body + repeating_label: Health data access body + repeating_once: true + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: name + label: Name + + - field_name: email + label: Email + display_snippet: email.html + + - field_name: url + label: URL + display_snippet: link.html + + - field_name: type + label: Type + + - field_name: identifier + label: Identifier + help_text: Unique identifier for the HDAB, such as a ROR ID. + help_text: Health Data Access Body supporting access to data in the Member State. + +- field_name: qualified_relation + label: Qualified relation + repeating_label: Relationship + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: relation + label: Relation + help_text: The resource related to the source resource. + + - field_name: role + label: Role + help_text: The function of an entity or agent with respect to another entity or resource. + help_text: A description of a relationship with another resource. + +# Note: if not provided, this will be autogenerated +- field_name: uri + label: URI + help_text: An URI for this dataset (if not provided it will be autogenerated). + +# TODO: relation-based properties are not yet included (e.g. is_version_of, source, sample, etc) +# +resource_fields: + +- field_name: url + label: URL + preset: resource_url_upload + +- field_name: name + label: Name + form_placeholder: + help_text: A descriptive title for the resource. + +- field_name: description + label: Description + form_snippet: markdown.html + help_text: A free-text account of the resource. + +- field_name: format + label: Format + preset: resource_format_autocomplete + help_text: File format. If not provided it will be guessed. 
+ +- field_name: mimetype + label: Media type + validators: if_empty_guess_format ignore_missing unicode_safe + help_text: Media type for this format. If not provided it will be guessed. + +- field_name: compress_format + label: Compress format + help_text: The format of the file in which the data is contained in a compressed form. + +- field_name: package_format + label: Package format + help_text: The format of the file in which one or more data files are grouped together. + +- field_name: size + label: Size + validators: ignore_missing int_validator + form_snippet: number.html + display_snippet: file_size.html + help_text: File size in bytes. + +- field_name: hash + label: Hash + help_text: Checksum of the downloaded file. + +- field_name: hash_algorithm + label: Hash Algorithm + help_text: Algorithm used to calculate the checksum. + +- field_name: rights + label: Rights + form_snippet: markdown.html + display_snippet: markdown.html + help_text: Some statement about the rights associated with the resource. + +- field_name: availability + label: Availability + help_text: Indicates how long it is planned to keep the resource available. + +- field_name: status + label: Status + preset: select + choices: + - value: http://purl.org/adms/status/Completed + label: Completed + - value: http://purl.org/adms/status/UnderDevelopment + label: Under Development + - value: http://purl.org/adms/status/Deprecated + label: Deprecated + - value: http://purl.org/adms/status/Withdrawn + label: Withdrawn + help_text: The status of the resource in the context of maturity lifecycle. + +- field_name: license + label: License + help_text: License under which the resource is made available. If not provided it will be inherited from the dataset. 
+ +- field_name: has_version + label: Has version + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_inline: true + help_text: This property refers to a related Dataset that is a version, edition, or adaptation of the described Dataset. + + # Note: this falls back to the standard resource url field +- field_name: access_url + label: Access URL + help_text: URL that gives access to the dataset (defaults to the standard resource URL). + + # Note: this falls back to the standard resource url field +- field_name: download_url + label: Download URL + display_snippet: link.html + help_text: URL that provides a direct link to a downloadable file (defaults to the standard resource URL). + +- field_name: issued + label: Release date + preset: dcat_date + help_text: Date of publication of the resource. + +- field_name: modified + label: Modification date + preset: dcat_date + help_text: Most recent date on which the resource was changed, updated or modified. + +- field_name: temporal_resolution + label: Temporal resolution + help_text: Minimum time period resolvable in the distribution. + +- field_name: spatial_resolution_in_meters + label: Spatial resolution in meters + help_text: Minimum spatial separation resolvable in the distribution, measured in meters. + +- field_name: language + label: Language + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: Language or languages of the resource. + +- field_name: documentation + label: Documentation + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: A page or document about this resource. + +- field_name: conforms_to + label: Conforms to + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: An established schema to which the described resource conforms. 
+ +- field_name: applicable_legislation + label: Applicable legislation + preset: multiple_text + validators: ignore_missing scheming_multiple_text + help_text: The legislation that mandates the creation or management of the resource. + +- field_name: access_services + label: Access services + repeating_label: Access service + repeating_subfields: + + - field_name: uri + label: URI + + - field_name: title + label: Title + + - field_name: endpoint_description + label: Endpoint description + + - field_name: endpoint_url + label: Endpoint URL + preset: multiple_text + + - field_name: serves_dataset + label: Serves dataset + preset: multiple_text + validators: ignore_missing scheming_multiple_text + + help_text: A data service that gives access to the resource. + + # Note: if not provided, this will be autogenerated +- field_name: uri + label: URI + help_text: An URI for this resource (if not provided it will be autogenerated). diff --git a/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_parse.py b/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_parse.py index fdda473f..1bce901c 100644 --- a/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_parse.py +++ b/ckanext/dcat/tests/profiles/dcat_ap_2/test_euro_dcatap_2_profile_parse.py @@ -555,7 +555,6 @@ def test_dataset_distribution_access_service_list_values_only(self): # List endpoint_url_list = access_service.get('endpoint_url') - print(access_service) assert len(endpoint_url_list) == 1 assert 'http://publications.europa.eu/webapi/rdf/sparql' in endpoint_url_list diff --git a/ckanext/dcat/tests/profiles/dcat_ap_2/test_scheming_support.py b/ckanext/dcat/tests/profiles/dcat_ap_2/test_scheming_support.py index 27c6e770..ea343c10 100644 --- a/ckanext/dcat/tests/profiles/dcat_ap_2/test_scheming_support.py +++ b/ckanext/dcat/tests/profiles/dcat_ap_2/test_scheming_support.py @@ -265,6 +265,17 @@ def test_e2e_ckan_to_dcat(self): wkt_geom = 
wkt.dumps(dataset["spatial_coverage"][0]["geom"], decimals=4) assert self._triple(g, spatial[0][2], LOCN.Geometry, wkt_geom, GSP.wktLiteral) + # Test qualified relation + relation = [t for t in g.triples((dataset_ref, DCAT.qualifiedRelation, None))] + assert len(relation) == 1 + relation_items = [ + (DCT.relation, URIRef(dataset_dict["qualified_relation"][0]["relation"])), + (DCAT.hadRole, URIRef(dataset_dict["qualified_relation"][0]["role"])), + ] + for predicate, value in relation_items: + assert self._triple( + g, relation[0][2], predicate, value + ), f"relation Predicate {predicate} does not have value {value}" # Statements for item in [ ("access_rights", DCT.accessRights), @@ -747,6 +758,17 @@ def test_e2e_dcat_to_ckan(self): ) assert dataset["spatial_coverage"][0]["geom"] + assert len(dataset["qualified_relation"]) == 1 + assert ( + dataset["qualified_relation"][0]["relation"] + == "http://example.com/dataset/3.141592" + ) + assert ( + dataset["qualified_relation"][0]["role"] + == "http://www.iana.org/assignments/relation/related" + ) + + resource = dataset["resources"][0] # Resources: core fields diff --git a/ckanext/dcat/tests/profiles/health_dcat_ap/__init__.py b/ckanext/dcat/tests/profiles/health_dcat_ap/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py new file mode 100644 index 00000000..7abcacb4 --- /dev/null +++ b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_parse.py @@ -0,0 +1,179 @@ +# test +import json +import logging +from pprint import pprint + +import pytest +from ckan.tests.helpers import call_action + +from ckanext.dcat.processors import RDFParser +from ckanext.dcat.tests.utils import BaseParseTest + +log = logging.getLogger(__name__) + + +@pytest.mark.usefixtures("with_plugins", "clean_db") 
+@pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets") +@pytest.mark.ckan_config( + "scheming.dataset_schemas", "ckanext.dcat.schemas:health_dcat_ap.yaml" +) +@pytest.mark.ckan_config("ckanext.dcat.rdf.profiles", "euro_health_dcat_ap") +class TestSchemingParseSupport(BaseParseTest): + def test_e2e_dcat_to_ckan(self): + """ + Parse a DCAT RDF graph into a CKAN dataset dict, create a dataset with + package_create and check that all expected fields are there + """ + + contents = self._get_file_contents("dcat/dataset_health.ttl") + + p = RDFParser() + + p.parse(contents, _format="turtle") + + datasets = [d for d in p.datasets()] + + assert len(datasets) == 1 + + dataset_dict = datasets[0] + + dataset_dict["name"] = "test-dcat-1" + dataset = call_action("package_create", **dataset_dict) + + # Core fields + + assert dataset["title"] == "HealthDCAT-AP test dataset" + assert ( + dataset["notes"] + == "This dataset is an example of using HealthDCAT-AP in CKAN" + ) + + assert sorted([t["name"] for t in dataset["tags"]]) == [ + "Test 1", + "Test 2", + "Test 3", + ] + + # Standard fields + assert dataset["version_notes"] == "Dataset continuously updated" + assert dataset["identifier"] == "http://example.com/dataset/1234567890" + assert ( + dataset["frequency"] + == "http://publications.europa.eu/resource/authority/frequency/DAILY" + ) + assert ( + dataset["access_rights"] + == "http://publications.europa.eu/resource/authority/access-right/NON_PUBLIC" + ) + assert ( + dataset["provenance"] + == "This example dataset is partly sourced from TEHDAS2" + ) + + # Hard to map (example uses a blank node which doesn't work well in CKAN) + # assert dataset["dcat_type"] == "test-type" + + assert dataset["issued"] == "2024-01-01T00:00:00+00:00" + assert dataset["modified"] == "2024-12-31T23:59:59+00:00" + assert dataset["temporal_resolution"] == "P1D" + + assert dataset["analytics"] == ["http://example.com/analytics"] + assert sorted(dataset["code_values"]) == [ + 
"http://example.com/code1", + "http://example.com/code2", + ] + assert sorted(dataset["coding_system"]) == [ + "http://www.wikidata.org/entity/P1690", + "http://www.wikidata.org/entity/P4229", + ] + + assert dataset["spatial_coverage"] == [ + { + "uri": "http://publications.europa.eu/resource/authority/country/BEL", + } + ] + + # List fields + assert sorted(dataset["conforms_to"]) == [ + "http://www.wikidata.org/entity/Q19597236" + ] + assert sorted(dataset["language"]) == [ + "http://publications.europa.eu/resource/authority/language/ENG", + "http://publications.europa.eu/resource/authority/language/FRA", + "http://publications.europa.eu/resource/authority/language/NLD", + ] + assert sorted(dataset["theme"]) == [ + "http://publications.europa.eu/resource/authority/data-theme/HEAL" + ] + + assert sorted(dataset["is_referenced_by"]) == [ + "https://doi.org/10.1038/sdata.2016.18", + "https://dx.doi.org/10.1002/jmri.28679", + ] + assert sorted(dataset["applicable_legislation"]) == [ + "http://data.europa.eu/eli/reg/2022/868/oj", + ] + + # Repeating subfields + assert dataset["contact"][0]["name"] == "Contact Point" + assert dataset["contact"][0]["email"] == "contact@example.com" + + assert dataset["publisher"][0]["name"] == "Contact Point" + assert dataset["publisher"][0]["email"] == "info@example.com" + assert dataset["publisher"][0]["url"] == "https://healthdata.nl" + + assert len(dataset["qualified_relation"]) == 1 + assert ( + dataset["qualified_relation"][0]["relation"] + == "http://example.com/dataset/3.141592" + ) + assert ( + dataset["qualified_relation"][0]["role"] + == "http://www.iana.org/assignments/relation/related" + ) + + assert dataset["temporal_coverage"][0]["start"] == "2020-03-01" + assert dataset["temporal_coverage"][0]["end"] == "2024-12-31" + + ## HealthDCAT specific + assert sorted(dataset["health_theme"]) == [ + "http://www.wikidata.org/entity/Q58624061", + "http://www.wikidata.org/entity/Q7907952", + ] + + assert dataset["legal_basis"] == 
["https://w3id.org/dpv#Consent"] + + assert dataset["hdab"][0]["name"] == "EU Health Data Access Body" + assert dataset["hdab"][0]["email"] == "hdab@example.com" + assert dataset["hdab"][0]["url"] == "https://www.example.com/hdab" + + # CKAN converts these to strings, but also converts back to decimal/nonneg int + assert dataset["min_typical_age"] == "0" + assert dataset["max_typical_age"] == "110" + assert dataset["number_of_records"] == "123456789" + assert dataset["number_of_unique_individuals"] == "7654321" + + assert sorted(dataset["personal_data"]) == [ + "https://w3id.org/dpv/dpv-pd#Age", + "https://w3id.org/dpv/dpv-pd#Gender", + "https://w3id.org/dpv/dpv-pd#HealthRecord", + ] + + assert dataset["population_coverage"] == [ + "This example includes a very non-descript population" + ] + assert dataset["publisher_note"] == [ + "Health-RI is the Dutch health care initiative to build an integrated health data infrastructure for research and innovation." + ] + assert dataset["publisher_type"] == [ + "http://example.com/publisherType/undefined" + ] + + assert dataset["purpose"] == ["https://w3id.org/dpv#AcademicResearch"] + + assert dataset["retention_period"] == [ + { + "start": "2020-03-01", + "end": "2034-12-31", + } + ] diff --git a/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py new file mode 100644 index 00000000..0bfade6e --- /dev/null +++ b/ckanext/dcat/tests/profiles/health_dcat_ap/test_euro_health_dcat_ap_profile_serialize.py @@ -0,0 +1,101 @@ +import json + +import pytest +from ckan.tests.helpers import call_action +from geomet import wkt +from rdflib import Graph +from rdflib.namespace import RDF +from rdflib.term import URIRef + +from ckanext.dcat import utils +from ckanext.dcat.processors import RDFSerializer +from ckanext.dcat.profiles import ( + ADMS, + DCAT, + DCATAP, + DCT, + FOAF, + GSP, + LOCN, + OWL, + RDF, + RDFS, 
+ SKOS, + SPDX, + VCARD, + XSD, +) +from ckanext.dcat.profiles.euro_health_dcat_ap import HEALTHDCATAP +from ckanext.dcat.tests.utils import BaseSerializeTest + +DCAT_AP_PROFILES = ["euro_dcat_ap_3"] + + +@pytest.mark.usefixtures("with_plugins", "clean_db") +@pytest.mark.ckan_config("ckan.plugins", "dcat scheming_datasets") +@pytest.mark.ckan_config( + "scheming.dataset_schemas", "ckanext.dcat.schemas:health_dcat_ap.yaml" +) +@pytest.mark.ckan_config("ckanext.dcat.rdf.profiles", "euro_health_dcat_ap") +class TestEuroDCATAP3ProfileSerializeDataset(BaseSerializeTest): + def test_e2e_ckan_to_dcat(self): + """ + End to end testing of CKAN dataset to RDF triples. + + Note: in this HealthDCAT-AP profile, only the HealthDCAT-AP specific triples are tested for. + Triples in other profiles could be tested, but should mainly be tested by their respective + profiles.""" + dataset_dict = json.loads(self._get_file_contents("ckan/health_dcat_ap.json"))[ + 0 + ] + + dataset = call_action("package_create", **dataset_dict) + + # Make sure schema was used + assert dataset["hdab"][0]["name"] == "EU Health Data Access Body" + + s = RDFSerializer() + g = s.g + + dataset_ref = s.graph_from_dataset(dataset) + + # Test dataset URI + assert str(dataset_ref) == utils.dataset_uri(dataset) + + # Load the reference graph (the fixture variant without blank nodes) + contents = self._get_file_contents("dcat/dataset_health_no_blank.ttl") + reference = Graph() + reference.parse(data=contents, format="turtle") + + # First check that all non-blank nodes from the reference are present in the output + # Any other nodes added by other profiles (e.g. DCAT-AP 3) we do not have an opinion about + for triple in reference: + assert triple in g, f"Triple {triple} not in output graph" + # assert all(triple in g for triple in reference) + + # Test HealthDCAT-AP specific HDAB triples + # We can assume other blank nodes (e.g. contact point, publisher, temporal) are taken care + # of by the base profile.
+ hdab = [t for t in g.triples((dataset_ref, HEALTHDCATAP.hdab, None))] + assert len(hdab) == 1 + hdab_items = [ + (FOAF.name, dataset_dict["hdab"][0]["name"]), + (VCARD.hasEmail, URIRef("mailto:" + dataset_dict["hdab"][0]["email"])), + (FOAF.homepage, URIRef(dataset_dict["hdab"][0]["url"])), + ] + for predicate, value in hdab_items: + assert self._triple( + g, hdab[0][2], predicate, value + ), f"HDAB Predicate {predicate} does not have value {value}" + + # Test qualified relation + relation = [t for t in g.triples((dataset_ref, DCAT.qualifiedRelation, None))] + assert len(relation) == 1 + relation_items = [ + (DCT.relation, URIRef(dataset_dict["qualified_relation"][0]["relation"])), + (DCAT.hadRole, URIRef(dataset_dict["qualified_relation"][0]["role"])), + ] + for predicate, value in relation_items: + assert self._triple( + g, relation[0][2], predicate, value + ), f"relation Predicate {predicate} does not have value {value}" diff --git a/ckanext/dcat/tests/shacl/test_shacl.py b/ckanext/dcat/tests/shacl/test_shacl.py index 0a550c27..62cf2644 100644 --- a/ckanext/dcat/tests/shacl/test_shacl.py +++ b/ckanext/dcat/tests/shacl/test_shacl.py @@ -147,6 +147,9 @@ def test_validate_dcat_ap_2_graph_shapes_range(): known_failures = [ "Value does not have class skos:Concept", "Value does not have class dcat:Dataset", + # Qualified relations + "Value does not conform to Shape :DcatResource_Shape. See details for more information.", + "The node is either a Catalog, Dataset or a DataService", ] assert set(failures) - set(known_failures) == set(), results_text diff --git a/docs/application-profiles.md b/docs/application-profiles.md new file mode 100644 index 00000000..12374b12 --- /dev/null +++ b/docs/application-profiles.md @@ -0,0 +1,80 @@ +# Application profiles + +Besides the [base profiles](profiles.md) there are other profiles included to support other domain +specific application profiles. + +!!! 
Note + If you are interested in contributing a profile that might be useful for the wider community + check the documentation on [writing custom profiles](writing-profiles.md) and the + [contribution guidelines](contributing.md#including-new-profiles). + +## HealthDCAT-AP + +### Introduction + +This extension contains a profile (`euro_health_dcat_ap`) for the proposed +[HealthDCAT-AP](https://healthdcat-ap.github.io/) specification. +This is a health-related extension of the DCAT application profile for sharing information about +Catalogues containing Datasets and Data Services descriptions in Europe (DCAT-AP). + +The development of a Health DCAT application profile aims to standardize health metadata within +the scope of the [European Health Data Space](https://health.ec.europa.eu/ehealth-digital-health-and-care/european-health-data-space_en) +(EHDS), fostering greater interoperability, findability and accessibility of electronic health +data across the EU. + +The goal of this profile is to provide the wider FAIR community and other EU portals with a starting +point for implementing HealthDCAT-AP within their own data catalogs. + +!!! Note + HealthDCAT-AP is still under active development and not finalized yet. Cardinalities, + certain vocabularies and the namespace have not been officially ratified yet. These are + expected to be finalized after the public consultation in Q1 2025. + + +### Usage + +Use the included `euro_health_dcat_ap` profile in your configuration: + +```ini +ckanext.dcat.rdf.profiles = euro_health_dcat_ap +``` + +The HealthDCAT-AP profile is an extension of the DCAT-AP v3 profile and requires ckanext-scheming. +See the [documentation](getting-started.md#schemas) on how to set it up. You can use the included +`health_dcat_ap.yaml` schema file as a starting point to adapt it to your needs: + +```ini +scheming.dataset_schemas = ckanext.dcat.schemas:health_dcat_ap.yaml +``` + +This profile has currently no additional settings. 
+ +### Limitations and deviations + +As HealthDCAT-AP is still a draft, it is bound to change. There are currently still some +inconsistencies in the standard and ambiguities regarding certain properties. Below is a short summary +of limitations and implementation decisions made during development of this profile. + +1. Cardinalities have not yet been finalized for HealthDCAT-AP. This CKAN schema has taken a very + liberal approach and takes all values as strictly optional (no failed validation for missing + fields). Note that some mandatory fields are currently impossible to fill with real data e.g. the + Health Data Access Body (HDAB) field: the EHDS legislation has not been implemented yet and no HDABs + have been formally appointed. +2. The HealthDCAT-AP namespace is not formally defined yet. For now, + `http://healthdataportal.eu/ns/health#` is used. This will be updated once the final namespace is + standardized. +3. The official examples of the standard use the `dct:description` property to encode the data + purpose. This does not seem to conform to the Data Privacy Vocabulary specification, which + proposes a controlled vocabulary. See [this issue](https://github.com/HealthDCAT-AP-de/healthdcat-ap.de/issues/11) + for the German perspective on this. +4. The distributions proposed by HealthDCAT-AP, *analytics* and *sample*, are not specifically + implemented. URIs are linked, the resources themselves are not loaded. For *sample*, as this is + an upstream DCAT-AP property, this can be included once picked up there. +5. Documentation (*foaf:page*) is implemented as a URI. There is some HealthDCAT-AP example data + out in the wild that uses a blank node for this and adds several properties, however this is + inconsistent with other DCAT implementations. +6. DatasetSeries are not supported yet by CKAN, and also not by this profile. +7. The *quality annotation* property has not been implemented due to usage not being completely +defined yet. +8.
There is no multilingual support yet. +9. For other properties, any limitations from the DCAT-AP profiles still apply. diff --git a/docs/endpoints.md b/docs/endpoints.md index f3fd6585..748cbf87 100644 --- a/docs/endpoints.md +++ b/docs/endpoints.md @@ -42,7 +42,7 @@ RDF representations will be advertised using `` tags on th ``` -Check the [RDF DCAT Serializer](profiles.md#rdf-dcat-serializer) section for more details about how these are generated and how to customize the output using [profiles](profiles.md#profiles). +Check the [RDF DCAT Serializer](writing-profiles.md#rdf-dcat-serializer) section for more details about how these are generated and how to customize the output using [profiles](profiles.md#profiles). You can specify the profile by using the `profiles=,` query parameter on the dataset endpoint (as a comma-separated list): diff --git a/docs/getting-started.md b/docs/getting-started.md index 86462e44..e1ac04bc 100644 --- a/docs/getting-started.md +++ b/docs/getting-started.md @@ -31,14 +31,14 @@ The extension includes ready to use [ckanext-scheming](https://github.com/ckan/c that enable DCAT support. These include a schema definition file (located in [`ckanext/dcat/schemas`](https://github.com/ckan/ckanext-dcat/tree/master/ckanext/dcat/schemas)) plus extra validators and other custom logic that integrates the metadata modifications with the -RDF DCAT [Parsers](profiles.md#rdf-dcat-parser) and [Serializers](profiles.md#rdf-dcat-serializer) and other CKAN features and extensions. +RDF DCAT [Parsers](writing-profiles.md#rdf-dcat-parser) and [Serializers](writing-profiles.md#rdf-dcat-serializer) and other CKAN features and extensions. There are the following schemas currently included with the extension: * *dcat_ap_recommended.yaml*: Includes the recommended properties for `dcat:Dataset` and `dcat:Distribution` according to the DCAT AP specification. You can use this schema with the `euro_dcat_ap_2` (+ `euro_dcat_ap_scheming`) and `euro_dcat_ap_3` profiles. 
* *dcat_ap_full.yaml*: Includes most of the properties defined for `dcat:Dataset` and `dcat:Distribution` in the [DCAT AP v2.1](https://semiceu.github.io/DCAT-AP/releases/2.1.1/) and [DCAT AP v3](https://semiceu.github.io/DCAT-AP/releases/3.0.0/) specification. You can use this schema with the `euro_dcat_ap_2` (+ `euro_dcat_ap_scheming`) and `euro_dcat_ap_3` profiles. * *dcat_ap_multilingual.yaml*: An example schema implementing multilingual metadata in some fields using [ckanext-fluent](https://github.com/ckan/ckanext-fluent). See [Multilingual support](profiles.md#multilingual-support) for more information. - +* *dcat_us_full.yaml*: Includes most of the properties defined for `dcat:Dataset` and `dcat:Distribution` in the [DCAT US v3](https://doi-do.github.io/dcat-us/) specification. You can use this schema with the `dcat_us_3` profile. Most sites will want to use these as a base to create their own custom schema to address their own requirements, perhaps alongside a [custom profile](profiles.md#profiles). Of course site maintainers can add or remove schema fields, as well as change the existing validators. diff --git a/docs/harvester.md b/docs/harvester.md index 0a0b2aa2..02ca2711 100644 --- a/docs/harvester.md +++ b/docs/harvester.md @@ -1,6 +1,6 @@ ## RDF DCAT harvester -The [RDF parser](profiles.md#rdf-dcat-parser) described in the previous section has been integrated into a harvester, +The [RDF parser](writing-profiles.md#rdf-dcat-parser) described in the previous section has been integrated into a harvester, to allow automatic import of datasets from remote sources. To enable the RDF harvester, add the `harvest` and `dcat_rdf_harvester` plugins to your CKAN configuration file (you will also need to install [ckanext-harvest](https://github.com/ckan/ckanext-harvest)): ckan.plugins = ... 
harvest dcat_rdf_harvester diff --git a/docs/index.md b/docs/index.md index c3caca01..925d2bef 100644 --- a/docs/index.md +++ b/docs/index.md @@ -116,6 +116,6 @@ These are implemented internally using: * A base [mapping](mapping.md) between DCAT and CKAN datasets and viceversa (compatible with **DCAT-AP** [v1.1](https://joinup.ec.europa.eu/asset/dcat_application_profile/asset_release/dcat-ap-v11), [v2.1](https://joinup.ec.europa.eu/collection/semantic-interoperability-community-semic/solution/dcat-application-profile-data-portals-europe/release/210) and [v3](https://semiceu.github.io/DCAT-AP/releases/3.0.0/) and **DCAT-US** [v3](https://doi-do.github.io/dcat-us/)). -* An [RDF Parser](profiles.md#rdf-dcat-parser) that allows to read RDF serializations in different formats and extract CKAN dataset dicts, using customizable [profiles](profiles.md#profiles). +* An [RDF Parser](writing-profiles.md#rdf-dcat-parser) that allows to read RDF serializations in different formats and extract CKAN dataset dicts, using customizable [profiles](profiles.md#profiles). -* An [RDF Serializer](profiles.md#rdf-dcat-serializer) that allows to transform CKAN datasets metadata to different semantic formats, also allowing customizable [profiles](profiles.md#profiles). +* An [RDF Serializer](writing-profiles.md#rdf-dcat-serializer) that allows to transform CKAN datasets metadata to different semantic formats, also allowing customizable [profiles](profiles.md#profiles). diff --git a/docs/mapping.md b/docs/mapping.md index d127a1c9..fa05ade2 100644 --- a/docs/mapping.md +++ b/docs/mapping.md @@ -4,7 +4,7 @@ The following table provides a generic mapping between the fields of the `dcat:D their equivalents in the CKAN model. In most cases this mapping is deliberately a loose one. For instance, it does not try to link the DCAT publisher property with a CKAN dataset author, maintainer or organization, as the link between them is not straight-forward and may depend on a particular instance needs. 
When mapping from CKAN metadata to DCAT though, there are in some cases fallback fields -that are used if the default field is not present (see [RDF Serializer](profiles.md#rdf-dcat-serializer) for more details on this). +that are used if the default field is not present (see [RDF Serializer](writing-profiles.md#rdf-dcat-serializer) for more details on this). This mapping is compatible with **DCAT-AP** [v1.1](https://joinup.ec.europa.eu/asset/dcat_application_profile/asset_release/dcat-ap-v11), [v2.1](https://joinup.ec.europa.eu/collection/semantic-interoperability-community-semic/solution/dcat-application-profile-data-portals-europe/release/210) and [v3](https://semiceu.github.io/DCAT-AP/releases/3.0.0/) and **DCAT-US** [v3](https://doi-do.github.io/dcat-us/). It depends on the active [profile(s)](profiles.md#profiles) and the fields present in your custom [schema](getting-started.md#schemas) which DCAT properties are mapped. @@ -276,7 +276,7 @@ If no `publisher` or `publisher_*` fields are found, the serializers will fall b ### Spatial coverage -The following formats for `dct:spatial` are supported by the default [parser](profiles.md#rdf-dcat-parser). Note that the default [serializer](profiles.md#rdf-dcat-serializer) will return the single `dct:spatial` instance form by default. +The following formats for `dct:spatial` are supported by the default [parser](writing-profiles.md#rdf-dcat-parser). Note that the default [serializer](writing-profiles.md#rdf-dcat-serializer) will return the single `dct:spatial` instance form by default. - One `dct:spatial` instance, URI only diff --git a/docs/profiles.md b/docs/profiles.md index 9813ff0f..21f8d8a9 100644 --- a/docs/profiles.md +++ b/docs/profiles.md @@ -11,14 +11,15 @@ Profiles define : They essentially define the mapping between DCAT and CKAN. In most cases the default profile will provide a good mapping that will cover most properties described in the DCAT standard. 
If you want to extract extra fields defined in the RDF, are using a custom schema or -need custom logic, you can write a [custom profile](#writing-custom-profiles) that extends or replaces one of the default ones. +need custom logic, you can write a [custom profile](writing-profiles.md) that extends or replaces one of the default ones. The profiles currently shipped with the extension are mostly based in the -[DCAT application profile for data portals in Europe](https://joinup.ec.europa.eu/asset/dcat_application_profile/description). As mentioned before though, they should be generic enough for most DCAT based representations. +DCAT application profiles for data portals in [Europe](https://joinup.ec.europa.eu/asset/dcat_application_profile/description) and the [US](https://doi-do.github.io/dcat-us/). As mentioned before though, they should be generic enough for most DCAT based representations. Sites that want to support a particular version of the DCAT-AP can enable a specific profile using one of the profiles below: * [DCAT-AP v3](https://semiceu.github.io/DCAT-AP/releases/3.0.0) (default): `euro_dcat_ap_3` +* [DCAT-US v3](https://doi-do.github.io/dcat-us/): `dcat_us_3` * [DCAT-AP v2.1.0](https://joinup.ec.europa.eu/collection/semantic-interoperability-community-semic/solution/dcat-application-profile-data-portals-europe/release/210): `euro_dcat_ap_2` * [DCAT-AP v1.1.1](https://joinup.ec.europa.eu/asset/dcat_application_profile/asset_release/dcat-ap-v11): `euro_dcat_ap` @@ -45,75 +46,6 @@ serializer = RDFSerializer(profiles=['euro_dcat_ap', 'sweden_dcat_ap']) Note that in both cases the order in which you define them is important, as it will be the one that the profiles will be run on. -### Writing custom profiles - -Internally, profiles are classes that define a particular set of methods called during the parsing process. -For instance, the `parse_dataset()` method is called on each DCAT dataset found when parsing an RDF file, and should return a CKAN dataset. 
-Conversely, the `graph_from_dataset()` will be called when requesting an RDF representation for a dataset, and will need to generate the necessary RDF graph. - -Custom profiles should always extend the `ckanext.dcat.profiles.RDFProfile` class. This class has several helper -functions to make getting metadata from the RDF graph easier. These include helpers for getting fields for FOAF and VCard entities like the ones -used to define publishers or contact points. Check the source code of `ckanex.dcat.profiles.base.py` to see what is available. - -Profiles can extend other profiles to avoid repeating rules, or can be completely independent. - -The following example shows a complete example of a profile built on top of the European DCAT-AP profile (`euro_dcat_ap`): - -```python - -from rdflib.namespace import Namespace -from ckanext.dcat.profiles import RDFProfile - -DCT = Namespace("http://purl.org/dc/terms/") - - -class SwedishDCATAPProfile(RDFProfile): - ''' - An RDF profile for the Swedish DCAT-AP recommendation for data portals - - It requires the European DCAT-AP profile (`euro_dcat_ap`) - ''' - - def parse_dataset(self, dataset_dict, dataset_ref): - - # Spatial label - spatial = self._object(dataset_ref, DCT.spatial) - if spatial: - spatial_label = self.g.label(spatial) - if spatial_label: - dataset_dict['extras'].append({'key': 'spatial_text', - 'value': str(spatial_label)}) - - return dataset_dict - - def graph_from_dataset(self, dataset_dict, dataset_ref): - - g = self.g - - spatial_uri = self._get_dataset_value(dataset_dict, 'spatial_uri') - spatial_text = self._get_dataset_value(dataset_dict, 'spatial_text') - - if spatial_uri: - spatial_ref = URIRef(spatial_uri) - else: - spatial_ref = BNode() - - if spatial_text: - g.add((dataset_ref, DCT.spatial, spatial_ref)) - g.add((spatial_ref, RDF.type, DCT.Location)) - g.add((spatial_ref, RDFS.label, Literal(spatial_text))) -``` - -Note how the dataset dict is passed between profiles so it can be further tweaked. 
- -Extensions define their available profiles using the `ckan.rdf.profiles` entrypoint in the `setup.py` file, as in this [example](https://github.com/ckan/ckanext-dcat/blob/cc5fcc7be0be62491301db719ce597aec7c684b0/setup.py#L37:L38) from this same extension: - - [ckan.rdf.profiles] - euro_dcat_ap=ckanext.dcat.profiles:EuropeanDCATAPProfile - euro_dcat_ap_2=ckanext.dcat.profiles:EuropeanDCATAP2Profile - euro_dcat_ap_3=ckanext.dcat.profiles:EuropeanDCATAP3Profile - euro_dcat_ap_scheming=ckanext.dcat.profiles:EuropeanDCATAPSchemingProfile - schemaorg=ckanext.dcat.profiles:SchemaOrgProfile ## Multilingual support @@ -185,132 +117,6 @@ See [*examples/ckan/ckan_dataset_multilingual.json*](https://github.com/ckan/cka for examples of a multilingual CKAN dataset and DCAT serialization. -Users [writing custom profiles](#writing-custom-profiles) can make use of the `_object_value_multilingual()` +Users [writing custom profiles](writing-profiles.md) can make use of the `_object_value_multilingual()` and `_object_value_list_multilingual()` functions of the profile class to handle custom fields not defined in the base profiles. - - -## Internals - -### RDF DCAT Parser - -The `ckanext.dcat.processors.RDFParser` class allows to read RDF serializations in different -formats and extract CKAN dataset dicts. It will look for DCAT datasets and distributions -and create CKAN datasets and resources, as dictionaries that can be passed to [`package_create`](http://docs.ckan.org/en/latest/api/index.html#ckan.logic.action.create.package_create) or [`package_update`](http://docs.ckan.org/en/latest/api/index.html#ckan.logic.action.update.package_update). 
- -Here is a quick overview of how it works: - -```python - -from ckanext.dcat.processors import RDFParser, RDFParserException - -parser = RDFParser() - -# Parsing a local RDF/XML file - -with open('datasets.rdf', 'r') as f: - try: - parser.parse(f.read()) - - for dataset in parser.datasets(): - print('Got dataset with title {0}'.format(dataset['title']) - - except RDFParserException, e: - print ('Error parsing the RDF file: {0}'.format(e)) - -# Parsing a remote JSON-LD file - -import requests - -parser = RDFParser() - -content = requests.get('https://some.catalog.org/datasets.jsonld').content - -try: - parser.parse(content, _format='json-ld') - - for dataset in parser.datasets(): - print('Got dataset with title {0}'.format(dataset['title']) - -except RDFParserException, e: - print ('Error parsing the RDF file: {0}'.format(e)) - -``` - -The parser is implemented using [RDFLib](https://rdflib.readthedocs.org/), a Python library for working with RDF. Any -RDF serialization format supported by RDFLib can be parsed into CKAN datasets. The `examples` folder contains -serializations in different formats including RDF/XML, Turtle or JSON-LD. - -### RDF DCAT Serializer - -The `ckanext.dcat.processors.RDFSerializer` class generates RDF serializations in different -formats from CKAN dataset dicts, like the ones returned by [`package_show`](http://docs.ckan.org/en/latest/api/index.html#ckan.logic.action.get.package_show) or [`package_search`](http://docs.ckan.org/en/latest/api/index.html#ckan.logic.action.get.package_search). 
- -Here is an example of how to use it: - -```python - -from ckanext.dcat.processors import RDFSerializer - -# Serializing a single dataset - -dataset = get_action('package_show')({}, {'id': 'my-dataset'}) - -serializer = RDFserializer() - -dataset_ttl = serializer.serialize_dataset(dataset, _format='turtle') - - -# Serializing the whole catalog (or rather part of it) - -datasets = get_action('package_search')({}, {'q': '*:*', 'rows': 50}) - -serializer = RDFserializer() - -catalog_xml = serializer.serialize_catalog({'title': 'My catalog'}, - dataset_dicts=datasets, - _format='xml') - -# Creating and RDFLib graph from a single dataset - -dataset = get_action('package_show')({}, {'id': 'my-dataset'}) - -serializer = RDFserializer() - -dataset_reference = serializer.graph_from_dataset(dataset) - -# serializer.g now contains the full dataset graph, an RDFLib Graph class - -``` - -The serializer uses customizable [profiles](#profiles) to generate an RDF graph (an [RDFLib Graph class](https://rdflib.readthedocs.org/en/latest/apidocs/rdflib.html#rdflib.graph.Graph)). -By default these use the [mapping](mapping.md) described in the previous section. - -In some cases, if the default CKAN field that maps to a DCAT property is not present, some other fallback -values will be used instead. For instance, if the `contact_email` field is not found, `maintainer_email` -and `author_email` will be used (if present) for the email property of the `adms:contactPoint` property. - -Note that the serializer will look both for a first level field or an extra field with the same key, ie both -the following values will be used for `dct:accrualPeriodicity`: - - { - "name": "my-dataset", - "frequency": "monthly", - ... - } - - { - "name": "my-dataset", - "extras": [ - {"key": "frequency", "value": "monthly"}, - ] - ... 
- } - -Once the dataset graph has been obtained, this is serialized into a text format using [RDFLib](https://rdflib.readthedocs.org/), -so any format it supports can be obtained (common formats are 'xml', 'turtle' or 'json-ld'). - - - - - diff --git a/docs/writing-profiles.md b/docs/writing-profiles.md new file mode 100644 index 00000000..19cf0543 --- /dev/null +++ b/docs/writing-profiles.md @@ -0,0 +1,191 @@ +## Writing custom profiles + +Internally, profiles are classes that define a particular set of methods called during the parsing process. +For instance, the `parse_dataset()` method is called on each DCAT dataset found when parsing an RDF file, and should return a CKAN dataset. +Conversely, the `graph_from_dataset()` will be called when requesting an RDF representation for a dataset, and will need to generate the necessary RDF graph. + +Custom profiles should always extend the `ckanext.dcat.profiles.RDFProfile` class. This class has several helper +functions to make getting metadata from the RDF graph easier. These include helpers for getting fields for FOAF and VCard entities like the ones +used to define publishers or contact points. Check the source code of `ckanext/dcat/profiles/base.py` to see what is available. + +Profiles can extend other profiles to avoid repeating rules, or can be completely independent.
+ +The following example shows a complete example of a profile built on top of the European DCAT-AP profile (`euro_dcat_ap`): + +```python + +from rdflib.namespace import Namespace +from ckanext.dcat.profiles import RDFProfile + +DCT = Namespace("http://purl.org/dc/terms/") + + +class SwedishDCATAPProfile(RDFProfile): + ''' + An RDF profile for the Swedish DCAT-AP recommendation for data portals + + It requires the European DCAT-AP profile (`euro_dcat_ap`) + ''' + + def parse_dataset(self, dataset_dict, dataset_ref): + + # Spatial label + spatial = self._object(dataset_ref, DCT.spatial) + if spatial: + spatial_label = self.g.label(spatial) + if spatial_label: + dataset_dict['extras'].append({'key': 'spatial_text', + 'value': str(spatial_label)}) + + return dataset_dict + + def graph_from_dataset(self, dataset_dict, dataset_ref): + + g = self.g + + spatial_uri = self._get_dataset_value(dataset_dict, 'spatial_uri') + spatial_text = self._get_dataset_value(dataset_dict, 'spatial_text') + + if spatial_uri: + spatial_ref = URIRef(spatial_uri) + else: + spatial_ref = BNode() + + if spatial_text: + g.add((dataset_ref, DCT.spatial, spatial_ref)) + g.add((spatial_ref, RDF.type, DCT.Location)) + g.add((spatial_ref, RDFS.label, Literal(spatial_text))) +``` + +Note how the dataset dict is passed between profiles so it can be further tweaked. 
+ +Extensions define their available profiles using the `ckan.rdf.profiles` entrypoint in the `setup.py` file, as in this [example](https://github.com/ckan/ckanext-dcat/blob/cc5fcc7be0be62491301db719ce597aec7c684b0/setup.py#L37:L38) from this same extension: + + [ckan.rdf.profiles] + euro_dcat_ap=ckanext.dcat.profiles:EuropeanDCATAPProfile + euro_dcat_ap_2=ckanext.dcat.profiles:EuropeanDCATAP2Profile + euro_dcat_ap_3=ckanext.dcat.profiles:EuropeanDCATAP3Profile + euro_dcat_ap_scheming=ckanext.dcat.profiles:EuropeanDCATAPSchemingProfile + schemaorg=ckanext.dcat.profiles:SchemaOrgProfile + + +## Internals + +### RDF DCAT Parser + +The `ckanext.dcat.processors.RDFParser` class reads RDF serializations in different +formats and extracts CKAN dataset dicts from them. It will look for DCAT datasets and distributions +and create CKAN datasets and resources, as dictionaries that can be passed to [`package_create`](http://docs.ckan.org/en/latest/api/index.html#ckan.logic.action.create.package_create) or [`package_update`](http://docs.ckan.org/en/latest/api/index.html#ckan.logic.action.update.package_update). 
+ +Here is a quick overview of how it works: + +```python + +from ckanext.dcat.processors import RDFParser, RDFParserException + +parser = RDFParser() + +# Parsing a local RDF/XML file + +with open('datasets.rdf', 'r') as f: + try: + parser.parse(f.read()) + + for dataset in parser.datasets(): + print('Got dataset with title {0}'.format(dataset['title'])) + + except RDFParserException as e: + print('Error parsing the RDF file: {0}'.format(e)) + +# Parsing a remote JSON-LD file + +import requests + +parser = RDFParser() + +content = requests.get('https://some.catalog.org/datasets.jsonld').content + +try: + parser.parse(content, _format='json-ld') + + for dataset in parser.datasets(): + print('Got dataset with title {0}'.format(dataset['title'])) + +except RDFParserException as e: + print('Error parsing the RDF file: {0}'.format(e)) + +``` + +The parser is implemented using [RDFLib](https://rdflib.readthedocs.org/), a Python library for working with RDF. Any +RDF serialization format supported by RDFLib can be parsed into CKAN datasets. The `examples` folder contains +serializations in different formats including RDF/XML, Turtle or JSON-LD. + +### RDF DCAT Serializer + +The `ckanext.dcat.processors.RDFSerializer` class generates RDF serializations in different +formats from CKAN dataset dicts, like the ones returned by [`package_show`](http://docs.ckan.org/en/latest/api/index.html#ckan.logic.action.get.package_show) or [`package_search`](http://docs.ckan.org/en/latest/api/index.html#ckan.logic.action.get.package_search). 
+ +Here is an example of how to use it: + +```python + +from ckanext.dcat.processors import RDFSerializer + +# Serializing a single dataset + +dataset = get_action('package_show')({}, {'id': 'my-dataset'}) + +serializer = RDFSerializer() + +dataset_ttl = serializer.serialize_dataset(dataset, _format='turtle') + + +# Serializing the whole catalog (or rather part of it) + +datasets = get_action('package_search')({}, {'q': '*:*', 'rows': 50}) + +serializer = RDFSerializer() + +catalog_xml = serializer.serialize_catalog({'title': 'My catalog'}, + dataset_dicts=datasets['results'], + _format='xml') + +# Creating an RDFLib graph from a single dataset + +dataset = get_action('package_show')({}, {'id': 'my-dataset'}) + +serializer = RDFSerializer() + +dataset_reference = serializer.graph_from_dataset(dataset) + +# serializer.g now contains the full dataset graph, an RDFLib Graph class + +``` + +The serializer uses customizable [profiles](profiles.md) to generate an RDF graph (an [RDFLib Graph class](https://rdflib.readthedocs.org/en/latest/apidocs/rdflib.html#rdflib.graph.Graph)). +By default these use the [mapping](mapping.md) described in the previous section. + +In some cases, if the default CKAN field that maps to a DCAT property is not present, some other fallback +values will be used instead. For instance, if the `contact_email` field is not found, `maintainer_email` +and `author_email` will be used (if present) for the email property of the `adms:contactPoint` property. + +Note that the serializer will look for either a first-level field or an extra field with the same key, i.e. both +the following values will be used for `dct:accrualPeriodicity`: + + { + "name": "my-dataset", + "frequency": "monthly", + ... + } + + { + "name": "my-dataset", + "extras": [ + {"key": "frequency", "value": "monthly"}, + ] + ... 
+ } + +Once the dataset graph has been obtained, this is serialized into a text format using [RDFLib](https://rdflib.readthedocs.org/), +so any format it supports can be obtained (common formats are 'xml', 'turtle' or 'json-ld'). + diff --git a/examples/ckan/ckan_full_dataset_dcat_ap.json b/examples/ckan/ckan_full_dataset_dcat_ap.json index 6adb770d..bc170025 100644 --- a/examples/ckan/ckan_full_dataset_dcat_ap.json +++ b/examples/ckan/ckan_full_dataset_dcat_ap.json @@ -169,6 +169,13 @@ } ], "spatial_resolution_in_meters": 1.5, + "qualified_relation": [ + { + "uri": "", + "relation": "http://example.com/dataset/3.141592", + "role": "http://www.iana.org/assignments/relation/related" + } + ], "resources": [ { "name": "Resource 1", diff --git a/examples/ckan/ckan_full_dataset_dcat_ap_vocabularies.json b/examples/ckan/ckan_full_dataset_dcat_ap_vocabularies.json index 9e0193e9..3c8ca3c5 100644 --- a/examples/ckan/ckan_full_dataset_dcat_ap_vocabularies.json +++ b/examples/ckan/ckan_full_dataset_dcat_ap_vocabularies.json @@ -148,6 +148,13 @@ } ], "spatial_resolution_in_meters": 1.5, + "qualified_relation": [ + { + "uri": "", + "relation": "http://example.com/dataset/3.141592", + "role": "http://www.iana.org/assignments/relation/related" + } + ], "resources": [ { "name": "Resource 1", diff --git a/examples/ckan/ckan_full_dataset_dcat_us_vocabularies.json b/examples/ckan/ckan_full_dataset_dcat_us_vocabularies.json index 69ff8d6b..d9e3992f 100644 --- a/examples/ckan/ckan_full_dataset_dcat_us_vocabularies.json +++ b/examples/ckan/ckan_full_dataset_dcat_us_vocabularies.json @@ -186,6 +186,13 @@ "license": "https://resources.data.gov/vocab/license/TODO/CC_BYNC_4_0" } ], + "qualified_relation": [ + { + "uri": "", + "relation": "http://example.com/dataset/3.141592", + "role": "http://www.iana.org/assignments/relation/related" + } + ], "resources": [ { "name": "Resource 1", diff --git a/examples/ckan/health_dcat_ap.json b/examples/ckan/health_dcat_ap.json new file mode 100644 
index 00000000..2670c77b --- /dev/null +++ b/examples/ckan/health_dcat_ap.json @@ -0,0 +1,199 @@ +[ + { + "access_rights": "http://publications.europa.eu/resource/authority/access-right/NON_PUBLIC", + "analytics": [ + "http://example.com/analytics" + ], + "alternate_identifier": [ + "internalURI:admsIdentifier0" + ], + "applicable_legislation": [ + "http://data.europa.eu/eli/reg/2022/868/oj" + ], + "author": null, + "author_email": null, + "code_values": [ + "http://example.com/code1", + "http://example.com/code2" + ], + "coding_system": [ + "http://www.wikidata.org/entity/P1690", + "http://www.wikidata.org/entity/P4229" + ], + "conforms_to": [ + "http://www.wikidata.org/entity/Q19597236" + ], + "creator_user_id": null, + "dcat_type": "n1049372e768c4429a6b2200c22f5f1a4b7", + "documentation": [ + "n1049372e768c4429a6b2200c22f5f1a4b9" + ], + "frequency": "http://publications.europa.eu/resource/authority/frequency/DAILY", + "health_category": [ + "http://example.com/ontology/resource/authority/healthcategories/PHDR", + "http://example.com/ontology/resource/authority/healthcategories/IDHP", + "http://example.com/ontology/resource/authority/healthcategories/DIOH", + "http://example.com/ontology/resource/authority/healthcategories/EHRS" + ], + "health_theme": [ + "http://www.wikidata.org/entity/Q7907952", + "http://www.wikidata.org/entity/Q58624061" + ], + "id": "e7ccf79d-705c-427f-8e96-f87bcd6e5318", + "identifier": "http://example.com/dataset/1234567890", + "is_referenced_by": [ + "https://doi.org/10.1038/sdata.2016.18", + "https://dx.doi.org/10.1002/jmri.28679" + ], + "isopen": false, + "issued": "2024-01-01T00:00:00+00:00", + "language": [ + "http://publications.europa.eu/resource/authority/language/ENG", + "http://publications.europa.eu/resource/authority/language/NLD", + "http://publications.europa.eu/resource/authority/language/FRA" + ], + "legal_basis": [ + "https://w3id.org/dpv#Consent" + ], + "license_id": "", + "license_title": "", + "maintainer": null, + 
"maintainer_email": null, + "max_typical_age": "110", + "metadata_created": "2024-12-02T19:00:30.897399", + "metadata_modified": "2024-12-02T19:00:30.897406", + "min_typical_age": "0", + "modified": "2024-12-31T23:59:59+00:00", + "name": "test-dcat-1", + "notes": "This dataset is an example of using HealthDCAT-AP in CKAN", + "num_resources": 0, + "num_tags": 3, + "number_of_records": "123456789", + "number_of_unique_individuals": "7654321", + "organization": null, + "personal_data": [ + "https://w3id.org/dpv/dpv-pd#Age", + "https://w3id.org/dpv/dpv-pd#Gender", + "https://w3id.org/dpv/dpv-pd#HealthRecord" + ], + "population_coverage": [ + "This example includes a very non-descript population" + ], + "private": false, + "provenance": "This example dataset is partly sourced from TEHDAS2", + "publisher_note": [ + "Health-RI is the Dutch health care initiative to build an integrated health data infrastructure for research and innovation." + ], + "publisher_type": [ + "http://example.com/publisherType/undefined" + ], + "purpose": [ + "https://w3id.org/dpv#AcademicResearch" + ], + "qualified_relation": [ + { + "uri": "", + "relation": "http://example.com/dataset/3.141592", + "role": "http://www.iana.org/assignments/relation/related" + } + ], + "state": "active", + "temporal_resolution": "P1D", + "theme": [ + "http://publications.europa.eu/resource/authority/data-theme/HEAL" + ], + "title": "HealthDCAT-AP test dataset", + "type": "dataset", + "uri": "http://example.healthdata.nl/set/dataset", + "version_notes": "Dataset continuously updated", + "contact": [ + { + "email": "covacsurv@sciensano.be", + "identifier": "", + "name": "Contact Point" + } + ], + "creator": [ + { + "email": "info@example.com", + "identifier": "", + "name": "Contact Point", + "type": "", + "url": "https:/example.com/homepage" + } + ], + "extras": [ + { + "key": "related_resource", + "value": "[\"http://example.com/dataset/9876543210\"]" + }, + { + "key": "sample", + "value": 
"[\"http://example.com/sample\"]" + }, + { + "key": "spatial_uri", + "value": "http://publications.europa.eu/resource/authority/country/BEL" + } + ], + "hdab": [ + { + "email": "hdab@example.com", + "identifier": "", + "name": "EU Health Data Access Body", + "type": "", + "uri": "", + "url": "https://www.example.com/hdab" + } + ], + "publisher": [ + { + "email": "info@example.com", + "identifier": "", + "name": "Contact Point", + "type": "", + "uri": "", + "url": "https://healthdata.nl" + } + ], + "retention_period": [ + { + "end": "2034-12-31", + "start": "2020-03-01" + } + ], + "tags": [ + { + "display_name": "Test 1", + "id": "5c418ec2-cb41-4c42-9b9c-f5d1e3a831e5", + "name": "Test 1", + "state": "active", + "vocabulary_id": null + }, + { + "display_name": "Test 2", + "id": "c4117ace-2114-470d-b6e9-0df7580a12d8", + "name": "Test 2", + "state": "active", + "vocabulary_id": null + }, + { + "display_name": "Test 3", + "id": "d5a5288d-3bff-431e-be94-12c71d25d75b", + "name": "Test 3", + "state": "active", + "vocabulary_id": null + } + ], + "temporal_coverage": [ + { + "end": "2024-12-31", + "start": "2020-03-01" + } + ], + "resources": [], + "groups": [], + "relationships_as_subject": [], + "relationships_as_object": [] + } +] \ No newline at end of file diff --git a/examples/dcat/dataset.rdf b/examples/dcat/dataset.rdf index 42f1ea5e..5ce71e1c 100644 --- a/examples/dcat/dataset.rdf +++ b/examples/dcat/dataset.rdf @@ -70,6 +70,12 @@ PT15M + + + + + + Point of Contact diff --git a/examples/dcat/dataset_health.ttl b/examples/dcat/dataset_health.ttl new file mode 100644 index 00000000..a665c1ee --- /dev/null +++ b/examples/dcat/dataset_health.ttl @@ -0,0 +1,295 @@ +@prefix adms: . +@prefix dcat: . +@prefix dcatap: . +@prefix dct: . +@prefix dqv: . +@prefix foaf: . +@prefix locn: . +@prefix oa: . +@prefix prov: . +@prefix rdfs: . +@prefix skos: . +@prefix spdx: . +@prefix vcard: . 
+ + + a dcat:Resource , dcat:Dataset; + dcatap:applicableLegislation ; + + ; + + , + ; + , + ; + + [ a foaf:Organization; + foaf:homepage ; + foaf:mbox ; + foaf:name "EU Health Data Access Body" + ]; + + , , , ; + + , ; + + "110"^^; + + "0"^^; + + "123456789"^^; + + "7654321"^^; + + "This example includes a very non-descript population"; + + "Health-RI is the Dutch health care initiative to build an integrated health data infrastructure for research and innovation."; + + ; + + [ a dct:PeriodOfTime; + rdfs:comment "As stated in the CSI deliberation"; + dcat:endDate "2034-12-31"^^; + dcat:startDate "2020-03-01"^^ + ]; + dct:accessRights ; + dct:accrualPeriodicity ; + dct:alternative "TEST-DATASET"; + dct:conformsTo ; + dct:creator ; + dct:description "This dataset is an example of using HealthDCAT-AP in CKAN"; + dct:identifier "http://example.com/dataset/1234567890"^^; + dct:isPartOf ; + dct:isReferencedBy , ; + dct:issued "2024-01-01T00:00:00Z"^^; + dct:language , , ; + dct:modified "2024-12-31T23:59:59Z"^^; + dct:provenance [ a dct:ProvenanceStatement; + rdfs:label "This example dataset is partly sourced from TEHDAS2" + ]; + dct:publisher [ a foaf:Organization , foaf:Agent; + foaf:homepage ; + foaf:mbox ; + foaf:name "Contact Point" + ]; + dct:relation ; + dcat:qualifiedRelation [ + a dcat:Relationship ; + dct:relation ; + dcat:hadRole + ]; + dct:spatial ; + dct:temporal [ a dct:PeriodOfTime; + dcat:endDate "2024-12-31"^^; + dcat:startDate "2020-03-01"^^ + ]; + dct:title "HealthDCAT-AP test dataset"; + dct:type [ a skos:Concept; + skos:inScheme ; + skos:prefLabel "Personal Data" + ]; + adms:identifier ; + adms:sample ; + adms:versionNotes "Dataset continuously updated"; + dcat:contactPoint ; + # dcat:distribution ; + dcat:hasVersion ; + dcat:keyword "Test 1" , "Test 2" , "Test 3"; + dcat:spatialResolutionInMeters "10"^^; + dcat:temporalResolution "P1D"^^; + dcat:theme ; + # dcat:version is not mapped in ckan and should be hasVersion + # dcat:version "Project 
HDBP0250"; + dqv:hasQualityAnnotation [ a dqv:QualityCertificate; + oa:hasBody ; + oa:hasTarget ; + oa:motivatedBy dqv:qualityAssessment + ]; + prov:qualifiedAttribution ; + prov:wasGeneratedBy ; + foaf:page [ a foaf:Document; + rdfs:label "Landing Page for Sciensano"; + foaf:homepage + ]; + + ; + + , + , + ; + + . + + + a dcat:Distribution; + dcatap:applicableLegislation ; + dct:format ; + dct:identifier "http://ehelse.healthdataportal.eu/analytics/47f55653-a151-48c1-8d90-940561da6e57"; + dct:isPartOf ; + dct:issued "2024-06-03T08:51:00Z"^^; + dct:license ; + dct:modified "2024-06-04T18:00:00Z"^^; + dct:rights [ a dct:RightsStatement; + rdfs:label "_g_L202C11377" , "internalURI:wasGeneratedBy0" , "_g_L123C7733" + ]; + dct:title "Technical report number of unique study subjects available by environment for project HDBP0250"; + dcat:accessURL ; + dcat:downloadURL ; + dcat:mediaType . + + + a dct:MediaType . + + + a foaf:Agent; + foaf:homepage ; + foaf:mbox ; + foaf:name "Contact Point" . + + + a adms:Identifier; + skos:notation "https://www.healthinformationportal.eu/health-information-sources/linking-registers-covid-19-vaccine-surveillance"^^; + adms:schemaAgency "Health Information Portal" . + + + a vcard:Organization , vcard:Kind; + vcard:fn "Contact Point"; + vcard:hasEmail ; + vcard:hasURL ; + vcard:organisationName "Contact Point"; + vcard:organisationUnit "Health Information" . + + + a dcat:CatalogRecord; + dct:creator ; + dct:identifier "16e16149-bf41-42f6-8741-225e8c97a35e"; + dct:issued "2024-10-04T14:28:36Z"^^; + dct:modified "2024-10-09T17:34:28Z"^^; + spdx:checksum [ a spdx:Checksum; + spdx:algorithm spdx:checksumAlgorithm_md5; + spdx:checksumValue "ea77c251b6945e450ae4d66c581495d4" + ]; + foaf:primaryTopic . + + + + a dct:LinguisticSystem . + + + a ; + dct:title "ID_TU_STATBEL_POP"; + + ; + dcat:keyword "TEST-DATASET" . 
+ + + a dcat:Distribution; + dcatap:applicableLegislation ; + dct:format ; + dct:identifier "http://ehelse.healthdataportal.eu/sample/fe921169-4619-4386-8bfe-60ea131dbe96"; + dct:isPartOf ; + dct:issued "2024-06-03T08:51:00Z"^^; + dct:language ; + dct:license ; + dct:modified "2024-06-04T18:00:00Z"^^; + dct:rights [ a dct:RightsStatement; + rdfs:label "Free access." + ]; + dct:title "Proxy data generating for the EHDS2 Pilot project Sciensano Use Case"; + dcat:accessURL ; + dcat:downloadURL ; + dcat:mediaType . + + + + a dct:LinguisticSystem . + + + a dct:LinguisticSystem . + + + a skos:Concept; + skos:prefLabel "National Public Health Institute" . + + + a dct:RightsStatement . + + + a dct:Frequency . + + + a prov:Attribution; + dcat:hadRole ; + prov:agent [ a foaf:Organization; + foaf:homepage ; + foaf:mbox ; + foaf:name "Contact Point" + ] . + + + a dct:Location . + + + a skos:Concept; + dct:identifier "https://icd.who.int/browse10/2019/en#/Y59.0"^^; + skos:definition "Viral vaccines"; + skos:hasTopConcept ; + skos:notation "Y59.0"; + skos:prefLabel "Viral vaccines" . + + + a dct:MediaTypeOrExtent . + +# +# a dcat:Distribution; +# dcatap:applicableLegislation ; +# dct:description "EU Health Data Access Body For better Healthcare, Research & Policy Making"; +# dct:format ; +# dct:identifier "http://ehelse.healthdataportal.eu/distribution/13a3851d-6cdf-4570-a7f0-7f03015d1925"; +# dct:isPartOf ; +# dct:issued "2024-06-03T08:51:00Z"^^; +# dct:license ; +# dct:modified "2024-06-04T18:00:00Z"^^; +# dct:rights [ a dct:RightsStatement; +# rdfs:label "Access to data is conditional on the issuance of a permit by the HDAB after submission of a data request application (English)" +# ]; +# dct:title "EU Health Data Access Body"; +# dcat:accessURL ; +# dcat:byteSize "80000"^^ . 
+ + + a prov:Activity; + rdfs:label "http://dbpedia.org/resource/Record_linkage"; + rdfs:seeAlso ; + dct:type ; + prov:startedAtTime "2021-01-01T00:00:00Z"^^; + prov:wasAssociatedWith [ a prov:Agent; + prov:actedOnBehalfOf [ a prov:Organization , prov:Agent; + foaf:name "Contact Point" + ]; + foaf:homepage ; + foaf:mbox ; + foaf:name "Dr. Joris van Loenhout" + ]; + foaf:page . + + + a ; + + ; + + "Patient death reason\tInformation on wheter the cause of death was COVID-19."; + + "CD_COD_COVID" . + + + a skos:Concept; + dct:identifier "https://icd.who.int/browse10/2019/en#/U07.1"^^; + skos:definition "COVID-19, virus identified"; + skos:hasTopConcept ; + skos:notation "U07.1"; + skos:prefLabel "Test 1" . + + + a dct:LicenseDocument; + rdfs:label "Creative Commons Attribution-NonCommercial-NoDerivs 3.0 Unported" . diff --git a/examples/dcat/dataset_health_no_blank.ttl b/examples/dcat/dataset_health_no_blank.ttl new file mode 100644 index 00000000..ba854b5f --- /dev/null +++ b/examples/dcat/dataset_health_no_blank.ttl @@ -0,0 +1,81 @@ +# This Graph contains no blank nodes, to allow for easy comparison between a generated graph +# The blind nodes can be compared manually + +@prefix adms: . +@prefix dcat: . +@prefix dcatap: . +@prefix dct: . +@prefix dpv: . +@prefix foaf: . +@prefix healthdcatap: . +@prefix rdfs: . +@prefix skos: . +@prefix vcard: . +@prefix xsd: . 
+ + a dcat:Dataset ; +# healthdcatap:hdab [ a foaf:Agent ; +# vcard:hasEmail ; +# foaf:homepage ; +# foaf:name "EU Health Data Access Body" ] ; +# dct:provenance [ a dct:ProvenanceStatement ; +# rdfs:label "This example dataset is partly sourced from TEHDAS2" ] ; +# dct:publisher [ a foaf:Agent ; +# vcard:hasEmail ; +# foaf:homepage ; +# foaf:name "Contact Point" ] ; +# dct:temporal [ a dct:PeriodOfTime ; +# dcat:endDate "2024-12-31"^^xsd:date ; +# dcat:startDate "2020-03-01"^^xsd:date ] ; +# adms:identifier [ a adms:Identifier ; +# skos:notation "internalURI:admsIdentifier0" ] ; +# dcat:contactPoint [ a vcard:Kind ; +# vcard:fn "Contact Point" ; +# vcard:hasEmail ] ; + dcatap:applicableLegislation ; + healthdcatap:analytics ; + healthdcatap:hasCodeValues , + ; + healthdcatap:hasCodingSystem , + ; + healthdcatap:healthCategory , + , + , + , + , + ; + healthdcatap:maxTypicalAge "110"^^xsd:nonNegativeInteger ; + healthdcatap:minTypicalAge "0"^^xsd:nonNegativeInteger ; + healthdcatap:numberOfRecords "123456789"^^xsd:nonNegativeInteger ; + healthdcatap:numberOfUniqueIndividuals "7654321"^^xsd:nonNegativeInteger ; + healthdcatap:populationCoverage "This example includes a very non-descript population" ; + healthdcatap:publisherNote "Health-RI is the Dutch health care initiative to build an integrated health data infrastructure for research and innovation." 
; + healthdcatap:publisherType ; + dct:accessRights ; + dct:accrualPeriodicity ; + dct:conformsTo ; + dct:description "This dataset is an example of using HealthDCAT-AP in CKAN" ; + dct:identifier ; + dct:isReferencedBy , + ; + dct:issued "2024-01-01T00:00:00+00:00"^^xsd:dateTime ; + dct:language , + , + ; + dct:modified "2024-12-31T23:59:59+00:00"^^xsd:dateTime ; + dct:relation ; + dct:title "HealthDCAT-AP test dataset" ; + dct:type "n1049372e768c4429a6b2200c22f5f1a4b7" ; + adms:sample ; + adms:versionNotes "Dataset continuously updated" ; + dcat:keyword "Test 1", + "Test 2", + "Test 3" ; + dcat:temporalResolution "P1D"^^xsd:duration ; + dcat:theme ; + foaf:page "n1049372e768c4429a6b2200c22f5f1a4b9" ; + dpv:hasLegalBasis dpv:Consent ; + dpv:hasPurpose dpv:AcademicResearch ; + dpv:hasPersonalData , + , + . diff --git a/mkdocs.yml b/mkdocs.yml index bed64a4b..5287e055 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -70,7 +70,9 @@ nav: - DCAT support: - 'endpoints.md' - DCAT ↔ CKAN mapping: 'mapping.md' - - 'profiles.md' + - Base profiles: 'profiles.md' + - Application profiles: 'application-profiles.md' + - Writing profiles: 'writing-profiles.md' - Other features: - 'harvester.md' - Google Dataset Search: 'google-dataset-search.md' diff --git a/pyproject.toml b/pyproject.toml index b7634286..80033250 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,5 +61,6 @@ euro_dcat_ap = "ckanext.dcat.profiles:EuropeanDCATAPProfile" euro_dcat_ap_2 = "ckanext.dcat.profiles:EuropeanDCATAP2Profile" euro_dcat_ap_3 = "ckanext.dcat.profiles:EuropeanDCATAP3Profile" euro_dcat_ap_scheming = "ckanext.dcat.profiles:EuropeanDCATAPSchemingProfile" -dcat_us_3="ckanext.dcat.profiles:DCATUS3Profile" +euro_health_dcat_ap = "ckanext.dcat.profiles:EuropeanHealthDCATAPProfile" +dcat_us_3 = "ckanext.dcat.profiles:DCATUS3Profile" schemaorg = "ckanext.dcat.profiles:SchemaOrgProfile"