diff --git a/Makefile b/Makefile index d83540bc3..04f1b911b 100644 --- a/Makefile +++ b/Makefile @@ -34,7 +34,7 @@ COVBASE=coverage run --append # Updating the Major & Minor version below? # Don't forget to update setup.py as well -VERSION=8.2.$(shell date +%Y%m%d%H%M%S --utc --date=`git log --first-parent \ +VERSION=8.3.$(shell date +%Y%m%d%H%M%S --utc --date=`git log --first-parent \ --max-count=1 --format=format:%cI`) ## all : default task diff --git a/schema_salad/avro/schema.py b/schema_salad/avro/schema.py index c1eb92c5a..e02e61fdf 100644 --- a/schema_salad/avro/schema.py +++ b/schema_salad/avro/schema.py @@ -303,7 +303,7 @@ def __init__( type_schema = make_avsc_object(atype, names) except Exception as e: raise SchemaParseException( - f'Type property "{atype}" not a valid Avro schema.' + f'Type property "{atype}" not a valid Avro schema: {e}' ) from e self.set_prop("type", type_schema) self.set_prop("name", name) @@ -409,8 +409,8 @@ def __init__( items_schema = make_avsc_object(items, names) except Exception as err: raise SchemaParseException( - f"Items schema ({items}) not a valid Avro schema: (known " - f"names: {list(names.names.keys())})." + f"Items schema ({items}) not a valid Avro schema: {err}. " + f"Known names: {list(names.names.keys())})." 
) from err self.set_prop("items", items_schema) @@ -451,7 +451,7 @@ def __init__( new_schema = make_avsc_object(schema, names) except Exception as err: raise SchemaParseException( - f"Union item must be a valid Avro schema: {schema}" + f"Union item must be a valid Avro schema: {err}; {schema}," ) from err # check the new schema if ( @@ -477,7 +477,7 @@ class RecordSchema(NamedSchema): def make_field_objects(field_data: List[PropsType], names: Names) -> List[Field]: """We're going to need to make message parameters too.""" field_objects = [] # type: List[Field] - field_names = [] # type: List[str] + parsed_fields: Dict[str, PropsType] = {} for field in field_data: if hasattr(field, "get") and callable(field.get): atype = field.get("type") @@ -504,10 +504,15 @@ def make_field_objects(field_data: List[PropsType], names: Names) -> List[Field] atype, name, has_default, default, order, names, doc, other_props ) # make sure field name has not been used yet - if new_field.name in field_names: - fail_msg = f"Field name {new_field.name} already in use." - raise SchemaParseException(fail_msg) - field_names.append(new_field.name) + if new_field.name in parsed_fields: + old_field = parsed_fields[new_field.name] + if not is_subtype(old_field["type"], field["type"]): + raise SchemaParseException( + f"Field name {new_field.name} already in use with " + "incompatible type. " + f"{field['type']} vs {old_field['type']}." + ) + parsed_fields[new_field.name] = field else: raise SchemaParseException(f"Not a valid field: {field}") field_objects.append(new_field) @@ -655,3 +660,62 @@ def make_avsc_object(json_data: JsonDataType, names: Optional[Names] = None) -> # not for us! fail_msg = f"Could not make an Avro Schema object from {json_data}." 
def is_subtype(existing: PropType, new: PropType) -> bool:
    """
    Check if a new type specification is compatible with an existing one.

    *new* is compatible when it is equal to *existing* or narrows it.
    The rules, applied in order, are:

    * identical specifications are always compatible;
    * ``"Any"`` is narrowed by every specification that cannot be null
      (``None``, ``[]``, ``"null"`` and unions listing ``"null"`` are
      rejected);
    * two ``array`` specifications are compared through their ``items``;
    * two ``enum`` specifications are compatible when every new symbol is
      one of the existing symbols;
    * two ``record`` specifications are compatible when every new field
      has a same-named existing field whose type it narrows;
    * a union (list) is narrowed by another union when every alternative
      of the new union narrows the existing union, and by a non-union
      specification when that specification narrows at least one of the
      union's alternatives.

    :param existing: the type specification that was seen first.
    :param new: the re-specified type to validate against *existing*.
    :returns: ``True`` if *new* is compatible with *existing*, otherwise
        ``False``.
    :raises KeyError: if an ``array``/``enum``/``record`` specification
        lacks its ``items``/``symbols``/``fields`` (or a field lacks
        ``name``/``type``).
    """
    if existing == new:
        return True
    if isinstance(existing, list) and new in existing:
        return True
    if existing == "Any":
        # "Any" never covers null: reject a missing or empty type, a bare
        # "null", and any union that offers "null" as an alternative.
        if new is None or new == "null" or new == []:
            return False
        return not (isinstance(new, list) and "null" in new)
    if isinstance(existing, dict) and isinstance(new, dict):
        kind = existing.get("type")
        if kind != new.get("type"):
            return False
        if kind == "array":
            return is_subtype(existing["items"], new["items"])
        if kind == "enum":
            # Symbols are plain names, not type specifications: a narrowed
            # enum may only drop symbols, so a subset test is sufficient
            # (and keeps a symbol spelled "Any" from acting as a wildcard).
            allowed_symbols = existing["symbols"]
            return all(symbol in allowed_symbols for symbol in new["symbols"])
        if kind == "record":
            existing_fields = cast(List[Dict[str, Any]], existing["fields"])
            for new_field in cast(List[Dict[str, Any]], new["fields"]):
                same_name = [
                    field
                    for field in existing_fields
                    if field["name"] == new_field["name"]
                ]
                # A field unknown to the existing record widens it.
                if not same_name:
                    return False
                if not all(
                    is_subtype(field["type"], new_field["type"])
                    for field in same_name
                ):
                    return False
            return True
        return False
    if isinstance(existing, list):
        if isinstance(new, list):
            # Every alternative of the new union must fit the existing one.
            # The string form of the cast target keeps the project alias
            # from being evaluated at runtime.
            return all(
                is_subtype(existing, cast("PropType", alternative))
                for alternative in new
            )
        # A single type narrows a union if it narrows any alternative,
        # e.g. ["null", "Any"] -> "string".
        return any(is_subtype(alternative, new) for alternative in existing)
    return False
urllib.parse import quote, urlparse, urlsplit, urlunsplit from urllib.request import pathname2url from ruamel.yaml.comments import CommentedMap diff --git a/schema_salad/metaschema/metaschema.yml b/schema_salad/metaschema/metaschema.yml index 3340c2277..0e5d7b28a 100644 --- a/schema_salad/metaschema/metaschema.yml +++ b/schema_salad/metaschema/metaschema.yml @@ -309,7 +309,8 @@ $graph: type: boolean? doc: | If true, this record is abstract and may be used as a base for other - records, but is not valid on its own. + records, but is not valid on its own. Inherited fields may be + re-specified to narrow their type. - name: extends type: @@ -321,7 +322,7 @@ $graph: refScope: 1 doc: | Indicates that this record inherits fields from one or more base records. - + Inherited fields may be re-specified to narrow their type. - name: specialize type: - SpecializeDef[]? diff --git a/schema_salad/metaschema/salad.md b/schema_salad/metaschema/salad.md index a931636de..a2a08d9ce 100644 --- a/schema_salad/metaschema/salad.md +++ b/schema_salad/metaschema/salad.md @@ -9,6 +9,7 @@ Contributors: * The developers of Apache Avro * The developers of JSON-LD * Nebojša Tijanić , Seven Bridges Genomics +* Michael R. Crusoe, ELIXIR-DE # Abstract @@ -86,6 +87,13 @@ specification, the following changes have been made: is poorly documented, not included in conformance testing, and not widely supported. +## Introduction to v1.2 + +This is the fourth version of the Schema Salad specification. It was created to +ease the development of extensions to CWL v1.2. The only change is that +inherited records can narrow the types of fields if those fields are re-specified +with a matching jsonldPredicate. 
+ ## References to Other Specifications **Javascript Object Notation (JSON)**: http://json.org diff --git a/schema_salad/python_codegen_support.py b/schema_salad/python_codegen_support.py index e8770b57e..35b9a9a9a 100644 --- a/schema_salad/python_codegen_support.py +++ b/schema_salad/python_codegen_support.py @@ -19,7 +19,7 @@ Type, Union, ) -from urllib.parse import quote, urlsplit, urlunsplit, urlparse +from urllib.parse import quote, urlparse, urlsplit, urlunsplit from urllib.request import pathname2url from ruamel.yaml.comments import CommentedMap diff --git a/schema_salad/tests/metaschema-pre.yml b/schema_salad/tests/metaschema-pre.yml index 5400d64de..38f1918ad 100644 --- a/schema_salad/tests/metaschema-pre.yml +++ b/schema_salad/tests/metaschema-pre.yml @@ -3,7 +3,7 @@ "name": "https://w3id.org/cwl/salad#Semantic_Annotations_for_Linked_Avro_Data", "type": "documentation", "doc": [ - "# Semantic Annotations for Linked Avro Data (SALAD)\n\nAuthor:\n\n* Peter Amstutz , Veritas Genetics\n\nContributors:\n\n* The developers of Apache Avro\n* The developers of JSON-LD\n* Neboj\u0161a Tijani\u0107 , Seven Bridges Genomics\n\n# Abstract\n\nSalad is a schema language for describing structured linked data documents\nin JSON or YAML documents. A Salad schema provides rules for\npreprocessing, structural validation, and link checking for documents\ndescribed by a Salad schema. Salad builds on JSON-LD and the Apache Avro\ndata serialization system, and extends Avro with features for rich data\nmodeling such as inheritance, template specialization, object identifiers,\nand object references. Salad was developed to provide a bridge between the\nrecord oriented data modeling supported by Apache Avro and the Semantic\nWeb.\n\n# Status of This Document\n\nThis document is the product of the [Common Workflow Language working\ngroup](https://groups.google.com/forum/#!forum/common-workflow-language). 
The\nlatest version of this document is available in the \"schema_salad\" repository at\n\nhttps://github.com/common-workflow-language/schema_salad\n\nThe products of the CWL working group (including this document) are made available\nunder the terms of the Apache License, version 2.0.\n\n\n\n# Introduction\n\nThe JSON data model is an extremely popular way to represent structured\ndata. It is attractive because of its relative simplicity and is a\nnatural fit with the standard types of many programming languages.\nHowever, this simplicity means that basic JSON lacks expressive features\nuseful for working with complex data structures and document formats, such\nas schemas, object references, and namespaces.\n\nJSON-LD is a W3C standard providing a way to describe how to interpret a\nJSON document as Linked Data by means of a \"context\". JSON-LD provides a\npowerful solution for representing object references and namespaces in JSON\nbased on standard web URIs, but is not itself a schema language. Without a\nschema providing a well defined structure, it is difficult to process an\narbitrary JSON-LD document as idiomatic JSON because there are many ways to\nexpress the same data that are logically equivalent but structurally\ndistinct.\n\nSeveral schema languages exist for describing and validating JSON data,\nsuch as the Apache Avro data serialization system, however none understand\nlinked data. As a result, to fully take advantage of JSON-LD to build the\nnext generation of linked data applications, one must maintain separate\nJSON schema, JSON-LD context, RDF schema, and human documentation, despite\nsignificant overlap of content and obvious need for these documents to stay\nsynchronized.\n\nSchema Salad is designed to address this gap. It provides a schema\nlanguage and processing rules for describing structured JSON content\npermitting URI resolution and strict document validation. 
The schema\nlanguage supports linked data through annotations that describe the linked\ndata interpretation of the content, enables generation of JSON-LD context\nand RDF schema, and production of RDF triples by applying the JSON-LD\ncontext. The schema language also provides for robust support of inline\ndocumentation.\n\n## Introduction to v1.1\n\nThis is the third version of of the Schema Salad specification. It is\ndeveloped concurrently with v1.1 of the Common Workflow Language for use in\nspecifying the Common Workflow Language, however Schema Salad is intended to be\nuseful to a broader audience. Compared to the v1.0 schema salad\nspecification, the following changes have been made:\n\n* Support for `default` values on record fields to specify default values\n* Add subscoped fields (fields which introduce a new inner scope for identifiers)\n* Add the *inVocab* flag (default true) to indicate if a type is added to the vocabulary of well known terms or must be prefixed\n* Add *secondaryFilesDSL* micro DSL (domain specific language) to convert text strings to a secondaryFiles record type used in CWL\n* The `$mixin` feature has been removed from the specification, as it\n is poorly documented, not included in conformance testing,\n and not widely supported.\n\n## References to Other Specifications\n\n**Javascript Object Notation (JSON)**: http://json.org\n\n**JSON Linked Data (JSON-LD)**: http://json-ld.org\n\n**YAML**: https://yaml.org/spec/1.2/spec.html\n\n**Avro**: https://avro.apache.org/docs/current/spec.html\n\n**Uniform Resource Identifier (URI) Generic Syntax**: https://tools.ietf.org/html/rfc3986)\n\n**Resource Description Framework (RDF)**: http://www.w3.org/RDF/\n\n**UTF-8**: https://www.ietf.org/rfc/rfc2279.txt)\n\n## Scope\n\nThis document describes the syntax, data model, algorithms, and schema\nlanguage for working with Salad documents. 
It is not intended to document\na specific implementation of Salad, however it may serve as a reference for\nthe behavior of conforming implementations.\n\n## Terminology\n\nThe terminology used to describe Salad documents is defined in the Concepts\nsection of the specification. The terms defined in the following list are\nused in building those definitions and in describing the actions of an\nSalad implementation:\n\n**may**: Conforming Salad documents and Salad implementations are permitted but\nnot required to be interpreted as described.\n\n**must**: Conforming Salad documents and Salad implementations are required\nto be interpreted as described; otherwise they are in error.\n\n**error**: A violation of the rules of this specification; results are\nundefined. Conforming implementations may detect and report an error and may\nrecover from it.\n\n**fatal error**: A violation of the rules of this specification; results\nare undefined. Conforming implementations must not continue to process the\ndocument and may report an error.\n\n**at user option**: Conforming software may or must (depending on the modal verb in\nthe sentence) behave as described; if it does, it must provide users a means to\nenable or disable the behavior described.\n\n# Document model\n\n## Data concepts\n\nAn **object** is a data structure equivalent to the \"object\" type in JSON,\nconsisting of a unordered set of name/value pairs (referred to here as\n**fields**) and where the name is a string and the value is a string, number,\nboolean, array, or object.\n\nA **document** is a file containing a serialized object, or an array of\nobjects.\n\nA **document type** is a class of files that share a common structure and\nsemantics.\n\nA **document schema** is a formal description of the grammar of a document type.\n\nA **base URI** is a context-dependent URI used to resolve relative references.\n\nAn **identifier** is a URI that designates a single document or single\nobject within a 
document.\n\nA **vocabulary** is the set of symbolic field names and enumerated symbols defined\nby a document schema, where each term maps to absolute URI.\n\n## Syntax\n\nConforming Salad v1.1 documents are serialized and loaded using a\nsubset of YAML 1.2 syntax and UTF-8 text encoding. Salad documents\nare written using the [JSON-compatible subset of YAML described in\nsection 10.2](https://yaml.org/spec/1.2/spec.html#id2803231). The\nfollowing features of YAML must not be used in conforming Salad\ndocuments:\n\n* Use of explicit node tags with leading `!` or `!!`\n* Use of anchors with leading `&` and aliases with leading `*`\n* %YAML directives\n* %TAG directives\n\nIt is a fatal error if the document is not valid YAML.\n\nA Salad document must consist only of either a single root object or an\narray of objects.\n\n## Document context\n\n### Implied context\n\nThe implicit context consists of the vocabulary defined by the schema and\nthe base URI. By default, the base URI must be the URI that was used to\nload the document. It may be overridden by an explicit context.\n\n### Explicit context\n\nIf a document consists of a root object, this object may contain the\nfields `$base`, `$namespaces`, `$schemas`, and `$graph`:\n\n * `$base`: Must be a string. Set the base URI for the document used to\n resolve relative references.\n\n * `$namespaces`: Must be an object with strings as values. The keys of\n the object are namespace prefixes used in the document; the values of\n the object are the prefix expansions.\n\n * `$schemas`: Must be an array of strings. This field may list URI\n references to documents in RDF-XML format which will be queried for RDF\n schema data. 
The subjects and predicates described by the RDF schema\n may provide additional semantic context for the document, and may be\n used for validation of prefixed extension fields found in the document.\n\nOther directives beginning with `$` must be ignored.\n\n## Document graph\n\nIf a document consists of a single root object, this object may contain the\nfield `$graph`. This field must be an array of objects. If present, this\nfield holds the primary content of the document. A document that consists\nof array of objects at the root is an implicit graph.\n\n## Document metadata\n\nIf a document consists of a single root object, metadata about the\ndocument, such as authorship, may be declared in the root object.\n\n## Document schema\n\nDocument preprocessing, link validation and schema validation require a\ndocument schema. A schema may consist of:\n\n * At least one record definition object which defines valid fields that\n make up a record type. Record field definitions include the valid types\n that may be assigned to each field and annotations to indicate fields\n that represent identifiers and links, described below in \"Semantic\n Annotations\".\n\n * Any number of enumerated type objects which define a set of finite set of symbols that are\n valid value of the type.\n\n * Any number of documentation objects which allow in-line documentation of the schema.\n\nThe schema for defining a salad schema (the metaschema) is described in\ndetail in the [Schema](#Schema) section.\n\n## Record field annotations\n\nIn a document schema, record field definitions may include the field\n`jsonldPredicate`, which may be either a string or object. 
Implementations\nmust use the following document preprocessing of fields by the following\nrules:\n\n * If the value of `jsonldPredicate` is `@id`, the field is an identifier\n field.\n\n * If the value of `jsonldPredicate` is an object, and that\n object contains the field `_type` with the value `@id`, the field is a\n link field subject to [link validation](#Link_validation).\n\n * If the value of `jsonldPredicate` is an object which contains the\n field `_type` with the value `@vocab`, the field value is subject to\n [vocabulary resolution](#Vocabulary_resolution).\n\n## Document traversal\n\nTo perform document document preprocessing, link validation and schema\nvalidation, the document must be traversed starting from the fields or\narray items of the root object or array and recursively visiting each child\nitem which contains an object or arrays.\n\n## Short names\n\nThe \"short name\" of an fully qualified identifier is the portion of\nthe identifier following the final slash `/` of either the fragment\nidentifier following `#` or the path portion, if there is no fragment.\nSome examples:\n\n* the short name of `http://example.com/foo` is `foo`\n* the short name of `http://example.com/#bar` is `bar`\n* the short name of `http://example.com/foo/bar` is `bar`\n* the short name of `http://example.com/foo#bar` is `bar`\n* the short name of `http://example.com/#foo/bar` is `bar`\n* the short name of `http://example.com/foo#bar/baz` is `baz`\n\n## Inheritance and specialization\n\nA record definition may inherit from one or more record definitions\nwith the `extends` field. This copies the fields defined in the\nparent record(s) as the base for the new record. A record definition\nmay `specialize` type declarations of the fields inherited from the\nbase record. For each field inherited from the base record, any\ninstance of the type in `specializeFrom` is replaced with the type in\n`specializeTo`. 
The type in `specializeTo` should extend from the\ntype in `specializeFrom`.\n\nA record definition may be `abstract`. This means the record\ndefinition is not used for validation on its own, but may be extended\nby other definitions. If an abstract type appears in a field\ndefinition, it is logically replaced with a union of all concrete\nsubtypes of the abstract type. In other words, the field value does\nnot validate as the abstract type, but must validate as some concrete\ntype that inherits from the abstract type.\n\n# Document preprocessing\n\nAfter processing the explicit context (if any), document preprocessing\nbegins. Starting from the document root, object fields values or array\nitems which contain objects or arrays are recursively traversed\ndepth-first. For each visited object, field names, identifier fields, link\nfields, vocabulary fields, and `$import` and `$include` directives must be\nprocessed as described in this section. The order of traversal of child\nnodes within a parent node is undefined.\n", + "# Semantic Annotations for Linked Avro Data (SALAD)\n\nAuthor:\n\n* Peter Amstutz , Veritas Genetics\n\nContributors:\n\n* The developers of Apache Avro\n* The developers of JSON-LD\n* Neboj\u0161a Tijani\u0107 , Seven Bridges Genomics\n* Michael R. Crusoe, ELIXIR-DE\n\n# Abstract\n\nSalad is a schema language for describing structured linked data documents\nin JSON or YAML documents. A Salad schema provides rules for\npreprocessing, structural validation, and link checking for documents\ndescribed by a Salad schema. Salad builds on JSON-LD and the Apache Avro\ndata serialization system, and extends Avro with features for rich data\nmodeling such as inheritance, template specialization, object identifiers,\nand object references. 
Salad was developed to provide a bridge between the\nrecord oriented data modeling supported by Apache Avro and the Semantic\nWeb.\n\n# Status of This Document\n\nThis document is the product of the [Common Workflow Language working\ngroup](https://groups.google.com/forum/#!forum/common-workflow-language). The\nlatest version of this document is available in the \"schema_salad\" repository at\n\nhttps://github.com/common-workflow-language/schema_salad\n\nThe products of the CWL working group (including this document) are made available\nunder the terms of the Apache License, version 2.0.\n\n\n\n# Introduction\n\nThe JSON data model is an extremely popular way to represent structured\ndata. It is attractive because of its relative simplicity and is a\nnatural fit with the standard types of many programming languages.\nHowever, this simplicity means that basic JSON lacks expressive features\nuseful for working with complex data structures and document formats, such\nas schemas, object references, and namespaces.\n\nJSON-LD is a W3C standard providing a way to describe how to interpret a\nJSON document as Linked Data by means of a \"context\". JSON-LD provides a\npowerful solution for representing object references and namespaces in JSON\nbased on standard web URIs, but is not itself a schema language. Without a\nschema providing a well defined structure, it is difficult to process an\narbitrary JSON-LD document as idiomatic JSON because there are many ways to\nexpress the same data that are logically equivalent but structurally\ndistinct.\n\nSeveral schema languages exist for describing and validating JSON data,\nsuch as the Apache Avro data serialization system, however none understand\nlinked data. 
As a result, to fully take advantage of JSON-LD to build the\nnext generation of linked data applications, one must maintain separate\nJSON schema, JSON-LD context, RDF schema, and human documentation, despite\nsignificant overlap of content and obvious need for these documents to stay\nsynchronized.\n\nSchema Salad is designed to address this gap. It provides a schema\nlanguage and processing rules for describing structured JSON content\npermitting URI resolution and strict document validation. The schema\nlanguage supports linked data through annotations that describe the linked\ndata interpretation of the content, enables generation of JSON-LD context\nand RDF schema, and production of RDF triples by applying the JSON-LD\ncontext. The schema language also provides for robust support of inline\ndocumentation.\n\n## Introduction to v1.1\n\nThis is the third version of of the Schema Salad specification. It is\ndeveloped concurrently with v1.1 of the Common Workflow Language for use in\nspecifying the Common Workflow Language, however Schema Salad is intended to be\nuseful to a broader audience. Compared to the v1.0 schema salad\nspecification, the following changes have been made:\n\n* Support for `default` values on record fields to specify default values\n* Add subscoped fields (fields which introduce a new inner scope for identifiers)\n* Add the *inVocab* flag (default true) to indicate if a type is added to the vocabulary of well known terms or must be prefixed\n* Add *secondaryFilesDSL* micro DSL (domain specific language) to convert text strings to a secondaryFiles record type used in CWL\n* The `$mixin` feature has been removed from the specification, as it\n is poorly documented, not included in conformance testing,\n and not widely supported.\n\n## Introduction to v1.2\n\nThis is the fourth version of the Schema Salad specification. It was created to\nease the development of extensions to CWL v1.2. 
The only change is that\ninherited records can narrow the types of fields if those fields are re-specified\nwith a matching jsonldPredicate.\n\n## References to Other Specifications\n\n**Javascript Object Notation (JSON)**: http://json.org\n\n**JSON Linked Data (JSON-LD)**: http://json-ld.org\n\n**YAML**: https://yaml.org/spec/1.2/spec.html\n\n**Avro**: https://avro.apache.org/docs/current/spec.html\n\n**Uniform Resource Identifier (URI) Generic Syntax**: https://tools.ietf.org/html/rfc3986)\n\n**Resource Description Framework (RDF)**: http://www.w3.org/RDF/\n\n**UTF-8**: https://www.ietf.org/rfc/rfc2279.txt)\n\n## Scope\n\nThis document describes the syntax, data model, algorithms, and schema\nlanguage for working with Salad documents. It is not intended to document\na specific implementation of Salad, however it may serve as a reference for\nthe behavior of conforming implementations.\n\n## Terminology\n\nThe terminology used to describe Salad documents is defined in the Concepts\nsection of the specification. The terms defined in the following list are\nused in building those definitions and in describing the actions of an\nSalad implementation:\n\n**may**: Conforming Salad documents and Salad implementations are permitted but\nnot required to be interpreted as described.\n\n**must**: Conforming Salad documents and Salad implementations are required\nto be interpreted as described; otherwise they are in error.\n\n**error**: A violation of the rules of this specification; results are\nundefined. Conforming implementations may detect and report an error and may\nrecover from it.\n\n**fatal error**: A violation of the rules of this specification; results\nare undefined. 
Conforming implementations must not continue to process the\ndocument and may report an error.\n\n**at user option**: Conforming software may or must (depending on the modal verb in\nthe sentence) behave as described; if it does, it must provide users a means to\nenable or disable the behavior described.\n\n# Document model\n\n## Data concepts\n\nAn **object** is a data structure equivalent to the \"object\" type in JSON,\nconsisting of a unordered set of name/value pairs (referred to here as\n**fields**) and where the name is a string and the value is a string, number,\nboolean, array, or object.\n\nA **document** is a file containing a serialized object, or an array of\nobjects.\n\nA **document type** is a class of files that share a common structure and\nsemantics.\n\nA **document schema** is a formal description of the grammar of a document type.\n\nA **base URI** is a context-dependent URI used to resolve relative references.\n\nAn **identifier** is a URI that designates a single document or single\nobject within a document.\n\nA **vocabulary** is the set of symbolic field names and enumerated symbols defined\nby a document schema, where each term maps to absolute URI.\n\n## Syntax\n\nConforming Salad v1.1 documents are serialized and loaded using a\nsubset of YAML 1.2 syntax and UTF-8 text encoding. Salad documents\nare written using the [JSON-compatible subset of YAML described in\nsection 10.2](https://yaml.org/spec/1.2/spec.html#id2803231). 
The\nfollowing features of YAML must not be used in conforming Salad\ndocuments:\n\n* Use of explicit node tags with leading `!` or `!!`\n* Use of anchors with leading `&` and aliases with leading `*`\n* %YAML directives\n* %TAG directives\n\nIt is a fatal error if the document is not valid YAML.\n\nA Salad document must consist only of either a single root object or an\narray of objects.\n\n## Document context\n\n### Implied context\n\nThe implicit context consists of the vocabulary defined by the schema and\nthe base URI. By default, the base URI must be the URI that was used to\nload the document. It may be overridden by an explicit context.\n\n### Explicit context\n\nIf a document consists of a root object, this object may contain the\nfields `$base`, `$namespaces`, `$schemas`, and `$graph`:\n\n * `$base`: Must be a string. Set the base URI for the document used to\n resolve relative references.\n\n * `$namespaces`: Must be an object with strings as values. The keys of\n the object are namespace prefixes used in the document; the values of\n the object are the prefix expansions.\n\n * `$schemas`: Must be an array of strings. This field may list URI\n references to documents in RDF-XML format which will be queried for RDF\n schema data. The subjects and predicates described by the RDF schema\n may provide additional semantic context for the document, and may be\n used for validation of prefixed extension fields found in the document.\n\nOther directives beginning with `$` must be ignored.\n\n## Document graph\n\nIf a document consists of a single root object, this object may contain the\nfield `$graph`. This field must be an array of objects. If present, this\nfield holds the primary content of the document. 
A document that consists\nof array of objects at the root is an implicit graph.\n\n## Document metadata\n\nIf a document consists of a single root object, metadata about the\ndocument, such as authorship, may be declared in the root object.\n\n## Document schema\n\nDocument preprocessing, link validation and schema validation require a\ndocument schema. A schema may consist of:\n\n * At least one record definition object which defines valid fields that\n make up a record type. Record field definitions include the valid types\n that may be assigned to each field and annotations to indicate fields\n that represent identifiers and links, described below in \"Semantic\n Annotations\".\n\n * Any number of enumerated type objects which define a set of finite set of symbols that are\n valid value of the type.\n\n * Any number of documentation objects which allow in-line documentation of the schema.\n\nThe schema for defining a salad schema (the metaschema) is described in\ndetail in the [Schema](#Schema) section.\n\n## Record field annotations\n\nIn a document schema, record field definitions may include the field\n`jsonldPredicate`, which may be either a string or object. 
Implementations\nmust use the following document preprocessing of fields by the following\nrules:\n\n * If the value of `jsonldPredicate` is `@id`, the field is an identifier\n field.\n\n * If the value of `jsonldPredicate` is an object, and that\n object contains the field `_type` with the value `@id`, the field is a\n link field subject to [link validation](#Link_validation).\n\n * If the value of `jsonldPredicate` is an object which contains the\n field `_type` with the value `@vocab`, the field value is subject to\n [vocabulary resolution](#Vocabulary_resolution).\n\n## Document traversal\n\nTo perform document document preprocessing, link validation and schema\nvalidation, the document must be traversed starting from the fields or\narray items of the root object or array and recursively visiting each child\nitem which contains an object or arrays.\n\n## Short names\n\nThe \"short name\" of an fully qualified identifier is the portion of\nthe identifier following the final slash `/` of either the fragment\nidentifier following `#` or the path portion, if there is no fragment.\nSome examples:\n\n* the short name of `http://example.com/foo` is `foo`\n* the short name of `http://example.com/#bar` is `bar`\n* the short name of `http://example.com/foo/bar` is `bar`\n* the short name of `http://example.com/foo#bar` is `bar`\n* the short name of `http://example.com/#foo/bar` is `bar`\n* the short name of `http://example.com/foo#bar/baz` is `baz`\n\n## Inheritance and specialization\n\nA record definition may inherit from one or more record definitions\nwith the `extends` field. This copies the fields defined in the\nparent record(s) as the base for the new record. A record definition\nmay `specialize` type declarations of the fields inherited from the\nbase record. For each field inherited from the base record, any\ninstance of the type in `specializeFrom` is replaced with the type in\n`specializeTo`. 
The type in `specializeTo` should extend from the\ntype in `specializeFrom`.\n\nA record definition may be `abstract`. This means the record\ndefinition is not used for validation on its own, but may be extended\nby other definitions. If an abstract type appears in a field\ndefinition, it is logically replaced with a union of all concrete\nsubtypes of the abstract type. In other words, the field value does\nnot validate as the abstract type, but must validate as some concrete\ntype that inherits from the abstract type.\n\n# Document preprocessing\n\nAfter processing the explicit context (if any), document preprocessing\nbegins. Starting from the document root, object field values or array\nitems which contain objects or arrays are recursively traversed\ndepth-first. For each visited object, field names, identifier fields, link\nfields, vocabulary fields, and `$import` and `$include` directives must be\nprocessed as described in this section. The order of traversal of child\nnodes within a parent node is undefined.\n", "## Field name resolution\n\nThe document schema declares the vocabulary of known field names. During\npreprocessing traversal, field names in the document which are not part of\nthe schema vocabulary must be resolved to absolute URIs. Under \"strict\"\nvalidation, it is an error for a document to include fields which are not\npart of the vocabulary and not resolvable to absolute URIs. 
Field names\nwhich are not part of the vocabulary are resolved using the following\nrules:\n\n* If a field name URI begins with a namespace prefix declared in the\ndocument context (`@context`) followed by a colon `:`, the prefix and\ncolon must be replaced by the namespace declared in `@context`.\n\n* If there is a vocabulary term which maps to the URI of a resolved\nfield, the field name must be replaced with the vocabulary term.\n\n* If a field name URI is an absolute URI consisting of a scheme and path\nand is not part of the vocabulary, no processing occurs.\n\nField name resolution is not relative. It must not be affected by the\nbase URI.\n\n### Field name resolution example\n\nGiven the following schema:\n\n```\n", "{\n \"$namespaces\": {\n \"acid\": \"http://example.com/acid#\"\n },\n \"$graph\": [{\n \"name\": \"ExampleType\",\n \"type\": \"record\",\n \"documentRoot\": true,\n \"fields\": [{\n \"name\": \"base\",\n \"type\": \"string\",\n \"jsonldPredicate\": \"http://example.com/base\"\n }]\n }]\n}\n", "```\n\nProcess the following example:\n\n```\n", @@ -565,7 +565,7 @@ "null", "boolean" ], - "doc": "If true, this record is abstract and may be used as a base for other\nrecords, but is not valid on its own.\n" + "doc": "If true, this record is abstract and may be used as a base for other\nrecords, but is not valid on its own. 
Inherited fields may be\nre-specified to narrow their type.\n" }, { "name": "https://w3id.org/cwl/salad#SaladRecordSchema/extends", @@ -582,7 +582,7 @@ "_type": "@id", "refScope": 1 }, - "doc": "Indicates that this record inherits fields from one or more base records.\n" + "doc": "Indicates that this record inherits fields from one or more base records.\nInherited fields may be re-specified to narrow their type.\n" }, { "name": "https://w3id.org/cwl/salad#SaladRecordSchema/specialize", @@ -662,4 +662,4 @@ } ] } -] +] \ No newline at end of file diff --git a/schema_salad/tests/test_schema/avro_naming_base.yml b/schema_salad/tests/test_schema/avro_naming_base.yml index faa50edaf..d38b00ee6 100644 --- a/schema_salad/tests/test_schema/avro_naming_base.yml +++ b/schema_salad/tests/test_schema/avro_naming_base.yml @@ -20,3 +20,7 @@ $graph: doc: | This is an arbitrary abstract thing from a base schema that might be extended. + fields: + override_me: + type: [ string, int ] + jsonldPredicate: "bs:override_me" diff --git a/schema_salad/tests/test_schema/avro_subtype.yml b/schema_salad/tests/test_schema/avro_subtype.yml new file mode 100644 index 000000000..1678bc943 --- /dev/null +++ b/schema_salad/tests/test_schema/avro_subtype.yml @@ -0,0 +1,25 @@ +$base: "https://example.com/derived_schema#" + +$namespaces: + bs: "https://example.com/base_schema#" + +$graph: + +- $import: avro_naming_base.yml + +- name: ExtendedThing + type: record + doc: | + A refinement of the base schema's arbitrary abstract thing, + that allows one of the base schema's types as a field. 
+ inVocab: false + extends: bs:AbstractThing + fields: + field_one: + type: + type: array + items: [string, bs:RealThing] + override_me: + type: string + jsonldPredicate: "bs:override_me" + diff --git a/schema_salad/tests/test_schema/avro_subtype_bad.yml b/schema_salad/tests/test_schema/avro_subtype_bad.yml new file mode 100644 index 000000000..e29dbd757 --- /dev/null +++ b/schema_salad/tests/test_schema/avro_subtype_bad.yml @@ -0,0 +1,25 @@ +$base: "https://example.com/derived_schema#" + +$namespaces: + bs: "https://example.com/base_schema#" + +$graph: + +- $import: avro_naming_base.yml + +- name: ExtendedThing + type: record + doc: | + A refinement of the base schema's arbitrary abstract thing, + that allows one of the base schema's types as a field. + inVocab: false + extends: bs:AbstractThing + fields: + field_one: + type: + type: array + items: [string, bs:RealThing] + override_me: + type: Any + jsonldPredicate: bs:override_me + diff --git a/schema_salad/tests/test_subtypes.py b/schema_salad/tests/test_subtypes.py new file mode 100644 index 000000000..1e0c56d76 --- /dev/null +++ b/schema_salad/tests/test_subtypes.py @@ -0,0 +1,111 @@ +"""Confirm subtypes.""" +import pytest + +from schema_salad.avro import schema +from schema_salad.avro.schema import Names, SchemaParseException +from schema_salad.schema import load_schema + +from .util import get_data + +types = [ + (["int", "float", "double"], "int", True), + (["int", "float", "double"], ["int"], True), + (["int", "float", "double"], ["int", "float"], True), + (["int", "float", "double"], ["int", "float", "File"], False), + ({"type": "array", "items": ["int", "float", "double"]}, ["int", "float"], False), + ( + {"type": "array", "items": ["int", "float", "double"]}, + {"type": "array", "items": ["int", "float"]}, + True, + ), + ("Any", "int", True), + ("Any", ["int", "null"], False), + ("Any", ["int"], True), + ("Any", None, False), + ("Any", ["null"], False), + ("Any", "null", False), + ( + "Any", + {"type": 
"record", "fields": [{"name": "species", "type": "string"}]}, + True, + ), + ("Any", {"type": "enum", "symbols": ["homo_sapiens"]}, True), + ( + {"type": "enum", "symbols": ["homo_sapiens", "mus_musculus"]}, + {"type": "enum", "symbols": ["homo_sapiens"]}, + True, + ), + ( + {"type": "enum", "symbols": ["homo_sapiens", "mus_musculus"]}, + {"type": "enum", "symbols": ["homo_sapiens", "drosophila_melanogaster"]}, + False, + ), + ( + {"type": "record", "fields": [{"name": "species", "type": "string"}]}, + {"type": "enum", "symbols": ["homo_sapiens"]}, + False, + ), + ( + { + "type": "record", + "fields": [ + {"name": "species", "type": "string"}, + {"name": "id", "type": "int"}, + ], + }, + {"type": "record", "fields": [{"name": "species", "type": "string"}]}, + True, + ), + ( + { + "type": "record", + "fields": [ + {"name": "species", "type": "string"}, + {"name": "id", "type": "int"}, + ], + }, + {"type": "record", "fields": [{"name": "species", "type": "int"}]}, + False, + ), + ( + {"type": "record", "fields": [{"name": "species", "type": "string"}]}, + { + "type": "record", + "fields": [ + {"name": "species", "type": "string"}, + {"name": "id", "type": "int"}, + ], + }, + False, + ), +] + + +@pytest.mark.parametrize("old,new,result", types) +def test_subtypes(old: schema.PropType, new: schema.PropType, result: bool) -> None: + """Test is_subtype() function.""" + assert schema.is_subtype(old, new) == result + + +def test_avro_loading_subtype() -> None: + """Confirm conversion of SALAD style names to avro when overriding.""" + path = get_data("tests/test_schema/avro_subtype.yml") + assert path + document_loader, avsc_names, schema_metadata, metaschema_loader = load_schema(path) + assert isinstance(avsc_names, Names) + assert avsc_names.get_name("com.example.derived_schema.ExtendedThing", None) + + +def test_avro_loading_subtype_bad() -> None: + """Confirm subtype error when overriding incorrectly.""" + path = get_data("tests/test_schema/avro_subtype_bad.yml") + 
assert path + target_error = ( + r"Union\s+item\s+must\s+be\s+a\s+valid\s+Avro\s+schema:\s+" + r"Field\s+name\s+override_me\s+already\s+in\s+use\s+with\s+incompatible\s+" + r"type\.\s+org\.w3id\.cwl\.salad\.Any\s+vs\s+\['string',\s+'int'\]\." + ) + with pytest.raises(SchemaParseException, match=target_error): + document_loader, avsc_names, schema_metadata, metaschema_loader = load_schema( + path + ) diff --git a/setup.py b/setup.py index d18989a32..c356ddc07 100644 --- a/setup.py +++ b/setup.py @@ -90,7 +90,7 @@ setup( name="schema-salad", - version="8.2", # update the VERSION prefix in the Makefile as well 🙂 + version="8.3", # update the VERSION prefix in the Makefile as well 🙂 description="Schema Annotations for Linked Avro Data (SALAD)", long_description=open(README).read(), long_description_content_type="text/x-rst",