Skip to content

Commit f3e52e3

Browse files
authored
Merge pull request #1275 from Sage-Bionetworks/SYNPY-1682
[SYNPY-1682] Redo how list items become arrays in JSON Schemas
2 parents d3256f6 + 249a3b2 commit f3e52e3

File tree

10 files changed

+565
-532
lines changed

10 files changed

+565
-532
lines changed

synapseclient/extensions/curator/schema_generation.py

Lines changed: 151 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -88,25 +88,53 @@ def camelize(string, uppercase_first_letter=True):
8888
AllOf = dict[str, Any]
8989

9090

91+
class ColumnType(Enum):
92+
"""Names of values allowed in the columnType column in datamodel csvs."""
93+
94+
95+
class AtomicColumnType(ColumnType):
96+
"""Column Types that are not lists"""
97+
98+
STRING = "string"
99+
NUMBER = "number"
100+
INTEGER = "integer"
101+
BOOLEAN = "boolean"
102+
103+
104+
class ListColumnType(ColumnType):
105+
"""Column Types that are lists"""
106+
107+
STRING_LIST = "string_list"
108+
INTEGER_LIST = "integer_list"
109+
BOOLEAN_LIST = "boolean_list"
110+
111+
112+
ALL_COLUMN_TYPE_VALUES = [member.value for member in AtomicColumnType] + [
113+
member.value for member in ListColumnType
114+
]
115+
116+
117+
# Translates list types to their atomic type
118+
LIST_TYPE_DICT = {
119+
ListColumnType.STRING_LIST: AtomicColumnType.STRING,
120+
ListColumnType.INTEGER_LIST: AtomicColumnType.INTEGER,
121+
ListColumnType.BOOLEAN_LIST: AtomicColumnType.BOOLEAN,
122+
}
123+
124+
91125
class ValidationRuleName(Enum):
92126
"""Names of validation rules that are used to create JSON Schema"""
93127

128+
# list validation rule is been deprecated for use in deciding type
129+
# TODO: remove list:
130+
# https://sagebionetworks.jira.com/browse/SYNPY-1692
94131
LIST = "list"
95132
DATE = "date"
96133
URL = "url"
97134
REGEX = "regex"
98135
IN_RANGE = "inRange"
99136

100137

101-
class JSONSchemaType(Enum):
102-
"""This enum is the currently supported JSON Schema types"""
103-
104-
STRING = "string"
105-
NUMBER = "number"
106-
INTEGER = "integer"
107-
BOOLEAN = "boolean"
108-
109-
110138
class RegexModule(Enum):
111139
"""This enum are allowed modules for the regex validation rule"""
112140

@@ -131,6 +159,9 @@ class ValidationRule:
131159

132160

133161
_VALIDATION_RULES = {
162+
# list validation rule is been deprecated for use in deciding type
163+
# TODO: remove list:
164+
# https://sagebionetworks.jira.com/browse/SYNPY-1692
134165
"list": ValidationRule(
135166
name=ValidationRuleName.LIST,
136167
incompatible_rules=[],
@@ -1645,22 +1676,37 @@ def is_class_in_schema(self, node_label: str) -> bool:
16451676

16461677
def get_node_column_type(
16471678
self, node_label: Optional[str] = None, node_display_name: Optional[str] = None
1648-
) -> Optional[JSONSchemaType]:
1679+
) -> Optional[ColumnType]:
16491680
"""Gets the column type of the node
16501681
16511682
Args:
16521683
node_label: The label of the node to get the type from
16531684
node_display_name: The display name of the node to get the type from
16541685
1686+
Raises:
1687+
ValueError: If the value from the node is not allowed
1688+
16551689
Returns:
16561690
The column type of the node if it has one, otherwise None
16571691
"""
16581692
node_label = self._get_node_label(node_label, node_display_name)
16591693
rel_node_label = self.dmr.get_relationship_value("columnType", "node_label")
1660-
type_string = self.graph.nodes[node_label][rel_node_label]
1661-
if type_string is None:
1662-
return type_string
1663-
return JSONSchemaType(type_string)
1694+
column_type_value = self.graph.nodes[node_label][rel_node_label]
1695+
if column_type_value is None:
1696+
return column_type_value
1697+
column_type_string = str(column_type_value).lower()
1698+
try:
1699+
column_type = AtomicColumnType(column_type_string)
1700+
except ValueError:
1701+
try:
1702+
column_type = ListColumnType(column_type_string)
1703+
except ValueError:
1704+
msg = (
1705+
f"Node: '{node_label}' had illegal column type value: '{column_type_string}'. "
1706+
f"Allowed values are: [{ALL_COLUMN_TYPE_VALUES}]"
1707+
)
1708+
raise ValueError(msg)
1709+
return column_type
16641710

16651711
def _get_node_label(
16661712
self, node_label: Optional[str] = None, node_display_name: Optional[str] = None
@@ -2778,7 +2824,7 @@ def define_data_model_relationships(self) -> dict:
27782824
"required_header": False,
27792825
"edge_rel": False,
27802826
"node_attr_dict": {"default": None},
2781-
"allowed_values": [enum.value for enum in JSONSchemaType],
2827+
"allowed_values": ALL_COLUMN_TYPE_VALUES,
27822828
},
27832829
}
27842830

@@ -4243,6 +4289,9 @@ def _get_rules_by_names(names: list[str]) -> list[ValidationRule]:
42434289

42444290
def _get_validation_rule_based_fields(
42454291
validation_rules: list[str],
4292+
explicit_is_array: Optional[bool],
4293+
name: str,
4294+
column_type: Optional[ColumnType],
42464295
logger: Logger,
42474296
) -> tuple[
42484297
bool,
@@ -4263,7 +4312,13 @@ def _get_validation_rule_based_fields(
42634312
42644313
Arguments:
42654314
validation_rules: A list of input validation rules
4315+
explicit_is_array:
4316+
True: If the type is set explicitly with a list-type
4317+
False: If the type is set explicitly with a non list-type
4318+
None: If the type was not set explicitly
42664319
name: The name of the node the validation rules belong to
4320+
column_type: The type of this node if set explicitly
4321+
logger: A logger for handling warnings
42674322
42684323
Raises:
42694324
ValueError: When an explicit JSON Schema type is given, but the implicit type is different
@@ -4284,6 +4339,10 @@ def _get_validation_rule_based_fields(
42844339
js_maximum = None
42854340
js_pattern = None
42864341

4342+
# If an array type is explicitly set then is_array can be set to True
4343+
if explicit_is_array is not None:
4344+
js_is_array = explicit_is_array
4345+
42874346
if validation_rules:
42884347
validation_rules = filter_unused_inputted_rules(
42894348
inputted_rules=validation_rules, logger=logger
@@ -4295,7 +4354,49 @@ def _get_validation_rule_based_fields(
42954354
validation_rules
42964355
)
42974356

4298-
js_is_array = ValidationRuleName.LIST in validation_rule_names
4357+
# list validation rule is been deprecated for use in deciding type
4358+
# TODO: Sunset both if blocks below: https://sagebionetworks.jira.com/browse/SYNPY-1692
4359+
4360+
implicit_is_array = ValidationRuleName.LIST in validation_rule_names
4361+
if explicit_is_array is None:
4362+
# If an array type is not explicitly set then is_array can be set by using
4363+
# whether or not a list validation rule is present.
4364+
# Since this is deprecated behavior a warning should be given.
4365+
js_is_array = implicit_is_array
4366+
if implicit_is_array:
4367+
msg = (
4368+
f"A list validation rule is set for property: {name}, "
4369+
f"but columnType is not a list type (current value: {column_type}). "
4370+
"To properly define an array property, set columnType to a list type "
4371+
"(e.g., 'string_list', 'integer_list', 'boolean_list') "
4372+
"instead of using the list validation rule."
4373+
"This behavior is deprecated and list validation rules will no longer "
4374+
"be used in the future.."
4375+
)
4376+
logger.warning(msg)
4377+
else:
4378+
msg = (
4379+
f"No columnType is set for property: {name}. "
4380+
"Please set the columnType."
4381+
)
4382+
logger.warning(msg)
4383+
if explicit_is_array != implicit_is_array:
4384+
# If an array type is explicitly but it is the opposite of whether or not a
4385+
# list validation rule is present, then the user should be warned of the mismatch.
4386+
if explicit_is_array:
4387+
msg = (
4388+
f"For property: {name}, the columnType is a list-type: {column_type} "
4389+
"but no list validation rule is present. "
4390+
"The columnType will be used to set type."
4391+
)
4392+
logger.warning(msg)
4393+
else:
4394+
msg = (
4395+
f"For property: {name}, the columnType is not a list-type: {column_type} "
4396+
"but a list validation rule is present. "
4397+
"The columnType will be used to set type."
4398+
)
4399+
logger.warning(msg)
42994400

43004401
if ValidationRuleName.URL in validation_rule_names:
43014402
js_format = JSONSchemaFormat.URI
@@ -4316,6 +4417,7 @@ def _get_validation_rule_based_fields(
43164417
if regex_rule:
43174418
js_pattern = get_regex_parameters_from_inputted_rule(regex_rule)
43184419

4420+
print(js_is_array)
43194421
return (
43204422
js_is_array,
43214423
js_format,
@@ -4360,7 +4462,7 @@ class TraversalNode: # pylint: disable=too-many-instance-attributes
43604462
dependencies: list[str] = field(init=False)
43614463
description: str = field(init=False)
43624464
is_array: bool = field(init=False)
4363-
type: Optional[JSONSchemaType] = field(init=False)
4465+
type: Optional[ColumnType] = field(init=False)
43644466
format: Optional[JSONSchemaFormat] = field(init=False)
43654467
minimum: Optional[float] = field(init=False)
43664468
maximum: Optional[float] = field(init=False)
@@ -4391,8 +4493,22 @@ def __post_init__(self) -> None:
43914493
self.description = self.dmge.get_node_comment(
43924494
node_display_name=self.display_name
43934495
)
4394-
self.type = self.dmge.get_node_column_type(node_display_name=self.display_name)
4496+
column_type = self.dmge.get_node_column_type(
4497+
node_display_name=self.display_name
4498+
)
43954499

4500+
# list validation rule is been deprecated for use in deciding type
4501+
# TODO: set self.is_array here instead of return from _get_validation_rule_based_fields
4502+
# https://sagebionetworks.jira.com/browse/SYNPY-1692
4503+
if isinstance(column_type, AtomicColumnType):
4504+
self.type = column_type
4505+
explicit_is_array = False
4506+
elif isinstance(column_type, ListColumnType):
4507+
self.type = LIST_TYPE_DICT[column_type]
4508+
explicit_is_array = True
4509+
else:
4510+
self.type = None
4511+
explicit_is_array = None
43964512
(
43974513
self.is_array,
43984514
self.format,
@@ -4401,6 +4517,9 @@ def __post_init__(self) -> None:
44014517
self.pattern,
44024518
) = _get_validation_rule_based_fields(
44034519
validation_rules=validation_rules,
4520+
explicit_is_array=explicit_is_array,
4521+
name=self.name,
4522+
column_type=self.type,
44044523
logger=self.logger,
44054524
)
44064525

@@ -4784,13 +4903,9 @@ def _create_enum_array_property(
47844903
47854904
Example:
47864905
{
4787-
"oneOf": [
4788-
{
4789-
"type": "array",
4790-
"title": "array",
4791-
"items": {"enum": ["enum1"]},
4792-
}
4793-
],
4906+
"type": "array",
4907+
"title": "array",
4908+
"items": {"enum": ["enum1"], "type": "string"},
47944909
},
47954910
47964911
@@ -4806,20 +4921,13 @@ def _create_enum_array_property(
48064921
valid_values = node.valid_value_display_names
48074922
else:
48084923
valid_values = node.valid_values
4809-
items: Items = {"enum": valid_values}
4810-
types = [
4811-
{
4812-
"type": "array",
4813-
"title": "array",
4814-
"items": items,
4815-
}
4816-
]
4817-
4818-
if not node.is_required:
4819-
types += [{"type": "null", "title": "null"}]
4820-
4821-
enum_array_property: Property = {"oneOf": types}
4822-
return enum_array_property # type: ignore
4924+
items: Items = {"enum": valid_values, "type": "string"}
4925+
array_property = {
4926+
"type": "array",
4927+
"title": "array",
4928+
"items": items,
4929+
}
4930+
return array_property
48234931

48244932

48254933
def _create_array_property(node: Node) -> Property:
@@ -4828,13 +4936,9 @@ def _create_array_property(node: Node) -> Property:
48284936
48294937
Example:
48304938
{
4831-
"oneOf": [
4832-
{
4833-
"type": "array",
4834-
"title": "array",
4835-
"items": {"type": "number", "minimum": 0, "maximum": 1},
4836-
}
4837-
],
4939+
"type": "array",
4940+
"title": "array",
4941+
"items": {"type": "integer", "minimum": 0, "maximum": 1},
48384942
}
48394943
48404944
Arguments:
@@ -4850,17 +4954,11 @@ def _create_array_property(node: Node) -> Property:
48504954
_set_type_specific_keywords(items, node)
48514955

48524956
array_type_dict: TypeDict = {"type": "array", "title": "array"}
4853-
null_type_dict: TypeDict = {"type": "null", "title": "null"}
48544957

48554958
if items:
48564959
array_type_dict["items"] = items
48574960

4858-
types = [array_type_dict]
4859-
if not node.is_required:
4860-
types.append(null_type_dict)
4861-
4862-
array_property: Property = {"oneOf": types}
4863-
return array_property
4961+
return array_type_dict
48644962

48654963

48664964
def _create_enum_property(

tests/unit/synapseclient/extensions/conftest.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
DataModelGraph,
99
DataModelGraphExplorer,
1010
DataModelParser,
11+
DataModelRelationships,
1112
)
1213

1314
TESTS_DIR = os.path.dirname(os.path.abspath(__file__))
@@ -63,3 +64,9 @@ def DMGE(helpers: Helpers) -> DataModelGraphExplorer:
6364
path="example.model.column_type_component.csv"
6465
)
6566
return dmge
67+
68+
69+
@pytest.fixture(name="dmr")
70+
def fixture_dmr():
71+
"Returns a DataModelRelationships instance"
72+
return DataModelRelationships()

tests/unit/synapseclient/extensions/schema_files/example.model.column_type_component.csv

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -53,23 +53,24 @@ MockRDB,,,"Component, MockRDB_id, SourceManifest",,FALSE,DataType,,,,
5353
MockRDB_id,,,,,TRUE,DataProperty,,,int,
5454
SourceManifest,,,,,TRUE,DataProperty,,,,
5555
MockFilename,,,"Component, Filename",,FALSE,DataType,,,,
56-
JSONSchemaComponent,Component to hold attributes for testing JSON Schemas,,"Component, No Rules, No Rules Not Required, String, String Not Required, Enum, Enum Not Required, Date, URL, InRange, Regex, List, List Not Required, List Enum, List Enum Not Required, List String, List InRange",,FALSE,DataType,,,,
56+
JSONSchemaComponent,Component to hold attributes for testing JSON Schemas,,"Component, No Rules, No Rules Not Required, String, String Not Required, Enum, Enum Not Required, Date, URL, InRange, Regex, List, List Not Required, List Enum, List Enum Not Required, List Boolean, List, Integer, List InRange",,FALSE,DataType,,,,
5757
No Rules,,,,,TRUE,DataProperty,,,,
5858
No Rules Not Required,,,,,FALSE,DataProperty,,,,
59-
String,,,,,TRUE,DataProperty,,,str error,string
60-
String Not Required,,,,,FALSE,DataProperty,,,str error,string
61-
Enum,,"ab, cd, ef, gh",,,TRUE,DataProperty,,,,
62-
Enum Not Required,,"ab, cd, ef, gh",,,FALSE,DataProperty,,,,
59+
String,,,,,TRUE,DataProperty,,,,string
60+
String Not Required,,,,,FALSE,DataProperty,,,,string
61+
Enum,,"ab, cd, ef, gh",,,TRUE,DataProperty,,,,string
62+
Enum Not Required,,"ab, cd, ef, gh",,,FALSE,DataProperty,,,,string
6363
Date,,,,,TRUE,DataProperty,,,date,string
6464
URL,,,,,TRUE,DataProperty,,,url,string
6565
InRange,,,,,TRUE,DataProperty,,,inRange 50 100,number
6666
Regex,,,,,TRUE,DataProperty,,,regex search [a-f],string
67-
List,,,,,TRUE,DataProperty,,,list,
68-
List Not Required,,,,,FALSE,DataProperty,,,list,
69-
List Enum,,"ab, cd, ef, gh",,,TRUE,DataProperty,,,list,
70-
List Enum Not Required,,"ab, cd, ef, gh",,,FALSE,DataProperty,,,list,
71-
List String,,,,,TRUE,DataProperty,,,list::str,string
72-
List InRange,,,,,TRUE,DataProperty,,,list::inRange 50 100,number
67+
List,,,,,TRUE,DataProperty,,,,string_list
68+
List Not Required,,,,,FALSE,DataProperty,,,,string_list
69+
List Enum,,"ab, cd, ef, gh",,,TRUE,DataProperty,,,,string_list
70+
List Enum Not Required,,"ab, cd, ef, gh",,,FALSE,DataProperty,,,,string_list
71+
List Boolean,,,,,TRUE,DataProperty,,,,boolean_list
72+
List Integer,,,,,TRUE,DataProperty,,,,integer_list
73+
List InRange,,,,,TRUE,DataProperty,,,inRange 50 100,integer_list
7374
TypeDefinitionComponent,Component to check type specification,,"Component, String type, String type caps, Int type, Int type caps, Num type, Num type caps, Nan type, Missing type, Boolean type, Boolean type caps",,FALSE,DataType,,,,
7475
String type,,,,,TRUE,DataProperty,,,,string
7576
String type caps,,,,,TRUE,DataProperty,,,,STRING

0 commit comments

Comments
 (0)