Skip to content

Commit a557de9

Browse files
authored
Merge pull request #1276 from Sage-Bionetworks/SYNPY-1679
[SYNPY-1679] Add format column to csv data model.
2 parents ee64f1d + e7ffba2 commit a557de9

File tree

13 files changed

+854
-380
lines changed

13 files changed

+854
-380
lines changed

synapseclient/extensions/curator/schema_generation.py

Lines changed: 172 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -122,13 +122,37 @@ class ListColumnType(ColumnType):
122122
}
123123

124124

125+
class JSONSchemaFormat(Enum):
    """String formats that can be assigned to a data model attribute.

    Allowed formats by the JSON Schema validator used by Synapse:
    https://github.com/everit-org/json-schema#format-validators
    For descriptions see:
    https://json-schema.org/understanding-json-schema/reference/type#format

    Each member's value is the lowercase format name. The "format"
    relationship builds its allowed values from these member values, and
    format strings read from a data model are lower-cased before being
    converted into a member.
    """

    # NOTE: code that iterates this enum (allowed-value lists, error messages)
    # sees members in definition order, so append new members rather than
    # reordering existing ones.
    DATE_TIME = "date-time"
    EMAIL = "email"
    HOSTNAME = "hostname"
    IPV4 = "ipv4"
    IPV6 = "ipv6"
    URI = "uri"
    URI_REFERENCE = "uri-reference"
    URI_TEMPLATE = "uri-template"
    JSON_POINTER = "json-pointer"
    DATE = "date"
    TIME = "time"
    REGEX = "regex"
    RELATIVE_JSON_POINTER = "relative-json-pointer"
144+
145+
125146
class ValidationRuleName(Enum):
126147
"""Names of validation rules that are used to create JSON Schema"""
127148

128149
# the list validation rule has been deprecated for use in deciding type
129150
# TODO: remove list:
130151
# https://sagebionetworks.jira.com/browse/SYNPY-1692
131152
LIST = "list"
153+
# url and date rules are deprecated as a way of setting the format keyword
154+
# TODO: remove url and date
155+
# https://sagebionetworks.jira.com/browse/SYNPY-1685
132156
DATE = "date"
133157
URL = "url"
134158
REGEX = "regex"
@@ -166,6 +190,9 @@ class ValidationRule:
166190
name=ValidationRuleName.LIST,
167191
incompatible_rules=[],
168192
),
193+
# url and date rules are deprecated for adding format keyword
194+
# TODO: remove url and date
195+
# https://sagebionetworks.jira.com/browse/SYNPY-1685
169196
"date": ValidationRule(
170197
name=ValidationRuleName.DATE,
171198
incompatible_rules=[
@@ -215,13 +242,6 @@ def __post_init__(self) -> None:
215242
self.display_name = str(self.fields["displayName"])
216243

217244

218-
class JSONSchemaFormat(Enum):
219-
"""This enum is the currently supported JSON Schema formats"""
220-
221-
DATE = "date"
222-
URI = "uri"
223-
224-
225245
def load_json(file_path: str) -> Any:
226246
"""Load json document from file path or url
227247
@@ -635,7 +655,10 @@ def gather_csv_attributes_relationships(
635655

636656
# get attributes from Attribute column
637657
attributes = model_df.to_dict("records")
658+
659+
# Check for presence of optional columns
638660
model_includes_column_type = "columnType" in model_df.columns
661+
model_includes_format = "Format" in model_df.columns
639662

640663
# Build attribute/relationship dictionary
641664
relationship_types = self.required_headers
@@ -659,6 +682,9 @@ def gather_csv_attributes_relationships(
659682
attr_rel_dictionary[attribute_name]["Relationships"].update(
660683
column_type_dict
661684
)
685+
if model_includes_format:
686+
format_dict = self.parse_format(attr)
687+
attr_rel_dictionary[attribute_name]["Relationships"].update(format_dict)
662688
return attr_rel_dictionary
663689

664690
def parse_column_type(self, attr: dict) -> dict:
@@ -691,6 +717,34 @@ def parse_column_type(self, attr: dict) -> dict:
691717

692718
return {"ColumnType": column_type}
693719

720+
def parse_format(self, attribute_dict: dict) -> dict[str, str]:
    """Finds the format value if it exists and returns it as a dictionary.

    The raw value is stripped of surrounding whitespace and lower-cased
    before it is checked against the allowed values of the "format"
    relationship, so the check is case-insensitive.

    Args:
        attribute_dict: The attribute dictionary (presumably one record of
            the CSV data model; confirm against the caller).

    Returns:
        A dictionary containing the normalized format value under the
        "Format" key if it exists, else an empty dict.

    Raises:
        Whatever ``check_allowed_values`` raises when the normalized value
        is not an allowed format.
    """
    from pandas import isna

    format_value = attribute_dict.get("Format")

    # A missing key (None) or an empty cell (NaN) means no format was given,
    # so nothing is added for this attribute.
    if isna(format_value):
        return {}

    format_string = str(format_value).strip().lower()

    check_allowed_values(
        self.dmr,
        # Identify the entry by its attribute name rather than by the raw
        # format value (which is already reported through ``value``). Fall
        # back to the raw value when the record has no "Attribute" key.
        entry_id=attribute_dict.get("Attribute", format_value),
        value=format_string,
        relationship="format",
    )

    return {"Format": format_string}
747+
694748
def parse_csv_model(
695749
self,
696750
path_to_data_model: str,
@@ -1708,6 +1762,37 @@ def get_node_column_type(
17081762
raise ValueError(msg)
17091763
return column_type
17101764

1765+
def get_node_format(
    self, node_label: Optional[str] = None, node_display_name: Optional[str] = None
) -> Optional[JSONSchemaFormat]:
    """Gets the format of the node

    The stored value is stripped of surrounding whitespace and lower-cased
    before it is converted, so the lookup is case-insensitive.

    Args:
        node_label: The label of the node to get the format from
        node_display_name: The display name of the node to get the format from

    Raises:
        ValueError: If the value from the node is not allowed

    Returns:
        The format of the node if it has one, otherwise None
    """
    node_label = self._get_node_label(node_label, node_display_name)
    rel_node_label = self.dmr.get_relationship_value("format", "node_label")
    format_value = self.graph.nodes[node_label][rel_node_label]
    if format_value is None:
        return None

    format_string = str(format_value).strip().lower()
    try:
        return JSONSchemaFormat(format_string)
    except ValueError as exc:
        # Interpolate the list directly: its repr already carries brackets,
        # so wrapping it in literal "[...]" would print "[[...]]".
        allowed_values = [member.value for member in JSONSchemaFormat]
        msg = (
            f"Node: '{node_label}' had illegal format value: '{format_value}'. "
            f"Allowed values are: {allowed_values}"
        )
        raise ValueError(msg) from exc
1795+
17111796
def _get_node_label(
17121797
self, node_label: Optional[str] = None, node_display_name: Optional[str] = None
17131798
) -> str:
@@ -2826,6 +2911,16 @@ def define_data_model_relationships(self) -> dict:
28262911
"node_attr_dict": {"default": None},
28272912
"allowed_values": ALL_COLUMN_TYPE_VALUES,
28282913
},
2914+
"format": {
2915+
"jsonld_key": "sms:format",
2916+
"csv_header": "Format",
2917+
"node_label": "format",
2918+
"type": str,
2919+
"required_header": False,
2920+
"edge_rel": False,
2921+
"node_attr_dict": {"default": None},
2922+
"allowed_values": [member.value for member in JSONSchemaFormat],
2923+
},
28292924
}
28302925

28312926
return map_data_model_relationships
@@ -4290,6 +4385,7 @@ def _get_rules_by_names(names: list[str]) -> list[ValidationRule]:
42904385
def _get_validation_rule_based_fields(
42914386
validation_rules: list[str],
42924387
explicit_is_array: Optional[bool],
4388+
explicit_format: Optional[JSONSchemaFormat],
42934389
name: str,
42944390
column_type: Optional[ColumnType],
42954391
logger: Logger,
@@ -4334,7 +4430,7 @@ def _get_validation_rule_based_fields(
43344430
- js_pattern: If the type is string the JSON Schema pattern
43354431
"""
43364432
js_is_array = False
4337-
js_format = None
4433+
js_format = explicit_format
43384434
js_minimum = None
43394435
js_maximum = None
43404436
js_pattern = None
@@ -4398,10 +4494,51 @@ def _get_validation_rule_based_fields(
43984494
)
43994495
logger.warning(msg)
44004496

4401-
if ValidationRuleName.URL in validation_rule_names:
4402-
js_format = JSONSchemaFormat.URI
4403-
elif ValidationRuleName.DATE in validation_rule_names:
4404-
js_format = JSONSchemaFormat.DATE
4497+
# url and date rules are deprecated for adding format keyword
4498+
# TODO: remove the if/else block below
4499+
# https://sagebionetworks.jira.com/browse/SYNPY-1685
4500+
4501+
if explicit_format:
4502+
if (
4503+
ValidationRuleName.DATE in validation_rule_names
4504+
and explicit_format == JSONSchemaFormat.URI
4505+
):
4506+
msg = (
4507+
f"For property: {name}, the format is uri, "
4508+
"but the validation rule date is present. "
4509+
"The format will be set to uri."
4510+
)
4511+
logger.warning(msg)
4512+
elif (
4513+
ValidationRuleName.URL in validation_rule_names
4514+
and explicit_format == JSONSchemaFormat.DATE
4515+
):
4516+
msg = (
4517+
f"For property: {name}, the format is date, "
4518+
"but the validation rule url is present. "
4519+
"The format will be set to date."
4520+
)
4521+
logger.warning(msg)
4522+
4523+
else:
4524+
if ValidationRuleName.URL in validation_rule_names:
4525+
js_format = JSONSchemaFormat.URI
4526+
msg = (
4527+
f"A url validation rule is set for property: {name}, but the format is not set. "
4528+
"The format will be set to uri, but this behavior is deprecated and validation "
4529+
"rules will no longer be used in the future."
4530+
"Please explicitly set the format to uri in the data model."
4531+
)
4532+
logger.warning(msg)
4533+
elif ValidationRuleName.DATE in validation_rule_names:
4534+
js_format = JSONSchemaFormat.DATE
4535+
msg = (
4536+
f"A date validation rule is set for property: {name}, but the format is not set. "
4537+
"The format will be set to date, but this behavior is deprecated and validation "
4538+
"rules will no longer be used in the future."
4539+
"Please explicitly set the format to uri in the data model."
4540+
)
4541+
logger.warning(msg)
44054542

44064543
in_range_rule = get_rule_from_inputted_rules(
44074544
ValidationRuleName.IN_RANGE, validation_rules
@@ -4417,7 +4554,6 @@ def _get_validation_rule_based_fields(
44174554
if regex_rule:
44184555
js_pattern = get_regex_parameters_from_inputted_rule(regex_rule)
44194556

4420-
print(js_is_array)
44214557
return (
44224558
js_is_array,
44234559
js_format,
@@ -4496,7 +4632,6 @@ def __post_init__(self) -> None:
44964632
column_type = self.dmge.get_node_column_type(
44974633
node_display_name=self.display_name
44984634
)
4499-
45004635
# the list validation rule has been deprecated for use in deciding type
45014636
# TODO: set self.is_array here instead of return from _get_validation_rule_based_fields
45024637
# https://sagebionetworks.jira.com/browse/SYNPY-1692
@@ -4509,6 +4644,22 @@ def __post_init__(self) -> None:
45094644
else:
45104645
self.type = None
45114646
explicit_is_array = None
4647+
4648+
# url and date rules are deprecated for adding format keyword
4649+
# TODO: set self.format here instead of passing it to _get_validation_rule_based_fields
4650+
# https://sagebionetworks.jira.com/browse/SYNPY-1685
4651+
explicit_format = self.dmge.get_node_format(node_display_name=self.display_name)
4652+
if explicit_format:
4653+
if column_type not in (ListColumnType.STRING_LIST, AtomicColumnType.STRING):
4654+
msg = (
4655+
f"A format value (current value: {explicit_format.value}) "
4656+
f"is set for property: {self.name}, but columnType is not a string type "
4657+
f"(current value: {column_type.value}). "
4658+
"To use a format value the columnType must be set to one of: "
4659+
"[string, string_list] "
4660+
)
4661+
raise ValueError(msg)
4662+
45124663
(
45134664
self.is_array,
45144665
self.format,
@@ -4518,6 +4669,7 @@ def __post_init__(self) -> None:
45184669
) = _get_validation_rule_based_fields(
45194670
validation_rules=validation_rules,
45204671
explicit_is_array=explicit_is_array,
4672+
explicit_format=explicit_format,
45214673
name=self.name,
45224674
column_type=self.type,
45234675
logger=self.logger,
@@ -4896,7 +5048,7 @@ def _set_conditional_dependencies(
48965048

48975049

48985050
def _create_enum_array_property(
4899-
node: Node, use_valid_value_display_names: bool = True
5051+
node: TraversalNode, use_valid_value_display_names: bool = True
49005052
) -> Property:
49015053
"""
49025054
Creates a JSON Schema property array with enum items
@@ -4930,7 +5082,7 @@ def _create_enum_array_property(
49305082
return array_property
49315083

49325084

4933-
def _create_array_property(node: Node) -> Property:
5085+
def _create_array_property(node: TraversalNode) -> Property:
49345086
"""
49355087
Creates a JSON Schema property array
49365088
@@ -4962,7 +5114,7 @@ def _create_array_property(node: Node) -> Property:
49625114

49635115

49645116
def _create_enum_property(
4965-
node: Node, use_valid_value_display_names: bool = True
5117+
node: TraversalNode, use_valid_value_display_names: bool = True
49665118
) -> Property:
49675119
"""
49685120
Creates a JSON Schema property enum
@@ -4995,7 +5147,7 @@ def _create_enum_property(
49955147
return enum_property
49965148

49975149

4998-
def _create_simple_property(node: Node) -> Property:
5150+
def _create_simple_property(node: TraversalNode) -> Property:
49995151
"""
50005152
Creates a JSON Schema property
50015153
@@ -5031,7 +5183,7 @@ def _create_simple_property(node: Node) -> Property:
50315183
return prop
50325184

50335185

5034-
def _set_type_specific_keywords(schema: dict[str, Any], node: Node) -> None:
5186+
def _set_type_specific_keywords(schema: dict[str, Any], node: TraversalNode) -> None:
50355187
"""Sets JSON Schema keywords that are allowed if type has been set
50365188
50375189
Arguments:
@@ -5049,7 +5201,7 @@ def _set_type_specific_keywords(schema: dict[str, Any], node: Node) -> None:
50495201

50505202
def _set_property(
50515203
json_schema: JSONSchema,
5052-
node: Node,
5204+
node: TraversalNode,
50535205
use_property_display_names: bool = True,
50545206
use_valid_value_display_names: bool = True,
50555207
) -> None:

0 commit comments

Comments
 (0)