95 changes: 93 additions & 2 deletions synapseclient/extensions/curator/schema_generation.py
@@ -659,6 +659,7 @@ def gather_csv_attributes_relationships(
# Check for presence of optional columns
model_includes_column_type = "columnType" in model_df.columns
model_includes_format = "Format" in model_df.columns
model_includes_pattern = "Pattern" in model_df.columns

# Build attribute/relationship dictionary
relationship_types = self.required_headers
@@ -697,6 +698,12 @@ def gather_csv_attributes_relationships(
attr_rel_dictionary[attribute_name]["Relationships"].update(
maximum_dict
)

if model_includes_pattern:
pattern_dict = self.parse_pattern(attr)
attr_rel_dictionary[attribute_name]["Relationships"].update(
pattern_dict
)
return attr_rel_dictionary

def parse_column_type(self, attr: dict) -> dict:
@@ -798,6 +805,26 @@ def parse_format(self, attribute_dict: dict) -> dict[str, str]:

return {"Format": format_string}

def parse_pattern(self, attribute_dict: dict) -> dict[str, str]:
"""Finds the pattern value if it exists and returns it as a dictionary.

Args:
attribute_dict: The attribute dictionary.
Returns:
A dictionary containing the pattern value if it exists, otherwise an empty dict.
"""
from pandas import isna

pattern_value = attribute_dict.get("Pattern")

if isna(pattern_value):
return {}

pattern_string = str(pattern_value).strip()

return {"Pattern": pattern_string}

def parse_csv_model(
self,
path_to_data_model: str,
@@ -1815,6 +1842,27 @@ def get_node_column_type(
raise ValueError(msg)
return column_type

def get_node_column_pattern(
self, node_label: Optional[str] = None, node_display_name: Optional[str] = None
) -> Optional[str]:
"""Gets the regex pattern of the node

Args:
node_label: The label of the node to get the type from
node_display_name: The display name of the node to get the type from

Raises:
ValueError: If the value from the node is not allowed

Returns:
The column pattern of the node if it has one, otherwise None
"""
node_label = self._get_node_label(node_label, node_display_name)
rel_node_label = self.dmr.get_relationship_value("pattern", "node_label")
pattern = self.graph.nodes[node_label][rel_node_label]

return pattern

def get_node_format(
self, node_label: Optional[str] = None, node_display_name: Optional[str] = None
) -> Optional[JSONSchemaFormat]:
@@ -1942,6 +1990,9 @@ class PropertyTemplate:
magic_validationRules: list = field(
default_factory=list, metadata=config(field_name="sms:validationRules")
)
magic_pattern: list = field(
default_factory=list, metadata=config(field_name="sms:pattern")
)


@dataclass_json
@@ -2841,6 +2892,7 @@ def define_data_model_relationships(self) -> dict:
allowed_values: A list of values the entry must be one of
edge_dir: str, 'in'/'out' is the edge an in or out edge. Define for edge relationships
jsonld_dir: str, 'in'/out is the direction in or out in the JSONLD.
pattern: regex pattern that the entry must match
"""
map_data_model_relationships = {
"displayName": {
@@ -3016,6 +3068,15 @@ def define_data_model_relationships(self) -> dict:
"edge_rel": False,
"node_attr_dict": {"default": None},
},
"pattern": {
"jsonld_key": "sms:pattern",
"csv_header": "Pattern",
"node_label": "pattern",
"type": str,
"required_header": False,
"edge_rel": False,
"node_attr_dict": {"default": None},
},
}

return map_data_model_relationships
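# --- Illustrative sketch (not part of the diff): how the new "pattern" entry ties the
# CSV header, graph node attribute, and JSON-LD key together, assuming a relationships
# map shaped like the one returned above. Variable names are hypothetical.
relationships = {
    "pattern": {
        "jsonld_key": "sms:pattern",
        "csv_header": "Pattern",
        "node_label": "pattern",
        "type": str,
        "required_header": False,
        "edge_rel": False,
        "node_attr_dict": {"default": None},
    },
}

entry = relationships["pattern"]
csv_row = {"Attribute": "SampleID", "Pattern": r"^GSM\d+$"}

# CSV header -> graph node attribute, falling back to the declared default when absent.
node_attrs = {entry["node_label"]: csv_row.get(entry["csv_header"], entry["node_attr_dict"]["default"])}

# Graph node attribute -> JSON-LD key when the model is serialized.
jsonld_fragment = {entry["jsonld_key"]: node_attrs[entry["node_label"]]}

assert jsonld_fragment == {"sms:pattern": r"^GSM\d+$"}
# --- end sketch ---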
@@ -4741,6 +4802,9 @@ def __post_init__(self) -> None:
relationship_value="minimum", node_display_name=self.display_name
)

column_pattern = self.dmge.get_node_column_pattern(
node_display_name=self.display_name
)
# list validation rule has been deprecated for use in deciding type
# TODO: set self.is_array here instead of return from _get_validation_rule_based_fields
# https://sagebionetworks.jira.com/browse/SYNPY-1692
@@ -4771,7 +4835,7 @@ def __post_init__(self) -> None:
self.format,
implicit_minimum,
implicit_maximum,
self.pattern,
rule_pattern,
) = _get_validation_rule_based_fields(
validation_rules=validation_rules,
explicit_is_array=explicit_is_array,
@@ -4790,6 +4854,30 @@ def __post_init__(self) -> None:
explicit_maximum if explicit_maximum is not None else implicit_maximum
)

if column_pattern and column_type and column_type.value != "string":
raise ValueError(
"Column type must be set to 'string' to use column pattern specification for regex validation."
)

self.pattern = column_pattern if column_pattern else rule_pattern

if rule_pattern and not column_pattern:
msg = (
f"A regex validation rule is set for property: {self.name}, but the pattern is not set in the data model. "
f"The regex pattern will be set to {self.pattern}, but the regex rule is deprecated and validation "
"rules will no longer be used in the future."
"Please explicitly set the regex pattern in the 'Pattern' column in the data model."
)
self.logger.warning(msg)

if self.pattern:
try:
re.compile(self.pattern)
except re.error as e:
raise SyntaxError(
f"The regex pattern '{self.pattern}' for property '{self.name}' is invalid."
) from e
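# --- Illustrative sketch (not part of the diff): a standalone reproduction of the
# precedence and validation applied above. An explicit 'Pattern' column value wins over
# a deprecated regex validation rule, and the chosen pattern must compile. Names are
# hypothetical; the real logic lives in __post_init__ and also emits the deprecation warning.
import re
from typing import Optional


def _sketch_resolve_pattern(
    column_pattern: Optional[str],
    rule_pattern: Optional[str],
    column_type: Optional[str],
    name: str,
) -> Optional[str]:
    if column_pattern and column_type and column_type != "string":
        raise ValueError(
            "Column type must be set to 'string' to use column pattern specification for regex validation."
        )
    pattern = column_pattern if column_pattern else rule_pattern
    if pattern:
        try:
            re.compile(pattern)
        except re.error as e:
            raise SyntaxError(f"The regex pattern '{pattern}' for property '{name}' is invalid.") from e
    return pattern


assert _sketch_resolve_pattern(r"^GSM\d+$", r"^\d+$", "string", "SampleID") == r"^GSM\d+$"
assert _sketch_resolve_pattern(None, r"^\d+$", "string", "SampleID") == r"^\d+$"
# --- end sketch ---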

def _determine_type_and_array(
self, column_type: Optional[ColumnType]
) -> tuple[Optional[AtomicColumnType], Optional[bool]]:
@@ -5369,14 +5457,17 @@ def _set_type_specific_keywords(schema: dict[str, Any], node: TraversalNode) ->
schema: The schema to set keywords on
node (TraversalNode): The node that corresponds to the property being set in the JSON Schema
"""
for attr in ["minimum", "maximum", "pattern"]:
for attr in ["minimum", "maximum"]:
value = getattr(node, attr)
if value is not None:
schema[attr] = value

if node.format is not None:
schema["format"] = node.format.value

if hasattr(node, "pattern") and node.pattern is not None:
Contributor:
Why if hasattr(node, "pattern")?

Contributor (Author):
In the loop starting on line 5280, assuming the node has the pattern attribute raises an exception when it is not present. @andrewelamb

Contributor:
Could you elaborate, @SageGJ? In any case, the node here is an instance of the TraversalNode dataclass, and one of its attributes is pattern, so you shouldn't ever have to check that the attribute exists via hasattr, right?

Contributor (Author):
If no pattern is specified for the node, then the TraversalNode instance of that node will not have a pattern attribute.

schema["pattern"] = node.pattern


def _set_property(
json_schema: JSONSchema,