diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py
index 2188c31d4e38..c3286e14b332 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py
@@ -32,6 +32,30 @@
 # Centralizing here avoids magic strings sprinkled through schema/content generation code.
 WRAPPER_KEY = "item"
 
+# Keys that must remain at the top level (outside the wrapper) when we
+# normalize flat JSONL rows into the canonical `item` structure.
+_RESERVED_ROOT_KEYS: Set[str] = {"sample"}
+
+
+def _normalize_row_for_item_wrapper(row: Dict[str, Any]) -> Dict[str, Any]:
+    """Ensure every row exposes an `item` object without losing reserved keys."""
+
+    wrapper = row.get(WRAPPER_KEY)
+    if isinstance(wrapper, dict):
+        return row
+
+    normalized: Dict[str, Any] = {}
+    item_payload: Dict[str, Any] = {}
+
+    for key, value in row.items():
+        if key in _RESERVED_ROOT_KEYS:
+            normalized[key] = value
+        elif key != WRAPPER_KEY:
+            item_payload[key] = value
+
+    normalized[WRAPPER_KEY] = item_payload
+    return normalized
+
 
 class OAIEvalRunCreationInfo(TypedDict, total=True):
     """Configuration for an evaluator"""
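To make the new normalization contract concrete, here is a minimal standalone sketch (not part of the diff) of what `_normalize_row_for_item_wrapper` is expected to return, using the same `WRAPPER_KEY` and `_RESERVED_ROOT_KEYS` values as the module; `normalize_row` is a hypothetical stand-in name.

```python
from typing import Any, Dict, Set

WRAPPER_KEY = "item"
_RESERVED_ROOT_KEYS: Set[str] = {"sample"}


def normalize_row(row: Dict[str, Any]) -> Dict[str, Any]:
    """Illustrative copy of the helper's logic."""
    if isinstance(row.get(WRAPPER_KEY), dict):
        return row  # already wrapped: pass through untouched
    normalized: Dict[str, Any] = {}
    item_payload: Dict[str, Any] = {}
    for key, value in row.items():
        if key in _RESERVED_ROOT_KEYS:
            normalized[key] = value  # reserved keys (e.g. "sample") stay at the root
        elif key != WRAPPER_KEY:
            item_payload[key] = value  # everything else moves under "item"
    normalized[WRAPPER_KEY] = item_payload
    return normalized


# Flat row: wrapped, with the reserved "sample" key kept outside the wrapper.
print(normalize_row({"query": "hi", "sample": {"output_text": "out"}}))
# -> {'sample': {'output_text': 'out'}, 'item': {'query': 'hi'}}

# Already-wrapped row: returned unchanged.
print(normalize_row({"item": {"query": "hi"}}))
# -> {'item': {'query': 'hi'}}
```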
@@ -146,7 +170,6 @@ def _begin_single_aoai_evaluation(
         that maps the user-supplied evaluators to the names of the graders as generated by the OAI service.
     :rtype: Tuple[str, str, Dict[str, str]]
     """
-
     # Format data for eval group creation
     LOGGER.info(f"AOAI: Preparing evaluation for {len(graders)} grader(s): {list(graders.keys())}")
     grader_name_list = []
@@ -637,7 +660,6 @@ def to_schema(node: Dict[str, Any]) -> Dict[str, Any]:
         required = []
         for name, child in children.items():
             props[name] = to_schema(child)
-            required.append(name)
         return {
             "type": "object",
             "properties": props,
@@ -785,25 +807,37 @@ def _get_data_source(input_data_df: pd.DataFrame, column_mapping: Dict[str, str]
     :rtype: Dict[str, Any]
     """
 
-    def _convert_value_to_string(val: Any) -> str:
-        """Convert a value to string representation for AOAI evaluation."""
+    def _convert_value(val: Any) -> Any:
+        """Convert to an AOAI-friendly representation while preserving structure when useful."""
         if val is None:
             return ""
-        elif isinstance(val, (str, int, float, bool)):
+        if isinstance(val, str):
+            return val
+        if isinstance(val, bool):
+            return val
+        # Align numerics with legacy text-only JSONL payloads by turning them into strings.
+        if isinstance(val, (int, float)):
             return str(val)
-        else:
-            try:  # Attempt to JSON serialize lists/dicts
-                return json.dumps(val, ensure_ascii=False)
-            except (TypeError, ValueError):
-                # Fallback for unserializable objects
-                return str(val)
+        if isinstance(val, (dict, list)):
+            return val
+        return str(val)
+
+    def _get_value_from_path(normalized_row: Dict[str, Any], path: str) -> Any:
+        cursor: Any = normalized_row
+        for segment in path.split("."):
+            if not isinstance(cursor, dict):
+                return None
+            cursor = cursor.get(segment)
+            if cursor is None:
+                return None
+        return cursor
 
     LOGGER.info(
        f"AOAI: Building data source from {len(input_data_df)} rows with {len(column_mapping)} column mappings..."
    )
-    # Gather path specs: list of tuples (original_mapping_value, relative_parts, dataframe_column_name)
+    # Gather path specs: list of dicts (source_path, relative_parts, dataframe_col, is_run_output).
     # relative_parts excludes the wrapper (so schema + content align).
-    path_specs: List[Tuple[str, List[str], str]] = []
 
+    path_specs: List[Dict[str, Any]] = []
 
     for name, formatted_entry in column_mapping.items():
         if not (
@@ -842,30 +876,53 @@ def _convert_value_to_string(val: Any) -> str:
             if not relative_parts:
                 continue
 
-            path_specs.append((formatted_entry, relative_parts, dataframe_col))
+            path_specs.append(
+                {
+                    "source_path": source_path,
+                    "relative_parts": relative_parts,
+                    "dataframe_col": dataframe_col,
+                    "is_run_output": False,
+                }
+            )
         elif pieces[0] == "run" and len(pieces) >= 3 and pieces[1] == "outputs":
             # Target / run outputs become __outputs. columns
             run_col = "__outputs." + ".".join(pieces[2:])
             leaf_name = pieces[-1]
-            path_specs.append((formatted_entry, [leaf_name], run_col))
+            path_specs.append(
+                {
+                    "source_path": None,
+                    "relative_parts": [leaf_name],
+                    "dataframe_col": run_col,
+                    "is_run_output": True,
+                }
+            )
 
     LOGGER.info(f"AOAI: Processed {len(path_specs)} path specifications from column mappings.")
     content: List[Dict[str, Any]] = []
 
     for _, row in input_data_df.iterrows():
+        normalized_row = _normalize_row_for_item_wrapper(row.to_dict())
         item_root: Dict[str, Any] = {}
 
-        # Track which dataframe columns have been processed via column_mapping
-        processed_cols: Set[str] = set()
+        # Track which top-level keys under the wrapper have been populated via mappings
+        processed_root_keys: Set[str] = set()
+
+        for spec in path_specs:
+            rel_parts = spec["relative_parts"]
+            if not rel_parts:
+                continue
+
+            if spec["is_run_output"]:
+                val = row.get(spec["dataframe_col"], None)
+            else:
+                source_path = cast(str, spec["source_path"])
+                val = _get_value_from_path(normalized_row, source_path)
+                if val is None:
+                    val = row.get(spec["dataframe_col"], None)
 
-        for _, rel_parts, df_col in path_specs:
-            # Safely fetch value
-            val = row.get(df_col, None)
-            # Convert value to string to match schema's "type": "string" leaves.
-            str_val = _convert_value_to_string(val)
+            norm_val = _convert_value(val)
 
-            # Insert into nested dict
             cursor = item_root
             for seg in rel_parts[:-1]:
                 nxt = cursor.get(seg)
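The tuple-to-dict change above gives each mapping enough context to resolve a value from either the normalized row or the raw DataFrame column. Roughly, and with illustrative `source_path`/`dataframe_col` values (the exact strings are built by parsing code outside this hunk):

```python
# A "${data.item.query}" mapping yields a spec resolved via path lookup first,
# falling back to the DataFrame column when the path comes up empty:
data_spec = {
    "source_path": "item.query",   # dotted path into the normalized row (illustrative)
    "relative_parts": ["query"],   # position under the "item" wrapper
    "dataframe_col": "query",      # fallback DataFrame column (illustrative)
    "is_run_output": False,
}

# A "${run.outputs.response}" mapping always reads the __outputs. column:
run_spec = {
    "source_path": None,
    "relative_parts": ["response"],
    "dataframe_col": "__outputs.response",
    "is_run_output": True,
}
```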
@@ -874,19 +931,27 @@ def _convert_value_to_string(val: Any) -> str:
                     cursor[seg] = nxt
                 cursor = nxt
             leaf_key = rel_parts[-1]
-            cursor[leaf_key] = str_val
+            cursor[leaf_key] = norm_val
 
-            # Mark this dataframe column as processed
-            processed_cols.add(df_col)
+            processed_root_keys.add(rel_parts[0])
 
-        # Add any unmapped dataframe columns directly to item_root
-        for col_name in input_data_df.columns:
-            if col_name not in processed_cols:
-                val = row.get(col_name, None)
-                str_val = _convert_value_to_string(val)
-                item_root[col_name] = str_val
+        # Pull through any wrapper entries that were never explicitly mapped
+        wrapper_view = normalized_row.get(WRAPPER_KEY, {})
+        if isinstance(wrapper_view, dict):
+            for key, raw_val in wrapper_view.items():
+                if key in processed_root_keys:
+                    continue
+                if key in item_root:
+                    continue
+                item_root[key] = _convert_value(raw_val)
 
-        content.append({WRAPPER_KEY: item_root})
+        content_row: Dict[str, Any] = {}
+        for root_key in _RESERVED_ROOT_KEYS:
+            if root_key in normalized_row:
+                content_row[root_key] = _convert_value(normalized_row[root_key])
+
+        content_row[WRAPPER_KEY] = item_root
+        content.append(content_row)
 
     LOGGER.info(f"AOAI: Generated {len(content)} content items for data source.")
     return {
@@ -926,7 +991,6 @@ def _begin_eval_run(
     LOGGER.info(f"AOAI: Creating eval run '{run_name}' for eval group {eval_group_id}...")
 
     data_source = _get_data_source(input_data_df, column_mapping)
-
     if data_source_params is not None:
         data_source.update(data_source_params)
 
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/data/flat_sample_output.jsonl b/sdk/evaluation/azure-ai-evaluation/tests/unittests/data/flat_sample_output.jsonl
new file mode 100644
index 000000000000..7ddf3d107bcd
--- /dev/null
+++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/data/flat_sample_output.jsonl
@@ -0,0 +1 @@
+{"query":"how can i help someone be a good person","context":"you can teach them stuff","ground_truth":"teach stuff","response":"I can help you with this query. Give me more details","test":{"test_string":"baking cakes is fun!"},"sample.output_text":"someoutput"}
\ No newline at end of file
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/data/nested_item_keyword.jsonl b/sdk/evaluation/azure-ai-evaluation/tests/unittests/data/nested_item_keyword.jsonl
new file mode 100644
index 000000000000..b1f17a4ad807
--- /dev/null
+++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/data/nested_item_keyword.jsonl
@@ -0,0 +1 @@
+{"item":{"query":"what is the weather today","context":"its sunny","ground_truth":"rainy","response":"It is sunny out","test":{"test_string":"baking cakes is a fun pass time when you are bored!"}}}
\ No newline at end of file
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_aoai_data_source.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_aoai_data_source.py
index c74991160e9d..df08cb575446 100644
--- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_aoai_data_source.py
+++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_aoai_data_source.py
@@ -84,6 +84,18 @@ def wrapped_flat_test_data_file():
     return _get_file("wrapped_flat_test_data.jsonl")
 
 
+@pytest.fixture
+def nested_item_keyword_data():
+    """Fixture for data that already contains an 'item' wrapper column."""
+    return pd.read_json(_get_file("nested_item_keyword.jsonl"), lines=True)
+
+
+@pytest.fixture
+def flat_sample_output_data():
+    """Fixture for flat data that includes dotted sample metadata (e.g. sample.output_text)."""
+    return pd.read_json(_get_file("flat_sample_output.jsonl"), lines=True)
+
+
 @pytest.mark.unittest
 class TestBuildSchemaTreeFromPaths:
     """Test suite for the _build_schema_tree_from_paths helper function."""
@@ -98,7 +110,6 @@ def test_single_level_paths(self):
         assert "required" in schema
         assert set(schema["properties"].keys()) == {"query", "response", "ground_truth"}
         assert all(prop["type"] == "string" for prop in schema["properties"].values())
-        assert set(schema["required"]) == {"query", "response", "ground_truth"}
 
     def test_nested_paths(self):
         """Test building schema with nested paths."""
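The dropped assertion matches the `required.append(name)` removal in `to_schema` earlier in this diff: properties are still declared, but no longer all forced to be required. Assuming the unchanged return statement still emits the `required` list, the schema for the single-level case now looks roughly like:

```python
schema = {
    "type": "object",
    "properties": {
        "query": {"type": "string"},
        "response": {"type": "string"},
        "ground_truth": {"type": "string"},
    },
    "required": [],  # present but empty, so rows may omit any of these fields
}
```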
@@ -395,6 +406,49 @@ def test_data_source_with_none_values(self, flat_test_data):
         # None should be converted to empty string
         assert content[1][WRAPPER_KEY]["response"] == ""
 
+    def test_data_source_with_item_column_and_nested_values(self, nested_item_keyword_data):
+        """Ensure rows that already have an 'item' column keep nested dicts intact."""
+
+        column_mapping = {
+            "query": "${data.item.query}",
+            "response": "${data.item.response}",
+            "test_string": "${data.item.test.test_string}",
+        }
+
+        data_source = _get_data_source(nested_item_keyword_data, column_mapping)
+        content = data_source["source"]["content"]
+
+        assert len(content) == len(nested_item_keyword_data)
+        first_row = content[0]
+        assert WRAPPER_KEY in first_row
+
+        item_payload = first_row[WRAPPER_KEY]
+        assert item_payload["query"] == "what is the weather today"
+        assert item_payload["response"] == "It is sunny out"
+        assert item_payload["test"]["test_string"] == "baking cakes is a fun pass time when you are bored!"
+        # Ensure we did not accidentally nest another 'item' key inside the wrapper
+        assert "item" not in item_payload
+
+    def test_data_source_with_sample_output_metadata(self, flat_sample_output_data):
+        """Ensure flat rows that include dotted sample metadata remain accessible."""
+
+        column_mapping = {
+            "query": "${data.item.query}",
+            "response": "${data.item.response}",
+            "test_string": "${data.item.test.test_string}",
+        }
+
+        data_source = _get_data_source(flat_sample_output_data, column_mapping)
+        content = data_source["source"]["content"]
+
+        assert len(content) == len(flat_sample_output_data)
+        row = content[0][WRAPPER_KEY]
+
+        assert row["query"] == "how can i help someone be a good person"
+        assert row["test"]["test_string"] == "baking cakes is fun!"
+        # sample.output_text should follow the row through normalization without being stringified
+        assert row["sample.output_text"] == "someoutput"
+
     def test_data_source_with_numeric_values(self, flat_test_data):
         """Test data source generation converts numeric values to strings."""
         flat_test_data["score"] = [95, 87, 92]
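For a quick end-to-end check of the new behavior, something like the following should mirror the first new test (a sketch; `_get_data_source` is a private helper, and the import path is taken from this diff's module layout):

```python
import pandas as pd

from azure.ai.evaluation._evaluate._evaluate_aoai import WRAPPER_KEY, _get_data_source

# One already-wrapped row, matching nested_item_keyword.jsonl above.
df = pd.DataFrame(
    [
        {
            "item": {
                "query": "what is the weather today",
                "response": "It is sunny out",
                "test": {"test_string": "baking cakes is a fun pass time when you are bored!"},
            }
        }
    ]
)

column_mapping = {
    "query": "${data.item.query}",
    "response": "${data.item.response}",
    "test_string": "${data.item.test.test_string}",
}

data_source = _get_data_source(df, column_mapping)
row = data_source["source"]["content"][0][WRAPPER_KEY]
print(row["query"], "|", row["test"]["test_string"])
# -> what is the weather today | baking cakes is a fun pass time when you are bored!
```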