
Commit 3014918

mufaddal-rohawala authored and committed
remove mlflow app from eval integs (#1951)
Co-authored-by: Mufaddal Rohawala <mufi@amazon.com>
Co-authored-by: rsareddy0329 <rsareddy0329@gmail.com>
1 parent 119cb60 commit 3014918
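
In short, this commit comments out the mlflow_resource_arn keyword argument at every evaluator construction site in the eval integration tests, so the tests no longer attach an MLflow tracking server. For context, a minimal sketch of what one benchmark call site looks like after the change; the BenchmarkEvaluator/Benchmark names, the import path, and the TEST_CONFIG values are assumptions for illustration, since only the keyword arguments appear in the diffs below:

# Hypothetical sketch; class names, import path, and config values are assumed,
# not taken from this diff.
from sagemaker.modules.evaluate import BenchmarkEvaluator, Benchmark  # assumed import path

TEST_CONFIG = {
    "model_package_arn": "arn:aws:sagemaker:us-west-2:123456789012:model-package/example/1",  # placeholder
    "s3_output_path": "s3://example-bucket/eval-output/",  # placeholder
    "dataset_s3_uri": "s3://example-bucket/dataset.jsonl",  # placeholder
    "model_package_group_arn": "arn:aws:sagemaker:us-west-2:123456789012:model-package-group/example",  # placeholder
}

evaluator = BenchmarkEvaluator(
    benchmark=Benchmark.GEN_QA,
    model=TEST_CONFIG["model_package_arn"],
    s3_output_path=TEST_CONFIG["s3_output_path"],
    # mlflow_resource_arn=TEST_CONFIG["mlflow_tracking_server_arn"],  # commented out by this commit
    dataset=TEST_CONFIG["dataset_s3_uri"],
    model_package_group=TEST_CONFIG["model_package_group_arn"],
    base_eval_name="integ-test-gen-qa-eval",
)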

3 files changed: +13 / -13 lines


tests/integ/sagemaker/modules/evaluate/test_benchmark_evaluator.py

Lines changed: 5 additions & 5 deletions
@@ -125,7 +125,7 @@ def test_benchmark_evaluation_full_flow(self):
 benchmark=Benchmark.GEN_QA,
 model=TEST_CONFIG["model_package_arn"],
 s3_output_path=TEST_CONFIG["s3_output_path"],
-mlflow_resource_arn=TEST_CONFIG["mlflow_tracking_server_arn"],
+# mlflow_resource_arn=TEST_CONFIG["mlflow_tracking_server_arn"],
 dataset=TEST_CONFIG["dataset_s3_uri"],
 model_package_group=TEST_CONFIG["model_package_group_arn"],
 base_eval_name="integ-test-gen-qa-eval",

@@ -245,7 +245,7 @@ def test_benchmark_evaluator_validation(self):
 benchmark="invalid_benchmark",
 model=TEST_CONFIG["model_package_arn"],
 s3_output_path=TEST_CONFIG["s3_output_path"],
-mlflow_resource_arn=TEST_CONFIG["mlflow_tracking_server_arn"],
+# mlflow_resource_arn=TEST_CONFIG["mlflow_tracking_server_arn"],
 dataset="s3://bucket/dataset.jsonl",
 )

@@ -270,7 +270,7 @@ def test_benchmark_subtasks_validation(self):
 benchmark=Benchmark.MMLU,
 model=TEST_CONFIG["model_package_arn"],
 s3_output_path=TEST_CONFIG["s3_output_path"],
-mlflow_resource_arn=TEST_CONFIG["mlflow_tracking_server_arn"],
+# mlflow_resource_arn=TEST_CONFIG["mlflow_tracking_server_arn"],
 dataset="s3://bucket/dataset.jsonl",
 subtasks=["abstract_algebra", "anatomy"],
 model_package_group="arn:aws:sagemaker:us-west-2:123456789012:model-package-group/test",

@@ -283,7 +283,7 @@ def test_benchmark_subtasks_validation(self):
 benchmark=Benchmark.GEN_QA,
 model=TEST_CONFIG["model_package_arn"],
 s3_output_path=TEST_CONFIG["s3_output_path"],
-mlflow_resource_arn=TEST_CONFIG["mlflow_tracking_server_arn"],
+# mlflow_resource_arn=TEST_CONFIG["mlflow_tracking_server_arn"],
 dataset="s3://bucket/dataset.jsonl",
 subtasks=["invalid"],
 model_package_group="arn:aws:sagemaker:us-west-2:123456789012:model-package-group/test",

@@ -312,7 +312,7 @@ def test_benchmark_evaluation_base_model_only(self):
 benchmark=Benchmark.GEN_QA,
 model=BASE_MODEL_ONLY_CONFIG["base_model_id"],
 s3_output_path=BASE_MODEL_ONLY_CONFIG["s3_output_path"],
-mlflow_resource_arn=BASE_MODEL_ONLY_CONFIG["mlflow_tracking_server_arn"],
+# mlflow_resource_arn=BASE_MODEL_ONLY_CONFIG["mlflow_tracking_server_arn"],
 dataset=BASE_MODEL_ONLY_CONFIG["dataset_s3_uri"],
 base_eval_name="integ-test-base-model-only",
 # Note: model_package_group not needed for JumpStart models

tests/integ/sagemaker/modules/evaluate/test_custom_scorer_evaluator.py

Lines changed: 3 additions & 3 deletions
@@ -99,7 +99,7 @@ def test_custom_scorer_evaluation_full_flow(self):
 dataset=TEST_CONFIG["dataset_s3_uri"],
 model=TEST_CONFIG["model_package_arn"],
 s3_output_path=TEST_CONFIG["s3_output_path"],
-mlflow_resource_arn=TEST_CONFIG["mlflow_tracking_server_arn"],
+# mlflow_resource_arn=TEST_CONFIG["mlflow_tracking_server_arn"],
 evaluate_base_model=TEST_CONFIG["evaluate_base_model"],
 )

@@ -216,7 +216,7 @@ def test_custom_scorer_evaluator_validation(self):
 evaluator=123,  # Invalid type (not string, enum, or object)
 model=TEST_CONFIG["model_package_arn"],
 s3_output_path=TEST_CONFIG["s3_output_path"],
-mlflow_resource_arn=TEST_CONFIG["mlflow_tracking_server_arn"],
+# mlflow_resource_arn=TEST_CONFIG["mlflow_tracking_server_arn"],
 dataset=TEST_CONFIG["dataset_s3_uri"],
 )

@@ -254,7 +254,7 @@ def test_custom_scorer_with_builtin_metric(self):
 dataset=TEST_CONFIG["dataset_s3_uri"],
 model=TEST_CONFIG["model_package_arn"],
 s3_output_path=TEST_CONFIG["s3_output_path"],
-mlflow_resource_arn=TEST_CONFIG["mlflow_tracking_server_arn"],
+# mlflow_resource_arn=TEST_CONFIG["mlflow_tracking_server_arn"],
 evaluate_base_model=False,
 )

tests/integ/sagemaker/modules/evaluate/test_llm_as_judge_evaluator.py

Lines changed: 5 additions & 5 deletions
@@ -112,7 +112,7 @@ def test_llm_as_judge_evaluation_full_flow(self):
 dataset=TEST_CONFIG["dataset_s3_uri"],
 builtin_metrics=TEST_CONFIG["builtin_metrics"],
 custom_metrics=TEST_CONFIG["custom_metrics_json"],
-mlflow_resource_arn=TEST_CONFIG["mlflow_tracking_server_arn"],
+# mlflow_resource_arn=TEST_CONFIG["mlflow_tracking_server_arn"],
 s3_output_path=TEST_CONFIG["s3_output_path"],
 evaluate_base_model=TEST_CONFIG["evaluate_base_model"],
 )

@@ -235,7 +235,7 @@ def test_llm_as_judge_builtin_metrics_prefix_handling(self):
 evaluator_model=TEST_CONFIG["evaluator_model"],
 dataset=TEST_CONFIG["dataset_s3_uri"],
 s3_output_path=TEST_CONFIG["s3_output_path"],
-mlflow_resource_arn=TEST_CONFIG["mlflow_tracking_server_arn"],
+# mlflow_resource_arn=TEST_CONFIG["mlflow_tracking_server_arn"],
 builtin_metrics=["Builtin.Correctness", "Builtin.Helpfulness"],
 )
 assert evaluator_with_prefix.builtin_metrics == ["Builtin.Correctness", "Builtin.Helpfulness"]

@@ -246,7 +246,7 @@ def test_llm_as_judge_builtin_metrics_prefix_handling(self):
 evaluator_model=TEST_CONFIG["evaluator_model"],
 dataset=TEST_CONFIG["dataset_s3_uri"],
 s3_output_path=TEST_CONFIG["s3_output_path"],
-mlflow_resource_arn=TEST_CONFIG["mlflow_tracking_server_arn"],
+# mlflow_resource_arn=TEST_CONFIG["mlflow_tracking_server_arn"],
 builtin_metrics=["Correctness", "Helpfulness"],
 )
 assert evaluator_without_prefix.builtin_metrics == ["Correctness", "Helpfulness"]

@@ -271,7 +271,7 @@ def test_llm_as_judge_builtin_metrics_only(self):
 evaluator_model=TEST_CONFIG["evaluator_model"],
 dataset=TEST_CONFIG["dataset_s3_uri"],
 builtin_metrics=["Completeness", "Faithfulness", "Helpfulness"],
-mlflow_resource_arn=TEST_CONFIG["mlflow_tracking_server_arn"],
+# mlflow_resource_arn=TEST_CONFIG["mlflow_tracking_server_arn"],
 s3_output_path=TEST_CONFIG["s3_output_path"],
 evaluate_base_model=False,
 )

@@ -319,7 +319,7 @@ def test_llm_as_judge_custom_metrics_only(self):
 evaluator_model=TEST_CONFIG["evaluator_model"],
 dataset=TEST_CONFIG["dataset_s3_uri"],
 custom_metrics=TEST_CONFIG["custom_metrics_json"],
-mlflow_resource_arn=TEST_CONFIG["mlflow_tracking_server_arn"],
+# mlflow_resource_arn=TEST_CONFIG["mlflow_tracking_server_arn"],
 s3_output_path=TEST_CONFIG["s3_output_path"],
 evaluate_base_model=False,
 )
