Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion sagemaker-core/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ description = "An python package for sagemaker core functionalities"
authors = [
{name = "AWS", email = "sagemaker-interests@amazon.com"}
]
readme = "README.rst"
readme = "README.rst"
dependencies = [
# Add your dependencies here (Include lower and upper bounds as applicable)
"boto3>=1.42.2,<2.0.0",
Expand All @@ -34,6 +34,9 @@ dependencies = [
"omegaconf>=2.1.0",
"torch>=1.9.0",
"scipy>=1.5.0",
# Remote function dependencies
"cloudpickle>=2.0.0",
"paramiko>=2.11.0",
]
requires-python = ">=3.9"
classifiers = [
Expand Down
7 changes: 4 additions & 3 deletions sagemaker-core/src/sagemaker/core/training/configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,15 +257,16 @@ class InputData(BaseConfig):
Parameters:
channel_name (StrPipeVar):
The name of the input data source channel.
data_source (Union[str, S3DataSource, FileSystemDataSource, DatasetSource]):
data_source (Union[StrPipeVar, S3DataSource, FileSystemDataSource, DatasetSource]):
The data source for the channel. Can be an S3 URI string, local file path string,
S3DataSource object, or FileSystemDataSource object.
S3DataSource object, FileSystemDataSource object, DatasetSource object, or a
pipeline variable (Properties) from a previous step.
content_type (StrPipeVar):
The MIME type of the data.
"""

channel_name: StrPipeVar = None
data_source: Union[str, FileSystemDataSource, S3DataSource, DatasetSource] = None
data_source: Union[StrPipeVar, FileSystemDataSource, S3DataSource, DatasetSource] = None
content_type: StrPipeVar = None


Expand Down
4 changes: 4 additions & 0 deletions sagemaker-core/tests/integ/jumpstart/test_search_integ.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from sagemaker.core.resources import HubContent


@pytest.mark.serial
@pytest.mark.integ
def test_search_public_hub_models_default_args():
# Only query, uses default hub name and session
Expand All @@ -30,6 +31,7 @@ def test_search_public_hub_models_default_args():
assert len(results) > 0, "Expected at least one matching model from the public hub"


@pytest.mark.serial
@pytest.mark.integ
def test_search_public_hub_models_custom_session():
# Provide a custom SageMaker session
Expand All @@ -41,6 +43,7 @@ def test_search_public_hub_models_custom_session():
assert all(isinstance(m, HubContent) for m in results)


@pytest.mark.serial
@pytest.mark.integ
def test_search_public_hub_models_custom_hub_name():
# Using the default public hub but provided explicitly
Expand All @@ -51,6 +54,7 @@ def test_search_public_hub_models_custom_hub_name():
assert all(isinstance(m, HubContent) for m in results)


@pytest.mark.serial
@pytest.mark.integ
def test_search_public_hub_models_all_args():
# Provide both hub_name and session explicitly
Expand Down
17 changes: 16 additions & 1 deletion sagemaker-core/tests/unit/telemetry/test_telemetry_logging.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,18 @@
PYTHON_VERSION,
)
from sagemaker.core.user_agent import SDK_VERSION, process_studio_metadata_file
from sagemaker.serve.utils.exceptions import ModelBuilderException, LocalModelOutOfMemoryException

# Try to import the sagemaker-serve exceptions; tests that require them are skipped when the package is unavailable
try:
from sagemaker.serve.utils.exceptions import ModelBuilderException, LocalModelOutOfMemoryException
SAGEMAKER_SERVE_AVAILABLE = True
except ImportError:
SAGEMAKER_SERVE_AVAILABLE = False
# Define stand-in exception classes so module-level references (e.g. MOCK_EXCEPTION) still resolve
class ModelBuilderException(Exception):
pass
class LocalModelOutOfMemoryException(Exception):
pass

MOCK_SESSION = Mock()
MOCK_EXCEPTION = LocalModelOutOfMemoryException("mock raise ex")
Expand Down Expand Up @@ -147,6 +158,10 @@ def test_telemetry_emitter_decorator_success(
1, [1, 2], MOCK_SESSION, None, None, expected_extra_str
)

@pytest.mark.skipif(
not SAGEMAKER_SERVE_AVAILABLE,
reason="Requires sagemaker-serve package"
)
@patch("sagemaker.core.telemetry.telemetry_logging._send_telemetry_request")
@patch("sagemaker.core.telemetry.telemetry_logging.resolve_value_from_config")
def test_telemetry_emitter_decorator_handle_exception_success(
Expand Down
1 change: 1 addition & 0 deletions sagemaker-core/tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ markers =
release
image_uris_unit_test
timeout: mark a test as a timeout.
serial: marks tests that must run serially (not in parallel)

[testenv]
setenv =
Expand Down
2 changes: 1 addition & 1 deletion sagemaker-mlops/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta"
name = "sagemaker-mlops"
dynamic = ["version"]
description = "SageMaker MLOps package for workflow orchestration and model building"
readme = "README.md"
readme = "README.md"
requires-python = ">=3.9"
authors = [
{name = "Amazon Web Services"},
Expand Down
14 changes: 13 additions & 1 deletion sagemaker-mlops/tests/integ/test_pipeline_train_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,19 @@ def test_pipeline_with_train_and_registry(sagemaker_session, pipeline_session, r
assert execution_status == "Succeeded"
break
elif execution_status in ["Failed", "Stopped"]:
pytest.fail(f"Pipeline execution {execution_status}")
# Get detailed failure information
steps = sagemaker_session.sagemaker_client.list_pipeline_execution_steps(
PipelineExecutionArn=execution_desc["PipelineExecutionArn"]
)["PipelineExecutionSteps"]

failed_steps = []
for step in steps:
if step.get("StepStatus") == "Failed":
failure_reason = step.get("FailureReason", "Unknown reason")
failed_steps.append(f"{step['StepName']}: {failure_reason}")

failure_details = "\n".join(failed_steps) if failed_steps else "No detailed failure information available"
pytest.fail(f"Pipeline execution {execution_status}. Failed steps:\n{failure_details}")

time.sleep(60)
else:
Expand Down
3 changes: 1 addition & 2 deletions sagemaker-mlops/tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,7 @@ allowlist_externals =
commands =
python -c "import os; os.system('install-custom-pkgs --install-boto-wheels')"
pip install 'apache-airflow==2.10.4' --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-2.10.4/constraints-3.9.txt"
pip install 'torch==2.3.1+cpu' -f 'https://download.pytorch.org/whl/torch_stable.html'
pip install 'torchvision==0.18.1+cpu' -f 'https://download.pytorch.org/whl/torch_stable.html'
pip install 'torch==2.8.0' 'torchvision==0.23.0'
pip install 'dill>=0.3.9'

pytest {posargs}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ def __init__(
):
self._thread = None
self._loop = None
self._stop_event = asyncio.Event()
self._shutdown_event = threading.Event()
self._router = APIRouter()
self._task = task
Expand Down
8 changes: 1 addition & 7 deletions sagemaker-serve/tests/integ/test_tei_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,6 @@
MODEL_NAME_PREFIX = "tei-test-model"
ENDPOINT_NAME_PREFIX = "tei-test-endpoint"

# Configuration from backup file
AWS_REGION = "us-east-2"


@pytest.mark.slow_test
def test_tei_build_deploy_invoke_cleanup():
Expand Down Expand Up @@ -81,8 +78,6 @@ def build_and_deploy():
hf_model_id = MODEL_ID

schema_builder = create_schema_builder()
boto_session = boto3.Session(region_name=AWS_REGION)
sagemaker_session = Session(boto_session=boto_session)
unique_id = str(uuid.uuid4())[:8]

compute = Compute(
Expand All @@ -94,7 +89,6 @@ def build_and_deploy():
model=hf_model_id, # Use HuggingFace model string
model_server=ModelServer.TEI,
schema_builder=schema_builder,
sagemaker_session=sagemaker_session,
compute=compute,
)

Expand All @@ -104,7 +98,7 @@ def build_and_deploy():

core_endpoint = model_builder.deploy(
endpoint_name=f"{ENDPOINT_NAME_PREFIX}-{unique_id}",
initial_instance_count=1
initial_instance_count=1,
)
logger.info(f"Endpoint Successfully Created: {core_endpoint.endpoint_name}")

Expand Down
8 changes: 1 addition & 7 deletions sagemaker-serve/tests/integ/test_tgi_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,6 @@
MODEL_NAME_PREFIX = "tgi-test-model"
ENDPOINT_NAME_PREFIX = "tgi-test-endpoint"

# Configuration from backup file
AWS_REGION = "us-east-2"


@pytest.mark.slow_test
def test_tgi_build_deploy_invoke_cleanup():
Expand Down Expand Up @@ -81,8 +78,6 @@ def build_and_deploy():
hf_model_id = MODEL_ID

schema_builder = create_schema_builder()
boto_session = boto3.Session(region_name=AWS_REGION)
sagemaker_session = Session(boto_session=boto_session)
unique_id = str(uuid.uuid4())[:8]

compute = Compute(
Expand All @@ -101,7 +96,6 @@ def build_and_deploy():
model=hf_model_id, # Use HuggingFace model string
model_server=ModelServer.TGI,
schema_builder=schema_builder,
sagemaker_session=sagemaker_session,
compute=compute,
env_vars=env_vars
)
Expand All @@ -112,7 +106,7 @@ def build_and_deploy():

core_endpoint = model_builder.deploy(
endpoint_name=f"{ENDPOINT_NAME_PREFIX}-{unique_id}",
initial_instance_count=1
initial_instance_count=1,
)
logger.info(f"Endpoint Successfully Created: {core_endpoint.endpoint_name}")

Expand Down
30 changes: 16 additions & 14 deletions sagemaker-serve/tests/unit/test_model_builder_servers.py
Original file line number Diff line number Diff line change
Expand Up @@ -414,20 +414,22 @@ def test_all_supported_model_servers_have_routes(self):
"""Test that all supported model servers have corresponding build methods."""
from sagemaker.serve.model_builder_servers import _ModelBuilderServers

# Map of model servers to their expected build methods
server_method_map = {
ModelServer.TORCHSERVE: '_build_for_torchserve',
ModelServer.TRITON: '_build_for_triton',
ModelServer.TENSORFLOW_SERVING: '_build_for_tensorflow_serving',
ModelServer.DJL_SERVING: '_build_for_djl',
ModelServer.TEI: '_build_for_tei',
ModelServer.TGI: '_build_for_tgi',
ModelServer.MMS: '_build_for_transformers',
ModelServer.SMD: '_build_for_smd',
}

for model_server, method_name in server_method_map.items():
with self.subTest(model_server=model_server):
# Pairs of (model server, expected build method name). A list of tuples is used
# instead of an enum-keyed dict to avoid enum serialization issues with pytest-xdist
server_method_map = [
(ModelServer.TORCHSERVE, '_build_for_torchserve'),
(ModelServer.TRITON, '_build_for_triton'),
(ModelServer.TENSORFLOW_SERVING, '_build_for_tensorflow_serving'),
(ModelServer.DJL_SERVING, '_build_for_djl'),
(ModelServer.TEI, '_build_for_tei'),
(ModelServer.TGI, '_build_for_tgi'),
(ModelServer.MMS, '_build_for_transformers'),
(ModelServer.SMD, '_build_for_smd'),
]

for model_server, method_name in server_method_map:
# Pass the enum member's name (a plain string) to subTest instead of the enum itself to avoid serialization issues
with self.subTest(model_server=model_server.name):
self.mock_builder.model_server = model_server

# Mock the specific build method
Expand Down
8 changes: 8 additions & 0 deletions sagemaker-serve/tests/unit/test_model_builder_utils_triton.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,14 @@ class TestExportPytorchToOnnx(unittest.TestCase):
@patch('torch.onnx.export')
def test_export_pytorch_to_onnx_success(self, mock_export):
"""Test successful PyTorch to ONNX export."""
try:
import ml_dtypes
# Skip test if ml_dtypes doesn't have required attribute
if not hasattr(ml_dtypes, 'float4_e2m1fn'):
self.skipTest("ml_dtypes version incompatible with current numpy/onnx")
except ImportError:
pass

utils = _ModelBuilderUtils()
mock_model = Mock()
mock_schema = Mock()
Expand Down
5 changes: 3 additions & 2 deletions sagemaker-serve/tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -87,9 +87,10 @@ allowlist_externals =
commands =
python -c "import os; os.system('install-custom-pkgs --install-boto-wheels')"
pip install 'apache-airflow==2.10.4' --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-2.10.4/constraints-3.9.txt"
pip install 'torch==2.3.1+cpu' -f 'https://download.pytorch.org/whl/torch_stable.html'
pip install 'torchvision==0.18.1+cpu' -f 'https://download.pytorch.org/whl/torch_stable.html'
pip install 'torch==2.8.0' 'torchvision==0.23.0'
pip install 'onnx>=1.16.0,<1.17.0' 'onnxruntime>=1.19.0,<1.20.0'
pip install 'dill>=0.3.9'
pip install 'tensorflow==2.16.2'

pytest {posargs}
deps =
Expand Down
6 changes: 5 additions & 1 deletion sagemaker-train/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,8 @@ test = [
"pandas",
"scipy",
"omegaconf",
"graphene"
"graphene",
"IPython"
]

[tool.setuptools.packages.find]
Expand All @@ -71,6 +72,9 @@ version = { file = "VERSION"}
[tool.pytest.ini_options]
addopts = ["-vv"]
testpaths = ["tests"]
markers = [
"serial: marks tests that must run serially (not in parallel)",
]

[tool.black]
line-length = 100
Expand Down
1 change: 0 additions & 1 deletion sagemaker-train/src/sagemaker/ai_registry/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,6 @@ def _validate_dataset_file(cls, file_path: str) -> None:
max_size_mb = DATASET_MAX_FILE_SIZE_BYTES / (1024 * 1024)
raise ValueError(f"File size {file_size_mb:.2f} MB exceeds maximum allowed size of {max_size_mb:.0f} MB")

@classmethod
@classmethod
@_telemetry_emitter(feature=Feature.MODEL_CUSTOMIZATION, func_name="DataSet.get")
def get(cls, name: str, sagemaker_session=None) -> "DataSet":
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -300,12 +300,18 @@ class BenchMarkEvaluator(BaseEvaluator):
"""

benchmark: _Benchmark
dataset: Union[str, Any] # Required field, must come before optional fields
subtasks: Optional[Union[str, List[str]]] = None
evaluate_base_model: bool = True
_hyperparameters: Optional[Any] = None

# Template-required fields
evaluate_base_model: bool = False

@validator('dataset', pre=True)
def _resolve_dataset(cls, v):
"""Resolve dataset to string (S3 URI or ARN) and validate format.

Uses BaseEvaluator's common validation logic to avoid code duplication.
"""
return BaseEvaluator._validate_and_resolve_dataset(v)

@validator('benchmark')
def _validate_benchmark_model_compatibility(cls, v, values):
Expand Down
1 change: 1 addition & 0 deletions sagemaker-train/tests/integ/ai_registry/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from sagemaker.ai_registry.air_constants import HubContentStatus


@pytest.mark.serial
class TestDataSetIntegration:
"""Integration tests for DataSet operations."""

Expand Down
1 change: 1 addition & 0 deletions sagemaker-train/tests/integ/ai_registry/test_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from sagemaker.ai_registry.air_constants import HubContentStatus, REWARD_FUNCTION, REWARD_PROMPT


@pytest.mark.serial
class TestEvaluatorIntegration:
"""Integration tests for Evaluator operations."""

Expand Down
2 changes: 1 addition & 1 deletion sagemaker-train/tests/unit/train/local/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,7 +296,7 @@ def test_pad_groups_records_within_size(self):
def test_pad_splits_when_exceeding_size(self):
"""Test pad splits records when exceeding size."""
splitter = MagicMock()
splitter.split.return_value = ["a" * 1000, "b" * 1000, "c" * 1000]
splitter.split.return_value = ["a" * 500, "b" * 500, "c" * 500]

strategy = MultiRecordStrategy(splitter)
result = list(strategy.pad("file.txt", size=0.001)) # Very small size
Expand Down
1 change: 1 addition & 0 deletions sagemaker-train/tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ markers =
release
image_uris_unit_test
timeout: mark a test as a timeout.
serial: marks tests that must run serially (not in parallel)

[testenv]
setenv =
Expand Down
Loading