From 4b11a35382f43b6f1bdc3f90b1cb1cbfc91f2956 Mon Sep 17 00:00:00 2001 From: Molly He Date: Tue, 3 Jun 2025 15:32:21 -0700 Subject: [PATCH 1/7] Add ignore_patterns in ModelTrainer to ignore specific files/folders --- src/sagemaker/modules/configs.py | 6 +++- src/sagemaker/modules/train/model_trainer.py | 37 ++++++++++++++++---- 2 files changed, 35 insertions(+), 8 deletions(-) diff --git a/src/sagemaker/modules/configs.py b/src/sagemaker/modules/configs.py index 3739c73c5d..d1ab81527c 100644 --- a/src/sagemaker/modules/configs.py +++ b/src/sagemaker/modules/configs.py @@ -21,7 +21,7 @@ from __future__ import absolute_import -from typing import Optional, Union +from typing import Optional, Union, List from pydantic import BaseModel, model_validator, ConfigDict import sagemaker_core.shapes as shapes @@ -96,12 +96,16 @@ class SourceCode(BaseConfig): command (Optional[str]): The command(s) to execute in the training job container. Example: "python my_script.py". If not specified, entry_script must be provided. + ignore_patterns: (Optional[List[str]]) : + The ignore patterns to ignore specific files/folders when uploading to S3. Example: + ['.env', '.git', 'data', '__pycache__']. """ source_dir: Optional[str] = None requirements: Optional[str] = None entry_script: Optional[str] = None command: Optional[str] = None + ignore_patterns: Optional[List[str]] = None class Compute(shapes.ResourceConfig): diff --git a/src/sagemaker/modules/train/model_trainer.py b/src/sagemaker/modules/train/model_trainer.py index 2143da4e5c..d7299d55b8 100644 --- a/src/sagemaker/modules/train/model_trainer.py +++ b/src/sagemaker/modules/train/model_trainer.py @@ -119,7 +119,8 @@ class ModelTrainer(BaseModel): from sagemaker.modules.train import ModelTrainer from sagemaker.modules.configs import SourceCode, Compute, InputData - source_code = SourceCode(source_dir="source", entry_script="train.py") + ignore_patterns = ['.env', '.git', 'data', '__pycache__'] + source_code = SourceCode(source_dir="source", entry_script="train.py", ignore_patterns=ignore_patterns) training_image = "123456789012.dkr.ecr.us-west-2.amazonaws.com/my-training-image" model_trainer = ModelTrainer( training_image=training_image, @@ -654,6 +655,7 @@ def train( channel_name=SM_CODE, data_source=self.source_code.source_dir, key_prefix=input_data_key_prefix, + ignore_patterns=self.source_code.ignore_patterns, ) final_input_data_config.append(source_code_channel) @@ -755,7 +757,11 @@ def train( local_container.train(wait) def create_input_data_channel( - self, channel_name: str, data_source: DataSourceType, key_prefix: Optional[str] = None + self, + channel_name: str, + data_source: DataSourceType, + key_prefix: Optional[str] = None, + ignore_patterns: Optional[List[str]] = None, ) -> Channel: """Create an input data channel for the training job. @@ -771,6 +777,9 @@ def create_input_data_channel( If specified, local data will be uploaded to: ``s3://///`` + ignore_patterns: (Optional[List[str]]) : + The ignore patterns to ignore specific files/folders when uploading to S3. + Example: ['.env', '.git', 'data', '__pycache__']. """ channel = None if isinstance(data_source, str): @@ -810,11 +819,25 @@ def create_input_data_channel( ) if self.sagemaker_session.default_bucket_prefix: key_prefix = f"{self.sagemaker_session.default_bucket_prefix}/{key_prefix}" - s3_uri = self.sagemaker_session.upload_data( - path=data_source, - bucket=self.sagemaker_session.default_bucket(), - key_prefix=key_prefix, - ) + if ignore_patterns: + tmp_dir = TemporaryDirectory() + shutil.copytree( + data_source, + os.path.join(tmp_dir.name, os.path.basename(data_source)), + dirs_exist_ok=True, + ignore=shutil.ignore_patterns(*ignore_patterns) + ) + s3_uri = self.sagemaker_session.upload_data( + path=tmp_dir.name, + bucket=self.sagemaker_session.default_bucket(), + key_prefix=key_prefix, + ) + else: + s3_uri = self.sagemaker_session.upload_data( + path=data_source, + bucket=self.sagemaker_session.default_bucket(), + key_prefix=key_prefix, + ) channel = Channel( channel_name=channel_name, data_source=DataSource( From 8571a99c5dd91806813c80ee51a22572b6df02b3 Mon Sep 17 00:00:00 2001 From: Molly He Date: Tue, 3 Jun 2025 15:40:46 -0700 Subject: [PATCH 2/7] fix black format --- src/sagemaker/modules/configs.py | 2 +- src/sagemaker/modules/train/model_trainer.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/sagemaker/modules/configs.py b/src/sagemaker/modules/configs.py index d1ab81527c..e53f5b3dd9 100644 --- a/src/sagemaker/modules/configs.py +++ b/src/sagemaker/modules/configs.py @@ -97,7 +97,7 @@ class SourceCode(BaseConfig): The command(s) to execute in the training job container. Example: "python my_script.py". If not specified, entry_script must be provided. ignore_patterns: (Optional[List[str]]) : - The ignore patterns to ignore specific files/folders when uploading to S3. Example: + The ignore patterns to ignore specific files/folders when uploading to S3. Example: ['.env', '.git', 'data', '__pycache__']. """ diff --git a/src/sagemaker/modules/train/model_trainer.py b/src/sagemaker/modules/train/model_trainer.py index d7299d55b8..c633cfe94b 100644 --- a/src/sagemaker/modules/train/model_trainer.py +++ b/src/sagemaker/modules/train/model_trainer.py @@ -757,9 +757,9 @@ def train( local_container.train(wait) def create_input_data_channel( - self, - channel_name: str, - data_source: DataSourceType, + self, + channel_name: str, + data_source: DataSourceType, key_prefix: Optional[str] = None, ignore_patterns: Optional[List[str]] = None, ) -> Channel: @@ -778,7 +778,7 @@ def create_input_data_channel( If specified, local data will be uploaded to: ``s3://///`` ignore_patterns: (Optional[List[str]]) : - The ignore patterns to ignore specific files/folders when uploading to S3. + The ignore patterns to ignore specific files/folders when uploading to S3. Example: ['.env', '.git', 'data', '__pycache__']. """ channel = None @@ -825,7 +825,7 @@ def create_input_data_channel( data_source, os.path.join(tmp_dir.name, os.path.basename(data_source)), dirs_exist_ok=True, - ignore=shutil.ignore_patterns(*ignore_patterns) + ignore=shutil.ignore_patterns(*ignore_patterns), ) s3_uri = self.sagemaker_session.upload_data( path=tmp_dir.name, From 071d6cb04b07e15889cd67a0e6a9268212591283 Mon Sep 17 00:00:00 2001 From: Molly He Date: Wed, 4 Jun 2025 15:09:06 -0700 Subject: [PATCH 3/7] add unit test --- .../sagemaker/modules/train/test_model_trainer.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/unit/sagemaker/modules/train/test_model_trainer.py b/tests/unit/sagemaker/modules/train/test_model_trainer.py index 5d4722b8aa..59bd015fdb 100644 --- a/tests/unit/sagemaker/modules/train/test_model_trainer.py +++ b/tests/unit/sagemaker/modules/train/test_model_trainer.py @@ -202,6 +202,17 @@ def model_trainer(): }, "should_throw": False, }, + { + "init_params": { + "training_image": DEFAULT_IMAGE, + "source_code": SourceCode( + source_dir=DEFAULT_SOURCE_DIR, + command="python custom_script.py", + ignore_patterns=["data"] + ), + }, + "should_throw": False, + }, ], ids=[ "no_params", @@ -213,6 +224,7 @@ def model_trainer(): "supported_source_code_local_tar_file", "supported_source_code_s3_dir", "supported_source_code_s3_tar_file", + "supported_source_code_ignore_patterns" ], ) def test_model_trainer_param_validation(test_case, modules_session): From 22a119d382bedb176be4f564407d227029d78762 Mon Sep 17 00:00:00 2001 From: Molly He Date: Thu, 5 Jun 2025 13:07:08 -0700 Subject: [PATCH 4/7] add default ignore_patterns, fix minor path issue when uploaded to s3 --- src/sagemaker/modules/configs.py | 6 +++--- src/sagemaker/modules/train/model_trainer.py | 19 +++++++++++++------ .../modules/train/test_model_trainer.py | 4 ++-- 3 files changed, 18 insertions(+), 11 deletions(-) diff --git a/src/sagemaker/modules/configs.py b/src/sagemaker/modules/configs.py index e53f5b3dd9..129dfd85df 100644 --- a/src/sagemaker/modules/configs.py +++ b/src/sagemaker/modules/configs.py @@ -97,15 +97,15 @@ class SourceCode(BaseConfig): The command(s) to execute in the training job container. Example: "python my_script.py". If not specified, entry_script must be provided. ignore_patterns: (Optional[List[str]]) : - The ignore patterns to ignore specific files/folders when uploading to S3. Example: - ['.env', '.git', 'data', '__pycache__']. + The ignore patterns to ignore specific files/folders when uploading to S3. If not specified, + default to: ['.env', '.git', '__pycache__', '.DS_Store']. """ source_dir: Optional[str] = None requirements: Optional[str] = None entry_script: Optional[str] = None command: Optional[str] = None - ignore_patterns: Optional[List[str]] = None + ignore_patterns: Optional[List[str]] = [".env", ".git", "__pycache__", ".DS_Store"] class Compute(shapes.ResourceConfig): diff --git a/src/sagemaker/modules/train/model_trainer.py b/src/sagemaker/modules/train/model_trainer.py index c633cfe94b..0ae9e81835 100644 --- a/src/sagemaker/modules/train/model_trainer.py +++ b/src/sagemaker/modules/train/model_trainer.py @@ -119,7 +119,7 @@ class ModelTrainer(BaseModel): from sagemaker.modules.train import ModelTrainer from sagemaker.modules.configs import SourceCode, Compute, InputData - ignore_patterns = ['.env', '.git', 'data', '__pycache__'] + ignore_patterns = ['.env', '.git', '__pycache__', '.DS_Store', 'data'] source_code = SourceCode(source_dir="source", entry_script="train.py", ignore_patterns=ignore_patterns) training_image = "123456789012.dkr.ecr.us-west-2.amazonaws.com/my-training-image" model_trainer = ModelTrainer( @@ -677,6 +677,7 @@ def train( channel_name=SM_DRIVERS, data_source=tmp_dir.name, key_prefix=input_data_key_prefix, + ignore_patterns=self.source_code.ignore_patterns, ) final_input_data_config.append(sm_drivers_channel) @@ -779,7 +780,7 @@ def create_input_data_channel( ``s3://///`` ignore_patterns: (Optional[List[str]]) : The ignore patterns to ignore specific files/folders when uploading to S3. - Example: ['.env', '.git', 'data', '__pycache__']. + If not specified, default to: ['.env', '.git', '__pycache__', '.DS_Store']. """ channel = None if isinstance(data_source, str): @@ -819,16 +820,19 @@ def create_input_data_channel( ) if self.sagemaker_session.default_bucket_prefix: key_prefix = f"{self.sagemaker_session.default_bucket_prefix}/{key_prefix}" - if ignore_patterns: + if ignore_patterns and _is_valid_path(data_source, path_type="Directory"): tmp_dir = TemporaryDirectory() + copied_path = os.path.join( + tmp_dir.name, os.path.basename(os.path.normpath(data_source)) + ) shutil.copytree( data_source, - os.path.join(tmp_dir.name, os.path.basename(data_source)), + copied_path, dirs_exist_ok=True, ignore=shutil.ignore_patterns(*ignore_patterns), ) s3_uri = self.sagemaker_session.upload_data( - path=tmp_dir.name, + path=copied_path, bucket=self.sagemaker_session.default_bucket(), key_prefix=key_prefix, ) @@ -884,7 +888,10 @@ def _get_input_data_config( channels.append(input_data) elif isinstance(input_data, InputData): channel = self.create_input_data_channel( - input_data.channel_name, input_data.data_source, key_prefix=key_prefix + input_data.channel_name, + input_data.data_source, + key_prefix=key_prefix, + ignore_patterns=self.source_code.ignore_patterns, ) channels.append(channel) else: diff --git a/tests/unit/sagemaker/modules/train/test_model_trainer.py b/tests/unit/sagemaker/modules/train/test_model_trainer.py index 59bd015fdb..cf38f26334 100644 --- a/tests/unit/sagemaker/modules/train/test_model_trainer.py +++ b/tests/unit/sagemaker/modules/train/test_model_trainer.py @@ -208,7 +208,7 @@ def model_trainer(): "source_code": SourceCode( source_dir=DEFAULT_SOURCE_DIR, command="python custom_script.py", - ignore_patterns=["data"] + ignore_patterns=["data"], ), }, "should_throw": False, @@ -224,7 +224,7 @@ def model_trainer(): "supported_source_code_local_tar_file", "supported_source_code_s3_dir", "supported_source_code_s3_tar_file", - "supported_source_code_ignore_patterns" + "supported_source_code_ignore_patterns", ], ) def test_model_trainer_param_validation(test_case, modules_session): From 8fe66ba0d92025a7f0316362fc42c1fc91a34fc3 Mon Sep 17 00:00:00 2001 From: Molly He Date: Thu, 5 Jun 2025 13:53:51 -0700 Subject: [PATCH 5/7] minor change to fix unit test failure --- src/sagemaker/modules/train/model_trainer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/sagemaker/modules/train/model_trainer.py b/src/sagemaker/modules/train/model_trainer.py index 0ae9e81835..337589d78d 100644 --- a/src/sagemaker/modules/train/model_trainer.py +++ b/src/sagemaker/modules/train/model_trainer.py @@ -891,7 +891,6 @@ def _get_input_data_config( input_data.channel_name, input_data.data_source, key_prefix=key_prefix, - ignore_patterns=self.source_code.ignore_patterns, ) channels.append(channel) else: From 62ab2a649e9d0c074a5870fa6fa01cc281ea5653 Mon Sep 17 00:00:00 2001 From: Molly He Date: Mon, 9 Jun 2025 11:41:28 -0700 Subject: [PATCH 6/7] add new variables in default ignore_patterns --- src/sagemaker/modules/configs.py | 11 +++++++++-- src/sagemaker/modules/train/model_trainer.py | 3 ++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/sagemaker/modules/configs.py b/src/sagemaker/modules/configs.py index 129dfd85df..1ada10dff3 100644 --- a/src/sagemaker/modules/configs.py +++ b/src/sagemaker/modules/configs.py @@ -98,14 +98,21 @@ class SourceCode(BaseConfig): If not specified, entry_script must be provided. ignore_patterns: (Optional[List[str]]) : The ignore patterns to ignore specific files/folders when uploading to S3. If not specified, - default to: ['.env', '.git', '__pycache__', '.DS_Store']. + default to: ['.env', '.git', '__pycache__', '.DS_Store', '.cache', '.ipynb_checkpoints']. """ source_dir: Optional[str] = None requirements: Optional[str] = None entry_script: Optional[str] = None command: Optional[str] = None - ignore_patterns: Optional[List[str]] = [".env", ".git", "__pycache__", ".DS_Store"] + ignore_patterns: Optional[List[str]] = [ + ".env", + ".git", + "__pycache__", + ".DS_Store", + ".cache", + ".ipynb_checkpoints", + ] class Compute(shapes.ResourceConfig): diff --git a/src/sagemaker/modules/train/model_trainer.py b/src/sagemaker/modules/train/model_trainer.py index 337589d78d..1e4df7134e 100644 --- a/src/sagemaker/modules/train/model_trainer.py +++ b/src/sagemaker/modules/train/model_trainer.py @@ -780,7 +780,8 @@ def create_input_data_channel( ``s3://///`` ignore_patterns: (Optional[List[str]]) : The ignore patterns to ignore specific files/folders when uploading to S3. - If not specified, default to: ['.env', '.git', '__pycache__', '.DS_Store']. + If not specified, default to: ['.env', '.git', '__pycache__', '.DS_Store', + '.cache', '.ipynb_checkpoints']. """ channel = None if isinstance(data_source, str): From 6e11a67bd2cf1d1e5f05d52ab3eee09dbd8a3c87 Mon Sep 17 00:00:00 2001 From: Molly He Date: Mon, 9 Jun 2025 14:13:09 -0700 Subject: [PATCH 7/7] fix indentation error in docstring for readthedocs --- src/sagemaker/modules/train/model_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sagemaker/modules/train/model_trainer.py b/src/sagemaker/modules/train/model_trainer.py index 1e4df7134e..7d83766c9f 100644 --- a/src/sagemaker/modules/train/model_trainer.py +++ b/src/sagemaker/modules/train/model_trainer.py @@ -781,7 +781,7 @@ def create_input_data_channel( ignore_patterns: (Optional[List[str]]) : The ignore patterns to ignore specific files/folders when uploading to S3. If not specified, default to: ['.env', '.git', '__pycache__', '.DS_Store', - '.cache', '.ipynb_checkpoints']. + '.cache', '.ipynb_checkpoints']. """ channel = None if isinstance(data_source, str):