diff --git a/kopt/hyopt.py b/kopt/hyopt.py
index da9cc6f..1fa4519 100644
--- a/kopt/hyopt.py
+++ b/kopt/hyopt.py
@@ -334,7 +334,7 @@ def add_n_epoch(df):
 # TODO - put to a separate module
 def _train_and_eval_single(train, valid, model,
                            batch_size=32, epochs=300, use_weight=False,
-                           callbacks=[], eval_best=False, add_eval_metrics={}, custom_objects=None):
+                           callbacks=[], eval_best=False, add_eval_metrics={}, custom_objects=None, data_format='npy'):
     """Fit and evaluate a keras model

     eval_best: if True, load the checkpointed model for evaluation
@@ -352,14 +352,30 @@ def _format_keras_history(history):

     # train the model
     logger.info("Fit...")
     history = History()
-    model.fit(train[0], train[1],
-              batch_size=batch_size,
-              validation_data=valid[:2],
-              epochs=epochs,
-              sample_weight=sample_weight,
-              verbose=2,
-              callbacks=[history] + callbacks)
+    # if we're using numpy arrays
+    if data_format == 'npy':
+        model.fit(train[0], train[1],
+                  batch_size=batch_size,
+                  validation_data=valid[:2],
+                  epochs=epochs,
+                  sample_weight=sample_weight,
+                  verbose=2,
+                  callbacks=[history] + callbacks)
+    # HDF5 inputs: shuffle whole batches so reads stay in increasing index order
+    elif data_format == 'hdf5':
+        model.fit(train[0], train[1],
+                  batch_size=batch_size,
+                  validation_data=valid[:2],
+                  epochs=epochs,
+                  sample_weight=sample_weight,
+                  verbose=2,
+                  callbacks=[history] + callbacks,
+                  shuffle='batch')
+    # unsupported format: fail loudly instead of killing the process
+    else:
+        raise ValueError("data_format='{0}' is not supported. Use 'npy' "
+                         "(default) or 'hdf5'.".format(data_format))
     # get history
     hist = _format_keras_history(history)
     # load and eval the best model
@@ -503,9 +519,16 @@ def __init__(self, db_name, exp_name,
                 optim_metric = kwargs["loss_metric"]
             if "loss_metric_mode" in kwargs and optim_metric_mode == "min":
                 optim_metric_mode = kwargs["loss_metric_mode"]
-            possible_kwargs = ["loss_metric", "loss_metric_mode"]
+
+            # optional kwarg: read the data directly from HDF5 (h5py) files
+            if 'data_format' in kwargs:
+                self.data_format = kwargs['data_format']
+            else:
+                self.data_format = 'npy'
+            possible_kwargs = ["loss_metric", "loss_metric_mode", 'data_format']
             add_arguments = set(kwargs.keys()).difference(possible_kwargs)
+
             if len(add_arguments) > 0:
                 raise ValueError("Unknown argument(s) {0}. **kwargs accepts only arguments: {1}. ".
                                  format(add_arguments, possible_kwargs))
@@ -629,7 +652,8 @@ def __call__(self, param):
                                                     callbacks=c_callbacks,
                                                     eval_best=self.save_model == "best",
                                                     add_eval_metrics=self.add_eval_metrics,
-                                                    custom_objects=self.custom_objects)
+                                                    custom_objects=self.custom_objects,
+                                                    data_format=self.data_format)
         if self.save_model == "last":
             model.save(model_path)
         else:
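Why shuffle='batch' in the hdf5 branch: h5py only allows fancy indexing with indices in increasing order, so Keras cannot do its usual per-sample shuffle on HDF5-backed arrays. shuffle='batch' instead permutes whole batch-sized chunks while keeping each chunk's rows in ascending order. A minimal sketch of the same pattern outside kopt, assuming a toy file (path, dataset keys and shapes are illustrative, not from this patch):

    import h5py
    import numpy as np
    from keras.models import Sequential
    from keras.layers import Dense

    # build a small toy HDF5 file; 'toy.h5' and its keys are made up
    with h5py.File('toy.h5', 'w') as f:
        f.create_dataset('x', data=np.random.rand(128, 20))
        f.create_dataset('y', data=np.random.randint(0, 2, size=(128, 1)))

    f = h5py.File('toy.h5', 'r')
    model = Sequential()
    model.add(Dense(1, activation='sigmoid', input_shape=(20,)))
    model.compile(loss='binary_crossentropy', optimizer='adam')

    # shuffle='batch' permutes whole batches but keeps rows within a batch
    # in ascending order, matching h5py's constraint on fancy indexing
    model.fit(f['x'], f['y'], batch_size=32, epochs=1, verbose=2,
              shuffle='batch')
    f.close()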
diff --git a/tests/data.py b/tests/data.py
index 36eb4fb..94b9a47 100644
--- a/tests/data.py
+++ b/tests/data.py
@@ -1,6 +1,6 @@
 from keras.preprocessing import sequence
 from keras.datasets import imdb
-
+import h5py

 def data(max_features=5000, maxlen=400):
     print('Loading data...')
@@ -20,4 +20,17 @@ def data(max_features=5000, maxlen=400):
     x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
     print('x_train shape:', x_train.shape)
     print('x_test shape:', x_test.shape)
-    return (x_train, y_train, [1, 2, 3, "dummy_data"]), (x_test, y_test)
\ No newline at end of file
+    return (x_train, y_train, [1, 2, 3, "dummy_data"]), (x_test, y_test)
+
+def data_hdf5():
+    '''
+    Return the keras.datasets.cifar10 data, read back from an HDF5 file.
+
+    Returns
+    -------
+    Tuple of tuples (x_train, y_train), (x_test, y_test)
+    '''
+    print('Loading data...')
+    # open read-only in SWMR mode; keep the handle open, the datasets are read lazily
+    f = h5py.File('tests/data/data.h5', 'r', libver='latest', swmr=True)
+    return (f['x_train'], f['y_train']), (f['x_test'], f['y_test'])
diff --git a/tests/data/data.h5 b/tests/data/data.h5
new file mode 100644
index 0000000..51167e7
Binary files /dev/null and b/tests/data/data.h5 differ
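tests/data/data.h5 is committed as a binary blob, so the diff cannot show how it was produced. Judging by the data_hdf5() docstring, it holds the keras.datasets.cifar10 arrays under the keys x_train/y_train/x_test/y_test. A sketch of how such a fixture could be regenerated under that assumption (the subsample size n is a guess, purely to keep the file small):

    import h5py
    from keras.datasets import cifar10

    (x_train, y_train), (x_test, y_test) = cifar10.load_data()

    n = 200  # assumed subsample size; the real fixture's size is unknown
    with h5py.File('tests/data/data.h5', 'w') as f:
        f.create_dataset('x_train', data=x_train[:n])
        f.create_dataset('y_train', data=y_train[:n])
        f.create_dataset('x_test', data=x_test[:n])
        f.create_dataset('y_test', data=y_test[:n])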
diff --git a/tests/model.py b/tests/model.py
index fe1d715..151e2be 100644
--- a/tests/model.py
+++ b/tests/model.py
@@ -4,7 +4,7 @@ from keras.models import Sequential
 from keras.layers import Dense, Dropout, Activation
 from keras.layers import Embedding
-from keras.layers import Conv1D, GlobalMaxPooling1D
+from keras.layers import Conv1D, Conv2D, Flatten, GlobalMaxPooling1D
 from keras.datasets import imdb

 # set parameters:
@@ -46,6 +46,35 @@ def build_model(train_data, max_features=5000, maxlen=400,
                   optimizer='adam',
                   metrics=['accuracy'])
     return model
+
+def build_model_hdf5(train_data,
+                     n_convolutions=3,
+                     batch_size=32,
+                     n_filters=250,
+                     kernel_size=3,
+                     hidden_dims=250):
+    print('Build model...')
+    model = Sequential()
+
+    # stack 2D convolutions over the image tensors; the input shape is
+    # inferred from the training data (no embedding layer needed here)
+    model.add(Conv2D(n_filters, kernel_size, padding='same', activation='relu',
+                     strides=1, input_shape=train_data[0].shape[1:]))
+    for _ in range(n_convolutions - 1):
+        model.add(Conv2D(n_filters, kernel_size, padding='same', activation='relu', strides=1))
+    # add a vanilla hidden layer:
+    model.add(Dense(hidden_dims))
+    model.add(Activation('relu'))
+
+    # flatten the feature maps and project onto the 10 CIFAR-10 classes:
+    model.add(Flatten())
+    model.add(Dense(10))
+    model.add(Activation('softmax'))
+
+    model.compile(loss='sparse_categorical_crossentropy',
+                  optimizer='adam',
+                  metrics=['accuracy'])
+    return model
 # model.fit(x_train, y_train,
 #           batch_size=batch_size,
 #           epochs=epochs,
diff --git a/tests/test_hyopt.py b/tests/test_hyopt.py
index 4db2353..5d77302 100644
--- a/tests/test_hyopt.py
+++ b/tests/test_hyopt.py
@@ -39,6 +39,37 @@ def test_argument_compileCN():
                           unknown_arg=3)


+def test_compilefn_train_test_split_h5py(tmpdir):
+    '''
+    Run a kopt optimization that loads its data from an HDF5 file.
+    '''
+    db_name = "test"
+    exp_name = "test2"
+    fn = CompileFN(db_name, exp_name,
+                   data_fn=data.data_hdf5,
+                   model_fn=model.build_model_hdf5,
+                   optim_metric="acc",
+                   optim_metric_mode="max",
+                   # eval
+                   valid_split=.1,
+                   stratified=False,
+                   random_state=True,
+                   save_dir="/tmp/",
+                   data_format='hdf5')
+    hyper_params = {
+        "data": {},
+        "model": {"n_filters": hp.choice("m_n_filters", (2, 5)),
+                  "n_convolutions": hp.choice("m_n_convolutions", (1, 3)),
+                  "kernel_size": hp.choice("m_kernel_size", (2, 5)),
+                  "hidden_dims": 3,
+                  },
+        "fit": {"epochs": 1}
+    }
+    fn_test(fn, hyper_params, tmp_dir=str(tmpdir))
+    trials = Trials()
+    best = fmin(fn, hyper_params, trials=trials, algo=tpe.suggest, max_evals=2)
+    assert isinstance(best, dict)
+
 def test_compilefn_train_test_split(tmpdir):
     db_name = "test"
     exp_name = "test2"
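Taken together, the feature is driven by a single new CompileFN kwarg. A usage sketch mirroring the test above, assuming the tests/ modules are importable (database and experiment names are placeholders):

    from hyperopt import fmin, tpe, hp, Trials
    from kopt import CompileFN
    import data   # provides data_hdf5()  (tests/data.py above)
    import model  # provides build_model_hdf5()  (tests/model.py above)

    fn = CompileFN("cifar10_db", "hdf5_run",   # placeholder names
                   data_fn=data.data_hdf5,
                   model_fn=model.build_model_hdf5,
                   optim_metric="acc",
                   optim_metric_mode="max",
                   valid_split=.1,
                   stratified=False,
                   random_state=True,
                   save_dir="/tmp/",
                   data_format='hdf5')  # new flag added by this patch

    hyper_params = {
        "data": {},
        "model": {"n_filters": hp.choice("m_n_filters", (2, 5)),
                  "hidden_dims": 3},
        "fit": {"epochs": 1},
    }

    # run the hyper-parameter search; training reads straight from HDF5
    trials = Trials()
    best = fmin(fn, hyper_params, trials=trials, algo=tpe.suggest, max_evals=2)
    print(best)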