Add section on hyperparameter tuning using Vertex AI's blackbox optimization service

main
Aurélien Geron 2022-04-19 09:23:30 +12:00
parent 96edbd5ef4
commit c345b60732
1 changed files with 279 additions and 19 deletions

View File

@ -2409,13 +2409,264 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# Hyperparameter Tuning using Keras Tuner on Vertex AI"
"# Hyperparameter Tuning on Vertex AI"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Writing my_vertex_ai_trial.py\n"
]
}
],
"source": [
"%%writefile my_vertex_ai_trial.py\n",
"\n",
"import argparse\n",
"\n",
"# Each hyperparameter is read from a command-line flag and falls back to its\n",
"# default when the flag is omitted.\n",
"parser = argparse.ArgumentParser()\n",
"parser.add_argument(\"--n_hidden\", type=int, default=2)\n",
"parser.add_argument(\"--n_neurons\", type=int, default=256)\n",
"parser.add_argument(\"--learning_rate\", type=float, default=1e-2)\n",
"parser.add_argument(\"--optimizer\", default=\"adam\")\n",
"args = parser.parse_args()\n",
"\n",
"import tensorflow as tf\n",
"\n",
"def build_model(args):\n",
"    \"\"\"Build and compile a dense classifier from the parsed hyperparameters.\n",
"\n",
"    Reads `args.n_hidden` (number of hidden layers), `args.n_neurons` (units per\n",
"    hidden layer), `args.optimizer` (optimizer name) and `args.learning_rate`.\n",
"    The model takes 28x28 inputs, ends in a 10-unit softmax layer, and is\n",
"    created and compiled inside a `MirroredStrategy` scope.\n",
"    \"\"\"\n",
"    with tf.distribute.MirroredStrategy().scope():\n",
"        model = tf.keras.Sequential()\n",
"        model.add(tf.keras.layers.Flatten(input_shape=[28, 28], dtype=tf.uint8))\n",
"        for _ in range(args.n_hidden):\n",
"            model.add(tf.keras.layers.Dense(args.n_neurons, activation=\"relu\"))\n",
"        model.add(tf.keras.layers.Dense(10, activation=\"softmax\"))\n",
"        # Look the optimizer up by name, then override its default learning rate.\n",
"        opt = tf.keras.optimizers.get(args.optimizer)\n",
"        opt.learning_rate = args.learning_rate\n",
"        model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=opt,\n",
"                      metrics=[\"accuracy\"])\n",
"    return model\n",
"\n",
"# extra code – loads and splits the dataset\n",
"mnist = tf.keras.datasets.mnist.load_data()\n",
"(X_train_full, y_train_full), (X_test, y_test) = mnist\n",
"# the first 5,000 training instances are held out as the validation set\n",
"X_valid, X_train = X_train_full[:5000], X_train_full[5000:]\n",
"y_valid, y_train = y_train_full[:5000], y_train_full[5000:]\n",
"\n",
"# extra code – uses the AIP_* environment variables and creates the callbacks\n",
"import os\n",
"model_dir = os.getenv(\"AIP_MODEL_DIR\")\n",
"tensorboard_log_dir = os.getenv(\"AIP_TENSORBOARD_LOG_DIR\")\n",
"checkpoint_dir = os.getenv(\"AIP_CHECKPOINT_DIR\")\n",
"trial_id = os.getenv(\"CLOUD_ML_TRIAL_ID\")\n",
"tensorboard_cb = tf.keras.callbacks.TensorBoard(tensorboard_log_dir)\n",
"early_stopping_cb = tf.keras.callbacks.EarlyStopping(patience=5)\n",
"callbacks = [tensorboard_cb, early_stopping_cb]\n",
"\n",
"# train for at most 10 epochs, then save the model to AIP_MODEL_DIR\n",
"model = build_model(args)\n",
"history = model.fit(X_train, y_train, validation_data=(X_valid, y_valid),\n",
" epochs=10, callbacks=callbacks)\n",
"model.save(model_dir) # extra code\n",
"\n",
"import hypertune\n",
"\n",
"# Report this trial's objective value. The reporter gets its own name so that\n",
"# the imported `hypertune` module is not rebound to an instance.\n",
"metric_reporter = hypertune.HyperTune()\n",
"metric_reporter.report_hyperparameter_tuning_metric(\n",
"    hyperparameter_metric_tag=\"accuracy\",  # name of the reported metric\n",
"    metric_value=max(history.history[\"val_accuracy\"]),  # max accuracy value\n",
"    global_step=model.optimizer.iterations.numpy(),\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training script copied to:\n",
"gs://homl3-mybucket5/staging/aiplatform-2022-04-18-18:14:02.860-aiplatform_custom_trainer_script-0.1.tar.gz.\n"
]
}
],
"source": [
"# Wrap the trial script in a custom job; it is later passed to the\n",
"# hyperparameter tuning job as its `custom_job`.\n",
"trial_job = aiplatform.CustomJob.from_local_script(\n",
" display_name=\"my_search_trial_job\",\n",
" script_path=\"my_vertex_ai_trial.py\", # path to your training script\n",
" container_uri=\"gcr.io/cloud-aiplatform/training/tf-gpu.2-4:latest\",\n",
" staging_bucket=f\"gs://{bucket_name}/staging\",\n",
" accelerator_type=\"NVIDIA_TESLA_K80\",\n",
" accelerator_count=2, # in this example, each trial will have 2 GPUs\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Creating HyperparameterTuningJob\n",
"HyperparameterTuningJob created. Resource name: projects/522977795627/locations/us-central1/hyperparameterTuningJobs/5825136187899117568\n",
"To use this HyperparameterTuningJob in another session:\n",
"hpt_job = aiplatform.HyperparameterTuningJob.get('projects/522977795627/locations/us-central1/hyperparameterTuningJobs/5825136187899117568')\n",
"View HyperparameterTuningJob:\n",
"https://console.cloud.google.com/ai/platform/locations/us-central1/training/5825136187899117568?project=522977795627\n",
"HyperparameterTuningJob projects/522977795627/locations/us-central1/hyperparameterTuningJobs/5825136187899117568 current state:\n",
"JobState.JOB_STATE_RUNNING\n",
"HyperparameterTuningJob projects/522977795627/locations/us-central1/hyperparameterTuningJobs/5825136187899117568 current state:\n",
"JobState.JOB_STATE_RUNNING\n",
"HyperparameterTuningJob projects/522977795627/locations/us-central1/hyperparameterTuningJobs/5825136187899117568 current state:\n",
"JobState.JOB_STATE_RUNNING\n",
"HyperparameterTuningJob projects/522977795627/locations/us-central1/hyperparameterTuningJobs/5825136187899117568 current state:\n",
"JobState.JOB_STATE_RUNNING\n",
"HyperparameterTuningJob projects/522977795627/locations/us-central1/hyperparameterTuningJobs/5825136187899117568 current state:\n",
"JobState.JOB_STATE_RUNNING\n",
"HyperparameterTuningJob projects/522977795627/locations/us-central1/hyperparameterTuningJobs/5825136187899117568 current state:\n",
"JobState.JOB_STATE_RUNNING\n",
"HyperparameterTuningJob projects/522977795627/locations/us-central1/hyperparameterTuningJobs/5825136187899117568 current state:\n",
"JobState.JOB_STATE_RUNNING\n",
"HyperparameterTuningJob projects/522977795627/locations/us-central1/hyperparameterTuningJobs/5825136187899117568 current state:\n",
"JobState.JOB_STATE_RUNNING\n",
"HyperparameterTuningJob projects/522977795627/locations/us-central1/hyperparameterTuningJobs/5825136187899117568 current state:\n",
"JobState.JOB_STATE_SUCCEEDED\n",
"HyperparameterTuningJob run completed. Resource name: projects/522977795627/locations/us-central1/hyperparameterTuningJobs/5825136187899117568\n"
]
}
],
"source": [
"from google.cloud.aiplatform import hyperparameter_tuning as hpt\n",
"\n",
"# Search space: learning_rate uses a log scale, n_neurons and n_hidden a linear\n",
"# scale, and optimizer is a categorical choice between \"sgd\" and \"adam\".\n",
"# The \"accuracy\" key of metric_spec is the tag the trial script reports.\n",
"hp_job = aiplatform.HyperparameterTuningJob(\n",
" display_name=\"my_hp_search_job\",\n",
" custom_job=trial_job,\n",
" metric_spec={\"accuracy\": \"maximize\"},\n",
" parameter_spec={\n",
" \"learning_rate\": hpt.DoubleParameterSpec(min=1e-3, max=10, scale=\"log\"),\n",
" \"n_neurons\": hpt.IntegerParameterSpec(min=1, max=300, scale=\"linear\"),\n",
" \"n_hidden\": hpt.IntegerParameterSpec(min=1, max=10, scale=\"linear\"),\n",
" \"optimizer\": hpt.CategoricalParameterSpec([\"sgd\", \"adam\"]),\n",
" },\n",
" max_trial_count=100,\n",
" parallel_trial_count=20,\n",
")\n",
"hp_job.run()"
]
},
{
"cell_type": "code",
"execution_count": 90,
"metadata": {},
"outputs": [],
"source": [
"def get_final_metric(trial, metric_id):\n",
"    \"\"\"Return the final value of `metric_id` for `trial`.\n",
"\n",
"    Scans `trial.final_measurement.metrics` and returns the `value` of the\n",
"    first entry whose `metric_id` matches, or None when no entry matches.\n",
"    \"\"\"\n",
"    for metric in trial.final_measurement.metrics:\n",
"        if metric.metric_id == metric_id:\n",
"            return metric.value\n",
"    return None\n",
"\n",
"trials = hp_job.trials\n",
"# A trial that did not report the metric yields None, which np.argmax() and\n",
"# max() cannot compare with floats, so such trials are left out of the ranking.\n",
"scored_trials = [(trial, get_final_metric(trial, \"accuracy\")) for trial in trials]\n",
"scored_trials = [(trial, acc) for trial, acc in scored_trials if acc is not None]\n",
"trial_accuracies = [acc for _, acc in scored_trials]\n",
"best_trial = scored_trials[np.argmax(trial_accuracies)][0]"
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.977400004863739"
]
},
"execution_count": 91,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"max(trial_accuracies)"
]
},
{
"cell_type": "code",
"execution_count": 92,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'98'"
]
},
"execution_count": 92,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"best_trial.id"
]
},
{
"cell_type": "code",
"execution_count": 93,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[parameter_id: \"learning_rate\"\n",
"value {\n",
" number_value: 0.001\n",
"}\n",
", parameter_id: \"n_hidden\"\n",
"value {\n",
" number_value: 8.0\n",
"}\n",
", parameter_id: \"n_neurons\"\n",
"value {\n",
" number_value: 216.0\n",
"}\n",
", parameter_id: \"optimizer\"\n",
"value {\n",
" string_value: \"adam\"\n",
"}\n",
"]"
]
},
"execution_count": 93,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"best_trial.parameters"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Extra Material – Distributed Keras Tuner on Vertex AI"
]
},
{
"cell_type": "code",
"execution_count": 94,
"metadata": {},
"outputs": [
{
"name": "stdout",
@ -2506,12 +2757,12 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Replace `gs://my_bucket` with your bucket's name:"
"Replace `/gcs/my_bucket/` with <code>/gcs/<i>{bucket_name}</i>/</code>:"
]
},
{
"cell_type": "code",
"execution_count": 88,
"execution_count": 95,
"metadata": {},
"outputs": [],
"source": [
@ -2524,7 +2775,7 @@
},
{
"cell_type": "code",
"execution_count": 89,
"execution_count": 96,
"metadata": {},
"outputs": [],
"source": [
@ -2540,7 +2791,7 @@
},
{
"cell_type": "code",
"execution_count": 90,
"execution_count": 97,
"metadata": {},
"outputs": [
{
@ -2623,7 +2874,7 @@
},
{
"cell_type": "code",
"execution_count": 91,
"execution_count": 98,
"metadata": {},
"outputs": [],
"source": [
@ -2650,7 +2901,7 @@
},
{
"cell_type": "code",
"execution_count": 92,
"execution_count": 99,
"metadata": {},
"outputs": [
{
@ -2689,7 +2940,7 @@
},
{
"cell_type": "code",
"execution_count": 93,
"execution_count": 100,
"metadata": {},
"outputs": [
{
@ -2713,7 +2964,7 @@
},
{
"cell_type": "code",
"execution_count": 94,
"execution_count": 101,
"metadata": {},
"outputs": [
{
@ -2721,23 +2972,25 @@
"output_type": "stream",
"text": [
"Creating ImageDataset\n",
"Create ImageDataset backing LRO: projects/522977795627/locations/us-central1/datasets/7569473452214583296/operations/3268657372830105600\n",
"ImageDataset created. Resource name: projects/522977795627/locations/us-central1/datasets/7569473452214583296\n",
"Create ImageDataset backing LRO: projects/522977795627/locations/us-central1/datasets/7532459492777132032/operations/3812233931370004480\n",
"ImageDataset created. Resource name: projects/522977795627/locations/us-central1/datasets/7532459492777132032\n",
"To use this ImageDataset in another session:\n",
"ds = aiplatform.ImageDataset('projects/522977795627/locations/us-central1/datasets/7569473452214583296')\n",
"Importing ImageDataset data: projects/522977795627/locations/us-central1/datasets/7569473452214583296\n",
"Import ImageDataset data backing LRO: projects/522977795627/locations/us-central1/datasets/7569473452214583296/operations/5061090024523563008\n",
"ImageDataset data imported. Resource name: projects/522977795627/locations/us-central1/datasets/7569473452214583296\n"
"ds = aiplatform.ImageDataset('projects/522977795627/locations/us-central1/datasets/7532459492777132032')\n",
"Importing ImageDataset data: projects/522977795627/locations/us-central1/datasets/7532459492777132032\n",
"Import ImageDataset data backing LRO: projects/522977795627/locations/us-central1/datasets/7532459492777132032/operations/3010593197698056192\n",
"ImageDataset data imported. Resource name: projects/522977795627/locations/us-central1/datasets/7532459492777132032\n"
]
}
],
"source": [
"# Look the schema URI up on the already-imported `aiplatform` name: the package\n",
"# is `google.cloud.aiplatform`, so there is no top-level `aiplatform` module to\n",
"# import from.\n",
"single_label_classification = (\n",
"    aiplatform.schema.dataset.ioformat.image.single_label_classification)\n",
"\n",
"mnist_dataset = aiplatform.ImageDataset.create(\n",
" display_name=\"mnist-dataset\",\n",
" gcs_source=[f\"gs://{bucket_name}/mnist/import.csv\"],\n",
" project=project_id,\n",
" import_schema_uri=aiplatform.schema.dataset.ioformat.image.single_label_classification,\n",
" sync=True\n",
" import_schema_uri=single_label_classification,\n",
" sync=True,\n",
")"
]
},
@ -2745,7 +2998,14 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"**TODO**: create an AutoML training job on this dataset."
"Create an AutoML training job on this dataset:"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**TODO**"
]
},
{