From c345b607321de03d3b9e3b90ed078c4defa300c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= Date: Tue, 19 Apr 2022 09:23:30 +1200 Subject: [PATCH] Add section on hyperparameter tuning using Vertex AI's blackbox optimization service --- 19_training_and_deploying_at_scale.ipynb | 298 +++++++++++++++++++++-- 1 file changed, 279 insertions(+), 19 deletions(-) diff --git a/19_training_and_deploying_at_scale.ipynb b/19_training_and_deploying_at_scale.ipynb index 9697970..c22a86c 100644 --- a/19_training_and_deploying_at_scale.ipynb +++ b/19_training_and_deploying_at_scale.ipynb @@ -2409,13 +2409,264 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Hyperparameter Tuning using Keras Tuner on Vertex AI" + "# Hyperparameter Tuning on Vertex AI" ] }, { "cell_type": "code", "execution_count": 87, "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Writing my_vertex_ai_trial.py\n" + ] + } + ], + "source": [ + "%%writefile my_vertex_ai_trial.py\n", + "\n", + "import argparse\n", + "\n", + "parser = argparse.ArgumentParser()\n", + "parser.add_argument(\"--n_hidden\", type=int, default=2)\n", + "parser.add_argument(\"--n_neurons\", type=int, default=256)\n", + "parser.add_argument(\"--learning_rate\", type=float, default=1e-2)\n", + "parser.add_argument(\"--optimizer\", default=\"adam\")\n", + "args = parser.parse_args()\n", + "\n", + "import tensorflow as tf\n", + "\n", + "def build_model(args):\n", + " with tf.distribute.MirroredStrategy().scope():\n", + " model = tf.keras.Sequential()\n", + " model.add(tf.keras.layers.Flatten(input_shape=[28, 28], dtype=tf.uint8))\n", + " for _ in range(args.n_hidden):\n", + " model.add(tf.keras.layers.Dense(args.n_neurons, activation=\"relu\"))\n", + " model.add(tf.keras.layers.Dense(10, activation=\"softmax\"))\n", + " opt = tf.keras.optimizers.get(args.optimizer)\n", + " opt.learning_rate = args.learning_rate\n", + " 
model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=opt,\n", + " metrics=[\"accuracy\"])\n", + " return model\n", + "\n", + "# extra code – loads and splits the dataset\n", + "mnist = tf.keras.datasets.mnist.load_data()\n", + "(X_train_full, y_train_full), (X_test, y_test) = mnist\n", + "X_valid, X_train = X_train_full[:5000], X_train_full[5000:]\n", + "y_valid, y_train = y_train_full[:5000], y_train_full[5000:]\n", + "\n", + "# extra code – use the AIP_* environment variable and create the callbacks\n", + "import os\n", + "model_dir = os.getenv(\"AIP_MODEL_DIR\")\n", + "tensorboard_log_dir = os.getenv(\"AIP_TENSORBOARD_LOG_DIR\")\n", + "checkpoint_dir = os.getenv(\"AIP_CHECKPOINT_DIR\")\n", + "trial_id = os.getenv(\"CLOUD_ML_TRIAL_ID\")\n", + "tensorboard_cb = tf.keras.callbacks.TensorBoard(tensorboard_log_dir)\n", + "early_stopping_cb = tf.keras.callbacks.EarlyStopping(patience=5)\n", + "callbacks = [tensorboard_cb, early_stopping_cb]\n", + "\n", + "model = build_model(args)\n", + "history = model.fit(X_train, y_train, validation_data=(X_valid, y_valid),\n", + " epochs=10, callbacks=callbacks)\n", + "model.save(model_dir) # extra code\n", + "\n", + "import hypertune\n", + "\n", + "hypertune = hypertune.HyperTune()\n", + "hypertune.report_hyperparameter_tuning_metric(\n", + " hyperparameter_metric_tag=\"accuracy\", # name of the reported metric\n", + " metric_value=max(history.history[\"val_accuracy\"]), # max accuracy value\n", + " global_step=model.optimizer.iterations.numpy(),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training script copied to:\n", + "gs://homl3-mybucket5/staging/aiplatform-2022-04-18-18:14:02.860-aiplatform_custom_trainer_script-0.1.tar.gz.\n" + ] + } + ], + "source": [ + "trial_job = aiplatform.CustomJob.from_local_script(\n", + " display_name=\"my_search_trial_job\",\n", + " 
script_path=\"my_vertex_ai_trial.py\", # path to your training script\n", + " container_uri=\"gcr.io/cloud-aiplatform/training/tf-gpu.2-4:latest\",\n", + " staging_bucket=f\"gs://{bucket_name}/staging\",\n", + " accelerator_type=\"NVIDIA_TESLA_K80\",\n", + " accelerator_count=2, # in this example, each trial will have 2 GPUs\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating HyperparameterTuningJob\n", + "HyperparameterTuningJob created. Resource name: projects/522977795627/locations/us-central1/hyperparameterTuningJobs/5825136187899117568\n", + "To use this HyperparameterTuningJob in another session:\n", + "hpt_job = aiplatform.HyperparameterTuningJob.get('projects/522977795627/locations/us-central1/hyperparameterTuningJobs/5825136187899117568')\n", + "View HyperparameterTuningJob:\n", + "https://console.cloud.google.com/ai/platform/locations/us-central1/training/5825136187899117568?project=522977795627\n", + "HyperparameterTuningJob projects/522977795627/locations/us-central1/hyperparameterTuningJobs/5825136187899117568 current state:\n", + "JobState.JOB_STATE_RUNNING\n", + "HyperparameterTuningJob projects/522977795627/locations/us-central1/hyperparameterTuningJobs/5825136187899117568 current state:\n", + "JobState.JOB_STATE_RUNNING\n", + "HyperparameterTuningJob projects/522977795627/locations/us-central1/hyperparameterTuningJobs/5825136187899117568 current state:\n", + "JobState.JOB_STATE_RUNNING\n", + "HyperparameterTuningJob projects/522977795627/locations/us-central1/hyperparameterTuningJobs/5825136187899117568 current state:\n", + "JobState.JOB_STATE_RUNNING\n", + "HyperparameterTuningJob projects/522977795627/locations/us-central1/hyperparameterTuningJobs/5825136187899117568 current state:\n", + "JobState.JOB_STATE_RUNNING\n", + "HyperparameterTuningJob 
projects/522977795627/locations/us-central1/hyperparameterTuningJobs/5825136187899117568 current state:\n", + "JobState.JOB_STATE_RUNNING\n", + "HyperparameterTuningJob projects/522977795627/locations/us-central1/hyperparameterTuningJobs/5825136187899117568 current state:\n", + "JobState.JOB_STATE_RUNNING\n", + "HyperparameterTuningJob projects/522977795627/locations/us-central1/hyperparameterTuningJobs/5825136187899117568 current state:\n", + "JobState.JOB_STATE_RUNNING\n", + "HyperparameterTuningJob projects/522977795627/locations/us-central1/hyperparameterTuningJobs/5825136187899117568 current state:\n", + "JobState.JOB_STATE_SUCCEEDED\n", + "HyperparameterTuningJob run completed. Resource name: projects/522977795627/locations/us-central1/hyperparameterTuningJobs/5825136187899117568\n" + ] + } + ], + "source": [ + "from google.cloud.aiplatform import hyperparameter_tuning as hpt\n", + "\n", + "hp_job = aiplatform.HyperparameterTuningJob(\n", + " display_name=\"my_hp_search_job\",\n", + " custom_job=trial_job,\n", + " metric_spec={\"accuracy\": \"maximize\"},\n", + " parameter_spec={\n", + " \"learning_rate\": hpt.DoubleParameterSpec(min=1e-3, max=10, scale=\"log\"),\n", + " \"n_neurons\": hpt.IntegerParameterSpec(min=1, max=300, scale=\"linear\"),\n", + " \"n_hidden\": hpt.IntegerParameterSpec(min=1, max=10, scale=\"linear\"),\n", + " \"optimizer\": hpt.CategoricalParameterSpec([\"sgd\", \"adam\"]),\n", + " },\n", + " max_trial_count=100,\n", + " parallel_trial_count=20,\n", + ")\n", + "hp_job.run()" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": {}, + "outputs": [], + "source": [ + "def get_final_metric(trial, metric_id):\n", + " for metric in trial.final_measurement.metrics:\n", + " if metric.metric_id == metric_id:\n", + " return metric.value\n", + "\n", + "trials = hp_job.trials\n", + "trial_accuracies = [get_final_metric(trial, \"accuracy\") for trial in trials]\n", + "best_trial = trials[np.argmax(trial_accuracies)]" + ] + }, + 
{ + "cell_type": "code", + "execution_count": 91, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.977400004863739" + ] + }, + "execution_count": 91, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "max(trial_accuracies)" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'98'" + ] + }, + "execution_count": 92, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "best_trial.id" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[parameter_id: \"learning_rate\"\n", + "value {\n", + " number_value: 0.001\n", + "}\n", + ", parameter_id: \"n_hidden\"\n", + "value {\n", + " number_value: 8.0\n", + "}\n", + ", parameter_id: \"n_neurons\"\n", + "value {\n", + " number_value: 216.0\n", + "}\n", + ", parameter_id: \"optimizer\"\n", + "value {\n", + " string_value: \"adam\"\n", + "}\n", + "]" + ] + }, + "execution_count": 93, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "best_trial.parameters" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Extra Material – Distributed Keras Tuner on Vertex AI" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -2506,12 +2757,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Replace `gs://my_bucket` with your bucket's name:" + "Replace `/gcs/my_bucket/` with `/gcs/{bucket_name}/`:" ] }, { "cell_type": "code", - "execution_count": 88, + "execution_count": 95, "metadata": {}, "outputs": [], "source": [ @@ -2524,7 +2775,7 @@ }, { "cell_type": "code", - "execution_count": 89, + "execution_count": 96, "metadata": {}, "outputs": [], "source": [ @@ -2540,7 +2791,7 @@ }, { "cell_type": "code", - "execution_count": 90, + "execution_count": 97, "metadata": {}, "outputs": [ { @@ -2623,7 
+2874,7 @@ }, { "cell_type": "code", - "execution_count": 91, + "execution_count": 98, "metadata": {}, "outputs": [], "source": [ @@ -2650,7 +2901,7 @@ }, { "cell_type": "code", - "execution_count": 92, + "execution_count": 99, "metadata": {}, "outputs": [ { @@ -2689,14 +2940,14 @@ }, { "cell_type": "code", - "execution_count": 93, + "execution_count": 100, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Uploaded datasets/mnist\n" + "Uploaded datasets/mnist \n" ] } ], @@ -2713,7 +2964,7 @@ }, { "cell_type": "code", - "execution_count": 94, + "execution_count": 101, "metadata": {}, "outputs": [ { @@ -2721,23 +2972,25 @@ "output_type": "stream", "text": [ "Creating ImageDataset\n", - "Create ImageDataset backing LRO: projects/522977795627/locations/us-central1/datasets/7569473452214583296/operations/3268657372830105600\n", - "ImageDataset created. Resource name: projects/522977795627/locations/us-central1/datasets/7569473452214583296\n", + "Create ImageDataset backing LRO: projects/522977795627/locations/us-central1/datasets/7532459492777132032/operations/3812233931370004480\n", + "ImageDataset created. Resource name: projects/522977795627/locations/us-central1/datasets/7532459492777132032\n", "To use this ImageDataset in another session:\n", - "ds = aiplatform.ImageDataset('projects/522977795627/locations/us-central1/datasets/7569473452214583296')\n", - "Importing ImageDataset data: projects/522977795627/locations/us-central1/datasets/7569473452214583296\n", - "Import ImageDataset data backing LRO: projects/522977795627/locations/us-central1/datasets/7569473452214583296/operations/5061090024523563008\n", - "ImageDataset data imported. 
Resource name: projects/522977795627/locations/us-central1/datasets/7569473452214583296\n" + "ds = aiplatform.ImageDataset('projects/522977795627/locations/us-central1/datasets/7532459492777132032')\n", + "Importing ImageDataset data: projects/522977795627/locations/us-central1/datasets/7532459492777132032\n", + "Import ImageDataset data backing LRO: projects/522977795627/locations/us-central1/datasets/7532459492777132032/operations/3010593197698056192\n", + "ImageDataset data imported. Resource name: projects/522977795627/locations/us-central1/datasets/7532459492777132032\n" ] } ], "source": [ + "image_schemas = aiplatform.schema.dataset.ioformat.image  # reuse the imported aiplatform package\n", + "\n", "mnist_dataset = aiplatform.ImageDataset.create(\n", " display_name=\"mnist-dataset\",\n", " gcs_source=[f\"gs://{bucket_name}/mnist/import.csv\"],\n", " project=project_id,\n", - " import_schema_uri=aiplatform.schema.dataset.ioformat.image.single_label_classification,\n", - " sync=True\n", + " import_schema_uri=image_schemas.single_label_classification,\n", + " sync=True,\n", ")" ] }, @@ -2745,7 +2998,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "**TODO**: create an AutoML training job on this dataset." + "Create an AutoML training job on this dataset:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**TODO**" ] }, {