====== Filename: ./cloud_build/presubmit.sh ======
#!/bin/bash

# Copyright 2019 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#            http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Run all tests: unittest discovery starts from the current working
# directory and is executed with the Python 2 interpreter.
python2 -m unittest discover
====== Filename: ./cloud_build/cloudbuild.yaml ======
steps:
# Presubmit step: runs cloud_build/presubmit.sh with bash inside the
# gcloud builder image.
- name: 'gcr.io/cloud-builders/gcloud'
  id: 'presubmit'
  entrypoint: 'bash'
  args: ['cloud_build/presubmit.sh']
====== Filename: ./custom_image_utils/__init__.py ======
====== Filename: ./custom_image_utils/shell_script_generator.py.orig ======
# Copyright 2019,2020,2024 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#            http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Shell script based image creation workflow generator.
"""

from datetime import datetime


# Bash workflow script template, rendered with `_template.format(**args)` in
# `Generator.generate`.
#
# * Single-brace names are str.format placeholders: run_id, image_name,
#   project_id, zone, log_dir, gcs_log_dir, sources_map_k, sources_map_v,
#   custom_sources_path, metadata_flag, trusted_cert, dataproc_base_image,
#   disk_size, storage_location_flag, family, network_flag, subnetwork_flag,
#   no_external_ip_flag, machine_type, accelerator_flag and
#   service_account_flag.
# * Doubled braces ({{ and }}) are emitted as literal bash braces.
# * This is not a raw string: `\\e` reaches bash as `\e`, and a backslash at
#   the end of a line joins that line with the next one before bash sees it.
_template = """#!/usr/bin/env bash

# Script for creating Dataproc custom image.

set -euo pipefail

RED='\\e[0;31m'
GREEN='\\e[0;32m'
NC='\\e[0m'

base_obj_type="images"

function execute_with_retries() (
  set +x
  local -r cmd="$*"

  for ((i = 0; i < 3; i++)); do
    set -x
    time eval "$cmd" > "/tmp/{run_id}/install.log" 2>&1 && retval=$? || {{ retval=$? ; cat "/tmp/{run_id}/install.log" ; }}
    set +x
    if [[ $retval == 0 ]] ; then return 0 ; fi
    sleep 5
  done
  return 1
)

function exit_handler() {{
  echo 'Cleaning up before exiting.'

  if [[ -f /tmp/{run_id}/vm_created ]]; then ( set +e
    echo 'Deleting VM instance.'
    execute_with_retries \
      gcloud compute instances delete {image_name}-install --project={project_id} --zone={zone} -q
  ) elif [[ -f /tmp/{run_id}/disk_created ]]; then
    echo 'Deleting disk.'
    execute_with_retries \
      gcloud compute ${{base_obj_type}} delete {image_name}-install --project={project_id} --zone={zone} -q
  fi

  echo 'Uploading local logs to GCS bucket.'
  gsutil -m rsync -r {log_dir}/ {gcs_log_dir}/

  if [[ -f /tmp/{run_id}/image_created ]]; then
    echo -e "${{GREEN}}Workflow succeeded${{NC}}, check logs at {log_dir}/ or {gcs_log_dir}/"
    exit 0
  else
    echo -e "${{RED}}Workflow failed${{NC}}, check logs at {log_dir}/ or {gcs_log_dir}/"
    exit 1
  fi
}}

function test_element_in_array {{
  local test_element="$1" ; shift
  local -a test_array=("$@")

  for item in "${{test_array[@]}}"; do
    if [[ "${{item}}" == "${{test_element}}" ]]; then return 0 ; fi
  done
  return 1
}}

function print_modulus_md5sum {{
  local derfile="$1"
  openssl x509 -noout -modulus -in "${{derfile}}" | openssl md5 | awk '{{print $2}}'
}}

function print_img_dbs_modulus_md5sums() {{
  local long_img_name="$1"
  local img_name="$(echo ${{long_img_name}} | sed -e 's:^.*/::')"
  local json_tmpfile="/tmp/{run_id}/${{img_name}}.json"
  gcloud compute images describe ${{long_img_name}} --format json > "${{json_tmpfile}}"

  local -a db_certs=()
  mapfile -t db_certs < <( cat ${{json_tmpfile}} | jq -r 'try .shieldedInstanceInitialState.dbs[].content' )

  local -a modulus_md5sums=()
  for key in "${{!db_certs[@]}}" ; do
    local derfile="/tmp/{run_id}/${{img_name}}.${{key}}.der"
    echo "${{db_certs[${{key}}]}}" | \
      perl -M'MIME::Base64(decode_base64url)' -ne 'chomp; print( decode_base64url($_) )' \
      > "${{derfile}}"
    modulus_md5sums+=( $(print_modulus_md5sum "${{derfile}}") )
  done

  echo "${{modulus_md5sums[@]}}"
}}

function main() {{
  echo 'Uploading files to GCS bucket.'
  declare -a sources_k=({sources_map_k})
  declare -a sources_v=({sources_map_v})
  for i in "${{!sources_k[@]}}"; do
    gsutil cp "${{sources_v[i]}}" "{custom_sources_path}/${{sources_k[i]}}" > /dev/null 2>&1
  done

  local cert_args=""
  local num_src_certs="0"
  metadata_arg="{metadata_flag}"
  if [[ -n '{trusted_cert}' ]] && [[ -f '{trusted_cert}' ]]; then
    # build tls/ directory from variables defined near the header of
    # the examples/secure-boot/create-key-pair.sh file

    eval "$(bash examples/secure-boot/create-key-pair.sh)"
    metadata_arg="${{metadata_arg}},public_secret_name=${{public_secret_name}},private_secret_name=${{private_secret_name}},secret_project=${{secret_project}},secret_version=${{secret_version}}"

    # by default, a gcloud secret with the name of efi-db-pub-key-042 is
    # created in the current project to store the certificate installed
    # as the signature database file for this disk image

    # The MS UEFI CA is a reasonable base from which to build trust.  We
    # will trust code signed by this CA as well as code signed by
    # trusted_cert (tls/db.der)

    # The Microsoft Corporation UEFI CA 2011
    local -r MS_UEFI_CA="tls/MicCorUEFCA2011_2011-06-27.crt"
    test -f "${{MS_UEFI_CA}}" || \
      curl -L -o ${{MS_UEFI_CA}} 'https://go.microsoft.com/fwlink/p/?linkid=321194'

    local -a cert_list=()

    local -a default_cert_list
    default_cert_list=("{trusted_cert}" "${{MS_UEFI_CA}}")
    local -a src_img_modulus_md5sums=()

    mapfile -t src_img_modulus_md5sums < <(print_img_dbs_modulus_md5sums {dataproc_base_image})
    num_src_certs="${{#src_img_modulus_md5sums[@]}}"
    echo "debug - num_src_certs: [${{#src_img_modulus_md5sums[*]}}]"
    echo "value of src_img_modulus_md5sums: [${{src_img_modulus_md5sums}}]"
    if [[ -z "${{src_img_modulus_md5sums}}" ]]; then
      num_src_certs=0
      echo "no db certificates in source image"
      cert_list=( "${{default_cert_list[@]}}" )
    else
      echo "${{num_src_certs}} db certificates attached to source image"
      echo "db certs exist in source image"
      for cert in ${{default_cert_list[*]}}; do
        if test_element_in_array "$(print_modulus_md5sum ${{cert}})" ${{src_img_modulus_md5sums[@]}} ; then
          echo "cert ${{cert}} is already in source image's db list"
        else
          cert_list+=("${{cert}}")
        fi
      done
      # append source image's cert list
      local img_name="$(echo {dataproc_base_image} | sed -e 's:^.*/::')"
      if [[ ${{#cert_list[@]}} -ne 0 ]] && compgen -G "/tmp/{run_id}/${{img_name}}.*.der" > /dev/null ; then
        cert_list+=(/tmp/{run_id}/${{img_name}}.*.der)
      fi
    fi

    if [[ ${{#cert_list[@]}} -eq 0 ]]; then
      echo "all certificates already included in source image's db list"
    else
      cert_args="--signature-database-file=$(IFS=, ; echo "${{cert_list[*]}}") --guest-os-features=UEFI_COMPATIBLE"
    fi
  fi

  date

  if [[ -z "${{cert_args}}" && "${{num_src_certs}}" -ne "0" ]]; then
    echo 'Re-using base image'
    base_obj_type="reuse"
    instance_disk_args='--image-project={project_id} --image={dataproc_base_image} --boot-disk-size={disk_size}G --boot-disk-type=pd-ssd'

  elif [[ -n "${{cert_args}}" ]] ; then
    echo 'Creating image.'
    base_obj_type="images"
    instance_disk_args='--image-project={project_id} --image={image_name}-install --boot-disk-size={disk_size}G --boot-disk-type=pd-ssd'
    execute_with_retries \
      gcloud compute images create {image_name}-install \
      --project={project_id} \
      --source-image={dataproc_base_image} \
      ${{cert_args}} \
      {storage_location_flag} \
      --family={family}
    touch "/tmp/{run_id}/disk_created"
  else
    echo 'Creating disk.'
    base_obj_type="disks"
    instance_disk_args='--disk=auto-delete=yes,boot=yes,mode=rw,name={image_name}-install'
    execute_with_retries gcloud compute disks create {image_name}-install \
      --project={project_id} \
      --zone={zone} \
      --image={dataproc_base_image} \
      --type=pd-ssd \
      --size={disk_size}GB
    touch "/tmp/{run_id}/disk_created"
  fi

  date
  echo 'Creating VM instance to run customization script.'
  execute_with_retries gcloud compute instances create {image_name}-install \
      --project={project_id} \
      --zone={zone} \
      {network_flag} \
      {subnetwork_flag} \
      {no_external_ip_flag} \
      --machine-type={machine_type} \
      ${{instance_disk_args}} \
      {accelerator_flag} \
      {service_account_flag} \
      --scopes=cloud-platform \
      "${{metadata_arg}}" \
      --metadata-from-file startup-script=startup_script/run.sh

  touch /tmp/{run_id}/vm_created

  # clean up intermediate install image
  if [[ "${{base_obj_type}}" == "images" ]] ; then ( set +e
    # This sometimes returns an API error but deletes the image despite the failure
    gcloud compute images delete -q {image_name}-install --project={project_id}
  ) fi

  echo 'Waiting for customization script to finish and VM shutdown.'
  execute_with_retries gcloud compute instances tail-serial-port-output {image_name}-install \
      --project={project_id} \
      --zone={zone} \
      --port=1 2>&1 \
      | grep 'startup-script' \
      | sed -e 's/ {image_name}-install.*startup-script://g' \
      | dd status=none bs=1 of={log_dir}/startup-script.log \
      || true
  echo 'Checking customization script result.'
  date
  if grep -q 'BuildFailed:' {log_dir}/startup-script.log; then
    echo -e "${{RED}}Customization script failed.${{NC}}"
    echo "See {log_dir}/startup-script.log for details"
    exit 1
  elif grep -q 'BuildSucceeded:' {log_dir}/startup-script.log; then
    echo -e "${{GREEN}}Customization script succeeded.${{NC}}"
  else
    echo 'Unable to determine the customization script result.'
    exit 1
  fi

  date
  echo 'Creating custom image.'
  execute_with_retries gcloud compute images create {image_name} \
    --project={project_id} \
    --source-disk-zone={zone} \
    --source-disk={image_name}-install \
    {storage_location_flag} \
    --family={family}

  touch /tmp/{run_id}/image_created
}}

trap exit_handler EXIT
mkdir -p {log_dir}
main "$@" 2>&1 | tee {log_dir}/workflow.log
"""

class Generator:
  """Shell script based image creation workflow generator.

  `generate` renders the module-level `_template` with the user supplied
  arguments plus the placeholders derived from them by `_init_args`.
  """

  def _init_args(self, args):
    """Derives the computed template placeholders from the user arguments.

    The dict is updated in place and is also kept as `self.args`.

    Args:
      args: dict of build arguments. Reads `image_name` (only when `run_id`
        is absent), `gcs_bucket`, `customization_script`, `extra_sources`,
        `subnetwork`, `network`, `service_account`, `no_external_ip`,
        `accelerator`, `storage_location`, `shutdown_timer_in_sec` and
        `metadata`.
    """
    self.args = args
    if "run_id" not in self.args:
      self.args["run_id"] = "custom-image-{image_name}-{timestamp}".format(
          timestamp=datetime.now().strftime("%Y%m%d-%H%M%S"), **self.args)
    self.args["bucket_name"] = self.args["gcs_bucket"].replace("gs://", "")
    self.args["custom_sources_path"] = (
        "gs://{bucket_name}/{run_id}/sources".format(**self.args))

    # Destination file name -> local path of every file uploaded to GCS.
    all_sources = {
        "run.sh": "startup_script/run.sh",
        "init_actions.sh": self.args["customization_script"]
    }
    all_sources.update(self.args["extra_sources"])

    # Rendered as two parallel bash arrays; embedded single quotes are
    # escaped so each element stays one single-quoted bash word.
    sources_map_items = tuple(enumerate(all_sources.items()))
    self.args["sources_map_k"] = " ".join([
        "[{}]='{}'".format(i, kv[0].replace("'", "'\\''"))
        for i, kv in sources_map_items])
    self.args["sources_map_v"] = " ".join([
        "[{}]='{}'".format(i, kv[1].replace("'", "'\\''"))
        for i, kv in sources_map_items])

    self.args["log_dir"] = "/tmp/{run_id}/logs".format(**self.args)
    self.args["gcs_log_dir"] = "gs://{bucket_name}/{run_id}/logs".format(
        **self.args)

    # A subnetwork takes precedence over a network; only one flag is emitted.
    if self.args["subnetwork"]:
      self.args["subnetwork_flag"] = "--subnet={subnetwork}".format(**self.args)
      self.args["network_flag"] = ""
    elif self.args["network"]:
      self.args["network_flag"] = "--network={network}".format(**self.args)
      self.args["subnetwork_flag"] = ""
    if self.args["service_account"]:
      self.args["service_account_flag"] = (
          "--service-account={service_account}".format(**self.args))
    # The template references these placeholders unconditionally, so make
    # sure they exist even when the matching option was left empty. Values
    # already present in the dict are kept.
    for flag in ("network_flag", "subnetwork_flag", "service_account_flag"):
      self.args.setdefault(flag, "")

    self.args["no_external_ip_flag"] = (
        "--no-address" if self.args["no_external_ip"] else "")
    self.args["accelerator_flag"] = (
        "--accelerator={accelerator} --maintenance-policy terminate".format(
            **self.args) if self.args["accelerator"] else "")
    self.args["storage_location_flag"] = (
        "--storage-location={storage_location}".format(**self.args)
        if self.args["storage_location"] else "")

    metadata_flag_template = (
        "--metadata=shutdown-timer-in-sec={shutdown_timer_in_sec},"
        "custom-sources-path={custom_sources_path}")
    if self.args["metadata"]:
      metadata_flag_template += ",{metadata}"
    self.args["metadata_flag"] = metadata_flag_template.format(**self.args)

  def generate(self, args):
    """Returns the workflow shell script rendered for `args`.

    `args` is completed in place by `_init_args` before rendering.
    """
    self._init_args(args)
    return _template.format(**args)
====== Filename: ./custom_image_utils/shell_script_generator.py ======
# Copyright 2019,2020,2024 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#            http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Shell script based image creation workflow generator.
"""

from datetime import datetime


# Bash workflow script template, rendered with `_template.format(**args)` in
# `Generator.generate`.
#
# * Single-brace names are str.format placeholders: run_id, image_name,
#   project_id, zone, log_dir, gcs_log_dir, sources_map_k, sources_map_v,
#   custom_sources_path, metadata_flag, trusted_cert, dataproc_base_image,
#   disk_size, storage_location_flag, family, network_flag, subnetwork_flag,
#   no_external_ip_flag, machine_type, accelerator_flag and
#   service_account_flag.
# * Doubled braces ({{ and }}) are emitted as literal bash braces.
# * This is not a raw string: `\\e` reaches bash as `\e`, `\n` becomes a real
#   newline, and a backslash at the end of a line joins that line with the
#   next one before bash sees it.
_template = """#!/usr/bin/env bash

# Script for creating Dataproc custom image.

set -euo pipefail

RED='\\e[0;31m'
GREEN='\\e[0;32m'
NC='\\e[0m'

base_obj_type="images"

function execute_with_retries() (
  set +x
  local -r cmd="$*"

  for ((i = 0; i < 3; i++)); do
    if eval "$cmd"; then return 0 ; fi
    sleep 12
  done
  return 1
)

function version_le(){{ [[ "$1" = "$(echo -e "$1\n$2"|sort -V|head -n1)" ]]; }}
function version_lt(){{ [[ "$1" = "$2" ]]&& return 1 || version_le "$1" "$2";}}

function prepare() {{
  # With the 402.0.0 release of gcloud sdk, `gcloud storage` can be
  # used as a more performant replacement for `gsutil`
  gsutil_cmd="gcloud storage"
  rsync_cmd="${{gsutil_cmd}} rsync"
  gcloud_sdk_version="$(gcloud --version | awk -F'SDK ' '/Google Cloud SDK/ {{print $2}}')"
  if version_lt "${{gcloud_sdk_version}}" "402.0.0" ; then
    gsutil_cmd="$(which gsutil) -o GSUtil:check_hashes=never"
    rsync_cmd="${{gsutil_cmd}} -m rsync"
  fi
}}

function exit_handler() {{
  echo 'Cleaning up before exiting.'

  if [[ -f /tmp/{run_id}/vm_created ]]; then ( set +e
    echo 'Deleting VM instance.'
    execute_with_retries \
      gcloud compute instances delete {image_name}-install --project={project_id} --zone={zone} -q
  ) elif [[ -f /tmp/{run_id}/disk_created ]]; then
    echo 'Deleting disk.'
    execute_with_retries gcloud compute ${{base_obj_type}} delete {image_name}-install --project={project_id} -q
  fi

  echo 'Uploading local logs to GCS bucket.'
  ${{rsync_cmd}} -r {log_dir}/ {gcs_log_dir}/

  if [[ -f /tmp/{run_id}/image_created ]]; then
    echo -e "${{GREEN}}Workflow succeeded${{NC}}, check logs at {log_dir}/ or {gcs_log_dir}/"
    exit 0
  else
    echo -e "${{RED}}Workflow failed${{NC}}, check logs at {log_dir}/ or {gcs_log_dir}/"
    exit 1
  fi
}}

function test_element_in_array {{
  local test_element="$1" ; shift
  local -a test_array=("$@")

  for item in "${{test_array[@]}}"; do
    if [[ "${{item}}" == "${{test_element}}" ]]; then return 0 ; fi
  done
  return 1
}}

function print_modulus_md5sum {{
  local derfile="$1"
  openssl x509 -noout -modulus -in "${{derfile}}" | openssl md5 | awk '{{print $2}}'
}}

function print_img_dbs_modulus_md5sums() {{
  local long_img_name="$1"
  local img_name="$(echo ${{long_img_name}} | sed -e 's:^.*/::')"
  local json_tmpfile="/tmp/{run_id}/${{img_name}}.json"
  gcloud compute images describe ${{long_img_name}} --format json > "${{json_tmpfile}}"

  local -a db_certs=()
  mapfile -t db_certs < <( cat ${{json_tmpfile}} | jq -r 'try .shieldedInstanceInitialState.dbs[].content' )

  local -a modulus_md5sums=()
  for key in "${{!db_certs[@]}}" ; do
    local derfile="/tmp/{run_id}/${{img_name}}.${{key}}.der"
    echo "${{db_certs[${{key}}]}}" | \
      perl -M'MIME::Base64(decode_base64url)' -ne 'chomp; print( decode_base64url($_) )' \
      > "${{derfile}}"
    modulus_md5sums+=( $(print_modulus_md5sum "${{derfile}}") )
  done

  echo "${{modulus_md5sums[@]}}"
}}

function main() {{
  echo 'Uploading files to GCS bucket.'
  declare -a sources_k=({sources_map_k})
  declare -a sources_v=({sources_map_v})
  for i in "${{!sources_k[@]}}"; do
    ${{gsutil_cmd}} cp "${{sources_v[i]}}" "{custom_sources_path}/${{sources_k[i]}}" > /dev/null 2>&1
  done

  local cert_args=""
  local num_src_certs="0"
  metadata_arg="{metadata_flag}"
  if [[ -n '{trusted_cert}' ]] && [[ -f '{trusted_cert}' ]]; then
    # build tls/ directory from variables defined near the header of
    # the examples/secure-boot/create-key-pair.sh file

    eval "$(bash examples/secure-boot/create-key-pair.sh)"
    metadata_arg="${{metadata_arg}},public_secret_name=${{public_secret_name}},private_secret_name=${{private_secret_name}},secret_project=${{secret_project}},secret_version=${{secret_version}}"

    # by default, a gcloud secret with the name of efi-db-pub-key-042 is
    # created in the current project to store the certificate installed
    # as the signature database file for this disk image

    # The MS UEFI CA is a reasonable base from which to build trust.  We
    # will trust code signed by this CA as well as code signed by
    # trusted_cert (tls/db.der)

    # The Microsoft Corporation UEFI CA 2011
    local -r MS_UEFI_CA="tls/MicCorUEFCA2011_2011-06-27.crt"
    test -f "${{MS_UEFI_CA}}" || \
      curl -L -o ${{MS_UEFI_CA}} 'https://go.microsoft.com/fwlink/p/?linkid=321194'

    local -a cert_list=()

    local -a default_cert_list
    default_cert_list=("{trusted_cert}" "${{MS_UEFI_CA}}")
    local -a src_img_modulus_md5sums=()

    mapfile -t src_img_modulus_md5sums < <(print_img_dbs_modulus_md5sums {dataproc_base_image})
    num_src_certs="${{#src_img_modulus_md5sums[@]}}"
    echo "debug - num_src_certs: [${{#src_img_modulus_md5sums[*]}}]"
    echo "value of src_img_modulus_md5sums: [${{src_img_modulus_md5sums}}]"
    if [[ -z "${{src_img_modulus_md5sums}}" ]]; then
      num_src_certs=0
      echo "no db certificates in source image"
      cert_list=( "${{default_cert_list[@]}}" )
    else
      echo "${{num_src_certs}} db certificates attached to source image"
      echo "db certs exist in source image"
      for cert in ${{default_cert_list[*]}}; do
        if test_element_in_array "$(print_modulus_md5sum ${{cert}})" ${{src_img_modulus_md5sums[@]}} ; then
          echo "cert ${{cert}} is already in source image's db list"
        else
          cert_list+=("${{cert}}")
        fi
      done
      # append source image's cert list
      local img_name="$(echo {dataproc_base_image} | sed -e 's:^.*/::')"
      if [[ ${{#cert_list[@]}} -ne 0 ]] && compgen -G "/tmp/{run_id}/${{img_name}}.*.der" > /dev/null ; then
        cert_list+=(/tmp/{run_id}/${{img_name}}.*.der)
      fi
    fi

    if [[ ${{#cert_list[@]}} -eq 0 ]]; then
      echo "all certificates already included in source image's db list"
    else
      cert_args="--signature-database-file=$(IFS=, ; echo "${{cert_list[*]}}") --guest-os-features=UEFI_COMPATIBLE"
    fi
  fi

  date

  if [[ -z "${{cert_args}}" && "${{num_src_certs}}" -ne "0" ]]; then
    echo 'Re-using base image'
    base_obj_type="reuse"
    instance_disk_args='--image-project={project_id} --image={dataproc_base_image} --boot-disk-size={disk_size}G --boot-disk-type=pd-ssd'

  elif [[ -n "${{cert_args}}" ]] ; then
    echo 'Creating image.'
    base_obj_type="images"
    instance_disk_args='--image-project={project_id} --image={image_name}-install --boot-disk-size={disk_size}G --boot-disk-type=pd-ssd'
    execute_with_retries \
      gcloud compute images create {image_name}-install \
      --project={project_id} \
      --source-image={dataproc_base_image} \
      ${{cert_args}} \
      {storage_location_flag} \
      --family={family}
    touch "/tmp/{run_id}/disk_created"
  else
    echo 'Creating disk.'
    base_obj_type="disks"
    instance_disk_args='--disk=auto-delete=yes,boot=yes,mode=rw,name={image_name}-install'
    execute_with_retries gcloud compute disks create {image_name}-install \
      --project={project_id} \
      --zone={zone} \
      --image={dataproc_base_image} \
      --type=pd-ssd \
      --size={disk_size}GB
    touch "/tmp/{run_id}/disk_created"
  fi

  date
  echo 'Creating VM instance to run customization script.'
  execute_with_retries gcloud compute instances create {image_name}-install \
      --project={project_id} \
      --zone={zone} \
      {network_flag} \
      {subnetwork_flag} \
      {no_external_ip_flag} \
      --machine-type={machine_type} \
      ${{instance_disk_args}} \
      {accelerator_flag} \
      {service_account_flag} \
      --scopes=cloud-platform \
      "${{metadata_arg}}" \
      --metadata-from-file startup-script=startup_script/run.sh

  touch /tmp/{run_id}/vm_created

  # clean up intermediate install image
  if [[ "${{base_obj_type}}" == "images" ]] ; then ( set +e
    # This sometimes returns an API error but deletes the image despite the failure
    gcloud compute images delete -q {image_name}-install --project={project_id}
  ) fi

  echo "Monitor startup logs in {log_dir}/startup-script.log"
  echo 'Waiting for customization script to finish and VM shutdown.'
  set -x
  # too many serial port output requests per minute occur if they all occur at once
  sleep $(( ( RANDOM % 60 ) + 20 ))

  gcloud compute instances describe --format json {image_name}-install --zone {zone} | tee {log_dir}/instance.json

  execute_with_retries gcloud compute instances tail-serial-port-output {image_name}-install \
      --project={project_id} \
      --zone={zone} \
      --port=1 2>&1 \
      | grep 'startup-script' | grep -v '^\\[' \
      | sed -e 's/ {image_name}-install.*startup-script://g' \
      | dd bs=1 status=none of={log_dir}/startup-script.log \
      || true
  echo 'Checking customization script result.'
  date
  if grep -q 'BuildSucceeded:' {log_dir}/startup-script.log; then
    echo -e "${{GREEN}}Customization script succeeded.${{NC}}"
  else
    echo -e "${{RED}}Customization script failed.${{NC}}"
    echo "See {log_dir}/startup-script.log for details"
    exit 1
  fi

  date
  echo 'Creating custom image.'
  execute_with_retries gcloud compute images create {image_name} \
    --project={project_id} \
    --source-disk-zone={zone} \
    --source-disk={image_name}-install \
    {storage_location_flag} \
    --family={family}

  touch /tmp/{run_id}/image_created
}}

prepare
trap exit_handler EXIT
mkdir -p {log_dir}
main "$@" 2>&1 | tee {log_dir}/workflow.log
"""

class Generator:
  """Shell script based image creation workflow generator.

  `generate` renders the module-level `_template` with the user supplied
  arguments plus the placeholders derived from them by `_init_args`.
  """

  def _init_args(self, args):
    """Derives the computed template placeholders from the user arguments.

    The dict is updated in place and is also kept as `self.args`.

    Args:
      args: dict of build arguments. Reads `image_name` (only when `run_id`
        is absent), `gcs_bucket`, `customization_script`, `extra_sources`,
        `subnetwork`, `network`, `service_account`, `no_external_ip`,
        `accelerator`, `storage_location`, `shutdown_timer_in_sec` and
        `metadata`; `zone`, `optional_components` and `dataproc_version`
        are optional and skipped when missing or empty.
    """
    self.args = args
    if "run_id" not in self.args:
      self.args["run_id"] = "custom-image-{image_name}-{timestamp}".format(
          timestamp=datetime.now().strftime("%Y%m%d-%H%M%S"), **self.args)
    self.args["bucket_name"] = self.args["gcs_bucket"].replace("gs://", "")
    self.args["custom_sources_path"] = (
        "gs://{bucket_name}/{run_id}/sources".format(**self.args))

    # Destination file name -> local path of every file uploaded to GCS.
    all_sources = {
        "run.sh": "startup_script/run.sh",
        "init_actions.sh": self.args["customization_script"]
    }
    all_sources.update(self.args["extra_sources"])

    # Rendered as two parallel bash arrays; embedded single quotes are
    # escaped so each element stays one single-quoted bash word.
    sources_map_items = tuple(enumerate(all_sources.items()))
    self.args["sources_map_k"] = " ".join([
        "[{}]='{}'".format(i, kv[0].replace("'", "'\\''"))
        for i, kv in sources_map_items])
    self.args["sources_map_v"] = " ".join([
        "[{}]='{}'".format(i, kv[1].replace("'", "'\\''"))
        for i, kv in sources_map_items])

    self.args["log_dir"] = "/tmp/{run_id}/logs".format(**self.args)
    self.args["gcs_log_dir"] = "gs://{bucket_name}/{run_id}/logs".format(
        **self.args)

    # A subnetwork takes precedence over a network; only one flag is emitted.
    if self.args["subnetwork"]:
      self.args["subnetwork_flag"] = "--subnet={subnetwork}".format(**self.args)
      self.args["network_flag"] = ""
    elif self.args["network"]:
      self.args["network_flag"] = "--network={network}".format(**self.args)
      self.args["subnetwork_flag"] = ""
    if self.args["service_account"]:
      self.args["service_account_flag"] = (
          "--service-account={service_account}".format(**self.args))
    # The template references these placeholders unconditionally, so make
    # sure they exist even when the matching option was left empty. Values
    # already present in the dict are kept.
    for flag in ("network_flag", "subnetwork_flag", "service_account_flag"):
      self.args.setdefault(flag, "")

    self.args["no_external_ip_flag"] = (
        "--no-address" if self.args["no_external_ip"] else "")
    self.args["accelerator_flag"] = (
        "--accelerator={accelerator} --maintenance-policy terminate".format(
            **self.args) if self.args["accelerator"] else "")
    self.args["storage_location_flag"] = (
        "--storage-location={storage_location}".format(**self.args)
        if self.args["storage_location"] else "")

    metadata_flag_template = (
        "--metadata=shutdown-timer-in-sec={shutdown_timer_in_sec},"
        "custom-sources-path={custom_sources_path}"
    )
    zone = self.args.get("zone")
    if zone:
      # The region is the zone name without its last dash-separated part.
      region = "-".join(zone.split("-")[:-1])
      metadata_flag_template += ',dataproc-region="{}"'.format(region)
    optional_components_arg = self.args.get("optional_components")
    if optional_components_arg:
      optional_components = optional_components_arg.split(',')
      # convert to component names used inside image and join to set as
      # metadata value
      optional_image_components = '.'.join(
          self._get_optional_to_image_components(optional_components))
      metadata_flag_template += ',optional-components="{}"'.format(
          optional_image_components)
    dataproc_version = self.args.get("dataproc_version")
    if dataproc_version:
      metadata_flag_template += ',dataproc_dataproc_version="{}"'.format(
          dataproc_version)
    if self.args["metadata"]:
      metadata_flag_template += ",{metadata}"
    self.args["metadata_flag"] = metadata_flag_template.format(**self.args)

  def _get_optional_to_image_components(self, optional_components):
    """Get the equivalent component names in the image for user provided optional components.

    Args:
      optional_components: iterable of user facing component names.

    Returns:
      A list of the same length and order; names without an entry in the
      mapping are passed through unchanged.
    """
    # Add new component here, if component name inside image scripts is different.
    optional_to_image_component_map = {
      "DOCKER": "DOCKER-CE",
      "HIVE_WEBHCAT": "HIVE-WEBHCAT-SERVER",
      "SOLR": "SOLR-SERVER",
    }
    return [
        optional_to_image_component_map.get(component, component)
        for component in optional_components
    ]

  def generate(self, args):
    """Returns the workflow shell script rendered for `args`.

    `args` is completed in place by `_init_args` before rendering.
    """
    self._init_args(args)
    return _template.format(**args)
====== Filename: ./custom_image_utils/args_inferer.py ======
# Copyright 2019 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#            http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Infer arguments for Dataproc custom image build.
"""

import logging
import os
import re
import subprocess
import tempfile

# Partial image URI, filled with (project, image name).
_IMAGE_PATH = "projects/{}/global/images/{}"
# Full or partial image URI. Group 1 is the optional
# https://www.googleapis.com/compute/<segment>/ prefix, group 2 that path
# segment (presumably the API version), group 3 the project and group 4 the
# image name.
_IMAGE_URI = re.compile(
    r"^(https://www\.googleapis\.com/compute/([^/]+)/)?projects/([^/]+)/global/images/([^/]+)$"
)
# Partial image family URI, filled with (project, family name).
_IMAGE_FAMILY_PATH = "projects/{}/global/images/family/{}"
# Same layout as _IMAGE_URI, but for .../global/images/family/<family>;
# group 3 is the project and group 4 the family name.
_IMAGE_FAMILY_URI = re.compile(
    r"^(https://www\.googleapis\.com/compute/([^/]+)/)?projects/([^/]+)/global/images/family/([^/]+)$"
)
# Module logger; only warnings and above are emitted.
logging.basicConfig()
_LOG = logging.getLogger(__name__)
_LOG.setLevel(logging.WARN)


def _get_project_id():
  """Get project id from gcloud config."""
  gcloud_command = ["gcloud", "config", "get-value", "project"]
  with tempfile.NamedTemporaryFile() as temp_file:
    pipe = subprocess.Popen(gcloud_command, stdout=temp_file)
    pipe.wait()
    if pipe.returncode != 0:
      raise RuntimeError("Cannot find gcloud project ID. "
                         "Please setup the project ID in gcloud SDK")
    # get project id
    temp_file.seek(0)
    stdout = temp_file.read()
    return stdout.decode('utf-8').strip()


def _extract_image_name_and_project(image_uri):
  """Get Dataproc image name and project."""
  m = _IMAGE_URI.match(image_uri)
  return m.group(3), m.group(4)  # project, image_name


def _extract_image_name_and_project_from_family_uri(image_uri):
  """Get Dataproc image family name and project."""
  m = _IMAGE_FAMILY_URI.match(image_uri)
  return m.group(3), m.group(4)  # project, image_name


def _get_dataproc_image_version(image_uri):
  """Get Dataproc image version from image URI."""
  project, image_name = _extract_image_name_and_project(image_uri)
  command = [
      "gcloud", "compute", "images", "describe", image_name, "--project",
      project, "--format=value(labels.goog-dataproc-version)"
  ]

  # get stdout from compute images list --filters
  with tempfile.NamedTemporaryFile() as temp_file:
    pipe = subprocess.Popen(command, stdout=temp_file)
    pipe.wait()
    if pipe.returncode != 0:
      raise RuntimeError(
          "Cannot find dataproc base image, please check and verify "
          "the base image URI.")

    temp_file.seek(0)  # go to start of the stdout
    stdout = temp_file.read()
    # parse the first ready image with the dataproc version attached in labels
    if stdout:
      parsed_line = stdout.decode('utf-8').strip()  # should be just one value
      return parsed_line

  raise RuntimeError("Cannot find dataproc base image: %s", image_uri)


def _get_dataproc_version_from_image_family(image_family_uri):
  """Get Dataproc image family version from family name."""
  project, image_family_name = _extract_image_name_and_project_from_family_uri(image_family_uri)
  command = [
      "gcloud", "compute", "images", "describe-from-family", image_family_name, "--project",
      project, "--format=value(labels.goog-dataproc-version)"
  ]

  # get stdout from compute images list --filters
  with tempfile.NamedTemporaryFile() as temp_file:
    pipe = subprocess.Popen(command, stdout=temp_file)
    pipe.wait()
    if pipe.returncode != 0:
      raise RuntimeError(
          "Cannot find dataproc base family image, please check and verify "
          "the family URI.")

    temp_file.seek(0)  # go to start of the stdout
    stdout = temp_file.read()
    # parse the first ready image with the dataproc version attached in labels
    if stdout:
      dataproc_version = stdout.decode('utf-8').strip()  # should be just one value
      return dataproc_version

  raise RuntimeError("Cannot find dataproc base image family: %s" %
                     image_family_uri)

def _extract_image_path(image_uri):
  """Get the partial image URI from the full image URI."""
  project, image_name = _extract_image_name_and_project(image_uri)
  return _IMAGE_PATH.format(project, image_name)

def _extract_image_family_path(image_family_uri):
  """Get the partial image family URI from the full image family URI."""
  project, image_name = _extract_image_name_and_project_from_family_uri(image_family_uri)
  return _IMAGE_FAMILY_PATH.format(project, image_name)

def _get_dataproc_image_path_by_version(version):
  """Find the newest released Dataproc base image for a Dataproc version.

  Lists READY, non-EAP images in the `cloud-dataproc` project whose
  `goog-dataproc-version` label matches `version`, newest first, and picks the
  first release image (name starting with `dataproc-<major>-<minor>`).

  Args:
    version: Dataproc version string, already validated by the argument
      parser. Either a full version (e.g. `1.5.35-debian10`) or a minor
      version with OS suffix (e.g. `1.5-debian10`), in which case any
      sub-minor version matches.

  Returns:
    A tuple `(image_path, image_version)`: the image path built from
    _IMAGE_PATH and the value of the chosen image's version label.

  Raises:
    RuntimeError: if gcloud fails, if no release image matches, or if the
      newest matching version maps to more than one image.
  """
  # version regex already checked in arg parser
  parsed_version = version.split(".")
  major_version = parsed_version[0]
  if len(parsed_version) == 2:
    # The input version must be of format 1.5-debian10 in which case we need to
    # expand it to 1-5-\d+-debian10 so we can do a regexp on the minor version
    minor_version = parsed_version[1].split("-")[0]
    # Raw string: "\d" is not a valid escape sequence in a normal literal.
    parsed_version[1] = parsed_version[1].replace("-", r"-\d+-")
    filter_arg = ("labels.goog-dataproc-version ~ ^{}-{} AND NOT name ~ -eap$"
                  " AND status = READY").format(parsed_version[0],
                                                parsed_version[1])
  else:
    minor_version = parsed_version[1]
    # Push the filter of READY status and name not ending in 'eap' to the
    # gcloud command so we don't have to iterate the list for those.
    filter_arg = ("labels.goog-dataproc-version = {}-{}-{} AND NOT name ~ -eap$"
                  " AND status = READY").format(parsed_version[0],
                                                parsed_version[1],
                                                parsed_version[2])
  command = [
    "gcloud", "compute", "images", "list", "--project", "cloud-dataproc",
    "--filter", filter_arg, "--format",
    "csv[no-heading=true](name,labels.goog-dataproc-version)",
    "--sort-by=~creationTimestamp"
  ]

  _LOG.info("Executing command: {}".format(command))
  # capture stdout of `gcloud compute images list` through a temporary file
  with tempfile.NamedTemporaryFile() as temp_file:
    pipe = subprocess.Popen(command, stdout=temp_file)
    pipe.wait()
    if pipe.returncode != 0:
      raise RuntimeError(
        "Cannot find dataproc base image, please check and verify "
        "[--dataproc-version]")

    temp_file.seek(0)  # go to start of the stdout
    stdout = temp_file.read()
    if stdout:
      # in case there are multiple images
      parsed_lines = stdout.decode('utf-8').strip().split('\n')
      expected_prefix = "dataproc-{}-{}".format(major_version, minor_version)
      _LOG.info("Filtering images : %s", expected_prefix)
      # Versions in the order gcloud returned them (newest image first).
      image_versions = []
      all_images_for_version = {}
      for line in parsed_lines:
        parsed_image = line.split(",")
        if len(parsed_image) != 2:
          continue
        parsed_image_name, parsed_image_version = parsed_image
        if not parsed_image_name.startswith(expected_prefix):
          # Not a regular dataproc release image. Maybe a custom image with
          # same label.
          _LOG.info("Skipping non-release image %s", parsed_image_name)
          continue
        if parsed_image_version not in all_images_for_version:
          all_images_for_version[parsed_image_version] = []
          image_versions.append(parsed_image_version)
        all_images_for_version[parsed_image_version].append(
            _IMAGE_PATH.format("cloud-dataproc", parsed_image_name))

      _LOG.info("All Images : %s", all_images_for_version)
      _LOG.info("All Image-Versions : %s", image_versions)

      # Every listed image may have been skipped above; in that case fall
      # through to the "cannot find" error instead of indexing an empty list.
      if image_versions:
        latest_available_version = image_versions[0]
        latest_images = all_images_for_version[latest_available_version]
        if len(latest_images) > 1:
          raise RuntimeError(
            "Found more than one images for latest dataproc-version={}. Images: {}".format(
              latest_available_version,
              str(latest_images)))

        _LOG.info("Choosing image %s with version %s", latest_images[0],
                  latest_available_version)
        return latest_images[0], latest_available_version

  raise RuntimeError(
    "Cannot find dataproc base image with dataproc-version=%s." % version)


def _infer_project_id(args):
  if not args.project_id:
    args.project_id = _get_project_id()


def _infer_base_image(args):
  """Fill in args.dataproc_base_image and args.dataproc_version.

  The source is picked in this order: args.base_image_uri, then
  args.dataproc_version, then args.base_image_family.

  Raises:
    RuntimeError: if none of the three is set (plus whatever the lookup
      helpers raise).
  """
  _LOG.info("Getting Dataproc base image name...")
  if args.base_image_uri:
    # Explicit image: use it and read the version from it.
    args.dataproc_base_image = _extract_image_path(args.base_image_uri)
    args.dataproc_version = _get_dataproc_image_version(args.base_image_uri)
  elif args.dataproc_version:
    # Version only: resolve it to an image path and the image's version label.
    args.dataproc_base_image, args.dataproc_version = _get_dataproc_image_path_by_version(
        args.dataproc_version)
  elif args.base_image_family:
    # Image family: use the family path and read the version from the family.
    args.dataproc_base_image = _extract_image_family_path(args.base_image_family)
    args.dataproc_version = _get_dataproc_version_from_image_family(args.base_image_family)
  else:
    # The message names the flags as the argument parser defines them.
    raise RuntimeError(
        "Neither --dataproc-version nor --base-image-uri nor --base-image-family is specified.")
  _LOG.info("Returned Dataproc base image: %s", args.dataproc_base_image)
  _LOG.info("Returned Dataproc version   : %s", args.dataproc_version)


def _infer_oauth(args):
  if args.oauth:
    args.oauth = "\n    \"OAuthPath\": \"{}\",".format(
        os.path.abspath(args.oauth))
  else:
    args.oauth = ""


def _infer_network(args):
  # When the user wants to create a VM in a shared VPC,
  # only the subnetwork argument has to be provided whereas
  # the network one has to be left empty.
  if not args.network and not args.subnetwork:
    args.network = 'global/networks/default'
  # The --network flag requires format global/networks/<network>,
  # which does not work for gcloud, here we convert it to
  # projects/<project>/global/networks/<network>.
  if args.network.startswith('global/networks/'):
    args.network = 'projects/{}/{}'.format(args.project_id, args.network)


def infer_args(args):
  """Run every inference step on `args`, mutating it in place.

  Steps run in a fixed order: project id, base image, oauth, network. Finally
  the shutdown timer is copied to args.shutdown_timer_in_sec.
  """
  inference_steps = (
      _infer_project_id,
      _infer_base_image,
      _infer_oauth,
      _infer_network,
  )
  for step in inference_steps:
    step(args)
  args.shutdown_timer_in_sec = args.shutdown_instance_timer_sec
====== Filename: ./custom_image_utils/image_labeller.py ======
# Copyright 2019 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#            http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Add label to Dataproc custom images.
"""

import logging
import subprocess

# Module-level logger setup: default logging configuration plus a logger named
# after this module.
logging.basicConfig()
_LOG = logging.getLogger(__name__)
# The threshold is WARN, so the _LOG.info(...) calls in this module are
# filtered out unless the level is changed elsewhere.
_LOG.setLevel(logging.WARN)


def _set_custom_image_label(image_name, version, project_id):
  """Add the `goog-dataproc-version` label to an image via gcloud.

  Args:
    image_name: name of the image to label.
    version: Dataproc version string, e.g. `1.5.0-RC1-debian9`.
    project_id: project passed to gcloud's --project flag.

  Raises:
    RuntimeError: if the gcloud command exits with a non-zero status.
  """
  # Label form of the version: `1.5.0-RC1-debian9` -> `1-5-0-rc1-debian9`
  label_value = version.lower().replace('.', '-')
  add_labels_command = [
      "gcloud", "compute", "images", "add-labels", image_name,
      "--project", project_id,
  ]
  add_labels_command.append(
      "--labels=goog-dataproc-version={}".format(label_value))
  _LOG.info("Running: {}".format(" ".join(add_labels_command)))

  # Run gcloud and wait for it; its output is not captured.
  process = subprocess.Popen(add_labels_command)
  process.wait()
  if process.returncode != 0:
    raise RuntimeError("Cannot set dataproc version to image label.")


def add_label(args):
  """Label the custom image with its Dataproc version, unless in dry-run mode.

  Reads args.dry_run, args.image_name, args.dataproc_version and
  args.project_id.
  """
  if args.dry_run:
    _LOG.info("Skip setting label on custom image (dry run).")
    return

  _LOG.info("Setting label on custom image...")
  _set_custom_image_label(
      args.image_name, args.dataproc_version, args.project_id)
  _LOG.info("Successfully set label on custom image...")
====== Filename: ./custom_image_utils/args_parser.py.rej ======
--- custom_image_utils/args_parser.py
+++ custom_image_utils/args_parser.py
@@ -238,6 +246,14 @@ def parse_args(args):
       default="tls/db.der",
       help="""(Optional) Inserts the specified DER-format certificate into
       the custom image's EFI boot sector for use with secure boot.""")
+  parser.add_argument(
+      "--optional-components",
+      type=_validate_components,
+      required=False,
+      help="""Optional Components to be installed with the image. 
+      Can be a comma-separated list of components, e.g., TRINO,ZEPPELIN.
+      (Only supported for Dataproc Images 2.3 and above)"""
+  )
 
 
   return parser.parse_args(args)
====== Filename: ./custom_image_utils/shell_script_executor.py ======
# Copyright 2019 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#            http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Shell script executor.
"""

import os
import subprocess
import sys
import tempfile


def run(shell_script):
  """Runs a Shell script with bash and waits for it to finish.

  The script text is written to a temporary file, executed with `bash`, and
  the file is removed afterwards whether or not the run succeeded.

  Args:
    shell_script: the script source as text; it is encoded as UTF-8.

  Raises:
    RuntimeError: if bash exits with a non-zero status.
  """
  # delete=False: the file must outlive its handle so bash can read it.
  temp_file = tempfile.NamedTemporaryFile(delete=False)
  try:
    # The context manager flushes and closes the handle even if the write
    # fails, so the handle is never leaked and the removal below can succeed.
    with temp_file:
      temp_file.write(shell_script.encode("utf-8"))

    # Run the shell script from the temp file, then wait for it to complete.
    # The child writes straight to this process's stdout/stderr.
    pipe = subprocess.Popen(
        ['bash', temp_file.name],
        stdout=sys.stdout,
        stderr=sys.stderr
    )
    pipe.wait()
    if pipe.returncode != 0:
      raise RuntimeError("Error building custom image.")
  finally:
    # Best-effort cleanup; a failure to remove must not mask the real error.
    try:
      os.remove(temp_file.name)
    except OSError:
      pass
====== Filename: ./custom_image_utils/args_parser.py ======
# Copyright 2019,2020,2021,2024 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#            http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This is a utility module which defines and parses the command-line arguments
for the generate_custom_image.py script.
"""

import argparse
import json
import re

from custom_image_utils import constants


# Full Dataproc version, optionally followed by an RC tag and an OS suffix.
# Old style images: 1.2.3
# New style images: 1.2.3-deb8, 1.2.3-debian9, 1.2.3-RC10-debian9
_VERSION_REGEX = re.compile(r"^\d+\.\d+\.\d+(-RC\d+)?(-[a-z]+\d+)?$")
# Image URI: an optional https://www.googleapis.com/compute/<segment>/ prefix
# followed by projects/<project>/global/images/<image>.
_FULL_IMAGE_URI = re.compile(r"^(https://www\.googleapis\.com/compute/([^/]+)/)?projects/([^/]+)/global/images/([^/]+)$")
# Image family URI: same shape, ending in global/images/family/<family>.
_FULL_IMAGE_FAMILY_URI = re.compile(r"^(https://www\.googleapis\.com/compute/([^/]+)/)?projects/([^/]+)/global/images/family/([^/]+)$")
# Minor version plus OS suffix, e.g. 1.5-debian10; the OS name must be
# debian, ubuntu or rocky followed by digits.
_LATEST_FROM_MINOR_VERSION = re.compile(r"^(\d+)\.(\d+)-((?:debian|ubuntu|rocky)\d+)$")
# Component names that _validate_components accepts.
_VALID_OPTIONAL_COMPONENTS = ["HIVE_WEBHCAT", "ZEPPELIN", "TRINO", "RANGER", "SOLR", "FLINK", "DOCKER", "HUDI", "ICEBERG", "PIG"]

def _version_regex_type(s):
  """argparse type: accept `s` if it is a full or a minor-version string.

  Returns `s` unchanged when it matches _VERSION_REGEX or
  _LATEST_FROM_MINOR_VERSION; raises argparse.ArgumentTypeError otherwise.
  """
  if _VERSION_REGEX.match(s) or _LATEST_FROM_MINOR_VERSION.match(s):
    return s
  raise argparse.ArgumentTypeError("Invalid version: {}.".format(s))

def _full_image_uri_regex_type(s):
  """argparse type: accept `s` if it matches _FULL_IMAGE_URI.

  Returns `s` unchanged on a match; raises argparse.ArgumentTypeError
  otherwise.
  """
  if _FULL_IMAGE_URI.match(s):
    return s
  raise argparse.ArgumentTypeError("Invalid image URI: {}.".format(s))

def _full_image_family_uri_regex_type(s):
  """argparse type: accept `s` if it matches _FULL_IMAGE_FAMILY_URI.

  Returns `s` unchanged on a match; raises argparse.ArgumentTypeError
  otherwise.
  """
  if _FULL_IMAGE_FAMILY_URI.match(s):
    return s
  raise argparse.ArgumentTypeError("Invalid image family URI: {}.".format(s))

def _validate_components(optional_components):
  """argparse type: check a comma-separated list of optional components.

  Args:
    optional_components: component names joined by commas, e.g.
      `TRINO,ZEPPELIN`. Names are compared exactly as written.

  Returns:
    The input string unchanged when every name is in
    _VALID_OPTIONAL_COMPONENTS.

  Raises:
    argparse.ArgumentTypeError: listing the rejected names and the accepted
      ones, so the user can see what to correct.
  """
  rejected = [
      name for name in optional_components.split(',')
      if name not in _VALID_OPTIONAL_COMPONENTS
  ]
  if rejected:
    raise argparse.ArgumentTypeError(
        "Invalid optional component selected: {}. Valid components: {}.".format(
            ", ".join("'{}'".format(name) for name in rejected),
            ", ".join(_VALID_OPTIONAL_COMPONENTS)))
  return optional_components

def parse_args(args):
  """Parses command-line arguments.

  Args:
    args: list of command-line argument strings handed to argparse.

  Returns:
    The argparse.Namespace produced by `parser.parse_args(args)`.
  """
  parser = argparse.ArgumentParser()
  required_args = parser.add_argument_group("required named arguments")
  required_args.add_argument(
      "--image-name",
      type=str,
      required=True,
      help="""The image name for the Dataproc custom image.""")
  # At most one of the three base-image sources may be given. The group itself
  # is not marked required, so the parser also accepts none of them.
  image_args = required_args.add_mutually_exclusive_group()
  image_args.add_argument(
      "--dataproc-version",
      type=_version_regex_type,
      help=constants.version_help_text)
  image_args.add_argument(
      "--base-image-uri",
      type=_full_image_uri_regex_type,
      help="""The full image URI for the base Dataproc image. The
      customization script will be executed on top of this image instead of
      an out-of-the-box Dataproc image. This image must be a valid Dataproc
      image.
      """)
  image_args.add_argument(
      "--base-image-family",
      type=_full_image_family_uri_regex_type,
      help="""The source image family URI. The latest non-deprecated image associated with the family will be used.
      """)
  required_args.add_argument(
      "--customization-script",
      type=str,
      required=True,
      help="""User's script to install custom packages.""")
  required_args.add_argument(
      "--metadata",
      type=str,
      required=False,
      help="""VM metadata which can be read by the customization script
      with `/usr/share/google/get_metadata_value attributes/<key>` at runtime.
      The value of this flag takes the form of `key1=value1,key2=value2,...`.
      If the value includes special characters (e.g., `=`, `,` or spaces) which
      needs to be escaped, consider encoding the value, then decode it back in
      the customization script. See more information about VM metadata
      on https://cloud.google.com/sdk/gcloud/reference/compute/instances/create.
      """)
  required_args.add_argument(
      "--zone",
      type=str,
      required=True,
      help="""GCE zone used to build the custom image.""")
  required_args.add_argument(
      "--gcs-bucket",
      type=str,
      required=True,
      help="""GCS bucket used to store files and logs when
      building custom image.""")
  parser.add_argument(
      "--family",
      type=str,
      required=False,
      default='dataproc-custom-image',
      help="""(Optional) The family of the image.""")
  parser.add_argument(
      "--project-id",
      type=str,
      required=False,
      help="""The project Id of the project where the custom image will be
      created and saved. The default value will be set to the project id
      specified by `gcloud config get-value project`.""")
  parser.add_argument(
      "--oauth",
      type=str,
      required=False,
      help="""A local path to JSON credentials for your GCE project.
      The default oauth is the application-default credentials from gcloud.""")
  parser.add_argument(
      "--machine-type",
      type=str,
      required=False,
      default="n1-standard-1",
      help="""(Optional) Machine type used to build custom image.
      Default machine type is n1-standard-1.""")
  parser.add_argument(
      "--no-smoke-test",
      action="store_true",
      help="""(Optional) Disables smoke test to verify if the custom image
      can create a functional Dataproc cluster.""")
  parser.add_argument(
      "--network",
      type=str,
      required=False,
      default="",
      help="""(Optional) Network interface used to launch the VM instance that
      builds the custom image. Default network is 'global/networks/default'
      when no network and subnetwork arguments are provided.
      If the default network does not exist in your project, please specify
      a valid network interface.""")
  parser.add_argument(
      "--subnetwork",
      type=str,
      required=False,
      default="",
      help="""(Optional) The subnetwork that is used to launch the VM instance
      that builds the custom image. A full subnetwork URL is required.
      Default subnetwork is None. For shared VPC only provide this parameter and
      do not use the --network argument.""")
  parser.add_argument(
      "--no-external-ip",
      action="store_true",
      help="""(Optional) Disables external IP for the image build VM. The VM
      will not be able to access the internet, but if Private Google
      Access is enabled for the subnetwork, it can still access Google services
      (e.g., GCS) through internal IP of the VPC.""")
  parser.add_argument(
      "--service-account",
      type=str,
      required=False,
      default="default",
      help=
      """(Optional) The service account that is used to launch the VM instance
      that builds the custom image. If not specified, the default service
      account under the GCE project will be used. The scope of this service
      account is defaulted to /auth/cloud-platform.""")
  parser.add_argument(
      "--extra-sources",
      type=json.loads,
      required=False,
      default={},
      help=
      """(Optional) Additional files/directories uploaded along with
      customization script. This argument is evaluated to a json dictionary.
      For example:
      '--extra-sources "{\\"notes.txt\\": \\"/path/to/notes.txt\\"}"'
      """)
  # The help text states the same default as the `default=` value below.
  parser.add_argument(
      "--disk-size",
      type=int,
      required=False,
      default=30,
      help=
      """(Optional) The size in GB of the disk attached to the VM instance
      that builds the custom image. If not specified, the default value of
      30 GB will be used.""")
  parser.add_argument(
      "--accelerator",
      type=str,
      required=False,
      default=None,
      help=
      """(Optional) The accelerators (e.g. GPUs) attached to the VM instance
      that builds the custom image. If not specified, no accelerators are
      attached.""")
  parser.add_argument(
      "--storage-location",
      type=str,
      required=False,
      default=None,
      help=
      """(Optional) The storage location (e.g. US, us-central1) of the custom
      GCE image. If not specified, the default GCE image storage location is
      used.""")
  parser.add_argument(
      "--shutdown-instance-timer-sec",
      type=int,
      required=False,
      default=300,
      help=
      """(Optional) The time to wait in seconds before shutting down the VM
      instance. This value may need to be increased if your init script
      generates a lot of output on stdout. If not specified, the default value
      of 300 seconds will be used.""")
  parser.add_argument(
      "--dry-run",
      action="store_true",
      help="""(Optional) Only generates script without creating image.""")
  parser.add_argument(
      "--trusted-cert",
      type=str,
      required=False,
      default="tls/db.der",
      help="""(Optional) Inserts the specified DER-format certificate into
      the custom image's EFI boot sector for use with secure boot.""")
  parser.add_argument(
      "--optional-components",
      type=_validate_components,
      required=False,
      help="""Optional Components to be installed with the image.
      Can be a comma-separated list of components, e.g., TRINO,ZEPPELIN.
      (Only supported for Dataproc Images 2.3 and above)"""
  )

  return parser.parse_args(args)
====== Filename: ./custom_image_utils/shell_image_creator.py ======
# Copyright 2019 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#            http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Shell script based custom image creator.
"""

import logging

from custom_image_utils import shell_script_executor
from custom_image_utils import shell_script_generator

# Module-level logger setup: default logging configuration plus a logger named
# after this module.
logging.basicConfig()
_LOG = logging.getLogger(__name__)
# The threshold is WARN, so the _LOG.info(...) calls in this module are
# filtered out unless the level is changed elsewhere.
_LOG.setLevel(logging.WARN)


def create(args):
  """Generate the image-build shell script and, unless dry-run, execute it.

  The script is produced by shell_script_generator from `vars(args)`, logged
  between two banner lines, then handed to shell_script_executor.run.
  """
  banner = "#" * 60

  # Generate Shell script.
  _LOG.info("Generating Shell script...")
  generator = shell_script_generator.Generator()
  script = generator.generate(vars(args))
  for message in (banner, script, banner,
                  "Successfully generated Shell script..."):
    _LOG.info(message)

  # Run the script to build custom image.
  if args.dry_run:
    _LOG.info("Skip creating custom image (dry run).")
    return
  _LOG.info("Creating custom image...")
  shell_script_executor.run(script)
  _LOG.info("Successfully created custom image...")
====== Filename: ./custom_image_utils/args_parser.py.orig ======
# Copyright 2019,2020,2021,2024 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#            http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This is a utility module which defines and parses the command-line arguments
for the generate_custom_image.py script.
"""

import argparse
import json
import re

from custom_image_utils import constants


# Full Dataproc version, optionally followed by an RC tag and an OS suffix.
# Old style images: 1.2.3
# New style images: 1.2.3-deb8, 1.2.3-debian9, 1.2.3-RC10-debian9
_VERSION_REGEX = re.compile(r"^\d+\.\d+\.\d+(-RC\d+)?(-[a-z]+\d+)?$")
# Image URI: an optional https://www.googleapis.com/compute/<segment>/ prefix
# followed by projects/<project>/global/images/<image>.
_FULL_IMAGE_URI = re.compile(r"^(https://www\.googleapis\.com/compute/([^/]+)/)?projects/([^/]+)/global/images/([^/]+)$")
# Image family URI: same shape, ending in global/images/family/<family>.
_FULL_IMAGE_FAMILY_URI = re.compile(r"^(https://www\.googleapis\.com/compute/([^/]+)/)?projects/([^/]+)/global/images/family/([^/]+)$")
# Minor version plus OS suffix, e.g. 1.5-debian10; the OS name must be
# debian, ubuntu or rocky followed by digits.
_LATEST_FROM_MINOR_VERSION = re.compile(r"^(\d+)\.(\d+)-((?:debian|ubuntu|rocky)\d+)$")

def _version_regex_type(s):
  """argparse type: accept `s` if it is a full or a minor-version string.

  Returns `s` unchanged when it matches _VERSION_REGEX or
  _LATEST_FROM_MINOR_VERSION; raises argparse.ArgumentTypeError otherwise.
  """
  if _VERSION_REGEX.match(s) or _LATEST_FROM_MINOR_VERSION.match(s):
    return s
  raise argparse.ArgumentTypeError("Invalid version: {}.".format(s))

def _full_image_uri_regex_type(s):
  """argparse type: accept `s` if it matches _FULL_IMAGE_URI.

  Returns `s` unchanged on a match; raises argparse.ArgumentTypeError
  otherwise.
  """
  if _FULL_IMAGE_URI.match(s):
    return s
  raise argparse.ArgumentTypeError("Invalid image URI: {}.".format(s))

def _full_image_family_uri_regex_type(s):
  """argparse type: accept `s` if it matches _FULL_IMAGE_FAMILY_URI.

  Returns `s` unchanged on a match; raises argparse.ArgumentTypeError
  otherwise.
  """
  if _FULL_IMAGE_FAMILY_URI.match(s):
    return s
  raise argparse.ArgumentTypeError("Invalid image family URI: {}.".format(s))

def parse_args(args):
  """Parses command-line arguments.

  Args:
    args: list of command-line argument strings handed to argparse.

  Returns:
    The argparse.Namespace produced by `parser.parse_args(args)`.
  """
  parser = argparse.ArgumentParser()
  required_args = parser.add_argument_group("required named arguments")
  required_args.add_argument(
      "--image-name",
      type=str,
      required=True,
      help="""The image name for the Dataproc custom image.""")
  # At most one of the three base-image sources may be given. The group itself
  # is not marked required, so the parser also accepts none of them.
  image_args = required_args.add_mutually_exclusive_group()
  image_args.add_argument(
      "--dataproc-version",
      type=_version_regex_type,
      help=constants.version_help_text)
  image_args.add_argument(
      "--base-image-uri",
      type=_full_image_uri_regex_type,
      help="""The full image URI for the base Dataproc image. The
      customization script will be executed on top of this image instead of
      an out-of-the-box Dataproc image. This image must be a valid Dataproc
      image.
      """)
  image_args.add_argument(
      "--base-image-family",
      type=_full_image_family_uri_regex_type,
      help="""The source image family URI. The latest non-deprecated image associated with the family will be used.
      """)
  required_args.add_argument(
      "--customization-script",
      type=str,
      required=True,
      help="""User's script to install custom packages.""")
  required_args.add_argument(
      "--metadata",
      type=str,
      required=False,
      help="""VM metadata which can be read by the customization script
      with `/usr/share/google/get_metadata_value attributes/<key>` at runtime.
      The value of this flag takes the form of `key1=value1,key2=value2,...`.
      If the value includes special characters (e.g., `=`, `,` or spaces) which
      needs to be escaped, consider encoding the value, then decode it back in
      the customization script. See more information about VM metadata
      on https://cloud.google.com/sdk/gcloud/reference/compute/instances/create.
      """)
  required_args.add_argument(
      "--zone",
      type=str,
      required=True,
      help="""GCE zone used to build the custom image.""")
  required_args.add_argument(
      "--gcs-bucket",
      type=str,
      required=True,
      help="""GCS bucket used to store files and logs when
      building custom image.""")
  parser.add_argument(
      "--family",
      type=str,
      required=False,
      default='dataproc-custom-image',
      help="""(Optional) The family of the image.""")
  parser.add_argument(
      "--project-id",
      type=str,
      required=False,
      help="""The project Id of the project where the custom image will be
      created and saved. The default value will be set to the project id
      specified by `gcloud config get-value project`.""")
  parser.add_argument(
      "--oauth",
      type=str,
      required=False,
      help="""A local path to JSON credentials for your GCE project.
      The default oauth is the application-default credentials from gcloud.""")
  parser.add_argument(
      "--machine-type",
      type=str,
      required=False,
      default="n1-standard-1",
      help="""(Optional) Machine type used to build custom image.
      Default machine type is n1-standard-1.""")
  parser.add_argument(
      "--no-smoke-test",
      action="store_true",
      help="""(Optional) Disables smoke test to verify if the custom image
      can create a functional Dataproc cluster.""")
  parser.add_argument(
      "--network",
      type=str,
      required=False,
      default="",
      help="""(Optional) Network interface used to launch the VM instance that
      builds the custom image. Default network is 'global/networks/default'
      when no network and subnetwork arguments are provided.
      If the default network does not exist in your project, please specify
      a valid network interface.""")
  parser.add_argument(
      "--subnetwork",
      type=str,
      required=False,
      default="",
      help="""(Optional) The subnetwork that is used to launch the VM instance
      that builds the custom image. A full subnetwork URL is required.
      Default subnetwork is None. For shared VPC only provide this parameter and
      do not use the --network argument.""")
  parser.add_argument(
      "--no-external-ip",
      action="store_true",
      help="""(Optional) Disables external IP for the image build VM. The VM
      will not be able to access the internet, but if Private Google
      Access is enabled for the subnetwork, it can still access Google services
      (e.g., GCS) through internal IP of the VPC.""")
  parser.add_argument(
      "--service-account",
      type=str,
      required=False,
      default="default",
      help=
      """(Optional) The service account that is used to launch the VM instance
      that builds the custom image. If not specified, the default service
      account under the GCE project will be used. The scope of this service
      account is defaulted to /auth/cloud-platform.""")
  parser.add_argument(
      "--extra-sources",
      type=json.loads,
      required=False,
      default={},
      help=
      """(Optional) Additional files/directories uploaded along with
      customization script. This argument is evaluated to a json dictionary.
      For example:
      '--extra-sources "{\\"notes.txt\\": \\"/path/to/notes.txt\\"}"'
      """)
  # The help text states the same default as the `default=` value below.
  parser.add_argument(
      "--disk-size",
      type=int,
      required=False,
      default=30,
      help=
      """(Optional) The size in GB of the disk attached to the VM instance
      that builds the custom image. If not specified, the default value of
      30 GB will be used.""")
  parser.add_argument(
      "--accelerator",
      type=str,
      required=False,
      default=None,
      help=
      """(Optional) The accelerators (e.g. GPUs) attached to the VM instance
      that builds the custom image. If not specified, no accelerators are
      attached.""")
  parser.add_argument(
      "--storage-location",
      type=str,
      required=False,
      default=None,
      help=
      """(Optional) The storage location (e.g. US, us-central1) of the custom
      GCE image. If not specified, the default GCE image storage location is
      used.""")
  parser.add_argument(
      "--shutdown-instance-timer-sec",
      type=int,
      required=False,
      default=300,
      help=
      """(Optional) The time to wait in seconds before shutting down the VM
      instance. This value may need to be increased if your init script
      generates a lot of output on stdout. If not specified, the default value
      of 300 seconds will be used.""")
  parser.add_argument(
      "--dry-run",
      action="store_true",
      help="""(Optional) Only generates script without creating image.""")
  parser.add_argument(
      "--trusted-cert",
      type=str,
      required=False,
      default="tls/db.der",
      help="""(Optional) Pass an empty string to this argument to
      disable support for shielded-secure-boot.""")

  return parser.parse_args(args)
====== Filename: ./custom_image_utils/smoke_test_runner.py ======
# Copyright 2019 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#            http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Run smoke test for Dataproc custom images.
"""

import datetime
import logging
import subprocess
import uuid

# Module-level logger setup: default logging configuration plus a logger named
# after this module.
logging.basicConfig()
_LOG = logging.getLogger(__name__)
# The threshold is WARN, so any _LOG.info(...) calls in this module are
# filtered out unless the level is changed elsewhere.
_LOG.setLevel(logging.WARN)

def _create_workflow_template(workflow_name, image_name, project_id, zone, region,
                              network, subnet, no_external_ip):
  """Create a Dataproc workflow template for testing."""
  create_command = [
      "gcloud", "dataproc", "workflow-templates", "create",
      workflow_name, "--project", project_id, "--region", region
  ]
  set_cluster_command = [
      "gcloud", "dataproc", "workflow-templates",
      "set-managed-cluster", workflow_name, "--project", project_id, "--image",
      image_name, "--zone", zone, "--region", region
  ]
  if network and not subnet:
    set_cluster_command.extend(["--network", network])
  else:
    set_cluster_command.extend(["--subnet", subnet])
  if no_external_ip:
    set_cluster_command.extend(["--no-address"])
  add_job_command = [
      "gcloud", "dataproc", "workflow-templates", "add-job", "spark",
      "--workflow-template", workflow_name, "--project", project_id, "--region", region,
      "--step-id", "001", "--class", "org.apache.spark.examples.SparkPi",
      "--jars", "file:///usr/lib/spark/examples/jars/spark-examples.jar", "--",
      "1000"
  ]
  pipe = subprocess.Popen(create_command)
  pipe.wait()
  if pipe.returncode != 0:
    raise RuntimeError("Error creating Dataproc workflow template '%s'.",
                       workflow_name)

  pipe = subprocess.Popen(set_cluster_command)
  pipe.wait()
  if pipe.returncode != 0:
    raise RuntimeError(
        "Error setting cluster for Dataproc workflow template '%s'.",
        workflow_name)

  pipe = subprocess.Popen(add_job_command)
  pipe.wait()
  if pipe.returncode != 0:
    raise RuntimeError("Error adding job to Dataproc workflow template '%s'.",
                       workflow_name)


def _instantiate_workflow_template(workflow_name, project_id, region):
  """Run a Dataproc workflow template to test the newly built custom image."""
  command = [
      "gcloud", "dataproc", "workflow-templates", "instantiate",
      workflow_name, "--project", project_id, "--region", region
  ]
  pipe = subprocess.Popen(command)
  pipe.wait()
  if pipe.returncode != 0:
    raise RuntimeError("Unable to instantiate workflow template.")


def _delete_workflow_template(workflow_name, project_id, region):
  """Delete a Dataproc workflow template."""
  command = [
      "gcloud", "dataproc", "workflow-templates", "delete",
      workflow_name, "-q", "--project", project_id, "--region", region
  ]
  pipe = subprocess.Popen(command)
  pipe.wait()
  if pipe.returncode != 0:
    raise RuntimeError("Error deleting workfloe template %s.", workflow_name)


def _verify_custom_image(image_name, project_id, zone, network, subnetwork, no_external_ip):
  """Verifies if custom image works with Dataproc.

  Creates a uniquely named workflow template that uses the image, instantiates
  it, and always attempts to delete the template afterwards.

  Args:
    image_name: custom image to verify.
    project_id: project passed to the gcloud commands.
    zone: zone for the test cluster; the region is derived from it.
    network: network for the test cluster (used when no subnetwork is given).
    subnetwork: subnetwork for the test cluster.
    no_external_ip: when truthy, the cluster is created without external IPs.

  Raises:
    RuntimeError: if creating or instantiating the workflow template fails.
      A failure to delete the template is logged but not raised.
  """
  # Drop the last two characters of the zone (e.g. "-a") to get the region.
  region = zone[:-2]
  date = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
  # The timestamp has one-second resolution; the random 8-hex-digit suffix
  # keeps names distinct for runs that start within the same second.
  workflow_name = "verify-image-{}-{}".format(date, uuid.uuid4().hex[-8:])
  try:
    _LOG.info("Creating Dataproc workflow-template %s with image %s...",
              workflow_name, image_name)
    _create_workflow_template(workflow_name, image_name, project_id, zone, region,
                              network, subnetwork, no_external_ip)
    _LOG.info(
        "Successfully created Dataproc workflow-template %s with image %s...",
        workflow_name, image_name)
    _LOG.info("Smoke testing Dataproc workflow-template %s...", workflow_name)
    _instantiate_workflow_template(workflow_name, project_id, region)
    _LOG.info("Successfully smoke tested Dataproc workflow-template %s...",
              workflow_name)
  except RuntimeError as e:
    err_msg = "Verification of custom image {} failed: {}".format(
        image_name, e)
    _LOG.error(err_msg)
    raise RuntimeError(err_msg)
  finally:
    # Cleanup is best effort: a failed delete must not mask the verification
    # result, but it is reported so a leftover template can be removed by hand.
    try:
      _LOG.info("Deleting Dataproc workflow-template %s...", workflow_name)
      _delete_workflow_template(workflow_name, project_id, region)
      _LOG.info("Successfully deleted Dataproc workflow-template %s...",
                workflow_name)
    except RuntimeError as cleanup_error:
      _LOG.warning("Failed to delete Dataproc workflow-template %s: %s",
                   workflow_name, cleanup_error)


def run(args):
  """Runs the smoke test unless this is a dry run or the test is disabled.

  Args:
    args: parsed arguments; reads dry_run, no_smoke_test, image_name,
      project_id, zone, network, subnetwork and no_external_ip.
  """
  if args.dry_run:
    _LOG.info("Skip running smoke test (dry run).")
    return

  # Smoke testing was explicitly turned off: nothing to do.
  if args.no_smoke_test:
    return

  _LOG.info("Verifying the custom image...")
  _verify_custom_image(
      args.image_name,
      args.project_id,
      args.zone,
      args.network,
      args.subnetwork,
      args.no_external_ip)
  _LOG.info("Successfully verified the custom image...")
====== Filename: ./custom_image_utils/shell_script_generator.py.rej ======
--- custom_image_utils/shell_script_generator.py
+++ custom_image_utils/shell_script_generator.py
@@ -111,11 +111,13 @@ function main() {{
 
   local cert_args=""
   local num_src_certs="0"
+  metadata_arg="{metadata_flag}"
   if [[ -n '{trusted_cert}' ]] && [[ -f '{trusted_cert}' ]]; then
     # build tls/ directory from variables defined near the header of
     # the examples/secure-boot/create-key-pair.sh file
 
     eval "$(bash examples/secure-boot/create-key-pair.sh)"
+    metadata_arg="${{metadata_arg}},public_secret_name=${{public_secret_name}},private_secret_name=${{private_secret_name}},secret_project=${{secret_project}},secret_version=${{secret_version}}"
 
     # by default, a gcloud secret with the name of efi-db-pub-key-042 is
     # created in the current project to store the certificate installed
@@ -209,7 +211,7 @@ function main() {{
       {accelerator_flag} \
       {service_account_flag} \
       --scopes=cloud-platform \
-      {metadata_flag} \
+      ${{metadata_arg}} \
       --metadata-from-file startup-script=startup_script/run.sh )
 
   touch /tmp/{run_id}/vm_created
====== Filename: ./custom_image_utils/expiration_notifier.py ======
# Copyright 2019 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#            http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Notify expiration for Dataproc custom images.
"""

import datetime
import logging
import subprocess
import tempfile

# Set up logging for this module: basicConfig() installs the default handler
# on the root logger when none is configured yet, and the module logger's
# threshold is WARN, so info-level messages from this module are dropped
# unless the level is lowered elsewhere.
logging.basicConfig()
_LOG = logging.getLogger(__name__)
_LOG.setLevel(logging.WARN)

# Banner logged by notify() after an image is built. The two "{}"
# placeholders are filled, in order, with the image name and the computed
# expiration date.
_expiration_notification_text = """\

#####################################################################
  WARNING: DATAPROC CUSTOM IMAGE '{}'
           WILL EXPIRE ON {}.
#####################################################################

"""


def _parse_date_time(timestamp_string):
  """Parses a timestamp string (RFC3339) to datetime format."""

  return datetime.datetime.strptime(timestamp_string[:-6],
                                    "%Y-%m-%dT%H:%M:%S.%f")


def _get_image_creation_timestamp(image_name, project_id):
  """Gets the creation timestamp of the custom image."""

  # version regex already checked in arg parser
  command = [
      "gcloud", "compute", "images", "describe", image_name, "--project",
      project_id, "--format=csv[no-heading=true](creationTimestamp)"
  ]

  with tempfile.NamedTemporaryFile() as temp_file:
    pipe = subprocess.Popen(command, stdout=temp_file)
    pipe.wait()
    if pipe.returncode != 0:
      raise RuntimeError("Cannot get custom image creation timestamp.")

    # get creation timestamp
    temp_file.seek(0)
    stdout = temp_file.read()
    return stdout.decode('utf-8').strip()


def notify(args):
  """Logs when the freshly built custom image will expire.

  The expiration date is the image's creation timestamp plus 365 days. On a
  dry run only a success message is logged.

  Args:
    args: parsed arguments; reads dry_run, image_name and project_id.
  """
  if args.dry_run:
    _LOG.info("Dry run succeeded.")
    return

  _LOG.info("Successfully built Dataproc custom image: %s", args.image_name)
  raw_timestamp = _get_image_creation_timestamp(args.image_name,
                                                args.project_id)
  created_at = _parse_date_time(raw_timestamp)
  expires_at = created_at + datetime.timedelta(days=365)
  banner = _expiration_notification_text.format(args.image_name,
                                                str(expires_at))
  _LOG.info(banner)
====== Filename: ./custom_image_utils/constants.py ======
# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#            http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Constant variables for building custom image."""

# Describes the expected major.minor.patch format of a Dataproc image version.
# NOTE(review): presumably used as the help string of the image-version
# command-line option — confirm against the argument parser.
version_help_text = """\
    The dataproc image version to be used for building the custom Dataproc
    image. The image version is in the format of:

        version_major.version_minor.version_patch

    Example:
        1.2.13

    Please refer to https://cloud.google.com/dataproc/docs/concepts/versioning/overview
    for more information on image versions.
    """
====== Filename: ./custom_image_utils/__pycache__/shell_image_creator.cpython-311.pyc ======
�
    �дg�  �                   �   � d Z ddlZddlmZ ddlmZ  ej        �   �           ej        e�  �        Ze�	                    ej
        �  �         d� ZdS )z*
Shell script based custom image creator.
�    N)�shell_script_executor)�shell_script_generatorc                 �F  � t           �                    d�  �         t          j        �   �         �                    t          | �  �        �  �        }t           �                    d�  �         t           �                    |�  �         t           �                    d�  �         t           �                    d�  �         | j        sJt           �                    d�  �         t          j        |�  �         t           �                    d�  �         dS t           �                    d�  �         dS )z3Creates a custom image with generated Shell script.zGenerating Shell script...z<############################################################z&Successfully generated Shell script...zCreating custom image...z$Successfully created custom image...z%Skip creating custom image (dry run).N)	�_LOG�infor   �	Generator�generate�vars�dry_runr   �run)�args�scripts     �w/usr/local/google/home/cjac/src/github/LLC-Technologies-Collier/custom-images/custom_image_utils/shell_image_creator.py�creater      s�   � � �)�)�(�)�)�)�!�+�-�-�6�6�t�D�z�z�B�B�&��)�)�H�����)�)�F�����)�)�H�����)�)�4�5�5�5� 
�� 7��I�I�(�)�)�)���f�%�%�%��I�I�4�5�5�5�5�5��I�I�5�6�6�6�6�6�    )�__doc__�logging�custom_image_utilsr   r   �basicConfig�	getLogger�__name__r   �setLevel�WARNr   � r   r   �<module>r      s�   ��� � ���� 4� 4� 4� 4� 4� 4� 5� 5� 5� 5� 5� 5� �� � � � ��w���"�"�� ���g�l� � � �7� 7� 7� 7� 7r   ====== Filename: ./custom_image_utils/__pycache__/image_labeller.cpython-311.pyc ======
�
    �дg=  �                   �   � d Z ddlZddlZ ej        �   �           ej        e�  �        Ze�                    ej        �  �         d� Z	d� Z
dS )z%Add label to Dataproc custom images.
�    Nc                 �  � |�                     dd�  �        �                    �   �         }d�                    |�  �        }dddd| d||g}t          �                    d	�                    d
�                    |�  �        �  �        �  �         t          j        |�  �        }|�                    �   �          |j	        dk    rt          d�  �        �dS )�0Sets Dataproc version label in the custom image.�.�-z!--labels=goog-dataproc-version={}�gcloud�compute�imagesz
add-labelsz	--projectzRunning: {}� r   z+Cannot set dataproc version to image label.N)�replace�lower�format�_LOG�info�join�
subprocess�Popen�wait�
returncode�RuntimeError)�
image_name�version�
project_id�version_label�
label_flag�command�pipes          �r/usr/local/google/home/cjac/src/github/LLC-Technologies-Collier/custom-images/custom_image_utils/image_labeller.py�_set_custom_image_labelr      s�   � � �/�/�#�s�+�+�1�1�3�3�-�2�9�9�-�H�H�*��	�8�\�:�{��*��'� �)�)�M� � ����'�!2�!2�3�3�4�4�4� 
�	�'�	"�	"�$��)�)�+�+�+�	�_����
�D�
E�
E�E� ��    c                 ��   � | j         sVt          �                    d�  �         t          | j        | j        | j        �  �         t          �                    d�  �         dS t          �                    d�  �         dS )r   z Setting label on custom image...z)Successfully set label on custom image...z-Skip setting label on custom image (dry run).N)�dry_runr   r   r   r   �dataproc_versionr   )�argss    r   �	add_labelr$   ,   ss   � � 
�� ?��I�I�0�1�1�1��D�O�T�-B� �O�-� -� -��I�I�9�:�:�:�:�:��I�I�=�>�>�>�>�>r   )�__doc__�loggingr   �basicConfig�	getLogger�__name__r   �setLevel�WARNr   r$   � r   r   �<module>r-      s�   ��� � ���� � � � � �� � � � ��w���"�"�� ���g�l� � � �F� F� F�&	?� 	?� 	?� 	?� 	?r   ====== Filename: ./custom_image_utils/__pycache__/constants.cpython-311.pyc ======
�
    ӷpf  �                   �   � d Z dZdS )z-Constant variables for building custom image.aV      The dataproc image version to be used for building the custom Dataproc
    image. The image version is in the format of:

        version_major.version_minor.version_patch

    Example:
        1.2.13

    Please refer to https://cloud.google.com/dataproc/docs/concepts/versioning/overview
    for more information on image versions.
    N)�__doc__�version_help_text� �    �Y/usr/local/google/home/cjac/src/github/cjac/custom-images/custom_image_utils/constants.py�<module>r      s   �� 4� 3�� � � r   ====== Filename: ./custom_image_utils/__pycache__/smoke_test_runner.cpython-311.pyc ======
�
    �дg�  �                   �   � d Z ddlZddlZddlZddlZ ej        �   �           ej        e�  �        Ze�	                    ej
        �  �         d� Zd� Zd� Zd� Zd� ZdS )z+Run smoke test for Dataproc custom images.
�    Nc                 �  � dddd| d|d|g	}dddd| d|d|d	|d|g}	|r|s|	�                     d
|g�  �         n|	�                     d|g�  �         |r|	�                     dg�  �         dddddd| d|d|ddddddddg}
t          j        |�  �        }|�                    �   �          |j        dk    rt          d| �  �        �t          j        |	�  �        }|�                    �   �          |j        dk    rt          d| �  �        �t          j        |
�  �        }|�                    �   �          |j        dk    rt          d| �  �        �dS )z0Create a Dataproc workflow template for testing.�gcloud�dataproc�workflow-templates�create�	--project�--regionzset-managed-clusterz--imagez--zonez	--networkz--subnetz--no-addresszadd-job�sparkz--workflow-templatez	--step-id�001z--classz!org.apache.spark.examples.SparkPiz--jarsz6file:///usr/lib/spark/examples/jars/spark-examples.jarz--�1000r   z/Error creating Dataproc workflow template '%s'.z:Error setting cluster for Dataproc workflow template '%s'.z4Error adding job to Dataproc workflow template '%s'.N)�extend�
subprocess�Popen�wait�
returncode�RuntimeError)�workflow_name�
image_name�
project_id�zone�region�network�subnet�no_external_ip�create_command�set_cluster_command�add_job_command�pipes               �u/usr/local/google/home/cjac/src/github/LLC-Technologies-Collier/custom-images/custom_image_utils/smoke_test_runner.py�_create_workflow_templater       s�  � � �
�0�(��[�*�j�&��.�
 �
�0��]�K��Y��(�D�*�f���
 � 5�V� 5�����W�5�6�6�6�6����
�F�3�4�4�4�� 1�����/�0�0�0��
�0�)�W��]�K��Z�QW��5�)�%H��H�$���/� 
�	�.�	)�	)�$��)�)�+�+�+�	�_����
�H�$�&� &� &� 
�	�-�	.�	.�$��)�)�+�+�+�	�_����
�D��� � � 
�	�/�	*�	*�$��)�)�+�+�+�	�_����
�M�$�&� &� &� ��    c           	      �   � dddd| d|d|g	}t          j        |�  �        }|�                    �   �          |j        dk    rt	          d�  �        �d	S )
zFRun a Dataproc workflow template to test the newly built custom image.r   r   r   �instantiater   r	   r   z(Unable to instantiate workflow template.N�r   r   r   r   r   �r   r   r   �commandr   s        r   �_instantiate_workflow_templater'   G   sc   � � �
�0�-��[�*�j�&��'� 
�	�'�	"�	"�$��)�)�+�+�+�	�_����
�A�
B�
B�B� �r!   c           
      �   � dddd| dd|d|g
}t          j        |�  �        }|�                    �   �          |j        dk    rt	          d	| �  �        �d
S )z$Delete a Dataproc workflow template.r   r   r   �deletez-qr   r	   r   z$Error deleting workfloe template %s.Nr$   r%   s        r   �_delete_workflow_templater*   S   sg   � � �
�0�(��T�;�
�J���'� 
�	�'�	"�	"�$��)�)�+�+�+�	�_����
�=�}�
M�
M�M� �r!   c           
      �   � |dd�         }t           j         �                    �   �         �                    d�  �        }d�                    |t	          j        �   �         j        dd�         �  �        }	 t          �                    d|| �  �         t          || ||||||�  �         t          �                    d|| �  �         t          �                    d�  �         t          |||�  �         t          �                    d	|�  �         nQ# t          $ rD}	d
�                    | |	�  �        }
t          �                    |
�  �         t          |
�  �        �d}	~	ww xY w	 	 t          �                    d|�  �         t          |||�  �         t          �                    d|�  �         dS # t          $ r Y dS w xY w# 	 t          �                    d|�  �         t          |||�  �         t          �                    d|�  �         w # t          $ r Y w w xY wxY w)z-Verifies if custom image works with Dataproc.N�����z%Y%m%d%H%M%Szverify-image-{}-{}i����z7Creating Dataproc workflow-template %s with image %s...zCSuccessfully created Dataproc workflow-template %s with image %s...z.Smoke testing Dataproc workflow-template %s...z:Successfully smoke tested Dataproc workflow-template %s...z*Verification of custom image {} failed: {}z)Deleting Dataproc workflow-template %s...z5Successfully deleted Dataproc workflow-template %s...)�datetime�now�strftime�format�uuid�uuid4�hex�_LOG�infor    r'   r   �errorr*   )r   r   r   r   �
subnetworkr   r   �dater   �e�err_msgs              r   �_verify_custom_imager;   _   sL  � �����9�&�	�	�	�	�	 �	 �	)�	)�.�	9�	9�$� '�-�-�d�D�J�L�L�4D�R�S�S�4I�J�J�-���I�I�G��Z�)� )� )��m�Z��T�6�%�z�>�C� C� C��I�I�M��z�#� #� #� 	�I�I�>�?�?�?�"�=�*�f�E�E�E��I�I�J��� � � ��	�  �  �  �:�A�A��A�� �G��J�J�w����
�w�
�
������	 ������
�i�i�;�]�K�K�K���z�6�B�B�B�
�i�i�G��� � � � ��� � � �
�d�d�������
�i�i�;�]�K�K�K���z�6�B�B�B�
�i�i�G��� � � ��� � � �
�d�������sc   �1BD �F3 �
E�?E�E�F3 �AF" �"
F0�/F0�3H�5AG=�<H�=
H
�H�	H
�
Hc                 �*  � | j         sq| j        sht          �                    d�  �         t	          | j        | j        | j        | j        | j	        | j
        �  �         t          �                    d�  �         dS dS t          �                    d�  �         dS )zRuns smoke test.zVerifying the custom image...z)Successfully verified the custom image...z"Skip running smoke test (dry run).N)�dry_run�no_smoke_testr4   r5   r;   r   r   r   r   r7   r   )�argss    r   �runr@   �   s�   � � 
�� 4��� =�
�i�i�/�0�0�0��4�?�D�O�T�Y��<���$�:M�O� O� O�
�i�i�;�<�<�<�<�<�	=� =� 	�I�I�2�3�3�3�3�3r!   )�__doc__r-   �loggingr   r1   �basicConfig�	getLogger�__name__r4   �setLevel�WARNr    r'   r*   r;   r@   � r!   r   �<module>rI      s�   ��� � ���� ���� � � � � ���� �� � � � ��w���"�"�� ���g�l� � � �*&� *&� *&�Z	C� 	C� 	C�	N� 	N� 	N�� � �D
4� 
4� 
4� 
4� 
4r!   ====== Filename: ./custom_image_utils/__pycache__/expiration_notifier.cpython-311.pyc ======
�
    �дg�	  �                   �   � d Z ddlZddlZddlZddlZ ej        �   �           ej        e�  �        Ze�	                    ej
        �  �         dZd� Zd� Zd� ZdS )z/
Notify expiration for Dataproc custom images.
�    Nz�
#####################################################################
  WARNING: DATAPROC CUSTOM IMAGE '{}'
           WILL EXPIRE ON {}.
#####################################################################

c                 �R   � t           j         �                    | dd�         d�  �        S )z7Parses a timestamp string (RFC3339) to datetime format.Ni����z%Y-%m-%dT%H:%M:%S.%f)�datetime�strptime)�timestamp_strings    �w/usr/local/google/home/cjac/src/github/LLC-Technologies-Collier/custom-images/custom_image_utils/expiration_notifier.py�_parse_date_timer   %   s.   � � 
�	�	#�	#�$4�S�b�S�$9�$:�
<� 
<� <�    c                 �  � dddd| d|dg}t          j        �   �         5 }t          j        ||��  �        }|�                    �   �          |j        dk    rt          d	�  �        �|�                    d�  �         |�                    �   �         }|�	                    d
�  �        �
                    �   �         cddd�  �         S # 1 swxY w Y   dS )z0Gets the creation timestamp of the custom image.�gcloud�compute�images�describez	--projectz0--format=csv[no-heading=true](creationTimestamp))�stdoutr   z+Cannot get custom image creation timestamp.zutf-8N)�tempfile�NamedTemporaryFile�
subprocess�Popen�wait�
returncode�RuntimeError�seek�read�decode�strip)�
image_name�
project_id�command�	temp_file�piper   s         r   �_get_image_creation_timestampr    ,   s  � �
 �	�8�Z��[��D��'�
 �"�$�$� 	*�	���G�I�6�6�6�D��I�I�K�K�K���!����F�G�G�G� �N�N�1�����^�^���F��=�=��!�!�'�'�)�)�	*� 	*� 	*� 	*� 	*� 	*� 	*� 	*� 	*� 	*� 	*� 	*���� 	*� 	*� 	*� 	*� 	*� 	*s   �BB?�?C�Cc                 �  � | j         s�t          �                    d| j        �  �         t	          t          | j        | j        �  �        �  �        }|t          j        d��  �        z   }t          �                    t          �
                    | j        t          |�  �        �  �        �  �         dS t          �                    d�  �         dS )z$Notifies when the image will expire.z,Successfully built Dataproc custom image: %sim  )�dayszDry run succeeded.N)�dry_run�_LOG�infor   r   r    r   r   �	timedelta�_expiration_notification_text�format�str)�args�creation_date�expiration_dates      r   �notifyr-   A   s�   � � 
�� 	$��I�I�<�d�o�N�N�N�$�%�d�o�t��G�G�I� I�M�#�h�&8�c�&B�&B�&B�B�O��I�I�%�,�,�T�_�-0��-A�-A�	C� 	C�D� D� D� D� D� 	�I�I�"�#�#�#�#�#r	   )�__doc__r   �loggingr   r   �basicConfig�	getLogger�__name__r$   �setLevel�WARNr'   r   r    r-   � r	   r   �<module>r6      s�   ��� � ���� ���� � � � � ���� �� � � � ��w���"�"�� ���g�l� � � �!� �<� <� <�*� *� *�*$� $� $� $� $r	   ====== Filename: ./custom_image_utils/__pycache__/args_inferer.cpython-311.pyc ======
�
    �дgn)  �                   �:  � d Z ddlZddlZddlZddlZddlZdZ ej        d�  �        ZdZ	 ej        d�  �        Z
 ej        �   �           ej        e�  �        Ze�                    ej        �  �         d� Zd� Zd	� Zd
� Zd� Zd� Zd� Zd� Zd� Zd� Zd� Zd� Zd� ZdS )z2
Infer arguments for Dataproc custom image build.
�    Nzprojects/{}/global/images/{}zX^(https://www\.googleapis\.com/compute/([^/]+)/)?projects/([^/]+)/global/images/([^/]+)$z#projects/{}/global/images/family/{}z_^(https://www\.googleapis\.com/compute/([^/]+)/)?projects/([^/]+)/global/images/family/([^/]+)$c                  �  � g d�} t          j        �   �         5 }t          j        | |��  �        }|�                    �   �          |j        dk    rt          d�  �        �|�                    d�  �         |�                    �   �         }|�	                    d�  �        �
                    �   �         cddd�  �         S # 1 swxY w Y   dS )z"Get project id from gcloud config.)�gcloud�configz	get-value�project��stdoutr   zHCannot find gcloud project ID. Please setup the project ID in gcloud SDK�utf-8N)�tempfile�NamedTemporaryFile�
subprocess�Popen�wait�
returncode�RuntimeError�seek�read�decode�strip)�gcloud_command�	temp_file�piper   s       �p/usr/local/google/home/cjac/src/github/LLC-Technologies-Collier/custom-images/custom_image_utils/args_inferer.py�_get_project_idr   %   s  � �?�?�?�.��"�$�$� 	*�	���N�9�=�=�=�D��I�I�K�K�K���!���� E� F� F� F� �N�N�1�����^�^���F��=�=��!�!�'�'�)�)�	*� 	*� 	*� 	*� 	*� 	*� 	*� 	*� 	*� 	*� 	*� 	*���� 	*� 	*� 	*� 	*� 	*� 	*s   �BB9�9B=� B=c                 �   � t           �                    | �  �        }|�                    d�  �        |�                    d�  �        fS )z$Get Dataproc image name and project.�   �   )�
_IMAGE_URI�match�group��	image_uri�ms     r   �_extract_image_name_and_projectr#   4   s4   � ����y�!�!�!�	
������Q�W�W�Q�Z�Z�	��    c                 �   � t           �                    | �  �        }|�                    d�  �        |�                    d�  �        fS )z+Get Dataproc image family name and project.r   r   )�_IMAGE_FAMILY_URIr   r   r    s     r   �/_extract_image_name_and_project_from_family_urir'   :   s4   � ����i�(�(�!�	
������Q�W�W�Q�Z�Z�	�r$   c                 ��  � t          | �  �        \  }}dddd|d|dg}t          j        �   �         5 }t          j        ||��  �        }|�                    �   �          |j        dk    rt          d	�  �        �|�                    d�  �         |�	                    �   �         }|r5|�
                    d
�  �        �                    �   �         }|cddd�  �         S 	 ddd�  �         n# 1 swxY w Y   t          d| �  �        �)z*Get Dataproc image version from image URI.r   �compute�images�describe�	--project�,--format=value(labels.goog-dataproc-version)r   r   zLCannot find dataproc base image, please check and verify the base image URI.r	   Nz#Cannot find dataproc base image: %s)r#   r
   r   r   r   r   r   r   r   r   r   r   )r!   r   �
image_name�commandr   r   r   �parsed_lines           r   �_get_dataproc_image_versionr1   @   sb  � �7�	�B�B��'�:��	�8�Z��[��=��'� �"�$�$� �	���G�I�6�6�6�D��I�I�K�K�K���!���� �!� !� !� �N�N�1�����^�^���F�� ��M�M�'�*�*�0�0�2�2�k��� � � � � � � ��� � � � � � � � � � ���� � � � � 	�:�I�F�F�F�   �BC"�"C&�)C&c                 ��  � t          | �  �        \  }}dddd|d|dg}t          j        �   �         5 }t          j        ||��  �        }|�                    �   �          |j        dk    rt          d	�  �        �|�                    d�  �         |�	                    �   �         }|r5|�
                    d
�  �        �                    �   �         }|cddd�  �         S 	 ddd�  �         n# 1 swxY w Y   t          d| z  �  �        �)z3Get Dataproc image family version from family name.r   r)   r*   zdescribe-from-familyr,   r-   r   r   zOCannot find dataproc base family image, please check and verify the family URI.r	   Nz*Cannot find dataproc base image family: %s)r'   r
   r   r   r   r   r   r   r   r   r   r   )�image_family_urir   �image_family_namer/   r   r   r   �dataproc_versions           r   �'_get_dataproc_version_from_image_familyr7   [   st  � �N�O_�`�`��'���	�8�%;�=N�P[��=��'� �"�$�$� �	���G�I�6�6�6�D��I�I�K�K�K���!������ � � �N�N�1�����^�^���F�� ����w�/�/�5�5�7�7���� � � � � � � ��� � � � � � � � � � ���� � � � � 	�A�%�&� 	'� 	'� 'r2   c                 �\   � t          | �  �        \  }}t          �                    ||�  �        S )z2Get the partial image URI from the full image URI.)r#   �_IMAGE_PATH�format)r!   r   r.   s      r   �_extract_image_pathr;   v   s*   � �7�	�B�B��'�:�	�	�	�G�Z�	0�	0�0r$   c                 �\   � t          | �  �        \  }}t          �                    ||�  �        S )z@Get the partial image family URI from the full image family URI.)r'   �_IMAGE_FAMILY_PATHr:   )r4   r   r.   s      r   �_extract_image_family_pathr>   {   s+   � �G�HX�Y�Y��'�:�	�	"�	"�7�J�	7�	7�7r$   c                 ��  � | �                     d�  �        }|d         }t          |�  �        dk    rc|d         �                     d�  �        d         }|d         �                    dd�  �        |d<   d�                    |d         |d         �  �        }n9|d         }|d         }d�                    |d         |d         |d         �  �        }d	d
ddddd|dddg}t          �                    d�                    |�  �        �  �         t          j        �   �         5 }t          j	        ||��  �        }|�
                    �   �          |j        dk    rt          d�  �        �|�                    d�  �         |�                    �   �         }|�r3|�                    d�  �        �                    �   �         �                     d�  �        }	d�                    ||�  �        }
t          �                    d|
�  �         g }i }|	D ]�}|�                     d�  �        }t          |�  �        dk    r�|d         }|�                    |
�  �        st          �                    d|�  �         �c|d         }||vr5t$          �                    d|�  �        g||<   |�                    |�  �         ��||         �                    t$          �                    d|�  �        �  �         ��t          �                    d|�  �         t          �                    d|�  �         |d         }t          ||         �  �        dk    r6t          d�                    |t)          ||         �  �        �  �        �  �        �t          �                    d||d                  d         |d         �  �         ||d                  d         |d         fcd d d �  �         S 	 d d d �  �         n# 1 swxY w Y   t          d!| z  �  �        �)"z*Get Dataproc base image name from version.�.r   �   �   �-z-\d+-zMlabels.goog-dataproc-version ~ ^{}-{} AND NOT name ~ -eap$ AND status = READYzOlabels.goog-dataproc-version = {}-{}-{} AND NOT name ~ -eap$ AND status = READYr   r)   r*   �listr,   zcloud-dataprocz--filterz--formatz7csv[no-heading=true](name,labels.goog-dataproc-version)z--sort-by=~creationTimestampzExecuting command: {}r   zMCannot find dataproc base image, please check and verify [--dataproc-version]r	   �
zdataproc-{}-{}zFiltering images : %s�,zSkipping non-release image %szAll Images : %szAll Image-Versions : %szEFound more than one images for latest dataproc-version={}. Images: {}z!Choosing image %s with version %sNz9Cannot find dataproc base image with dataproc-version=%s.)�split�len�replacer:   �_LOG�infor
   r   r   r   r   r   r   r   r   r   r   �
startswithr9   �append�str)�version�parsed_version�major_version�minor_version�
filter_argr/   r   r   r   �parsed_lines�expected_prefix�image_versions�all_images_for_version�line�parsed_image�parsed_image_name�parsed_image_version�latest_available_versions                     r   �#_get_dataproc_image_path_by_versionr]   �   s�  � � �=�=��%�%�.� ��#�-�����A��� #�1�%�+�+�C�0�0��3�M�&�q�)�1�1�#�w�?�?�N�1��(�)/���q�0A�0>�q�0A�*C� *C� �J� #�1�%�M�"�1�%�M�(�)/���q�0A�0>�q�0A�0>�q�0A�*C� *C� �
 �i��6�;�8H��
�J�=�"�	�'� �)�)�#�*�*�7�3�3�4�4�4��"�$�$� ,M�	���G�I�6�6�6�D��I�I�K�K�K���!����	� �  �  � �N�N�1�����^�^���F�� !M��]�]�7�+�+�1�1�3�3�9�9�$�?�?�l�(�/�/��}�M�M�o�
�i�i�'��9�9�9��n�!��� y� y�$��z�z�#�����|����!�!�*�1�o�
�"�-�-�o�>�>� ��I�I�5�7H�I�I�I��!-�a��
�!�)?�?�?�<G�<N�<N�O_�ar�<s�<s�;t�"�#7�8��!�!�"6�7�7�7�7�"�#7�8�?�?��@R�@R�Sc�ev�@w�@w�x�x�x��
�i�i�!�#9�:�:�:�
�i�i�)�>�:�:�:�!/��!2��
�$�%=�>�
?�
?�!�
C�
C��
Q�
X�
X�$��&�'?�@�A�A�C� C�D� D� 	D�
 �i�i�3�5K�N�[\�L]�5^�_`�5a�cq�rs�ct�u�u�u�#�N�1�$5�6�q�9�>�!�;L�L�Y,M� ,M� ,M� ,M� ,M� ,M� ,M� ,M�!M�,M� ,M� ,M� ,M� ,M� ,M� ,M� ,M� ,M� ,M� ,M���� ,M� ,M� ,M� ,M�\ 	�?�'�I�	K� 	K� Ks   �JO�O�Oc                 �>   � | j         st          �   �         | _         d S d S �N)�
project_idr   ��argss    r   �_infer_project_idrc   �   s'   � �	�� (�%�'�'�D�O�O�O�(� (r$   c                 �  � t           �                    d�  �         | j        r3t          | j        �  �        | _        t          | j        �  �        | _        nr| j        r"t          | j        �  �        \  | _        | _        nI| j        r3t          | j        �  �        | _        t          | j        �  �        | _        nt          d�  �        �t           �                    d| j        �  �         t           �                    d| j        �  �         d S )Nz#Getting Dataproc base image name...z[Neither --dataproc-version nor --base-image-uri nor --source-image-family-uri is specified.z Returned Dataproc base image: %sz Returned Dataproc version   : %s)rJ   rK   �base_image_urir;   �dataproc_base_imager1   r6   r]   �base_image_familyr>   r7   r   ra   s    r   �_infer_base_imagerh   �   s�   � ��)�)�1�2�2�2�	�� g�2�4�3F�G�G�D��7��8K�L�L�D����� g�6Y���7� 7�3�D��d�3�3��� g�9�$�:P�Q�Q�D��C�D�DZ�[�[�D���
�e�g� g� g��)�)�.��0H�I�I�I��)�)�.��0E�F�F�F�F�Fr$   c                 �   � | j         r>d�                    t          j        �                    | j         �  �        �  �        | _         d S d| _         d S )Nz
    "OAuthPath": "{}",� )�oauthr:   �os�path�abspathra   s    r   �_infer_oauthro   �   sG   � �	�Z� �/�6�6�
�����
�#�#�%� %�D�J�J�J� �D�J�J�Jr$   c                 �   � | j         s| j        sd| _         | j         �                    d�  �        r'd�                    | j        | j         �  �        | _         d S d S )Nzglobal/networks/defaultzglobal/networks/zprojects/{}/{})�network�
subnetworkrL   r:   r`   ra   s    r   �_infer_networkrs   �   se   � � 
�� -�d�o� -�,�D�L� 
�\���/�0�0� J�#�*�*�4�?�D�L�I�I�D�L�L�L�J� Jr$   c                 �   � t          | �  �         t          | �  �         t          | �  �         t          | �  �         | j        | _        d S r_   )rc   rh   ro   rs   �shutdown_instance_timer_sec�shutdown_timer_in_secra   s    r   �
infer_argsrw   �   sK   � ��D�����D�����t���������#�?�$���r$   )�__doc__�loggingrl   �rer   r
   r9   �compiler   r=   r&   �basicConfig�	getLogger�__name__rJ   �setLevel�WARNr   r#   r'   r1   r7   r;   r>   r]   rc   rh   ro   rs   rw   � r$   r   �<module>r�      s�  ��� � ���� 	�	�	�	� 	�	�	�	� � � � � ����,���R�Z�_�� �
� ;� ��B�J�f�� � � �� � � � ��w���"�"�� ���g�l� � � �*� *� *� �  �  � �  �  �G� G� G�6'� '� '�61� 1� 1�
8� 8� 8�
NK� NK� NK�b(� (� (�
G� G� G�&� � �
J� 
J� 
J�@� @� @� @� @r$   ====== Filename: ./custom_image_utils/__pycache__/shell_script_generator.cpython-311.pyc ======
�
    ��gX3  �                   �4   � d Z ddlmZ dZ G d� d�  �        ZdS )z7
Shell script based image creation workflow generator.
�    )�datetimea%  #!/usr/bin/env bash

# Script for creating Dataproc custom image.

set -euo pipefail

RED='\e[0;31m'
GREEN='\e[0;32m'
NC='\e[0m'

base_obj_type="images"

function execute_with_retries() (
  set +x
  local -r cmd="$*"

  for ((i = 0; i < 3; i++)); do
    time eval "$cmd" > "/tmp/{run_id}/install.log" 2>&1 && retval=$? || {{ retval=$? ; cat "/tmp/{run_id}/install.log" ; }}
    if [[ $retval == 0 ]] ; then return 0 ; fi
    sleep 5
  done
  return 1
)

function gsutil() {{ ${{gsutil_cmd}} "$*" ; }}

function version_ge() ( set +x ;  [ "$1" = "$(echo -e "$1
$2" | sort -V | tail -n1)" ] ; )
function version_gt() ( set +x ;  [ "$1" = "$2" ] && return 1 || version_ge $1 $2 ; )
function version_le() ( set +x ;  [ "$1" = "$(echo -e "$1
$2" | sort -V | head -n1)" ] ; )
function version_lt() ( set +x ;  [ "$1" = "$2" ] && return 1 || version_le $1 $2 ; )

function prepare() {{
  # With the 402.0.0 release of gcloud sdk, `gcloud storage` can be
  # used as a more performant replacement for `gsutil`
  gsutil_cmd="gcloud storage"
  gcloud_sdk_version="$(gcloud --version | awk -F'SDK ' '/Google Cloud SDK/ {{print $2}}')"
  if version_lt "${{gcloud_sdk_version}}" "402.0.0" ; then
    gsutil_cmd="$(which gsutil) -o GSUtil:check_hashes=never"
  fi
}}

function exit_handler() {{
  echo 'Cleaning up before exiting.'

  if [[ -f /tmp/{run_id}/vm_created ]]; then ( set +e
    echo 'Deleting VM instance.'
    execute_with_retries       gcloud compute instances delete {image_name}-install --project={project_id} --zone={zone} -q
  ) elif [[ -f /tmp/{run_id}/disk_created ]]; then
    echo 'Deleting disk.'
    execute_with_retries       gcloud compute ${{base_obj_type}} delete {image_name}-install --project={project_id} --zone={zone} -q
  fi

  echo 'Uploading local logs to GCS bucket.'
  gsutil -m rsync -r {log_dir}/ {gcs_log_dir}/

  if [[ -f /tmp/{run_id}/image_created ]]; then
    echo -e "${{GREEN}}Workflow succeeded${{NC}}, check logs at {log_dir}/ or {gcs_log_dir}/"
    exit 0
  else
    echo -e "${{RED}}Workflow failed${{NC}}, check logs at {log_dir}/ or {gcs_log_dir}/"
    exit 1
  fi
}}

function test_element_in_array {{
  local test_element="$1" ; shift
  local -a test_array=("$@")

  for item in "${{test_array[@]}}"; do
    if [[ "${{item}}" == "${{test_element}}" ]]; then return 0 ; fi
  done
  return 1
}}

function print_modulus_md5sum {{
  local derfile="$1"
  openssl x509 -noout -modulus -in "${{derfile}}" | openssl md5 | awk '{{print $2}}'
}}

function print_img_dbs_modulus_md5sums() {{
  local long_img_name="$1"
  local img_name="$(echo ${{long_img_name}} | sed -e 's:^.*/::')"
  local json_tmpfile="/tmp/{run_id}/${{img_name}}.json"
  gcloud compute images describe ${{long_img_name}} --format json > "${{json_tmpfile}}"

  local -a db_certs=()
  mapfile -t db_certs < <( cat ${{json_tmpfile}} | jq -r 'try .shieldedInstanceInitialState.dbs[].content' )

  local -a modulus_md5sums=()
  for key in "${{!db_certs[@]}}" ; do
    local derfile="/tmp/{run_id}/${{img_name}}.${{key}}.der"
    echo "${{db_certs[${{key}}]}}" |       perl -M'MIME::Base64(decode_base64url)' -ne 'chomp; print( decode_base64url($_) )'       > "${{derfile}}"
    modulus_md5sums+=( $(print_modulus_md5sum "${{derfile}}") )
  done

  echo "${{modulus_md5sums[@]}}"
}}

function main() {{
  echo 'Uploading files to GCS bucket.'
  declare -a sources_k=({sources_map_k})
  declare -a sources_v=({sources_map_v})
  for i in "${{!sources_k[@]}}"; do
    gsutil cp "${{sources_v[i]}}" "{custom_sources_path}/${{sources_k[i]}}" > /dev/null 2>&1
  done

  local cert_args=""
  local num_src_certs="0"
  metadata_arg="{metadata_flag}"
  if [[ -n '{trusted_cert}' ]] && [[ -f '{trusted_cert}' ]]; then
    # build tls/ directory from variables defined near the header of
    # the examples/secure-boot/create-key-pair.sh file

    eval "$(bash examples/secure-boot/create-key-pair.sh)"
    metadata_arg="${{metadata_arg}},public_secret_name=${{public_secret_name}},private_secret_name=${{private_secret_name}},secret_project=${{secret_project}},secret_version=${{secret_version}}"

    # by default, a gcloud secret with the name of efi-db-pub-key-042 is
    # created in the current project to store the certificate installed
    # as the signature database file for this disk image

    # The MS UEFI CA is a reasonable base from which to build trust.  We
    # will trust code signed by this CA as well as code signed by
    # trusted_cert (tls/db.der)

    # The Microsoft Corporation UEFI CA 2011
    local -r MS_UEFI_CA="tls/MicCorUEFCA2011_2011-06-27.crt"
    test -f "${{MS_UEFI_CA}}" ||       curl -L -o ${{MS_UEFI_CA}} 'https://go.microsoft.com/fwlink/p/?linkid=321194'

    local -a cert_list=()

    local -a default_cert_list
    default_cert_list=("{trusted_cert}" "${{MS_UEFI_CA}}")
    local -a src_img_modulus_md5sums=()

    mapfile -t src_img_modulus_md5sums < <(print_img_dbs_modulus_md5sums {dataproc_base_image})
    num_src_certs="${{#src_img_modulus_md5sums[@]}}"
    echo "debug - num_src_certs: [${{#src_img_modulus_md5sums[*]}}]"
    echo "value of src_img_modulus_md5sums: [${{src_img_modulus_md5sums}}]"
    if [[ -z "${{src_img_modulus_md5sums}}" ]]; then
      num_src_certs=0
      echo "no db certificates in source image"
      cert_list=( "${{default_cert_list[@]}}" )
    else
      echo "${{num_src_certs}} db certificates attached to source image"
      echo "db certs exist in source image"
      for cert in ${{default_cert_list[*]}}; do
        if test_element_in_array "$(print_modulus_md5sum ${{cert}})" ${{src_img_modulus_md5sums[@]}} ; then
          echo "cert ${{cert}} is already in source image's db list"
        else
          cert_list+=("${{cert}}")
        fi
      done
      # append source image's cert list
      local img_name="$(echo {dataproc_base_image} | sed -e 's:^.*/::')"
      if [[ ${{#cert_list[@]}} -ne 0 ]] && compgen -G "/tmp/{run_id}/${{img_name}}.*.der" > /dev/null ; then
        cert_list+=(/tmp/{run_id}/${{img_name}}.*.der)
      fi
    fi

    if [[ ${{#cert_list[@]}} -eq 0 ]]; then
      echo "all certificates already included in source image's db list"
    else
      cert_args="--signature-database-file=$(IFS=, ; echo "${{cert_list[*]}}") --guest-os-features=UEFI_COMPATIBLE"
    fi
  fi

  date

  if [[ -z "${{cert_args}}" && "${{num_src_certs}}" -ne "0" ]]; then
    echo 'Re-using base image'
    base_obj_type="reuse"
    instance_disk_args='--image-project={project_id} --image={dataproc_base_image} --boot-disk-size={disk_size}G --boot-disk-type=pd-ssd'

  elif [[ -n "${{cert_args}}" ]] ; then
    echo 'Creating image.'
    base_obj_type="images"
    instance_disk_args='--image-project={project_id} --image={image_name}-install --boot-disk-size={disk_size}G --boot-disk-type=pd-ssd'
    execute_with_retries       gcloud compute images create {image_name}-install       --project={project_id}       --source-image={dataproc_base_image}       ${{cert_args}}       {storage_location_flag}       --family={family}
    touch "/tmp/{run_id}/disk_created"
  else
    echo 'Creating disk.'
    base_obj_type="disks"
    instance_disk_args='--disk=auto-delete=yes,boot=yes,mode=rw,name={image_name}-install'
    execute_with_retries gcloud compute disks create {image_name}-install       --project={project_id}       --zone={zone}       --image={dataproc_base_image}       --type=pd-ssd       --size={disk_size}GB
    touch "/tmp/{run_id}/disk_created"
  fi

  date
  echo 'Creating VM instance to run customization script.'
  execute_with_retries gcloud compute instances create {image_name}-install       --project={project_id}       --zone={zone}       {network_flag}       {subnetwork_flag}       {no_external_ip_flag}       --machine-type={machine_type}       ${{instance_disk_args}}       {accelerator_flag}       {service_account_flag}       --scopes=cloud-platform       "${{metadata_arg}}"       --metadata-from-file startup-script=startup_script/run.sh

  touch /tmp/{run_id}/vm_created

  # clean up intermediate install image
  if [[ "${{base_obj_type}}" == "images" ]] ; then ( set +e
    # This sometimes returns an API error but deletes the image despite the failure
    gcloud compute images delete -q {image_name}-install --project={project_id}
  ) fi

  echo 'Waiting for customization script to finish and VM shutdown.'
  execute_with_retries gcloud compute instances tail-serial-port-output {image_name}-install       --project={project_id}       --zone={zone}       --port=1 2>&1       | grep 'startup-script'       | sed -e 's/ {image_name}-install.*startup-script://g'       | dd status=none bs=1 of={log_dir}/startup-script.log       || true
  echo 'Checking customization script result.'
  date
  if grep -q 'BuildFailed:' {log_dir}/startup-script.log; then
    echo -e "${{RED}}Customization script failed.${{NC}}"
    echo "See {log_dir}/startup-script.log for details"
    exit 1
  elif grep -q 'BuildSucceeded:' {log_dir}/startup-script.log; then
    echo -e "${{GREEN}}Customization script succeeded.${{NC}}"
  else
    echo 'Unable to determine the customization script result.'
    exit 1
  fi

  date
  echo 'Creating custom image.'
  execute_with_retries gcloud compute images create {image_name}     --project={project_id}     --source-disk-zone={zone}     --source-disk={image_name}-install     {storage_location_flag}     --family={family}

  touch /tmp/{run_id}/image_created
}}

prepare
trap exit_handler EXIT
mkdir -p {log_dir}
main "$@" 2>&1 | tee {log_dir}/workflow.log
c                   �   � e Zd ZdZd� Zd� ZdS )�	Generatorz5Shell script based image creation workflow generator.c                 �"  � || _         d| j         vr@ dj        d4dt          j        �   �         �                    d�  �        i| j         ��| j         d<   | j         d         �                    dd�  �        | j         d<    d	j        d4i | j         ��| j         d
<   d| j         d         d�}|�                    | j         d         �  �         t          t          |�	                    �   �         �  �        �  �        }d�
                    d� |D �   �         �  �        | j         d<   d�
                    d� |D �   �         �  �        | j         d<    dj        d4i | j         ��| j         d<    dj        d4i | j         ��| j         d<   | j         d         r% dj        d4i | j         ��| j         d<   d| j         d<   n1| j         d         r$ dj        d4i | j         ��| j         d<   d| j         d<   | j         d         r dj        d4i | j         ��| j         d <   | j         d!         rd"nd| j         d#<   | j         d$         r d%j        d4i | j         ��nd| j         d&<   | j         d'         r d(j        d4i | j         ��nd| j         d)<   d*}| j         d+         r9| j         d+         �                    d,d-�  �        }|d.�                    |�  �        z  }| j         d/         r%| j         d/         }|d0�                    |�  �        z  }| j         d1         r|d2z  } |j        d4i | j         ��| j         d3<   d S )5N�run_idz%custom-image-{image_name}-{timestamp}�	timestampz%Y%m%d-%H%M%S�
gcs_bucketzgs://� �bucket_namez#gs://{bucket_name}/{run_id}/sources�custom_sources_pathzstartup_script/run.sh�customization_script)zrun.shzinit_actions.sh�extra_sources� c           	      �t   � g | ]5\  }}d �                     ||d         �                    dd�  �        �  �        ��6S )�	[{}]='{}'r   �'�'\''��format�replace��.0�i�kvs      �z/usr/local/google/home/cjac/src/github/LLC-Technologies-Collier/custom-images/custom_image_utils/shell_script_generator.py�
<listcomp>z(Generator._init_args.<locals>.<listcomp>2  �Q   � � +[� +[� +[�?D�q�"����1�b��e�m�m�C��9�9�:�:�+[� +[� +[�    �sources_map_kc           	      �t   � g | ]5\  }}d �                     ||d         �                    dd�  �        �  �        ��6S )r   �   r   r   r   r   s      r   r   z(Generator._init_args.<locals>.<listcomp>4  r   r   �sources_map_vz/tmp/{run_id}/logs�log_dirz gs://{bucket_name}/{run_id}/logs�gcs_log_dir�
subnetworkz--subnet={subnetwork}�subnetwork_flag�network_flag�networkz--network={network}�service_accountz#--service-account={service_account}�service_account_flag�no_external_ipz--no-address�no_external_ip_flag�acceleratorz:--accelerator={accelerator} --maintenance-policy terminate�accelerator_flag�storage_locationz%--storage-location={storage_location}�storage_location_flagzb--metadata=shutdown-timer-in-sec={shutdown_timer_in_sec},custom-sources-path={custom_sources_path}�optional_components�,�.z,optional-components="{}"�dataproc_versionz,dataproc-version="{}"�metadataz,{metadata}�metadata_flag� )�argsr   r   �now�strftimer   �update�tuple�	enumerate�items�join)�selfr8   �all_sources�sources_map_items�metadata_flag_templater1   r4   s          r   �
_init_argszGenerator._init_args#  s  � ��D�I��t�y� � �J�C�J� K� K��L�N�N�+�+�O�<�<�K�@D�	�K� K�d�i���#�y��6�>�>�w��K�K�D�I�m��'S�'L�'S�'`�'`�VZ�V_�'`�'`�D�I�#�$� *��9�%;�<�� �K� ���t�y��1�2�2�2��i��(9�(9�(;�(;�<�<�=�=��!$��� +[� +[�HY�+[� +[� +[� "\� "\�D�I�o��!$��� +[� +[�HY�+[� +[� +[� "\� "\�D�I�o�� 7�/�6�C�C���C�C�D�I�i��H�A�H�  �  ��	� �  �D�I�m���y��� (�%C�%<�%C�%P�%P�d�i�%P�%P�d�i�!�"�"$�d�i����	��9�	� (�">�"7�">�"K�"K���"K�"K�d�i���%'�d�i�!�"��y�"�#� �"N�"G�"N� #� #�
�)�#� #� �i�� � :>���:� ( �~�~�� 	�I�#�$� �	�-�0�9�_�X�_� � �
�)�� � �68� 	�I���
 �	�"4�5�">�!O�!H�!O� "� "�
�)�"� "� "�;=� 	�I���	4� � �y�&�'� X� �I�&;�<�D�D�S�#�N�N��� ;� B� B�CV� W� W�W���y�#�$� R���#5�6��� 8� ?� ?�@P� Q� Q�Q���y��� .���-��!>�!7�!>�!K�!K���!K�!K�D�I�o���r   c                 �N   � | �                     |�  �         t          j        di |��S )Nr7   )rD   �	_templater   )r@   r8   s     r   �generatezGenerator.generateZ  s+   � ��O�O�D������#�#�d�#�#�#r   N)�__name__�
__module__�__qualname__�__doc__rD   rG   r7   r   r   r   r      s<   � � � � � �=�=�5L� 5L� 5L�n$� $� $� $� $r   r   N)rK   r   rF   r   r7   r   r   �<module>rL      s^   ��� � � � � � � �I�	�V<$� <$� <$� <$� <$� <$� <$� <$� <$� <$r   ====== Filename: ./custom_image_utils/__pycache__/args_parser.cpython-311.pyc ======
�
    #�g	&  �                   ��   � d Z ddlZddlZddlZddlmZ  ej        d�  �        Z ej        d�  �        Z ej        d�  �        Z	 ej        d�  �        Z
g d�Zd	� Zd
� Zd� Zd� Zd� ZdS )zw
This is a utility module which defines and parses the command-line arguments
for the generate_custom_image.py script.
�    N)�	constantsz%^\d+\.\d+\.\d+(-RC\d+)?(-[a-z]+\d+)?$zX^(https://www\.googleapis\.com/compute/([^/]+)/)?projects/([^/]+)/global/images/([^/]+)$z_^(https://www\.googleapis\.com/compute/([^/]+)/)?projects/([^/]+)/global/images/family/([^/]+)$z+^(\d+)\.(\d+)-((?:debian|ubuntu|rocky)\d+)$)
�HIVE_WEBHCAT�ZEPPELIN�TRINO�RANGER�SOLR�FLINK�DOCKER�HUDI�ICEBERG�PIGc                 �   � t           �                    | �  �        sAt          �                    | �  �        s't          j        d�                    | �  �        �  �        �| S )z&Check if version string matches regex.zInvalid version: {}.)�_VERSION_REGEX�match�_LATEST_FROM_MINOR_VERSION�argparse�ArgumentTypeError�format��ss    �o/usr/local/google/home/cjac/src/github/LLC-Technologies-Collier/custom-images/custom_image_utils/args_parser.py�_version_regex_typer   #   sU   � �	�	�	�a�	 �	 � G�)C�)I�)I�!�)L�)L� G�
�
$�%;�%B�%B�1�%E�%E�
F�
F�F�	
�(�    c                 �   � t           �                    | �  �        s't          j        d�                    | �  �        �  �        �| S )z4Check if the partial image uri string matches regex.zInvalid image URI: {}.)�_FULL_IMAGE_URIr   r   r   r   r   s    r   �_full_image_uri_regex_typer   )   s>   � �	�	�	�q�	!�	!� I�
�
$�%=�%D�%D�Q�%G�%G�
H�
H�H�	
�(r   c                 �   � t           �                    | �  �        s't          j        d�                    | �  �        �  �        �| S )z;Check if the partial image family uri string matches regex.zInvalid image family URI: {}.)�_FULL_IMAGE_FAMILY_URIr   r   r   r   r   s    r   �!_full_image_family_uri_regex_typer   /   s>   � �	�	%�	%�a�	(�	(� P�
�
$�%D�%K�%K�A�%N�%N�
O�
O�O�	
�(r   c                 �t   � | �                     d�  �        }|D ]}|t          vrt          j        d�  �        �� | S )N�,z$Invalid optional component selected.)�split�_VALID_OPTIONAL_COMPONENTSr   r   )�optional_components�
components�	components      r   �_validate_componentsr'   5   sO   � �$�*�*�3�/�/�J�� U� U�	��6�6�6��,�-S�T�T�T� 7��r   c                 �  � t          j        �   �         }|�                    d�  �        }|�                    dt          dd��  �         |�                    �   �         }|�                    dt          t          j        ��  �         |�                    dt          d	��  �         |�                    d
t          d��  �         |�                    dt          dd��  �         |�                    dt          dd��  �         |�                    dt          dd��  �         |�                    dt          dd��  �         |�                    dt          ddd��  �         |�                    dt          dd��  �         |�                    dt          dd��  �         |�                    dt          ddd��  �         |�                    d d!d"�#�  �         |�                    d$t          dd%d&��  �         |�                    d't          dd%d(��  �         |�                    d)d!d*�#�  �         |�                    d+t          dd,d-��  �         |�                    d.t          j        di d/��  �         |�                    d0t          dd1d2��  �         |�                    d3t          dd4d5��  �         |�                    d6t          dd4d7��  �         |�                    d8t          dd9d:��  �         |�                    d;d!d<�#�  �         |�                    d=t          dd>d?��  �         |�                    d@t          ddA��  �         |�                    | �  �        S )BzParses command-line arguments.zrequired named argumentsz--image-nameTz-The image name for the Dataproc custom image.)�type�required�helpz--dataproc-version)r)   r+   z--base-image-uriz�The full image URI for the base Dataproc image. The
      customiziation script will be executed on top of this image instead of
      an out-of-the-box Dataproc image. This image must be a valid Dataproc
      image.
      z--base-image-familyzlThe source image family URI. The latest non-depracated image associated with the family will be used.
      z--customization-scriptz)User's script to install custom packages.z
--metadataFa  VM metadata which can be read by the customization script
      with `/usr/share/google/get_metadata_value attributes/<key>` at runtime.
      The value of this flag takes the form of `key1=value1,key2=value2,...`.
      If the value includes special characters (e.g., `=`, `,` or spaces) which
      needs to be escaped, consider encoding the value, then decode it back in
      the customization script. See more information about VM metadata
      on https://cloud.google.com/sdk/gcloud/reference/compute/instances/create.
      z--zonez(GCE zone used to build the custom image.z--gcs-bucketzIGCS bucket used to store files and logs when
      building custom image.z--familyzdataproc-custom-imagez#(Optional) The family of the image.)r)   r*   �defaultr+   z--project-idz�The project Id of the project where the custom image will be
      created and saved. The default value will be set to the project id
      specified by `gcloud config get-value project`.z--oauthz�A local path to JSON credentials for your GCE project.
      The default oauth is the application-default credentials from gcloud.z--machine-typezn1-standard-1z`(Optional) Machine type used to build custom image.
      Default machine type is n1-standard-1.z--no-smoke-test�
store_truezl(Optional) Disables smoke test to verify if the custom image
      can create a functional Dataproc cluster.)�actionr+   z	--network� a6  (Optional) Network interface used to launch the VM instance that
      builds the custom image. Default network is 'global/networks/default'
      when no network and subnetwork arguments are provided.
      If the default network does not exist in your project, please specify
      a valid network interface.z--subnetworka  (Optional) The subnetwork that is used to launch the VM instance
      that builds the custom image. A full subnetwork URL is required.
      Default subnetwork is None. For shared VPC only provide this parameter and
      do not use the --network argument.z--no-external-ipa  (Optional) Disables external IP for the image build VM. The VM
      will not be able to access the internet, but if Private Google
      Access is enabled for the subnetwork, it can still access Google services
      (e.g., GCS) through internal IP of the VPC.z--service-accountr,   a  (Optional) The service account that is used to launch the VM instance
      that builds the custom image. If not specified, the default service
      account under the GCE project will be used. The scope of this service
      account is defaulted to /auth/cloud-platform.z--extra-sourcesz�(Optional) Additional files/directories uploaded along with
      customization script. This argument is evaluated to a json dictionary.
      For example:
      '--extra-sources "{\"notes.txt\": \"/path/to/notes.txt\"}"'
      z--disk-size�   z�(Optional) The size in GB of the disk attached to the VM instance
      that builds the custom image. If not specified, the default value of
      15 GB will be used.z--acceleratorNz�(Optional) The accelerators (e.g. GPUs) attached to the VM instance
      that builds the custom image. If not specified, no accelerators are
      attached.z--storage-locationz�(Optional) The storage location (e.g. US, us-central1) of the custom
      GCE image. If not specified, the default GCE image storage location is
      used.z--shutdown-instance-timer-seci,  z�(Optional) The time to wait in seconds before shutting down the VM
      instance. This value may need to be increased if your init script
      generates a lot of output on stdout. If not specified, the default value
      of 300 seconds will be used.z	--dry-runz8(Optional) Only generates script without creating image.z--trusted-certz
tls/db.derz(Optional) Inserts the specified DER-format certificate into
      the custom image's EFI boot sector for use with secure boot.z--optional-componentsz�Optional Components to be installed with the image.
      Can be a comma-separated list of components, e.g., TRINO,ZEPPELIN.
      (Only supported for Dataproc Images 2.3 and above))r   �ArgumentParser�add_argument_group�add_argument�str�add_mutually_exclusive_groupr   r   �version_help_textr   r   �json�loads�intr'   �
parse_args)�args�parser�required_args�
image_argss       r   r:   r:   <   s�  � ��"�$�$�&��+�+�,F�G�G�-�������>�	 � @� @� @�
 �9�9�;�;�*�������&� � (� (� (� ����%�
� � � � � ����,�
� � � � �
 ������:�	 � <� <� <�
 ������
�	 � � � � ������9�	 � ;� ;� ;�
 ������ �	 � !� !� !� 	������%�4� � 6� 6� 6� 	������9�	 � :� :� :� 	������O�	 � P� P� P� 	�������0� � 1� 1� 1� 	�����3� � 4� 4� 4�
 	�������$� � 	%� 	%� 	%� 	�������,� � -� -� -� 	�����5� � 6� 6� 6� 	�������7� � 	8� 	8� 	8� 	�����:���
� � 
� 
� 
� 	�������� � � � � 	�������� � � � � 	�������� � � � � 	���%����&� � 	'� 	'� 	'� 	�����I� � K� K� K� 	�������F� � G� G� G� 	������<�	 � � � � 
�	�	�4�	 �	 � r   )�__doc__r   r7   �re�custom_image_utilsr   �compiler   r   r   r   r#   r   r   r   r'   r:   � r   r   �<module>rD      s  ��� �
 ���� ���� 	�	�	�	� (� (� (� (� (� (�
 ���D�E�E���"�*�x�y�y��#���  %G�  H�  H� �'�R�Z�(V�W�W� � B�  B�  B� �� � �� � �� � �� � �!� !� !� !� !r   ====== Filename: ./custom_image_utils/__pycache__/__init__.cpython-311.pyc ======
�
    ӷpf    �                   �   � d S )N� r   �    �X/usr/local/google/home/cjac/src/github/cjac/custom-images/custom_image_utils/__init__.py�<module>r      s   �� � r   ====== Filename: ./custom_image_utils/__pycache__/shell_script_executor.cpython-311.pyc ======
�
    �@g�  �                   �0   � d Z ddlZddlZddlZddlZd� ZdS )z
Shell script executor.
�    Nc                 �H  � t          j        d��  �        }	 |�                    | �                    d�  �        �  �         |�                    �   �          |�                    �   �          t          j        d|j        gt          j
        t          j        ��  �        }|�                    �   �          |j        dk    rt          d�  �        �	 	 t          j        |j        �  �         dS # t"          $ r Y dS w xY w# 	 t          j        |j        �  �         w # t"          $ r Y w w xY wxY w)	zRuns a Shell script.F)�deletezutf-8�bash)�stdout�stderrr   zError building custom image.N)�tempfile�NamedTemporaryFile�write�encode�flush�close�
subprocess�Popen�name�sysr   r   �wait�
returncode�RuntimeError�os�remove�OSError)�shell_script�	temp_file�pipes      �y/usr/local/google/home/cjac/src/github/LLC-Technologies-Collier/custom-images/custom_image_utils/shell_script_executor.py�runr      s6  � � �)��7�7�7�)���O�O�L�'�'��0�0�1�1�1��O�O�����O�O���� ��	��� ��z��z�� � �D� 	�I�I�K�K�K���!����7�8�8�8� ���i�	��������� � � �
�d�d��������i�	�������� � � �
�d�������sB   �B0C5 �	C$ �$
C2�1C2�5D!�7D�D!�
D�D!�D�D!)�__doc__r   r   r   r   r   � �    r   �<module>r       sQ   ��� � 
�	�	�	� � � � � 
�
�
�
� ����� � � � r   ====== Filename: ./env.json.bz2 ======
BZh91AY&SYm�B� b߀ P��?��0����0�ֵ���<�b��j   h4!
m�)�MF� ѐ��M B#Q�M&�=M4�OPz��cP�1���"d$O�Uv>V�J���������RǪ#	 A�2�;��\[o:V��y�a^W,�2�e	e9�ȋ�)jd�) ;�������M�{�](eJ�G��ܣD��wn�m%t�����&�&
ā.Sv��5�]��K���U��ܚ6Z�Y1�>�B�1J�a'Y	��"�U��HRq���>��j=�vU�h/��>��"`�����zf�I���GdF�s�� ����hA��b�	�U.2�(�i┥��8~k�o)��tY�/*�wb�zdv?uV�ҾJH�IH�}vL���q�qd4aH��<^��� �`x�b�p
�U+*��$P�4�a��q?��.�p� ۤ��====== Filename: ./README.md ======
# Build Dataproc custom images

This page describes how to generate a custom Dataproc image.

## Important notes

To help ensure that clusters receive the latest service updates and bug fixes,
the creation of clusters with a custom image is limited to **365 days** from the
image creation date, but existing custom-image clusters can run indefinitely.
Automation to continuously build a custom image may be necessary if you wish to
create clusters with a custom image for a period greater than 365 days.

Creating clusters with expired custom images is possible by following these
[instructions](https://cloud.google.com/dataproc/docs/guides/dataproc-images#how_to_create_a_cluster_with_an_expired_custom_image),
but Cloud Dataproc cannot guarantee support of issues that arise with these
clusters.

## Requirements

1.  Python
2.  gcloud
3.  Bash 3.0.
4.  A GCE project with billing, Google Cloud Dataproc API, Google Compute Engine
    API, Google Secret Manager API, and Google Cloud Storage APIs enabled.
5.  Use `gcloud config set project <your-project>` to specify which project to
    use to create and save your custom image.

## Generate custom image

To generate a custom image, you can run the following command:

```shell
python generate_custom_image.py \
    --image-name '<new_custom_image_name>' \
    --dataproc-version '<dataproc_version>' \
    --customization-script '<custom_script_to_install_custom_packages>' \
    --zone '<zone_to_create_instance_to_build_custom_image>' \
    --gcs-bucket '<gcs_bucket_to_write_logs>'
```

### Arguments

*   **--image-name**: The name for custom image.
*   **--dataproc-version**: The Dataproc version for this custom image
    to build on. Examples: `2.2.32-debian12`, `2.2.31-debian12`,
    `2.2.31-ubuntu22`. If the sub-minor version is unspecified, the
    latest available one will be used.  Examples: `2.2-rocky9`,
    `2.2-debian12`. For a complete list of Dataproc image versions,
    please review the output of `gcloud compute images list --project
    cloud-dataproc`. To understand Dataproc versioning, please refer
    to
    [documentation](https://cloud.google.com/dataproc/docs/concepts/versioning/overview).
    **This argument is mutually exclusive with `--base-image-uri` and
    `--base-image-family`**.
*   **--base-image-uri**: The full image URI for the base Dataproc image. The
    customization script will be executed on top of this image instead of an
    out-of-the-box Dataproc image. This image must be a valid Dataproc image.
    **This argument is mutually exclusive with `--dataproc-version` and
    `--base-image-family`**.
*   **--base-image-family**: The image family that the boot disk will be
    initialized with. The latest non-deprecated image from the family will be
    used. An example base image family URI is
    `projects/PROJECT_NAME/global/images/family/<FAMILY_NAME>`. To get the list
    of image families (and the associated image), run `gcloud compute images
    list [--project <PROJECT_NAME>]`. **This argument is mutually exclusive with
    `--dataproc-version` and `--base-image-uri`**.
*   **--customization-script**: The script used to install custom packages on
    the image.
*   **--zone**: The GCE zone for running your GCE instance.
*   **--gcs-bucket**: A GCS bucket to store the logs of building custom image.

#### Optional Arguments

*   **--family**: The family of the source image. This will cause the latest
    non-deprecated image in the family to be used as the source image.
*   **--project-id**: The project Id of the project where the custom image is
    created and saved. The default project Id is the current project id
    specified in `gcloud config get-value project`.
*   **--oauth**: The OAuth credential file used to call Google Cloud APIs. The
    default OAuth is the application-default credentials from gcloud.
*   **--machine-type**: The machine type used to build custom image. The default
    is `n1-standard-1`.
*   **--no-smoke-test**: This parameter is used to disable smoke testing the
    newly built custom image. The smoke test is used to verify if the newly
    built custom image can create a functional Dataproc cluster. Disabling this
    step will speed up the custom image build process; however, it is not
    advised. Note: The smoke test will create a Dataproc cluster with the newly
    built image, runs a short job and deletes the cluster in the end.
*   **--network**: This parameter specifies the GCE network to be used to launch
    the GCE VM instance which builds the custom Dataproc image. The default
    network is 'global/networks/default'. If the default network does not exist
    in your project, please specify a valid network interface. For more
    information on network interfaces, please refer to
    [GCE VPC documentation](https://cloud.google.com/vpc/docs/vpc).
*   **--subnetwork**: This parameter specifies the subnetwork that is used to
    launch the VM instance that builds the custom Dataproc image. A full
    subnetwork URL is required. The default subnetwork is None. For more
    information, please refer to
    [GCE VPC documentation](https://cloud.google.com/vpc/docs/vpc).
*   **--no-external-ip**: This parameter is used to disable the external IP for the
    image build VM. The VM will not be able to access the internet, but if
    [Private Google Access](https://cloud.google.com/vpc/docs/configure-private-google-access)
    is enabled for the subnetwork, it can still access Google services (e.g.,
    GCS) through internal IP of the VPC.
*   **--service-account**: The service account that is used to launch the VM
    instance that builds the custom Dataproc image. The scope of this service
    account defaults to "/auth/cloud-platform", which gives the VM instance
    access to all cloud platform services that are granted by IAM roles.
    Note: IAM role must allow the VM instance to access GCS bucket in order to
    access scripts and write logs.
*   **--extra-sources**: Additional files/directories uploaded along with
    customization script. This argument is evaluated to a json dictionary.
*   **--disk-size**: The size in GB of the disk attached to the VM instance used
    to build custom image. The default is `30` GB.
*   **--accelerator**: The accelerators (e.g. GPUs) attached to the VM instance
    used to build custom image. This flag supports the same
    [values](https://cloud.google.com/sdk/gcloud/reference/compute/instances/create#--accelerator)
    as `gcloud compute instances create --accelerator` flag. By default no
    accelerators are attached.
*   **--base-image-uri**: The partial image URI for the base Dataproc image. The
    customization script will be executed on top of this image instead of an
    out-of-the-box Dataproc image. This image must be a valid Dataproc image.
    The format of the partial image URI is the following:
    `projects/<project_id>/global/images/<image_name>`.
*   **--storage-location**: The storage location (e.g. US, us-central1) of the
    custom GCE image. This flag supports the same
    [values](https://cloud.google.com/sdk/gcloud/reference/compute/images/create#--storage-location)
    as `gcloud compute images create --storage-location` flag. If not specified,
    the default GCE image storage location is used.
*   **--shutdown-instance-timer-sec**: The time to wait in seconds before
    shutting down the VM instance. This value may need to be increased if your
    init script generates a lot of output on stdout. If not specified, the
    default value of 300 seconds will be used.
*   **--dry-run**: Dry run mode which only validates input and generates
    workflow script without creating image. Disabled by default.
*   **--trusted-cert**: A certificate in DER format to be inserted
    into the custom image's EFI boot sector. See
    `examples/secure-boot/README.md` for how to generate one. This argument
    is mutually exclusive with `--base-image-family`.
*   **--metadata**: VM metadata which can be read by the customization script
    with `/usr/share/google/get_metadata_value attributes/<key>` at runtime. The
    value of this flag takes the form of `key1=value1,key2=value2,...`. If the
    value includes special characters (e.g., `=`, `,` or spaces) which needs to
    be escaped, consider encoding the value, then decode it back in the
    customization script. See more information about VM metadata on
    https://cloud.google.com/sdk/gcloud/reference/compute/instances/create.
*   **--optional-components**: List of optional components to install in the
    image. Supported for 2.3+ DPGCE images. The valid optional components are
    SOLR,RANGER,TRINO,DOCKER,FLINK,HIVE_WEBHCAT,ZEPPELIN,HUDI,ICEBERG,PIG.

#### Overriding cluster properties with a custom image

You can use custom images to overwrite any
[cluster properties](https://cloud.google.com/dataproc/docs/concepts/configuring-clusters/cluster-properties)
set during cluster creation. If a user creates a cluster with your custom image
but sets cluster properties different from those you set with your custom image,
your custom image cluster property settings will take precedence.

To set cluster properties with your custom image:

In your custom image
[customization script](https://cloud.google.com/dataproc/docs/guides/dataproc-images#running_the_code),
create a `dataproc.custom.properties` file in `/etc/google-dataproc`, then set
cluster property values in the file.

*   Sample `dataproc.custom.properties` file contents:

    ```shell
    dataproc.conscrypt.provider.enable=true
    dataproc.logging.stackdriver.enable=false
    ```

*   Sample customization script file-creation snippet to override two cluster
    properties:

    ```shell
    cat <<EOF >/etc/google-dataproc/dataproc.custom.properties
    dataproc.conscrypt.provider.enable=true
    dataproc.logging.stackdriver.enable=false
    EOF
    ```

### Examples

#### Create a custom image

Create a custom image with name `custom-image-1-5-9` with Dataproc version
`1.5.9-debian10`:

```shell
python generate_custom_image.py \
    --image-name custom-image-1-5-9 \
    --dataproc-version 1.5.9-debian10 \
    --customization-script ~/custom-script.sh \
    --metadata 'key1=value1,key2=value2' \
    --zone us-central1-f \
    --gcs-bucket gs://my-test-bucket
```

#### Create a custom image without running smoke test

```shell
python generate_custom_image.py \
    --image-name custom-image-1-5-9 \
    --dataproc-version 1.5.9-debian10 \
    --customization-script ~/custom-script.sh \
    --zone us-central1-f \
    --gcs-bucket gs://my-test-bucket \
    --no-smoke-test
```
====== Filename: ./generate_custom_image.py ======
# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#            http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Generate custom Dataproc image.

This python script is used to generate a custom Dataproc image for the user.

With the required arguments such as custom install packages script and
Dataproc version, this script will run the following steps in order:
  1. Get user's gcloud project ID.
  2. Get Dataproc's base image name with Dataproc version.
  3. Run Shell script to create a custom Dataproc image.
    1. Create a disk with Dataproc's base image.
    2. Create an GCE instance with the disk.
    3. Run custom install packages script to install custom packages.
    4. Shutdown instance.
    5. Create custom Dataproc image from the disk.
  4. Set the custom image label (required for launching custom Dataproc image).
  5. Run a Dataproc workflow to smoke test the custom image.

Once this script is completed, the custom Dataproc image should be ready to use.

"""

import logging
import os
import subprocess
import sys

from custom_image_utils import args_inferer
from custom_image_utils import args_parser
from custom_image_utils import expiration_notifier
from custom_image_utils import image_labeller
from custom_image_utils import shell_image_creator
from custom_image_utils import smoke_test_runner

# Install the default root handler, then create this module's logger.
logging.basicConfig()
_LOG = logging.getLogger(__name__)
# Only WARN and above are emitted, so the _LOG.info() calls in this module are
# silent unless this level is lowered.
_LOG.setLevel(logging.WARN)


def parse_args(raw_args):
  """Turns raw command line tokens into the argument object used by the build.

  Args:
    raw_args: command line tokens without the program name (`main` passes
      `sys.argv[1:]`).

  Returns:
    The object produced by `args_parser.parse_args`, after it has been handed
    to `args_inferer.infer_args`.
  """

  parsed = args_parser.parse_args(raw_args)
  _LOG.info("Parsed args: {}".format(parsed))

  # infer_args is called for its effect on `parsed` (presumably it fills in
  # derived values in place); its return value is not used.
  args_inferer.infer_args(parsed)
  _LOG.info("Inferred args: {}".format(parsed))

  return parsed


def perform_sanity_checks(args):
  """Validates user input before any image-building work starts.

  Two checks are made: the customization script must be an existing regular
  file, and no image named `args.image_name` may already exist in
  `args.project_id`.

  Args:
    args: parsed arguments; `customization_script`, `image_name` and
      `project_id` are read.

  Raises:
    Exception: if `args.customization_script` is not a file.
    RuntimeError: if `gcloud compute images describe` exits with status 0,
      i.e. the image already exists.
  """
  _LOG.info("Performing sanity checks...")

  # Customization script
  if not os.path.isfile(args.customization_script):
    raise Exception("Invalid path to customization script: '{}' is not a file.".format(
        args.customization_script))

  # Check the image doesn't already exist. The command is passed as an argv
  # list and run without a shell, so characters in the image name or project
  # id are never interpreted as shell syntax.
  command = [
      "gcloud", "compute", "images", "describe",
      "{}".format(args.image_name),
      "--project={}".format(args.project_id),
  ]
  with open(os.devnull, 'w') as devnull:
    try:
      returncode = subprocess.call(command, stdout=devnull, stderr=devnull)
    except OSError as e:
      # gcloud could not be started at all. The earlier shell-based invocation
      # reported this as a non-zero exit status (treated as "image not found"),
      # so keep that outcome but make the situation visible.
      _LOG.warning(
          "Could not run gcloud to check for an existing image: %s", e)
      returncode = None

  if returncode == 0:
    raise RuntimeError("Image {} already exists.".format(args.image_name))

  _LOG.info("Passed sanity checks...")


def main():
  """Runs the whole custom image workflow for the current command line.

  After argument parsing, each stage receives the same argument object and
  the stages run strictly in the order listed below; an exception raised by
  any stage propagates and stops the remaining ones.
  """

  args = parse_args(sys.argv[1:])

  stages = (
      perform_sanity_checks,
      shell_image_creator.create,
      image_labeller.add_label,
      smoke_test_runner.run,
      expiration_notifier.notify,
  )
  for stage in stages:
    stage(args)


# Run the workflow only when executed as a script, not when imported.
if __name__ == "__main__":
  main()
====== Filename: ./env.json.zst ======
(�/�d�� �U#k��m�����Z��1�T���Љ��2R+  P �K K K ��g�V�)|��JoK@�}_�c#��-:B����/���<NE$��h+Y�� ��Q�r�MS�E 2]����B�DQ&Ӷ�[�Zw.M�`�&@Y����Vӝ�DO2�D*��-�����RV;��4��H�Y���,@�I`��Ls�b�<؝d���1l�tc�r[ItSW��a�Ysm7�U�\�����}��򥳋���@ʣD�����j�m	j&A�������[,e�O|g�|���:�nz#vn�/~�>��Б d_WW��|7=�V�XV�.|PW �-��r�h@�+) P±�z6����* h>�N>(����Y�%	Ř3�9	ka�𢧆�I3��@��Έ7�7�9\#߈�NrBA�6N|S������ ;T�>����====== Filename: ./scripts/customize_conda.sh ======
#!/usr/bin/env bash

# Copyright 2019 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#            http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# -e: exit on an unhandled failure; -u: unset variables are errors;
# -x: trace each command; pipefail: a pipeline fails if any stage fails.
set -euxo pipefail

# This customization-script can be used to customize the conda environment.
# It expects the following metadata:
#
#   conda-component: (Required) Must be either ANACONDA or MINICONDA3. Please
#   make sure the base image supports the component passed here, else the
#   script will fail. Anaconda is not supported on 2.0 images. For information
#   on Anaconda vs Miniconda, refer to Miniconda's latest documentation
#   https://docs.conda.io/en/latest/miniconda.html
#
#   conda-env-config-uri: (Optional) Must be a GCS URI to the yaml config
#   file.
#
#   conda-packages: (Optional) A list of conda packages with versions to be
#   installed in the base environment. Must be of the format
#   <pkg1>:<version1>#<pkg2>:<version2>...
#
#   pip-packages: (Optional) A list of pip packages with versions to be
#   installed in the base environment. Must be of the format
#   <pkg1>:<version1>#<pkg2>:<version2>...
#
# conda-env-config-uri is mutually exclusive with conda-packages and
# pip-packages. If both are provided, the script will fail.
# If environment config file does not contain name of the environment, the name
# "custom" will be used by default.
#
#
# Examples
#
# The following example extracts config file from your environment, copies it to
# your GCS bucket and uses it to create a cluster.
#
# For gcloud SDK < 402, use `gsutil` instead of `gcloud storage`
#
#   conda env export --name=<env-name> > environment.yaml
#   gcloud storage cp environment.yaml gs://<bucket-directory-path>/environment.yaml
#   python generate_custom_image.py \
#    --image-name <image-name> \
#    --dataproc-version "1.5.34-debian10" \
#    --customization-script scripts/customize_conda.sh \
#    --zone <zone> \
#    --gcs-bucket gs://<bucket-directory-path> \
#    --metadata 'conda-component=MINICONDA3,conda-env-config-uri=gs://<file-path>/environment.yaml'
#
#
# The following example installs the specified conda and pip packages into the
# base environment.
# python generate_custom_image.py \
#    --image-name <image-name> \
#    --dataproc-version "1.5.34-debian10" \
#    --customization-script scripts/customize_conda.sh \
#    --zone <zone> \
#    --gcs-bucket gs://<bucket-path> \
#    --metadata 'conda-component=MINICONDA3,conda-packages=pytorch:1.4.0#visions:0.7.1,pip-packages=tokenizers:0.10.1#numpy:1.19.2'


# Entry point: reads the customization settings from instance metadata,
# validates them, and dispatches to the config-file or package-list flow.
# A metadata key that cannot be read is treated as an empty value.
function customize_conda() {
  local -r metadata_cmd=/usr/share/google/get_metadata_value
  local conda_component conda_env_config_uri conda_packages pip_packages
  local conda_bin_dir

  conda_component=$("${metadata_cmd}" attributes/conda-component || true)
  conda_env_config_uri=$("${metadata_cmd}" attributes/conda-env-config-uri || true)
  conda_packages=$("${metadata_cmd}" attributes/conda-packages || true)
  pip_packages=$("${metadata_cmd}" attributes/pip-packages || true)

  validate_conda_component "${conda_component}"

  # A config file describes the whole environment, so it cannot be combined
  # with explicit package lists.
  if [[ -n "${conda_env_config_uri}" ]] \
    && [[ -n "${conda_packages}" || -n "${pip_packages}" ]]; then
    echo "conda-env-config-uri is mutually exclusive with conda-packages and pip-packages."
    exit 1
  fi

  # Map the component name to the directory holding its binaries.
  case "${conda_component}" in
    ANACONDA)
      conda_bin_dir="/opt/conda/anaconda/bin"
      ;;
    MINICONDA3)
      conda_bin_dir="/opt/conda/miniconda3/bin"
      ;;
  esac

  if [[ -z "${conda_env_config_uri}" ]]; then
    customize_with_package_list "${conda_bin_dir}" "${conda_packages}" "${pip_packages}"
  else
    customize_with_config_file "${conda_bin_dir}" "${conda_env_config_uri}"
  fi
}

# Exits the script with status 1 unless $1 is exactly ANACONDA or MINICONDA3.
# An empty value gets its own message because it means the metadata key was
# not provided at all.
function validate_conda_component() {
  local -r component=$1

  case "${component}" in
    '')
      echo "Expected metadata conda-component not found"
      exit 1
      ;;
    ANACONDA | MINICONDA3)
      # Supported component: nothing to do.
      ;;
    *)
      echo "Metadata conda-component should either be ANACONDA or MINICONDA3"
      exit 1
      ;;
  esac
}

# version_le A B: succeeds when A sorts first (or ties with B) under
# `sort -V` version ordering.
function version_le() {
  [[ "$(echo -e "$1\n$2" | sort -V | head -n1)" == "$1" ]]
}

# version_lt A B: succeeds when A is strictly lower than B, i.e. the two
# strings differ and A sorts first.
function version_lt() {
  if [[ "$1" == "$2" ]]; then
    return 1
  fi
  version_le "$1" "$2"
}

# With the 402.0.0 release of gcloud sdk, `gcloud storage` can be
# used as a more performant replacement for `gsutil`
#
# gsutil_cmd holds the copy tool as a space-separated command prefix; callers
# expand it unquoted so that it splits into the command and its arguments.
gsutil_cmd="gcloud storage"
# Extract the text following "Google Cloud SDK " from `gcloud --version`.
gcloud_sdk_version="$(gcloud --version | awk -F'SDK ' '/Google Cloud SDK/ {print $2}')"
if version_lt "${gcloud_sdk_version}" "402.0.0" ; then
  # `command -v` is a shell builtin, unlike the external `which` utility.
  gsutil_cmd="$(command -v gsutil) -o GSUtil:check_hashes=never"
fi

# Creates and activates a conda environment described by a YAML file in GCS.
#
# Globals:
#   gsutil_cmd (read): copy tool prefix; expanded unquoted on purpose so a
#     multi-word value splits into the command and its arguments.
# Arguments:
#   $1 - directory containing the conda binaries
#   $2 - GCS URI of the environment config file
#
# The environment name is the value of the first `name:` key in the file;
# when the file has none, "custom" is used.
function customize_with_config_file() {
  local -r conda_bin_dir=$1
  local -r conda_env_config_uri=$2
  local temp_config_file
  local conda_env_name
  temp_config_file=$(mktemp /tmp/conda_env_XXX.yaml)
  ${gsutil_cmd} cp "${conda_env_config_uri}" "${temp_config_file}"
  # A single awk is used instead of `grep | awk`: grep exits non-zero when no
  # line matches, which under `set -e -o pipefail` aborted the script before
  # the default below could be applied. awk exits 0 with empty output instead.
  conda_env_name="$(awk '$1 == "name:" { print $2; exit }' "${temp_config_file}")"
  if [[ -z "${conda_env_name}" ]]; then
    conda_env_name="custom"
  fi
  create_and_activate_environment "${conda_bin_dir}" "${conda_env_name}" "${temp_config_file}"
}

# Creates a conda environment from a config file, activates it in the current
# shell, and records its name for later use.
#
# Arguments:
#   $1 - directory containing the `conda` binary and the `activate` script
#   $2 - name to give the new environment
#   $3 - path to the environment config file passed to `conda env create`
function create_and_activate_environment() {
  local -r conda_bin_dir=$1
  local -r conda_env_name=$2
  local -r conda_env_config=$3
  "${conda_bin_dir}/conda" env create --quiet --name="${conda_env_name}" --file="${conda_env_config}"
  # `activate` is sourced, so it runs inside this shell rather than a child.
  source "${conda_bin_dir}/activate" "${conda_env_name}"

  # Set property conda.env, which can be used during activate of the conda
  # component to activate the right environment.
  # The line is appended, so existing entries in the file are kept.
  local -r conda_properties_path=/etc/google-dataproc/conda.properties
  echo "conda.env=$conda_env_name" >> "${conda_properties_path}"
}

# Installs explicitly listed packages into the base environment.
#
# Arguments:
#   $1 - directory containing the `conda` and `pip` binaries
#   $2 - conda packages as <pkg1>:<version1>#<pkg2>:<version2>... (may be empty)
#   $3 - pip packages in the same format (may be empty)
#
# Each list is rewritten to <pkg>==<version> form, split on '#', validated,
# and then installed. An empty list is skipped.
function customize_with_package_list() {
  local -r conda_bin_dir=$1
  local conda_packages=$2
  local pip_packages=$3
  local -a packages

  if [[ -n "${conda_packages}" ]]; then
    conda_packages="${conda_packages//:/==}"
    IFS='#' read -r -a packages <<< "${conda_packages}"
    validate_package_formats "${packages[@]}"

    # Conda will upgrade dependencies only if required, and fail if conflict
    # resolution with existing packages is not possible.
    "${conda_bin_dir}/conda" install "${packages[@]}" --yes
  fi

  if [[ -n "${pip_packages}" ]]; then
    pip_packages="${pip_packages//:/==}"
    IFS='#' read -r -a packages <<< "${pip_packages}"
    validate_package_formats "${packages[@]}"

    # Pip will upgrade dependencies only if required. Pip does not check for
    # conflicts and may result in inconsistent environment.
    "${conda_bin_dir}/pip" install -U --upgrade-strategy only-if-needed "${packages[@]}"
  fi
}

# Exits the script with status 1 if any argument is not of the form
# <name>==<version>, where <version> starts with a digit.
#
# Arguments:
#   $@ - package specifications to check
function validate_package_formats() {
  # `(\.[0-9]+)*` is an optional group of further dotted components. It used
  # to be written as a bracket expression, which required a second character
  # after the first digit and so rejected versions such as `pkg==1`.
  # The pattern is intentionally not anchored at the end, so suffixes such as
  # `1.0rc1` remain accepted.
  local -r regex='.+==[0-9]+(\.[0-9]+)*'
  local package
  for package in "$@"; do
    if ! [[ "${package}" =~ ${regex} ]]; then
      echo "Invalid package format ${package}"
      exit 1
    fi
  done
}

# Script entry point: run the customization with settings read from metadata.
customize_conda
====== Filename: ./tests/__init__.py ======
====== Filename: ./tests/test_create_custom_image.sh ======
#!/usr/bin/env bash

# Copyright 2019 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#            http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# -e: exit on an unhandled failure; -u: unset variables are errors;
# -x: trace each command; pipefail: a pipeline fails if any stage fails.
set -euxo pipefail

# Directory containing this script, and the repository root one level above.
readonly CURRENT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)
readonly REPO_DIR=$(realpath "${CURRENT_DIR}/..")

# Six random characters from [a-z0-9], appended to image names so that
# separate runs do not collide.
# NOTE(review): `head` exits after 6 bytes while `tr` is still writing, so
# under pipefail this pipeline likely reports a non-zero status. It appears
# harmless only because the status of `readonly VAR=$(...)` is that of
# `readonly` itself; confirm before splitting declaration and assignment.
readonly TEST_SUFFIX=$(tr -dc 'a-z0-9' </dev/urandom | head -c 6)
# Bucket passed as --gcs-bucket and zone passed as --zone in every test below.
readonly TEST_BUCKET="gs://dataproc-custom-images-presubmit"
readonly TEST_ZONE="us-central1-a"

# Builds a custom image on top of a Debian base selected via
# --dataproc-version.
test_debian_with_image_version() {
  local -r image_name="test-image-deb9-${TEST_SUFFIX}"
  local -a generator_args=(
    --image-name "${image_name}"
    --dataproc-version 1.4.15-debian9
    --customization-script "${REPO_DIR}/examples/customization_script.sh"
    --metadata "key1=value1,key2=value2"
    --zone "${TEST_ZONE}"
    --gcs-bucket "${TEST_BUCKET}"
    --shutdown-instance-timer-sec 10
  )

  printf '%s\n' "Creating custom Debian image: ${image_name}"
  python2 "${REPO_DIR}/generate_custom_image.py" "${generator_args[@]}"
}

# Builds a custom image on top of an Ubuntu base selected via an explicit
# --base-image-uri.
test_ubuntu_with_image_uri() {
  local -r image_name="test-image-ubu18-${TEST_SUFFIX}"
  local -a generator_args=(
    --image-name "${image_name}"
    --base-image-uri projects/cloud-dataproc/global/images/dataproc-1-4-ubu18-20190606-000000-rc01
    --customization-script "${REPO_DIR}/examples/customization_script.sh"
    --metadata "key1=value1,key2=value2"
    --zone "${TEST_ZONE}"
    --gcs-bucket "${TEST_BUCKET}"
    --shutdown-instance-timer-sec 10
  )

  printf '%s\n' "Creating custom Ubuntu image: ${image_name}"
  python2 "${REPO_DIR}/generate_custom_image.py" "${generator_args[@]}"
}

# Builds a custom image whose customization script is accompanied by an
# additional file passed through --extra-sources (a JSON dictionary).
test_extra_sources() {
  local -r image_name="test-image-extra-src-${TEST_SUFFIX}"
  local -r extra_sources="{\"extra/source.txt\": \"${REPO_DIR}/tests/data/extra_source.txt\"}"
  local -a generator_args=(
    --image-name "${image_name}"
    --dataproc-version 1.4.15-ubuntu18
    --customization-script "${REPO_DIR}/tests/data/customization_script_with_extra_sources.sh"
    --metadata "key1=value1,key2=value2"
    --extra-sources "${extra_sources}"
    --zone "${TEST_ZONE}"
    --gcs-bucket "${TEST_BUCKET}"
    --shutdown-instance-timer-sec 10
  )

  printf '%s\n' "Creating custom image: ${image_name}"
  python2 "${REPO_DIR}/generate_custom_image.py" "${generator_args[@]}"
}

# Run every scenario in order. The script runs with `set -e`, so the first
# failing build stops the run and the success message is not printed.
test_debian_with_image_version
test_ubuntu_with_image_uri
test_extra_sources

echo "All custom image tests succeeded"
====== Filename: ./tests/test_args_parser.py ======
# Copyright 2019 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#            http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest
import exceptions
from custom_image_utils import args_parser


class TestArgsParser(unittest.TestCase):

  def test_missing_required_args(self):
    """Verifies it fails if missing required args."""
    with self.assertRaises(SystemExit) as e:
      args_parser.parse_args([])

  def test_minimal_required_args(self):
    """Verifies it succeeds if all required args are present."""
    customization_script = '/tmp/my-script.sh'
    gcs_bucket = 'gs://my-bucket'
    image_name = 'my-image'
    zone = 'us-west1-a'

    args = args_parser.parse_args([
        '--image-name', image_name,
        '--customization-script', customization_script,
        '--zone', zone,
        '--gcs-bucket', gcs_bucket])

    expected_result = self._make_expected_result(
        accelerator=None,
        base_image_family="None",
        base_image_uri="None",
        customization_script="'{}'".format(customization_script),
        dataproc_version="None",
        disk_size="20",
        dry_run=False,
        extra_sources="{}",
        family="'dataproc-custom-image'",
        gcs_bucket="'{}'".format(gcs_bucket),
        image_name="'{}'".format(image_name),
        machine_type="'n1-standard-1'",
        network="'{}'".format(''),
        no_external_ip="False",
        no_smoke_test="False",
        oauth="None",
        project_id="None",
        service_account="'default'",
        shutdown_instance_timer_sec="300",
        storage_location=None,
        subnetwork="''",
        zone="'{}'".format(zone),
        metadata=None
    )
    self.assertEqual(str(args), expected_result)

  def test_optional_args(self):
    """Verifies it succeeds with optional arguments specified."""
    accelerator = 'type=nvidia-tesla-v100,count=2'
    customization_script = '/tmp/my-script.sh'
    dataproc_version = '1.4.5-debian9'
    disk_size = 40
    dry_run = True
    family = 'debian9'
    gcs_bucket = 'gs://my-bucket'
    image_name = 'my-image'
    machine_type = 'n1-standard-4'
    network = 'my-network'
    no_external_ip = True
    no_smoke_test = True
    oauth = 'xyz'
    project_id = 'my-project'
    service_account = "my-service-account"
    shutdown_instance_timer_sec = 567
    storage_location = 'us-east1'
    subnetwork = 'my-subnetwork'
    zone = 'us-west1-a'
    metadata = 'key1=value1,key2=value2'

    args = args_parser.parse_args([
        '--accelerator', str(accelerator),
        '--customization-script', customization_script,
        '--dataproc-version', dataproc_version,
        '--disk-size', str(disk_size),
        '--dry-run',
        '--family', family,
        '--gcs-bucket', gcs_bucket,
        '--image-name', image_name,
        '--machine-type', machine_type,
        '--network', network,
        '--no-external-ip',
        '--no-smoke-test',
        '--oauth', oauth,
        '--project-id', project_id,
        '--service-account', service_account,
        '--shutdown-instance-timer-sec', str(shutdown_instance_timer_sec),
        '--storage-location', str(storage_location),
        '--subnetwork', subnetwork,
        '--zone', zone,
        '--metadata', metadata,
    ])

    expected_result = self._make_expected_result(
        accelerator="'{}'".format(accelerator),
        base_image_family="None",        
        base_image_uri="None",
        customization_script="'{}'".format(customization_script),
        dataproc_version="'{}'".format(dataproc_version),
        disk_size="{}".format(disk_size),
        dry_run="{}".format(dry_run),
        extra_sources="{}",
        family="'{}'".format(family),
        gcs_bucket="'{}'".format(gcs_bucket),
        image_name="'{}'".format(image_name),
        machine_type="'{}'".format(machine_type),
        metadata="'{}'".format(metadata),
        network="'{}'".format(network),
        no_external_ip="{}".format(no_external_ip),
        no_smoke_test="{}".format(no_smoke_test),
        oauth="'{}'".format(oauth),
        project_id="'{}'".format(project_id),
        service_account="'{}'".format(service_account),
        shutdown_instance_timer_sec="{}".format(shutdown_instance_timer_sec),
        storage_location="'{}'".format(storage_location),
        subnetwork="'{}'".format(subnetwork),
        zone="'{}'".format(zone),
    )
    self.assertEqual(str(args), expected_result)

  def test_inferred_subminor_versions(self):
    """Verifies it succeeds if inferred/unspecified subminor version is correctly formatted."""
    customization_script = '/tmp/my-script.sh'
    gcs_bucket = 'gs://my-bucket'
    image_name = 'my-image'
    zone = 'us-west1-a'

    def _args_parsed(dataproc_version):
      return args_parser.parse_args([
          '--image-name', image_name,
          '--dataproc-version', dataproc_version,
          '--customization-script', customization_script,
          '--zone', zone,
          '--gcs-bucket', gcs_bucket])

    def _expected_result(dataproc_version):
       return self._make_expected_result(
          accelerator=None,
          base_image_family="None",
          base_image_uri="None",
          customization_script="'{}'".format(customization_script),
          dataproc_version="'{}'".format(dataproc_version),
          disk_size="20",
          dry_run=False,
          extra_sources="{}",
          family="'dataproc-custom-image'",
          gcs_bucket="'{}'".format(gcs_bucket),
          image_name="'{}'".format(image_name),
          machine_type="'n1-standard-1'",
          network="'{}'".format(''),
          no_external_ip="False",
          no_smoke_test="False",
          oauth="None",
          project_id="None",
          service_account="'default'",
          shutdown_instance_timer_sec="300",
          storage_location=None,
          subnetwork="''",
          zone="'{}'".format(zone),
          metadata=None
    )

    def _args_exception(dataproc_version):
      # Checks that inputs produce an exception
      try:
        _args_parsed(dataproc_version)
      except SystemExit as e:
        self.assertEqual(e.__class__, exceptions.SystemExit)
      else:
        raise ValueError("Exception not raised")

    self.assertEqual(str(_args_parsed('1.5-debian10')), _expected_result('1.5-debian10'))
    self.assertEqual(str(_args_parsed('1.3-ubuntu18')), _expected_result('1.3-ubuntu18'))
    self.assertEqual(str(_args_parsed('1.3-centos8')), _expected_result('1.3-centos8'))

    invalid_dataproc_versions = ['*.*.*-debian10', '1.**.*-debian10', '1.*.8*-debian10', '11.*.*-debian', 
      '1.*-debian10', '1.5.*-debian10', '1.5.-debian10', '1.*.*-debian10']
    try:
      for version in invalid_dataproc_versions:
        _args_exception(version)
    except ValueError as e:
      raise e

  def _make_expected_result(
      self,
      accelerator,
      base_image_family,      
      base_image_uri,
      customization_script,
      dataproc_version,
      disk_size,
      dry_run,
      extra_sources,
      family,
      gcs_bucket,
      image_name,
      machine_type,
      metadata,
      network,
      no_external_ip,
      no_smoke_test,
      oauth,
      project_id,
      service_account,
      shutdown_instance_timer_sec,
      storage_location,
      subnetwork,
      zone):
    expected_result_template = (
        "Namespace("
        "accelerator={}, "
        "base_image_family={}, "        
        "base_image_uri={}, "
        "customization_script={}, "
        "dataproc_version={}, "
        "disk_size={}, "
        "dry_run={}, "
        "extra_sources={}, "
        "family={}, "
        "gcs_bucket={}, "
        "image_name={}, "
        "machine_type={}, "
        "metadata={}, "
        "network={}, "
        "no_external_ip={}, "
        "no_smoke_test={}, "
        "oauth={}, "
        "project_id={}, "
        "service_account={}, "
        "shutdown_instance_timer_sec={}, "
        "storage_location={}, "
        "subnetwork={}, "
        "zone={})")
    return expected_result_template.format(
        accelerator,
        base_image_family,        
        base_image_uri,
        customization_script,
        dataproc_version,
        disk_size,
        dry_run,
        extra_sources,
        family,
        gcs_bucket,
        image_name,
        machine_type,
        metadata,
        network,
        no_external_ip,
        no_smoke_test,
        oauth,
        project_id,
        service_account,
        shutdown_instance_timer_sec,
        storage_location,
        subnetwork,
        zone)

# Run the tests in this module when the file is executed directly.
if __name__ == '__main__':
    unittest.main()
====== Filename: ./tests/test_customize_conda_script.sh ======
#!/usr/bin/env bash

# Copyright 2019 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#            http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Abort on errors, unset variables and failed pipelines; trace every command.
set -euxo pipefail

# Absolute directory holding this script, and the repository root one level up.
readonly CURRENT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)
readonly REPO_DIR=$(realpath "${CURRENT_DIR}/..")

# Six random characters from [a-z0-9], appended to the image names below.
readonly TEST_SUFFIX=$(tr -dc 'a-z0-9' </dev/urandom | head -c 6)
# Bucket and zone handed to generate_custom_image.py by every test.
readonly TEST_BUCKET="gs://dataproc-custom-images-presubmit"
readonly TEST_ZONE="us-central1-a"

# Runs generate_custom_image.py with scripts/customize_conda.sh and metadata
# that points at a conda environment config file.
# Globals: TEST_SUFFIX, REPO_DIR, TEST_ZONE, TEST_BUCKET (read)
test_script_with_environment_config_metadata() {
  local -r target_image="test-image-custom-conda-env-${TEST_SUFFIX}"
  echo "Creating custom image: ${target_image}"

  # Flags are collected in an array so each one stays a separate word.
  local -ra builder_flags=(
    --image-name "${target_image}"
    --dataproc-version 1.5.34-debian10
    --customization-script "${REPO_DIR}/scripts/customize_conda.sh"
    --metadata "conda-component=MINICONDA3,conda-env-config-uri=gs://dataproc-integration-test/conda-integration-test/test-env-15.yaml"
    --zone "${TEST_ZONE}"
    --gcs-bucket "${TEST_BUCKET}"
    --shutdown-instance-timer-sec 10
  )
  python2 "${REPO_DIR}/generate_custom_image.py" "${builder_flags[@]}"
}

# Runs generate_custom_image.py with scripts/customize_conda.sh and metadata
# that lists conda and pip packages explicitly.
# Globals: TEST_SUFFIX, REPO_DIR, TEST_ZONE, TEST_BUCKET (read)
test_script_with_packages_metadata() {
  local -r target_image="test-image-custom-conda-packages-${TEST_SUFFIX}"
  echo "Creating custom image: ${target_image}"

  # Flags are collected in an array so each one stays a separate word.
  local -ra builder_flags=(
    --image-name "${target_image}"
    --dataproc-version 1.5.34-debian10
    --customization-script "${REPO_DIR}/scripts/customize_conda.sh"
    --metadata "conda-component=MINICONDA3,conda-packages=pytorch:1.4.0_visions:0.7.1,pip-packages=tokenizers:0.10.1_numpy:1.19.2"
    --zone "${TEST_ZONE}"
    --gcs-bucket "${TEST_BUCKET}"
    --shutdown-instance-timer-sec 10
  )
  python2 "${REPO_DIR}/generate_custom_image.py" "${builder_flags[@]}"
}

# Run both tests in order; the success message is printed only after both
# calls have returned.
test_script_with_environment_config_metadata
test_script_with_packages_metadata

echo "All customize conda script tests succeeded"
====== Filename: ./tests/test_shell_script_generator.py ======
# Copyright 2019 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#            http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

from custom_image_utils import shell_script_generator

# Golden output: the exact script text that test_generate_shell_script below
# compares against the result of shell_script_generator.Generator().generate().
# Whitespace inside the string (including runs of spaces) is significant.
_expected_script = """
#!/usr/bin/env bash

# Script for creating Dataproc custom image.

set -euxo pipefail

RED='\\e[0;31m'
GREEN='\\e[0;32m'
NC='\\e[0m'

function exit_handler() {
  echo 'Cleaning up before exiting.'

  if [[ -f /tmp/custom-image-my-image-20190611-160823/vm_created ]]; then
    echo 'Deleting VM instance.'
    gcloud compute instances delete my-image-install         --project=my-project --zone=us-west1-a -q
  elif [[ -f /tmp/custom-image-my-image-20190611-160823/disk_created ]]; then
    echo 'Deleting disk.'
    gcloud compute disks delete my-image-install --project=my-project --zone=us-west1-a -q
  fi

  echo 'Uploading local logs to GCS bucket.'
  gsutil -m rsync -r /tmp/custom-image-my-image-20190611-160823/logs/ gs://my-bucket/custom-image-my-image-20190611-160823/logs/

  if [[ -f /tmp/custom-image-my-image-20190611-160823/image_created ]]; then
    echo -e "${GREEN}Workflow succeeded, check logs at /tmp/custom-image-my-image-20190611-160823/logs/ or gs://my-bucket/custom-image-my-image-20190611-160823/logs/${NC}"
    exit 0
  else
    echo -e "${RED}Workflow failed, check logs at /tmp/custom-image-my-image-20190611-160823/logs/ or gs://my-bucket/custom-image-my-image-20190611-160823/logs/${NC}"
    exit 1
  fi
}

function main() {
  echo 'Uploading files to GCS bucket.'
  declare -a sources_k=([0]='run.sh' [1]='init_actions.sh' [2]='ext'\\''ra_src.txt')
  declare -a sources_v=([0]='startup_script/run.sh' [1]='/tmp/my-script.sh' [2]='/path/to/extra.txt')
  for i in "${!sources_k[@]}"; do
    gsutil cp "${sources_v[i]}" "gs://my-bucket/custom-image-my-image-20190611-160823/sources/${sources_k[i]}"
  done

  echo 'Creating disk.'
  if [[ 'projects/my-dataproc-project/global/images/family/debian-10' = '' ||  'projects/my-dataproc-project/global/images/family/debian-10' = 'None' ]]; then
     IMAGE_SOURCE="--image=projects/cloud-dataproc/global/images/dataproc-1-4-deb9-20190510-000000-rc01"
  else
     IMAGE_SOURCE="--image-family=projects/my-dataproc-project/global/images/family/debian-10"
  fi
  
  gcloud compute disks create my-image-install       --project=my-project       --zone=us-west1-a       ${IMAGE_SOURCE}       --type=pd-ssd       --size=40GB

  touch "/tmp/custom-image-my-image-20190611-160823/disk_created"

  echo 'Creating VM instance to run customization script.'
  gcloud compute instances create my-image-install       --project=my-project       --zone=us-west1-a              --subnet=my-subnet       --no-address       --machine-type=n1-standard-2       --disk=auto-delete=yes,boot=yes,mode=rw,name=my-image-install       --accelerator=type=nvidia-tesla-v100,count=2 --maintenance-policy terminate       --service-account=my-service-account       --scopes=cloud-platform       --metadata=shutdown-timer-in-sec=500,custom-sources-path=gs://my-bucket/custom-image-my-image-20190611-160823/sources,key1=value1,key2=value2       --metadata-from-file startup-script=startup_script/run.sh
  touch /tmp/custom-image-my-image-20190611-160823/vm_created

  echo 'Waiting for customization script to finish and VM shutdown.'
  gcloud compute instances tail-serial-port-output my-image-install       --project=my-project       --zone=us-west1-a       --port=1 2>&1       | grep 'startup-script'       | tee /tmp/custom-image-my-image-20190611-160823/logs/startup-script.log       || true

  echo 'Checking customization script result.'
  if grep 'BuildFailed:' /tmp/custom-image-my-image-20190611-160823/logs/startup-script.log; then
    echo -e "${RED}Customization script failed.${NC}"
    exit 1
  elif grep 'BuildSucceeded:' /tmp/custom-image-my-image-20190611-160823/logs/startup-script.log; then
    echo -e "${GREEN}Customization script succeeded.${NC}"
  else
    echo 'Unable to determine the customization script result.'
    exit 1
  fi

  echo 'Creating custom image.'
  gcloud compute images create my-image       --project=my-project       --source-disk-zone=us-west1-a       --source-disk=my-image-install       --storage-location=us-east1       --family=debian9
  touch /tmp/custom-image-my-image-20190611-160823/image_created
}

trap exit_handler EXIT
mkdir -p /tmp/custom-image-my-image-20190611-160823/logs
main "$@" 2>&1 | tee /tmp/custom-image-my-image-20190611-160823/logs/workflow.log
"""


class TestShellScriptGenerator(unittest.TestCase):
  """Golden-file style test for shell_script_generator.Generator."""

  def test_generate_shell_script(self):
    """Generates a script from fixed inputs and compares it to _expected_script."""
    # Fixed generator inputs; the extra-source key deliberately contains a
    # single quote, which shows up escaped in the expected script.
    generator_inputs = {
        'run_id': 'custom-image-my-image-20190611-160823',
        'family': 'debian9',
        'image_name': 'my-image',
        'customization_script': '/tmp/my-script.sh',
        'metadata': 'key1=value1,key2=value2',
        'extra_sources': {"ext'ra_src.txt": "/path/to/extra.txt"},
        'machine_type': 'n1-standard-2',
        'disk_size': 40,
        'accelerator': 'type=nvidia-tesla-v100,count=2',
        'gcs_bucket': 'gs://my-bucket',
        'network': 'my-network',
        'subnetwork': 'my-subnet',
        'no_external_ip': True,
        'zone': 'us-west1-a',
        'dataproc_base_image':
          'projects/cloud-dataproc/global/images/dataproc-1-4-deb9-20190510-000000-rc01',
        'service_account': 'my-service-account',
        'oauth': '',
        'project_id': 'my-project',
        'storage_location': 'us-east1',
        'shutdown_timer_in_sec': 500,
        'base_image_family': 'projects/my-dataproc-project/global/images/family/debian-10'
    }

    generator = shell_script_generator.Generator()
    rendered_script = generator.generate(generator_inputs)

    self.assertEqual(rendered_script, _expected_script)


# Run the tests in this module when the file is executed directly.
if __name__ == '__main__':
  unittest.main()
====== Filename: ./tests/data/customization_script_with_extra_sources.sh ======
#!/usr/bin/env bash

# Copyright 2019 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#            http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# Print the extra source file. The path is relative to the working directory;
# presumably that is where the image build stages extra sources — confirm.
cat extra/source.txt
====== Filename: ./tests/data/extra_source.txt ======
# Copyright 2019 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#            http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


Example extra source file
====== Filename: ./tests/test_infer_subminor_version.sh ======
#!/usr/bin/env bash

# Copyright 2019 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#            http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Abort on errors, unset variables and failed pipelines; trace every command.
set -euxo pipefail

# Absolute directory holding this script, and the repository root one level up.
readonly CURRENT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)
readonly REPO_DIR=$(realpath "${CURRENT_DIR}/..")

# Six random characters from [a-z0-9], appended to the image names below.
readonly TEST_SUFFIX=$(tr -dc 'a-z0-9' </dev/urandom | head -c 6)
# Bucket and zone handed to generate_custom_image.py by every test.
readonly TEST_BUCKET="gs://dataproc-custom-images-presubmit"
readonly TEST_ZONE="us-central1-a"

# Dry-runs generate_custom_image.py for '1.5-debian10', i.e. without a
# subminor version in --dataproc-version.
# Globals: TEST_SUFFIX, REPO_DIR, TEST_ZONE, TEST_BUCKET (read)
test_inferred_subminor_version_debian() {
  local -r target_image="test-image-infer-subminor-${TEST_SUFFIX}"
  echo "Creating custom debian image with inferred subminor version: ${target_image}"

  # Expected image - 1.5.35-debian10 - dataproc-1-5-deb10-20210413-000000-rc01, as of 2021-06-30.
  local -ra builder_flags=(
    --image-name "${target_image}"
    --dataproc-version '1.5-debian10'
    --customization-script "${REPO_DIR}/examples/customization_script.sh"
    --zone "${TEST_ZONE}"
    --gcs-bucket "${TEST_BUCKET}"
    --shutdown-instance-timer-sec 10
    --dry-run
  )
  python2 "${REPO_DIR}/generate_custom_image.py" "${builder_flags[@]}"
}

# Dry-runs generate_custom_image.py for '1.5-ubuntu18', i.e. without a
# subminor version in --dataproc-version.
# Globals: TEST_SUFFIX, REPO_DIR, TEST_ZONE, TEST_BUCKET (read)
test_inferred_subminor_version_ubuntu() {
  local -r target_image="test-image-wildcard-${TEST_SUFFIX}"
  echo "Creating custom ubuntu image with inferred subminor version: ${target_image}"

  local -ra builder_flags=(
    --image-name "${target_image}"
    --dataproc-version '1.5-ubuntu18'
    --customization-script "${REPO_DIR}/examples/customization_script.sh"
    --zone "${TEST_ZONE}"
    --gcs-bucket "${TEST_BUCKET}"
    --shutdown-instance-timer-sec 10
    --dry-run
  )
  python2 "${REPO_DIR}/generate_custom_image.py" "${builder_flags[@]}"
}

# Dry-runs generate_custom_image.py for '1.5-centos8', i.e. without a
# subminor version in --dataproc-version.
# Globals: TEST_SUFFIX, REPO_DIR, TEST_ZONE, TEST_BUCKET (read)
test_inferred_subminor_version_centos() {
  local -r target_image="test-image-wildcard-${TEST_SUFFIX}"
  echo "Creating custom centos image with inferred subminor version: ${target_image}"

  local -ra builder_flags=(
    --image-name "${target_image}"
    --dataproc-version '1.5-centos8'
    --customization-script "${REPO_DIR}/examples/customization_script.sh"
    --zone "${TEST_ZONE}"
    --gcs-bucket "${TEST_BUCKET}"
    --shutdown-instance-timer-sec 10
    --dry-run
  )
  python2 "${REPO_DIR}/generate_custom_image.py" "${builder_flags[@]}"
}

# Run the three dry-run tests in order: debian, ubuntu, centos.
test_inferred_subminor_version_debian
test_inferred_subminor_version_ubuntu
test_inferred_subminor_version_centos

echo "All custom image tests with unspecified subminor dataproc versions succeeded"====== Filename: ./LICENSE ======
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
====== Filename: ./Makefile ======
# Every target here names a command to run, not a file to build. Declaring
# them all phony keeps a same-named path in the working tree (for example a
# `tests` or `unit_tests` entry) from making a target look up to date.
.PHONY: default clean tests unit_tests integration_tests

# Default goal: remove stale bytecode, then run the unit tests.
default: clean unit_tests

# Remove compiled Python files left by earlier runs.
clean:
	rm -f custom_image_utils/*.pyc tests/*.pyc

# Discover and run the unittest-based tests.
unit_tests:
	python2 -m unittest discover

# Run the image-creation integration test script.
integration_tests:
	bash tests/test_create_custom_image.sh
====== Filename: ./examples/patch-log4j.sh ======
#!/usr/bin/env bash

set -euxo pipefail

# This script applies patches to log4j jars of version [2.0.0, 2.16.0)
# for CVE-2021-44228 in Dataproc custom images.

#######################################
# Removes JndiLookup.class from every log4j-core jar whose file name matches
# versions 2.0 - 2.15 (optionally with a patch component).
# Arguments:
#   $1 - optional directory tree to search; defaults to / (the whole system).
# Outputs:
#   Progress messages on stdout.
# Returns:
#   Exits the shell with status 1 as soon as one jar cannot be patched.
#######################################
function main() {
  local -r search_root="${1:-/}"
  # Keep the loop variable and the result list out of the global scope.
  local jar
  local -a jars=()

  echo "Searching for log4j jars of version [2.0.0, 2.16.0)..."
  # Best effort: a failing find is deliberately ignored (`|| true`); whatever
  # paths it printed are still patched.
  mapfile -t jars < <(find "${search_root}" -regextype egrep -regex ".*/log4j-core-2\.([0-9]|1[0-5])(\.[0-9]+)?\.jar$" || true)
  echo "Found ${#jars[@]} jars"
  for jar in "${jars[@]}"; do
   echo "Patching ${jar}"
   zip -q -d "${jar}" org/apache/logging/log4j/core/lookup/JndiLookup.class \
     || { echo "Failed patching ${jar}"; exit 1; }
   echo "Done with patching ${jar}"
  done

  echo "All done"
}

main "$@"
====== Filename: ./examples/customization_script.sh ======
#!/usr/bin/env bash

# Copyright 2019 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#            http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -euxo pipefail

# Read the two instance metadata attributes this example expects and log them.
METADATA1=$(/usr/share/google/get_metadata_value attributes/key1)
echo "Metadata key1=${METADATA1}"
METADATA2=$(/usr/share/google/get_metadata_value attributes/key2)
# The label names key2, matching the attribute whose value is printed.
echo "Metadata key2=${METADATA2}"

# Fail the customization when either attribute differs from the expected value.
if [[ "${METADATA1}" != "value1" || "${METADATA2}" != "value2" ]]; then
  echo "Unexpected metadata values"
  exit 1
fi

echo "Installing custom packages..."
apt-get -y update
apt-get install python-dev python-pip -y
pip install numpy
echo "Successfully installed custom packages."
====== Filename: ./examples/secure-boot/test.screenrc ======

# screen -L -t monitor 0 /bin/bash

screen -L -t 2.2-debian12 1 echo yes
screen -L -t 2.1-debian11 2 echo yes
screen -L -t 2.0-debian10 3 echo yes

screen -L -t 2.2-ubuntu22 4 echo yes
screen -L -t 2.1-ubuntu20 5 echo yes
screen -L -t 2.0-ubuntu18 6 echo yes

screen -L -t 2.2-rocky9   7 echo yes
screen -L -t 2.1-rocky8   8 echo yes
screen -L -t 2.0-rocky8   9 /bin/bash

====== Filename: ./examples/secure-boot/build-current-images.sh.cjac ======
#!/bin/bash

# Copyright 2024 Google LLC and contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This script creates a custom image pre-loaded with
#
# GPU drivers + cuda + rapids + cuDNN + nccl + tensorflow + pytorch + ipykernel + numba

# To run the script, the following will bootstrap
#
# git clone git@github.com:GoogleCloudDataproc/custom-images
# cd custom-images
# git checkout 2025.02
# cp examples/secure-boot/env.json.sample env.json
# vi env.json
# docker build -f Dockerfile -t custom-images-builder:latest .
# time docker run -it custom-images-builder:latest bash examples/secure-boot/build-current-images.sh


set -ex

function execute_with_retries() (
  set +x
  local -r cmd="$*"
  local install_log="${tmpdir}/install.log"

  for ((i = 0; i < 3; i++)); do
    set -x
    eval "$cmd" > "${install_log}" 2>&1 && retval=$? || { retval=$? ; cat "${install_log}" ; }
    set +x
    if [[ $retval == 0 ]] ; then return 0 ; fi
    sleep 5
  done
  return 1
)

#######################################
# Creates the customization service account when it is not listed yet and
# grants it the project, bucket, KMS, secret and service-account roles below.
# Globals:
#   GSA, SA_NAME, PROJECT_ID, BUCKET, KMS_KEY_URI (read)
#   public_secret_name, private_secret_name (read) - presumably defined by the
#     output of examples/secure-boot/create-key-pair.sh, which is eval'd here;
#     confirm against that script.
# Returns:
#   1, before any change is made, when KMS_KEY_URI does not contain
#   locations/<l>/keyRings/<r>/cryptoKeys/<k>.
#######################################
function configure_service_account() {
  local storage_object_role secret sm_role

  # KMS_KEY_URI =~ projects/.../locations/.../keyRings/.../cryptoKeys/...
  # Parsed with a bash regex so that no text taken from env.json is eval'd.
  local -r kms_uri_re='locations/([^/]+)/keyRings/([^/]+)/cryptoKeys/([^/]+)'
  if [[ ! "${KMS_KEY_URI}" =~ ${kms_uri_re} ]] ; then
    echo "KMS_KEY_URI must look like projects/.../locations/.../keyRings/.../cryptoKeys/..., got '${KMS_KEY_URI}'" >&2
    return 1
  fi
  local -r kms_location="${BASH_REMATCH[1]}"
  local -r kms_keyring="${BASH_REMATCH[2]}"
  local -r kms_key="${BASH_REMATCH[3]}"

  # Create service account
  if gcloud iam service-accounts list --filter email="${GSA}" 2>&1 | grep -q 'Listed 0 items.' ; then
    # Create service account for this purpose
    echo "creating pre-init customization service account ${GSA}"
    gcloud iam service-accounts create "${SA_NAME}" \
      --description="Service account for pre-init customization" \
      --display-name="${SA_NAME}"
  fi

  # Move any existing tls directory aside, then evaluate the shell statements
  # printed by the key-pair script.
  if [[ -d tls ]] ; then mv tls "tls-$(date +%s)" ; fi
  eval "$(bash examples/secure-boot/create-key-pair.sh)"

  execute_with_retries gcloud projects add-iam-policy-binding "${PROJECT_ID}" \
    --member="serviceAccount:${GSA}" \
    --role="roles/dataproc.worker" \
    --condition=None

  # Grant the service account access to buckets in this project
  # TODO: this is over-broad and should be limited only to the buckets
  # used by these clusters

  gsutil iam ch "serviceAccount:${GSA}:roles/storage.objectViewer" "gs://${BUCKET}"

  gcloud kms keys add-iam-policy-binding "${kms_key}" \
    --location "${kms_location}" \
    --keyring "${kms_keyring}" \
    --member "serviceAccount:${GSA}" \
    --role "roles/cloudkms.cryptoKeyEncrypterDecrypter"

  # --condition=None matches the other project-level bindings in this function.
  gcloud projects add-iam-policy-binding "${PROJECT_ID}" \
    --member="serviceAccount:${GSA}" \
    --role=roles/cloudkms.cryptoKeyDecrypter \
    --condition=None

  for storage_object_role in 'User' 'Creator' 'Viewer' ; do
    execute_with_retries gcloud projects add-iam-policy-binding "${PROJECT_ID}" \
      --member="serviceAccount:${GSA}" \
      --role="roles/storage.object${storage_object_role}" \
      --condition=None
  done

  for secret in "${public_secret_name}" "${private_secret_name}" ; do
    for sm_role in 'viewer' 'secretAccessor' ; do
      # Grant the service account permission to list and access the secret
      execute_with_retries gcloud secrets -q add-iam-policy-binding "${secret}" \
        --member="serviceAccount:${GSA}" \
        --role="roles/secretmanager.${sm_role}" \
        --condition=None
    done
  done

  execute_with_retries gcloud projects add-iam-policy-binding "${PROJECT_ID}" \
    --member="serviceAccount:${GSA}" \
    --role=roles/compute.instanceAdmin.v1 \
    --condition=None

  execute_with_retries gcloud iam service-accounts add-iam-policy-binding "${GSA}" \
    --member="serviceAccount:${GSA}" \
    --role=roles/iam.serviceAccountUser \
    --condition=None
}

# Remove the IAM bindings held by the build service account.
# Globals (read): PROJECT_ID, GSA, public_secret_name, private_secret_name
# Every removal goes through execute_with_retries.
function revoke_bindings() {
  # All bindings removed below target the same principal.
  local -r member="serviceAccount:${GSA}"

  # Project-level Dataproc worker role
  execute_with_retries gcloud projects remove-iam-policy-binding "${PROJECT_ID}" \
    --member="${member}" --role="roles/dataproc.worker"

  # Project-level storage object roles
  for storage_object_role in 'User' 'Creator' 'Viewer' ; do
    execute_with_retries gcloud projects remove-iam-policy-binding "${PROJECT_ID}" \
      --member="${member}" --role="roles/storage.object${storage_object_role}"
  done

  # Per-secret roles (list and access) on both the public and private secret
  for secret in "${public_secret_name}" "${private_secret_name}" ; do
    for sm_role in 'viewer' 'secretAccessor' ; do
      execute_with_retries gcloud secrets -q remove-iam-policy-binding "${secret}" \
        --member="${member}" --role="roles/secretmanager.${sm_role}" --condition=None
    done
  done

  # Project-level compute instance admin role
  execute_with_retries gcloud projects remove-iam-policy-binding "${PROJECT_ID}" \
    --member="${member}" --role=roles/compute.instanceAdmin.v1

  # The service account's permission to act as itself
  execute_with_retries gcloud iam service-accounts remove-iam-policy-binding "${GSA}" \
    --member="${member}" --role=roles/iam.serviceAccountUser
}

# Build parameters are read from env.json in the current working directory.
export PROJECT_ID="$(jq    -r .PROJECT_ID    env.json)"
export PURPOSE="$(jq       -r .PURPOSE       env.json)"
export BUCKET="$(jq        -r .BUCKET        env.json)"
export KMS_KEY_URI="$(jq   -r .KMS_KEY_URI   env.json)"

# Derive the service account e-mail.  A PROJECT_ID containing ':' is split
# into the part before the last ':' and the part after the first ':', and the
# address is assembled as <sa>@<after>.<before>.iam.gserviceaccount.com.
SA_NAME="sa-${PURPOSE}"
if [[ "${PROJECT_ID}" =~ ":" ]] ; then
  GSA="${SA_NAME}@${PROJECT_ID#*:}.${PROJECT_ID%:*}.iam.gserviceaccount.com"
else
   GSA="${SA_NAME}@${PROJECT_ID}.iam.gserviceaccount.com"
fi

gcloud config set project "${PROJECT_ID}"

# Interactive authentication; this script is not suitable for unattended use.
gcloud auth login

configure_service_account

# screen session name
session_name="build-current-images"

# NOTE(review): the timestamp is pinned to a literal value and the
# date-derived form is commented out — confirm this is intentional before
# starting a fresh build.
#readonly timestamp="$(date +%F-%H-%M)"
#readonly timestamp="2025-02-15-03-29"
readonly timestamp="2025-03-20-19-43"
export timestamp

# Snapshot the current instances and images into a per-timestamp directory.
export tmpdir=/tmp/${timestamp};
mkdir -p ${tmpdir}
export ZONE="$(jq -r .ZONE env.json)"
gcloud compute instances list --zones "${ZONE}" --format json > ${tmpdir}/instances.json
gcloud compute images    list                   --format json > ${tmpdir}/images.json

# Run generation scripts simultaneously for each dataproc image version
screen -L -US "${session_name}" -c examples/secure-boot/pre-init.screenrc

# Summarise disk usage for each custom-image build found under /tmp.
#
# Prints the matching 'Customization script' lines from every workflow.log,
# then, for each such log, feeds the df table found in the sibling
# startup-script.log to examples/secure-boot/genline.pl (run relative to the
# current working directory).
function find_disk_usage() {
  #  grep maximum-disk-used /tmp/custom-image-*/logs/startup-script.log
  grep -H 'Customization script' /tmp/custom-image-*/logs/workflow.log

  local workflow_log startup_log
  # Read one path per line instead of word-splitting the grep output.
  while IFS= read -r workflow_log ; do
    # Replace only the trailing file name; an unanchored regex substitution
    # could rewrite a matching fragment of the directory name instead.
    startup_log="${workflow_log%workflow.log}startup-script.log"
    grep -v '^\['  "${startup_log}" \
      | grep -A7 'Filesystem.*Avail' \
      | perl examples/secure-boot/genline.pl "${workflow_log}"
  done < <(grep -l "Customization script" /tmp/custom-image-*/logs/workflow.log)
}

# Remove the IAM bindings once the screen session above has returned.
revoke_bindings
====== Filename: ./examples/secure-boot/dask.sh ======
#!/bin/bash

# Copyright 2020,2021,2023,2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This initialization action script will install Dask and other relevant
# libraries on a Dataproc cluster. This is supported for either "yarn" or
# "standalone" runtimes. Please see dask.org and yarn.dask.org for more
# information.

# Strict mode: exit on error (-e), fail on unset variables (-u), trace each
# command (-x), and fail a pipeline when any stage fails (pipefail).
set -euxo pipefail

# OS detection helpers, all derived from /etc/os-release.
# os_version and is_rocky are defined here because is_ubuntu18 and
# prepare_to_install call them; without these definitions those calls fail
# with "command not found".
function os_id()       { grep '^ID='         /etc/os-release | cut -d= -f2 | xargs ; }
function os_version()  { grep '^VERSION_ID=' /etc/os-release | cut -d= -f2 | xargs ; }
function is_ubuntu()   { [[ "$(os_id)" == 'ubuntu' ]] ; }
function is_ubuntu18() { is_ubuntu && [[ "$(os_version)" == '18.04'* ]] ; }
function is_debian()   { [[ "$(os_id)" == 'debian' ]] ; }
function is_rocky()    { [[ "$(os_id)" == 'rocky' ]] ; }
function is_debuntu()  { is_debian || is_ubuntu ; }

# Fetch a metadata-server URL and print the response body.
# Arguments: $1 - full URL to request
# Outputs:   the body on stdout, only when curl succeeds with HTTP 200
# Returns:   curl's exit status (0 on success)
function print_metadata_value() {
  local tmpfile http_code
  tmpfile="$(mktemp)" || return 1
  local return_code=0
  # The body goes to the temp file; stdout of curl carries only the status code.
  http_code="$(curl -f "${1}" -H "Metadata-Flavor: Google" -w "%{http_code}" \
    -s -o "${tmpfile}" 2>/dev/null)" || return_code=$?
  # If the command completed successfully, print the metadata value to stdout.
  if [[ "${return_code}" == 0 && "${http_code}" == 200 ]]; then
    cat "${tmpfile}"
  fi
  rm -f "${tmpfile}"
  return "${return_code}"
}

# Thin wrapper around print_metadata_value.
# Arguments: $1 - full metadata URL
# Returns:   the exit status of print_metadata_value
function print_metadata_value_if_exists() {
  local -r url="$1"
  local return_code=0
  # Quote the URL so characters such as '?' or '*' are never glob-expanded.
  print_metadata_value "${url}" || return_code=$?
  return "${return_code}"
}

# Print a metadata value, preferring the instance scope over the project scope.
# Arguments: $1 - metadata path below instance/ or project/
#                 (for example "attributes/foo")
# Returns:   0 when either lookup succeeds, otherwise the status of the
#            project-level lookup
# Tracing is switched off for the lookup and switched back on before returning.
function get_metadata_value() {
  set +x
  local -r varname="$1"
  local -r MDS_PREFIX=http://metadata.google.internal/computeMetadata/v1
  local return_code=0
  # Print the instance metadata value.
  print_metadata_value_if_exists "${MDS_PREFIX}/instance/${varname}" || return_code=$?
  # If the instance doesn't have the value, try the project.
  if [[ "${return_code}" != 0 ]]; then
    return_code=0
    print_metadata_value_if_exists "${MDS_PREFIX}/project/${varname}" || return_code=$?
  fi
  set -x
  return "${return_code}"
}

# Print the value of a metadata attribute, or a default when the lookup fails.
# Arguments: $1 - attribute name
#            $2 - default value (optional, empty when omitted)
# The body runs in a subshell, so the `set +x` does not affect the caller.
function get_metadata_attribute() (
  set +x
  local -r attribute_name="$1"
  local -r default_value="${2:-}"
  if ! get_metadata_value "attributes/${attribute_name}" ; then
    echo -n "${default_value}"
  fi
)

# True when the major component of CUDA_VERSION (text before the first '.')
# is 12.
function is_cuda12() {
  local -r cuda_major="${CUDA_VERSION%%.*}"
  [[ "${cuda_major}" == "12" ]]
}

# True when the major component of CUDA_VERSION (text before the first '.')
# is 11.
function is_cuda11() {
  local -r cuda_major="${CUDA_VERSION%%.*}"
  [[ "${cuda_major}" == "11" ]]
}

# Run a command, retrying on failure.
# Arguments: the command and its arguments; they are joined into one string
#            and evaluated with `eval`, so shell syntax in the arguments is
#            interpreted.
# Outputs:   "Cmd '<cmd>' failed." on stdout when every attempt fails
# Returns:   0 as soon as one attempt succeeds, 1 after ten failed attempts
function execute_with_retries() {
  local -r cmd="$*"
  local -r max_attempts=10
  # Keep the counter local so a caller's own loop variable is not clobbered.
  local attempt
  for (( attempt = 1; attempt <= max_attempts; attempt++ )); do
    if eval "$cmd"; then
      return 0
    fi
    # No point in waiting once the final attempt has failed.
    if (( attempt < max_attempts )); then
      sleep 5
    fi
  done
  echo "Cmd '${cmd}' failed."
  return 1
}

# Write the Dask-Yarn configuration file.
# Globals (read):    DASK_CONDA_ENV
# Globals (written): DASK_YARN_CONFIG_DIR, DASK_YARN_CONFIG_FILE (both readonly)
function configure_dask_yarn() {
  # The directory value ends in '/', so the file path below contains '//'.
  readonly DASK_YARN_CONFIG_DIR=/etc/dask/
  readonly DASK_YARN_CONFIG_FILE=${DASK_YARN_CONFIG_DIR}/config.yaml
  # Minimal custom configuration is required for this
  # setup. Please see https://yarn.dask.org/en/latest/quickstart.html#usage
  # for information on tuning Dask-Yarn environments.
  mkdir -p "${DASK_YARN_CONFIG_DIR}"

  # Unquoted heredoc delimiter: ${DASK_CONDA_ENV} is expanded when the file is
  # written.
  cat <<EOF >"${DASK_YARN_CONFIG_FILE}"
# Config file for Dask Yarn.
#
# These values are joined on top of the default config, found at
# https://yarn.dask.org/en/latest/configuration.html#default-configuration

yarn:
  environment: python://${DASK_CONDA_ENV}/bin/python

  worker:
    count: 2
EOF
}

# Install the Dask worker launcher script and systemd unit, then enable and
# restart the service when this node should run a worker.
# Globals (read):    DASK_WORKER_SERVICE, DASK_CONDA_ENV, MASTER, ROLE
# Globals (written): enable_worker_service (set to "1" when the worker runs here)
function install_systemd_dask_worker() {
  echo "Installing systemd Dask Worker service..."
  local -r dask_worker_local_dir="/tmp/${DASK_WORKER_SERVICE}"

  mkdir -p "${dask_worker_local_dir}"

  local DASK_WORKER_LAUNCHER="/usr/local/bin/${DASK_WORKER_SERVICE}-launcher.sh"

  # Unquoted heredoc: service name, conda env, master host and local directory
  # are expanded now; \${LOGFILE} is escaped so it is resolved when the
  # launcher itself runs.
  cat <<EOF >"${DASK_WORKER_LAUNCHER}"
#!/bin/bash
LOGFILE="/var/log/${DASK_WORKER_SERVICE}.log"
echo "dask worker starting, logging to \${LOGFILE}"
${DASK_CONDA_ENV}/bin/dask worker "${MASTER}:8786" --local-directory="${dask_worker_local_dir}" --memory-limit=auto >> "\${LOGFILE}" 2>&1
EOF

  chmod 750 "${DASK_WORKER_LAUNCHER}"

  local -r dask_service_file="/usr/lib/systemd/system/${DASK_WORKER_SERVICE}.service"
  cat <<EOF >"${dask_service_file}"
[Unit]
Description=Dask Worker Service
[Service]
Type=simple
Restart=on-failure
ExecStart=/bin/bash -c 'exec ${DASK_WORKER_LAUNCHER}'
[Install]
WantedBy=multi-user.target
EOF
  chmod a+r "${dask_service_file}"

  systemctl daemon-reload

  # Enable the service
  # Non-master nodes always run a worker.  On a master the worker runs when
  # the cluster reports zero workers or when the dask-worker-on-master
  # attribute is 'true' (its default here).
  if [[ "${ROLE}" != "Master" ]]; then
    enable_worker_service="1"
  else
    local RUN_WORKER_ON_MASTER="$(get_metadata_attribute dask-worker-on-master 'true')"
    # Enable service on single-node cluster (no workers)
    local worker_count="$(get_metadata_attribute dataproc-worker-count)"
    if [[ "${worker_count}" == "0" || "${RUN_WORKER_ON_MASTER}" == "true" ]]; then
      enable_worker_service="1"
    fi
  fi

  if [[ "${enable_worker_service}" == "1" ]]; then
    systemctl enable "${DASK_WORKER_SERVICE}"
    systemctl restart "${DASK_WORKER_SERVICE}"
  fi
}

# Install the Dask scheduler launcher script and systemd unit and enable the
# service.  Does nothing on hosts whose short hostname differs from MASTER.
# The service is enabled here but not started.
# Globals (read): MASTER, DASK_SCHEDULER_SERVICE, DASK_CONDA_ENV
function install_systemd_dask_scheduler() {
  # only run scheduler on primary master
  if [[ "$(hostname -s)" != "${MASTER}" ]]; then return ; fi
  echo "Installing systemd Dask Scheduler service..."
  local -r dask_scheduler_local_dir="/tmp/${DASK_SCHEDULER_SERVICE}"

  mkdir -p "${dask_scheduler_local_dir}"

  local DASK_SCHEDULER_LAUNCHER="/usr/local/bin/${DASK_SCHEDULER_SERVICE}-launcher.sh"

  # Unquoted heredoc: service name and conda env are expanded now;
  # \${LOGFILE} is escaped so it is resolved when the launcher itself runs.
  cat <<EOF >"${DASK_SCHEDULER_LAUNCHER}"
#!/bin/bash
LOGFILE="/var/log/${DASK_SCHEDULER_SERVICE}.log"
echo "dask scheduler starting, logging to \${LOGFILE}"
${DASK_CONDA_ENV}/bin/dask scheduler >> "\${LOGFILE}" 2>&1
EOF

  chmod 750 "${DASK_SCHEDULER_LAUNCHER}"

  local -r dask_service_file="/usr/lib/systemd/system/${DASK_SCHEDULER_SERVICE}.service"
  cat <<EOF >"${dask_service_file}"
[Unit]
Description=Dask Scheduler Service
[Service]
Type=simple
Restart=on-failure
ExecStart=/bin/bash -c 'exec ${DASK_SCHEDULER_LAUNCHER}'
[Install]
WantedBy=multi-user.target
EOF
  chmod a+r "${dask_service_file}"

  systemctl daemon-reload

  # Enable the service
  systemctl enable "${DASK_SCHEDULER_SERVICE}"
}

# Install the systemd units for standalone mode: scheduler first, then worker.
function install_systemd_dask_service() {
  install_systemd_dask_scheduler
  install_systemd_dask_worker
}

# Restart Knox, clearing its deployments directory while it is stopped.
# Globals (read): KNOX_HOME
function restart_knox() {
  systemctl stop knox
  # The glob must sit outside the quotes to expand; inside quotes it names a
  # literal file called '*' and nothing is removed.  ':?' aborts instead of
  # expanding to /data/deployments/* if KNOX_HOME is ever empty.
  rm -rf "${KNOX_HOME:?}/data/deployments/"*
  systemctl start knox
}

# Register the Dask dashboard with the Knox gateway.
# Globals (read): KNOX_HOME, KNOX_DASK_DIR, KNOX_DASKWS_DIR, MASTER, ROLE
# Returns 0 without changes when the KNOX_HOME directory does not exist.
function configure_knox_for_dask() {
  if [[ ! -d "${KNOX_HOME}" ]]; then
    echo "Skip configuring Knox rules for Dask"
    return 0
  fi

  local DASK_UI_PORT=8787
  # Insert DASK and DASKWS <service> entries before </topology> in the default
  # topology.  NOTE(review): the insertion is unconditional, so running this
  # twice appears to add the entries twice — confirm whether that matters.
  if [[ -f /etc/knox/conf/topologies/default.xml ]]; then
    sed -i \
      "/<\/topology>/i <service><role>DASK<\/role><url>http://localhost:${DASK_UI_PORT}<\/url><\/service> <service><role>DASKWS<\/role><url>ws:\/\/${MASTER}:${DASK_UI_PORT}<\/url><\/service>" \
      /etc/knox/conf/topologies/default.xml
  fi

  mkdir -p "${KNOX_DASK_DIR}"

  # The heredocs below use a quoted delimiter, so their XML is written
  # literally with no shell expansion.
  cat >"${KNOX_DASK_DIR}/service.xml" <<'EOF'
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>

<service role="DASK" name="dask" version="0.1.0">
  <policies>
    <policy role="webappsec"/>
    <policy role="authentication" name="Anonymous"/>
    <policy role="rewrite"/>
    <policy role="authorization"/>
  </policies>

  <routes>
    <!-- Javascript paths -->
    <route path="/dask/**/*.js">
      <rewrite apply="DASK/dask/inbound/js/dask" to="request.url"/>
      <rewrite apply="DASK/dask/outbound/js" to="response.body"/>
    </route>
    <route path="/dask/**/*.js?**">
      <rewrite apply="DASK/dask/inbound/js/dask" to="request.url"/>
      <rewrite apply="DASK/dask/outbound/js" to="response.body"/>
    </route>

    <!-- CSS paths -->
    <route path="/dask/**/*.css">
      <rewrite apply="DASK/dask/inbound/css/dask" to="request.url"/>
    </route>

    <!-- General path routing -->
    <route path="/dask">
      <rewrite apply="DASK/dask/inbound/root" to="request.url"/>
      <rewrite apply="DASK/dask/outbound/headers" to="response.headers"/>
    </route>
    <route path="/dask/**">
      <rewrite apply="DASK/dask/inbound/root/path" to="request.url"/>
      <rewrite apply="DASK/dask/outbound/headers" to="response.headers"/>
      <rewrite apply="DASK/dask/outbound/logs" to="response.body"/>
    </route>
    <route path="/dask/**?**">
      <rewrite apply="DASK/dask/inbound/root/query" to="request.url"/>
      <rewrite apply="DASK/dask/outbound/headers" to="response.headers"/>
      <rewrite apply="DASK/dask/outbound/logs" to="response.body"/>
    </route>
  </routes>
  <dispatch classname="org.apache.knox.gateway.dispatch.PassAllHeadersNoChunkedPostDispatch"/>
</service>
EOF

  cat >"${KNOX_DASK_DIR}/rewrite.xml" <<'EOF'
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>

<rules>
  <rule dir="IN" name="DASK/dask/inbound/js/dask" pattern="http://*:*/**/dask/{**}?{**}">
    <rewrite template="{$serviceUrl[DASK]}/{**}?{**}"/>
  </rule>
  <rule dir="IN" name="DASK/dask/inbound/root" pattern="http://*:*/**/dask">
    <rewrite template="{$serviceUrl[DASK]}"/>
  </rule>
  <rule dir="IN" name="DASK/dask/inbound/root/path" pattern="http://*:*/**/dask/{**}">
    <rewrite template="{$serviceUrl[DASK]}/{**}"/>
  </rule>
  <rule dir="IN" name="DASK/dask/inbound/root/query" pattern="http://*:*/**/dask/{**}?{**}">
    <rewrite template="{$serviceUrl[DASK]}/{**}?{**}"/>
  </rule>
  <rule dir="IN" name="DASK/dask/inbound/css/dask" pattern="http://*:*/**/dask/{**}?{**}">
    <rewrite template="{$serviceUrl[DASK]}/{**}?{**}"/>
  </rule>
  <!-- without the /gateway/default prefix -->
  <rule dir="IN" name="DASK/dask/inbound/root/noprefix" pattern="http://*:*/dask">
    <rewrite template="{$serviceUrl[DASK]}"/>
  </rule>

  <rule dir="OUT" name="DASK/dask/outbound/logs" pattern="/logs">
    <rewrite template="{$frontend[path]}/dask/info/logs"/>
  </rule>

  <!-- Rewrite redirect responses Location header -->
  <filter name="DASK/dask/outbound/headers">
    <content type="application/x-http-headers">
      <apply path="Location" rule="DASK/dask/outbound/headers/location"/>
    </content>
  </filter>

  <rule dir="OUT" name="DASK/dask/outbound/headers/location" flow="OR">
    <match pattern="*://*:*/">
      <rewrite template="{$frontend[path]}/dask/"/>
    </match>
    <match pattern="*://*:*/{**}">
      <rewrite template="{$frontend[path]}/dask/{**}"/>
    </match>
    <match pattern="*://*:*/{**}?{**}">
      <rewrite template="{$frontend[path]}/dask/{**}?{**}"/>
    </match>
    <match pattern="/{**}">
      <rewrite template="{$frontend[path]}/dask/{**}"/>
    </match>
    <match pattern="/{**}?{**}">
      <rewrite template="{$frontend[path]}/dask/{**}?{**}"/>
    </match>
  </rule>
</rules>
EOF

  mkdir -p "${KNOX_DASKWS_DIR}"

  # Websocket service definition and its single inbound rewrite rule.
  cat >"${KNOX_DASKWS_DIR}/service.xml" <<'EOF'
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>

<service role="DASKWS" name="daskws" version="0.1.0">
  <policies>
    <policy role="webappsec"/>
    <policy role="authentication" name="Anonymous"/>
    <policy role="rewrite"/>
    <policy role="authorization"/>
  </policies>

  <routes>

    <route path="/dask/**/ws">
      <rewrite apply="DASKWS/daskws/inbound/ws" to="request.url"/>
    </route>

  </routes>
  <dispatch classname="org.apache.knox.gateway.dispatch.PassAllHeadersNoChunkedPostDispatch"/>
</service>
EOF

  cat >"${KNOX_DASKWS_DIR}/rewrite.xml" <<'EOF'
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>

<rules>
  <rule dir="IN" name="DASKWS/daskws/inbound/ws" pattern="ws://*:*/**/dask/{**}/ws">
    <rewrite template="{$serviceUrl[DASKWS]}/{**}/ws"/>
  </rule>
</rules>
EOF

  chown -R knox:knox "${KNOX_DASK_DIR}" "${KNOX_DASKWS_DIR}"

  # Do not restart knox during pre-init script run
  # (an empty ROLE is treated as the pre-init case).
  if [[ -n "${ROLE}" ]]; then
    restart_knox
  fi
}

# Configure google-fluentd to tail the Dask log files, then restart it.
# Globals (read): MASTER, enable_worker_service
# The scheduler section is written (truncating the file) only on the host
# whose short hostname equals MASTER; the worker section is appended when
# enable_worker_service is "1".
# NOTE(review): on hosts other than MASTER the file is only ever appended to,
# so repeated runs appear to duplicate the worker section — confirm.
function configure_fluentd_for_dask() {
  if [[ "$(hostname -s)" == "${MASTER}" ]]; then
    cat >/etc/google-fluentd/config.d/dataproc-dask.conf <<EOF
# Fluentd config for Dask logs

# Dask scheduler
<source>
  @type tail
  path /var/log/dask-scheduler.log
  pos_file /var/tmp/fluentd.dataproc.dask.scheduler.pos
  read_from_head true
  tag google.dataproc.dask-scheduler
  <parse>
    @type none
  </parse>
</source>

<filter google.dataproc.dask-scheduler>
  @type record_transformer
  <record>
    filename dask-scheduler.log
  </record>
</filter>
EOF
  fi

  if [[ "${enable_worker_service}" == "1" ]]; then
    cat >>/etc/google-fluentd/config.d/dataproc-dask.conf <<EOF
# Dask worker
<source>
  @type tail
  path /var/log/dask-worker.log
  pos_file /var/tmp/fluentd.dataproc.dask.worker.pos
  read_from_head true
  tag google.dataproc.dask-worker
  <parse>
    @type none
  </parse>
</source>

<filter google.dataproc.dask-worker>
  @type record_transformer
  <record>
    filename dask-worker.log
  </record>
</filter>
EOF
  fi

  systemctl restart google-fluentd
}

# Create the "dask" conda environment with Dask and companion packages.
# Globals (read):    CUDA_VERSION (via is_cuda12/is_cuda11), DASK_RUNTIME,
#                    DASK_CONDA_ENV
# Globals (written): CONDA_PACKAGES, mamba, conda
# Returns: 0 when the environment directory already exists or an installer
#          succeeds; 1 when the CUDA major version is neither 11 nor 12 or
#          when both mamba and conda fail.
function install_dask() {
  # Declared up front so the yarn overrides below stay local as well.
  local python_spec cuda_spec dask_spec
  if is_cuda12 ; then
    python_spec="python>=3.11"
    cuda_spec="cuda-version>=12,<13"
    dask_spec="dask>=2024.5"
  elif is_cuda11 ; then
    python_spec="python>=3.9"
    cuda_spec="cuda-version>=11,<12.0a0"
    dask_spec="dask"
  else
    # Without this branch the specs stay unset and the script dies later with
    # an opaque "unbound variable" error under `set -u`.
    echo "Unsupported CUDA version for Dask install: ${CUDA_VERSION:-unset}" >&2
    return 1
  fi

  CONDA_PACKAGES=()
  if [[ "${DASK_RUNTIME}" == 'yarn' ]]; then
    # Pin `distributed` and `dask` package versions to old release
    # because `dask-yarn` 0.9 uses skein in a way which
    # is not compatible with `distributed` package 2022.2 and newer:
    # https://github.com/dask/dask-yarn/issues/155

    dask_spec="dask<2022.2"
    python_spec="python>=3.7,<3.8.0a0"
    if is_ubuntu18 ; then
      # the libuuid.so.1 distributed with fiona 1.8.22 dumps core when calling uuid_generate_time_generic
      CONDA_PACKAGES+=("fiona<1.8.22")
    fi
    CONDA_PACKAGES+=('dask-yarn=0.9' "distributed<2022.2")
  fi

  CONDA_PACKAGES+=(
    "${cuda_spec}"
    "${dask_spec}"
    "dask-bigquery"
    "dask-ml"
    "dask-sql"
  )

  # Install dask
  local is_installed="0"
  mamba="/opt/conda/miniconda3/bin/mamba"
  conda="/opt/conda/miniconda3/bin/conda"

  # Subshell so that `set +e` does not leak into the caller; its exit status
  # becomes the function's return value.
  ( set +e
  local installer retval
  # Try mamba first and fall back to conda.
  for installer in "${mamba}" "${conda}" ; do
    # Skip creation entirely when the environment directory already exists.
    test -d "${DASK_CONDA_ENV}" || \
      time "${installer}" "create" -m -n "dask" -y --no-channel-priority \
      -c 'conda-forge' -c 'nvidia'  \
      "${CONDA_PACKAGES[@]}" \
      "${python_spec}" > /dev/null 2>&1
    retval=$?
    sync
    if [[ "$retval" == "0" ]] ; then
      is_installed="1"
      break
    fi
    # Relax channel priority before the next installer is tried.
    "${conda}" config --set channel_priority flexible
  done
  if [[ "${is_installed}" == "0" ]]; then
    echo "failed to install dask"
    return 1
  fi
  )
}

# Install Dask and configure it for the runtime named by DASK_RUNTIME.
#   yarn       - writes the Dask-Yarn config file
#   standalone - installs and starts the systemd services, configures Knox,
#                and optionally configures fluentd logging
# Any other value prints an error and exits the script with status 1.
function main() {
  install_dask

  case "${DASK_RUNTIME}" in
    yarn)
      # "yarn" mode relies on a config.yaml file.
      configure_dask_yarn
      ;;

    standalone)
      # "standalone" mode relies on systemd units to launch Dask.
      install_systemd_dask_service

      # The scheduler is started only on the primary master.
      if [[ "$(hostname -s)" == "${MASTER}" ]]; then
        systemctl start "${DASK_SCHEDULER_SERVICE}"
        systemctl status "${DASK_SCHEDULER_SERVICE}"
      fi

      echo "Starting Dask 'standalone' cluster..."
      if [[ "${enable_worker_service}" == "1" ]]; then
        systemctl start "${DASK_WORKER_SERVICE}"
        systemctl status "${DASK_WORKER_SERVICE}"
      fi

      configure_knox_for_dask

      # Cloud logging is opt-in through the dask-cloud-logging attribute.
      local DASK_CLOUD_LOGGING="$(get_metadata_attribute dask-cloud-logging || echo 'false')"
      if [[ "${DASK_CLOUD_LOGGING}" == "true" ]]; then
        configure_fluentd_for_dask
      fi
      ;;

    *)
      echo "Unsupported Dask Runtime: ${DASK_RUNTIME}"
      exit 1
      ;;
  esac

  echo "Dask for ${DASK_RUNTIME} successfully initialized."
}

# EXIT-trap cleanup: purge package caches, release the tmpfs mounts, report
# disk usage, and optionally zero free disk space.
# Runs in a subshell with `set +e`, so a failing cleanup step does not stop
# the remaining ones; always returns 0.
function exit_handler() (
  set +e
  echo "Exit handler invoked"

  # Free conda cache
  /opt/conda/miniconda3/bin/conda clean -a > /dev/null 2>&1

  # Clear pip cache
  pip cache purge || echo "unable to purge pip cache"

  # remove the tmpfs conda pkgs_dirs
  if [[ -d /mnt/shm ]] ; then /opt/conda/miniconda3/bin/conda config --remove pkgs_dirs /mnt/shm ; fi

  # Clean up shared memory mounts
  for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm ; do
    if grep -q "^tmpfs ${shmdir}" /proc/mounts ; then
      rm -rf ${shmdir}/*
      umount -f ${shmdir}
    fi
  done

  # Clean up OS package cache
  if is_debuntu ; then
    apt-get -y -qq clean
    apt-get -y -qq autoremove
  else
    dnf clean all
  fi

  # print disk usage statistics
  if is_debuntu ; then
    # Rocky doesn't have sort -h and fails when the argument is passed
    du --max-depth 3 -hx / | sort -h | tail -10
  fi

  # Process disk usage logs from installation period
  # Removing the flag file tells the background df loop to stop; the sleep
  # outlasts that loop's 5-second interval.
  rm -f /tmp/keep-running-df
  sleep 6s
  # compute maximum size of disk during installation
  # Log file contains logs like the following (minus the preceding #):
#Filesystem      Size  Used Avail Use% Mounted on
#/dev/vda2       6.8G  2.5G  4.0G  39% /
  df --si
  # Take the leading integer of the third column of every line that starts
  # with '/' and print the largest.  The sort must be numeric: Perl's default
  # sort compares strings, which would rank "9" above "10".
  # NOTE(review): the log is produced with `df --si`, so the values carry
  # mixed unit suffixes that this extraction ignores — confirm units.
  perl -e '$max=( sort { $a <=> $b }
                 map { (split)[2] =~ /^(\d+)/ }
                grep { m:^/: } <STDIN> )[-1];
print( "maximum-disk-used: $max", $/ );' < /tmp/disk-usage.log

  echo "exit_handler has completed"

  # zero free disk space
  if [[ -n "$(get_metadata_attribute creating-image)" ]]; then
    dd if=/dev/zero of=/zero ; sync ; rm -f /zero
  fi

  return 0
)

# Run the cleanup handler whenever the script exits, on success or failure.
trap exit_handler EXIT

# Resolve configuration from instance metadata, optionally redirect package
# caches to tmpfs, and start background disk-usage monitoring.
# Globals (written, readonly): DEFAULT_CUDA_VERSION, CUDA_VERSION, ROLE,
#   MASTER, DASK_RUNTIME, DASK_SERVICE, DASK_WORKER_SERVICE,
#   DASK_SCHEDULER_SERVICE, DASK_CONDA_ENV, KNOX_HOME, KNOX_DASK_DIR,
#   KNOX_DASKWS_DIR
# Globals (written): enable_worker_service, free_mem
function prepare_to_install() {
  readonly DEFAULT_CUDA_VERSION="12.4"
  CUDA_VERSION=$(get_metadata_attribute 'cuda-version' ${DEFAULT_CUDA_VERSION})
  readonly CUDA_VERSION

  readonly ROLE=$(get_metadata_attribute dataproc-role)
  readonly MASTER=$(get_metadata_attribute dataproc-master)

  # Dask config
  # The default is passed as the second argument: get_metadata_attribute
  # returns 0 even when the attribute is missing, so a `|| echo` fallback
  # never runs and would leave the runtime empty.
  DASK_RUNTIME="$(get_metadata_attribute dask-runtime 'standalone')"
  readonly DASK_RUNTIME
  readonly DASK_SERVICE=dask-cluster
  readonly DASK_WORKER_SERVICE=dask-worker
  readonly DASK_SCHEDULER_SERVICE=dask-scheduler
  readonly DASK_CONDA_ENV="/opt/conda/miniconda3/envs/dask"

  # Knox config
  readonly KNOX_HOME=/usr/lib/knox
  readonly KNOX_DASK_DIR="${KNOX_HOME}/data/services/dask/0.1.0"
  readonly KNOX_DASKWS_DIR="${KNOX_HOME}/data/services/daskws/0.1.0"
  enable_worker_service="0"

  free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)"
  # Write to a ramdisk instead of churning the persistent disk
  if [[ ${free_mem} -ge 5250000 ]]; then
    # Mount once only: a second mount on the same path stacks another tmpfs
    # that the single umount in exit_handler would leave behind.
    mkdir -p /mnt/shm
    mount -t tmpfs tmpfs /mnt/shm

    # Download conda packages to tmpfs
    /opt/conda/miniconda3/bin/conda config --add pkgs_dirs /mnt/shm

    # Download pip packages to tmpfs
    pip config set global.cache-dir /mnt/shm || echo "unable to set global.cache-dir"

    # Download OS packages to tmpfs
    if is_debuntu ; then
      mount -t tmpfs tmpfs /var/cache/apt/archives
    else
      mount -t tmpfs tmpfs /var/cache/dnf
    fi
  fi

  # Monitor disk usage in a screen session
  # Anything that is not Debian/Ubuntu is treated as dnf-based, matching the
  # tmpfs branch above.
  if is_debuntu ; then
      apt-get install -y -qq screen
  else
      dnf -y -q install screen
  fi
  rm -f /tmp/disk-usage.log
  # The background loop runs for as long as this flag file exists.
  touch /tmp/keep-running-df
  screen -d -m -US keep-running-df \
    bash -c 'while [[ -f /tmp/keep-running-df ]] ; do df --si / | tee -a /tmp/disk-usage.log ; sleep 5s ; done'
}

# Entry point: resolve configuration first, then install and configure Dask.
prepare_to_install

main
====== Filename: ./examples/secure-boot/README.md ======
## Secure Boot

Secure Boot is a security technology implemented in UEFI firmware that
verifies the integrity of the boot process of a computer system. It
ensures that only trusted software, such as the operating system,
firmware, and drivers, are loaded during startup. This helps prevent
malicious software from gaining control of the system before security
measures can be implemented.

Secure Boot achieves this by verifying the digital signature of
drivers and other software against a recognized root of trust. The EFI
DB variable stores the cryptographic keys and certificates used for
this verification process.

How Secure Boot impacts VPC SC:

- **Enhanced Security Perimeter:** By verifying the integrity of the
  boot process, Secure Boot strengthens the foundation of the security
  perimeter created by VPC SC. This reduces the risk of unauthorized
  access or data exfiltration due to compromised host systems.
- **Improved Trust in Service Perimeter Resources:** VPC SC relies on
  the trust that the resources within a service perimeter are secure.
  Secure Boot helps to establish and maintain this trust by ensuring
  that these resources are protected from malicious boot-time attacks.
- **Compliance and Regulatory Requirements:** Many security compliance
  standards, such as PCI DSS and HIPAA, require specific measures to
  protect sensitive data. Secure Boot can be a valuable component of
  meeting these requirements by providing additional assurance of
  system integrity.
- **Reduced Attack Surface:** By preventing unauthorized software from
  loading during startup, Secure Boot reduces the potential attack
  surface for malicious actors. This can help to mitigate the risk of
  successful cyberattacks.

In summary, Secure Boot provides a crucial layer of protection for VPC
SC by ensuring that the underlying infrastructure is secure and
trusted. This helps to strengthen the overall security posture of
Google Cloud Platform environments and protect sensitive data.

## Examples

To create a custom image with a self-signed, trusted certificate
inserted into the boot sector, and then run a script to install cuda
on a Dataproc image, the commands from cuda.sh can be run from the
root of the custom-images git repository or from a docker container.

First, write an env.json to the directory from which you will run the
customization script.  There is a sample which you can copy and edit
in the file examples/secure-boot/env.json.sample.

```bash
cp examples/secure-boot/env.json.sample env.json
vi env.json
docker build -t dataproc-cuda-pre-init:latest .
docker run -it dataproc-cuda-pre-init:latest /bin/bash examples/secure-boot/cuda.sh
```

To do the same, but for all dataproc variants including supported
versions and image families, the same env.json steps as above should
be executed, and then the examples/secure-boot/build-current-images.sh
script can be run in docker:

```bash
cp examples/secure-boot/env.json.sample env.json
vi env.json
docker build -t dataproc-dask-rapids-pre-init:latest .
docker run -it dataproc-dask-rapids-pre-init:latest /bin/bash examples/secure-boot/build-current-images.sh
```
====== Filename: ./examples/secure-boot/genline.pl ======
#!/usr/bin/perl -w
# Reads df-style log lines on STDIN and prints one line of the form
#   "<dataproc-version>") disk_size_gb="<N>" ;; # <stats> # <timestamp>-<purpose>
# The single argument is a workflow.log path whose directory name encodes the
# image version, a timestamp and the build purpose.
use strict;
use POSIX qw(ceil);

# /tmp/custom-image-cuda-pre-init-2-0-debian10-2024-11-14-20-00-20241114-200043/logs/workflow.log
# /tmp/custom-image-dataproc-2-0-deb10-20250422-193049-secure-boot-20250422-193247
my $fn = $ARGV[0];
# Captures: short version (e.g. 2-0-deb10), first timestamp, purpose, second
# timestamp.
# NOTE(review): only the second example path above matches this pattern; a
# non-matching path leaves all four captures undefined.
my( @matches ) =
  ( $fn =~
    m{custom-image-dataproc-
       (
	 \d+-\d+-(?:deb|roc|ubu)\d+
       )-
       (\d{8}-\d{6})-(.+)-(\d{8}-\d{6})
    }x
  );
#print "matches: @matches\n";
my($short_dp_ver, $timestamp, $purpose, $another_timestamp)=@matches;
# Only the first '-' becomes '.', e.g. 2-0-deb10 -> 2.0-deb10.
$short_dp_ver =~ s/-/./;

# Expand the OS abbreviation to its full name.
my $dp_version = $short_dp_ver;
$dp_version =~ s/deb/debian/;
$dp_version =~ s/roc/rocky/;
$dp_version =~ s/ubu/ubuntu/;

my @raw_lines = <STDIN>;
# First input line that mentions a /dev/ device and ends with '/'.
my( $l ) = grep { m: /dev/.*/\s*$: } @raw_lines;

# Nothing to report when no such line was found.
exit 0 unless $l;

# Keep the columns after the device name; every run of four or more digits is
# divided by 1024 twice and rendered with a 'G' suffix.
my( $stats ) = ( $l =~ m:\s*/dev/\S+\s+(.*?)\s*$: );
$stats =~ s:(\d{4,}):sprintf(q{%7s}, sprintf(q{%.2fG},($1/1024)/1024)):eg;

# Suggested disk size: the first reported maximum, divided by 1024 twice, plus
# 15% headroom, rounded up, with a floor of 30.
# NOTE(review): $max is undefined when no maximum-disk-used line is present.
my $max_regex = qr/ maximum-disk-used:\s+(\d+)/;
my($max)   = map { /$max_regex/ ; $1 } grep { /$max_regex/ } @raw_lines;
my($gbmax) = ceil((($max / 1024) / 1024) * 1.15);
$gbmax     = 30 if $gbmax < 30;
my $i_dp_version = sprintf(q{%-15s}, qq{"$dp_version"});
print( qq{  $i_dp_version) disk_size_gb="$gbmax" ;; # $stats # $timestamp-$purpose}, $/ );
#!/bin/bash
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#
# This script installs NVIDIA GPU drivers and collects GPU utilization metrics.

# Strict mode: trace each command (-x), exit on error (-e), fail on unset
# variables (-u), and fail a pipeline when any stage fails (pipefail).
set -xeuo pipefail

# Print the value of one KEY=value entry from /etc/os-release; xargs strips
# surrounding quotes and whitespace.
function _os_release_value() { grep "^${1}=" /etc/os-release | cut -d= -f2 | xargs ; }

# Accessors for the ID, VERSION_ID and VERSION_CODENAME entries.
function os_id()       { _os_release_value 'ID' ; }
function os_version()  { _os_release_value 'VERSION_ID' ; }
function os_codename() { _os_release_value 'VERSION_CODENAME' ; }

# Version comparison helpers built on `sort -V`.
# Each takes two version strings and returns 0 when the relation holds.

# $1 >= $2: $1 is the last entry when both are version-sorted.
function version_ge() {
  [[ "$1" = "$(echo -e "$1\n$2" | sort -V | tail -n1)" ]]
}

# $1 > $2: not equal, and greater-or-equal.
function version_gt() {
  if [[ "$1" = "$2" ]]; then
    return 1
  fi
  version_ge "$1" "$2"
}

# $1 <= $2: $1 is the first entry when both are version-sorted.
function version_le() {
  [[ "$1" = "$(echo -e "$1\n$2" | sort -V | head -n1)" ]]
}

# $1 < $2: not equal, and less-or-equal.
function version_lt() {
  if [[ "$1" = "$2" ]]; then
    return 1
  fi
  version_le "$1" "$2"
}

# OS releases for which per-version predicate functions are generated below.
readonly -A supported_os=(
  ['debian']="10 11 12"
  ['rocky']="8 9"
  ['ubuntu']="18.04 20.04 22.04"
)

# dynamically define OS version test utility functions
# On Rocky the version is cut at the first non-digit character, so only the
# leading number is compared.
if [[ "$(os_id)" == "rocky" ]];
  then _os_version=$(os_version | sed -e 's/[^0-9].*$//g')
  else _os_version="$(os_version)"
fi
# For every OS id this defines is_<os>(), and for every listed release
# is_<os><N>(), ge_<os><N>() and le_<os><N>(), where <N> is the release with
# everything from the first '.' removed (18.04 -> 18).
# $(os_id) and ${_os_version} are expanded while the eval string is built, so
# the detected values are baked into the generated functions.
for os_id_val in 'rocky' 'ubuntu' 'debian' ; do
  eval "function is_${os_id_val}() { [[ \"$(os_id)\" == '${os_id_val}' ]] ; }"

  for osver in $(echo "${supported_os["${os_id_val}"]}") ; do
    eval "function is_${os_id_val}${osver%%.*}() { is_${os_id_val} && [[ \"${_os_version}\" == \"${osver}\" ]] ; }"
    eval "function ge_${os_id_val}${osver%%.*}() { is_${os_id_val} && version_ge \"${_os_version}\" \"${osver}\" ; }"
    eval "function le_${os_id_val}${osver%%.*}() { is_${os_id_val} && version_le \"${_os_version}\" \"${osver}\" ; }"
  done
done

# True on Debian or Ubuntu.
function is_debuntu() {
  is_debian || is_ubuntu
}

# Print a compact form of the OS version:
#   Ubuntu - all non-digit characters removed
#   Rocky  - truncated at the first non-digit character
#   other  - the version unchanged
function os_vercat() {
  if is_ubuntu ; then
    os_version | sed -e 's/[^0-9]//g'
  elif is_rocky ; then
    os_version | sed -e 's/[^0-9].*$//g'
  else
    os_version
  fi
}

# Point apt sources for the oldoldstable backports suite at
# archive.debian.org.  Does nothing on systems that are not Debian or Ubuntu.
# Globals (read):    curl_retry_args (not defined in this function)
# Globals (written): debdists, oldoldstable, oldstable, stable, matched_files
function repair_old_backports {
  if ! is_debuntu ; then return ; fi
  # This script uses 'apt-get update' and is therefore potentially dependent on
  # backports repositories which have been archived.  In order to mitigate this
  # problem, we will use archive.debian.org for the oldoldstable repo

  # https://github.com/GoogleCloudDataproc/initialization-actions/issues/1157
  # Look up the codenames behind the three suite aliases.  Only oldoldstable
  # is used further down in this function.
  debdists="https://deb.debian.org/debian/dists"
  oldoldstable=$(curl ${curl_retry_args} "${debdists}/oldoldstable/Release" | awk '/^Codename/ {print $2}');
  oldstable=$(   curl ${curl_retry_args} "${debdists}/oldstable/Release"    | awk '/^Codename/ {print $2}');
  stable=$(      curl ${curl_retry_args} "${debdists}/stable/Release"       | awk '/^Codename/ {print $2}');

  # Files under /etc/apt/sources.list* that mention '-backports'
  # (case-insensitive); the trailing '||:' keeps a no-match from failing.
  matched_files=( $(test -d /etc/apt && grep -rsil '\-backports' /etc/apt/sources.list*||:) )

  for filename in "${matched_files[@]}"; do
    # Fetch from archive.debian.org for ${oldoldstable}-backports
    perl -pi -e "s{^(deb[^\s]*) https?://[^/]+/debian ${oldoldstable}-backports }
                  {\$1 https://archive.debian.org/debian ${oldoldstable}-backports }g" "${filename}"
  done
}

# Fetch a metadata-server URL and print the response body.
# Arguments: $1 - full URL to request
# Outputs:   the body on stdout, only when curl succeeds with HTTP 200
# Returns:   curl's exit status (0 on success)
function print_metadata_value() {
  local tmpfile http_code
  tmpfile="$(mktemp)" || return 1
  local return_code=0
  # The body goes to the temp file; stdout of curl carries only the status code.
  http_code="$(curl -f "${1}" -H "Metadata-Flavor: Google" -w "%{http_code}" \
    -s -o "${tmpfile}" 2>/dev/null)" || return_code=$?
  # If the command completed successfully, print the metadata value to stdout.
  if [[ "${return_code}" == 0 && "${http_code}" == 200 ]]; then
    cat "${tmpfile}"
  fi
  rm -f "${tmpfile}"
  return "${return_code}"
}

# Thin wrapper around print_metadata_value.
# Arguments: $1 - full metadata URL
# Returns:   the exit status of print_metadata_value
function print_metadata_value_if_exists() {
  local -r url="$1"
  local return_code=0
  # Quote the URL so characters such as '?' or '*' are never glob-expanded.
  print_metadata_value "${url}" || return_code=$?
  return "${return_code}"
}

# replicates /usr/share/google/get_metadata_value
# Print a metadata value, preferring the instance scope over the project scope.
# Arguments: $1 - metadata path below instance/ or project/
#                 (for example "attributes/foo")
# Returns:   0 when either lookup succeeds, otherwise the status of the
#            project-level lookup
function get_metadata_value() {
  local -r varname="$1"
  local -r MDS_PREFIX=http://metadata.google.internal/computeMetadata/v1
  local return_code=0
  # Print the instance metadata value.
  print_metadata_value_if_exists "${MDS_PREFIX}/instance/${varname}" || return_code=$?
  # If the instance doesn't have the value, try the project.
  if [[ "${return_code}" != 0 ]]; then
    return_code=0
    print_metadata_value_if_exists "${MDS_PREFIX}/project/${varname}" || return_code=$?
  fi
  return "${return_code}"
}

# Print the metadata attribute named $1, or the fallback $2 (empty when
# omitted) if the lookup fails.  errexit is switched off for the lookup and
# switched back on before returning.
function get_metadata_attribute() {
  local -r key="$1"
  local -r fallback="${2:-}"
  set +e
  if ! get_metadata_value "attributes/${key}" ; then
    echo -n "${fallback}"
  fi
  set -e
}

# Distributor ID reported by lsb_release, folded to lower case
OS_NAME="$(lsb_release -is | tr '[:upper:]' '[:lower:]')"
# ID and VERSION_ID from /etc/os-release, concatenated; the file is sourced
# inside the command substitution so its variables do not leak into this shell
distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
readonly OS_NAME

# node role
# (value of the dataproc-role metadata attribute; empty when it is not set)
ROLE="$(get_metadata_attribute dataproc-role)"
readonly ROLE

# CUDA version and Driver version
# https://docs.nvidia.com/deploy/cuda-compatibility/
# https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html
# https://developer.nvidia.com/cuda-downloads

# Minimum supported version for open kernel driver is 515.43.04
# https://github.com/NVIDIA/open-gpu-kernel-modules/tags

# Default driver version for each CUDA major.minor; consulted by
# set_driver_version when no driver could be derived from metadata.
# NOTE(review): there is no "11.0" key here although CUDNN_FOR_CUDA and
# CUDA_SUBVER both have one - confirm whether that is intentional.
readonly -A DRIVER_FOR_CUDA=(
    ["10.0"]="410.48" ["10.1"]="418.87.00" ["10.2"]="440.33.01"
    ["11.1"]="455.45.01" ["11.2"]="460.91.03" ["11.3"]="465.31"
    ["11.4"]="470.256.02" ["11.5"]="495.46" ["11.6"]="510.108.03"
    ["11.7"]="515.65.01" ["11.8"]="525.147.05" ["12.0"]="525.147.05"
    ["12.1"]="530.30.02" ["12.2"]="535.216.01" ["12.3"]="545.29.06"
    ["12.4"]="550.135" ["12.5"]="550.142" ["12.6"]="550.142"
)
# Driver major version -> full driver version; set_driver_version uses it as
# the fallback when the driver version named in a cuda-url cannot be downloaded.
# NOTE(review): key "520" maps to a 525.x release - presumably deliberate; confirm.
readonly -A DRIVER_SUBVER=(
    ["410"]="410.104" ["415"]="415.27" ["418"]="418.113"
    ["430"]="430.64" ["435"]="435.21" ["440"]="440.100"
    ["450"]="450.119.03" ["455"]="455.45.01" ["460"]="460.91.03"
    ["465"]="465.31" ["470"]="470.256.02" ["495"]="495.46"
    ["510"]="510.108.03" ["515"]="515.48.07" ["520"]="525.147.05"
    ["525"]="525.147.05" ["535"]="535.216.01" ["545"]="545.29.06"
    ["550"]="550.142" ["555"]="555.58.02" ["560"]="560.35.03"
    ["565"]="565.77"
)
# https://developer.nvidia.com/cudnn-downloads
# Default cuDNN version for each CUDA major.minor (read by set_cudnn_version)
readonly -A CUDNN_FOR_CUDA=(
    ["10.0"]="7.4.1" ["10.1"]="7.6.4" ["10.2"]="7.6.5"
    ["11.0"]="8.0.4" ["11.1"]="8.0.5" ["11.2"]="8.1.1"
    ["11.3"]="8.2.1" ["11.4"]="8.2.4.15" ["11.5"]="8.3.1.22"
    ["11.6"]="8.4.0.27" ["11.7"]="8.9.7.29" ["11.8"]="9.5.1.17"
    ["12.0"]="8.8.1.3" ["12.1"]="8.9.3.28" ["12.2"]="8.9.5"
    ["12.3"]="9.0.0.306" ["12.4"]="9.1.0.70" ["12.5"]="9.2.1.18"
    ["12.6"]="9.6.0.74"
)
# https://developer.nvidia.com/nccl/nccl-download
# Default NCCL version for each CUDA major.minor (read by install_nvidia_nccl)
# NOTE(review): "10.1" has an empty value and "10.2" has no entry - confirm.
readonly -A NCCL_FOR_CUDA=(
    ["10.0"]="2.3.7" ["10.1"]= ["11.0"]="2.7.8" ["11.1"]="2.8.3"
    ["11.2"]="2.8.4" ["11.3"]="2.9.9" ["11.4"]="2.11.4"
    ["11.5"]="2.11.4" ["11.6"]="2.12.10" ["11.7"]="2.12.12"
    ["11.8"]="2.21.5" ["12.0"]="2.16.5" ["12.1"]="2.18.3"
    ["12.2"]="2.19.3" ["12.3"]="2.19.4" ["12.4"]="2.23.4"
    ["12.5"]="2.22.3" ["12.6"]="2.23.4"
)
# CUDA major.minor -> full major.minor.patch; set_cuda_version uses it when the
# requested CUDA version does not already carry a patch component.
readonly -A CUDA_SUBVER=(
    ["10.0"]="10.0.130" ["10.1"]="10.1.234" ["10.2"]="10.2.89"
    ["11.0"]="11.0.3" ["11.1"]="11.1.1" ["11.2"]="11.2.2"
    ["11.3"]="11.3.1" ["11.4"]="11.4.4" ["11.5"]="11.5.2"
    ["11.6"]="11.6.2" ["11.7"]="11.7.1" ["11.8"]="11.8.0"
    ["12.0"]="12.0.1" ["12.1"]="12.1.1" ["12.2"]="12.2.2"
    ["12.3"]="12.3.2" ["12.4"]="12.4.1" ["12.5"]="12.5.1"
    ["12.6"]="12.6.3"
)

# Resolve the CUDA version to install.
# Globals read:    DATAPROC_IMAGE_VERSION, CUDA_SUBVER
# Globals written: DEFAULT_CUDA_VERSION, CUDA_VERSION (major.minor),
#                  CUDA_FULL_VERSION (major.minor.patch) - all made readonly
# Metadata read:   cuda-url, cuda-version
# Exits 1 when the image version is unrecognized or when no full version can
# be determined for the requested CUDA version.
function set_cuda_version() {
  case "${DATAPROC_IMAGE_VERSION}" in
    "1.5" ) DEFAULT_CUDA_VERSION="11.6.2" ;;
    "2.0" ) DEFAULT_CUDA_VERSION="12.1.1" ;; # Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18)
    "2.1" ) DEFAULT_CUDA_VERSION="12.4.1" ;;
    "2.2" ) DEFAULT_CUDA_VERSION="12.6.3" ;;
    "2.3" ) DEFAULT_CUDA_VERSION="12.6.3" ;;
    *   )
      echo "unrecognized Dataproc image version: ${DATAPROC_IMAGE_VERSION}"
      exit 1
      ;;
  esac

  local -r full_version_re='^[0-9]+\.[0-9]+\.[0-9]+$'
  # runfile name: cuda_<cuda x.y.z>_<driver x.y[.z]>_linux.run
  # (the driver part has only two components for some releases, e.g. 410.48)
  local -r runfile_re='/cuda_([0-9]+\.[0-9]+\.[0-9]+)_[0-9]+(\.[0-9]+){1,2}_linux\.run$'

  local cuda_url
  cuda_url=$(get_metadata_attribute 'cuda-url' '')
  # if cuda-url metadata variable has been passed, extract default version from url
  if [[ -n "${cuda_url}" && "${cuda_url}" =~ ${runfile_re} ]] ; then
    DEFAULT_CUDA_VERSION="${BASH_REMATCH[1]}"
  fi
  readonly DEFAULT_CUDA_VERSION

  CUDA_VERSION=$(get_metadata_attribute 'cuda-version' "${DEFAULT_CUDA_VERSION}")
  # A three-component value is a full version; keep it and trim CUDA_VERSION
  # down to major.minor.  The match is anchored so malformed values fall
  # through to the table lookup and its error path.
  if [[ "${CUDA_VERSION}" =~ ${full_version_re} ]] ; then
    CUDA_FULL_VERSION="${CUDA_VERSION}"
    CUDA_VERSION="${CUDA_VERSION%.*}"
  fi
  readonly CUDA_VERSION
  if [[ ! -v CUDA_FULL_VERSION ]] ; then
    # ":-" avoids an "unbound variable" abort under set -u for unknown keys
    CUDA_FULL_VERSION=${CUDA_SUBVER["${CUDA_VERSION}"]:-}
  fi
  if [[ -z "${CUDA_FULL_VERSION}" ]] ; then
    echo "unsupported CUDA version: [${CUDA_VERSION}]" >&2
    exit 1
  fi
  readonly CUDA_FULL_VERSION
}

# Predicates on the major component of CUDA_VERSION (the text before the first dot).
function is_cuda12() {
  local -r major="${CUDA_VERSION%%.*}"
  [[ "${major}" == "12" ]]
}
function le_cuda12() {
  local -r major="${CUDA_VERSION%%.*}"
  version_le "${major}" "12"
}
function ge_cuda12() {
  local -r major="${CUDA_VERSION%%.*}"
  version_ge "${major}" "12"
}

function is_cuda11() {
  local -r major="${CUDA_VERSION%%.*}"
  [[ "${major}" == "11" ]]
}
function le_cuda11() {
  local -r major="${CUDA_VERSION%%.*}"
  version_le "${major}" "11"
}
function ge_cuda11() {
  local -r major="${CUDA_VERSION%%.*}"
  version_ge "${major}" "11"
}

# Resolve the NVIDIA kernel driver version to install.
# Precedence for the default: version named in gpu-driver-url, then the driver
# version named in cuda-url (or the newest known release of that driver major),
# then DRIVER_FOR_CUDA[CUDA_VERSION].  The gpu-driver-version metadata
# attribute overrides the default.
# Globals read:    CUDA_VERSION, DRIVER_FOR_CUDA, DRIVER_SUBVER, curl_retry_args
# Globals written: DRIVER_VERSION, DRIVER (readonly, exported),
#                  DRIVER_URL_DRIVER_VERSION (only when gpu-driver-url is set)
# Exits 1 when no runfile exists upstream for the resolved version.
function set_driver_version() {
  local gpu_driver_url
  gpu_driver_url=$(get_metadata_attribute 'gpu-driver-url' '')

  local cuda_url
  cuda_url=$(get_metadata_attribute 'cuda-url' '')

  local nv_xf86_x64_base="https://us.download.nvidia.com/XFree86/Linux-x86_64"

  # Driver versions have two or three components (e.g. 565.77, 550.54.14)
  local -r driver_url_re='/NVIDIA-Linux-x86_64-([0-9]+(\.[0-9]+){1,2})\.run$'
  local -r cuda_url_re='/cuda_[0-9]+\.[0-9]+\.[0-9]+_([0-9]+(\.[0-9]+){1,2})_linux\.run$'

  # Declared but left unset: the "-v" test below depends on that
  local DEFAULT_DRIVER
  # Take default from gpu-driver-url metadata value
  if [[ -n "${gpu_driver_url}" ]] ; then
    # keeps the URL itself when no version can be parsed out of it
    DRIVER_URL_DRIVER_VERSION="${gpu_driver_url}"
    if [[ "${gpu_driver_url}" =~ ${driver_url_re} ]] ; then
      DRIVER_URL_DRIVER_VERSION="${BASH_REMATCH[1]}"
      DEFAULT_DRIVER="${DRIVER_URL_DRIVER_VERSION}"
    fi
  # Take default from cuda-url metadata value as a backup
  elif [[ -n "${cuda_url}" && "${cuda_url}" =~ ${cuda_url_re} ]] ; then
    local -r CUDA_URL_DRIVER_VERSION="${BASH_REMATCH[1]}"
    local -r major_driver_version="${CUDA_URL_DRIVER_VERSION%%.*}"
    # ":-" keeps an unknown major from aborting the script under set -u
    local -r driver_max_maj_version="${DRIVER_SUBVER["${major_driver_version}"]:-}"
    # curl_retry_args is intentionally unquoted: it holds several options
    if curl ${curl_retry_args} --head "${nv_xf86_x64_base}/${CUDA_URL_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${CUDA_URL_DRIVER_VERSION}.run" | grep -E -q 'HTTP.*200' ; then
      # use the version indicated by the cuda url as the default if it exists
      DEFAULT_DRIVER="${CUDA_URL_DRIVER_VERSION}"
    elif [[ -n "${driver_max_maj_version}" ]] \
      && curl ${curl_retry_args} --head "${nv_xf86_x64_base}/${driver_max_maj_version}/NVIDIA-Linux-x86_64-${driver_max_maj_version}.run" | grep -E -q 'HTTP.*200' ; then
      # use the maximum sub-version available for the major version indicated in cuda url as the default
      DEFAULT_DRIVER="${driver_max_maj_version}"
    fi
  fi

  if [[ ! -v DEFAULT_DRIVER ]] ; then
    # If a default driver version has not been extracted, use the default for this version of CUDA
    DEFAULT_DRIVER="${DRIVER_FOR_CUDA["${CUDA_VERSION}"]:-}"
  fi

  DRIVER_VERSION=$(get_metadata_attribute 'gpu-driver-version' "${DEFAULT_DRIVER}")

  readonly DRIVER_VERSION
  readonly DRIVER="${DRIVER_VERSION%%.*}"

  export DRIVER_VERSION DRIVER

  if [[ -z "${DRIVER_VERSION}" ]] ; then
    echo "No default NVIDIA driver is known for CUDA_VERSION=${CUDA_VERSION}; set gpu-driver-version"
    exit 1
  fi

  # Verify that the resolved version can actually be downloaded
  gpu_driver_url="${nv_xf86_x64_base}/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
  if ! curl ${curl_retry_args} --head "${gpu_driver_url}" | grep -E -q 'HTTP.*200' ; then
    echo "No NVIDIA driver exists for DRIVER_VERSION=${DRIVER_VERSION}"
    exit 1
  fi
}

# Choose the cuDNN version: start from the cudnn-version metadata attribute
# (defaulting to CUDNN_FOR_CUDA[CUDA_VERSION]) and then adjust it for what the
# running distribution can install.  CUDNN_VERSION is left readonly.
function set_cudnn_version() {
  readonly MIN_ROCKY8_CUDNN8_VERSION="8.0.5.39"
  readonly DEFAULT_CUDNN8_VERSION="8.3.1.22"
  readonly DEFAULT_CUDNN9_VERSION="9.1.0.70"

  # Parameters for NVIDIA-provided cuDNN library
  readonly DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]}
  CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}")

  if is_rocky && version_lt "${CUDNN_VERSION}" "${MIN_ROCKY8_CUDNN8_VERSION}" ; then
    # rocky: raise anything older than the minimum supported cuDNN 8 release
    CUDNN_VERSION="${MIN_ROCKY8_CUDNN8_VERSION}"
  elif { ge_ubuntu20 || ge_debian12 ; } && is_cudnn8 ; then
    # cuDNN v8 is not distributed for ubuntu20+, debian12 ; move up to 9
    CUDNN_VERSION="${DEFAULT_CUDNN9_VERSION}"
  elif { le_ubuntu18 || le_debian11 ; } && is_cudnn9 ; then
    # cuDNN v9 is not distributed for ubuntu18, debian10, debian11 ; fall back to 8
    CUDNN_VERSION="8.8.0.121"
  fi
  readonly CUDNN_VERSION
}

# True when the major component of CUDNN_VERSION is 8
function is_cudnn8() {
  case "${CUDNN_VERSION}" in
    8 | 8.* ) return 0 ;;
    * ) return 1 ;;
  esac
}
# True when the major component of CUDNN_VERSION is 9
function is_cudnn9() {
  case "${CUDNN_VERSION}" in
    9 | 9.* ) return 0 ;;
    * ) return 1 ;;
  esac
}

# Short name for urls
# shortname      - OS token used in the CUDA repository URLs (see set_nv_urls)
# nccl_shortname - OS token intended for NCCL packages; for the first two
#                  branches it names the previous OS release
if is_ubuntu22  ; then
    # at the time of writing 20241125 there is no ubuntu2204 in the index of repos at
    # https://developer.download.nvidia.com/compute/machine-learning/repos/
    # use packages from previous release until such time as nvidia
    # release ubuntu2204 builds

    nccl_shortname="ubuntu2004"
    shortname="$(os_id)$(os_vercat)"
elif ge_rocky9 ; then
    # use packages from previous release until such time as nvidia
    # release rhel9 builds

    nccl_shortname="rhel8"
    shortname="rhel9"
elif is_rocky ; then
    # Rocky uses the RHEL repositories: replace "rocky" with "rhel" in the OS id
    shortname="$(os_id | sed -e 's/rocky/rhel/')$(os_vercat)"
    nccl_shortname="${shortname}"
else
    # every other OS: id and version category as reported by the os_* helpers
    shortname="$(os_id)$(os_vercat)"
    nccl_shortname="${shortname}"
fi

# Define the readonly base URLs of the NVIDIA download site and of the CUDA
# package repository for this OS (derived from ${shortname}).
function set_nv_urls() {
  # path of this OS's repository below the download root
  local -r repo_dir="cuda/repos/${shortname}/x86_64"

  # Root of NVIDIA's compute downloads
  readonly NVIDIA_BASE_DL_URL='https://developer.download.nvidia.com/compute'
  # CUDA package repository for this OS
  readonly NVIDIA_REPO_URL="${NVIDIA_BASE_DL_URL}/${repo_dir}"

  # .repo definition consumed on Rocky Linux
  readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_REPO_URL}/cuda-${shortname}.repo"
}

# Determine the URL of the CUDA runfile installer and verify that it exists.
# Globals read:    CUDA_VERSION, CUDA_FULL_VERSION, DRIVER_VERSION, shortname,
#                  NVIDIA_BASE_DL_URL, curl_retry_args
# Globals written: MIN_OPEN_DRIVER_VER, drv_for_cuda, NVIDIA_CUDA_URL and
#                  CUDA_RUNFILE (both made readonly)
# Metadata read:   cuda-url (overrides the computed default URL)
# Exits 1 only when the runfile URL does not answer HTTP 200; every
# version-range check in this function just prints a message and continues.
function set_cuda_runfile_url() {
  # Declared but left unset: the "test -v" checks below rely on that
  local MAX_DRIVER_VERSION
  local MAX_CUDA_VERSION

  MIN_OPEN_DRIVER_VER="515.43.04"
  local MIN_DRIVER_VERSION="${MIN_OPEN_DRIVER_VER}"
  local MIN_CUDA_VERSION="11.7.1" # matches MIN_OPEN_DRIVER_VER

  # Per-OS bounds on the CUDA / driver versions
  if is_cuda12 ; then
    if is_debian12 ; then
      MIN_DRIVER_VERSION="545.23.06"
      MIN_CUDA_VERSION="12.3.0"
    elif is_debian10 ; then
      MAX_DRIVER_VERSION="555.42.02"
      MAX_CUDA_VERSION="12.5.0"
    elif is_ubuntu18 ; then
      MAX_DRIVER_VERSION="530.30.02"
      MAX_CUDA_VERSION="12.1.1"
    fi
  elif version_ge "${CUDA_VERSION}" "${MIN_CUDA_VERSION}" ; then
    if le_debian10 ; then
      # cuda 11 is not supported for <= debian10
      MAX_CUDA_VERSION="0"
      MAX_DRIVER_VERSION="0"
    fi
  else
    echo "Minimum CUDA version supported is ${MIN_CUDA_VERSION}.  Specified: ${CUDA_VERSION}"
  fi

  # Report (but do not fail on) versions outside the bounds chosen above
  if version_lt "${CUDA_VERSION}" "${MIN_CUDA_VERSION}" ; then
    echo "Minimum CUDA version for ${shortname} is ${MIN_CUDA_VERSION}.  Specified: ${CUDA_VERSION}"
  elif ( test -v MAX_CUDA_VERSION && version_gt "${CUDA_VERSION}" "${MAX_CUDA_VERSION}" ) ; then
    echo "Maximum CUDA version for ${shortname} is ${MAX_CUDA_VERSION}.  Specified: ${CUDA_VERSION}"
  fi
  if version_lt "${DRIVER_VERSION}" "${MIN_DRIVER_VERSION}" ; then
    echo "Minimum kernel driver version for ${shortname} is ${MIN_DRIVER_VERSION}.  Specified: ${DRIVER_VERSION}"
  elif ( test -v MAX_DRIVER_VERSION && version_gt "${DRIVER_VERSION}" "${MAX_DRIVER_VERSION}" ) ; then
    echo "Maximum kernel driver version for ${shortname} is ${MAX_DRIVER_VERSION}.  Specified: ${DRIVER_VERSION}"
  fi

  # driver version named in cuda runfile filename
  # (these may not be actual driver versions - see https://us.download.nvidia.com/XFree86/Linux-x86_64/)
  # NOTE(review): declared with "readonly", not "local" - presumably this makes
  # the table global, so a second call of this function would fail; confirm.
  readonly -A drv_for_cuda=(
      ["10.0.130"]="410.48"
      ["10.1.234"]="418.87.00"
      ["10.2.89"]="440.33.01"
      ["11.0.3"]="450.51.06"
      ["11.1.1"]="455.32.00"
      ["11.2.2"]="460.32.03"
      ["11.3.1"]="465.19.01"
      ["11.4.4"]="470.82.01"
      ["11.5.2"]="495.29.05"
      ["11.6.2"]="510.47.03"
      ["11.7.0"]="515.43.04" ["11.7.1"]="515.65.01"
      ["11.8.0"]="520.61.05"
      ["12.0.0"]="525.60.13" ["12.0.1"]="525.85.12"
      ["12.1.0"]="530.30.02" ["12.1.1"]="530.30.02"
      ["12.2.0"]="535.54.03" ["12.2.1"]="535.86.10" ["12.2.2"]="535.104.05"
      ["12.3.0"]="545.23.06" ["12.3.1"]="545.23.08" ["12.3.2"]="545.23.08"
      ["12.4.0"]="550.54.14" ["12.4.1"]="550.54.15" # 550.54.15 is not a driver indexed at https://us.download.nvidia.com/XFree86/Linux-x86_64/
      ["12.5.0"]="555.42.02" ["12.5.1"]="555.42.06" # 555.42.02 is indexed, 555.42.06 is not
      ["12.6.0"]="560.28.03" ["12.6.1"]="560.35.03" ["12.6.2"]="560.35.03" ["12.6.3"]="560.35.05"
  )

  # Verify that the file with the indicated combination exists
  local drv_ver=${drv_for_cuda["${CUDA_FULL_VERSION}"]}
  CUDA_RUNFILE="cuda_${CUDA_FULL_VERSION}_${drv_ver}_linux.run"
  local CUDA_RELEASE_BASE_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_FULL_VERSION}"
  local DEFAULT_NVIDIA_CUDA_URL="${CUDA_RELEASE_BASE_URL}/local_installers/${CUDA_RUNFILE}"

  # The cuda-url metadata attribute, when present, replaces the computed URL
  NVIDIA_CUDA_URL=$(get_metadata_attribute 'cuda-url' "${DEFAULT_NVIDIA_CUDA_URL}")

  if ! curl ${curl_retry_args} --head "${NVIDIA_CUDA_URL}" | grep -E -q 'HTTP.*200' ; then
    echo "No CUDA distribution exists for this combination of DRIVER_VERSION=${drv_ver}, CUDA_VERSION=${CUDA_FULL_VERSION}"
    if [[ "${DEFAULT_NVIDIA_CUDA_URL}" != "${NVIDIA_CUDA_URL}" ]]; then
      echo "consider [${DEFAULT_NVIDIA_CUDA_URL}] instead"
    fi
    exit 1
  fi

  readonly NVIDIA_CUDA_URL

  # Final runfile name: everything after the last "/" of the URL in use
  CUDA_RUNFILE="$(echo ${NVIDIA_CUDA_URL} | perl -pe 's{^.+/}{}')"
  readonly CUDA_RUNFILE

  # Advisory messages about unsupported OS / CUDA combinations (no exit)
  if ( version_lt "${CUDA_FULL_VERSION}" "12.3.0" && ge_debian12 ) ; then
    echo "CUDA 12.3.0 is the minimum CUDA 12 version supported on Debian 12"
  elif ( version_gt "${CUDA_VERSION}" "12.1.1" && is_ubuntu18 ) ; then
    echo "CUDA 12.1.1 is the maximum CUDA version supported on ubuntu18.  Requested version: ${CUDA_VERSION}"
  elif ( version_lt "${CUDA_VERSION%%.*}" "12" && ge_debian12 ) ; then
    echo "CUDA 11 not supported on Debian 12. Requested version: ${CUDA_VERSION}"
  elif ( version_lt "${CUDA_VERSION}" "11.8" && is_rocky9 ) ; then
    echo "CUDA 11.8.0 is the minimum version for Rocky 9. Requested version: ${CUDA_VERSION}"
  fi
}

# Compute the cuDNN tarball file name and download URL for the selected
# CUDA_VERSION / CUDNN_VERSION and freeze both as readonly globals.
# Globals read:    NVIDIA_BASE_DL_URL, CUDA_VERSION, CUDNN_VERSION
# Globals written: CUDNN_TARBALL, CUDNN_TARBALL_URL
function set_cudnn_tarball_url() {
  # legacy layout: .../redist/cudnn/v<cuDNN version minus its last component>
  local -r legacy_root="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}"

  if version_ge "${CUDA_VERSION}" "12.0" ; then
    # CUDA 12.0 and later: modern url format, tarball tagged with the CUDA major
    CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%%.*}-archive.tar.xz"
    CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/cudnn/redist/cudnn/linux-x86_64/${CUDNN_TARBALL}"
  elif version_ge "${CUDNN_VERSION}" "8.3.1.22" ; then
    if version_le "${CUDNN_VERSION}" "8.4.1.50" ; then
      # cuDNN 8.3.1.22 through 8.4.1.50: tagged with the full CUDA_VERSION
      CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION}-archive.tar.xz"
    else
      # cuDNN newer than 8.4.1.50: tagged with CUDA_VERSION minus its last component
      CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%.*}-archive.tar.xz"
    fi
    # legacy url format with a per-CUDA local_installers directory
    CUDNN_TARBALL_URL="${legacy_root}/local_installers/${CUDA_VERSION}/${CUDNN_TARBALL}"
  else
    # cuDNN older than 8.3.1.22: .tgz directly below the legacy root
    CUDNN_TARBALL="cudnn-${CUDA_VERSION}-linux-x64-v${CUDNN_VERSION}.tgz"
    CUDNN_TARBALL_URL="${legacy_root}/${CUDNN_TARBALL}"
  fi
  readonly CUDNN_TARBALL
  readonly CUDNN_TARBALL_URL
}

# Whether to install NVIDIA-provided or OS-provided GPU driver
# (metadata attribute gpu-driver-provider; defaults to 'NVIDIA'.  is_src_nvidia
# and is_src_os compare against the literal values "NVIDIA" and "OS")
GPU_DRIVER_PROVIDER=$(get_metadata_attribute 'gpu-driver-provider' 'NVIDIA')
readonly GPU_DRIVER_PROVIDER

# Whether to install GPU monitoring agent that sends GPU metrics to Stackdriver
# (metadata attribute install-gpu-agent; defaults to 'true')
INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'true')
readonly INSTALL_GPU_AGENT

# Dataproc configurations
readonly HADOOP_CONF_DIR='/etc/hadoop/conf'
readonly HIVE_CONF_DIR='/etc/hive/conf'
readonly SPARK_CONF_DIR='/etc/spark/conf'

# Initial values of mutable state; none of these is marked readonly
NVIDIA_SMI_PATH='/usr/bin'
MIG_MAJOR_CAPS=0
IS_MIG_ENABLED=0

IS_CUSTOM_IMAGE_BUILD="false" # Default

# Run a command up to three times, pausing 5 seconds after each failure.
# Arguments: the command and its arguments (joined into one string and eval'd)
# Globals:   install_log - file receiving the command's stdout and stderr
# Outputs:   the contents of install_log after each failed attempt
# Returns:   0 as soon as one attempt succeeds, 1 after three failures
# The body is a subshell "( ... )", so nothing set here leaks to the caller.
function execute_with_retries() (
  local -r cmd="$*"
  local retval

  # A quoted right-hand side of =~ is matched as a literal string, so the
  # former test against "^apt-get install" could never succeed; use a glob
  # prefix match instead.
  if [[ "${cmd}" == "apt-get install"* ]] ; then
    # best-effort housekeeping before installing: a failure here must not
    # prevent the install attempts below
    apt-get -y clean || true
    apt-get -o DPkg::Lock::Timeout=60 -y autoremove || true
  fi
  for ((i = 0; i < 3; i++)); do
    retval=0
    time eval "$cmd" > "${install_log}" 2>&1 || retval=$?
    if [[ ${retval} == 0 ]] ; then return 0 ; fi
    # show what went wrong before trying again
    cat "${install_log}"
    sleep 5
  done
  return 1
)

# Download and install NVIDIA's cuda-keyring .deb from the CUDA repository,
# unless a previous run already recorded this step as complete.
function install_cuda_keyring_pkg() {
  if is_complete cuda-keyring-installed ; then return ; fi

  local kr_ver=1.1
  local -r keyring_deb="${tmpdir}/cuda-keyring.deb"

  curl ${curl_retry_args} \
    "${NVIDIA_REPO_URL}/cuda-keyring_${kr_ver}-1_all.deb" -o "${keyring_deb}"
  dpkg -i "${keyring_deb}"
  # the package file is not needed once it has been installed
  rm -f "${keyring_deb}"

  mark_complete cuda-keyring-installed
}

# Purge the cuda-keyring package and clear its completion marker.
function uninstall_cuda_keyring_pkg() {
  local -r pkg="cuda-keyring"
  apt-get purge -yq "${pkg}"
  mark_incomplete "${pkg}-installed"
}

# Install NVIDIA's "local repo" .deb for the selected CUDA release and copy
# the keyring it ships into /usr/share/keyrings.  Skipped when a previous run
# already recorded this step as complete.
# Globals written: pkgname, CUDA_LOCAL_REPO_PKG_NAME, and the readonly
#                  LOCAL_INSTALLER_DEB, LOCAL_DEB_URL, DIST_KEYRING_DIR
function install_local_cuda_repo() {
  if is_complete install-local-cuda-repo ; then return ; fi

  # dots in CUDA_VERSION become dashes in the package name
  pkgname="cuda-repo-${shortname}-${CUDA_VERSION//./-}-local"
  # remembered so uninstall_local_cuda_repo can purge the same package
  CUDA_LOCAL_REPO_PKG_NAME="${pkgname}"
  readonly LOCAL_INSTALLER_DEB="${pkgname}_${CUDA_FULL_VERSION}-${DRIVER_VERSION}-1_amd64.deb"
  readonly LOCAL_DEB_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_FULL_VERSION}/local_installers/${LOCAL_INSTALLER_DEB}"
  readonly DIST_KEYRING_DIR="/var/${pkgname}"

  local -r downloaded_deb="${tmpdir}/${LOCAL_INSTALLER_DEB}"
  curl ${curl_retry_args} "${LOCAL_DEB_URL}" -o "${downloaded_deb}"
  dpkg -i "${downloaded_deb}"
  rm "${downloaded_deb}"

  # the glob must stay unquoted so it expands to the installed keyring file
  cp ${DIST_KEYRING_DIR}/cuda-*-keyring.gpg /usr/share/keyrings/

  if is_ubuntu ; then
    # Ubuntu only: fetch the repository's apt pin file
    curl ${curl_retry_args} \
      "${NVIDIA_REPO_URL}/cuda-${shortname}.pin" \
      -o /etc/apt/preferences.d/cuda-repository-pin-600
  fi

  mark_complete install-local-cuda-repo
}
# Purge the local CUDA repository package recorded in CUDA_LOCAL_REPO_PKG_NAME
# and clear the install-local-cuda-repo completion marker.
function uninstall_local_cuda_repo(){
  local -r step="install-local-cuda-repo"
  apt-get purge -yq "${CUDA_LOCAL_REPO_PKG_NAME}"
  mark_incomplete "${step}"
}

# Install NVIDIA's local-repo .deb for the selected cuDNN release and copy its
# keyring into /usr/share/keyrings.  Skipped when a previous run already
# recorded this step as complete.
# Globals written: pkgname, CUDNN_PKG_NAME, local_deb_fn, local_deb_url
function install_local_cudnn_repo() {
  if is_complete install-local-cudnn-repo ; then return ; fi

  # the package is named after the cuDNN version minus its last component
  pkgname="cudnn-local-repo-${shortname}-${CUDNN_VERSION%.*}"
  # remembered so uninstall_local_cudnn_repo can purge the same package
  CUDNN_PKG_NAME="${pkgname}"
  local_deb_fn="${pkgname}_1.0-1_amd64.deb"
  local_deb_url="${NVIDIA_BASE_DL_URL}/cudnn/${CUDNN_VERSION%.*}/local_installers/${local_deb_fn}"

  local -r installer="${tmpdir}/local-installer.deb"
  curl ${curl_retry_args} "${local_deb_url}" -o "${installer}"
  dpkg -i "${installer}"
  rm -f "${installer}"

  # globs stay unquoted so they expand to the directory created by the package
  cp /var/cudnn-local-repo-*-${CUDNN_VERSION%.*}*/cudnn-local-*-keyring.gpg /usr/share/keyrings

  mark_complete install-local-cudnn-repo
}

# Purge the local cuDNN repository package recorded in CUDNN_PKG_NAME and
# clear the install-local-cudnn-repo completion marker.
function uninstall_local_cudnn_repo() {
  local -r step="install-local-cudnn-repo"
  apt-get purge -yq "${CUDNN_PKG_NAME}"
  mark_incomplete "${step}"
}

# Install the cuDNN 8 local-repo .deb (fetched through cache_fetched_package)
# and copy its keyring into /usr/share/keyrings.  Only Ubuntu and Debian are
# handled; any other OS returns 0 without doing anything.
# Globals written: cudnn8_shortname, CUDNN8_CUDA_VER, cudnn_pkg_version,
#                  pkgname, CUDNN8_PKG_NAME, deb_fn, local_deb_fn, local_deb_url
function install_local_cudnn8_repo() {
  if is_complete install-local-cudnn8-repo ; then return ; fi

  # OS token used in the cuDNN 8 package name
  if is_ubuntu ; then
    cudnn8_shortname="ubuntu2004"
  elif is_debian ; then
    cudnn8_shortname="debian11"
  else
    return 0
  fi

  # CUDA version used in the download path and the package version string
  if is_cuda12 ; then
    CUDNN8_CUDA_VER=12.0
  elif is_cuda11 ; then
    CUDNN8_CUDA_VER=11.8
  else
    CUDNN8_CUDA_VER="${CUDA_VERSION}"
  fi
  cudnn_pkg_version="${CUDNN_VERSION}-1+cuda${CUDNN8_CUDA_VER}"

  pkgname="cudnn-local-repo-${cudnn8_shortname}-${CUDNN_VERSION}"
  # remembered so uninstall_local_cudnn8_repo can purge the same package
  CUDNN8_PKG_NAME="${pkgname}"

  deb_fn="${pkgname}_1.0-1_amd64.deb"
  local_deb_fn="${tmpdir}/${deb_fn}"
  local_deb_url="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/local_installers/${CUDNN8_CUDA_VER}/${deb_fn}"

  # cache the cudnn package
  cache_fetched_package \
    "${local_deb_url}" \
    "${pkg_bucket}/nvidia/cudnn/${CUDNN8_CUDA_VER}/${deb_fn}" \
    "${local_deb_fn}"

  # directory under /var that the package unpacks its local repository into,
  # read from the package's own file listing
  local cudnn_path="$(dpkg -c ${local_deb_fn} | perl -ne 'if(m{(/var/cudnn-local-repo-.*)/\s*$}){print $1}')"
  # If we are using a ram disk, mount another where we will unpack the cudnn local installer
  if [[ "${tmpdir}" == "/mnt/shm" ]] && ! grep -q '/var/cudnn-local-repo' /proc/mounts ; then
    mkdir -p "${cudnn_path}"
    mount -t tmpfs tmpfs "${cudnn_path}"
  fi

  dpkg -i "${local_deb_fn}"
  rm -f "${local_deb_fn}"

  cp "${cudnn_path}"/cudnn-local-*-keyring.gpg /usr/share/keyrings
  mark_complete install-local-cudnn8-repo
}

# Purge the cuDNN 8 local repository package recorded in CUDNN8_PKG_NAME and
# clear the install-local-cudnn8-repo completion marker.
function uninstall_local_cudnn8_repo() {
  local -r step="install-local-cudnn8-repo"
  apt-get purge -yq "${CUDNN8_PKG_NAME}"
  mark_incomplete "${step}"
}

# Install NCCL for the selected CUDA version.  Packages are built from the
# NVIDIA/nccl sources and the build output is cached in GCS; a "<tarball>.building"
# marker object signals an in-progress build to other nodes.
# Globals read:    NCCL_FOR_CUDA, CUDA_VERSION, _shortname, workdir, pkg_bucket,
#                  gsutil_cmd, gsutil_stat_cmd, curl_retry_args
# Globals written: DEFAULT_NCCL_VERSION and NCCL_VERSION (readonly),
#                  building_file, NVCC_GENCODE (exported)
# Metadata read:   nccl-version
function install_nvidia_nccl() {
  # ":-" keeps an unknown CUDA version from aborting the script under set -u
  readonly DEFAULT_NCCL_VERSION=${NCCL_FOR_CUDA["${CUDA_VERSION}"]:-}
  readonly NCCL_VERSION=$(get_metadata_attribute 'nccl-version' "${DEFAULT_NCCL_VERSION}")

  is_complete nccl && return

  if is_cuda11 && is_debian12 ; then
    echo "NCCL cannot be compiled for CUDA 11 on ${_shortname}"
    return
  fi

  local -r nccl_version="${NCCL_VERSION}-1+cuda${CUDA_VERSION}"

  mkdir -p "${workdir}"
  pushd "${workdir}"

  # Fetch and unpack the sources unless they are already present
  test -d "${workdir}/nccl" || {
    local tarball_fn="v${NCCL_VERSION}-1.tar.gz"
    curl ${curl_retry_args} \
      "https://github.com/NVIDIA/nccl/archive/refs/tags/${tarball_fn}" \
      | tar xz
    mv "nccl-${NCCL_VERSION}-1" nccl
  }

  local build_path
  if is_debuntu ; then build_path="nccl/build/pkg/deb" ; else
                       build_path="nccl/build/pkg/rpm/x86_64" ; fi

  test -d "${workdir}/nccl/build" || {
    local build_tarball="nccl-build_${_shortname}_${nccl_version}.tar.gz"
    local local_tarball="${workdir}/${build_tarball}"
    local gcs_tarball="${pkg_bucket}/nvidia/nccl/${_shortname}/${build_tarball}"

    # The core count must be compared arithmetically: "<" inside [[ ]] is a
    # string comparison, under which e.g. "4" sorts after "32".
    if [[ "$(hostname -s)" =~ ^test-gpu ]] && (( $(nproc) < 32 )) ; then
      # when running with fewer than 32 cores, yield to in-progress build
      sleep $(( ( RANDOM % 11 ) + 10 ))
      # Declared separately from the assignment so that the status tested is
      # that of the stat|grep pipeline and not that of "local".
      local output
      if output="$(${gsutil_stat_cmd} "${gcs_tarball}.building"|grep '.reation.time')" ; then
        local build_start_time build_start_epoch timeout_epoch now_epoch
        build_start_time="$(echo ${output} | awk -F': +' '{print $2}')"
        build_start_epoch="$(date -u -d "${build_start_time}" +%s)"
        timeout_epoch=$((build_start_epoch + 2700)) # 45 minutes
        while ${gsutil_stat_cmd} "${gcs_tarball}.building" ; do
          now_epoch="$(date -u +%s)"
          if (( now_epoch > timeout_epoch )) ; then
            # detect unexpected build failure after 45m
            ${gsutil_cmd} rm "${gcs_tarball}.building"
            break
          fi
          sleep 5m
        done
      fi
    fi

    if ${gsutil_stat_cmd} "${gcs_tarball}" ; then
      # cache hit - unpack from cache
      echo "cache hit"
      ${gsutil_cmd} cat "${gcs_tarball}" | tar xvz
    else
      # build and cache
      touch "${local_tarball}.building"
      ${gsutil_cmd} cp "${local_tarball}.building" "${gcs_tarball}.building"
      building_file="${gcs_tarball}.building"
      pushd nccl
      # https://github.com/NVIDIA/nccl?tab=readme-ov-file#install
      install_build_dependencies

      # https://github.com/NVIDIA/nccl/blob/master/README.md
      # https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
      # Fermi:     SM_20,             compute_30
      # Kepler:    SM_30,SM_35,SM_37, compute_30,compute_35,compute_37
      # Maxwell:   SM_50,SM_52,SM_53, compute_50,compute_52,compute_53
      # Pascal:    SM_60,SM_61,SM_62, compute_60,compute_61,compute_62

      # The following architectures are supported by open kernel driver
      # Volta:     SM_70,SM_72,       compute_70,compute_72
      # Ampere:    SM_80,SM_86,SM_87, compute_80,compute_86,compute_87

      # The following architectures are supported by CUDA v11.8+
      # Ada:       SM_89,             compute_89
      # Hopper:    SM_90,SM_90a       compute_90,compute_90a
      # Blackwell: SM_100,            compute_100
      NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72"
      NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86"
      if version_gt "${CUDA_VERSION}" "11.6" ; then
        NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_87,code=sm_87" ; fi
      if version_ge "${CUDA_VERSION}" "11.8" ; then
        NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89" ; fi
      if version_ge "${CUDA_VERSION}" "12.0" ; then
        NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a" ; fi

      if is_debuntu ; then
        # These packages are required to build .deb packages from source
        execute_with_retries \
          apt-get install -y -qq build-essential devscripts debhelper fakeroot
        export NVCC_GENCODE
        execute_with_retries make -j$(nproc) pkg.debian.build
      elif is_rocky ; then
        # These packages are required to build .rpm packages from source
        execute_with_retries \
          dnf -y -q install rpm-build rpmdevtools
        export NVCC_GENCODE
        execute_with_retries make -j$(nproc) pkg.redhat.build
      fi
      # Archive the package directory, then unpack it again from ${workdir}
      # after "make clean" so the packages are found under ${build_path}
      tar czvf "${local_tarball}" "../${build_path}"
      make clean
      popd
      tar xzvf "${local_tarball}"
      ${gsutil_cmd} cp "${local_tarball}" "${gcs_tarball}"
      if ${gsutil_stat_cmd} "${gcs_tarball}.building" ; then ${gsutil_cmd} rm "${gcs_tarball}.building" || true ; fi
      building_file=""
      rm "${local_tarball}"
    fi
  }

  if is_debuntu ; then
    dpkg -i "${build_path}/libnccl${NCCL_VERSION%%.*}_${nccl_version}_amd64.deb" "${build_path}/libnccl-dev_${nccl_version}_amd64.deb"
  elif is_rocky ; then
    rpm -ivh "${build_path}/libnccl-${nccl_version}.x86_64.rpm" "${build_path}/libnccl-devel-${nccl_version}.x86_64.rpm"
  fi

  popd
  mark_complete nccl
}

# True when the GPU driver is to come from NVIDIA
function is_src_nvidia() {
  case "${GPU_DRIVER_PROVIDER}" in
    NVIDIA ) return 0 ;;
    * ) return 1 ;;
  esac
}
# True when the GPU driver is to come from the OS distribution
function is_src_os() {
  case "${GPU_DRIVER_PROVIDER}" in
    OS ) return 0 ;;
    * ) return 1 ;;
  esac
}

# Install cuDNN packages matching CUDNN_VERSION / CUDA_VERSION for the running
# OS.  Nothing is done when the step is already complete or on debian10 and
# older.  An unsupported OS exits 1; an unsupported cuDNN major version is only
# reported and the function still runs ldconfig and marks the step complete.
function install_nvidia_cudnn() {
  if is_complete cudnn ; then return ; fi
  if le_debian10 ; then return ; fi

  local major_version="${CUDNN_VERSION%%.*}"
  local -r cuda_major="${CUDA_VERSION%%.*}"
  local cudnn_pkg_version

  if is_rocky ; then
    if is_cudnn8 ; then
      execute_with_retries dnf -y -q install \
        "libcudnn${major_version}" "libcudnn${major_version}-devel"
      sync
    elif is_cudnn9 ; then
      execute_with_retries dnf -y -q install \
        "libcudnn9-static-cuda-${cuda_major}" "libcudnn9-devel-cuda-${cuda_major}"
      sync
    else
      echo "Unsupported cudnn version: '${major_version}'"
    fi
  elif is_debuntu ; then
    if ge_debian12 && is_src_os ; then
      # OS-provided driver on debian12+: use the distribution's own package
      apt-get -y install nvidia-cudnn
    elif is_cudnn8 ; then
      add_repo_cuda

      apt-get update -qq
      # Ignore version requested and use the latest version in the package index
      cudnn_pkg_version="$(apt-cache show libcudnn8 | awk "/^Ver.*cuda${cuda_major}.*/ {print \$2}" | sort -V | tail -1)"

      execute_with_retries \
        apt-get -y install --no-install-recommends \
          "libcudnn8=${cudnn_pkg_version}" "libcudnn8-dev=${cudnn_pkg_version}"

      sync
    elif is_cudnn9 ; then
      install_cuda_keyring_pkg

      apt-get update -qq

      execute_with_retries \
        apt-get -y install --no-install-recommends \
          "libcudnn9-cuda-${cuda_major}" \
          "libcudnn9-dev-cuda-${cuda_major}" \
          "libcudnn9-static-cuda-${cuda_major}"

      sync
    else
      echo "Unsupported cudnn version: [${CUDNN_VERSION}]"
    fi
  else
    echo "Unsupported OS: '${OS_NAME}'"
    exit 1
  fi

  # refresh the dynamic linker cache
  ldconfig

  echo "NVIDIA cuDNN successfully installed for ${OS_NAME}."
  mark_complete cudnn
}

# Create (or extend) a conda environment containing pytorch and related
# packages, cache the environment as a tarball in GCS, and register it as a
# Jupyter kernel.  A "<tarball>.building" marker object signals an in-progress
# build to other nodes.
# Globals read:    DATAPROC_IMAGE_VERSION, CUDA_VERSION, _shortname, workdir,
#                  pkg_bucket, gsutil_cmd, gsutil_stat_cmd
# Globals written: building_file, cudart_spec
# Metadata read:   gpu-conda-env (default 'dpgce')
function install_pytorch() {
  is_complete pytorch && return

  local env
  env=$(get_metadata_attribute 'gpu-conda-env' 'dpgce')

  local conda_root_path
  if version_lt "${DATAPROC_IMAGE_VERSION}" "2.3" ; then
    conda_root_path="/opt/conda/miniconda3"
  else
    conda_root_path="/opt/conda"
  fi
  [[ -d ${conda_root_path} ]] || return
  local envpath="${conda_root_path}/envs/${env}"
  if [[ "${env}" == "base" ]]; then
    echo "WARNING: installing to base environment known to cause solve issues" ; envpath="${conda_root_path}" ; fi
  # Set numa node to 0 for all GPUs
  # (iterate over the glob directly rather than parsing ls output; an
  # unmatched glob stays literal, hence the existence check)
  local f
  for f in /sys/module/nvidia/drivers/pci:nvidia/*/numa_node ; do
    [[ -e "${f}" ]] || continue
    echo 0 > "${f}"
  done

  local build_tarball="pytorch_${env}_${_shortname}_cuda${CUDA_VERSION}.tar.gz"
  local local_tarball="${workdir}/${build_tarball}"
  local gcs_tarball="${pkg_bucket}/conda/${_shortname}/${build_tarball}"

  # The core count must be compared arithmetically: "<" inside [[ ]] is a
  # string comparison, under which e.g. "4" sorts after "32".
  if [[ "$(hostname -s)" =~ ^test ]] && (( $(nproc) < 32 )) ; then
    # when running with fewer than 32 cores, yield to in-progress build
    sleep $(( ( RANDOM % 11 ) + 10 ))
    # Declared separately from the assignment so that the status tested is
    # that of the stat|grep pipeline and not that of "local".
    local output
    if output="$(${gsutil_stat_cmd} "${gcs_tarball}.building"|grep '.reation.time')" ; then
      local build_start_time build_start_epoch timeout_epoch now_epoch
      build_start_time="$(echo ${output} | awk -F': +' '{print $2}')"
      build_start_epoch="$(date -u -d "${build_start_time}" +%s)"
      timeout_epoch=$((build_start_epoch + 2700)) # 45 minutes
      while ${gsutil_stat_cmd} "${gcs_tarball}.building" ; do
        now_epoch="$(date -u +%s)"
        if (( now_epoch > timeout_epoch )) ; then
          # detect unexpected build failure after 45m
          ${gsutil_cmd} rm "${gcs_tarball}.building"
          break
        fi
        sleep 5m
      done
    fi
  fi

  if ${gsutil_stat_cmd} "${gcs_tarball}" ; then
    # cache hit - unpack from cache
    echo "cache hit"
    mkdir -p "${envpath}"
    ${gsutil_cmd} cat "${gcs_tarball}" | tar -C "${envpath}" -xz
  else
    touch "${local_tarball}.building"
    ${gsutil_cmd} cp "${local_tarball}.building" "${gcs_tarball}.building"
    building_file="${gcs_tarball}.building"
    # "create" a new environment, or "install" into one that already exists
    local verb=create
    if test -d "${envpath}" ; then verb=install ; fi
    cudart_spec="cuda-cudart"
    if le_cuda11 ; then cudart_spec="cudatoolkit" ; fi

    # Install pytorch and company to this environment
    # ("tensorflow[and-cuda]" is quoted so the brackets are never treated as a glob)
    "${conda_root_path}/bin/mamba" "${verb}" -n "${env}" \
      -c conda-forge -c nvidia -c rapidsai \
      numba pytorch "tensorflow[and-cuda]" rapids pyspark \
      "cuda-version<=${CUDA_VERSION}" "${cudart_spec}"

    # Install jupyter kernel in this environment
    "${envpath}/bin/python3" -m pip install ipykernel

    # package environment and cache in GCS
    pushd "${envpath}"
    tar czf "${local_tarball}" .
    popd
    ${gsutil_cmd} cp "${local_tarball}" "${gcs_tarball}"
    if ${gsutil_stat_cmd} "${gcs_tarball}.building" ; then ${gsutil_cmd} rm "${gcs_tarball}.building" || true ; fi
    building_file=""
  fi

  # register the environment as a selectable kernel
  "${envpath}/bin/python3" -m ipykernel install --name "${env}" --display-name "Python (${env})"

  mark_complete pytorch
}

# Place the module-signing key pair where DKMS expects it.
# Globals read:    PSN (private secret name), CA_TMPDIR, mok_key, mok_der
# Globals written: modulus_md5sum
# Metadata read:   modulus_md5sum, public_secret_name, secret_project,
#                  secret_version
# When ${CA_TMPDIR}/db.rsa already exists it is (optionally) compared with the
# expected modulus checksum and re-linked; otherwise both halves are fetched
# with "gcloud secrets versions access".  A checksum mismatch is only
# reported - it does not stop the function.
function configure_dkms_certs() {
  # Skip only when PSN is set to the empty string
  if test -v PSN && [[ -z "${PSN}" ]]; then
      echo "No signing secret provided.  skipping";
      return 0
  fi

  mkdir -p "${CA_TMPDIR}"

  # If the private key exists, verify it
  if [[ -f "${CA_TMPDIR}/db.rsa" ]]; then
    echo "Private key material exists"

    local expected_modulus_md5sum
    expected_modulus_md5sum=$(get_metadata_attribute modulus_md5sum)
    if [[ -n "${expected_modulus_md5sum}" ]]; then
      modulus_md5sum="${expected_modulus_md5sum}"

      # Verify that the private key's modulus md5sum matches the expected md5sum
      if [[ "${modulus_md5sum}" != "$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')" ]]; then
        echo "unmatched rsa key"
      fi

      # Verify that the certificate's modulus md5sum matches the expected md5sum
      if [[ "${modulus_md5sum}" != "$(openssl x509 -noout -modulus -in ${mok_der} | openssl md5 | awk '{print $2}')" ]]; then
        echo "unmatched x509 cert"
      fi
    else
      # No expected value supplied: derive the checksum from the key on disk
      modulus_md5sum="$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')"
    fi
    ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}"

    return
  fi

  # Retrieve cloud secrets keys
  local sig_priv_secret_name
  sig_priv_secret_name="${PSN}"
  local sig_pub_secret_name
  sig_pub_secret_name="$(get_metadata_attribute public_secret_name)"
  local sig_secret_project
  sig_secret_project="$(get_metadata_attribute secret_project)"
  local sig_secret_version
  sig_secret_version="$(get_metadata_attribute secret_version)"

  # If metadata values are not set, do not write mok keys
  if [[ -z "${sig_priv_secret_name}" ]]; then return 0 ; fi

  # Write private material to volatile storage
  gcloud secrets versions access "${sig_secret_version}" \
         --project="${sig_secret_project}" \
         --secret="${sig_priv_secret_name}" \
      | dd status=none of="${CA_TMPDIR}/db.rsa"

  # Write public material to volatile storage
  # (the public secret is base64-decoded before it is written)
  gcloud secrets versions access "${sig_secret_version}" \
         --project="${sig_secret_project}" \
         --secret="${sig_pub_secret_name}" \
      | base64 --decode \
      | dd status=none of="${CA_TMPDIR}/db.der"

  local mok_directory="$(dirname "${mok_key}")"
  mkdir -p "${mok_directory}"

  # symlink private key and copy public cert from volatile storage to DKMS directory
  ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}"
  cp  -f "${CA_TMPDIR}/db.der" "${mok_der}"

  # Checksum of the key modulus, left in a global variable
  modulus_md5sum="$(openssl rsa -noout -modulus -in "${mok_key}" | openssl md5 | awk '{print $2}')"
}

# Delete the signing material: the ${CA_TMPDIR} directory and the
# ${mok_key} path.  When PSN is empty nothing is removed; a notice is
# written to stderr and the function succeeds.
function clear_dkms_key {
  [[ -n "${PSN}" ]] || {
    echo "No signing secret provided.  skipping" >&2
    return 0
  }

  rm -rf "${CA_TMPDIR}" "${mok_key}"
}

# Add the "contrib" component to the apt sources.
# Does nothing unless is_debuntu succeeds.  When ge_debian12 succeeds the
# Components line of debian.sources is rewritten; otherwise, when
# is_debian succeeds, lines of sources.list ending in " main" are extended.
function add_contrib_component() {
  is_debuntu || return 0

  local sources_file sed_script
  if ge_debian12 ; then
    # Include in sources file components on which nvidia-kernel-open-dkms depends
    sources_file="/etc/apt/sources.list.d/debian.sources"
    sed_script="s/Components: .*$/Components: main contrib/"
  elif is_debian ; then
    sources_file="/etc/apt/sources.list"
    sed_script='s/ main$/ main contrib/'
  else
    return 0
  fi

  sed -i -e "${sed_script}" "${sources_file}"
}

# Add the contrib and non-free components to the apt sources.
# Does nothing when is_src_nvidia succeeds.  When ge_debian12 succeeds the
# Components line of debian.sources is rewritten (including
# non-free-firmware); otherwise, when is_debian succeeds, lines of
# sources.list ending in " main" are extended.
function add_nonfree_components() {
  if is_src_nvidia ; then return 0 ; fi

  local sources_file sed_script
  if ge_debian12 ; then
    # Include in sources file components on which nvidia-open-kernel-dkms depends
    sources_file="/etc/apt/sources.list.d/debian.sources"
    sed_script="s/Components: .*$/Components: main contrib non-free non-free-firmware/"
  elif is_debian ; then
    sources_file="/etc/apt/sources.list"
    sed_script='s/ main$/ main contrib non-free/'
  else
    return 0
  fi

  sed -i -e "${sed_script}" "${sources_file}"
}

#
# Register the NVIDIA container toolkit package repository together with
# its signing key(s).
# https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html
#
# When is_debuntu succeeds, three keys are received into a dedicated
# keyring, an apt sources list referencing that keyring is written, and
# the package index is refreshed.  Otherwise the rpm .repo file is handed
# to os_add_repo.
function add_repo_nvidia_container_toolkit() {
  local nvctk_root="https://nvidia.github.io/libnvidia-container"

  if ! is_debuntu ; then
    os_add_repo nvidia-container-toolkit \
                "${nvctk_root}/gpgkey" \
                "${nvctk_root}/stable/rpm/nvidia-container-toolkit.repo" \
                "no"
    return
  fi

  # Since there are more than one keys to go into this keychain, we can't
  # call os_add_repo, which only works with one
  local -r repo_name="nvidia-container-toolkit"
  local -r kr_path="/usr/share/keyrings/${repo_name}.gpg"
  local -r repo_path="/etc/apt/sources.list.d/${repo_name}.list"
  # \$(ARCH) is written literally into the sources list
  local -r repo_data="${nvctk_root}/stable/deb/\$(ARCH) /"
  local -a signing_key_ids=(
    "0xae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80"
    "0xeb693b3035cd5710e231e123a4b469963bf863cc"
    "0xc95b321b61e88c1809c4f759ddcae044f796ecb0"
  )

  execute_with_retries gpg --keyserver keyserver.ubuntu.com \
    --no-default-keyring --keyring "${kr_path}" \
    --recv-keys "${signing_key_ids[@]}"

  {
    echo "deb     [signed-by=${kr_path}] ${repo_data}"
    echo "deb-src [signed-by=${kr_path}] ${repo_data}"
  } > "${repo_path}"

  execute_with_retries apt-get update
}

# Register the NVIDIA CUDA package repository.
#
# When is_debuntu succeeds:
#   * CUDA_VERSION <= 11.6: write an apt sources list for ${shortname}
#     and receive the two signing keys into a dedicated keyring
#   * otherwise: delegate to install_cuda_keyring_pkg (11.7+, 12.0+)
# When is_rocky succeeds: add ${NVIDIA_ROCKY_REPO_URL} through dnf
# config-manager.
function add_repo_cuda() {
  if is_debuntu ; then
    if version_le "${CUDA_VERSION}" 11.6 ; then
      local kr_path=/usr/share/keyrings/cuda-archive-keyring.gpg
      local sources_list_path="/etc/apt/sources.list.d/cuda-${shortname}-x86_64.list"
      echo "deb [signed-by=${kr_path}] https://developer.download.nvidia.com/compute/cuda/repos/${shortname}/x86_64/ /" \
      | sudo tee "${sources_list_path}"

      # The key fetch goes over the network; retry it the same way
      # add_repo_nvidia_container_toolkit retries its keyserver request.
      execute_with_retries gpg --keyserver keyserver.ubuntu.com \
        --no-default-keyring --keyring "${kr_path}" \
        --recv-keys "0xae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80" "0xeb693b3035cd5710e231e123a4b469963bf863cc"
    else
      install_cuda_keyring_pkg # 11.7+, 12.0+
    fi
  elif is_rocky ; then
    execute_with_retries "dnf config-manager --add-repo ${NVIDIA_ROCKY_REPO_URL}"
  fi
}

# Succeed when the proprietary (non-open) NVIDIA kernel module has to be
# used: on rocky8, when DRIVER_VERSION is older than MIN_OPEN_DRIVER_VER,
# or when the GPU's PCI device id is below 0x1E00 (pre-Turing).
# The device id is compared arithmetically: comparing the decimal values
# as strings mis-orders ids of 0x2710 (10000) and above, e.g. 27B8.
function nonfree_kmod_required() {
  is_rocky8 \
    || version_lt "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" \
    || (( 16#${pci_device_id} < 16#1E00 ))
}

# On hosts whose short name starts with "test" and which have fewer than
# 32 cores, yield to a peer that is already building the same kernel
# module tarball.
#   $1 - GCS path of the build-in-progress marker object
# After a random 10-20 second delay the marker is polled every five
# minutes.  A marker older than 45 minutes is treated as an unexpected
# build failure and is removed.  Returns immediately on other hosts or
# when no marker is found.
function yield_to_peer_kmod_build() {
  local -r marker="$1"

  [[ "$(hostname -s)" =~ ^test ]] || return 0
  # numeric comparison: as strings, "4" and "8" sort after "32"
  (( $(nproc) < 32 )) || return 0

  sleep $(( ( RANDOM % 11 ) + 10 ))

  # Assigned separately from the declaration so that the status checked is
  # that of the stat/grep pipeline rather than that of the `local` builtin.
  local stat_output
  stat_output="$(${gsutil_stat_cmd} "${marker}" | grep '.reation.time')" || return 0

  local build_start_time build_start_epoch timeout_epoch now_epoch
  build_start_time="$(echo ${stat_output} | awk -F': +' '{print $2}')"
  build_start_epoch="$(date -u -d "${build_start_time}" +%s)"
  timeout_epoch=$((build_start_epoch + 2700)) # 45 minutes
  while ${gsutil_stat_cmd} "${marker}" ; do
    now_epoch="$(date -u +%s)"
    if (( now_epoch > timeout_epoch )) ; then
      # detect unexpected build failure after 45m
      ${gsutil_cmd} rm "${marker}" || echo "might have been deleted by a peer"
      break
    fi
    sleep 5m
  done
}

# Build (or fetch from the GCS cache) and install the open NVIDIA kernel
# modules from the NVIDIA/open-gpu-kernel-modules GitHub release matching
# DRIVER_VERSION.
#
# Returns 0 without doing anything when nonfree_kmod_required succeeds.
# Otherwise the source tree is unpacked under ${workdir} and, unless an
# nvidia.ko already exists under /lib/modules/$(uname -r)/, a tarball of
# the built modules is taken from ${pkg_bucket} or built, signed when PSN
# is non-empty, uploaded, and finally extracted onto / followed by depmod.
# While a build is in progress, building_file holds the GCS path of the
# build-in-progress marker.
function build_driver_from_github() {
  # non-GPL driver will have been built on rocky8, or when driver
  # version is prior to open driver min, or GPU architecture is prior
  # to Turing
  if nonfree_kmod_required ; then return 0 ; fi

  pushd "${workdir}"
  test -d "${workdir}/open-gpu-kernel-modules" || {
    tarball_fn="${DRIVER_VERSION}.tar.gz"
    # NOTE(review): the escaped pipe is passed as an argument, presumably
    # so that execute_with_retries re-runs download and extraction as one
    # pipeline - confirm against execute_with_retries
    execute_with_retries curl ${curl_retry_args} \
      "https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/${tarball_fn}" \
      \| tar xz
    mv "open-gpu-kernel-modules-${DRIVER_VERSION}" open-gpu-kernel-modules
  }

  # a failing find must not abort the script, hence the combined declaration
  local nvidia_ko_path="$(find /lib/modules/$(uname -r)/ -name 'nvidia.ko')"
  test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || {
    local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}.tar.gz"
    local local_tarball="${workdir}/${build_tarball}"
    local build_dir
    # cache entries are keyed by the signing key modulus digest when known
    if test -v modulus_md5sum && [[ -n "${modulus_md5sum}" ]]
      then build_dir="${modulus_md5sum}"
      else build_dir="unsigned" ; fi

    local gcs_tarball="${pkg_bucket}/nvidia/kmod/${_shortname}/${uname_r}/${build_dir}/${build_tarball}"

    # when running with fewer than 32 cores, yield to in-progress build
    yield_to_peer_kmod_build "${gcs_tarball}.building"

    if ${gsutil_stat_cmd} "${gcs_tarball}" 2>&1 ; then
      echo "cache hit"
    else
      # build the kernel modules
      touch "${local_tarball}.building"
      ${gsutil_cmd} cp "${local_tarball}.building" "${gcs_tarball}.building"
      building_file="${gcs_tarball}.building"
      pushd open-gpu-kernel-modules
      install_build_dependencies
      if ( is_cuda11 && is_ubuntu22 ) ; then
        echo "Kernel modules cannot be compiled for CUDA 11 on ${_shortname}"
        exit 1
      fi
      execute_with_retries make -j$(nproc) modules \
        >  kernel-open/build.log \
        2> kernel-open/build_error.log
      # Sign kernel modules
      if [[ -n "${PSN}" ]]; then
        configure_dkms_certs
        # The working directory is already open-gpu-kernel-modules, so the
        # build tree is addressed by absolute path.
        for module in $(find "${workdir}/open-gpu-kernel-modules/kernel-open" -name '*.ko'); do
          "/lib/modules/${uname_r}/build/scripts/sign-file" sha256 \
          "${mok_key}" \
          "${mok_der}" \
          "${module}"
        done
        clear_dkms_key
      fi
      make modules_install \
        >>  kernel-open/build.log \
        2>> kernel-open/build_error.log
      # Collect build logs and installed binaries
      tar czvf "${local_tarball}" \
        "${workdir}/open-gpu-kernel-modules/kernel-open/"*.log \
        $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko')
      ${gsutil_cmd} cp "${local_tarball}" "${gcs_tarball}"
      if ${gsutil_stat_cmd} "${gcs_tarball}.building" ; then ${gsutil_cmd} rm "${gcs_tarball}.building" || true ; fi
      building_file=""
      rm "${local_tarball}"
      make clean
      popd
    fi
    ${gsutil_cmd} cat "${gcs_tarball}" | tar -C / -xzv
    depmod -a
  }

  popd
}

# Install the NVIDIA driver from distribution packages.
#
# When is_debuntu succeeds, the package list is the -server-open driver
# package if apt-cache knows it, else the -open package; on Debian the list
# is replaced by version-pinned component packages.  dkms is installed
# first and configure_dkms_certs runs before the driver packages are
# installed.  When is_rocky succeeds, the nvidia-driver dnf module stream
# ${DRIVER}-dkms is installed, falling back to the "latest" stream.
# clear_dkms_key runs at the end in every case.
function build_driver_from_packages() {
  if is_debuntu ; then
    local pkglist
    if [[ -n "$(apt-cache search -n "nvidia-driver-${DRIVER}-server-open")" ]] ; then
      pkglist=("nvidia-driver-${DRIVER}-server-open")
    else
      pkglist=("nvidia-driver-${DRIVER}-open")
    fi
    if is_debian ; then
      pkglist=(
        "firmware-nvidia-gsp=${DRIVER_VERSION}-1"
        "nvidia-smi=${DRIVER_VERSION}-1"
        "nvidia-alternative=${DRIVER_VERSION}-1"
        "nvidia-kernel-open-dkms=${DRIVER_VERSION}-1"
        "nvidia-kernel-support=${DRIVER_VERSION}-1"
        "nvidia-modprobe=${DRIVER_VERSION}-1"
        "libnvidia-ml1=${DRIVER_VERSION}-1"
      )
    fi
    add_contrib_component
    apt-get update -qq
    execute_with_retries apt-get install -y -qq --no-install-recommends dkms
    configure_dkms_certs
    execute_with_retries apt-get install -y -qq --no-install-recommends "${pkglist[@]}"
    sync

  elif is_rocky ; then
    configure_dkms_certs
    if execute_with_retries dnf -y -q module install "nvidia-driver:${DRIVER}-dkms" ; then
      echo "nvidia-driver:${DRIVER}-dkms installed successfully"
    else
      execute_with_retries dnf -y -q module install 'nvidia-driver:latest'
    fi
    sync
  fi
  clear_dkms_key
}

# Install the NVIDIA userspace components from NVIDIA's .run installer and,
# when nonfree_kmod_required succeeds, the proprietary kernel modules too.
#
# The installer URL is the 'gpu-driver-url' metadata attribute, defaulting
# to the us.download.nvidia.com runfile for DRIVER_VERSION.  The three
# USERSPACE_* / DEFAULT_USERSPACE_URL variables are set readonly in the
# global scope.  Nothing further happens when the "userspace" phase is
# already complete.
#
# For the proprietary module path, a tarball of built modules is taken from
# ${pkg_bucket} when present (the installer is then told not to build
# modules for drivers >= MIN_OPEN_DRIVER_VER); otherwise the installer
# builds them, signed when PSN is non-empty, and the result is uploaded.
function install_nvidia_userspace_runfile() {
  # Parameters for NVIDIA-provided Debian GPU driver
  readonly DEFAULT_USERSPACE_URL="https://us.download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"

  readonly USERSPACE_URL=$(get_metadata_attribute 'gpu-driver-url' "${DEFAULT_USERSPACE_URL}")

  USERSPACE_FILENAME="$(echo ${USERSPACE_URL} | perl -pe 's{^.+/}{}')"
  readonly USERSPACE_FILENAME

  # This .run file contains NV's OpenGL implementation as well as
  # nvidia optimized implementations of the gtk+ 2,3 stack(s) not
  # including glib (https://docs.gtk.org/glib/), and what appears to
  # be a copy of the source from the kernel-open directory of for
  # example DRIVER_VERSION=560.35.03
  #
  # https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/560.35.03.tar.gz
  #
  # wget https://us.download.nvidia.com/XFree86/Linux-x86_64/560.35.03/NVIDIA-Linux-x86_64-560.35.03.run
  # sh ./NVIDIA-Linux-x86_64-560.35.03.run -x # this will allow you to review the contents of the package without installing it.
  is_complete userspace && return
  local local_fn="${tmpdir}/userspace.run"

  cache_fetched_package "${USERSPACE_URL}" \
                        "${pkg_bucket}/nvidia/${USERSPACE_FILENAME}" \
                        "${local_fn}"

  local runfile_args=""
  local cache_hit="0"
  # Both stay empty unless the kernel module cache is consulted below; the
  # post-install stage checks gcs_tarball before uploading anything.
  local local_tarball=""
  local gcs_tarball=""

  # Build nonfree driver on rocky8, or when driver version is prior to
  # open driver min, or when GPU architecture is prior to Turing
  local nonfree_kmod="0"
  if nonfree_kmod_required ; then nonfree_kmod="1" ; fi

  if [[ "${nonfree_kmod}" == "1" ]] ; then
    # a failing find must not abort the script, hence the combined declaration
    local nvidia_ko_path="$(find /lib/modules/$(uname -r)/ -name 'nvidia.ko')"
    if [[ -z "${nvidia_ko_path}" || ! -f "${nvidia_ko_path}" ]] ; then
      local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}_nonfree.tar.gz"
      local_tarball="${workdir}/${build_tarball}"
      local build_dir
      # cache entries are keyed by the signing key modulus digest when known
      if test -v modulus_md5sum && [[ -n "${modulus_md5sum}" ]]
        then build_dir="${modulus_md5sum}"
        else build_dir="unsigned" ; fi

      gcs_tarball="${pkg_bucket}/nvidia/kmod/${_shortname}/${uname_r}/${build_dir}/${build_tarball}"

      # when running with fewer than 32 cores, yield to in-progress build
      yield_to_peer_kmod_build "${gcs_tarball}.building"

      if ${gsutil_stat_cmd} "${gcs_tarball}" ; then
        cache_hit="1"
        if version_ge "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" ; then
          runfile_args="${runfile_args} --no-kernel-modules"
        fi
        echo "cache hit"
      else
        # build the kernel modules
        touch "${local_tarball}.building"
        ${gsutil_cmd} cp "${local_tarball}.building" "${gcs_tarball}.building"
        building_file="${gcs_tarball}.building"
        install_build_dependencies
        configure_dkms_certs
        local signing_options
        signing_options=""
        if [[ -n "${PSN}" ]]; then
          signing_options="--module-signing-hash sha256 \
          --module-signing-x509-hash sha256 \
          --module-signing-secret-key \"${mok_key}\" \
          --module-signing-public-key \"${mok_der}\" \
          --module-signing-script \"/lib/modules/${uname_r}/build/scripts/sign-file\" \
          "
        fi
        runfile_args="${signing_options}"
        if version_ge "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" ; then
          runfile_args="${runfile_args} --no-dkms"
        fi
      fi
    fi
  elif version_ge "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" ; then
    runfile_args="--no-kernel-modules"
  fi

  # runfile_args is deliberately unquoted so that it splits into options
  execute_with_retries bash "${local_fn}" -e -q \
    ${runfile_args} \
    --ui=none \
    --install-libglvnd \
    --tmpdir="${tmpdir}"

  # On rocky8, or when driver version is prior to open driver min, or when GPU architecture is prior to Turing
  if [[ "${nonfree_kmod}" == "1" ]] ; then
    if [[ "${cache_hit}" == "1" ]] ; then
      ${gsutil_cmd} cat "${gcs_tarball}" | tar -C / -xzv
      depmod -a
    else
      clear_dkms_key
      # Only publish a tarball when the build stage above actually ran;
      # when nvidia.ko was already present no cache location was computed.
      if [[ -n "${gcs_tarball}" ]] ; then
        tar czvf "${local_tarball}" \
          /var/log/nvidia-installer.log \
          $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko')
        ${gsutil_cmd} cp "${local_tarball}" "${gcs_tarball}"

        if ${gsutil_stat_cmd} "${gcs_tarball}.building" ; then ${gsutil_cmd} rm "${gcs_tarball}.building" || true ; fi
        building_file=""
      fi
    fi
  fi

  rm -f "${local_fn}"
  mark_complete userspace
  sync
}

# Fetch the CUDA .run installer (via cache_fetched_package) and run it to
# install the toolkit only, without the OpenGL libraries.  Skipped when
# the "cuda" phase is already complete; marks that phase complete on
# success and removes the downloaded installer.
function install_cuda_runfile() {
  if is_complete cuda ; then return ; fi

  local -r runfile="${tmpdir}/cuda.run"
  local -a installer_args=(
    --toolkit
    --no-opengl-libs
    --silent
    "--tmpdir=${tmpdir}"
  )

  cache_fetched_package \
    "${NVIDIA_CUDA_URL}" \
    "${pkg_bucket}/nvidia/${CUDA_RUNFILE}" \
    "${runfile}"

  execute_with_retries bash "${runfile}" "${installer_args[@]}"
  rm -f "${runfile}"

  mark_complete cuda
  sync
}

# Install the CUDA toolkit from OS packages.
#
# The toolkit package spec is:
#   cuda-toolkit=<CUDA_FULL_VERSION>-1  when ge_debian12 and is_src_os succeed
#   cuda-toolkit-<major>-<minor>        when CUDA_VERSION is non-empty
#   cuda-toolkit                        otherwise
# When is_debuntu succeeds, both the pinned "cuda" package and the toolkit
# are installed with apt; when is_rocky succeeds, only the toolkit is
# installed with dnf.  cuda_package is left set in the global scope.
function install_cuda_toolkit() {
  local cudatk_package
  if ge_debian12 && is_src_os ; then
    cudatk_package="cuda-toolkit=${CUDA_FULL_VERSION}-1"
  elif [[ -n "${CUDA_VERSION}" ]]; then
    # e.g. 12.4 -> cuda-toolkit-12-4
    cudatk_package="cuda-toolkit-${CUDA_VERSION//./-}"
  else
    cudatk_package="cuda-toolkit"
  fi
  cuda_package="cuda=${CUDA_FULL_VERSION}-1"
  readonly cudatk_package

  if is_debuntu ; then
    execute_with_retries \
      apt-get install -y -qq --no-install-recommends ${cuda_package} ${cudatk_package}
  elif is_rocky ; then
    # rocky9: cuda-11-[7,8], cuda-12-[1..6]
    execute_with_retries dnf -y -q install "${cudatk_package}"
  fi
  sync
}

# Reload the NVIDIA kernel modules: try to unload nvidia_uvm, nvidia_drm,
# nvidia_modeset and nvidia (failures are reported on stdout and
# tolerated), refresh module dependencies, then load nvidia followed by
# nvidia-uvm, nvidia-modeset and nvidia-drm.
function load_kernel_module() {
  # for some use cases, the kernel module needs to be removed before first use of nvidia-smi
  for module in nvidia_uvm nvidia_drm nvidia_modeset nvidia ; do
    if ! rmmod ${module} > /dev/null 2>&1 ; then
      echo "unable to rmmod ${module}"
    fi
  done

  depmod -a

  modprobe nvidia
  for suffix in uvm modeset drm; do modprobe "nvidia-${suffix}" ; done
  # TODO: if peermem is available, also modprobe nvidia-peermem
}

# Install CUDA from the runfile and register the CUDA package repository.
# Skipped when the "cuda-repo" phase is complete, when no GPU was counted,
# or when ge_debian12 and is_src_os both succeed (a notice is printed in
# that case).
function install_cuda(){
  if is_complete cuda-repo ; then return ; fi
  [[ "${gpu_count}" != "0" ]] || return 0

  if ge_debian12 && is_src_os ; then
    echo "installed with the driver on ${_shortname}"
    return 0
  fi

  # The OS package distributions are unreliable
  install_cuda_runfile

  # Includes CUDA packages
  add_repo_cuda

  mark_complete cuda-repo
}

# Install nvidia-container-toolkit and configure it for the container
# runtime in use.
#
# The runtime is the 'container-runtime' metadata attribute; its default
# is the first of docker, containerd, crio found by `command -v` (which
# also prints the path it finds), or empty when none is found.  With an
# empty runtime nothing is installed.  CONTAINER_RUNTIME is left set in
# the global scope.  Skipped when the "install-nvctk" phase is complete.
function install_nvidia_container_toolkit() {
  if is_complete install-nvctk ; then return ; fi

  local container_runtime_default=''
  local candidate
  for candidate in docker containerd crio ; do
    if command -v "${candidate}" ; then
      container_runtime_default="${candidate}"
      break
    fi
  done
  CONTAINER_RUNTIME=$(get_metadata_attribute 'container-runtime' "${container_runtime_default}")

  [[ -n "${CONTAINER_RUNTIME}" ]] || return 0

  add_repo_nvidia_container_toolkit
  if is_debuntu ; then
    execute_with_retries apt-get install -y -q nvidia-container-toolkit
  else
    execute_with_retries dnf install -y -q nvidia-container-toolkit
  fi
  nvidia-ctk runtime configure --runtime="${CONTAINER_RUNTIME}"
  systemctl restart "${CONTAINER_RUNTIME}"

  mark_complete install-nvctk
}

# Install the NVIDIA GPU driver.
#
# Skipped when the "gpu-driver" phase is complete or when no GPU was
# counted.  When ge_debian12 and is_src_os both succeed, the driver comes
# from the distribution's packages and the function returns without
# marking the phase.  Otherwise the NVIDIA userspace runfile is installed,
# build_driver_from_github is run, and the phase is marked complete.
function install_nvidia_gpu_driver() {
  if is_complete gpu-driver ; then return ; fi
  [[ "${gpu_count}" != "0" ]] || return 0

  if ge_debian12 && is_src_os ; then
    local -a distro_driver_pkgs=(
      dkms
      nvidia-open-kernel-dkms
      nvidia-open-kernel-support
      nvidia-smi
      libglvnd0
      libcuda1
    )
    add_nonfree_components
    apt-get update -qq
    apt-get -yq install "${distro_driver_pkgs[@]}"
    echo "NVIDIA GPU driver provided by ${_shortname} was installed successfully"
    return 0
  fi

  # OS driver packages do not produce reliable driver ; use runfile
  install_nvidia_userspace_runfile

  build_driver_from_github

  echo "NVIDIA GPU driver provided by NVIDIA was installed successfully"
  mark_complete gpu-driver
}

# Install the Google Cloud Ops Agent using Google's repository setup
# script, downloaded into /opt/google (which becomes the working
# directory).  Skipped when the "ops-agent" phase is already complete.
#
# The downloaded script is compared against a recorded sha256 digest.  A
# mismatch only produces a warning on stderr: the script is published at a
# fixed URL and may legitimately change, so installation still proceeds.
function install_ops_agent(){
  is_complete ops-agent && return

  mkdir -p /opt/google
  cd /opt/google
  # https://cloud.google.com/stackdriver/docs/solutions/agents/ops-agent/installation
  local -r installer="add-google-cloud-ops-agent-repo.sh"
  curl ${curl_retry_args} -O "https://dl.google.com/cloudagents/${installer}"

  # sha256sum checksum-file format: digest, two spaces, file name
  local -r expected="038d98644e4c4a7969d26da790946720d278c8d49bb82b677f550c2a2b858411  ${installer}"
  if ! echo "${expected}" | sha256sum --check --status ; then
    echo "WARNING: ${installer} does not match the recorded sha256 checksum" >&2
  fi

  execute_with_retries bash "${installer}" --also-install

  mark_complete ops-agent
}

# Install and start the GPU utilization metric agent, which collects the
# 'gpu_utilization' and 'gpu_memory_utilization' metrics.
#
# requirements.txt and report_gpu_metrics.py are downloaded from the
# ml-on-gcp repository into /opt/gpu-utilization-agent (the "-u " option
# is stripped from the script's "-u --format=" invocation), a Python venv
# is created there and the requirements installed into it, and a systemd
# unit running the script is written, enabled and started.
# python_interpreter is left set in the global scope.
function install_gpu_agent() {
  # Stackdriver GPU agent parameters
  local -r GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/refs/heads/master/dlvm/gcp-gpu-utilization-metrics'
  local install_dir=/opt/gpu-utilization-agent
  local venv="${install_dir}/venv"

  if ! command -v pip && is_debuntu ; then
    execute_with_retries "apt-get install -y -qq python3-pip"
  fi

  mkdir -p "${install_dir}"
  curl ${curl_retry_args} \
    "${GPU_AGENT_REPO_URL}/requirements.txt" -o "${install_dir}/requirements.txt"
  curl ${curl_retry_args} \
    "${GPU_AGENT_REPO_URL}/report_gpu_metrics.py" \
    | sed -e 's/-u --format=/--format=/' \
    | dd status=none of="${install_dir}/report_gpu_metrics.py"

  # prefer the conda interpreter when it exists
  python_interpreter="/opt/conda/miniconda3/bin/python3"
  if [[ ! -f "${python_interpreter}" ]] ; then
    python_interpreter="$(command -v python3)"
  fi
  if version_ge "${DATAPROC_IMAGE_VERSION}" "2.2" && is_debuntu ; then
    execute_with_retries "apt-get install -y -qq python3-venv"
  fi
  "${python_interpreter}" -m venv "${venv}"

  # the subshell keeps the venv activation out of the calling shell
  (
    source "${venv}/bin/activate"
    python3 -m pip install --upgrade pip
    execute_with_retries python3 -m pip install -r "${install_dir}/requirements.txt"
  )
  sync

  # Generate GPU service.
  cat <<EOF >/lib/systemd/system/gpu-utilization-agent.service
[Unit]
Description=GPU Utilization Metric Agent

[Service]
Type=simple
PIDFile=/run/gpu_agent.pid
ExecStart=/bin/bash --login -c '. ${venv}/bin/activate ; python3 "${install_dir}/report_gpu_metrics.py"'
User=root
Group=root
WorkingDirectory=/
Restart=always

[Install]
WantedBy=multi-user.target
EOF

  # Reload systemd manager configuration
  systemctl daemon-reload
  # Enable gpu-utilization-agent service
  systemctl --no-reload --now enable gpu-utilization-agent.service
}

# Set one property in a Hadoop configuration file through ${bdcfg},
# overwriting any existing value (--clobber).
#   $1 - configuration file name, relative to ${HADOOP_CONF_DIR}
#   $2 - property name
#   $3 - property value
function set_hadoop_property() {
  local -r target_xml="${HADOOP_CONF_DIR}/$1"
  local -r prop_name=$2
  local -r prop_value=$3
  local -a bdcfg_args=(
    set_property
    --configuration_file "${target_xml}"
    --name "${prop_name}"
    --value "${prop_value}"
    --clobber
  )

  "${bdcfg}" "${bdcfg_args[@]}"
}

# Declare yarn.io/gpu as a YARN resource type and select the
# DominantResourceCalculator for the capacity scheduler.
# Returns 0 without changes when ${HADOOP_CONF_DIR} does not exist.  An
# empty resource-types.xml is created when that file is missing.
function configure_yarn_resources() {
  # pre-init scripts
  # TODO: when running this script to customize an image, this file
  # needs to be written *after* bdutil completes
  [[ -d "${HADOOP_CONF_DIR}" ]] || return 0

  local -r resource_types_xml="${HADOOP_CONF_DIR}/resource-types.xml"
  [[ -f "${resource_types_xml}" ]] \
    || printf '<?xml version="1.0" ?>\n<configuration/>' >"${resource_types_xml}"

  set_hadoop_property 'resource-types.xml' 'yarn.resource-types' 'yarn.io/gpu'
  set_hadoop_property 'capacity-scheduler.xml' \
    'yarn.scheduler.capacity.resource-calculator' \
    'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator'
  set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu'
}

# Configure the YARN NodeManager for GPU scheduling: enable the yarn.io/gpu
# resource plugin with automatic device discovery, point it at
# ${NVIDIA_SMI_PATH}, switch to the LinuxContainerExecutor with cgroups,
# and give the yarn user ownership of the NodeManager local directories.
# This configuration should be applied only if GPU is attached to the node
function configure_yarn_nodemanager() {
  # property name / value pairs, applied to yarn-site.xml in this order
  local -a yarn_site_settings=(
    'yarn.nodemanager.resource-plugins' 'yarn.io/gpu'
    'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto'
    'yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables' "${NVIDIA_SMI_PATH}"
    'yarn.nodemanager.linux-container-executor.cgroups.mount' 'true'
    'yarn.nodemanager.linux-container-executor.cgroups.mount-path' '/sys/fs/cgroup'
    'yarn.nodemanager.linux-container-executor.cgroups.hierarchy' 'yarn'
    'yarn.nodemanager.container-executor.class' 'org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor'
    'yarn.nodemanager.linux-container-executor.group' 'yarn'
  )
  local idx
  for (( idx = 0; idx < ${#yarn_site_settings[@]}; idx += 2 )); do
    set_hadoop_property 'yarn-site.xml' \
      "${yarn_site_settings[idx]}" "${yarn_site_settings[idx + 1]}"
  done

  # Fix local dirs access permissions
  local yarn_local_dirs=()

  # comma-separated property value -> array; every element except the last
  # keeps its trailing comma, which is stripped again below
  readarray -d ',' yarn_local_dirs < <("${bdcfg}" get_property_value \
    --configuration_file "${HADOOP_CONF_DIR}/yarn-site.xml" \
    --name "yarn.nodemanager.local-dirs" 2>/dev/null | tr -d '\n')

  if [[ "${#yarn_local_dirs[@]}" -ne "0" && "${yarn_local_dirs[@]}" != "None" ]]; then
    chown yarn:yarn -R "${yarn_local_dirs[@]/,/}"
  fi
}

# Put the GPU into EXCLUSIVE_PROCESS compute mode.
# Only applies when SPARK_VERSION is below 3.0; for 3.0 and later the
# function returns 0 without touching the GPU.
function configure_gpu_exclusive_mode() {
  if ! version_ge "${SPARK_VERSION}" "3.0" ; then
    # include exclusive mode on GPU
    nvsmi -c EXCLUSIVE_PROCESS
  fi
}

# Download the MIG support scripts (nvidia-smi wrapper and mig2gpu.sh)
# from the spark-rapids-examples branch-22.10 tree into
# /usr/local/yarn-mig-scripts and make the directory and its contents
# mode 755.
function fetch_mig_scripts() {
  local -r mig_scripts_dir="/usr/local/yarn-mig-scripts"
  local -r mig_scripts_url="https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts"

  mkdir -p "${mig_scripts_dir}"
  sudo chmod 755 "${mig_scripts_dir}"

  local script_name
  for script_name in nvidia-smi mig2gpu.sh ; do
    execute_with_retries wget -P "${mig_scripts_dir}/" "${mig_scripts_url}/${script_name}"
  done

  sudo chmod 755 "${mig_scripts_dir}"/*
}

# Install Spark's GPU discovery script and register it, together with a
# block of (mostly commented-out) RAPIDS example properties, in
# spark-defaults.conf.
#
# The discovery script is always written.  spark-defaults.conf is only
# touched for SPARK_VERSION >= 3.0, and the RAPIDS block is only appended
# for DATAPROC_IMAGE_VERSION >= 2.0 when it is not already present.
# The function returns non-zero when DATAPROC_IMAGE_VERSION is below 2.0.
function configure_gpu_script() {
  # Download GPU discovery script
  local -r spark_gpu_script_dir='/usr/lib/spark/scripts/gpu'
  mkdir -p ${spark_gpu_script_dir}
  # need to update the getGpusResources.sh script to look for MIG devices since if multiple GPUs nvidia-smi still
  # lists those because we only disable the specific GIs via CGROUPs. Here we just create it based off of:
  # https://raw.githubusercontent.com/apache/spark/master/examples/src/main/scripts/getGpusResources.sh
  local -r gpus_resources_script="${spark_gpu_script_dir}/getGpusResources.sh"
  cat > "${gpus_resources_script}" <<'EOF'
#!/usr/bin/env bash

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Example output: {"name": "gpu", "addresses":["0","1","2","3","4","5","6","7"]}

set -e
resources_json="/dev/shm/nvidia/gpusResources.json"
if test -f "${resources_json}" ; then cat "${resources_json}" ; exit 0 ; fi

mkdir -p "$(dirname ${resources_json})"

ADDRS=$(nvidia-smi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}<STDIN>))')

echo {\"name\": \"gpu\", \"addresses\":[${ADDRS}]} | tee "${resources_json}"
EOF

  chmod a+rx "${gpus_resources_script}"

  if version_lt "${SPARK_VERSION}" "3.0" ; then return ; fi

  local spark_defaults_conf="/etc/spark/conf.dist/spark-defaults.conf"
  if ! grep spark.executor.resource.gpu.discoveryScript "${spark_defaults_conf}" ; then
    echo "spark.executor.resource.gpu.discoveryScript=${gpus_resources_script}" >> "${spark_defaults_conf}"
  fi

  # 75% of the cores, rounded down to an even number, but at least one
  local executor_cores
  executor_cores="$(nproc | perl -MPOSIX -pe '$_ = POSIX::floor( $_ * 0.75 ); $_-- if $_ % 2')"
  if [[ "${executor_cores}" == "0" ]] ; then executor_cores=1 ; fi

  # 75% of MemFree (reported in kB by /proc/meminfo), in whole GiB.
  # Declared local under the name that is actually assigned, so that the
  # value no longer leaks into the global scope.
  local executor_memory_gb
  executor_memory_gb="$(awk '/^MemFree/ {print $2}' /proc/meminfo | perl -MPOSIX -pe '$_ *= 0.75; $_ = POSIX::floor( $_ / (1024*1024) )')"

  local task_cpus=2
  if [[ "${task_cpus}" -gt "${executor_cores}" ]] ; then task_cpus="${executor_cores}" ; fi
  local gpu_amount
#  gpu_amount="$(echo $executor_cores | perl -pe "\$_ = ( ${gpu_count} / (\$_ / ${task_cpus}) )")"
  gpu_amount="$(perl -e "print 1 / ${executor_cores}")"

  # the gpu.amount properties are not appropriate for the version of
  # spark shipped with 1.5 images using the capacity scheduler.  TODO:
  # In order to get spark rapids GPU accelerated SQL working on 1.5
  # images, we must configure the Fair scheduler
  version_ge "${DATAPROC_IMAGE_VERSION}" "2.0" || return

  if ! grep -q "BEGIN : RAPIDS properties" "${spark_defaults_conf}"; then
    cat >>"${spark_defaults_conf}" <<EOF
###### BEGIN : RAPIDS properties for Spark ${SPARK_VERSION} ######
# Rapids Accelerator for Spark can utilize AQE, but when the plan is not finalized,
# query explain output won't show GPU operator, if the user has doubts
# they can uncomment the line before seeing the GPU plan explain;
# having AQE enabled gives user the best performance.
#spark.sql.autoBroadcastJoinThreshold=10m
#spark.sql.files.maxPartitionBytes=512m
spark.executor.resource.gpu.amount=1
#spark.executor.cores=${executor_cores}
#spark.executor.memory=${executor_memory_gb}G
#spark.dynamicAllocation.enabled=false
# please update this config according to your application
#spark.task.resource.gpu.amount=${gpu_amount}
#spark.task.cpus=2
#spark.yarn.unmanagedAM.enabled=false
#spark.plugins=com.nvidia.spark.SQLPlugin
###### END   : RAPIDS properties for Spark ${SPARK_VERSION} ######
EOF
  fi
}

# Enable YARN GPU isolation: set the container executor group to yarn,
# append the [gpu] and [cgroups] sections to container-executor.cfg (with
# the MIG major device number and the MIG yarn-env.sh exports when
# IS_MIG_ENABLED is non-zero), and install, enable and start a systemd
# unit that opens up permissions on the cpu,cpuacct and devices cgroup
# directories.  Prints a notice and returns when ${HADOOP_CONF_DIR} does
# not exist.
function configure_gpu_isolation() {
  [[ -d "${HADOOP_CONF_DIR}" ]] || {
     echo "Hadoop conf dir ${HADOOP_CONF_DIR} not found. Skipping GPU isolation config."
     return
  }

  local -r container_executor_cfg="${HADOOP_CONF_DIR}/container-executor.cfg"
  local -r yarn_env_sh="${HADOOP_CONF_DIR}/yarn-env.sh"

  # enable GPU isolation
  sed -i "s/yarn\.nodemanager\.linux\-container\-executor\.group\=.*$/yarn\.nodemanager\.linux\-container\-executor\.group\=yarn/g" "${container_executor_cfg}"
  if [[ $IS_MIG_ENABLED -ne 0 ]]; then
    # configure the container-executor.cfg to have major caps
    printf '\n[gpu]\nmodule.enabled=true\ngpu.major-device-number=%s\n\n[cgroups]\nroot=/sys/fs/cgroup\nyarn-hierarchy=yarn\n' $MIG_MAJOR_CAPS >> "${container_executor_cfg}"
    {
      printf 'export MIG_AS_GPU_ENABLED=1\n'
      printf 'export ENABLE_MIG_GPUS_FOR_CGROUPS=1\n'
    } >> "${yarn_env_sh}"
  else
    printf '\n[gpu]\nmodule.enabled=true\n[cgroups]\nroot=/sys/fs/cgroup\nyarn-hierarchy=yarn\n' >> "${container_executor_cfg}"
  fi

  # Configure a systemd unit to ensure that permissions are set on restart
  cat >/etc/systemd/system/dataproc-cgroup-device-permissions.service<<EOF
[Unit]
Description=Set permissions to allow YARN to access device directories

[Service]
ExecStart=/bin/bash -c "chmod a+rwx -R /sys/fs/cgroup/cpu,cpuacct; chmod a+rwx -R /sys/fs/cgroup/devices"

[Install]
WantedBy=multi-user.target
EOF

  systemctl enable dataproc-cgroup-device-permissions
  systemctl start dataproc-cgroup-device-permissions
}

# Wrapper around /usr/bin/nvidia-smi.
#
# Until nvidia-smi has been seen to work once (nvsmi_works == "1"), the
# binary is checked for existence and test-run; if either check fails a
# message is written to stderr and the function returns 0 without running
# the requested command.  The output of `nvsmi -L` is cached in
# /var/run/nvidia-smi_-L.txt and served from there on later calls.
# Arguments are forwarded with an unquoted $*, i.e. re-split on whitespace.
function nvsmi() {
  local smi_bin="/usr/bin/nvidia-smi"

  if [[ "${nvsmi_works}" != "1" ]] ; then
    if [[ ! -f "${smi_bin}" ]] ; then
      echo "nvidia-smi not installed" >&2
      return 0
    fi
    if ! eval "${smi_bin} > /dev/null" ; then
      echo "nvidia-smi fails" >&2
      return 0
    fi
    nvsmi_works="1"
  fi

  if [[ "${1:-}" != "-L" ]] ; then
    "${smi_bin}" $*
    return
  fi

  local listing_cache="/var/run/nvidia-smi_-L.txt"
  if [[ ! -f "${listing_cache}" ]] ; then
    "${smi_bin}" $* | tee "${listing_cache}"
  else
    cat "${listing_cache}"
  fi
  return 0
}

# Install what is needed to compile the NVIDIA kernel modules and mark the
# "build-dependencies" phase complete.  Skipped when that phase is already
# complete.
#
# Ubuntu 22 with CUDA 12: install gcc-12 and select it as the default gcc.
# Rocky: install gcc and the kernel-devel package matching ${uname_r}.
#   When the plain install fails, the packages are looked for in the Rocky
#   vault and, after a 404, in the staging repository.  The output of each
#   attempt is written to ${install_log}, which decides the next fallback.
#
# The phase is marked complete on every successful path, including the
# Rocky paths where an early attempt succeeds.
# Note: errexit is switched on (set -e) after each probing install attempt.
function install_build_dependencies() {
  is_complete build-dependencies && return

  if is_debuntu ; then
    if is_ubuntu22 && is_cuda12 ; then
      # On ubuntu22, the default compiler does not build some kernel module versions
      # https://forums.developer.nvidia.com/t/linux-new-kernel-6-5-0-14-ubuntu-22-04-can-not-compile-nvidia-display-card-driver/278553/11
      execute_with_retries apt-get install -y -qq gcc-12
      update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11
      update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 12
      update-alternatives --set gcc /usr/bin/gcc-12
    elif is_ubuntu22 && version_lt "${CUDA_VERSION}" "11.7" ; then
      # On cuda less than 11.7, the kernel driver does not build on ubuntu22
      # https://forums.developer.nvidia.com/t/latest-nvidia-driver-470-63-01-installation-fails-with-latest-linux-kernel-5-16-5-100/202972
      echo "N.B.: Older CUDA 11 known bad on ${_shortname}"
    fi

  elif is_rocky ; then
    execute_with_retries dnf -y -q install gcc

    local dnf_cmd="dnf -y -q install kernel-devel-${uname_r}"
    local retval
    # first attempt: the regular repositories
    set +e
    eval "${dnf_cmd}" > "${install_log}" 2>&1
    retval="$?"
    set -e

    if [[ "${retval}" != "0" ]] ; then
      local os_ver="$(echo $uname_r | perl -pe 's/.*el(\d+_\d+)\..*/$1/; s/_/./')"
      local vault="https://download.rockylinux.org/vault/rocky/${os_ver}"
      if grep -q 'Unable to find a match: kernel-devel-' "${install_log}" ; then
        # this kernel-devel may have been migrated to the vault
        dnf_cmd="$(echo dnf -y -q --setopt=localpkg_gpgcheck=1 install \
          "${vault}/BaseOS/x86_64/os/Packages/k/kernel-${uname_r}.rpm" \
          "${vault}/BaseOS/x86_64/os/Packages/k/kernel-core-${uname_r}.rpm" \
          "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-${uname_r}.rpm" \
          "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-core-${uname_r}.rpm" \
          "${vault}/AppStream/x86_64/os/Packages/k/kernel-devel-${uname_r}.rpm"
         )"
      fi

      # second attempt: vault packages, or the original command again
      set +e
      eval "${dnf_cmd}" > "${install_log}" 2>&1
      retval="$?"
      set -e

      if [[ "${retval}" != "0" ]] ; then
        if grep -q 'Status code: 404 for https' "${install_log}" ; then
          local stg_url="https://download.rockylinux.org/stg/rocky/${os_ver}/devel/x86_64/os/Packages/k/"
          dnf_cmd="$(echo dnf -y -q --setopt=localpkg_gpgcheck=1 install \
            "${stg_url}/kernel-${uname_r}.rpm" \
            "${stg_url}/kernel-core-${uname_r}.rpm" \
            "${stg_url}/kernel-modules-${uname_r}.rpm" \
            "${stg_url}/kernel-modules-core-${uname_r}.rpm" \
            "${stg_url}/kernel-devel-${uname_r}.rpm"
           )"
        fi

        # last attempt: staging packages, or the previous command, retried
        execute_with_retries "${dnf_cmd}"
      fi
    fi
  fi
  mark_complete build-dependencies
}

# Reports whether an installation phase has already been marked complete.
# Globals:   workdir (read) - directory that contains the complete/ markers
# Arguments: $1 - phase name
# Returns:   0 when the marker file for the phase exists, non-zero otherwise
function is_complete() {
  # keep the phase name local so that callers' variables are not clobbered
  local phase="$1"
  test -f "${workdir}/complete/${phase}"
}

# Records that an installation phase has finished by creating its marker file.
# Globals:   workdir (read) - directory that contains the complete/ markers
# Arguments: $1 - phase name
# Returns:   exit status of touch
function mark_complete() {
  # keep the phase name local so that callers' variables are not clobbered
  local phase="$1"
  touch "${workdir}/complete/${phase}"
}

# Clears the completion marker of an installation phase, if there is one.
# Globals:   workdir (read) - directory that contains the complete/ markers
# Arguments: $1 - phase name
# Returns:   exit status of rm -f (succeeds when the marker is already absent)
function mark_incomplete() {
  # keep the phase name local so that callers' variables are not clobbered
  local phase="$1"
  rm -f "${workdir}/complete/${phase}"
}

# Installs the helper packages this script relies on (pciutils, screen).
# Does nothing when the install-dependencies phase is already marked complete;
# otherwise installs with apt-get or dnf and then marks the phase complete.
function install_dependencies() {
  is_complete install-dependencies && return 0

  # a local array keeps the list out of the global namespace and passes each
  # package as its own argument without relying on word splitting
  local -a pkg_list=(pciutils screen)
  if is_debuntu ; then execute_with_retries apt-get -y -q install "${pkg_list[@]}"
  elif is_rocky ; then execute_with_retries dnf     -y -q install "${pkg_list[@]}" ; fi
  mark_complete install-dependencies
}

# Populates the global variables that describe the GPU environment.
# Sets building_file, pci_vendor_id, gpu_count, nvsmi_works and the metadata
# driven RAPIDS_RUNTIME / INCLUDE_GPUS / INCLUDE_PYTORCH (made readonly here,
# so the function cannot be run twice in the same shell).  When at least one
# NVIDIA PCI device is present it also sets pci_device_id, pci_device_id_int,
# gpu_type and ACCELERATOR.  gcc_ver is only assigned for CUDA 11 or 12.
function prepare_gpu_env(){
  #set_support_matrix

  # if set, this variable includes a gcs path to a build-in-progress indicator
  building_file=""

  set_cuda_version
  set_driver_version

  # grep exits non-zero when no device matches; tolerate that while counting
  set +e
  # NV vendor ID is 10DE
  pci_vendor_id="10DE"
  gpu_count="$(grep -i PCI_ID=${pci_vendor_id} /sys/bus/pci/devices/*/uevent | wc -l)"
  set -e

  # numeric comparison; a string comparison with ">" is lexicographic
  if [[ "${gpu_count}" -gt 0 ]] ; then
    # N.B.: https://pci-ids.ucw.cz/v2.2/pci.ids.xz
    # the uevent line reads PCI_ID=<vendor>:<device>; keep the device part of the first match
    pci_device_id="$(grep -h -i "PCI_ID=${pci_vendor_id}" /sys/bus/pci/devices/*/uevent | head -1 | awk -F: '{print $2}')"
    pci_device_id_int="$((16#${pci_device_id}))"
    case "${pci_device_id}" in
      "15F8" ) gpu_type="nvidia-tesla-p100"      ;;
      "1BB3" ) gpu_type="nvidia-tesla-p4"        ;;
      "1DB1" ) gpu_type="nvidia-tesla-v100"      ;;
      "1EB8" ) gpu_type="nvidia-tesla-t4"        ;;
      "20B2" ) gpu_type="nvidia-tesla-a100-80gb" ;;
      "20B5" ) gpu_type="nvidia-tesla-a100-80gb" ;;
      "20F3" ) gpu_type="nvidia-tesla-a100-80gb" ;;
      "20F5" ) gpu_type="nvidia-tesla-a100-80gb" ;;
      "20"*  ) gpu_type="nvidia-tesla-a100"      ;;
      "23"*  ) gpu_type="nvidia-h100"            ;; # NB: install does not begin with legacy image 2.0.68-debian10/cuda11.1
      "27B8" ) gpu_type="nvidia-l4"              ;; # NB: install does not complete with legacy image 2.0.68-debian10/cuda11.1
      *      ) gpu_type="unrecognized"           ;;
    esac

    ACCELERATOR="type=${gpu_type},count=${gpu_count}"
  fi

  nvsmi_works="0"

  if   is_cuda11 ; then gcc_ver="11"
  elif is_cuda12 ; then gcc_ver="12" ; fi

  if ! test -v DEFAULT_RAPIDS_RUNTIME ; then
    readonly DEFAULT_RAPIDS_RUNTIME='SPARK'
  fi

  # Set variables from metadata
  RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK')
  INCLUDE_GPUS="$(get_metadata_attribute include-gpus "")"
  INCLUDE_PYTORCH="$(get_metadata_attribute 'include-pytorch' 'no')"
  readonly RAPIDS_RUNTIME INCLUDE_GPUS INCLUDE_PYTORCH

  # determine whether we have nvidia-smi installed and working
  nvsmi

  set_nv_urls
  set_cuda_runfile_url
  set_cudnn_version
  set_cudnn_tarball_url
}

# Hold all NVIDIA-related packages so that they are not upgraded
# unintentionally, e.g. by services like unattended-upgrades.
# Users should run apt-mark unhold before they wish to upgrade these packages
# Does nothing on systems that are not Debian/Ubuntu.
function hold_nvidia_packages() {
  if ! is_debuntu ; then return ; fi

  # The patterns are quoted so that apt-mark receives them verbatim; unquoted,
  # the shell would replace them with matching file names from the current
  # directory whenever such files exist.
  apt-mark hold 'nvidia-*'    > /dev/null 2>&1
  apt-mark hold 'libnvidia-*' > /dev/null 2>&1
  # grep reads the whole listing (no -q) so that dpkg is never terminated by
  # SIGPIPE, which would turn a match into a failure under pipefail
  if dpkg -l | grep "xserver-xorg-video-nvidia" > /dev/null ; then
    apt-mark hold 'xserver-xorg-video-nvidia*'
  fi
}

function check_secure_boot() {
  # Secure Boot is treated as disabled unless mokutil is present and says otherwise
  local SECURE_BOOT="disabled"
  if command -v mokutil ; then
    SECURE_BOOT="$(mokutil --sb-state | awk '{print $2}')"
  fi

  # name of the secret holding the signing material, taken from metadata
  PSN="$(get_metadata_attribute private_secret_name)"
  readonly PSN

  if [[ "${SECURE_BOOT}" == "enabled" ]] ; then
    # unsupported platform: abort the whole script
    if le_debian11 ; then
      echo "Error: Secure Boot is not supported on Debian before image 2.2. Please disable Secure Boot while creating the cluster."
      exit 1
    fi
    # supported platform but nothing to sign with: report failure to the caller
    if [[ -z "${PSN}" ]] ; then
      echo "Error: Secure boot is enabled, but no signing material provided."
      echo "Please either disable secure boot or provide signing material as per"
      echo "https://github.com/GoogleCloudDataproc/custom-images/tree/master/examples/secure-boot"
      return 1
    fi
  fi

  # -u only generates a name; the directory itself is not created here
  CA_TMPDIR="$(mktemp -u -d -p /run/tmp -t ca_dir-XXXX)"
  readonly CA_TMPDIR

  # location of the machine owner key pair differs between Ubuntu and the rest
  if is_ubuntu ; then
    mok_key=/var/lib/shim-signed/mok/MOK.priv
    mok_der=/var/lib/shim-signed/mok/MOK.der
  else
    mok_key=/var/lib/dkms/mok.key
    mok_der=/var/lib/dkms/mok.pub
  fi
  return 0
}

# Function to group Hadoop/Spark config steps (called in init-action mode or deferred)
function run_hadoop_spark_config() {
  # prepare_gpu_env provides the CUDA and driver versions; run it only when
  # at least one of them has not been set yet
  if ! [[ -v CUDA_VERSION && -v DRIVER_VERSION ]]; then
    prepare_gpu_env
  fi

  # Refresh the node role from metadata
  ROLE="$(get_metadata_attribute dataproc-role)"

  # Detect the Spark version unless a usable one is already known
  if [[ ! -v SPARK_VERSION || "${SPARK_VERSION}" == "0.0" ]]; then
    SPARK_VERSION="$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1 || echo "0.0")"
  fi

  # Count NVIDIA PCI devices; grep fails when there are none, so relax -e
  set +e
  gpu_count="$(grep -i PCI_ID=10DE /sys/bus/pci/devices/*/uevent | wc -l)"
  set -e

  # MIG defaults, overridden below when MIG turns out to be enabled
  MIG_MAJOR_CAPS=0
  NVIDIA_SMI_PATH='/usr/bin'
  IS_MIG_ENABLED=0
  # only query MIG mode when a GPU is present and nvsmi itself succeeds
  if [[ "${gpu_count}" -gt "0" ]] && nvsmi >/dev/null 2>&1; then
    migquery_result="$(nvsmi --query-gpu=mig.mode.current --format=csv,noheader || echo '[N/A]')"
    if [[ -n "${migquery_result}" && "${migquery_result}" != "[N/A]" ]]; then
      NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)"
      if [[ "${NUM_MIG_GPUS}" -eq "1" ]] && grep -q Enabled <<< "${migquery_result}"; then
        IS_MIG_ENABLED=1
        NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/'
        MIG_MAJOR_CAPS=$(grep nvidia-caps /proc/devices | cut -d ' ' -f 1 || echo 0)
        if [[ ! -d "/usr/local/yarn-mig-scripts" ]]; then
          fetch_mig_scripts || echo "WARN: Failed to fetch MIG scripts." >&2
        fi
      fi
    fi
  fi

  # Nothing can be configured without both config directories
  if ! [[ -d "${HADOOP_CONF_DIR}" && -d "${SPARK_CONF_DIR}" ]]; then
    echo "ERROR: Config directories (${HADOOP_CONF_DIR}, ${SPARK_CONF_DIR}) not found. Cannot apply configuration."
    return 1
  fi

  # Applies to every node
  configure_yarn_resources

  # GPU nodes and the master share the first three steps; the isolation and
  # exclusive-mode steps are for GPU nodes only.  Workers without a GPU get
  # no node-specific configuration.
  if [[ "${gpu_count}" -gt 0 || "${ROLE}" == "Master" ]]; then
    configure_yarn_nodemanager
    install_spark_rapids
    configure_gpu_script
    if [[ "${gpu_count}" -gt 0 ]]; then
      configure_gpu_isolation
      configure_gpu_exclusive_mode
    fi
  fi

  # Bounce whichever YARN daemons are currently active so they pick up the config
  for svc in resourcemanager nodemanager; do
    if systemctl is-active --quiet hadoop-yarn-${svc}.service ; then
      systemctl stop  hadoop-yarn-${svc}.service || echo "WARN: Failed to stop ${svc}"
      systemctl start hadoop-yarn-${svc}.service || echo "WARN: Failed to start ${svc}"
    fi
  done
  return 0
}

# This function now ONLY generates the script and service file.
# It does NOT enable the service here.
function create_deferred_config_files() {
  # Writes two files and changes nothing else:
  #   * /usr/local/sbin/apply-dataproc-gpu-config.sh - a standalone script that
  #     carries copies of the functions and arrays it needs and then calls
  #     run_hadoop_spark_config
  #   * /etc/systemd/system/dataproc-gpu-config.service - a oneshot unit that
  #     runs that script
  # The here-documents are unquoted on purpose: $(declare ...) and ${...} are
  # expanded now, while anything written as \$ is left for the generated script
  # to evaluate when it runs.
  local -r service_name="dataproc-gpu-config"
  local -r service_file="/etc/systemd/system/${service_name}.service"
  # This is the script that will contain the config logic
  local -r config_script_path="/usr/local/sbin/apply-dataproc-gpu-config.sh"

  # Use 'declare -f' to extract function definitions needed by the config logic
  # and write them, along with the config logic itself, into the new script.
  cat <<EOF > "${config_script_path}"
#!/bin/bash
# Deferred configuration script generated by install_gpu_driver.sh
set -xeuo pipefail

# --- Minimal necessary functions and variables ---
# Define constants
readonly HADOOP_CONF_DIR='/etc/hadoop/conf'
readonly SPARK_CONF_DIR='/etc/spark/conf'
readonly bdcfg="/usr/local/bin/bdconfig"
readonly workdir=/opt/install-dpgce # Needed for cache_fetched_package

# --- Define Necessary Global Arrays ---
# These need to be explicitly defined here as they are not functions.
$(declare -p DRIVER_FOR_CUDA)
$(declare -p DRIVER_SUBVER)
$(declare -p CUDNN_FOR_CUDA)
$(declare -p NCCL_FOR_CUDA)
$(declare -p CUDA_SUBVER)
# drv_for_cuda is defined within set_cuda_runfile_url, which is included below

# Define minimal metadata functions
$(declare -f print_metadata_value)
$(declare -f print_metadata_value_if_exists)
$(declare -f get_metadata_value)
$(declare -f get_metadata_attribute)

# Define nvsmi wrapper
$(declare -f nvsmi)
nvsmi_works="0" # Initialize variable used by nvsmi

# Define version comparison
$(declare -f version_ge)
$(declare -f version_gt)
$(declare -f version_le)
$(declare -f version_lt)

# Define OS check functions
$(declare -f os_id)
$(declare -f os_version)
$(declare -f os_codename) # Added os_codename as it's used by clean_up_sources_lists indirectly via os_add_repo
$(declare -f is_debian)
$(declare -f is_ubuntu)
$(declare -f is_rocky)
$(declare -f is_debuntu)
$(declare -f is_debian10)
$(declare -f is_debian11)
$(declare -f is_debian12)
$(declare -f is_rocky8)
$(declare -f is_rocky9)
$(declare -f is_ubuntu18)
$(declare -f is_ubuntu20)
$(declare -f is_ubuntu22)
$(declare -f ge_debian12)
$(declare -f le_debian10)
$(declare -f le_debian11)
$(declare -f ge_ubuntu20)
$(declare -f le_ubuntu18)
$(declare -f ge_rocky9)
$(declare -f os_vercat) # Added os_vercat as it's used by set_nv_urls/set_cuda_runfile_url
# Define _shortname (needed by install_spark_rapids -> cache_fetched_package and others)
readonly _shortname="\$(os_id)\$(os_version|perl -pe 's/(\\d+).*/\$1/')"
# Define shortname and nccl_shortname (needed by set_nv_urls)
if is_ubuntu22  ; then
    nccl_shortname="ubuntu2004"
    shortname="\$(os_id)\$(os_vercat)"
elif ge_rocky9 ; then
    nccl_shortname="rhel8"
    shortname="rhel9"
elif is_rocky ; then
    shortname="\$(os_id | sed -e 's/rocky/rhel/')\$(os_vercat)"
    nccl_shortname="\${shortname}"
else
    shortname="\$(os_id)\$(os_vercat)"
    nccl_shortname="\${shortname}"
fi
readonly shortname nccl_shortname

# Define prepare_gpu_env and its dependencies
$(declare -f prepare_gpu_env)
$(declare -f set_cuda_version)
$(declare -f set_driver_version)
$(declare -f set_nv_urls)
$(declare -f set_cuda_runfile_url)
$(declare -f set_cudnn_version)
$(declare -f set_cudnn_tarball_url)
$(declare -f is_cuda11)
$(declare -f is_cuda12)
$(declare -f le_cuda11)
$(declare -f le_cuda12)
$(declare -f ge_cuda11)
$(declare -f ge_cuda12)
$(declare -f is_cudnn8)
$(declare -f is_cudnn9)

# Define DATAPROC_IMAGE_VERSION (re-evaluate)
SPARK_VERSION="\$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1 || echo "0.0")"
if   version_lt "\${SPARK_VERSION}" "2.5" ; then DATAPROC_IMAGE_VERSION="1.5"
elif version_lt "\${SPARK_VERSION}" "3.2" ; then DATAPROC_IMAGE_VERSION="2.0"
elif version_lt "\${SPARK_VERSION}" "3.4" ; then DATAPROC_IMAGE_VERSION="2.1"
elif version_lt "\${SPARK_VERSION}" "3.6" ; then
  if [[ -f /etc/environment ]] ; then
    eval "\$(grep '^DATAPROC_IMAGE_VERSION' /etc/environment)" || true
  fi
  # eval of an empty grep result succeeds without assigning, so default explicitly
  DATAPROC_IMAGE_VERSION="\${DATAPROC_IMAGE_VERSION:-2.2}"
else DATAPROC_IMAGE_VERSION="2.3" ; fi # Default to latest known version
readonly DATAPROC_IMAGE_VERSION

# Define set_hadoop_property
$(declare -f set_hadoop_property)

# --- Include definitions of functions called by the config logic ---
$(declare -f configure_yarn_resources)
$(declare -f configure_yarn_nodemanager)
$(declare -f install_spark_rapids)
$(declare -f configure_gpu_script)
$(declare -f configure_gpu_isolation)
$(declare -f configure_gpu_exclusive_mode)
$(declare -f fetch_mig_scripts)
$(declare -f cache_fetched_package)
$(declare -f execute_with_retries)

# --- Define gsutil/gcloud commands and curl args ---
gsutil_cmd="gcloud storage"
gsutil_stat_cmd="gcloud storage objects describe"
gcloud_sdk_version="\$(gcloud --version | awk -F'SDK ' '/Google Cloud SDK/ {print \$2}' || echo '0.0.0')"
if version_lt "\${gcloud_sdk_version}" "402.0.0" ; then
  gsutil_cmd="gsutil -o GSUtil:check_hashes=never"
  gsutil_stat_cmd="gsutil stat"
fi
curl_retry_args="-fsSL --retry-connrefused --retry 10 --retry-max-time 30"
# Define pkg_bucket (needed by cache_fetched_package)
temp_bucket="\$(get_metadata_attribute dataproc-temp-bucket)"
readonly temp_bucket
readonly pkg_bucket="gs://\${temp_bucket}/dpgce-packages"
readonly install_log="/tmp/deferred-config-install.log" # Log file for execute_with_retries

# --- Include the main config function ---
$(declare -f run_hadoop_spark_config)

# --- Execute the config logic ---
if run_hadoop_spark_config; then
  # Configuration successful, disable the service
  systemctl disable ${service_name}.service
  rm -f "${config_script_path}" "${service_file}"
  systemctl daemon-reload
else
  echo "ERROR: Deferred configuration script (${config_script_path}) failed." >&2
  # Keep the service enabled to allow for manual inspection/retry
  exit 1
fi

exit 0
EOF

  chmod +x "${config_script_path}"

  # systemd only recognises comments on lines of their own; text after a value
  # becomes part of the value, so every comment below sits on a separate line
  cat <<EOF > "${service_file}"
[Unit]
Description=Apply Dataproc GPU configuration on first boot
# Ensure it runs after Dataproc agent and YARN services are likely up
After=google-dataproc-agent.service network-online.target hadoop-yarn-resourcemanager.service hadoop-yarn-nodemanager.service
Wants=network-online.target google-dataproc-agent.service

[Service]
Type=oneshot
# Execute the generated config script
ExecStart=${config_script_path}
# Service is done after exec
RemainAfterExit=no
StandardOutput=journal+console
StandardError=journal+console

[Install]
WantedBy=multi-user.target
EOF

  chmod 644 "${service_file}"
  # Service is enabled later only if IS_CUSTOM_IMAGE_BUILD is true
}

function main() {
  # Entry point of the installation.  When lspci reports an NVIDIA device it
  # installs the driver stack (unless MIG is already enabled), switches MIG on
  # where nvsmi -L lists MIG-capable devices, and applies the YARN/Spark GPU
  # configuration.  In every case it then generates the deferred config script
  # and unit, and either enables the unit (custom image build) or runs the
  # script right away (init action).
  # Reads globals set up by earlier stages, among them IS_MIG_ENABLED,
  # CUDNN_VERSION, INCLUDE_PYTORCH, INSTALL_GPU_AGENT, CUDA_VERSION, ROLE and
  # IS_CUSTOM_IMAGE_BUILD.

  # Perform installations (these are generally safe during image build)
  if (lspci | grep -q NVIDIA); then
    # Check MIG status early, primarily for driver installation logic
    migquery_result="$(nvsmi --query-gpu=mig.mode.current --format=csv,noheader || echo '[N/A]')" # Use || for safety
    if [[ "${migquery_result}" == "[N/A]" ]] ; then migquery_result="" ; fi
    NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)"

    if [[ "${NUM_MIG_GPUS}" -gt 0 ]] ; then
      if [[ "${NUM_MIG_GPUS}" -eq "1" ]]; then
        if (echo "${migquery_result}" | grep Enabled); then
          IS_MIG_ENABLED=1
          # Fetch MIG scripts early if needed by driver install/check
          if [[ ! -d "/usr/local/yarn-mig-scripts" ]]; then fetch_mig_scripts || echo "WARN: Failed to fetch MIG scripts." >&2; fi
        fi
      fi
    fi

    # Install core components if MIG is not already enabled (MIG setup implies drivers exist)
    if [[ $IS_MIG_ENABLED -eq 0 ]]; then
      install_nvidia_gpu_driver
      install_nvidia_container_toolkit
      install_cuda
      load_kernel_module # Load modules after driver install

      if [[ -n ${CUDNN_VERSION} ]]; then
        install_nvidia_nccl
        install_nvidia_cudnn
      fi
      case "${INCLUDE_PYTORCH^^}" in
        "1" | "YES" | "TRUE" ) install_pytorch ;;
      esac
      #Install GPU metrics collection in Stackdriver if needed
      if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then
        #install_ops_agent
        install_gpu_agent
        echo 'GPU metrics agent successfully deployed.'
      else
        echo 'GPU metrics agent will not be installed.'
      fi

      # for some use cases, the kernel module needs to be removed before first use of nvidia-smi
      for module in nvidia_uvm nvidia_drm nvidia_modeset nvidia ; do
        rmmod ${module} > /dev/null 2>&1 || echo "unable to rmmod ${module}"
      done

      if test -n "$(nvsmi -L)" ; then
        # cache the result of the gpu query
        ADDRS=$(nvsmi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}<STDIN>))')
        echo "{\"name\": \"gpu\", \"addresses\":[$ADDRS]}" | tee "/var/run/nvidia-gpu-index.txt"
        chmod a+r "/var/run/nvidia-gpu-index.txt"
      fi
      MIG_GPU_LIST="$(nvsmi -L | grep -E '(MIG|[PVAH]100)' || echo -n "")"
      NUM_MIG_GPUS="$(test -n "${MIG_GPU_LIST}" && echo "${MIG_GPU_LIST}" | wc -l || echo "0")"
      if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then
        # enable MIG on every GPU
        # The list is quoted so that awk sees one device per line; unquoted it
        # collapses into a single line and only the first GPU index is found.
        for GPU_ID in $(echo "${MIG_GPU_LIST}" | awk -F'[: ]' '{print $2}') ; do
          if version_le "${CUDA_VERSION}" "11.6" ; then
            nvsmi -i "${GPU_ID}" --multi-instance-gpu=1
          else
            nvsmi -i "${GPU_ID}" --multi-instance-gpu 1

          fi
        done

        NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/'
        MIG_MAJOR_CAPS="$(grep nvidia-caps /proc/devices | cut -d ' ' -f 1)"
        fetch_mig_scripts
      else
        configure_gpu_exclusive_mode
      fi
    fi

    configure_yarn_nodemanager
    install_spark_rapids
    configure_gpu_script
    configure_gpu_isolation
  elif [[ "${ROLE}" == "Master" ]]; then
    # Master node without GPU detected.
    :
  else
    # Worker node without GPU detected.
    :
  fi # End GPU detection

  # --- Generate Config Script and Service File ---
  # This happens in both modes now
  create_deferred_config_files

  # --- Apply or Defer Configuration ---
  if [[ "${IS_CUSTOM_IMAGE_BUILD}" == "true" ]]; then
    # Enable the systemd service for first boot
    systemctl enable "dataproc-gpu-config.service"
  else
    # Running as a standard init action: execute the generated script immediately
    local -r config_script_path="/usr/local/sbin/apply-dataproc-gpu-config.sh"
    if [[ -x "${config_script_path}" ]]; then
        bash -x "${config_script_path}"
    else
        echo "ERROR: Generated config script ${config_script_path} not found or not executable."
        exit 1
    fi
    # The config script handles its own cleanup and service disabling on success
  fi
  # --- End Apply or Defer ---
}

function cache_fetched_package() {
  # $1 - upstream URL, $2 - object path in the GCS cache, $3 - local destination
  local src_url="$1"
  local gcs_fn="$2"
  local local_fn="$3"

  # Cache miss: download from upstream and, if that worked, seed the cache.
  # gsutil_stat_cmd, gsutil_cmd and curl_retry_args hold a command plus its
  # options and are therefore expanded unquoted on purpose.
  if ! ${gsutil_stat_cmd} "${gcs_fn}" 2>&1 ; then
    time (
      curl ${curl_retry_args} "${src_url}" -o "${local_fn}" \
        && execute_with_retries ${gsutil_cmd} cp "${local_fn}" "${gcs_fn}"
    )
    return
  fi

  # Cache hit: copy the object down from the bucket
  execute_with_retries ${gsutil_cmd} cp "${gcs_fn}" "${local_fn}"
}

# Rewrites the apt source lists so that each repository is tied to its own
# keyring through a [signed-by=...] option, downloading the keys as needed.
# Only acts on Debian/Ubuntu.  The adoptium and docker lists are always
# (re)written; the other repositories are only touched when their list file
# already exists.  Finally the legacy /etc/apt/trusted.gpg is moved aside.
# Globals: curl_retry_args (read); region and list_file are assigned without
#          local and therefore remain set after the call.
function clean_up_sources_lists() {
  if ! is_debuntu; then return; fi
  #
  # bigtop (primary)
  #
  local -r dataproc_repo_file="/etc/apt/sources.list.d/dataproc.list"

  # only rewrite the file when it has not been given a signed-by option yet
  if [[ -f "${dataproc_repo_file}" ]] && ! grep -q signed-by "${dataproc_repo_file}" ; then
    # reduce the zone value to its last path component and strip the trailing
    # -<letters> suffix to obtain the region
    region="$(get_metadata_value zone | perl -p -e 's:.*/:: ; s:-[a-z]+$::')"

    # derive the URI of the regional bigtop repository from the configured one
    local regional_bigtop_repo_uri
    regional_bigtop_repo_uri=$(cat ${dataproc_repo_file} |
      sed -E "s#/dataproc-bigtop-repo(-dev)?/#/goog-dataproc-bigtop-repo\\1-${region}/#" |
      grep -E "deb .*goog-dataproc-bigtop-repo(-dev)?-${region}.* dataproc contrib" |
      cut -d ' ' -f 2 |
      head -1)

    # append archive.key, avoiding a doubled slash
    if [[ "${regional_bigtop_repo_uri}" == */ ]]; then
      local -r bigtop_key_uri="${regional_bigtop_repo_uri}archive.key"
    else
      local -r bigtop_key_uri="${regional_bigtop_repo_uri}/archive.key"
    fi

    local -r bigtop_kr_path="/usr/share/keyrings/bigtop-keyring.gpg"
    rm -f "${bigtop_kr_path}"
    curl ${curl_retry_args} \
      "${bigtop_key_uri}" | gpg --dearmor -o "${bigtop_kr_path}"

    sed -i -e "s:deb https:deb [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}"
    sed -i -e "s:deb-src https:deb-src [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}"
  fi

  #
  # adoptium
  #
  # https://adoptium.net/installation/linux/#_deb_installation_on_debian_or_ubuntu
  # NOTE(review): key_url is declared but not referenced again in this function;
  # the keys are fetched from keyserver.ubuntu.com below
  local -r key_url="https://packages.adoptium.net/artifactory/api/gpg/key/public"
  local -r adoptium_kr_path="/usr/share/keyrings/adoptium.gpg"
  rm -f "${adoptium_kr_path}"
  # drop the list file of the former adoptopenjdk repository if present
  local -r old_adoptium_list="/etc/apt/sources.list.d/adoptopenjdk.list"
  if test -f "${old_adoptium_list}" ; then
    rm -f "${old_adoptium_list}"
  fi
  for keyid in "0x3b04d753c9050d9a5d343f39843c48a565f8f04b" "0x35baa0b33e9eb396f59ca838c0ba5ce6dc6315a3" ; do
    curl ${curl_retry_args} "https://keyserver.ubuntu.com/pks/lookup?op=get&search=${keyid}" \
    | gpg --import --no-default-keyring --keyring "${adoptium_kr_path}"
  done
  echo "deb [signed-by=${adoptium_kr_path}] https://packages.adoptium.net/artifactory/deb/ $(os_codename) main" \
   > /etc/apt/sources.list.d/adoptium.list

  #
  # docker
  #
  local docker_kr_path="/usr/share/keyrings/docker-keyring.gpg"
  local docker_repo_file="/etc/apt/sources.list.d/docker.list"
  local -r docker_key_url="https://download.docker.com/linux/$(os_id)/gpg"

  rm -f "${docker_kr_path}"
  curl ${curl_retry_args} "${docker_key_url}" \
    | gpg --import --no-default-keyring --keyring "${docker_kr_path}"
  echo "deb [signed-by=${docker_kr_path}] https://download.docker.com/linux/$(os_id) $(os_codename) stable" \
    > ${docker_repo_file}

  #
  # google cloud + logging/monitoring
  #
  local gcloud_kr_path="/usr/share/keyrings/cloud.google.gpg"
  # ls succeeds only when at least one matching list file exists
  if ls /etc/apt/sources.list.d/google-clou*.list ; then
    rm -f "${gcloud_kr_path}"
    curl ${curl_retry_args} https://packages.cloud.google.com/apt/doc/apt-key.gpg \
      | gpg --import --no-default-keyring --keyring "${gcloud_kr_path}"
    for list in google-cloud google-cloud-logging google-cloud-monitoring ; do
      list_file="/etc/apt/sources.list.d/${list}.list"
      if [[ -f "${list_file}" ]]; then
        sed -i -e "s:deb https:deb [signed-by=${gcloud_kr_path}] https:g" "${list_file}"
      fi
    done
  fi

  #
  # cran-r
  #
  if [[ -f /etc/apt/sources.list.d/cran-r.list ]]; then
    local cranr_kr_path="/usr/share/keyrings/cran-r.gpg"
    rm -f "${cranr_kr_path}"
    for keyid in "0x95c0faf38db3ccad0c080a7bdc78b2ddeabc47b7" "0xe298a3a825c0d65dfd57cbb651716619e084dab9" ; do
      curl ${curl_retry_args} "https://keyserver.ubuntu.com/pks/lookup?op=get&search=${keyid}" \
      | gpg --import --no-default-keyring --keyring "${cranr_kr_path}"
    done
    # the pattern "deb http" also matches entries that start with "deb https"
    sed -i -e "s:deb http:deb [signed-by=${cranr_kr_path}] http:g" /etc/apt/sources.list.d/cran-r.list
  fi

  #
  # mysql
  #
  if [[ -f /etc/apt/sources.list.d/mysql.list ]]; then
    rm -f /usr/share/keyrings/mysql.gpg
    curl ${curl_retry_args} 'https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xBCA43417C3B485DD128EC6D4B7B3B788A8D3785C' | \
      gpg --dearmor -o /usr/share/keyrings/mysql.gpg
    sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/mysql.gpg] https:g' /etc/apt/sources.list.d/mysql.list
  fi

  # move the legacy global keyring out of the way
  if [[ -f /etc/apt/trusted.gpg ]] ; then mv /etc/apt/trusted.gpg /etc/apt/old-trusted.gpg ; fi

}

# Cleanup routine (installed as the EXIT trap by prepare_to_install).
# Purges key material, removes the build-in-progress indicator, clears package
# and pip caches, unmounts the tmpfs mounts created for the installation,
# prints disk usage statistics, and zero-fills free space when a custom image
# is being built.  Always returns 0.
# Globals read: building_file, gsutil_stat_cmd, gsutil_cmd, tmpdir,
#               IS_CUSTOM_IMAGE_BUILD
function exit_handler() {
  # Purge private key material until next grant
  clear_dkms_key

  # clean up incomplete build indicators
  if test -n "${building_file}" ; then
    if ${gsutil_stat_cmd} "${building_file}" ; then ${gsutil_cmd} rm "${building_file}" || true ; fi
  fi

  set +e # Allow cleanup commands to fail without exiting script
  echo "Exit handler invoked"

  # Clear pip cache
  # TODO: make this conditional on which OSs have pip without cache purge
  pip cache purge || echo "unable to purge pip cache"


  # If system memory was sufficient to mount memory-backed filesystems
  if [[ "${tmpdir}" == "/mnt/shm" ]] ; then
    # remove the tmpfs pip cache-dir
    pip config unset global.cache-dir || echo "unable to unset global pip cache"

    # Clean up shared memory mounts
    # (only those that are mounted now and are not listed in /etc/fstab)
    for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm /tmp /var/cudnn-local ; do
      if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then
        umount -f ${shmdir}
      fi
    done

    # restart services stopped during preparation stage
    # systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/'
  fi

  if is_debuntu ; then
    # Clean up OS package cache
    apt-get -y -qq clean
    apt-get -y -qq -o DPkg::Lock::Timeout=60 autoremove
    # re-hold systemd package
    if ge_debian12 ; then
    apt-mark hold systemd libsystemd0 ; fi
    hold_nvidia_packages
  else
    dnf clean all
  fi

  # print disk usage statistics for large components
  if is_ubuntu ; then
    du -hs \
      /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \
      /usr/lib \
      /opt/nvidia/* \
      /opt/conda/miniconda3 2>/dev/null | sort -h
  elif is_debian ; then
    du -x -hs \
      /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu,} \
      /var/lib/{docker,mysql,} \
      /opt/nvidia/* \
      /opt/{conda,google-cloud-ops-agent,install-nvidia,} \
      /usr/bin \
      /usr \
      /var \
      / 2>/dev/null | sort -h
  else # Rocky
    du -hs \
      /var/lib/docker \
      /usr/lib/{pig,hive,hadoop,firmware,jvm,spark,atlas,} \
      /usr/lib64/google-cloud-sdk \
      /opt/nvidia/* \
      /opt/conda/miniconda3 2>/dev/null | sort -h
  fi

  # Process disk usage logs from installation period
  # (removing the flag file stops the background df sampling loop)
  rm -f /run/keep-running-df
  sync
  sleep 5.01s
  # compute maximum size of disk during installation
  # Log file contains logs like the following (minus the preceding #):
#Filesystem     1K-blocks    Used Available Use% Mounted on
#/dev/vda2        7096908 2611344   4182932  39% /
  df / | tee -a "/run/disk-usage.log"

  # Samples are sorted in descending numeric order so that the first element
  # is the maximum and the last one the minimum.
  perl -e '($first, @samples) = grep { m:^/: } <STDIN>;
           unshift(@samples,$first); $final=$samples[-1];
           ($starting)=(split(/\s+/,$first))[2] =~ /^(\d+)/;
             ($ending)=(split(/\s+/,$final))[2] =~ /^(\d+)/;
           @siz=( sort { $b <=> $a }
                   map { (split)[2] =~ /^(\d+)/ } @samples );
$max=$siz[0]; $min=$siz[-1]; $inc=$max-$starting;
print( "     samples-taken: ", scalar @siz, $/,
       "starting-disk-used: $starting", $/,
       "  ending-disk-used: $ending", $/,
       " maximum-disk-used: $max", $/,
       " minimum-disk-used: $min", $/,
       "      increased-by: $inc", $/ )' < "/run/disk-usage.log"

  echo "exit_handler has completed"

  # zero free disk space (only if creating image)
  if [[ "${IS_CUSTOM_IMAGE_BUILD}" == "true" ]]; then
    dd if=/dev/zero of=/zero status=progress || true
    sync
    sleep 3s
    rm -f /zero || true
  fi

  return 0
}

# Exports proxy settings taken from the http-proxy metadata attribute.
# Leaves the environment untouched when the attribute is empty.  Otherwise
# exports http_proxy/https_proxy in both spellings and a proxy exclusion list
# (loopback, the metadata server and a set of googleapis.com hosts) as both
# no_proxy and NO_PROXY.
function set_proxy(){
  METADATA_HTTP_PROXY="$(get_metadata_attribute http-proxy '')"

  if [[ -z "${METADATA_HTTP_PROXY}" ]] ; then return ; fi

  export http_proxy="${METADATA_HTTP_PROXY}"
  export https_proxy="${METADATA_HTTP_PROXY}"
  export HTTP_PROXY="${METADATA_HTTP_PROXY}"
  export HTTPS_PROXY="${METADATA_HTTP_PROXY}"
  no_proxy="localhost,127.0.0.0/8,::1,metadata.google.internal,169.254.169.254"
  local no_proxy_svc
  for no_proxy_svc in compute  secretmanager dns    servicedirectory     logging  \
                      bigquery composer      pubsub bigquerydatatransfer dataflow \
                      storage  datafusion    ; do
    no_proxy="${no_proxy},${no_proxy_svc}.googleapis.com"
  done

  # Export both spellings, as is done for the proxy URL itself, so that tools
  # that only consult the lower-case variable see the exclusions too.
  export no_proxy
  export NO_PROXY="${no_proxy}"
}

function mount_ramdisk(){
  # Skip the ramdisk entirely unless MemFree in /proc/meminfo reaches the
  # threshold below; tmpdir is then left as the caller set it
  local mem_free
  mem_free="$(awk '/^MemFree/ {print $2}' /proc/meminfo)"
  if [[ "${mem_free}" -lt 20500000 ]]; then
    return 0
  fi

  # Point tmpdir at a tmpfs so that downloads do not churn the persistent disk
  tmpdir="/mnt/shm"
  mkdir -p "${tmpdir}/pkgs_dirs"
  mount -t tmpfs tmpfs "${tmpdir}"

  # conda, when installed, downloads its packages to the tmpfs
  local -r conda_bin="/opt/conda/miniconda3/bin/conda"
  if [[ -f "${conda_bin}" ]] ; then
    "${conda_bin}" config --add pkgs_dirs "${tmpdir}"
  fi

  # Start from an empty pip cache
  # TODO: make this conditional on which OSs have pip without cache purge
  pip cache purge || echo "unable to purge pip cache"

  # pip downloads its packages to the tmpfs as well
  pip config set global.cache-dir "${tmpdir}" || echo "unable to set global.cache-dir"

  # The OS package manager cache gets a tmpfs of its own
  local os_pkg_cache="/var/cache/dnf"
  if is_debuntu ; then
    os_pkg_cache="/var/cache/apt/archives"
  fi
  mount -t tmpfs tmpfs "${os_pkg_cache}"
}

function harden_sshd_config() {
  # Remove every sha1 and md5 based algorithm from the key exchange settings.
  # feature_map: "ssh -Q" query name -> sshd_config keyword it controls
  declare -A feature_map=(["kex"]="kexalgorithms")
  if is_rocky || version_ge "${DATAPROC_IMAGE_VERSION}" "2.1" ; then
    feature_map["kex-gss"]="gssapikexalgorithms"
  fi
  for ftr in "${!feature_map[@]}" ; do
    local feature=${feature_map[$ftr]}
    local sshd_config_line
    # union of the algorithms sshd reports for this keyword and those ssh
    # lists for the feature, de-duplicated, filtered, joined with commas
    sshd_config_line="${feature} $(
      { sshd -T | awk "/^${feature} / {print \$2}" | tr ',' '\n' ;
        ssh -Q "${ftr}" ; } \
      | sort -u | grep -viE 'sha1|md5' | paste -s -d ',' -)"

    # new config = old config without this keyword, plus the rebuilt line
    {
      grep -iv "^${feature} " /etc/ssh/sshd_config
      echo "$sshd_config_line"
    } > /tmp/sshd_config_new
    # TODO: test whether sshd will reload with this change before mv
    mv -f /tmp/sshd_config_new /etc/ssh/sshd_config
  done

  # the service unit is called sshd on Rocky and ssh elsewhere
  local svc
  if is_rocky ; then svc="sshd" ; else svc=ssh ; fi
  systemctl reload "${svc}"
}

# Performs the one-time preparation that precedes the installation proper:
# validates the OS and Secure Boot state, detects whether a custom image is
# being built, selects the gcloud/gsutil commands, sets up temporary storage,
# prepares the GPU environment, registers exit_handler as the EXIT trap and
# applies proxy settings.  The remaining steps (sshd hardening, package source
# cleanup, optional zero-fill, dependency installation, disk usage monitor)
# are skipped when the prepare.common phase is already marked complete.
# The statements depend on each other's side effects, so their order matters.
function prepare_to_install(){
  readonly uname_r=$(uname -r)
  # Verify OS compatability and Secure boot state
  check_os
  check_secure_boot

  # --- Detect Image Build Context ---
  # Use 'initialization-actions' as the default name for clarity
  INVOCATION_TYPE="$(get_metadata_attribute invocation-type "initialization-actions")"
  if [[ "${INVOCATION_TYPE}" == "custom-images" ]]; then
    IS_CUSTOM_IMAGE_BUILD="true"
    # echo "Detected custom image build context (invocation-type=custom-images). Configuration will be deferred." # Keep silent
  else
    IS_CUSTOM_IMAGE_BUILD="false" # Ensure it's explicitly false otherwise
    # echo "Running in initialization action mode (invocation-type=${INVOCATION_TYPE})." # Keep silent
  fi

  # With the 402.0.0 release of gcloud sdk, `gcloud storage` can be
  # used as a more performant replacement for `gsutil`
  gsutil_cmd="gcloud storage"
  gsutil_stat_cmd="gcloud storage objects describe"
  gcloud_sdk_version="$(gcloud --version | awk -F'SDK ' '/Google Cloud SDK/ {print $2}')"
  if version_lt "${gcloud_sdk_version}" "402.0.0" ; then
    gsutil_cmd="gsutil -o GSUtil:check_hashes=never"
    gsutil_stat_cmd="gsutil stat"
  fi
  curl_retry_args="-fsSL --retry-connrefused --retry 10 --retry-max-time 30"

  # Setup temporary directories (potentially on RAM disk)
  tmpdir=/tmp/ # Default
  mount_ramdisk # Updates tmpdir if successful
  install_log="${tmpdir}/install.log" # Set install log path based on final tmpdir

  # Prepare GPU environment variables (versions, URLs, counts)
  prepare_gpu_env

  workdir=/opt/install-dpgce
  # Set GCS bucket for caching
  temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)"
  readonly temp_bucket
  readonly pkg_bucket="gs://${temp_bucket}/dpgce-packages"
  readonly bdcfg="/usr/local/bin/bdconfig"
  export DEBIAN_FRONTEND=noninteractive

  # directory that holds the per-phase completion markers
  mkdir -p "${workdir}/complete"
  # from here on exit_handler runs whenever the script exits
  trap exit_handler EXIT
  set_proxy

  # everything below runs only once per machine
  is_complete prepare.common && return

  harden_sshd_config

  if is_debuntu ; then
    repair_old_backports
    clean_up_sources_lists
    apt-get update -qq --allow-releaseinfo-change
    apt-get -y clean
    apt-get -o DPkg::Lock::Timeout=60 -y autoremove
    # exit_handler puts the hold on these packages back
    if ge_debian12 ; then
    apt-mark unhold systemd libsystemd0 ; fi
    if is_ubuntu ; then
      # Wait for gcloud to be available on Ubuntu
      while ! command -v gcloud ; do sleep 5s ; done
    fi
  else # Rocky
    dnf clean all
  fi

  # zero free disk space (only if creating image)
  # (runs in a subshell with -e disabled because dd ends with an error once
  # the disk is full)
  if [[ "${IS_CUSTOM_IMAGE_BUILD}" == "true" ]]; then ( set +e
    time dd if=/dev/zero of=/zero status=none ; sync ; sleep 3s ; rm -f /zero
  ) fi

  install_dependencies

  # Monitor disk usage in a screen session
  # (the loop samples df every 5s for as long as /run/keep-running-df exists)
  df / > "/run/disk-usage.log"
  touch "/run/keep-running-df"
  screen -d -m -LUS keep-running-df \
    bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done"

  mark_complete prepare.common
}

#
# Validate the host OS release and the installed Spark version, then make sure
# DATAPROC_IMAGE_VERSION holds a value.
#
# Globals read:    DATAPROC_IMAGE_VERSION, DATAPROC_VERSION (both optional)
# Globals written: SPARK_VERSION (readonly), DATAPROC_IMAGE_VERSION
# Exits with status 1 when the OS release or the Spark version is unsupported.
#
function check_os() {
  if is_debian && ( ! is_debian10 && ! is_debian11 && ! is_debian12 ) ; then
      echo "Error: The Debian version ($(os_version)) is not supported. Please use a compatible Debian version."
      exit 1
  elif is_ubuntu && ( ! is_ubuntu18 && ! is_ubuntu20 && ! is_ubuntu22  ) ; then
      echo "Error: The Ubuntu version ($(os_version)) is not supported. Please use a compatible Ubuntu version."
      exit 1
  elif is_rocky && ( ! is_rocky8 && ! is_rocky9 ) ; then
      echo "Error: The Rocky Linux version ($(os_version)) is not supported. Please use a compatible Rocky Linux version."
      exit 1
  fi

  # Keep only "<major>.<minor digit>" from the first line that mentions a version.
  SPARK_VERSION="$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)"
  readonly SPARK_VERSION
  if version_lt "${SPARK_VERSION}" "2.4" || \
     version_ge "${SPARK_VERSION}" "4.0" ; then
    echo "Error: Your Spark version (${SPARK_VERSION}) is not supported. Please use a supported version."
    exit 1
  fi

  # Detect dataproc image version
  if (! test -v DATAPROC_IMAGE_VERSION || [[ -z "${DATAPROC_IMAGE_VERSION}" ]]) ; then
    if test -v DATAPROC_VERSION ; then
      DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}"
    else
      # When building custom-images, neither of the above variables
      # are defined and we need to make a reasonable guess
      if   version_lt "${SPARK_VERSION}" "2.5" ; then DATAPROC_IMAGE_VERSION="1.5"
      elif version_lt "${SPARK_VERSION}" "3.2" ; then DATAPROC_IMAGE_VERSION="2.0"
      elif version_lt "${SPARK_VERSION}" "3.4" ; then DATAPROC_IMAGE_VERSION="2.1"
      elif version_lt "${SPARK_VERSION}" "3.6" ; then
        # Prefer the value recorded in /etc/environment and fall back to 2.2.
        # The file is parsed rather than eval'ed: `eval "$(grep ...)"` returns
        # 0 even when grep matches nothing, so the old `|| ...="2.2"` fallback
        # never ran and the variable could be left unset.
        local env_image_version=""
        if [[ -f /etc/environment ]] ; then
          env_image_version="$(sed -n 's/^DATAPROC_IMAGE_VERSION=//p' /etc/environment \
            | head -n1 | tr -d "\"'")" || env_image_version=""
        fi
        DATAPROC_IMAGE_VERSION="${env_image_version:-2.2}"
      else DATAPROC_IMAGE_VERSION="2.3" ; fi # Default to latest known version
    fi
  fi
}

#
# Generate a repo file under /etc/apt/sources.list.d/ and refresh the index.
#
# Arguments:
#   $1 - repository name (used for the default keyring and list file names)
#   $2 - signing key URL (accepted for signature parity; not used here)
#   $3 - repo data: "http(s)://host/path/uri argument0 .. argumentN"
#   $4 - "yes" (default) to also write a deb-src line
#   $5 - keyring path   (default /usr/share/keyrings/<name>.gpg)
#   $6 - list file path (default /etc/apt/sources.list.d/<name>.list)
#
function apt_add_repo() {
  local -r name="$1"
  local -r data="$3"
  local -r with_src="${4:-yes}"
  local -r keyring="${5:-/usr/share/keyrings/${name}.gpg}"
  local -r list_file="${6:-/etc/apt/sources.list.d/${name}.list}"
  local -r entry="[signed-by=${keyring}] ${data}"

  # The list file is truncated and rewritten in one go.
  {
    echo "deb ${entry}"
    if [[ "${with_src}" == "yes" ]] ; then
      echo "deb-src ${entry}"
    fi
  } > "${list_file}"

  apt-get update -qq
}

#
# Generate a repo file under /etc/yum.repos.d/ by downloading a .repo file.
#
# Arguments:
#   $1 - repository name (used for the default file names)
#   $3 - URL of the .repo file: "http(s)://host/path/filename.repo"
#   $5 - keyring path (default /etc/pki/rpm-gpg/<name>.gpg; not used here)
#   $6 - destination  (default /etc/yum.repos.d/<name>.repo)
# $2 and $4 are accepted so the signature lines up with apt_add_repo.
# Globals read: curl_retry_args
#
function dnf_add_repo() {
  local -r name="$1"
  local -r url="$3"
  local -r keyring="${5:-/etc/pki/rpm-gpg/${name}.gpg}"
  local -r dest="${6:-/etc/yum.repos.d/${name}.repo}"

  # curl_retry_args is left unquoted on purpose: it carries several options
  # in one string and must be word-split.
  curl ${curl_retry_args} "${url}" | dd of="${dest}" status=progress
}

#
# Import a repository signing key and register the repository with the
# package manager of the running distribution.
#
# Arguments:
#   $1 - repository name
#   $2 - signing key URL
#   $3 - repo data, passed through to apt_add_repo / dnf_add_repo
#   $4 - passed through as the helpers' 4th argument (default "yes")
#   $5 - keyring path; defaults to
#          /usr/share/keyrings/${repo_name}.gpg (debian/ubuntu) or
#          /etc/pki/rpm-gpg/${repo_name}.gpg    (rocky/RHEL)
#   $6 - repo file path (empty lets the helper pick its default)
# Globals read: curl_retry_args
#
function os_add_repo() {
  local -r repo_name="$1"
  local -r signing_key_url="$2"
  local -r repo_data="$3"
  local keyring_default add_repo

  # Pick the distro-specific keyring location and helper together.
  if is_debuntu ; then
    keyring_default="/usr/share/keyrings/${repo_name}.gpg"
    add_repo="apt_add_repo"
  else
    keyring_default="/etc/pki/rpm-gpg/${repo_name}.gpg"
    add_repo="dnf_add_repo"
  fi
  local -r kr_path="${5:-${keyring_default}}"

  mkdir -p "$(dirname "${kr_path}")"

  # curl_retry_args is intentionally unquoted so its options are word-split.
  curl ${curl_retry_args} "${signing_key_url}" \
    | gpg --import --no-default-keyring --keyring "${kr_path}"

  "${add_repo}" "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}"
}


# Short OS tag: the os_id output followed by the os_version output cut off
# after its first run of digits (presumably something like "debian12" --
# confirm against os_id/os_version, which are defined outside this view).
# NOTE(review): `readonly var="$(...)"` hides a failing os_id/os_version from
# `set -e`.
readonly _shortname="$(os_id)$(os_version|perl -pe 's/(\d+).*/$1/')"

#
# Download the RAPIDS Accelerator and XGBoost GPU jars into Spark's jar
# directory.  Does nothing unless RAPIDS_RUNTIME is "SPARK".
#
# Globals read:    RAPIDS_RUNTIME, DATAPROC_IMAGE_VERSION, pkg_bucket
# Globals written: SPARK_RAPIDS_VERSION, XGBOOST_VERSION (both readonly)
# Metadata attributes 'spark-rapids-version' and 'xgboost-version' override
# the built-in default versions.
#
function install_spark_rapids() {
  if [[ "${RAPIDS_RUNTIME}" != "SPARK" ]]; then return ; fi

  # https://mvnrepository.com/artifact/ml.dmlc/xgboost4j-spark-gpu
  local -r scala_ver="2.12"
  local DEFAULT_XGBOOST_VERSION="1.7.6" # 2.1.3

  # Default RAPIDS release depends on the image line.
  local DEFAULT_SPARK_RAPIDS_VERSION="24.08.1"
  if version_ge "${DATAPROC_IMAGE_VERSION}" "2.2" ; then
    DEFAULT_SPARK_RAPIDS_VERSION="25.02.1"
  fi
  if [[ "${DATAPROC_IMAGE_VERSION}" == "2.0" ]] ; then
    DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3
  fi

  readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' "${DEFAULT_SPARK_RAPIDS_VERSION}")
  readonly XGBOOST_VERSION=$(get_metadata_attribute 'xgboost-version' "${DEFAULT_XGBOOST_VERSION}")

  local -r nvidia_repo_url='https://repo1.maven.org/maven2/com/nvidia'
  local -r dmlc_repo_url='https://repo.maven.apache.org/maven2/ml/dmlc'

  local spark_jars_dir="/usr/lib/spark/jars"
  mkdir -p "${spark_jars_dir}"

  # One row per jar: artifact name, version, upstream Maven repository.
  local -a artifacts=( "xgboost4j-spark-gpu" "xgboost4j-gpu" "rapids-4-spark" )
  local -a versions=( "${XGBOOST_VERSION}" "${XGBOOST_VERSION}" "${SPARK_RAPIDS_VERSION}" )
  local -a repos=( "${dmlc_repo_url}" "${dmlc_repo_url}" "${nvidia_repo_url}" )

  local idx coordinate jar_basename
  for idx in 0 1 2 ; do
    coordinate="${artifacts[idx]}_${scala_ver}/${versions[idx]}"
    jar_basename="${artifacts[idx]}_${scala_ver}-${versions[idx]}.jar"
    # upstream URL, cache object under pkg_bucket, local destination
    cache_fetched_package "${repos[idx]}/${coordinate}/${jar_basename}" \
                          "${pkg_bucket}/${coordinate}/${jar_basename}" \
                          "${spark_jars_dir}/${jar_basename}"
  done
}

# --- Script Entry Point ---
# The two calls run in this order; both functions are defined earlier in the
# script.
prepare_to_install # Preparation steps run before the main logic
main               # Main logic
====== Filename: ./examples/secure-boot/cjac.sh ======
#!/bin/bash
#
# Example driver for a secure-boot custom image build: selects the project,
# obtains the signing key pair via create-key-pair.sh, grants a service account
# access to the resulting secrets, and then invokes generate_custom_image.py.
# The script paths are relative, so it is presumably meant to be run from the
# repository root -- confirm.
#

# Abort on the first failing command and trace every command.
set -e
set -x

# Site-specific settings; the values below are one person's environment.
PROJECT_ID=cjac-2021-00
CLUSTER_NAME="cluster-1718310842"
my_bucket="kerberos-bucket-000"
custom_image_zone="us-west4-a"
disk_size_gb="50" # greater than or equal to 30

# Service account identity derived from the cluster name and project.
SA_NAME="sa-${CLUSTER_NAME}"
GSA="${SA_NAME}@${PROJECT_ID}.iam.gserviceaccount.com"

gcloud config set project "${PROJECT_ID}"

gcloud auth login

# Move any existing tls/ directory aside (timestamped) before asking
# create-key-pair.sh for the key pair.
if [[ -d tls ]] ; then mv tls "tls-$(date +%s)" ; fi
# create-key-pair.sh prints name=value lines on stdout (modulus_md5sum,
# private_secret_name, public_secret_name, secret_project, secret_version);
# eval turns them into shell variables.
# NOTE(review): a failure of create-key-pair.sh is not caught here -- eval of
# empty output succeeds and the variables below expand to empty strings.
eval "$(bash examples/secure-boot/create-key-pair.sh)"

# Comma-separated key=value list handed to --metadata further down.
metadata="public_secret_name=${public_secret_name}"
metadata="${metadata},private_secret_name=${private_secret_name}"
metadata="${metadata},secret_project=${secret_project}"
metadata="${metadata},secret_version=${secret_version}"

# Grant the service account access to list secrets for the project
gcloud projects add-iam-policy-binding "${PROJECT_ID}" \
  --member="serviceAccount:${GSA}" \
  --role="roles/secretmanager.viewer"

# grant service account permission to access the private secret
gcloud secrets add-iam-policy-binding "${private_secret_name}" \
    --member="serviceAccount:${GSA}" \
    --role="roles/secretmanager.secretAccessor"

# grant service account permission to access the public secret
gcloud secrets add-iam-policy-binding "${public_secret_name}" \
    --member="serviceAccount:${GSA}" \
    --role="roles/secretmanager.secretAccessor"

dataproc_version=2.2-debian12
# Write a two-line no-op customization script, useful for exercising the image
# build on its own.  printf is required here: bash's plain `echo` does not
# expand "\n", which left the shebang and the command on one line.
printf '#!/bin/bash\necho no op\n' > empty.sh
#customization_script=empty.sh
customization_script="examples/secure-boot/install_gpu_driver.sh"
# The first "." of the version becomes "-", followed by a minute-resolution
# timestamp.
image_name="cuda-12-4-${dataproc_version/\./-}-$(date +%F-%H-%M)"

# Build the image.  Inputs come from the settings and key-pair output above:
#   --trusted-cert   the DER certificate under tls/
#   --metadata       the secret names/project/version assembled earlier
#   --gcs-bucket     my_bucket
# NOTE(review): the exact meaning of each flag is defined by
# generate_custom_image.py, which is not part of this file.
python generate_custom_image.py \
    --accelerator "type=nvidia-tesla-t4" \
    --image-name "${image_name}" \
    --dataproc-version "${dataproc_version}" \
    --trusted-cert "tls/db.der" \
    --customization-script "${customization_script}" \
    --service-account "${GSA}" \
    --metadata "${metadata}" \
    --zone "${custom_image_zone}" \
    --disk-size "${disk_size_gb}" \
    --no-smoke-test \
    --gcs-bucket "${my_bucket}" \
    --shutdown-instance-timer-sec=30

# Stop command tracing.
set +x
====== Filename: ./examples/secure-boot/install-nvidia-driver-debian12.sh ======
#!/bin/bash
#
# Installs DKMS, the NVIDIA open kernel driver packages and the CUDA toolkit,
# with DKMS pointed at a signing key pair fetched from Secret Manager.
#
# -x trace commands, -e stop on first error, -u treat unset variables as errors.
# NOTE(review): pipefail is not enabled, so only the last stage of each
# pipeline below decides success.
set -xeu

# Work from a dedicated directory; $_ expands to the last argument of the
# previous command, i.e. the directory just created.
mkdir -p /opt/install-nvidia-driver
cd $_

# Versions used to build the CUDA runfile name further down.
nv_driver_ver="550.54.14"
nv_cuda_ver="12.4.0"

# read secret name, project, version
sig_pub_secret_name="$(/usr/share/google/get_metadata_value attributes/public_secret_name)"
sig_priv_secret_name="$(/usr/share/google/get_metadata_value attributes/private_secret_name)"
sig_secret_project="$(/usr/share/google/get_metadata_value attributes/secret_project)"
sig_secret_version="$(/usr/share/google/get_metadata_value attributes/secret_version)"

# NOTE(review): this value is declared but never compared against anything in
# this script -- confirm whether a modulus check was intended.
readonly expected_modulus_md5sum="bd40cf5905c7bba4225d330136fdbfd3"

# Scratch directory for key material.  `mktemp -d` creates the directory
# itself, atomically and with owner-only permissions.  The former
# `mktemp -u` + `mkdir -p` only reserved a name (documented as unsafe) and
# then created the directory with the default umask.  The parent must exist
# first.  -t is dropped so that a TMPDIR setting cannot redirect the directory
# away from /run/tmp.
mkdir -p /run/tmp
ca_tmpdir="$(mktemp -d -p /run/tmp ca_dir-XXXX)"

# The Microsoft Corporation UEFI CA 2011
ms_uefi_ca="${ca_tmpdir}/MicCorUEFCA2011_2011-06-27.crt"
if [[ ! -f "${ms_uefi_ca}" ]]; then
  curl -L -o "${ms_uefi_ca}" "https://go.microsoft.com/fwlink/p/?linkid=321194"
fi

# Write private material to volatile storage
gcloud secrets versions access "${sig_secret_version}" \
       --project="${sig_secret_project}" \
       --secret="${sig_priv_secret_name}" \
    | dd of="${ca_tmpdir}/db.rsa"

# The certificate is stored base64-encoded in the secret; decode it to DER.
readonly cacert_der="${ca_tmpdir}/db.der"
gcloud secrets versions access "${sig_secret_version}" \
       --project="${sig_secret_project}" \
       --secret="${sig_pub_secret_name}" \
    | base64 --decode \
    | dd of="${cacert_der}"

# Without pipefail a failed gcloud call above still "succeeds" because dd is
# the last pipeline stage; refuse to continue with empty key material.
if [[ ! -s "${ca_tmpdir}/db.rsa" || ! -s "${cacert_der}" ]]; then
  echo "ERROR: could not fetch signing key material from Secret Manager" >&2
  exit 1
fi

# Report the secure boot state.
mokutil --sb-state

# configure the nvidia-container-toolkit package source
# https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
  | gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg

# Fetch the upstream list file and pin every "deb https://" entry to the
# keyring written above.
# NOTE(review): this curl lacks -f, so an HTTP error page would be written
# into the list file.
curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \
  | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \
  | tee /etc/apt/sources.list.d/nvidia-container-toolkit.list

# enable non-free and non-free-firmware components, update cache
DEBIAN_SOURCES="/etc/apt/sources.list.d/debian.sources"
COMPONENTS="main contrib non-free non-free-firmware"
# Rewrites every "Components:" line of the deb822 sources file.
sed -i -e "s/Components: .*$/Components: ${COMPONENTS}/" ${DEBIAN_SOURCES}
apt-get -qq update

# install DKMS
apt-get --no-install-recommends -qq -y install dkms

# Prepare DKMS to use the certificates retrieved from cloud secrets
# (the private key is symlinked, so it stays in the scratch directory; the
# public certificate is copied).
ln -sf "${ca_tmpdir}/db.rsa" /var/lib/dkms/mok.key
cp "${ca_tmpdir}/db.der" /var/lib/dkms/mok.pub

# install dkms and nvidia support packages
# (dkms is listed again although it was installed just above)
apt-get --no-install-recommends -qq -y install \
     dkms \
     "linux-headers-$(uname -r)" \
     nvidia-container-toolkit \
     nvidia-open-kernel-support \
     nvidia-smi \
     libglvnd0 \
     libcuda1

# install the driver itself
apt-get --no-install-recommends -qq -y install \
     nvidia-open-kernel-dkms

apt-get clean
apt-get autoremove -y

# Install CUDA
# The runfile name combines the CUDA and driver versions set at the top.
cuda_runfile="cuda_${nv_cuda_ver}_${nv_driver_ver}_linux.run"
curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
     "https://developer.download.nvidia.com/compute/cuda/${nv_cuda_ver}/local_installers/${cuda_runfile}" \
     -o cuda.run
bash ./cuda.run --silent --toolkit --no-opengl-libs
rm cuda.run
====== Filename: ./examples/secure-boot/dask.screenrc ======
#
# GNU screen command file.  Each `screen` line below opens one window that
# runs examples/secure-boot/pre-init.sh under `bash -x`, passing an image
# version and examples/secure-boot/dask.sh as arguments.
#   -L          log the window's output
#   -t <title>  window title; the number that follows is the window index
# NOTE(review): the script paths are relative, so screen is presumably started
# from the repository root -- confirm.
#
# For debugging, uncomment the following line
#

# screen -L -t monitor 0 /bin/bash

# Debian images
screen -L -t 2.2-debian12 1 /bin/bash -x examples/secure-boot/pre-init.sh 2.2-debian12 examples/secure-boot/dask.sh
screen -L -t 2.1-debian11 2 /bin/bash -x examples/secure-boot/pre-init.sh 2.1-debian11 examples/secure-boot/dask.sh
screen -L -t 2.0-debian10 3 /bin/bash -x examples/secure-boot/pre-init.sh 2.0-debian10 examples/secure-boot/dask.sh

# Ubuntu images
screen -L -t 2.2-ubuntu22 4 /bin/bash -x examples/secure-boot/pre-init.sh 2.2-ubuntu22 examples/secure-boot/dask.sh
screen -L -t 2.1-ubuntu20 5 /bin/bash -x examples/secure-boot/pre-init.sh 2.1-ubuntu20 examples/secure-boot/dask.sh
screen -L -t 2.0-ubuntu18 6 /bin/bash -x examples/secure-boot/pre-init.sh 2.0-ubuntu18 examples/secure-boot/dask.sh

# Rocky Linux images
screen -L -t 2.2-rocky9   7 /bin/bash -x examples/secure-boot/pre-init.sh 2.2-rocky9   examples/secure-boot/dask.sh
screen -L -t 2.1-rocky8   8 /bin/bash -x examples/secure-boot/pre-init.sh 2.1-rocky8   examples/secure-boot/dask.sh
screen -L -t 2.0-rocky8   9 /bin/bash -x examples/secure-boot/pre-init.sh 2.0-rocky8   examples/secure-boot/dask.sh
====== Filename: ./examples/secure-boot/create-key-pair.sh ======
#!/bin/bash
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This script creates a key pair and publishes to cloud secrets or
# fetches an already published key pair from cloud secrets

set -e

# https://github.com/glevand/secure-boot-utils

# https://cloud.google.com/compute/shielded-vm/docs/creating-shielded-images#adding-shielded-image

# https://cloud.google.com/compute/shielded-vm/docs/creating-shielded-images#generating-security-keys-certificates

# https://wiki.archlinux.org/title/Unified_Extensible_Firmware_Interface/Secure_Boot#Creating_keys

# Suffix embedded in the secret names and in the certificate CN; a new value
# yields a new pair of secret names.
ITERATION=042

# The secrets are created in / read from the active gcloud project.
CURRENT_PROJECT_ID="$(gcloud config get project)"
if [[ -z "${CURRENT_PROJECT_ID}" ]]; then
    echo 'project is not set.  please set with `gcloud config set project ${PROJECT_ID}`' >&2
    # Exit statuses are limited to 0-255; `exit -1` is out of range.
    exit 1
fi
PROJECT_ID="${CURRENT_PROJECT_ID}"

#
# Make sure the signing key pair for an EFI variable exists locally under tls/
# and in Secret Manager.  Three cases, checked in order:
#   1. no local private key but the public-key secret exists: download both
#      secrets into tls/ and derive the PEM certificate;
#   2. a local private key exists: reuse it (modulus hash is read from
#      tls/modulus-md5sum.txt);
#   3. otherwise: generate a new key and self-signed certificate and publish
#      both as new secrets.
#
# Arguments:
#   $1 - EFI variable name, used in file and secret names (e.g. "db")
#   $2 - value for the certificate's CN field
# Globals read:    ITERATION, PROJECT_ID
# Globals written: CA_KEY_SECRET_NAME, CA_CERT_SECRET_NAME, modulus_md5sum
#                  (the caller prints these after the function returns)
# All progress messages go to stderr.
#
function create_key () {
    local EFI_VAR_NAME="$1"
    local CN_VAL="$2"
    local PRIVATE_KEY="tls/${EFI_VAR_NAME}.rsa"
    local CACERT="tls/${EFI_VAR_NAME}.pem"
    local CACERT_DER="tls/${EFI_VAR_NAME}.der"
    local MS_UEFI_CA
    CA_KEY_SECRET_NAME="efi-${EFI_VAR_NAME}-priv-key-${ITERATION}"
    CA_CERT_SECRET_NAME="efi-${EFI_VAR_NAME}-pub-key-${ITERATION}"
    # If the secrets exist in secret manager, populate the tls/ directory
    if [[ ! -f "${PRIVATE_KEY}" ]] && gcloud secrets describe "${CA_CERT_SECRET_NAME}" > /dev/null ; then
      mkdir -p tls

      gcloud secrets versions access "1" \
        --project="${PROJECT_ID}" \
        --secret="${CA_KEY_SECRET_NAME}" \
        | dd of="${PRIVATE_KEY}" status=none

      # The public certificate is stored base64-encoded (see the upload below).
      gcloud secrets versions access "1" \
        --project="${PROJECT_ID}" \
        --secret="${CA_CERT_SECRET_NAME}" \
        | base64 --decode \
        | dd of="${CACERT_DER}" status=none

      # Create a PEM-format version of the cert
      openssl x509 \
        -inform DER \
        -in "${CACERT_DER}" \
        -outform PEM \
        -out "${CACERT}"

      MS_UEFI_CA="tls/MicCorUEFCA2011_2011-06-27.crt"
      curl -s -L -o "${MS_UEFI_CA}" 'https://go.microsoft.com/fwlink/p/?linkid=321194'

      echo "${CA_KEY_SECRET_NAME}" > tls/private-key-secret-name.txt
      echo "${CA_CERT_SECRET_NAME}" > tls/public-key-secret-name.txt
      # tee records the hash on disk while the substitution captures it.
      modulus_md5sum="$(openssl rsa -noout -modulus -in "${PRIVATE_KEY}" | openssl md5 | awk '{print $2}' | tee tls/modulus-md5sum.txt)"
      return
    fi

    if [[ -f "${PRIVATE_KEY}" ]]; then
        echo "key already exists.  Skipping generation." >&2
        modulus_md5sum="$(cat tls/modulus-md5sum.txt)"
        return
    fi
    mkdir -p tls

    echo "generating '${CN_VAL}' '${CACERT}', '${CACERT_DER}' and '${PRIVATE_KEY}'" >&2
    # Generate new x.509 key and cert
    openssl req \
            -newkey rsa:3072 \
            -nodes \
            -keyout "${PRIVATE_KEY}" \
            -new \
            -x509 \
            -sha256 \
            -days 3650 \
            -subj "/CN=${CN_VAL}/" \
            -out "${CACERT}"

    # Create a DER-format version of the cert
    openssl x509 \
            -outform DER \
            -in "${CACERT}" \
            -out "${CACERT_DER}"

    # Create a new secret containing private key
    gcloud secrets create "${CA_KEY_SECRET_NAME}" \
           --project="${PROJECT_ID}" \
           --replication-policy="automatic" \
           --data-file="${PRIVATE_KEY}"

    echo "Private key secret name: '${CA_KEY_SECRET_NAME}'" >&2
    echo "${CA_KEY_SECRET_NAME}" > tls/private-key-secret-name.txt

    # Create a new secret containing public key (base64 text of the DER cert)
    base64 < "${CACERT_DER}" > "${CACERT_DER}.base64"
    gcloud secrets create "${CA_CERT_SECRET_NAME}" \
           --project="${PROJECT_ID}" \
           --replication-policy="automatic" \
           --data-file="${CACERT_DER}.base64"

    modulus_md5sum="$(openssl x509 -noout -modulus -in "${CACERT}" | openssl md5 | awk '{print $2}')"
    echo "modulus-md5sum: ${modulus_md5sum}" >&2
    echo "${modulus_md5sum}" > tls/modulus-md5sum.txt
    echo "Public key secret name: '${CA_CERT_SECRET_NAME}'" >&2
    echo "${CA_CERT_SECRET_NAME}" > tls/public-key-secret-name.txt

}

# Produce (or fetch) the "db" key pair, then report the result on stdout as
# name=value lines, one per line.
EFI_VAR_NAME=db

create_key "${EFI_VAR_NAME}" "Cloud Dataproc Custom Image CA ${ITERATION}"

cat <<EOF
modulus_md5sum=${modulus_md5sum}
private_secret_name=${CA_KEY_SECRET_NAME}
public_secret_name=${CA_CERT_SECRET_NAME}
secret_project=${PROJECT_ID}
secret_version=1
EOF
====== Filename: ./examples/secure-boot/pre-init.screenrc ======
#
# GNU screen command file.  Each active `screen` line opens one window that
# runs examples/secure-boot/pre-init.sh under `bash -x` for one image version.
#   -L          log the window's output
#   -t <title>  window title; the number that follows is the window index
# Lines starting with "#screen" are variants that are currently switched off.
#
# For debugging, uncomment the following line
#

# screen -L -t monitor 0 /bin/bash

#screen -L -t 1.5-debian10     1 /bin/bash -x examples/secure-boot/pre-init.sh 1.5-debian10

# 2.0 images
screen -L -t 2.0-debian10     2  /bin/bash -x examples/secure-boot/pre-init.sh 2.0-debian10
screen -L -t 2.0-rocky8       3  /bin/bash -x examples/secure-boot/pre-init.sh 2.0-rocky8
screen -L -t 2.0-ubuntu18     4  /bin/bash -x examples/secure-boot/pre-init.sh 2.0-ubuntu18

# 2.1 images
screen -L -t 2.1-debian11     5  /bin/bash -x examples/secure-boot/pre-init.sh 2.1-debian11
screen -L -t 2.1-rocky8       6  /bin/bash -x examples/secure-boot/pre-init.sh 2.1-rocky8
screen -L -t 2.1-ubuntu20     7  /bin/bash -x examples/secure-boot/pre-init.sh 2.1-ubuntu20
#screen -L -t 2.1-ubuntu20-arm 11 /bin/bash -x examples/secure-boot/pre-init.sh 2.1-ubuntu20-arm

# 2.2 images
screen -L -t 2.2-debian12     8  /bin/bash -x examples/secure-boot/pre-init.sh 2.2-debian12
screen -L -t 2.2-rocky9       9  /bin/bash -x examples/secure-boot/pre-init.sh 2.2-rocky9
screen -L -t 2.2-ubuntu22     10 /bin/bash -x examples/secure-boot/pre-init.sh 2.2-ubuntu22

# 2.3 images (window 11 is taken by the disabled 2.1-ubuntu20-arm entry above)
screen -L -t 2.3-debian12     12 /bin/bash -x examples/secure-boot/pre-init.sh 2.3-debian12
screen -L -t 2.3-rocky9       13 /bin/bash -x examples/secure-boot/pre-init.sh 2.3-rocky9
screen -L -t 2.3-ubuntu22     14 /bin/bash -x examples/secure-boot/pre-init.sh 2.3-ubuntu22
#screen -L -t 2.3-ml-ubuntu22  15 /bin/bash -x examples/secure-boot/pre-init.sh 2.3-ml-ubuntu22
====== Filename: ./examples/secure-boot/env.json.sample ======
{
  "PROJECT_ID":"example-yyyy-nn",
  "PURPOSE":"cuda-pre-init",
  "BUCKET":"my-bucket-name",
  "IMAGE_VERSION":"2.2-debian12",
  "ZONE":"us-west4-a"
}
====== Filename: ./examples/secure-boot/cloud-sql-proxy.sh ======
#!/bin/bash

# Copyright 2016 Google LLC and contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This init script installs a cloud-sql-proxy on each node in the cluster, and
# uses that proxy to expose TCP proxies of one or more CloudSQL instances.
# One of these instances is used for the clusters Hive Metastore.

# Do not use "set -x" to avoid printing passwords in clear in the logs
set -euo pipefail

# Accessors for single fields of /etc/os-release.  Each runs in a subshell
# with tracing off, prints the field value with surrounding quotes removed
# (via xargs), and returns the status of its pipeline.

# ID field, e.g. debian / ubuntu / rocky.
function os_id() (
  set +x
  grep '^ID=' /etc/os-release \
    | cut -d= -f2 \
    | xargs
)

# VERSION_ID field.
function os_version() (
  set +x
  grep '^VERSION_ID=' /etc/os-release \
    | cut -d= -f2 \
    | xargs
)

# VERSION_CODENAME field (prints nothing when the file has no such line).
function os_codename() (
  set +x
  grep '^VERSION_CODENAME=' /etc/os-release \
    | cut -d= -f2 \
    | xargs
)

# Version comparison helpers built on `sort -V` (version sort).
# Each takes two version strings and returns 0 when "$1 <op> $2" holds.
# printf is used instead of `echo -e` so that the arguments are compared
# verbatim (echo -e would expand backslash escapes inside them), and the
# strict variants use a plain `if` rather than the `A && return 1 || B` idiom.

# $1 >= $2
function version_ge() {
  [[ "$1" == "$(printf '%s\n%s\n' "$1" "$2" | sort -V | tail -n1)" ]]
}

# $1 > $2
function version_gt() {
  if [[ "$1" == "$2" ]]; then return 1; fi
  version_ge "$1" "$2"
}

# $1 <= $2
function version_le() {
  [[ "$1" == "$(printf '%s\n%s\n' "$1" "$2" | sort -V | head -n1)" ]]
}

# $1 < $2
function version_lt() {
  if [[ "$1" == "$2" ]]; then return 1; fi
  version_le "$1" "$2"
}

# Releases recognised per distro ID (space-separated).  Every entry below gets
# its own is_/ge_/le_ helper function in the loop that follows.
readonly -A supported_os=(
  ['debian']="10 11 12"
  ['rocky']="8 9"
  ['ubuntu']="18.04 20.04 22.04"
)

# dynamically define OS version test utility functions
# On Rocky, VERSION_ID is cut at the first non-digit so that only the major
# number remains (supported_os lists Rocky by major number only).
if [[ "$(os_id)" == "rocky" ]];
then _os_version=$(os_version | sed -e 's/[^0-9].*$//g')
else _os_version="$(os_version)"; fi
for os_id_val in 'rocky' 'ubuntu' 'debian' ; do
  # is_<distro>: $(os_id) is NOT escaped, so it is expanded right now and the
  # detected ID is baked into the generated function body as a literal.
  eval "function is_${os_id_val}() ( set +x ;  [[ \"$(os_id)\" == '${os_id_val}' ]] ; )"

  # ${osver%%.*} keeps the part before the first dot, so "18.04" produces
  # is_ubuntu18 / ge_ubuntu18 / le_ubuntu18.  ${_os_version} and ${osver} are
  # likewise expanded now and embedded as literals; version_ge / version_le
  # are called when the generated function runs.
  for osver in $(echo "${supported_os["${os_id_val}"]}") ; do
    eval "function is_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && [[ \"${_os_version}\" == \"${osver}\" ]] ; )"
    eval "function ge_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_ge \"${_os_version}\" \"${osver}\" ; )"
    eval "function le_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_le \"${_os_version}\" \"${osver}\" ; )"
  done
done

# Succeeds on Debian or Ubuntu (either is_debian or is_ubuntu returns 0).
# Runs in a subshell with tracing off.
function is_debuntu() (
  set +x
  is_debian || is_ubuntu
)

# Fetch a metadata server URL and print the response body.
#
# Arguments: $1 - full URL to request
# Outputs:   the body on stdout, only when curl succeeds with HTTP 200
# Returns:   curl's exit status
#
# `local readonly x` declares a variable literally named "readonly" rather
# than a read-only one, so plain `local` is used here.  http_code is also
# local so that it does not leak into the caller's scope.
function print_metadata_value() {
  local tmpfile http_code
  local return_code=0
  tmpfile="$(mktemp)" || return
  # The body goes to the temp file; -w makes curl print the status code.
  http_code="$(curl -f "${1}" -H "Metadata-Flavor: Google" -w "%{http_code}" \
    -s -o "${tmpfile}" 2>/dev/null)" || return_code=$?
  # If the command completed successfully, print the metadata value to stdout.
  if [[ ${return_code} == 0 && ${http_code} == 200 ]]; then
    cat "${tmpfile}"
  fi
  rm -f "${tmpfile}"
  return "${return_code}"
}

# Print the metadata value at a URL, returning print_metadata_value's status.
#
# Arguments: $1 - full URL to request
#
# The URL is passed on quoted (it used to be word-split), `local -r` replaces
# `local readonly`, and the status is captured with `||` so the function
# still reaches its own `return` when the shell runs with `set -e`.
function print_metadata_value_if_exists() {
  local -r url="$1"
  local return_code=0
  print_metadata_value "${url}" || return_code=$?
  return "${return_code}"
}

# Print a metadata value, looking at the instance first and the project second.
#
# Arguments: $1 - path below instance/ or project/ (e.g. attributes/<name>)
# Returns:   0 when either lookup succeeded, else the status of the project
#            lookup
# Runs in a subshell with tracing off.
#
# Statuses are captured with `||`: with a bare `cmd; rc=$?` under `set -e`
# the subshell would exit on the first failed lookup and never try the
# project.  `local -r` replaces `local readonly`.
function get_metadata_value() (
  set +x
  local -r varname="$1"
  local -r MDS_PREFIX=http://metadata.google.internal/computeMetadata/v1
  local return_code=0
  # Print the instance metadata value.
  print_metadata_value_if_exists "${MDS_PREFIX}/instance/${varname}" || return_code=$?
  # If the instance doesn't have the value, try the project.
  if [[ ${return_code} != 0 ]]; then
    return_code=0
    print_metadata_value_if_exists "${MDS_PREFIX}/project/${varname}" || return_code=$?
  fi

  return "${return_code}"
)

# Print the metadata attribute named $1, or $2 (default: empty) when the
# attribute cannot be read.  The fallback is written without a trailing
# newline.  Runs in a subshell with tracing off.
function get_metadata_attribute() (
  set +x
  local -r name="$1"
  local -r fallback="${2:-}"
  if ! get_metadata_value "attributes/${name}" ; then
    echo -n "${fallback}"
  fi
)

# Detect dataproc image version from its various names
# (DATAPROC_VERSION is adopted only when DATAPROC_IMAGE_VERSION is unset.)
if (! test -v DATAPROC_IMAGE_VERSION) && test -v DATAPROC_VERSION; then
  DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}"
fi

# Lower-cased distributor ID as reported by lsb_release.
# NOTE(review): `readonly VAR=$(...)` hides a failing lsb_release from set -e.
readonly OS_NAME=$(lsb_release -is | tr '[:upper:]' '[:lower:]')
# ID and VERSION_ID from /etc/os-release joined together; the file is sourced
# inside the command substitution's subshell only.
distribution=$(. /etc/os-release;echo $ID$VERSION_ID)

# Per-engine defaults, keyed by the upper-case instance type
# (MYSQL / POSTGRES / SQLSERVER).
declare -A DEFAULT_DB_PORT=(['MYSQL']='3306' ['POSTGRES']='5432' ['SQLSERVER']='1433')
declare -A DEFAULT_DB_ADMIN_USER=(['MYSQL']='root' ['POSTGRES']='postgres' ['SQLSERVER']='sqlserver')
declare -A DEFAULT_DB_PROTO=(['MYSQL']='mysql' ['POSTGRES']='postgresql' ['SQLSERVER']='sqlserver')
declare -A DEFAULT_DB_DRIVER=(['MYSQL']='com.mysql.jdbc.Driver' ['POSTGRES']='org.postgresql.Driver' ['SQLSERVER']='com.microsoft.sqlserver.jdbc.SQLServerDriver')

# Write a timestamped, host-tagged ERROR line built from all arguments to
# stderr and return 1.
function err() {
  printf '[%s] [%s]: ERROR: %s\n' \
    "$(date +'%Y-%m-%dT%H:%M:%S%z')" "$(hostname)" "$*" >&2
  return 1
}

# Write a timestamped, host-tagged INFO line built from all arguments to
# stderr.
function log() {
  printf '[%s] [%s]: INFO: %s\n' \
    "$(date +'%Y-%m-%dT%H:%M:%S%z')" "$(hostname)" "$*" >&2
}

# Metadata key (relative to the instance) that lists extra Cloud SQL instances.
readonly ADDITIONAL_INSTANCES_KEY='attributes/additional-cloud-sql-instances'
# Fixed paths for the proxy's directory, binary, systemd unit and logs.
readonly PROXY_DIR='/var/run/cloud_sql_proxy'
readonly PROXY_BIN='/usr/local/bin/cloud_sql_proxy'
readonly INIT_SCRIPT='/usr/lib/systemd/system/cloud-sql-proxy.service'
readonly PROXY_LOG_DIR='/var/log/cloud-sql-proxy'

# Each setting below is read from instance/project metadata; the `|| echo`
# part supplies the default when the lookup fails.  Assignment and `readonly`
# are separate statements so the lookup's status is not hidden.

# Whether to configure the Hive metastore to point to a Cloud SQL database.
# This is not required for Hive & Spark I/O.
ENABLE_CLOUD_SQL_METASTORE="$(/usr/share/google/get_metadata_value attributes/enable-cloud-sql-hive-metastore || echo 'true')"
readonly ENABLE_CLOUD_SQL_METASTORE

# Whether to enable the proxy on workers. This is not necessary for the
# Metastore, but is required for Hive & Spark I/O.
ENABLE_PROXY_ON_WORKERS="$(/usr/share/google/get_metadata_value attributes/enable-cloud-sql-proxy-on-workers || echo 'true')"
readonly ENABLE_PROXY_ON_WORKERS

# Whether to use the private IP address of the cloud sql instance.
USE_CLOUD_SQL_PRIVATE_IP="$(/usr/share/google/get_metadata_value attributes/use-cloud-sql-private-ip || echo 'false')"
readonly USE_CLOUD_SQL_PRIVATE_IP

# Cloud SQL instance backing the Hive metastore (empty when not provided).
METASTORE_INSTANCE="$(/usr/share/google/get_metadata_value attributes/hive-metastore-instance || echo '')"
readonly METASTORE_INSTANCE

# Additional Cloud SQL instances (empty when not provided).
ADDITIONAL_INSTANCES="$(/usr/share/google/get_metadata_value ${ADDITIONAL_INSTANCES_KEY} || echo '')"
readonly ADDITIONAL_INSTANCES

# Point apt sources that reference the oldoldstable "-backports" suite at
# archive.debian.org.  Returns immediately on non-Debian/Ubuntu systems.
#
# NOTE(review): debdists, oldoldstable, oldstable, stable, matched_files and
# filename are assigned without `local`, so they become globals; only
# oldoldstable is used inside this function -- check whether anything else in
# the script reads the others before tightening this.
function repair_old_backports {
  if ! is_debuntu ; then return ; fi
  # This script uses 'apt-get update' and is therefore potentially dependent on
  # backports repositories which have been archived.  In order to mitigate this
  # problem, we will use archive.debian.org for the oldoldstable repo

  # https://github.com/GoogleCloudDataproc/initialization-actions/issues/1157
  # Resolve the current codenames from the Release file of each suite alias.
  debdists="https://deb.debian.org/debian/dists"
  oldoldstable=$(curl -s "${debdists}/oldoldstable/Release" | awk '/^Codename/ {print $2}');
  oldstable=$(   curl -s "${debdists}/oldstable/Release"    | awk '/^Codename/ {print $2}');
  stable=$(      curl -s "${debdists}/stable/Release"       | awk '/^Codename/ {print $2}');

  # Files below /etc/apt/sources.list* that mention "-backports"
  # (case-insensitive); `||:` keeps an empty result from counting as failure.
  # The unquoted substitution splits the file list on whitespace.
  matched_files=( $(test -d /etc/apt && grep -rsil '\-backports' /etc/apt/sources.list*||:) )

  for filename in "${matched_files[@]}"; do
    # Fetch from archive.debian.org for ${oldoldstable}-backports
    # (in-place edit; only the host part of matching deb/deb-src lines changes)
    perl -pi -e "s{^(deb[^\s]*) https?://[^/]+/debian ${oldoldstable}-backports }
                  {\$1 https://archive.debian.org/debian ${oldoldstable}-backports }g" "${filename}"
  done
}

# Get metastore DB instance type, result be one of MYSQL, POSTGRES, SQLSERVER
#
# Arguments: $1 - one or more comma-separated instance specs of the form
#                 project:region:instance[=tcp:PORT]; only the first is used
# Outputs:   the upper-cased engine name on stdout, or an empty line when it
#            cannot be determined (the reason is logged)
function get_cloudsql_instance_type() {
  local instance
  instance="$(echo "$1" | cut -d "," -f 1)"
  # An instance spec may carry a "=tcp:PORT" suffix.  Strip it before splitting
  # on ":"; otherwise the port is mistaken for the instance name and
  # "project:region" for the project.
  instance="${instance%%=*}"
  local database=''
  if [[ -z "${instance}" ]]; then
    log 'cloudsql instance VM metadata not specified'
  elif ! [[ "${instance}" =~ .+:.+:.+ ]]; then
    log 'cloudsql instance not of form project:region:instance'
  else
    # Project is everything before the last two ":"-separated fields, the
    # instance name is the last field.
    local project="${instance%:*:*}"
    instance="${instance##*:}"
    # A failed lookup leaves database empty so that the message below is
    # reached even when errexit/pipefail are active.
    database="$(gcloud sql instances describe --project="${project}" "${instance}" | grep 'databaseVersion')" \
      || database=''
    if [[ -z "${database}" ]]; then
      log 'Unable to describe metastore_instance'
    else
      # Trim off version and whitespaces and use upper case
      # databaseVersion: MYSQL_8_0
      # databaseVersion: POSTGRES_12
      # databaseVersion: SQLSERVER_2019_STANDARD
      database=${database##*:}
      database=${database%%_*}
      database="${database#"${database%%[![:space:]]*}"}"
    fi
  fi
  echo "${database^^}"
}

# CLOUD SQL instance type is one of MYSQL, POSTGRES, SQLSERVER. If not specified
# try to infer it from METASTORE_INSTANCE, ADDITIONAL_INSTANCES, default to MYSQL
CLOUDSQL_INSTANCE_TYPE="$(/usr/share/google/get_metadata_value attributes/cloud-sql-instance-type || echo '')"
# Normalise to upper case so it can index the DEFAULT_DB_* tables.
CLOUDSQL_INSTANCE_TYPE=${CLOUDSQL_INSTANCE_TYPE^^}
if [[ -z "${CLOUDSQL_INSTANCE_TYPE}" ]]; then
  # Ask gcloud about the metastore instance first, then about the additional
  # instances (get_cloudsql_instance_type prints an empty line on failure).
  if [[ -n "${METASTORE_INSTANCE}" ]]; then
    CLOUDSQL_INSTANCE_TYPE=$(get_cloudsql_instance_type "${METASTORE_INSTANCE}")
  elif [[ -n "${ADDITIONAL_INSTANCES}" ]]; then
    CLOUDSQL_INSTANCE_TYPE=$(get_cloudsql_instance_type "${ADDITIONAL_INSTANCES}")
  fi
fi
# Still unknown: fall back to MYSQL.
if [[ -z "${CLOUDSQL_INSTANCE_TYPE}" ]]; then
  CLOUDSQL_INSTANCE_TYPE='MYSQL'
fi
readonly CLOUDSQL_INSTANCE_TYPE

# Local port for the metastore connection, in order of precedence:
#   1. the port embedded in METASTORE_INSTANCE ("...=tcp:PORT"),
#   2. the 'metastore-proxy-port' metadata attribute,
#   3. the default port for the detected database engine.
# Before this change the metadata value was read and then always overwritten
# by case 1 or case 3, so the attribute had no effect.
METASTORE_PROXY_PORT="$(/usr/share/google/get_metadata_value attributes/metastore-proxy-port || echo '')"
if [[ "${METASTORE_INSTANCE}" =~ =tcp:[0-9]+$ ]]; then
  METASTORE_PROXY_PORT="${METASTORE_INSTANCE##*:}"
elif [[ -z "${METASTORE_PROXY_PORT}" ]]; then
  METASTORE_PROXY_PORT=${DEFAULT_DB_PORT["${CLOUDSQL_INSTANCE_TYPE}"]}
fi
readonly METASTORE_PROXY_PORT

# Database user to use to access metastore.
DB_HIVE_USER="$(/usr/share/google/get_metadata_value attributes/db-hive-user || echo 'hive')"
readonly DB_HIVE_USER

# Admin user: the metadata value when given, otherwise the default for the
# detected database engine.
DB_ADMIN_USER="$(/usr/share/google/get_metadata_value attributes/db-admin-user || echo '')"
if [[ -z ${DB_ADMIN_USER} ]]; then
  DB_ADMIN_USER=${DEFAULT_DB_ADMIN_USER["${CLOUDSQL_INSTANCE_TYPE}"]}
fi
readonly DB_ADMIN_USER

# KMS key handed to `gcloud kms decrypt` for both password files below.
KMS_KEY_URI="$(/usr/share/google/get_metadata_value attributes/kms-key-uri || echo '')"
readonly KMS_KEY_URI

# Database admin user password used to create the metastore database and user.
DB_ADMIN_PASSWORD_URI="$(/usr/share/google/get_metadata_value attributes/db-admin-password-uri || echo '')"
readonly DB_ADMIN_PASSWORD_URI

DB_ADMIN_PASSWORD=''
if [[ -n "${DB_ADMIN_PASSWORD_URI}" ]]; then
  # Decrypt password
  # (ciphertext is streamed from the URI into gcloud; plaintext is captured
  # from gcloud's stdout)
  DB_ADMIN_PASSWORD="$(gsutil cat "${DB_ADMIN_PASSWORD_URI}" |
    gcloud kms decrypt \
      --ciphertext-file - \
      --plaintext-file - \
      --key "${KMS_KEY_URI}")"
fi
# Only a log line is emitted in this case; the script carries on.
if [[ "${CLOUDSQL_INSTANCE_TYPE}" == "POSTGRES" && -z "${DB_ADMIN_PASSWORD}" ]]; then
  log 'POSTGRES DB admin password is not set'
fi
readonly DB_ADMIN_PASSWORD

# Database password used to access metastore.
DB_HIVE_PASSWORD_URI="$(/usr/share/google/get_metadata_value attributes/db-hive-password-uri || echo '')"
readonly DB_HIVE_PASSWORD_URI
if [[ -n "${DB_HIVE_PASSWORD_URI}" ]]; then
  # Decrypt password
  DB_HIVE_PASSWORD="$(gsutil cat "${DB_HIVE_PASSWORD_URI}" |
    gcloud kms decrypt \
      --ciphertext-file - \
      --plaintext-file - \
      --key "${KMS_KEY_URI}")"
  readonly DB_HIVE_PASSWORD
else
  # No password URI: reuse the password already present in hive-site.xml.
  # The literal "None" (presumably what bdconfig prints for a missing
  # property -- confirm) is replaced by a fixed default.
  db_hive_pwd=$(bdconfig get_property_value \
    --configuration_file "/etc/hive/conf/hive-site.xml" \
    --name "javax.jdo.option.ConnectionPassword" 2>/dev/null)
  if [[ "${db_hive_pwd}" == "None" ]]; then
    db_hive_pwd="hive-password"
  fi
  readonly DB_HIVE_PASSWORD=${db_hive_pwd}
fi

# Name of MySQL database to use for the metastore.
# Will be created if it doesn't exist.
METASTORE_DB="$(/usr/share/google/get_metadata_value attributes/hive-metastore-db || echo 'hive_metastore')"
readonly METASTORE_DB

# Dataproc master nodes information
# NOTE(review): unlike the settings above this has no `|| echo` default, and
# `readonly VAR=$(...)` hides a failed lookup from set -e.
readonly DATAPROC_MASTER=$(/usr/share/google/get_metadata_value attributes/dataproc-master)

# Print the value of a key from a Java .properties style file.
#
# Arguments: $1 - properties file, $2 - property name
# Outputs:   the value of the last "name=" line, with backslash escapes in
#            front of # ! = : removed; an empty line when the key is absent
function get_java_property() {
  local props_path=$1
  local key=$2
  local value
  value=$(
    grep "^${key}=" "${props_path}" \
      | tail -n 1 \
      | cut -d '=' -f 2- \
      | sed -r 's/\\([#!=:])/\1/g'
  )
  echo "${value}"
}

# Print a property from /etc/google-dataproc/dataproc.properties.
#
# Arguments: $1 - property name
# Returns:   1 without output when the properties file does not exist
function get_dataproc_property() {
  local -r props=/etc/google-dataproc/dataproc.properties
  local key=$1
  local value
  if [[ ! -f "${props}" ]]; then
    return 1
  fi
  value=$(get_java_property "${props}" "${key}")
  echo "${value}"
}

# Succeeds when $1 occurs anywhere in the dataproc.components.activate
# property (substring match; $1 is used as a glob pattern).
function is_component_selected() {
  local wanted=$1

  local active
  active=$(get_dataproc_property dataproc.components.activate)

  case "${active}" in
    *${wanted}*) return 0 ;;
  esac
  return 1
}

# 'true' when the kerberos component is among the activated components,
# otherwise 'false'.  The `&& ... || ...` chain always prints one of the two
# words, so the assignment itself cannot fail under set -e.
KERBEROS_ENABLED=$(is_component_selected 'kerberos' && echo 'true' || echo 'false')
readonly KERBEROS_ENABLED

# Print the Hive Kerberos principal for this host:
# hive/<fully qualified hostname>@<DNS domain name in upper case>.
function get_hive_principal() {
  local fqdn realm
  # Hostname is fully qualified
  fqdn=$(hostname -f)
  # Realm is uppercase domain name
  realm=$(dnsdomainname)
  echo "hive/${fqdn}@${realm^^}"
}

# Prints the JDBC URI of the local HiveServer2. When KERBEROS_ENABLED is
# 'true' the Hive principal is appended as a ";principal=" parameter.
function get_hiveserver_uri() {
  local uri="jdbc:hive2://localhost:10000"
  if [[ "${KERBEROS_ENABLED}" == 'true' ]]; then
    uri+="/;principal=$(get_hive_principal)"
  fi
  echo "${uri}"
}

# Helper to run any command with Fibonacci backoff.
# The command is attempted once per backoff entry, sleeping after every
# failure, and then one final time.
# If all retries fail, returns last attempt's exit code.
# Args: "$@" is the command to run.
function run_with_retries() {
  local retry_backoff=(1 1 2 3 5 8 13 21 34 55 89 144)
  local -a cmd=("$@")
  # Keep the loop counter and sleep duration private so that callers (or the
  # retried command itself) using a variable named "i" are not clobbered.
  local i
  local sleep_time
  log "About to run '${cmd[*]}' with retries..."

  for ((i = 0; i < ${#retry_backoff[@]}; i++)); do
    if "${cmd[@]}"; then
      return 0
    fi
    sleep_time=${retry_backoff[$i]}
    log "'${cmd[*]}' attempt $((i + 1)) failed! Sleeping ${sleep_time}."
    sleep "${sleep_time}"
  done

  log "Final attempt of '${cmd[*]}'..."
  # Let any final error propagate all the way out to any error traps.
  "${cmd[@]}"
}

# Prints METASTORE_INSTANCE, appending "=tcp:<METASTORE_PROXY_PORT>" unless
# the value already ends with an explicit "=tcp:<port>" suffix.
function get_metastore_instance() {
  local instance="${METASTORE_INSTANCE}"
  [[ "${instance}" =~ =tcp:[0-9]+$ ]] ||
    instance+="=tcp:${METASTORE_PROXY_PORT}"
  echo "${instance}"
}

# Prints the instance-related flags for the Cloud SQL proxy command line.
# Each flag is emitted with a leading space; the result is empty when no
# option applies.
# Globals: USE_CLOUD_SQL_PRIVATE_IP, ENABLE_CLOUD_SQL_METASTORE,
#          ADDITIONAL_INSTANCES, ADDITIONAL_INSTANCES_KEY (all read)
function get_proxy_flags() {
  local flags=''

  # If a Cloud SQL instance has both public and private IP, use private IP.
  if [[ ${USE_CLOUD_SQL_PRIVATE_IP} == "true" ]]; then
    flags="${flags} --ip_address_types=PRIVATE"
  fi

  if [[ ${ENABLE_CLOUD_SQL_METASTORE} == "true" ]]; then
    local instance
    instance=$(get_metastore_instance)
    flags="${flags} -instances=${instance}"
  fi

  # Pass additional instances straight to the proxy.
  if [[ -n "${ADDITIONAL_INSTANCES}" ]]; then
    flags="${flags} -instances_metadata=instance/${ADDITIONAL_INSTANCES_KEY}"
  fi

  echo "${flags}"
}

# Downloads the Cloud SQL proxy binary, installs it as a systemd unit and,
# when the Cloud SQL metastore is enabled, points hive-site.xml at the proxy.
# Globals (read): PROXY_BIN, PROXY_DIR, PROXY_LOG_DIR, INIT_SCRIPT,
#                 DB_HIVE_USER, DB_HIVE_PASSWORD, ENABLE_CLOUD_SQL_METASTORE,
#                 CLOUDSQL_INSTANCE_TYPE, DEFAULT_DB_PROTO, DEFAULT_DB_DRIVER,
#                 METASTORE_PROXY_PORT, METASTORE_DB
# Side effects: writes ${PROXY_BIN}, ${INIT_SCRIPT} and hive-template.xml in
#               the current directory; merges into /etc/hive/conf/hive-site.xml.
function install_cloud_sql_proxy() {
  echo 'Installing Cloud SQL Proxy ...' >&2
  # Install proxy.
  wget -nv --timeout=30 --tries=5 --retry-connrefused \
    https://dl.google.com/cloudsql/cloud_sql_proxy.linux.amd64
  mv cloud_sql_proxy.linux.amd64 "${PROXY_BIN}"
  chmod +x "${PROXY_BIN}"

  mkdir -p "${PROXY_DIR}"
  mkdir -p "${PROXY_LOG_DIR}"

  local proxy_flags
  proxy_flags="$(get_proxy_flags)"

  # Escape characters that are not valid inside an XML element value.
  # The replacement strings are quoted on purpose: since bash 5.2 an unquoted
  # '&' in the replacement expands to the matched text (patsub_replacement),
  # which would turn '<' into '<lt;' instead of '&lt;'.
  local db_hive_password_xml_escaped=${DB_HIVE_PASSWORD}
  db_hive_password_xml_escaped=${db_hive_password_xml_escaped//&/"&amp;"}
  db_hive_password_xml_escaped=${db_hive_password_xml_escaped//</"&lt;"}
  db_hive_password_xml_escaped=${db_hive_password_xml_escaped//>/"&gt;"}
  db_hive_password_xml_escaped=${db_hive_password_xml_escaped//'"'/"&quot;"}

  # Install proxy as systemd service for reboot tolerance.
  cat <<EOF >"${INIT_SCRIPT}"
[Unit]
Description=Google Cloud SQL Proxy
After=local-fs.target network-online.target
After=google.service
Before=shutdown.target

[Service]
Type=simple
ExecStart=/bin/sh -c '${PROXY_BIN} \
  -dir=${PROXY_DIR} \
  ${proxy_flags} >> /var/log/cloud-sql-proxy/cloud-sql-proxy.log 2>&1'

[Install]
WantedBy=multi-user.target
EOF
  chmod a+rw "${INIT_SCRIPT}"

  if [[ $ENABLE_CLOUD_SQL_METASTORE == "true" ]]; then
    local db_url="jdbc:${DEFAULT_DB_PROTO[${CLOUDSQL_INSTANCE_TYPE}]}://localhost:${METASTORE_PROXY_PORT}/${METASTORE_DB}"
    local db_driver="${DEFAULT_DB_DRIVER[${CLOUDSQL_INSTANCE_TYPE}]}"

    # Update hive-site.xml
    cat <<EOF >hive-template.xml
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
  <property>
    <name>javax.jdo.option.ConnectionURL</name>
    <value>${db_url}</value>
    <description>the URL of the MySQL database</description>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionDriverName</name>
    <value>${db_driver}</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionUserName</name>
    <value>${DB_HIVE_USER}</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionPassword</name>
    <value>${db_hive_password_xml_escaped}</value>
  </property>
</configuration>
EOF

    bdconfig merge_configurations \
      --configuration_file /etc/hive/conf/hive-site.xml \
      --source_configuration_file hive-template.xml \
      --clobber
  fi

  log 'Cloud SQL Proxy installation succeeded'
}

# Ensures the Hive user and metastore database exist on the MySQL instance
# reachable through the local proxy, initializing the schema when the
# database had to be created.
# Globals (read): METASTORE_PROXY_PORT, METASTORE_DB, DB_ADMIN_USER,
#                 DB_ADMIN_PASSWORD, DB_HIVE_USER, DB_HIVE_PASSWORD
function initialize_mysql_metastore_db() {
  log 'Initialzing MYSQL DB for Hive metastore ...'

  # Connection arguments are kept in arrays so that an empty Hive password
  # results in no argument at all, rather than an empty positional argument
  # being handed to the mysql client.
  local -a admin_args=(
    -h 127.0.0.1 -P "${METASTORE_PROXY_PORT}"
    -u "${DB_ADMIN_USER}" "--password=${DB_ADMIN_PASSWORD}"
  )
  local -a hive_args=(
    -h 127.0.0.1 -P "${METASTORE_PROXY_PORT}"
    -u "${DB_HIVE_USER}"
  )
  if [[ -n ${DB_HIVE_PASSWORD} ]]; then
    hive_args+=("-p${DB_HIVE_PASSWORD}")
  fi

  # Check if metastore is initialized.
  if ! mysql "${hive_args[@]}" -e ''; then
    mysql "${admin_args[@]}" -e \
      "CREATE USER '${DB_HIVE_USER}' IDENTIFIED BY '${DB_HIVE_PASSWORD}';"
  fi
  if ! mysql "${hive_args[@]}" -e "use ${METASTORE_DB}"; then
    # Initialize a Hive metastore DB
    mysql "${admin_args[@]}" -e \
      "CREATE DATABASE ${METASTORE_DB};
       GRANT ALL PRIVILEGES ON ${METASTORE_DB}.* TO '${DB_HIVE_USER}';"
    /usr/lib/hive/bin/schematool -dbType mysql -initSchema ||
      err 'Failed to set mysql schema.'
  fi
  log 'MYSQL DB initialized for Hive metastore'
}

# Ensures the Hive user and metastore database exist on the PostgreSQL
# instance reachable through the local proxy, initializing the schema when
# the database had to be created.
# Globals (read): METASTORE_PROXY_PORT, METASTORE_DB, DB_ADMIN_USER,
#                 DB_ADMIN_PASSWORD, DB_HIVE_USER, DB_HIVE_PASSWORD
function initialize_postgres_metastore_db() {
  log 'Initialzing POSTGRES DB for Hive metastore ...'
  local admin_connection=postgresql://"${DB_ADMIN_USER}":"${DB_ADMIN_PASSWORD}"@127.0.0.1:"${METASTORE_PROXY_PORT}"/
  local hive_connection=postgresql://"${DB_HIVE_USER}":"${DB_HIVE_PASSWORD}"@127.0.0.1:"${METASTORE_PROXY_PORT}"/postgres

  # psql meta-command that connects to the metastore database. It must be
  # built with double quotes so that METASTORE_DB is expanded by the shell;
  # psql does not understand "${...}" and would look for a database that is
  # literally named that way.
  local -r connect_metastore_cmd="\\c \"${METASTORE_DB}\" "

  # Check if metastore is initialized.
  if ! psql "${hive_connection}" -c ''; then
    log 'Create DB Hive user...'
    psql "${admin_connection}" -c "CREATE USER ${DB_HIVE_USER} WITH PASSWORD '${DB_HIVE_PASSWORD}';"
  fi
  if ! psql "${hive_connection}" -c "${connect_metastore_cmd}"; then
    log 'Create Hive Metastore database...'
    psql "${admin_connection}" -c "CREATE DATABASE ${METASTORE_DB};"
    psql "${hive_connection}" -c "${connect_metastore_cmd}"
    psql "${admin_connection}" -c "GRANT ALL PRIVILEGES ON DATABASE ${METASTORE_DB} TO ${DB_HIVE_USER} ;"

    log 'Create Hive Metastore schema...'
    /usr/lib/hive/bin/schematool -dbType postgres -initSchema ||
      err 'Failed to set postgres schema.'
  fi
  log 'POSTGRES DB initialized for Hive metastore'
}

# Dispatches metastore database initialization on CLOUDSQL_INSTANCE_TYPE.
# MYSQL and POSTGRES are handled; every other value (including SQLSERVER,
# which is not supported yet) is a no-op.
function initialize_metastore_db() {
  if [[ "${CLOUDSQL_INSTANCE_TYPE}" == "MYSQL" ]]; then
    initialize_mysql_metastore_db
  elif [[ "${CLOUDSQL_INSTANCE_TYPE}" == "POSTGRES" ]]; then
    initialize_postgres_metastore_db
  fi
  # TODO: add SQLSERVER support
}

# Validates the Hive metastore setup: checks schema compatibility with
# schematool and, when hive-server2 is already running, verifies that a
# query can be served and reloads permanent functions.
# Globals (read): CLOUDSQL_INSTANCE_TYPE
function run_validation() {
  log 'Validating Hive is running...'

  # schematool expects the lower-case database type name.
  local -r db_type="${CLOUDSQL_INSTANCE_TYPE,,}"

  # Check that metastore schema is compatible.
  # The hint is double-quoted so that the actual database type is shown
  # rather than the literal parameter expansion.
  /usr/lib/hive/bin/schematool -dbType "${db_type}" -info ||
    err "Run /usr/lib/hive/bin/schematool -dbType ${db_type} -upgradeSchemaFrom <schema-version> to upgrade the schema. Note that this may break Hive metastores that depend on the old schema"

  # Validate it's functioning.
  # On newer Dataproc images, we start hive-server2 after init actions are run,
  # so skip this step if hive-server2 isn't already running.
  if (systemctl show -p SubState --value hive-server2 | grep -q running); then
    local hiveserver_uri
    hiveserver_uri=$(get_hiveserver_uri)
    if ! timeout 60s beeline -u "${hiveserver_uri}" -e 'SHOW TABLES;' >&/dev/null; then
      err 'Failed to bring up Cloud SQL Metastore'
    else
      log 'Cloud SQL Hive Metastore initialization succeeded'
    fi

    # Execute the Hive "reload function" DDL to reflect permanent functions
    # that have already been created in the HiveServer.
    beeline -u "${hiveserver_uri}" -e "reload function;"
    log 'Reloaded permanent functions'
  fi
  log 'Validated Hive functioning'
}

# Installs the MySQL command line client with apt or yum, whichever is
# available, unless a "mysql" command already exists.
function install_mysql_cli() {
  if ! command -v mysql >/dev/null; then
    log "Installing MySQL CLI ..."
    if command -v apt >/dev/null; then
      apt update && apt install mysql-client -y
    elif command -v yum >/dev/null; then
      yum -y update && yum -y install mysql
    fi
    log "MySQL CLI installed"
    return
  fi

  log "MySQL CLI is already installed"
}

# Installs the PostgreSQL command line client with apt or yum, whichever is
# available, unless a "psql" command already exists.
function install_postgres_cli() {
  if ! command -v psql >/dev/null; then
    log "Installing POSTGRES CLI ..."
    if command -v apt >/dev/null; then
      apt update && apt install postgresql-client -y
    elif command -v yum >/dev/null; then
      yum -y update && yum -y install postgresql
    fi
    log "POSTGRES CLI installed"
    return
  fi

  log "POSTGRES CLI is already installed"
}

# Installs the database client matching CLOUDSQL_INSTANCE_TYPE.
# SQLSERVER is reported through err because it is not supported yet; any
# other unrecognized value is a no-op.
function install_db_cli() {
  if [[ "${CLOUDSQL_INSTANCE_TYPE}" == "MYSQL" ]]; then
    install_mysql_cli
  elif [[ "${CLOUDSQL_INSTANCE_TYPE}" == "POSTGRES" ]]; then
    install_postgres_cli
  elif [[ "${CLOUDSQL_INSTANCE_TYPE}" == "SQLSERVER" ]]; then
    # TODO: add SQL support
    err 'Fail fast here if SQLSERVER support is not enabled.'
  fi
}

# Stops and disables the local MySQL systemd unit. The unit is looked up as
# "mysql" (Debian/Ubuntu) first and "mysqld" (CentOS/Rocky) second; only the
# first enabled one is handled.
function stop_mysql_service() {
  local unit
  for unit in mysql mysqld; do
    if systemctl is-enabled --quiet "${unit}"; then
      log "Stopping and disabling ${unit}.service ..."
      systemctl stop "${unit}"
      systemctl disable "${unit}"
      log "${unit}.service stopped and disabled"
      return
    fi
  done

  log 'Service mysql is not enabled'
}

# Stops hive-server2 and then hive-metastore, each only when its systemd
# unit is enabled; otherwise a notice is printed to stdout.
function stop_hive_services() {
  local unit
  local label
  for unit in hive-server2 hive-metastore; do
    # Human readable name used in messages, e.g. "Hive server2".
    label="Hive ${unit#hive-}"
    if systemctl is-enabled --quiet "${unit}"; then
      log "Stopping ${label} ..."
      systemctl stop "${unit}"
      log "${label} stopped"
    else
      echo "Service ${label} is not enabled"
    fi
  done
}

# Restarts hive-metastore (to pick up config changes) and then hive-server2
# (to re-establish its metastore connection), each only when its systemd
# unit is enabled; otherwise a notice is printed to stdout.
function start_hive_services() {
  local unit
  local label
  for unit in hive-metastore hive-server2; do
    # Human readable name used in messages, e.g. "Hive metastore".
    label="Hive ${unit#hive-}"
    if systemctl is-enabled --quiet "${unit}"; then
      log "Restarting ${label} ..."
      systemctl restart "${unit}" ||
        err "Unable to start ${unit} service"
      log "${label} restarted"
    else
      echo "Service ${label} is not enabled"
    fi
  done
}

# Enables and starts the cloud-sql-proxy systemd unit. When the Cloud SQL
# metastore is enabled, waits (with retries) until the proxy port accepts
# connections on localhost.
function start_cloud_sql_proxy() {
  log 'Starting Cloud SQL proxy ...'
  systemctl enable cloud-sql-proxy
  if ! systemctl start cloud-sql-proxy; then
    err 'Unable to start cloud-sql-proxy service'
  fi

  case "${ENABLE_CLOUD_SQL_METASTORE}" in
    true)
      run_with_retries nc -zv localhost "${METASTORE_PROXY_PORT}"
      ;;
  esac

  log 'Cloud SQL Proxy started'
  log 'Logs can be found in /var/log/cloud-sql-proxy/cloud-sql-proxy.log'
}

# Reports an error through err when there is nothing to proxy, i.e. the
# Cloud SQL metastore is disabled and no additional instances are configured.
function validate() {
  if [[ $ENABLE_CLOUD_SQL_METASTORE == "true" || -n "${ADDITIONAL_INSTANCES}" ]]; then
    return 0
  fi
  err 'No Cloud SQL instances to proxy'
}

# Master-node flow: installs and starts the Cloud SQL proxy and, when the
# Cloud SQL metastore is enabled, stops the local Hive/MySQL services first
# and afterwards initializes the metastore database, restarts Hive and
# validates the result.
function update_master() {
  if [[ $ENABLE_CLOUD_SQL_METASTORE == "true" ]]; then
    stop_hive_services
    stop_mysql_service
  fi

  install_cloud_sql_proxy
  start_cloud_sql_proxy

  # Everything below only applies to the Cloud SQL backed metastore.
  [[ $ENABLE_CLOUD_SQL_METASTORE == "true" ]] || return 0

  install_db_cli

  # Retry as there may be failures due to race condition
  run_with_retries initialize_metastore_db

  start_hive_services
  # Make sure that Hive metastore properly configured.
  run_with_retries run_validation
}

# Worker-node flow: installs and starts the Cloud SQL proxy only when
# ENABLE_PROXY_ON_WORKERS is "true". There is no in-cluster MySQL on workers.
function update_worker() {
  [[ $ENABLE_PROXY_ON_WORKERS == "true" ]] || return 0

  install_cloud_sql_proxy
  start_cloud_sql_proxy
}

# Reworks the apt source lists so that each repository is verified against
# its own keyring (the "signed-by" option) instead of the shared
# /etc/apt/trusted.gpg, which is moved aside at the end.
# Does nothing unless is_debuntu succeeds.
# Repositories handled: Dataproc bigtop, Adoptium, Docker, Google Cloud
# (including logging/monitoring), CRAN-R and MySQL. The Adoptium and Docker
# lists are (re)written unconditionally; the others are only touched when
# their list file already exists.
# NOTE(review): "region", "list_file" and "keyid" are assigned without
# "local" and therefore remain set in the calling shell.
function clean_up_sources_lists() {
  if ! is_debuntu ; then return ; fi
  #
  # bigtop (primary)
  #
  local -r dataproc_repo_file="/etc/apt/sources.list.d/dataproc.list"

  # Only rewrite the Dataproc list once: skip when it already has signed-by.
  if [[ -f "${dataproc_repo_file}" ]] && ! grep -q signed-by "${dataproc_repo_file}" ; then
    # Derive the region from the zone metadata value: drop everything up to
    # the last "/" and the trailing "-<letters>" zone suffix.
    region="$(get_metadata_value zone | perl -p -e 's:.*/:: ; s:-[a-z]+$::')"

    # Compute the URI of the regional bigtop repository from the first
    # matching "deb" line; the list file itself is not modified here.
    local regional_bigtop_repo_uri
    regional_bigtop_repo_uri=$(cat ${dataproc_repo_file} |
      sed -E "s#/dataproc-bigtop-repo(-dev)?/#/goog-dataproc-bigtop-repo\\1-${region}/#" |
      grep -E "deb .*goog-dataproc-bigtop-repo(-dev)?-${region}.* dataproc contrib" |
      cut -d ' ' -f 2 |
      head -1)

    # The archive key lives directly under the repository URI; avoid a
    # doubled slash when the URI already ends with one.
    if [[ "${regional_bigtop_repo_uri}" == */ ]]; then
      local -r bigtop_key_uri="${regional_bigtop_repo_uri}archive.key"
    else
      local -r bigtop_key_uri="${regional_bigtop_repo_uri}/archive.key"
    fi

    local -r bigtop_kr_path="/usr/share/keyrings/bigtop-keyring.gpg"
    rm -f "${bigtop_kr_path}"
    curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 \
      "${bigtop_key_uri}" | gpg --dearmor -o "${bigtop_kr_path}"

    sed -i -e "s:deb https:deb [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}"
    sed -i -e "s:deb-src https:deb-src [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}"
  fi

  #
  # adoptium
  #
  # https://adoptium.net/installation/linux/#_deb_installation_on_debian_or_ubuntu
  local -r key_url="https://packages.adoptium.net/artifactory/api/gpg/key/public"
  local -r adoptium_kr_path="/usr/share/keyrings/adoptium.gpg"
  rm -f "${adoptium_kr_path}"
  curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${key_url}" \
   | gpg --dearmor -o "${adoptium_kr_path}"
  echo "deb [signed-by=${adoptium_kr_path}] https://packages.adoptium.net/artifactory/deb/ $(os_codename) main" \
   > /etc/apt/sources.list.d/adoptium.list


  #
  # docker
  #
  local docker_kr_path="/usr/share/keyrings/docker-keyring.gpg"
  local docker_repo_file="/etc/apt/sources.list.d/docker.list"
  local -r docker_key_url="https://download.docker.com/linux/$(os_id)/gpg"

  rm -f "${docker_kr_path}"
  curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${docker_key_url}" \
    | gpg --dearmor -o "${docker_kr_path}"
  echo "deb [signed-by=${docker_kr_path}] https://download.docker.com/linux/$(os_id) $(os_codename) stable" \
    > ${docker_repo_file}

  #
  # google cloud + logging/monitoring
  #
  # The ls call only serves as an existence check for any google-cloud list;
  # its output is not captured and goes to stdout.
  if ls /etc/apt/sources.list.d/google-cloud*.list ; then
    rm -f /usr/share/keyrings/cloud.google.gpg
    curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg
    for list in google-cloud google-cloud-logging google-cloud-monitoring ; do
      list_file="/etc/apt/sources.list.d/${list}.list"
      if [[ -f "${list_file}" ]]; then
        sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https:g' "${list_file}"
      fi
    done
  fi

  #
  # cran-r
  #
  if [[ -f /etc/apt/sources.list.d/cran-r.list ]]; then
    # A different signing key id is used on Ubuntu 18.
    keyid="0x95c0faf38db3ccad0c080a7bdc78b2ddeabc47b7"
    if is_ubuntu18 ; then keyid="0x51716619E084DAB9"; fi
    rm -f /usr/share/keyrings/cran-r.gpg
    curl "https://keyserver.ubuntu.com/pks/lookup?op=get&search=${keyid}" | \
      gpg --dearmor -o /usr/share/keyrings/cran-r.gpg
    sed -i -e 's:deb http:deb [signed-by=/usr/share/keyrings/cran-r.gpg] http:g' /etc/apt/sources.list.d/cran-r.list
  fi

  #
  # mysql
  #
  if [[ -f /etc/apt/sources.list.d/mysql.list ]]; then
    rm -f /usr/share/keyrings/mysql.gpg
    curl 'https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xBCA43417C3B485DD128EC6D4B7B3B788A8D3785C' | \
      gpg --dearmor -o /usr/share/keyrings/mysql.gpg
    sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/mysql.gpg] https:g' /etc/apt/sources.list.d/mysql.list
  fi

  # Retire the legacy shared keyring now that every list has its own.
  if [[ -f /etc/apt/trusted.gpg ]] ; then mv /etc/apt/trusted.gpg /etc/apt/old-trusted.gpg ; fi

}

# Entry point of the init action: validates the configuration, repairs the
# apt sources and then runs the master or worker flow depending on the
# "dataproc-role" metadata attribute (anything other than "Master" is
# treated as a worker).
function main() {
  local role
  role="$(/usr/share/google/get_metadata_value attributes/dataproc-role)"

  validate
  repair_old_backports
  clean_up_sources_lists

  case "${role}" in
    Master)
      update_master
      ;;
    *)
      update_worker
      ;;
  esac

  log 'All done'
}

# Run the init action; main takes no arguments.
main
====== Filename: ./examples/secure-boot/rapids.sh ======
#!/bin/bash

# Copyright 2019,2020,2021,2022,2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This initialization action script will install rapids on a Dataproc
# cluster.

set -euxo pipefail

# Prints the ID field of /etc/os-release (xargs trims quotes and whitespace).
function os_id() {
  grep '^ID=' /etc/os-release |
    cut -d= -f2 |
    xargs
}

# Succeeds when the OS id is "ubuntu".
function is_ubuntu() {
  [[ "$(os_id)" == 'ubuntu' ]]
}

# Succeeds on Ubuntu when the version reported by os_version starts with 18.04.
function is_ubuntu18() {
  is_ubuntu && [[ "$(os_version)" == '18.04'* ]]
}

# Succeeds when the OS id is "debian".
function is_debian() {
  [[ "$(os_id)" == 'debian' ]]
}

# Succeeds on either Debian or Ubuntu.
function is_debuntu() {
  is_debian || is_ubuntu
}

# Fetches a metadata server URL and prints the response body to stdout.
# Arguments: $1 - full metadata URL
# Outputs:   the body, only when curl succeeds with HTTP status 200
# Returns:   the exit status of curl
function print_metadata_value() {
  # "local readonly x" would declare a second variable named "readonly"
  # rather than make x read-only, so plain locals are used here.
  local tmpfile
  tmpfile=$(mktemp) || return
  local http_code
  local return_code=0
  # Capture the status with "||" so that the temp file is still removed when
  # curl fails while errexit is active.
  http_code=$(curl -f "${1}" -H "Metadata-Flavor: Google" -w "%{http_code}" \
    -s -o "${tmpfile}" 2>/dev/null) || return_code=$?
  # If the command completed successfully, print the metadata value to stdout.
  if [[ ${return_code} == 0 && ${http_code} == 200 ]]; then
    cat "${tmpfile}"
  fi
  rm -f "${tmpfile}"
  return "${return_code}"
}

# Thin wrapper around print_metadata_value.
# Arguments: $1 - full metadata URL
# Returns:   the exit status of print_metadata_value
function print_metadata_value_if_exists() {
  local -r url=$1
  # Quoted so the URL is always passed on as exactly one argument.
  print_metadata_value "${url}"
}

# Prints a metadata value, looking at the instance level first and falling
# back to the project level.
# Command tracing is suspended while the lookups run and restored to its
# previous state afterwards.
# Arguments: $1 - metadata path relative to instance/ or project/
# Returns:   0 when a value was printed, otherwise the status of the
#            project-level lookup
function get_metadata_value() {
  local xtrace_was_on='false'
  if [[ $- == *x* ]]; then xtrace_was_on='true'; fi
  set +x
  local -r varname=$1
  local -r MDS_PREFIX=http://metadata.google.internal/computeMetadata/v1
  local return_code=0
  # Print the instance metadata value. The status is captured with "||" so
  # that a miss does not abort the script under errexit before the project
  # fallback is tried.
  print_metadata_value_if_exists "${MDS_PREFIX}/instance/${varname}" || return_code=$?
  # If the instance doesn't have the value, try the project.
  if [[ ${return_code} != 0 ]]; then
    return_code=0
    print_metadata_value_if_exists "${MDS_PREFIX}/project/${varname}" || return_code=$?
  fi
  if [[ "${xtrace_was_on}" == 'true' ]]; then set -x; fi
  return "${return_code}"
}

# Prints the metadata attribute "attributes/<name>", or the given default
# (without a trailing newline) when the lookup fails.
# Runs in a subshell so that disabling command tracing stays local.
# Arguments: $1 - attribute name, $2 - optional default value
function get_metadata_attribute() (
  set +x
  local -r name="$1"
  local -r fallback="${2:-}"
  if ! get_metadata_value "attributes/${name}"; then
    echo -n "${fallback}"
  fi
)

# Succeeds when the major component of CUDA_VERSION is 12.
function is_cuda12() {
  local -r major="${CUDA_VERSION%%.*}"
  [[ "${major}" == "12" ]]
}

# Succeeds when the major component of CUDA_VERSION is 11.
function is_cuda11() {
  local -r major="${CUDA_VERSION%%.*}"
  [[ "${major}" == "11" ]]
}

# Runs a command up to ten times, sleeping five seconds after each failure.
# The arguments are joined into a single string and evaluated with eval, so
# callers may pass shell syntax; only pass trusted input.
# Arguments: "$*" - the command line to evaluate
# Outputs:   a failure notice on stdout when every attempt failed
# Returns:   0 on the first success, 1 when all attempts failed
function execute_with_retries() {
  local -r cmd="$*"
  # Keep the loop variable private so a caller's "i" is not overwritten.
  local i
  for i in {0..9} ; do
    if eval "$cmd"; then
      return 0 ; fi
    sleep 5
  done
  echo "Cmd '${cmd}' failed."
  return 1
}

# Writes the Dask-Yarn configuration file under /etc/dask/.
# Sets the read-only globals DASK_YARN_CONFIG_DIR and DASK_YARN_CONFIG_FILE.
# Globals (read): DASK_CONDA_ENV
function configure_dask_yarn() {
  DASK_YARN_CONFIG_DIR=/etc/dask/
  readonly DASK_YARN_CONFIG_DIR
  DASK_YARN_CONFIG_FILE=${DASK_YARN_CONFIG_DIR}/config.yaml
  readonly DASK_YARN_CONFIG_FILE

  mkdir -p "${DASK_YARN_CONFIG_DIR}"

  # Minimal custom configuration is required for this
  # setup. Please see https://yarn.dask.org/en/latest/quickstart.html#usage
  # for information on tuning Dask-Yarn environments.
  cat >"${DASK_YARN_CONFIG_FILE}" <<EOF
# Config file for Dask Yarn.
#
# These values are joined on top of the default config, found at
# https://yarn.dask.org/en/latest/configuration.html#default-configuration

yarn:
  environment: python://${DASK_CONDA_ENV}/bin/python

  worker:
    count: 2
    gpus: 1
    class: "dask_cuda.CUDAWorker"
EOF
}

# Installs the launcher script and systemd unit for the Dask CUDA worker and
# decides whether the worker service runs on this node.
# The service is enabled on every non-master node, and on the master when
# the cluster has no workers or the "dask-cuda-worker-on-master" attribute
# is "true" (its default).
# Globals (read):    DASK_WORKER_SERVICE, DASK_CONDA_ENV, MASTER, ROLE
# Globals (written): enable_worker_service - "1" when enabled, "0" otherwise
function install_systemd_dask_worker() {
  echo "Installing systemd Dask Worker service..."
  local -r dask_worker_local_dir="/tmp/${DASK_WORKER_SERVICE}"

  mkdir -p "${dask_worker_local_dir}"

  local DASK_WORKER_LAUNCHER="/usr/local/bin/${DASK_WORKER_SERVICE}-launcher.sh"

  cat <<EOF >"${DASK_WORKER_LAUNCHER}"
#!/bin/bash
LOGFILE="/var/log/${DASK_WORKER_SERVICE}.log"
nvidia-smi -c DEFAULT
echo "dask-cuda-worker starting, logging to \${LOGFILE}"
${DASK_CONDA_ENV}/bin/dask-cuda-worker "${MASTER}:8786" --local-directory="${dask_worker_local_dir}" --memory-limit=auto >> "\${LOGFILE}" 2>&1
EOF

  chmod 750 "${DASK_WORKER_LAUNCHER}"

  local -r dask_service_file="/usr/lib/systemd/system/${DASK_WORKER_SERVICE}.service"
  cat <<EOF >"${dask_service_file}"
[Unit]
Description=Dask Worker Service
[Service]
Type=simple
Restart=on-failure
ExecStart=/bin/bash -c 'exec ${DASK_WORKER_LAUNCHER}'
[Install]
WantedBy=multi-user.target
EOF
  chmod a+r "${dask_service_file}"

  systemctl daemon-reload

  # Enable the service.
  # Give the flag an explicit default: it is read below and by other
  # functions, and an unset variable aborts the script under "set -u" when
  # the master of a multi-node cluster opts out of running a worker.
  enable_worker_service="0"
  if [[ "${ROLE}" != "Master" ]]; then
    enable_worker_service="1"
  else
    # Declared separately from the assignment so a failing lookup is not
    # masked by the exit status of "local".
    local run_worker_on_master
    run_worker_on_master=$(get_metadata_attribute dask-cuda-worker-on-master 'true')
    # Enable service on single-node cluster (no workers)
    local worker_count
    worker_count="$(get_metadata_attribute dataproc-worker-count)"
    if [[ "${worker_count}" == "0" || "${run_worker_on_master}" == "true" ]]; then
      enable_worker_service="1"
    fi
  fi

  if [[ "${enable_worker_service}" == "1" ]]; then
    systemctl enable "${DASK_WORKER_SERVICE}"
    systemctl restart "${DASK_WORKER_SERVICE}"
  fi
}

# Installs the launcher script and systemd unit for the Dask scheduler and
# enables the unit. Does nothing unless the short hostname equals MASTER,
# i.e. the scheduler only runs on the primary master.
# Globals (read): MASTER, DASK_SCHEDULER_SERVICE, DASK_CONDA_ENV
function install_systemd_dask_scheduler() {
  # only run scheduler on primary master
  [[ "$(hostname -s)" == "${MASTER}" ]] || return 0

  echo "Installing systemd Dask Scheduler service..."
  local -r scratch_dir="/tmp/${DASK_SCHEDULER_SERVICE}"
  mkdir -p "${scratch_dir}"

  # Launcher script: starts the scheduler and appends its output to a log.
  local -r launcher="/usr/local/bin/${DASK_SCHEDULER_SERVICE}-launcher.sh"
  cat >"${launcher}" <<EOF
#!/bin/bash
LOGFILE="/var/log/${DASK_SCHEDULER_SERVICE}.log"
echo "dask scheduler starting, logging to \${LOGFILE}"
${DASK_CONDA_ENV}/bin/dask scheduler >> "\${LOGFILE}" 2>&1
EOF
  chmod 750 "${launcher}"

  # systemd unit that executes the launcher.
  local -r unit_file="/usr/lib/systemd/system/${DASK_SCHEDULER_SERVICE}.service"
  cat >"${unit_file}" <<EOF
[Unit]
Description=Dask Scheduler Service
[Service]
Type=simple
Restart=on-failure
ExecStart=/bin/bash -c 'exec ${launcher}'
[Install]
WantedBy=multi-user.target
EOF
  chmod a+r "${unit_file}"

  systemctl daemon-reload

  # Enable the service
  systemctl enable "${DASK_SCHEDULER_SERVICE}"
}

# Installs the Dask systemd services: the scheduler first, then the worker.
function install_systemd_dask_service() {
  local step
  for step in install_systemd_dask_scheduler install_systemd_dask_worker; do
    "${step}"
  done
}

# Restarts Knox, clearing its deployments directory while it is stopped.
# Globals (read): KNOX_HOME
function restart_knox() {
  systemctl stop knox
  # The glob has to stay outside the quotes, otherwise rm looks for an entry
  # literally named "*" and removes nothing. ":?" guards against an empty
  # KNOX_HOME turning this into "rm -rf /data/deployments/*".
  rm -rf "${KNOX_HOME:?}/data/deployments/"*
  systemctl start knox
}

# Registers the Dask dashboard with the Knox gateway.
# Skipped (returning 0) when the KNOX_HOME directory does not exist.
# Steps:
#   1. If the default topology file exists, insert DASK and DASKWS service
#      entries before its closing </topology> tag.
#   2. Write service.xml and rewrite.xml for the DASK role into KNOX_DASK_DIR.
#   3. Write service.xml and rewrite.xml for the DASKWS (websocket) role into
#      KNOX_DASKWS_DIR.
#   4. Hand both directories to knox:knox and restart Knox when ROLE is
#      non-empty.
# Globals (read): KNOX_HOME, KNOX_DASK_DIR, KNOX_DASKWS_DIR, MASTER, ROLE
# All XML heredocs use a quoted delimiter, so their content is written
# literally without shell expansion.
function configure_knox_for_dask() {
  if [[ ! -d "${KNOX_HOME}" ]]; then
    echo "Skip configuring Knox rules for Dask"
    return 0
  fi

  local DASK_UI_PORT=8787
  if [[ -f /etc/knox/conf/topologies/default.xml ]]; then
    # sed "i" inserts the service definitions on a new line before the
    # closing topology tag.
    sed -i \
      "/<\/topology>/i <service><role>DASK<\/role><url>http://localhost:${DASK_UI_PORT}<\/url><\/service> <service><role>DASKWS<\/role><url>ws:\/\/${MASTER}:${DASK_UI_PORT}<\/url><\/service>" \
      /etc/knox/conf/topologies/default.xml
  fi

  mkdir -p "${KNOX_DASK_DIR}"

  # Service definition for the DASK role: routes and the rewrite rules
  # applied to them.
  cat >"${KNOX_DASK_DIR}/service.xml" <<'EOF'
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>

<service role="DASK" name="dask" version="0.1.0">
  <policies>
    <policy role="webappsec"/>
    <policy role="authentication" name="Anonymous"/>
    <policy role="rewrite"/>
    <policy role="authorization"/>
  </policies>

  <routes>
    <!-- Javascript paths -->
    <route path="/dask/**/*.js">
      <rewrite apply="DASK/dask/inbound/js/dask" to="request.url"/>
      <rewrite apply="DASK/dask/outbound/js" to="response.body"/>
    </route>
    <route path="/dask/**/*.js?**">
      <rewrite apply="DASK/dask/inbound/js/dask" to="request.url"/>
      <rewrite apply="DASK/dask/outbound/js" to="response.body"/>
    </route>

    <!-- CSS paths -->
    <route path="/dask/**/*.css">
      <rewrite apply="DASK/dask/inbound/css/dask" to="request.url"/>
    </route>

    <!-- General path routing -->
    <route path="/dask">
      <rewrite apply="DASK/dask/inbound/root" to="request.url"/>
      <rewrite apply="DASK/dask/outbound/headers" to="response.headers"/>
    </route>
    <route path="/dask/**">
      <rewrite apply="DASK/dask/inbound/root/path" to="request.url"/>
      <rewrite apply="DASK/dask/outbound/headers" to="response.headers"/>
      <rewrite apply="DASK/dask/outbound/logs" to="response.body"/>
    </route>
    <route path="/dask/**?**">
      <rewrite apply="DASK/dask/inbound/root/query" to="request.url"/>
      <rewrite apply="DASK/dask/outbound/headers" to="response.headers"/>
      <rewrite apply="DASK/dask/outbound/logs" to="response.body"/>
    </route>
  </routes>
  <dispatch classname="org.apache.knox.gateway.dispatch.PassAllHeadersNoChunkedPostDispatch"/>
</service>
EOF

  # Rewrite rules referenced by the DASK service definition above.
  # NOTE(review): service.xml applies "DASK/dask/outbound/js", but no rule or
  # filter with that name is defined in this file - confirm whether that is
  # intended.
  cat >"${KNOX_DASK_DIR}/rewrite.xml" <<'EOF'
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>

<rules>
  <rule dir="IN" name="DASK/dask/inbound/js/dask" pattern="http://*:*/**/dask/{**}?{**}">
    <rewrite template="{$serviceUrl[DASK]}/{**}?{**}"/>
  </rule>
  <rule dir="IN" name="DASK/dask/inbound/root" pattern="http://*:*/**/dask">
    <rewrite template="{$serviceUrl[DASK]}"/>
  </rule>
  <rule dir="IN" name="DASK/dask/inbound/root/path" pattern="http://*:*/**/dask/{**}">
    <rewrite template="{$serviceUrl[DASK]}/{**}"/>
  </rule>
  <rule dir="IN" name="DASK/dask/inbound/root/query" pattern="http://*:*/**/dask/{**}?{**}">
    <rewrite template="{$serviceUrl[DASK]}/{**}?{**}"/>
  </rule>
  <rule dir="IN" name="DASK/dask/inbound/css/dask" pattern="http://*:*/**/dask/{**}?{**}">
    <rewrite template="{$serviceUrl[DASK]}/{**}?{**}"/>
  </rule>
  <!-- without the /gateway/default prefix -->
  <rule dir="IN" name="DASK/dask/inbound/root/noprefix" pattern="http://*:*/dask">
    <rewrite template="{$serviceUrl[DASK]}"/>
  </rule>

  <rule dir="OUT" name="DASK/dask/outbound/logs" pattern="/logs">
    <rewrite template="{$frontend[path]}/dask/info/logs"/>
  </rule>

  <!-- Rewrite redirect responses Location header -->
  <filter name="DASK/dask/outbound/headers">
    <content type="application/x-http-headers">
      <apply path="Location" rule="DASK/dask/outbound/headers/location"/>
    </content>
  </filter>

  <rule dir="OUT" name="DASK/dask/outbound/headers/location" flow="OR">
    <match pattern="*://*:*/">
      <rewrite template="{$frontend[path]}/dask/"/>
    </match>
    <match pattern="*://*:*/{**}">
      <rewrite template="{$frontend[path]}/dask/{**}"/>
    </match>
    <match pattern="*://*:*/{**}?{**}">
      <rewrite template="{$frontend[path]}/dask/{**}?{**}"/>
    </match>
    <match pattern="/{**}">
      <rewrite template="{$frontend[path]}/dask/{**}"/>
    </match>
    <match pattern="/{**}?{**}">
      <rewrite template="{$frontend[path]}/dask/{**}?{**}"/>
    </match>
  </rule>
</rules>
EOF

  mkdir -p "${KNOX_DASKWS_DIR}"

  # Service definition for the DASKWS (websocket) role.
  cat >"${KNOX_DASKWS_DIR}/service.xml" <<'EOF'
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>

<service role="DASKWS" name="daskws" version="0.1.0">
  <policies>
    <policy role="webappsec"/>
    <policy role="authentication" name="Anonymous"/>
    <policy role="rewrite"/>
    <policy role="authorization"/>
  </policies>

  <routes>

    <route path="/dask/**/ws">
      <rewrite apply="DASKWS/daskws/inbound/ws" to="request.url"/>
    </route>

  </routes>
  <dispatch classname="org.apache.knox.gateway.dispatch.PassAllHeadersNoChunkedPostDispatch"/>
</service>
EOF

  # Rewrite rule for the websocket route.
  cat >"${KNOX_DASKWS_DIR}/rewrite.xml" <<'EOF'
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>

<rules>
  <rule dir="IN" name="DASKWS/daskws/inbound/ws" pattern="ws://*:*/**/dask/{**}/ws">
    <rewrite template="{$serviceUrl[DASKWS]}/{**}/ws"/>
  </rule>
</rules>
EOF

  chown -R knox:knox "${KNOX_DASK_DIR}" "${KNOX_DASKWS_DIR}"

  # Do not restart knox during pre-init script run
  if [[ -n "${ROLE}" ]]; then
    restart_knox
  fi
}

# Writes the google-fluentd configuration that tails the Dask log files and
# restarts google-fluentd.
# On the node whose short hostname equals MASTER the file is (re)created
# with the scheduler section; the worker section is appended whenever
# enable_worker_service is "1".
# Globals (read): MASTER, enable_worker_service
function configure_fluentd_for_dask() {
  local -r fluentd_conf=/etc/google-fluentd/config.d/dataproc-dask.conf

  if [[ "$(hostname -s)" == "${MASTER}" ]]; then
    cat <<EOF >"${fluentd_conf}"
# Fluentd config for Dask logs

# Dask scheduler
<source>
  @type tail
  path /var/log/dask-scheduler.log
  pos_file /var/tmp/fluentd.dataproc.dask.scheduler.pos
  read_from_head true
  tag google.dataproc.dask-scheduler
  <parse>
    @type none
  </parse>
</source>

<filter google.dataproc.dask-scheduler>
  @type record_transformer
  <record>
    filename dask-scheduler.log
  </record>
</filter>
EOF
  fi

  if [[ "${enable_worker_service}" == "1" ]]; then
    cat <<EOF >>"${fluentd_conf}"
# Dask worker
<source>
  @type tail
  path /var/log/dask-worker.log
  pos_file /var/tmp/fluentd.dataproc.dask.worker.pos
  read_from_head true
  tag google.dataproc.dask-worker
  <parse>
    @type none
  </parse>
</source>

<filter google.dataproc.dask-worker>
  @type record_transformer
  <record>
    filename dask-worker.log
  </record>
</filter>
EOF
  fi

  systemctl restart google-fluentd
}

# Creates the "dask-rapids" conda environment containing CUDA, RAPIDS and
# Dask, trying mamba first and falling back to conda.
# Package specs depend on the CUDA major version and on DASK_RUNTIME.
# Globals (read): CUDA_VERSION (via is_cuda12/is_cuda11), RAPIDS_VERSION,
#                 DASK_RUNTIME, DASK_CONDA_ENV, install_log
# Globals (written): rapids_spec, CONDA_PACKAGES, mamba, conda
# Returns: non-zero when neither installer succeeded.
# NOTE(review): install_log is not defined in this function - presumably it
# is set elsewhere in the script; confirm.
# NOTE(review): when CUDA_VERSION is neither 11.x nor 12.x, python_spec,
# cuda_spec, dask_spec and numba_spec are never assigned.
function install_dask_rapids() {
  if is_cuda12 ; then
    local python_spec="python>=3.11"
    local cuda_spec="cuda-version>=12,<13"
    local dask_spec="dask>=2024.7"
    local numba_spec="numba"
  elif is_cuda11 ; then
    local python_spec="python>=3.9"
    local cuda_spec="cuda-version>=11,<12.0a0"
    local dask_spec="dask"
    local numba_spec="numba"
  fi

  rapids_spec="rapids>=${RAPIDS_VERSION}"
  CONDA_PACKAGES=()
  if [[ "${DASK_RUNTIME}" == 'yarn' ]]; then
    # Pin `distributed` and `dask` package versions to old release
    # because `dask-yarn` 0.9 uses skein in a way which
    # is not compatible with `distributed` package 2022.2 and newer:
    # https://github.com/dask/dask-yarn/issues/155

    dask_spec="dask<2022.2"
    python_spec="python>=3.7,<3.8.0a0"
    rapids_spec="rapids<=24.05"
    if is_ubuntu18 ; then
      # the libuuid.so.1 distributed with fiona 1.8.22 dumps core when calling uuid_generate_time_generic
      CONDA_PACKAGES+=("fiona<1.8.22")
    fi
    CONDA_PACKAGES+=('dask-yarn=0.9' "distributed<2022.2")
  fi

  CONDA_PACKAGES+=(
    "${cuda_spec}"
    "${rapids_spec}"
    "${dask_spec}"
    "dask-bigquery"
    "dask-ml"
    "dask-sql"
    "cudf"
    "${numba_spec}"
  )

  # Install cuda, rapids, dask
  mamba="/opt/conda/miniconda3/bin/mamba"
  conda="/opt/conda/miniconda3/bin/conda"

  # NOTE(review): this removes an environment named "dask", while the one
  # created below is named "dask-rapids" - confirm this is intended.
  "${conda}" remove -n dask --all || echo "unable to remove conda environment [dask]"

  # The installation runs in a subshell with errexit disabled so that a
  # failing installer can be retried with the next one; "return" inside the
  # subshell ends the subshell, whose status becomes the function's status.
  ( set +e
  local is_installed="0"
  for installer in "${mamba}" "${conda}" ; do
    # Skip creation when the environment directory already exists. On
    # success retval is 0; on failure the installer status is kept and the
    # install log is printed.
    test -d "${DASK_CONDA_ENV}" || \
      time "${installer}" "create" -m -n 'dask-rapids' -y --no-channel-priority \
      -c 'conda-forge' -c 'nvidia' -c 'rapidsai'  \
      ${CONDA_PACKAGES[*]} \
      "${python_spec}" \
      > "${install_log}" 2>&1 && retval=$? || { retval=$? ; cat "${install_log}" ; }
    sync
    if [[ "$retval" == "0" ]] ; then
      is_installed="1"
      break
    fi
    # Relax channel priority before trying the next installer.
    "${conda}" config --set channel_priority flexible
  done
  if [[ "${is_installed}" == "0" ]]; then
    echo "failed to install dask"
    return 1
  fi
  )
}

# Installs Dask with RAPIDS and brings it up for the configured runtime.
# In "yarn" mode only the Dask-Yarn config file is written. In any other
# ("standalone") mode the systemd services are installed and started, Knox
# is configured and, when the "dask-cloud-logging" attribute is "true",
# fluentd is configured as well. Finally the YARN daemons of this node are
# restarted.
# Globals (read): DASK_RUNTIME, MASTER, ROLE, DASK_SCHEDULER_SERVICE,
#                 DASK_WORKER_SERVICE, enable_worker_service
function main() {
  # Install Dask with RAPIDS
  install_dask_rapids

  if [[ "${DASK_RUNTIME}" != "yarn" ]]; then
    # "standalone" mode: Dask relies on systemd units to launch.
    install_systemd_dask_service

    if [[ "$(hostname -s)" == "${MASTER}" ]]; then
      systemctl start "${DASK_SCHEDULER_SERVICE}"
      systemctl status "${DASK_SCHEDULER_SERVICE}"
    fi

    echo "Starting Dask 'standalone' cluster..."
    if [[ "${enable_worker_service}" == "1" ]]; then
      systemctl start "${DASK_WORKER_SERVICE}"
      systemctl status "${DASK_WORKER_SERVICE}"
    fi

    configure_knox_for_dask

    local cloud_logging
    cloud_logging="$(get_metadata_attribute dask-cloud-logging || echo 'false')"
    if [[ "${cloud_logging}" == "true" ]]; then
      configure_fluentd_for_dask
    fi
  else
    # "yarn" mode: Dask relies on a config.yaml file.
    configure_dask_yarn
  fi

  echo "Dask RAPIDS for ${DASK_RUNTIME} successfully initialized."
  case "${ROLE}" in
    Master)
      systemctl restart hadoop-yarn-resourcemanager.service
      # Restart NodeManager on Master as well if this is a single-node-cluster.
      if systemctl list-units | grep hadoop-yarn-nodemanager; then
        systemctl restart hadoop-yarn-nodemanager.service
      fi
      ;;
    *)
      systemctl restart hadoop-yarn-nodemanager.service
      ;;
  esac
}

# EXIT trap: releases caches and tmpfs mounts created by prepare_to_install,
# reports the peak disk usage seen during installation and, when an image is
# being created, zeroes the free disk space.
#
# Globals read: tmpdir (directory holding keep-running-df and disk-usage.log).
# Always returns 0; individual cleanup failures are tolerated (set +e).
function exit_handler() (
  set +e
  echo "Exit handler invoked"

  # Free conda cache
  /opt/conda/miniconda3/bin/conda clean -a > /dev/null 2>&1

  # Clear pip cache
  pip cache purge || echo "unable to purge pip cache"

  # remove the tmpfs conda pkgs_dirs
  if [[ -d /mnt/shm ]] ; then /opt/conda/miniconda3/bin/conda config --remove pkgs_dirs /mnt/shm ; fi

  # Process disk usage logs from installation period.
  # This has to happen BEFORE the tmpfs mounts are torn down: when tmpdir is
  # /mnt/shm the log lives on that tmpfs and would otherwise be deleted before
  # it is read.
  rm -f "${tmpdir}/keep-running-df"
  # give the monitoring loop (5s period) time to notice the flag is gone
  sleep 6s
  # compute maximum size of disk during installation
  # Log file contains logs like the following (minus the preceding #):
#Filesystem      Size  Used Avail Use% Mounted on
#/dev/vda2       6.8G  2.5G  4.0G  39% /
  df -h / | tee -a "${tmpdir}/disk-usage.log"
  # Numeric sort: the default string sort would rank "9" above "10".
  perl -e '$max=( sort { $a <=> $b }
                   map { (split)[2] =~ /^(\d+)/ }
                  grep { m:^/: } <STDIN> )[-1];
print( "maximum-disk-used: $max", $/ );' < "${tmpdir}/disk-usage.log"

  # Clean up shared memory mounts.  Loop so that stacked tmpfs mounts on the
  # same directory are all released.
  for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm ; do
    while grep -q "^tmpfs ${shmdir} " /proc/mounts ; do
      rm -rf "${shmdir:?}"/*
      umount -f "${shmdir}" || break
    done
  done

  # Clean up OS package cache
  if is_debuntu ; then
    apt-get -y -qq clean
    apt-get -y -qq autoremove
  else
    dnf clean all
  fi

  # print disk usage statistics
  if is_debuntu ; then
    # Rocky doesn't have sort -h and fails when the argument is passed
    du --max-depth 3 -hx / | sort -h | tail -10
  fi

  echo "exit_handler has completed"

  # zero free disk space
  if [[ -n "$(get_metadata_attribute creating-image)" ]]; then
    dd if=/dev/zero of=/zero ; sync ; rm -f /zero
  fi

  return 0
)

# Reads cluster metadata into (readonly) globals, optionally moves package
# caches onto tmpfs, installs the EXIT trap and starts a background disk-usage
# monitor.
#
# Globals written: CUDA_VERSION, ROLE, MASTER, RAPIDS_RUNTIME, RAPIDS_VERSION,
#   DASK_* service/env names, KNOX_* paths, enable_worker_service, free_mem,
#   tmpdir, install_log.
function prepare_to_install(){
  readonly DEFAULT_CUDA_VERSION="12.4"
  CUDA_VERSION=$(get_metadata_attribute 'cuda-version' "${DEFAULT_CUDA_VERSION}")
  readonly CUDA_VERSION

  readonly ROLE=$(get_metadata_attribute dataproc-role)
  readonly MASTER=$(get_metadata_attribute dataproc-master)

  # RAPIDS config
  RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'DASK')
  readonly RAPIDS_RUNTIME

  readonly DEFAULT_DASK_RAPIDS_VERSION="24.08"
  readonly RAPIDS_VERSION=$(get_metadata_attribute 'rapids-version' "${DEFAULT_DASK_RAPIDS_VERSION}")

  # Dask config
  DASK_RUNTIME="$(get_metadata_attribute dask-runtime || echo 'standalone')"
  readonly DASK_RUNTIME
  readonly DASK_SERVICE=dask-cluster
  readonly DASK_WORKER_SERVICE=dask-worker
  readonly DASK_SCHEDULER_SERVICE=dask-scheduler
  readonly DASK_CONDA_ENV="/opt/conda/miniconda3/envs/dask-rapids"

  # Knox config
  readonly KNOX_HOME=/usr/lib/knox
  readonly KNOX_DASK_DIR="${KNOX_HOME}/data/services/dask/0.1.0"
  readonly KNOX_DASKWS_DIR="${KNOX_HOME}/data/services/daskws/0.1.0"
  enable_worker_service="0"

  free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)"
  # Write to a ramdisk instead of churning the persistent disk
  if [[ ${free_mem} -ge 5250000 ]]; then
    tmpdir=/mnt/shm
    mkdir -p /mnt/shm
    # Mount exactly once: exit_handler unmounts each directory a single time,
    # so a second stacked mount would be left behind.
    mount -t tmpfs tmpfs /mnt/shm

    # Download conda packages to tmpfs
    /opt/conda/miniconda3/bin/conda config --add pkgs_dirs /mnt/shm

    # Download pip packages to tmpfs
    pip config set global.cache-dir /mnt/shm || echo "unable to set global.cache-dir"

    # Download OS packages to tmpfs
    if is_debuntu ; then
      mount -t tmpfs tmpfs /var/cache/apt/archives
    else
      mount -t tmpfs tmpfs /var/cache/dnf
    fi
  else
    tmpdir=/tmp
  fi
  install_log="${tmpdir}/install.log"
  trap exit_handler EXIT

  # Monitor disk usage in a screen session
  if is_debuntu ; then
      apt-get install -y -qq screen
  else
      dnf -y -q install screen
  fi
  df -h / | tee "${tmpdir}/disk-usage.log"
  # The loop below keeps sampling until exit_handler removes this flag file.
  touch "${tmpdir}/keep-running-df"
  screen -d -m -US keep-running-df \
    bash -c "while [[ -f ${tmpdir}/keep-running-df ]] ; do df -h / | tee -a ${tmpdir}/disk-usage.log ; sleep 5s ; done"
}

# Script entry point: gather configuration, set up caches and the EXIT trap,
# then run the installation.
prepare_to_install

main
====== Filename: ./examples/secure-boot/ai-notebooks.sh ======
#!/bin/bash
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This script creates a custom image pre-loaded with cuda

set -ex

# Build configuration is read from env.json in the current directory.
# Assignment is kept separate from `export` so that a failing jq (missing or
# malformed env.json) is not masked by export's own exit status and `set -e`
# can stop the script.
PROJECT_ID="$(jq    -r .PROJECT_ID    env.json)"
PURPOSE="$(jq       -r .PURPOSE       env.json)"
BUCKET="$(jq        -r .BUCKET        env.json)"
IMAGE_VERSION="$(jq -r .IMAGE_VERSION env.json)"
ZONE="$(jq          -r .ZONE          env.json)"
export PROJECT_ID PURPOSE BUCKET IMAGE_VERSION ZONE

custom_image_zone="${ZONE}"
disk_size_gb="50" # greater than or equal to 30

# Service account used by the image-building VM
SA_NAME="sa-${PURPOSE}"
GSA="${SA_NAME}@${PROJECT_ID}.iam.gserviceaccount.com"

gcloud config set project "${PROJECT_ID}"

gcloud auth login

# Keep any previous key material by moving it aside, then generate a key pair.
# create-key-pair.sh prints shell assignments (the secret names, project and
# version used below) which are evaluated into this shell.
if [[ -d tls ]] ; then mv tls "tls-$(date +%s)" ; fi
eval "$(bash examples/secure-boot/create-key-pair.sh)"

# Metadata passed to the image-building instance
metadata="public_secret_name=${public_secret_name}"
metadata="${metadata},private_secret_name=${private_secret_name}"
metadata="${metadata},secret_project=${secret_project}"
metadata="${metadata},secret_version=${secret_version}"

# Create the pre-init customization service account when the listing reports
# that no account with this e-mail exists yet.
if gcloud iam service-accounts list --filter email="${GSA}" 2>&1 | grep 'Listed 0 items.' ; then
  echo "creating pre-init customization service account ${GSA}"
  gcloud iam service-accounts create "${SA_NAME}" \
    --description="Service account for pre-init customization" \
    --display-name="${SA_NAME}"
fi

sa_member="serviceAccount:${GSA}"

# Bucket access for the service account
gcloud storage buckets add-iam-policy-binding "gs://${BUCKET}" \
  --member="${sa_member}" \
  --role="roles/storage.objectViewer"

# Permission to list secrets for the project
gcloud projects add-iam-policy-binding "${PROJECT_ID}" \
  --member="${sa_member}" \
  --role="roles/secretmanager.viewer"

# Permission to read the private secret first, then the public one
for secret_name in "${private_secret_name}" "${public_secret_name}" ; do
  gcloud secrets add-iam-policy-binding "${secret_name}" \
    --member="${sa_member}" \
    --role="roles/secretmanager.secretAccessor"
done

# If no OS family specified, default to debian
if [[ "${IMAGE_VERSION}" != *-* ]] ; then
  case "${IMAGE_VERSION}" in
    "2.2" ) dataproc_version="${IMAGE_VERSION}-debian12" ;;
    "2.1" ) dataproc_version="${IMAGE_VERSION}-debian11" ;;
    "2.0" ) dataproc_version="${IMAGE_VERSION}-debian10" ;;
    * )
      # Without this arm dataproc_version would stay unset and the build
      # would be started with an empty --dataproc-version.
      echo "Unsupported IMAGE_VERSION '${IMAGE_VERSION}': expected 2.0, 2.1, 2.2 or <version>-<os>" >&2
      exit 1
      ;;
  esac
else
  dataproc_version="${IMAGE_VERSION}"
fi

#dataproc_version="${IMAGE_VERSION}-ubuntu22"
#dataproc_version="${IMAGE_VERSION}-rocky9"
#customization_script="examples/secure-boot/install-nvidia-driver-debian11.sh"
#customization_script="examples/secure-boot/install-nvidia-driver-debian12.sh"
customization_script="examples/secure-boot/install_gpu_driver.sh"
#echo "#!/bin/bash\necho no op" | dd of=empty.sh
#customization_script=empty.sh
#image_name="nvidia-open-kernel-2.2-ubuntu22-$(date +%F)"
#image_name="nvidia-open-kernel-2.2-rocky9-$(date +%F)"
#image_name="nvidia-open-kernel-2.2-debian12-$(date +%F)"
#image_name="nvidia-open-kernel-${dataproc_version}-$(date +%F)"
# Image names may not contain dots: replace the first "." of the version with
# "-" and append a minute-resolution timestamp.
image_name="cuda-${dataproc_version/\./-}-$(date +%F-%H-%M)"

# Revokes the temporary IAM grants made for the image build.  Registered as an
# EXIT trap so the grants are also withdrawn when the build fails (under
# `set -e` a failing build would otherwise skip the revocation entirely).
# Each revocation is attempted independently; failures only produce a warning.
function revoke_temporary_access() {
  set +x
  # Revoke permission to access the private secret
  gcloud secrets remove-iam-policy-binding "${private_secret_name}" \
    --member="serviceAccount:${GSA}" \
    --role="roles/secretmanager.secretAccessor" > /dev/null 2>&1 \
    || echo "warning: unable to revoke access to the private secret" >&2

  # Revoke access to bucket
  gcloud storage buckets remove-iam-policy-binding "gs://${BUCKET}" \
    --member="serviceAccount:${GSA}" \
    --role="roles/storage.objectViewer" > /dev/null 2>&1 \
    || echo "warning: unable to revoke access to gs://${BUCKET}" >&2

  # Revoke access to list secrets for the project
  gcloud projects remove-iam-policy-binding "${PROJECT_ID}" \
    --member="serviceAccount:${GSA}" \
    --role="roles/secretmanager.viewer" > /dev/null 2>&1 \
    || echo "warning: unable to revoke secretmanager.viewer on ${PROJECT_ID}" >&2
}
trap revoke_temporary_access EXIT

# Build the custom image
python generate_custom_image.py \
    --accelerator "type=nvidia-tesla-t4" \
    --image-name "${image_name}" \
    --dataproc-version "${dataproc_version}" \
    --trusted-cert "tls/db.der" \
    --customization-script "${customization_script}" \
    --service-account "${GSA}" \
    --metadata "${metadata}" \
    --zone "${custom_image_zone}" \
    --disk-size "${disk_size_gb}" \
    --no-smoke-test \
    --gcs-bucket "${BUCKET}" \
    --shutdown-instance-timer-sec=30
set +x
====== Filename: ./examples/secure-boot/mig.sh ======
#!/bin/bash
#

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This script installs NVIDIA GPU drivers and enables MIG on Ampere GPU architectures.
#
# This script should be specified in --metadata=startup-script-url= option and
# --metadata=ENABLE_MIG can be used to enable or disable MIG. The default is to enable it.
# The script does a reboot to fully enable MIG and then configures the MIG device based on the
# user specified MIG_CGI profiles specified via: --metadata=^:^MIG_CGI='9,9'. If MIG_CGI
# is not specified it assumes it's using an A100 and configures 2 instances with profile id 9.
# It is assumed this script is used in conjunction with install_gpu_driver.sh, which does the
# YARN setup to fully utilize the MIG instances on YARN.
#
# This initialization action is generated from
# initialization-actions/templates/spark-rapids/mig.sh.in
#
# Modifications made directly to the generated file will be lost when
# the template is re-evaluated


# Abort on command failure (-e), on use of unset variables (-u) and on a
# failure anywhere in a pipeline (pipefail); trace commands as they run (-x).
set -euxo pipefail

# Print the ID field of /etc/os-release (xargs trims quotes and whitespace).
function os_id() (
  set +x
  grep '^ID=' /etc/os-release \
    | cut -d= -f2 \
    | xargs
)
# Print the VERSION_ID field of /etc/os-release.
function os_version() (
  set +x
  grep '^VERSION_ID=' /etc/os-release \
    | cut -d= -f2 \
    | xargs
)
# Print the VERSION_CODENAME field of /etc/os-release.
function os_codename() (
  set +x
  grep '^VERSION_CODENAME=' /etc/os-release \
    | cut -d= -f2 \
    | xargs
)

# Version comparison helpers built on `sort -V`.
# Each returns 0 when the relation "$1 <op> $2" holds and non-zero otherwise.

# $1 >= $2 : $1 is the last entry after a version sort
function version_ge(){
  [[ "$(echo -e "$1\n$2"|sort -V|tail -n1)" = "$1" ]]
}
# $1 > $2 : not equal, and $1 >= $2
function version_gt(){
  if [[ "$1" = "$2" ]] ; then return 1 ; fi
  version_ge "$1" "$2"
}
# $1 <= $2 : $1 is the first entry after a version sort
function version_le(){
  [[ "$(echo -e "$1\n$2"|sort -V|head -n1)" = "$1" ]]
}
# $1 < $2 : not equal, and $1 <= $2
function version_lt(){
  if [[ "$1" = "$2" ]] ; then return 1 ; fi
  version_le "$1" "$2"
}

# OS families handled by this script, each mapped to a space-separated list of
# supported release versions.
readonly -A supported_os=(
  ['debian']="10 11 12"
  ['rocky']="8 9"
  ['ubuntu']="18.04 20.04 22.04"
)

# dynamically define OS version test utility functions
#
# On rocky everything from the first non-digit of VERSION_ID onward is dropped,
# so the value compares against the bare major versions listed above.
if [[ "$(os_id)" == "rocky" ]];
then _os_version=$(os_version | sed -e 's/[^0-9].*$//g')
else _os_version="$(os_version)"; fi
# For every family <os> an is_<os> predicate is generated, and for every
# supported version is_<os><N>, ge_<os><N> and le_<os><N>, where <N> is the
# version up to its first dot (e.g. is_debian12, ge_ubuntu20, le_rocky8).
# $(os_id) and ${_os_version} are expanded while the eval string is built, so
# the generated functions compare against the values captured here.
for os_id_val in 'rocky' 'ubuntu' 'debian' ; do
  eval "function is_${os_id_val}() ( set +x ;  [[ \"$(os_id)\" == '${os_id_val}' ]] ; )"

  for osver in $(echo "${supported_os["${os_id_val}"]}") ; do
    eval "function is_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && [[ \"${_os_version}\" == \"${osver}\" ]] ; )"
    eval "function ge_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_ge \"${_os_version}\" \"${osver}\" ; )"
    eval "function le_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_le \"${_os_version}\" \"${osver}\" ; )"
  done
done

# Succeeds when the OS is Debian or Ubuntu.
function is_debuntu() (
  set +x
  if is_debian ; then return 0 ; fi
  is_ubuntu
)

# Print the OS version in the compact form used to build repository names:
#   ubuntu -> every non-digit removed
#   rocky  -> everything from the first non-digit onward removed
#   other  -> the version unchanged
function os_vercat() (
  set +x
  if is_ubuntu ; then
    os_version | sed -e 's/[^0-9]//g'
    return
  fi
  if is_rocky ; then
    os_version | sed -e 's/[^0-9].*$//g'
    return
  fi
  os_version
)

# Point apt at archive.debian.org for the oldoldstable backports repository.
#
# This script uses 'apt-get update' and is therefore potentially dependent on
# backports repositories which have been archived.
# https://github.com/GoogleCloudDataproc/initialization-actions/issues/1157
#
# No-op on anything other than Debian/Ubuntu.
function repair_old_backports {
  is_debuntu || return 0

  # Resolve the codenames currently behind the oldoldstable/oldstable/stable aliases
  debdists="https://deb.debian.org/debian/dists"
  oldoldstable=$(curl -s "${debdists}/oldoldstable/Release" | awk '/^Codename/ {print $2}');
  oldstable=$(   curl -s "${debdists}/oldstable/Release"    | awk '/^Codename/ {print $2}');
  stable=$(      curl -s "${debdists}/stable/Release"       | awk '/^Codename/ {print $2}');

  # apt source files that mention a backports repository
  matched_files=( $(test -d /etc/apt && grep -rsil '\-backports' /etc/apt/sources.list*||:) )

  # Fetch from archive.debian.org for ${oldoldstable}-backports
  local -r rewrite_expr="s{^(deb[^\s]*) https?://[^/]+/debian ${oldoldstable}-backports }
                  {\$1 https://archive.debian.org/debian ${oldoldstable}-backports }g"
  for filename in "${matched_files[@]}"; do
    perl -pi -e "${rewrite_expr}" "${filename}"
  done
}

# Fetch a metadata server URL and print its body to stdout.
#
# Arguments: $1 - full metadata URL
# Outputs:   the response body, only when curl succeeded with HTTP 200
# Returns:   curl's exit status (1 if no temporary file could be created)
function print_metadata_value() {
  # `local readonly x=...` would declare a variable literally named
  # "readonly"; plain locals are used instead.
  local tmpfile http_code
  local return_code=0
  tmpfile="$(mktemp)" || return 1
  # `|| return_code=$?` keeps errexit from aborting before the temporary file
  # is cleaned up.
  http_code="$(curl -f "${1}" -H "Metadata-Flavor: Google" -w "%{http_code}" \
    -s -o "${tmpfile}" 2>/dev/null)" || return_code=$?
  # If the command completed successfully, print the metadata value to stdout.
  if [[ "${return_code}" == 0 && "${http_code}" == 200 ]]; then
    cat "${tmpfile}"
  fi
  rm -f "${tmpfile}"
  return "${return_code}"
}

# Thin wrapper around print_metadata_value.
#
# Arguments: $1 - full metadata URL
# Outputs:   whatever print_metadata_value prints
# Returns:   print_metadata_value's exit status
function print_metadata_value_if_exists() {
  local -r url="$1"
  # Quoted so the URL always reaches the callee as a single argument.
  print_metadata_value "${url}"
}

# replicates /usr/share/google/get_metadata_value
# Print a metadata value, looking at the instance first and falling back to
# the project.
#
# Arguments: $1 - metadata path below instance/ or project/
#                 (for example "attributes/foo" or "zone")
# Returns:   0 when either lookup succeeded, otherwise the status of the
#            project lookup
function get_metadata_value() (
  set +x
  local -r varname="$1"
  local -r MDS_PREFIX=http://metadata.google.internal/computeMetadata/v1
  local return_code=0
  # Print the instance metadata value.  The status is captured with `||` so
  # that errexit cannot end the subshell before the project fallback runs.
  print_metadata_value_if_exists "${MDS_PREFIX}/instance/${varname}" || return_code=$?
  # If the instance doesn't have the value, try the project.
  if [[ "${return_code}" != 0 ]]; then
    return_code=0
    print_metadata_value_if_exists "${MDS_PREFIX}/project/${varname}" || return_code=$?
  fi

  return "${return_code}"
)

# Print the metadata attribute named $1; when it cannot be read, print the
# default given as $2 (empty when omitted) without a trailing newline.
function get_metadata_attribute() (
  set +x
  local -r attribute_name="$1"
  local -r default_value="${2:-}"
  if ! get_metadata_value "attributes/${attribute_name}" ; then
    echo -n "${default_value}"
  fi
)

# Run a command line up to three times, five seconds apart, until it succeeds.
#
# Arguments: the command and its arguments; they are joined into one string
#            and run through eval
# Globals:   install_log (read) - file receiving the command's output; it is
#            printed when an attempt fails
# Returns:   0 as soon as one attempt succeeds, 1 after three failures
function execute_with_retries() (
  set +x
  local -r cmd="$*"
  local attempt retval

  # Free package cache space before installing.  A glob prefix match is used:
  # the earlier quoted regex "^apt-get install" was compared literally
  # (including the caret) and therefore never matched.
  if [[ "${cmd}" == "apt-get install"* ]] ; then
    apt-get -y clean
    apt-get -o DPkg::Lock::Timeout=60 -y autoremove
  fi
  for ((attempt = 0; attempt < 3; attempt++)); do
    set -x
    time eval "$cmd" > "${install_log}" 2>&1 && retval=$? || { retval=$? ; cat "${install_log}" ; }
    set +x
    if [[ $retval == 0 ]] ; then return 0 ; fi
    sleep 5
  done
  return 1
)

# Obtain a package file, using a GCS object as a cache.
#
# Arguments: $1 - source URL to download from on a cache miss
#            $2 - GCS object used as the cache
#            $3 - local destination file
function cache_fetched_package() {
  local src_url="$1"
  local gcs_fn="$2"
  local local_fn="$3"

  if ! gsutil ls "${gcs_fn}" 2>&1 | grep -q "${gcs_fn}" ; then
    # Cache miss: download from the source, then populate the cache.
    time ( curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 "${src_url}" -o "${local_fn}" && \
           gcloud storage cp "${local_fn}" "${gcs_fn}" ; )
    return
  fi

  # Cache hit: copy the object from GCS.
  time gcloud storage cp "${gcs_fn}" "${local_fn}"
}

# Enable the "contrib" component in the Debian apt sources.
# Does nothing when neither ge_debian12 nor is_debian holds.
function add_contrib_component() {
  if ge_debian12 ; then
    # Include in sources file components on which nvidia-kernel-open-dkms depends
    sed -i -e 's/Components: .*$/Components: main contrib/' /etc/apt/sources.list.d/debian.sources
    return
  fi

  if is_debian ; then
    sed -i -e 's/ main$/ main contrib/' /etc/apt/sources.list
  fi
}

# Set one property in a Hadoop configuration file via the ${bdcfg} tool,
# overwriting any existing value (--clobber).
#
# Arguments: $1 - file name below ${HADOOP_CONF_DIR}
#            $2 - property name
#            $3 - property value
function set_hadoop_property() {
  local -r config_file=$1
  local -r property=$2
  local -r value=$3
  local -a bdcfg_args=(
    set_property
    --configuration_file "${HADOOP_CONF_DIR}/${config_file}"
    --name "${property}"
    --value "${value}"
    --clobber
  )
  "${bdcfg}" "${bdcfg_args[@]}"
}

# Register yarn.io/gpu as a YARN resource type and switch the capacity
# scheduler to the DominantResourceCalculator.
# Returns 0 without doing anything when ${HADOOP_CONF_DIR} does not exist.
function configure_yarn_resources() {
  # pre-init scripts
  [[ -d "${HADOOP_CONF_DIR}" ]] || return 0

  local -r resource_types_xml="${HADOOP_CONF_DIR}/resource-types.xml"
  if [[ ! -f "${resource_types_xml}" ]]; then
    # start from an empty configuration document
    printf '<?xml version="1.0" ?>\n<configuration/>' >"${resource_types_xml}"
  fi

  set_hadoop_property 'resource-types.xml' 'yarn.resource-types' 'yarn.io/gpu'

  set_hadoop_property 'capacity-scheduler.xml' \
    'yarn.scheduler.capacity.resource-calculator' \
    'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator'

  set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu'
}

# This configuration should be applied only if GPU is attached to the node
# Configure the YARN NodeManager for GPU scheduling (resource plugin, cgroups,
# LinuxContainerExecutor) and hand the NodeManager local dirs to yarn:yarn.
#
# Globals read: NVIDIA_SMI_PATH, HADOOP_CONF_DIR, bdcfg
function configure_yarn_nodemanager() {
  set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu'
  set_hadoop_property 'yarn-site.xml' \
    'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto'
  # Quoted so the path is always passed as exactly one argument, even when it
  # contains whitespace or is empty.
  set_hadoop_property 'yarn-site.xml' \
    'yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables' "${NVIDIA_SMI_PATH}"
  set_hadoop_property 'yarn-site.xml' \
    'yarn.nodemanager.linux-container-executor.cgroups.mount' 'true'
  set_hadoop_property 'yarn-site.xml' \
    'yarn.nodemanager.linux-container-executor.cgroups.mount-path' '/sys/fs/cgroup'
  set_hadoop_property 'yarn-site.xml' \
    'yarn.nodemanager.linux-container-executor.cgroups.hierarchy' 'yarn'
  set_hadoop_property 'yarn-site.xml' \
    'yarn.nodemanager.container-executor.class' \
    'org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor'
  set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.linux-container-executor.group' 'yarn'

  # Fix local dirs access permissions
  local yarn_local_dirs=()

  # The property is a comma-separated list.  readarray keeps the trailing ","
  # on each element; it is stripped again in the chown call below.
  readarray -d ',' yarn_local_dirs < <("${bdcfg}" get_property_value \
    --configuration_file "${HADOOP_CONF_DIR}/yarn-site.xml" \
    --name "yarn.nodemanager.local-dirs" 2>/dev/null | tr -d '\n')

  # A value of "None" is skipped, as is an empty list.
  if [[ "${#yarn_local_dirs[@]}" -ne "0" && "${yarn_local_dirs[@]}" != "None" ]]; then
    chown yarn:yarn -R "${yarn_local_dirs[@]/,/}"
  fi
}

# Rewrite apt repository definitions so that each repository is verified with
# its own keyring under /usr/share/keyrings ([signed-by=...]), and finally move
# the legacy /etc/apt/trusted.gpg out of the way.
#
# Sections: bigtop, adoptium, docker, google cloud, cran-r, mysql.
# The adoptium and docker list files are (re)written unconditionally; the
# other sections only act when the corresponding list file exists.
function clean_up_sources_lists() {
  #
  # bigtop (primary)
  #
  local -r dataproc_repo_file="/etc/apt/sources.list.d/dataproc.list"

  # Only touch the file when it has not been converted to signed-by yet
  if [[ -f "${dataproc_repo_file}" ]] && ! grep -q signed-by "${dataproc_repo_file}" ; then
    # Derive the region from the zone metadata: strip the path prefix and the
    # trailing "-<letters>" zone suffix.
    region="$(get_metadata_value zone | perl -p -e 's:.*/:: ; s:-[a-z]+$::')"

    # URI of the regional bigtop repo: first matching "deb" line, second field
    local regional_bigtop_repo_uri
    regional_bigtop_repo_uri=$(cat ${dataproc_repo_file} |
      sed "s#/dataproc-bigtop-repo/#/goog-dataproc-bigtop-repo-${region}/#" |
      grep "deb .*goog-dataproc-bigtop-repo-${region}.* dataproc contrib" |
      cut -d ' ' -f 2 |
      head -1)

    # Avoid a double slash when the repo URI already ends with "/"
    if [[ "${regional_bigtop_repo_uri}" == */ ]]; then
      local -r bigtop_key_uri="${regional_bigtop_repo_uri}archive.key"
    else
      local -r bigtop_key_uri="${regional_bigtop_repo_uri}/archive.key"
    fi

    # Download the repo key and store it de-armored as a keyring
    local -r bigtop_kr_path="/usr/share/keyrings/bigtop-keyring.gpg"
    rm -f "${bigtop_kr_path}"
    curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 \
      "${bigtop_key_uri}" | gpg --dearmor -o "${bigtop_kr_path}"

    sed -i -e "s:deb https:deb [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}"
    sed -i -e "s:deb-src https:deb-src [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}"
  fi

  #
  # adoptium
  #
  # https://adoptium.net/installation/linux/#_deb_installation_on_debian_or_ubuntu
  local -r key_url="https://packages.adoptium.net/artifactory/api/gpg/key/public"
  local -r adoptium_kr_path="/usr/share/keyrings/adoptium.gpg"
  rm -f "${adoptium_kr_path}"
  curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${key_url}" \
   | gpg --dearmor -o "${adoptium_kr_path}"
  echo "deb [signed-by=${adoptium_kr_path}] https://packages.adoptium.net/artifactory/deb/ $(os_codename) main" \
   > /etc/apt/sources.list.d/adoptium.list


  #
  # docker
  #
  local docker_kr_path="/usr/share/keyrings/docker-keyring.gpg"
  local docker_repo_file="/etc/apt/sources.list.d/docker.list"
  local -r docker_key_url="https://download.docker.com/linux/$(os_id)/gpg"

  rm -f "${docker_kr_path}"
  curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${docker_key_url}" \
    | gpg --dearmor -o "${docker_kr_path}"
  echo "deb [signed-by=${docker_kr_path}] https://download.docker.com/linux/$(os_id) $(os_codename) stable" \
    > ${docker_repo_file}

  #
  # google cloud + logging/monitoring
  #
  # Only when at least one google-cloud*.list file is present
  if ls /etc/apt/sources.list.d/google-cloud*.list ; then
    rm -f /usr/share/keyrings/cloud.google.gpg
    curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg
    for list in google-cloud google-cloud-logging google-cloud-monitoring ; do
      list_file="/etc/apt/sources.list.d/${list}.list"
      if [[ -f "${list_file}" ]]; then
        sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https:g' "${list_file}"
      fi
    done
  fi

  #
  # cran-r
  #
  if [[ -f /etc/apt/sources.list.d/cran-r.list ]]; then
    # ubuntu18 uses a different signing key id than the other releases
    keyid="0x95c0faf38db3ccad0c080a7bdc78b2ddeabc47b7"
    if is_ubuntu18 ; then keyid="0x51716619E084DAB9"; fi
    rm -f /usr/share/keyrings/cran-r.gpg
    curl "https://keyserver.ubuntu.com/pks/lookup?op=get&search=${keyid}" | \
      gpg --dearmor -o /usr/share/keyrings/cran-r.gpg
    sed -i -e 's:deb http:deb [signed-by=/usr/share/keyrings/cran-r.gpg] http:g' /etc/apt/sources.list.d/cran-r.list
  fi

  #
  # mysql
  #
  if [[ -f /etc/apt/sources.list.d/mysql.list ]]; then
    rm -f /usr/share/keyrings/mysql.gpg
    curl 'https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xBCA43417C3B485DD128EC6D4B7B3B788A8D3785C' | \
      gpg --dearmor -o /usr/share/keyrings/mysql.gpg
    sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/mysql.gpg] https:g' /etc/apt/sources.list.d/mysql.list
  fi

  # Retire the legacy global keyring by renaming it
  if [[ -f /etc/apt/trusted.gpg ]] ; then mv /etc/apt/trusted.gpg /etc/apt/old-trusted.gpg ; fi

}

# Export proxy settings taken from the "http-proxy" metadata attribute.
#
# When the attribute is empty nothing is changed.  Otherwise the proxy is
# exported under both lower- and upper-case variable names, together with a
# no-proxy list covering localhost, the metadata server and a set of
# *.googleapis.com endpoints.
function set_proxy(){
  METADATA_HTTP_PROXY="$(get_metadata_attribute http-proxy '')"

  if [[ -z "${METADATA_HTTP_PROXY}" ]] ; then return ; fi

  export METADATA_HTTP_PROXY
  export http_proxy="${METADATA_HTTP_PROXY}"
  export https_proxy="${METADATA_HTTP_PROXY}"
  export HTTP_PROXY="${METADATA_HTTP_PROXY}"
  export HTTPS_PROXY="${METADATA_HTTP_PROXY}"

  local no_proxy_list="localhost,127.0.0.0/8,::1,metadata.google.internal,169.254.169.254"
  local no_proxy_svc
  for no_proxy_svc in compute  secretmanager dns    servicedirectory     logging  \
                      bigquery composer      pubsub bigquerydatatransfer dataflow \
                      storage  datafusion    ; do
    no_proxy_list="${no_proxy_list},${no_proxy_svc}.googleapis.com"
  done

  # Export both spellings, mirroring http_proxy/HTTP_PROXY above: child
  # processes that only consult the lower-case name would otherwise never see
  # the exclusion list.
  export no_proxy="${no_proxy_list}"
  export NO_PROXY="${no_proxy_list}"
}

# When the MemFree value reported by /proc/meminfo is at least 10500000,
# mount tmpfs file systems and point the conda, pip and OS package caches at
# them, so downloads do not churn the persistent disk.  Otherwise do nothing.
#
# Globals written: tmpdir (set to /mnt/shm when the ramdisk is mounted)
function mount_ramdisk(){
  local mem_free
  mem_free="$(awk '/^MemFree/ {print $2}' /proc/meminfo)"

  if [[ ${mem_free} -ge 10500000 ]]; then
    # Write to a ramdisk instead of churning the persistent disk
    tmpdir="/mnt/shm"
    mkdir -p "${tmpdir}"
    mount -t tmpfs tmpfs "${tmpdir}"

    # conda packages are downloaded to tmpfs
    /opt/conda/miniconda3/bin/conda config --add pkgs_dirs "${tmpdir}"

    # Clear pip cache
    # TODO: make this conditional on which OSs have pip without cache purge
    pip cache purge || echo "unable to purge pip cache"

    # pip packages are downloaded to tmpfs
    pip config set global.cache-dir "${tmpdir}" || echo "unable to set global.cache-dir"

    # OS packages are downloaded to tmpfs
    if is_debuntu ; then
      mount -t tmpfs tmpfs /var/cache/apt/archives
    else
      mount -t tmpfs tmpfs /var/cache/dnf
    fi
  fi
}

# Validate the platform and derive version information.
#
# Exits with status 1 when the OS release or the Spark version is unsupported,
# or when no Dataproc image version can be derived.
# Globals written: SPARK_VERSION (readonly), DATAPROC_IMAGE_VERSION (only when
# not already set).
function check_os() {
  if is_debian && ! { is_debian10 || is_debian11 || is_debian12 ; } ; then
      echo "Error: The Debian version ($(os_version)) is not supported. Please use a compatible Debian version."
      exit 1
  elif is_ubuntu && ! { is_ubuntu18 || is_ubuntu20 || is_ubuntu22 ; } ; then
      echo "Error: The Ubuntu version ($(os_version)) is not supported. Please use a compatible Ubuntu version."
      exit 1
  elif is_rocky && ! { is_rocky8 || is_rocky9 ; } ; then
      echo "Error: The Rocky Linux version ($(os_version)) is not supported. Please use a compatible Rocky Linux version."
      exit 1
  fi

  # major.minor taken from the first "version N.N" line printed by spark-submit
  SPARK_VERSION="$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)"
  readonly SPARK_VERSION
  # Supported range is 3.1 <= version < 4.0
  if version_lt "${SPARK_VERSION}" "3.1" || version_ge "${SPARK_VERSION}" "4.0" ; then
    echo "Error: Your Spark version is not supported. Please upgrade Spark to one of the supported versions."
    exit 1
  fi

  # Detect dataproc image version: an existing value wins, then
  # DATAPROC_VERSION, then a guess based on the Spark version.
  if [[ -v DATAPROC_IMAGE_VERSION ]] ; then return 0 ; fi

  if [[ -v DATAPROC_VERSION ]] ; then
    DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}"
    return 0
  fi

  if version_lt "${SPARK_VERSION}" "3.2" ; then
    DATAPROC_IMAGE_VERSION="2.0"
  elif version_lt "${SPARK_VERSION}" "3.4" ; then
    DATAPROC_IMAGE_VERSION="2.1"
  elif version_lt "${SPARK_VERSION}" "3.6" ; then
    DATAPROC_IMAGE_VERSION="2.2"
  else
    echo "Unknown dataproc image version"
    exit 1
  fi
}

# OS id followed by the leading run of digits of the OS version; anything
# after those digits is dropped by the perl substitution.
readonly _shortname="$(os_id)$(os_version|perl -pe 's/(\d+).*/$1/')"

# Dataproc configurations
# Directories holding the Hadoop, Hive and Spark configuration files
readonly HADOOP_CONF_DIR='/etc/hadoop/conf'
readonly HIVE_CONF_DIR='/etc/hive/conf'
readonly SPARK_CONF_DIR='/etc/spark/conf'


# Define the readonly lookup tables that map a CUDA version to the matching
# driver, cuDNN and NCCL versions, plus the full sub-version tables for
# drivers and CUDA.
#
# Globals written (readonly associative arrays): DRIVER_FOR_CUDA,
#   DRIVER_SUBVER, CUDNN_FOR_CUDA (only on debian/ubuntu or rocky),
#   NCCL_FOR_CUDA, CUDA_SUBVER.
#
# The tables are static; no network access is performed.  (An earlier fetch of
# NVIDIA's latest.txt stored its result in an unused local, and a failed
# download could abort the whole script under `set -e`/pipefail.)
function set_support_matrix() {
  # CUDA version and Driver version
  # https://docs.nvidia.com/deploy/cuda-compatibility/
  # https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html
  # https://developer.nvidia.com/cuda-downloads

  # Minimum supported version for open kernel driver is 515.43.04
  # https://github.com/NVIDIA/open-gpu-kernel-modules/tags
  # Rocky8: 12.0: 525.147.05
  readonly -A DRIVER_FOR_CUDA=(
          ["11.7"]="515.65.01"   ["11.8"]="525.147.05"
          ["12.0"]="525.147.05"  ["12.1"]="530.30.02" ["12.4"]="550.135"    ["12.5"]="555.42.02"  ["12.6"]="560.35.03"
  )
  # Driver major version -> full driver version used for that major version
  readonly -A DRIVER_SUBVER=(
          ["515"]="515.48.07"   ["520"]="525.147.05" ["525"]="525.147.05"  ["530"]="530.41.03"   ["535"]="535.216.01"
          ["545"]="545.29.06"   ["550"]="550.135"    ["555"]="555.58.02"   ["560"]="560.35.03"   ["565"]="565.57.01"
  )
  # https://developer.nvidia.com/cudnn-downloads
  if is_debuntu ; then
    readonly -A CUDNN_FOR_CUDA=(
          ["11.7"]="9.5.1.17"   ["11.8"]="9.5.1.17"
          ["12.0"]="9.5.1.17"   ["12.1"]="9.5.1.17"  ["12.4"]="9.5.1.17"   ["12.5"]="9.5.1.17"   ["12.6"]="9.5.1.17"
    )
  elif is_rocky ; then
    # rocky:
    #   12.0: 8.8.1.3
    #   12.1: 8.9.3.28
    #   12.2: 8.9.7.29
    #   12.3: 9.0.0.312
    #   12.4: 9.1.1.17
    #   12.5: 9.2.1.18
    #   12.6: 9.5.1.17
    readonly -A CUDNN_FOR_CUDA=(
          ["11.7"]="8.9.7.29"   ["11.8"]="9.5.1.17"
          ["12.0"]="8.8.1.3"    ["12.1"]="8.9.3.28"  ["12.4"]="9.1.1.17"   ["12.5"]="9.2.1.18"   ["12.6"]="9.5.1.17"
    )
  fi
  # https://developer.nvidia.com/nccl/nccl-download
  # 12.2: 2.19.3, 12.5: 2.21.5
  readonly -A NCCL_FOR_CUDA=(
          ["11.7"]="2.21.5"     ["11.8"]="2.21.5"
          ["12.0"]="2.16.5"     ["12.1"]="2.18.3"    ["12.4"]="2.23.4"     ["12.5"]="2.21.5"     ["12.6"]="2.23.4"
  )
  # CUDA major.minor -> full CUDA version
  readonly -A CUDA_SUBVER=(
          ["11.7"]="11.7.1"     ["11.8"]="11.8.0"
          ["12.0"]="12.0.1"     ["12.1"]="12.1.1"    ["12.2"]="12.2.2"     ["12.3"]="12.3.2"     ["12.4"]="12.4.1"     ["12.5"]="12.5.1"     ["12.6"]="12.6.2"
  )
}

# Populate the version lookup tables used by the functions below.
set_support_matrix

function set_cuda_version() {
  local cuda_url
  cuda_url=$(get_metadata_attribute 'cuda-url' '')
  if [[ -n "${cuda_url}" ]] ; then
    # if cuda-url metadata variable has been passed, extract default version from url
    local CUDA_URL_VERSION
    CUDA_URL_VERSION="$(echo "${cuda_url}" | perl -pe 's{^.*/cuda_(\d+\.\d+\.\d+)_\d+\.\d+\.\d+_linux.run$}{$1}')"
    if [[ "${CUDA_URL_VERSION}" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]] ; then
      DEFAULT_CUDA_VERSION="${CUDA_URL_VERSION%.*}"
      CUDA_FULL_VERSION="${CUDA_URL_VERSION}"
    fi
  fi

  if ( ! test -v DEFAULT_CUDA_VERSION ) ; then
    DEFAULT_CUDA_VERSION='12.4'
  fi
  readonly DEFAULT_CUDA_VERSION

  CUDA_VERSION=$(get_metadata_attribute 'cuda-version' "${DEFAULT_CUDA_VERSION}")
  readonly CUDA_VERSION
  if ( ! test -v CUDA_FULL_VERSION ) ; then
    CUDA_FULL_VERSION=${CUDA_SUBVER["${CUDA_VERSION}"]}
  fi
  readonly CUDA_FULL_VERSION

}

# Fix CUDA_VERSION / CUDA_FULL_VERSION before anything that depends on them.
set_cuda_version

# Predicates on the major component of ${CUDA_VERSION} (the part before the
# first dot).  is_ tests equality, le_/ge_ compare with version_le/version_ge.
function is_cuda12() (
  set +x
  [[ "${CUDA_VERSION%%.*}" == "12" ]]
)
function le_cuda12() (
  set +x
  version_le "${CUDA_VERSION%%.*}" "12"
)
function ge_cuda12() (
  set +x
  version_ge "${CUDA_VERSION%%.*}" "12"
)

function is_cuda11() (
  set +x
  [[ "${CUDA_VERSION%%.*}" == "11" ]]
)
function le_cuda11() (
  set +x
  version_le "${CUDA_VERSION%%.*}" "11"
)
function ge_cuda11() (
  set +x
  version_ge "${CUDA_VERSION%%.*}" "11"
)

function set_driver_version() {
  local gpu_driver_url
  gpu_driver_url=$(get_metadata_attribute 'gpu-driver-url' '')

  local cuda_url
  cuda_url=$(get_metadata_attribute 'cuda-url' '')

  local DEFAULT_DRIVER
  # Take default from gpu-driver-url metadata value
  if [[ -n "${gpu_driver_url}" ]] ; then
    DRIVER_URL_DRIVER_VERSION="$(echo "${gpu_driver_url}" | perl -pe 's{^.*/NVIDIA-Linux-x86_64-(\d+\.\d+\.\d+).run$}{$1}')"
    if [[ "${DRIVER_URL_DRIVER_VERSION}" =~ ^[0-9]+.*[0-9]$ ]] ; then DEFAULT_DRIVER="${DRIVER_URL_DRIVER_VERSION}" ; fi
  # Take default from cuda-url metadata value as a backup
  elif [[ -n "${cuda_url}" ]] ; then
    local CUDA_URL_DRIVER_VERSION="$(echo "${cuda_url}" | perl -pe 's{^.*/cuda_\d+\.\d+\.\d+_(\d+\.\d+\.\d+)_linux.run$}{$1}')"
    if [[ "${CUDA_URL_DRIVER_VERSION}" =~ ^[0-9]+.*[0-9]$ ]] ; then
      major_driver_version="${CUDA_URL_DRIVER_VERSION%%.*}"
      driver_max_maj_version=${DRIVER_SUBVER["${major_driver_version}"]}
      if curl -s --head "https://download.nvidia.com/XFree86/Linux-x86_64/${CUDA_URL_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${CUDA_URL_DRIVER_VERSION}.run" | grep -E -q '^HTTP.*200\s*$' ; then
        # use the version indicated by the cuda url as the default if it exists
	DEFAULT_DRIVER="${CUDA_URL_DRIVER_VERSION}"
      elif curl -s --head "https://download.nvidia.com/XFree86/Linux-x86_64/${driver_max_maj_version}/NVIDIA-Linux-x86_64-${driver_max_maj_version}.run" | grep -E -q '^HTTP.*200\s*$' ; then
        # use the maximum sub-version available for the major version indicated in cuda url as the default
	DEFAULT_DRIVER="${driver_max_maj_version}"
      fi
    fi
  fi

  if ( ! test -v DEFAULT_DRIVER ) ; then
    # If a default driver version has not been extracted, use the default for this version of CUDA
    DEFAULT_DRIVER=${DRIVER_FOR_CUDA["${CUDA_VERSION}"]}
  fi

  DRIVER_VERSION=$(get_metadata_attribute 'gpu-driver-version' "${DEFAULT_DRIVER}")

  readonly DRIVER_VERSION
  readonly DRIVER="${DRIVER_VERSION%%.*}"

  export DRIVER_VERSION DRIVER

  gpu_driver_url="https://download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
  if ! curl -s --head "${gpu_driver_url}" | grep -E -q '^HTTP.*200\s*$' ; then
    echo "No NVIDIA driver exists for DRIVER_VERSION=${DRIVER_VERSION}"
    exit 1
  fi
}

# Fix DRIVER_VERSION / DRIVER before deriving the download parameters below.
set_driver_version

# Fallback cuDNN versions used by the adjustments further down
readonly DEFAULT_CUDNN8_VERSION="8.0.5.39"
readonly DEFAULT_CUDNN9_VERSION="9.1.0.70"

# Parameters for NVIDIA-provided cuDNN library
# Default comes from the table entry for the selected CUDA version; the
# cudnn-version metadata attribute overrides it.
readonly DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]}
CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}")
# Predicates on the major component of ${CUDNN_VERSION}
function is_cudnn8() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; )
function is_cudnn9() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; )
# The minimum cuDNN version supported by rocky is ${DEFAULT_CUDNN8_VERSION}
if is_rocky  && (version_le "${CUDNN_VERSION}" "${DEFAULT_CUDNN8_VERSION}") ; then
  CUDNN_VERSION="${DEFAULT_CUDNN8_VERSION}"
elif (ge_ubuntu20 || ge_debian12) && is_cudnn8 ; then
  # cuDNN v8 is not distributed for ubuntu20+, debian12 ; move up to 9
  CUDNN_VERSION="${DEFAULT_CUDNN9_VERSION}"
elif (le_ubuntu18 || le_debian11) && is_cudnn9 ; then
  # cuDNN v9 is not distributed for ubuntu18, debian10, debian11 ; fall back to 8
  CUDNN_VERSION="8.8.0.121"
fi
readonly CUDNN_VERSION

# NCCL: table default for the selected CUDA version, overridable through the
# nccl-version metadata attribute
readonly DEFAULT_NCCL_VERSION=${NCCL_FOR_CUDA["${CUDA_VERSION}"]}
readonly NCCL_VERSION=$(get_metadata_attribute 'nccl-version' ${DEFAULT_NCCL_VERSION})

# Parameters for NVIDIA-provided Debian GPU driver
readonly DEFAULT_USERSPACE_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"

# The gpu-driver-url metadata attribute overrides the default installer URL
readonly USERSPACE_URL=$(get_metadata_attribute 'gpu-driver-url' "${DEFAULT_USERSPACE_URL}")

# File name portion of the installer URL (everything after the last "/")
USERSPACE_FILENAME="$(echo ${USERSPACE_URL} | perl -pe 's{^.+/}{}')"
readonly USERSPACE_FILENAME

# Short name for urls
# shortname selects the CUDA package repository path; nccl_shortname selects
# the machine-learning (NCCL) repository path, which lags behind for some
# releases and therefore may name an older release.
if is_ubuntu22  ; then
    # at the time of writing 20241125 there is no ubuntu2204 in the index of repos at
    # https://developer.download.nvidia.com/compute/machine-learning/repos/
    # use packages from previous release until such time as nvidia
    # release ubuntu2204 builds

    shortname="$(os_id)$(os_vercat)"
    nccl_shortname="ubuntu2004"
elif ge_rocky9 ; then
    # use packages from previous release until such time as nvidia
    # release rhel9 builds

    shortname="rhel9"
    nccl_shortname="rhel8"
elif is_rocky ; then
    # remaining rocky releases: substitute "rhel" for "rocky" in the OS id
    shortname="$(os_id | sed -e 's/rocky/rhel/')$(os_vercat)"
    nccl_shortname="${shortname}"
else
    # every other OS uses the same short name for both repositories
    shortname="$(os_id)$(os_vercat)"
    nccl_shortname="${shortname}"
fi

# Parameters for NVIDIA-provided package repositories
readonly NVIDIA_BASE_DL_URL='https://developer.download.nvidia.com/compute'
readonly NVIDIA_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64"

# Parameters for NVIDIA-provided NCCL library
# The repo package URL can be overridden with the 'nccl-repo-url' metadata
# attribute; the signing key URL below cannot.
readonly DEFAULT_NCCL_REPO_URL="${NVIDIA_BASE_DL_URL}/machine-learning/repos/${nccl_shortname}/x86_64/nvidia-machine-learning-repo-${nccl_shortname}_1.0.0-1_amd64.deb"
NCCL_REPO_URL=$(get_metadata_attribute 'nccl-repo-url' "${DEFAULT_NCCL_REPO_URL}")
readonly NCCL_REPO_URL
readonly NCCL_REPO_KEY="${NVIDIA_BASE_DL_URL}/machine-learning/repos/${nccl_shortname}/x86_64/7fa2af80.pub" # 3bf863cc.pub

#######################################
# Check the requested CUDA / driver versions against the ranges known for this
# OS and resolve the URL of the CUDA runfile installer.
# Globals read:    CUDA_VERSION, CUDA_FULL_VERSION, DRIVER_VERSION, shortname,
#                  NVIDIA_BASE_DL_URL
# Globals written: drv_for_cuda, NVIDIA_CUDA_URL, CUDA_RUNFILE (made readonly)
# Outputs:         advisory messages on stdout when a version falls outside
#                  the known range; these do not stop the script
# Exits:           1 when the resolved runfile URL does not answer HTTP 200
#######################################
function set_cuda_runfile_url() {
  # Upper bounds stay unset unless the OS imposes one; they are only read
  # behind a `test -v` guard.
  local MAX_DRIVER_VERSION
  local MAX_CUDA_VERSION

  local MIN_OPEN_DRIVER_VER="515.48.07"
  local MIN_DRIVER_VERSION="${MIN_OPEN_DRIVER_VER}"
  local MIN_CUDA_VERSION="11.7.1" # matches MIN_OPEN_DRIVER_VER

  if is_cuda12 ; then
    if is_debian12 ; then
      MIN_DRIVER_VERSION="545.23.06"
      MIN_CUDA_VERSION="12.3.0"
    elif is_debian10 ; then
      MAX_DRIVER_VERSION="555.42.02"
      MAX_CUDA_VERSION="12.5.0"
    elif is_ubuntu18 ; then
      MAX_DRIVER_VERSION="530.30.02"
      MAX_CUDA_VERSION="12.1.1"
    fi
  elif version_ge "${CUDA_VERSION}" "${MIN_CUDA_VERSION}" ; then
    if le_debian10 ; then
      # cuda 11 is not supported for <= debian10
      MAX_CUDA_VERSION="0"
      MAX_DRIVER_VERSION="0"
    fi
  else
    echo "Minimum CUDA version supported is ${MIN_CUDA_VERSION}.  Specified: ${CUDA_VERSION}"
  fi

  # Report (without aborting) versions outside the range selected above
  if version_lt "${CUDA_VERSION}" "${MIN_CUDA_VERSION}" ; then
    echo "Minimum CUDA version for ${shortname} is ${MIN_CUDA_VERSION}.  Specified: ${CUDA_VERSION}"
  elif ( test -v MAX_CUDA_VERSION && version_gt "${CUDA_VERSION}" "${MAX_CUDA_VERSION}" ) ; then
    echo "Maximum CUDA version for ${shortname} is ${MAX_CUDA_VERSION}.  Specified: ${CUDA_VERSION}"
  fi
  if version_lt "${DRIVER_VERSION}" "${MIN_DRIVER_VERSION}" ; then
    echo "Minimum kernel driver version for ${shortname} is ${MIN_DRIVER_VERSION}.  Specified: ${DRIVER_VERSION}"
  elif ( test -v MAX_DRIVER_VERSION && version_gt "${DRIVER_VERSION}" "${MAX_DRIVER_VERSION}" ) ; then
    echo "Maximum kernel driver version for ${shortname} is ${MAX_DRIVER_VERSION}.  Specified: ${DRIVER_VERSION}"
  fi

  # driver version named in cuda runfile filename
  # (these may not be actual driver versions - see https://download.nvidia.com/XFree86/Linux-x86_64/)
  readonly -A drv_for_cuda=(
          ["11.7.0"]="515.43.04" ["11.7.1"]="515.65.01"
          ["11.8.0"]="520.61.05"
          ["12.0.0"]="525.60.13" ["12.0.1"]="525.85.12"
          ["12.1.0"]="530.30.02" ["12.1.1"]="530.30.02"
          ["12.2.0"]="535.54.03" ["12.2.1"]="535.86.10" ["12.2.2"]="535.104.05"
          ["12.3.0"]="545.23.06" ["12.3.1"]="545.23.08" ["12.3.2"]="545.23.08"
          ["12.4.0"]="550.54.15" ["12.4.1"]="550.54.15" # 550.54.15 is not a driver indexed at https://download.nvidia.com/XFree86/Linux-x86_64/
          ["12.5.0"]="555.42.02" ["12.5.1"]="555.42.06" # 555.42.02 is indexed, 555.42.06 is not
          ["12.6.0"]="560.28.03" ["12.6.1"]="560.35.03" ["12.6.2"]="560.35.03"
  )

  # Verify that the file with the indicated combination exists.
  # A CUDA release missing from the table yields an empty driver component
  # instead of an unbound-variable abort under `set -u`; the default URL then
  # fails the HEAD check below unless the 'cuda-url' attribute overrides it.
  local drv_ver="${drv_for_cuda["${CUDA_FULL_VERSION}"]:-}"
  CUDA_RUNFILE="cuda_${CUDA_FULL_VERSION}_${drv_ver}_linux.run"
  local CUDA_RELEASE_BASE_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_FULL_VERSION}"
  local DEFAULT_NVIDIA_CUDA_URL="${CUDA_RELEASE_BASE_URL}/local_installers/${CUDA_RUNFILE}"

  NVIDIA_CUDA_URL=$(get_metadata_attribute 'cuda-url' "${DEFAULT_NVIDIA_CUDA_URL}")
  readonly NVIDIA_CUDA_URL

  # The runfile name always follows the URL actually in use
  CUDA_RUNFILE="$(echo ${NVIDIA_CUDA_URL} | perl -pe 's{^.+/}{}')"
  readonly CUDA_RUNFILE

  if ! curl -s --head "${NVIDIA_CUDA_URL}" | grep -E -q '^HTTP.*200\s*$' ; then
    echo "No CUDA distribution exists for this combination of DRIVER_VERSION=${drv_ver}, CUDA_VERSION=${CUDA_FULL_VERSION}"
    exit 1
  fi

  # OS-specific advisories.  The CUDA 11 check has to come before the generic
  # "< 12.3.0" check: every CUDA 11 release is also below 12.3.0, so in the
  # other order the CUDA 11 message could never be printed.
  if ( version_lt "${CUDA_VERSION%%.*}" "12" && ge_debian12 ) ; then
    echo "CUDA 11 not supported on Debian 12. Requested version: ${CUDA_VERSION}"
  elif ( version_lt "${CUDA_FULL_VERSION}" "12.3.0" && ge_debian12 ) ; then
    echo "CUDA 12.3.0 is the minimum CUDA 12 version supported on Debian 12"
  elif ( version_gt "${CUDA_VERSION}" "12.1.1" && is_ubuntu18 ) ; then
    echo "CUDA 12.1.1 is the maximum CUDA version supported on ubuntu18.  Requested version: ${CUDA_VERSION}"
  elif ( version_lt "${CUDA_VERSION}" "11.8" && is_rocky9 ) ; then
    echo "CUDA 11.8.0 is the minimum version for Rocky 9. Requested version: ${CUDA_VERSION}"
  fi
}

# Resolve NVIDIA_CUDA_URL / CUDA_RUNFILE (exits if the runfile URL is not reachable)
set_cuda_runfile_url

# Parameter for NVIDIA-provided Rocky Linux GPU driver
readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_REPO_URL}/cuda-${shortname}.repo"

# cuDNN tarball name and URL.  Each later section overrides the earlier one
# when its version condition holds.
# Oldest naming scheme (cuDNN below 8.3.1.22): .tgz directly under the release directory
CUDNN_TARBALL="cudnn-${CUDA_VERSION}-linux-x64-v${CUDNN_VERSION}.tgz"
CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/${CUDNN_TARBALL}"
if ( version_ge "${CUDNN_VERSION}" "8.3.1.22" ); then
  # From 8.3.1.22 on the archive is a .tar.xz.  This default form, which drops
  # the last dot-component of CUDA_VERSION, remains in effect for cuDNN
  # versions above 8.4.1.50.
  CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%.*}-archive.tar.xz"
  if ( version_le "${CUDNN_VERSION}" "8.4.1.50" ); then
    # For cuDNN versions from 8.3.1.22 up to 8.4.1.50 the name carries the
    # unmodified CUDA_VERSION instead
    CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION}-archive.tar.xz"
  fi
  # Use legacy url format with one of the tarball name formats depending on version as above
  CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/local_installers/${CUDA_VERSION}/${CUDNN_TARBALL}"
fi
if ( version_ge "${CUDA_VERSION}" "12.0" ); then
  # Use modern url format when cuda version is greater than or equal to 12.0
  # (only the CUDA major version appears in the archive name)
  CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%%.*}-archive.tar.xz"
  CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/cudnn/redist/cudnn/linux-x86_64/${CUDNN_TARBALL}"
fi
readonly CUDNN_TARBALL
readonly CUDNN_TARBALL_URL

# Whether to install NVIDIA-provided or OS-provided GPU driver
# ('gpu-driver-provider' metadata attribute; defaults to NVIDIA)
GPU_DRIVER_PROVIDER=$(get_metadata_attribute 'gpu-driver-provider' 'NVIDIA')
readonly GPU_DRIVER_PROVIDER

# Whether to install GPU monitoring agent that sends GPU metrics to Stackdriver
# ('install-gpu-agent' metadata attribute; defaults to false)
INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'false')
readonly INSTALL_GPU_AGENT

# Mutable state initialised here; nothing in this part of the file changes it
NVIDIA_SMI_PATH='/usr/bin'
MIG_MAJOR_CAPS=0
IS_MIG_ENABLED=0

# "1" once install_cuda_keyring_pkg has installed the keyring package
CUDA_KEYRING_PKG_INSTALLED="0"
# Download and install NVIDIA's cuda-keyring package once per run.
# Reads NVIDIA_REPO_URL and tmpdir; sets CUDA_KEYRING_PKG_INSTALLED to "1".
function install_cuda_keyring_pkg() {
  # Already installed during this run: nothing to do
  [[ "${CUDA_KEYRING_PKG_INSTALLED}" == "1" ]] && return 0

  local keyring_ver=1.1
  local keyring_deb="${tmpdir}/cuda-keyring.deb"

  curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
    "${NVIDIA_REPO_URL}/cuda-keyring_${keyring_ver}-1_all.deb" \
    -o "${keyring_deb}"
  dpkg -i "${keyring_deb}"
  # the downloaded package is not needed after installation
  rm -f "${keyring_deb}"

  CUDA_KEYRING_PKG_INSTALLED="1"
}

# Purge the cuda-keyring package and clear the installed flag so that
# install_cuda_keyring_pkg will install it again on its next call.
function uninstall_cuda_keyring_pkg() {
  local keyring_pkg="cuda-keyring"
  apt-get purge -yq "${keyring_pkg}"
  CUDA_KEYRING_PKG_INSTALLED="0"
}

#######################################
# Download and install NVIDIA's "local repo" .deb for the selected CUDA
# release and copy its keyring into /usr/share/keyrings.
# Globals read:    workdir, tmpdir, shortname, CUDA_VERSION, CUDA_FULL_VERSION,
#                  DRIVER_VERSION, NVIDIA_BASE_DL_URL, NVIDIA_REPO_URL
# Globals written: CUDA_LOCAL_REPO_INSTALLED, CUDA_LOCAL_REPO_PKG_NAME, pkgname,
#                  LOCAL_INSTALLER_DEB, LOCAL_DEB_URL, DIST_KEYRING_DIR (readonly)
# Returns:         immediately when the completion marker exists or the
#                  installed flag is already "1"
#######################################
function install_local_cuda_repo() {
  if test -f "${workdir}/install-local-cuda-repo-complete" ; then return ; fi

  # The flag may never have been initialised before the first call; treat an
  # unset flag as "not installed" rather than aborting under `set -u`.
  if [[ "${CUDA_LOCAL_REPO_INSTALLED:-0}" == "1" ]]; then return ; fi
  CUDA_LOCAL_REPO_INSTALLED="1"
  pkgname="cuda-repo-${shortname}-${CUDA_VERSION//./-}-local"
  # remembered so that uninstall_local_cuda_repo can purge the same package
  CUDA_LOCAL_REPO_PKG_NAME="${pkgname}"
  readonly LOCAL_INSTALLER_DEB="${pkgname}_${CUDA_FULL_VERSION}-${DRIVER_VERSION}-1_amd64.deb"
  readonly LOCAL_DEB_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_FULL_VERSION}/local_installers/${LOCAL_INSTALLER_DEB}"
  readonly DIST_KEYRING_DIR="/var/${pkgname}"

  curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \
    "${LOCAL_DEB_URL}" -o "${tmpdir}/${LOCAL_INSTALLER_DEB}"

  dpkg -i "${tmpdir}/${LOCAL_INSTALLER_DEB}"
  rm "${tmpdir}/${LOCAL_INSTALLER_DEB}"
  # the repo package ships its signing keyring under /var/<pkgname>
  cp "${DIST_KEYRING_DIR}"/cuda-*-keyring.gpg /usr/share/keyrings/

  if is_ubuntu ; then
    # apt pin file for the CUDA repository
    curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
      "${NVIDIA_REPO_URL}/cuda-${shortname}.pin" \
      -o /etc/apt/preferences.d/cuda-repository-pin-600
  fi

  touch "${workdir}/install-local-cuda-repo-complete"
}
# Purge the package recorded in CUDA_LOCAL_REPO_PKG_NAME and remove the
# completion marker for the local CUDA repository.
function uninstall_local_cuda_repo(){
  local marker="${workdir}/install-local-cuda-repo-complete"
  apt-get purge -yq "${CUDA_LOCAL_REPO_PKG_NAME}"
  rm -f "${marker}"
}

# Name of the cuDNN local-repo package installed by install_local_cudnn_repo
CUDNN_PKG_NAME=""
# Download and install the cuDNN "local repo" .deb for CUDNN_VERSION and copy
# its keyring into /usr/share/keyrings.  Skipped when the completion marker
# already exists.  pkgname, local_deb_fn and local_deb_url are left as globals.
function install_local_cudnn_repo() {
  local done_marker="${workdir}/install-local-cudnn-repo-complete"
  [[ -f "${done_marker}" ]] && return 0

  # CUDNN_VERSION without its last dot-component
  local cudnn_release="${CUDNN_VERSION%.*}"
  local downloaded_deb="${tmpdir}/local-installer.deb"

  pkgname="cudnn-local-repo-${shortname}-${cudnn_release}"
  CUDNN_PKG_NAME="${pkgname}"
  local_deb_fn="${pkgname}_1.0-1_amd64.deb"
  local_deb_url="${NVIDIA_BASE_DL_URL}/cudnn/${cudnn_release}/local_installers/${local_deb_fn}"

  curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \
    "${local_deb_url}" -o "${downloaded_deb}"
  dpkg -i "${downloaded_deb}"
  rm -f "${downloaded_deb}"

  # the repo package unpacks under /var; pick up its keyring by glob
  cp /var/cudnn-local-repo-*-${cudnn_release}*/cudnn-local-*-keyring.gpg /usr/share/keyrings

  touch "${done_marker}"
}

# Purge the package recorded in CUDNN_PKG_NAME and remove the completion
# marker for the local cuDNN repository.
function uninstall_local_cudnn_repo() {
  local marker="${workdir}/install-local-cudnn-repo-complete"
  apt-get purge -yq "${CUDNN_PKG_NAME}"
  rm -f "${marker}"
}

CUDNN8_LOCAL_REPO_INSTALLED="0"
# Name of the cuDNN 8 local-repo package installed by install_local_cudnn8_repo
CUDNN8_PKG_NAME=""
# Fetch (through cache_fetched_package) and install the cuDNN 8 "local repo"
# .deb, then copy its keyring into /usr/share/keyrings.
# Skipped when the completion marker exists; returns 0 without doing anything
# on systems that are neither ubuntu nor debian.
# The variables assigned here are deliberately not declared local:
# cudnn_pkg_version in particular is assigned without `local`, so a caller
# that declares its own local of that name sees the value set here.
function install_local_cudnn8_repo() {
  [[ -f "${workdir}/install-local-cudnn8-repo-complete" ]] && return 0

  # the cuDNN 8 repo is taken from these fixed OS builds
  if is_ubuntu ; then
    cudnn8_shortname="ubuntu2004"
  elif is_debian ; then
    cudnn8_shortname="debian11"
  else
    return 0
  fi

  # CUDA version component used in the package version and download path
  if is_cuda12 ; then
    CUDNN8_CUDA_VER=12.0
  elif is_cuda11 ; then
    CUDNN8_CUDA_VER=11.8
  else
    CUDNN8_CUDA_VER="${CUDA_VERSION}"
  fi
  cudnn_pkg_version="${CUDNN_VERSION}-1+cuda${CUDNN8_CUDA_VER}"

  pkgname="cudnn-local-repo-${cudnn8_shortname}-${CUDNN_VERSION}"
  CUDNN8_PKG_NAME="${pkgname}"

  deb_fn="${pkgname}_1.0-1_amd64.deb"
  local_deb_fn="${tmpdir}/${deb_fn}"
  local_deb_url="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/local_installers/${CUDNN8_CUDA_VER}/${deb_fn}"

  # cache the cudnn package
  cache_fetched_package \
    "${local_deb_url}" \
    "${pkg_bucket}/${CUDNN8_CUDA_VER}/${deb_fn}" \
    "${local_deb_fn}"

  # directory under /var into which the package will unpack its repository
  local repo_dir="$(dpkg -c ${local_deb_fn} | perl -ne 'if(m{(/var/cudnn-local-repo-.*)/\s*$}){print $1}')"

  # If we are using a ram disk, mount another where we will unpack the cudnn local installer
  if [[ "${tmpdir}" == "/mnt/shm" ]] && ! grep -q '/var/cudnn-local-repo' /proc/mounts ; then
    mkdir -p "${repo_dir}"
    mount -t tmpfs tmpfs "${repo_dir}"
  fi

  dpkg -i "${local_deb_fn}"
  rm -f "${local_deb_fn}"

  cp "${repo_dir}"/cudnn-local-*-keyring.gpg /usr/share/keyrings
  touch "${workdir}/install-local-cudnn8-repo-complete"
}

# Purge the package recorded in CUDNN8_PKG_NAME and remove the completion
# marker for the local cuDNN 8 repository.
function uninstall_local_cudnn8_repo() {
  local marker="${workdir}/install-local-cudnn8-repo-complete"
  apt-get purge -yq "${CUDNN8_PKG_NAME}"
  rm -f "${marker}"
}

# Build NCCL packages from the GitHub source release (or restore a previously
# built set from the GCS package cache) and install them.
# Reads: workdir, NCCL_VERSION, CUDA_VERSION, pkg_bucket, _shortname
# Writes: NVCC_GENCODE (exported for the build), output
# Skipped when the completion marker ${workdir}/nccl-complete exists.
function install_nvidia_nccl() {
  if test -f "${workdir}/nccl-complete" ; then return ; fi

  if is_cuda11 && is_debian12 ; then
    echo "NCCL cannot be compiled for CUDA 11 on ${_shortname}"
    return
  fi

  # package version string as it appears in the built .deb / .rpm file names
  local -r nccl_version="${NCCL_VERSION}-1+cuda${CUDA_VERSION}"

  # https://github.com/NVIDIA/nccl/blob/master/README.md
  # https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
  # Fermi:     SM_20,             compute_30
  # Kepler:    SM_30,SM_35,SM_37, compute_30,compute_35,compute_37
  # Maxwell:   SM_50,SM_52,SM_53, compute_50,compute_52,compute_53
  # Pascal:    SM_60,SM_61,SM_62, compute_60,compute_61,compute_62

  # The following architectures are supported by open kernel driver
  # Volta:     SM_70,SM_72,       compute_70,compute_72
  # Ampere:    SM_80,SM_86,SM_87, compute_80,compute_86,compute_87

  # The following architectures are supported by CUDA v11.8+
  # Ada:       SM_89,             compute_89
  # Hopper:    SM_90,SM_90a       compute_90,compute_90a
  # Blackwell: SM_100,            compute_100
  # Start with Volta and Ampere; newer architectures are appended only when
  # the CUDA version checks below pass.
                  NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72"
  NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_87,code=sm_87"
  if version_ge "${CUDA_VERSION}" "11.8" ; then
    NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89"
  fi
  if version_ge "${CUDA_VERSION}" "12.0" ; then
    NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a"
  fi

  mkdir -p "${workdir}"
  pushd "${workdir}"

  # Download and unpack the source release unless a source tree already exists
  test -d "${workdir}/nccl" || {
    local tarball_fn="v${NCCL_VERSION}-1.tar.gz"
    curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
      "https://github.com/NVIDIA/nccl/archive/refs/tags/${tarball_fn}" \
      | tar xz
    mv "nccl-${NCCL_VERSION}-1" nccl
  }

  # directory (relative to workdir) in which the package build leaves its output
  local build_path
  if is_debuntu ; then build_path="nccl/build/pkg/deb" ; else
                       build_path="nccl/build/pkg/rpm/x86_64" ; fi

  # Obtain the built packages unless a build directory already exists
  test -d "${workdir}/nccl/build" || {
    local build_tarball="nccl-build_${_shortname}_${nccl_version}.tar.gz"
    local local_tarball="${workdir}/${build_tarball}"
    local gcs_tarball="${pkg_bucket}/${_shortname}/${build_tarball}"

    # a listing that echoes the object name back means the build is cached
    output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '')
    if echo "${output}" | grep -q "${gcs_tarball}" ; then
      # cache hit - unpack from cache
      echo "cache hit"
    else
      # build and cache
      pushd nccl
      # https://github.com/NVIDIA/nccl?tab=readme-ov-file#install
      install_build_dependencies
      if is_debuntu ; then
        # These packages are required to build .deb packages from source
        execute_with_retries \
          apt-get install -y -qq build-essential devscripts debhelper fakeroot
        export NVCC_GENCODE
        execute_with_retries make -j$(nproc) pkg.debian.build
      elif is_rocky ; then
        # These packages are required to build .rpm packages from source
        execute_with_retries \
          dnf -y -q install rpm-build rpmdevtools
        export NVCC_GENCODE
        execute_with_retries make -j$(nproc) pkg.redhat.build
      fi
      # archive the package output directory, upload it to the cache, then
      # discard the local archive and the build products
      tar czvf "/${local_tarball}" "../${build_path}"
      gcloud storage cp "${local_tarball}" "${gcs_tarball}"
      rm "${local_tarball}"
      make clean
      popd
    fi
    # in both cases the packages are unpacked from the cached archive
    gcloud storage cat "${gcs_tarball}" | tar xz
  }

  # Install the runtime and development packages
  if is_debuntu ; then
    dpkg -i "${build_path}/libnccl${NCCL_VERSION%%.*}_${nccl_version}_amd64.deb" "${build_path}/libnccl-dev_${nccl_version}_amd64.deb"
  elif is_rocky ; then
    rpm -ivh "${build_path}/libnccl-${nccl_version}.x86_64.rpm" "${build_path}/libnccl-devel-${nccl_version}.x86_64.rpm"
  fi

  popd
  touch "${workdir}/nccl-complete"
}

# Predicates on GPU_DRIVER_PROVIDER; evaluated in a subshell with tracing off.
function is_src_nvidia() ( set +x ; test "${GPU_DRIVER_PROVIDER}" = "NVIDIA" ; )
function is_src_os()     ( set +x ; test "${GPU_DRIVER_PROVIDER}" = "OS" ; )

# Install the cuDNN runtime and development packages for CUDNN_VERSION.
# Skipped when ${workdir}/cudnn-complete exists; exits 1 on an OS that is
# neither rocky nor debian/ubuntu.  An unsupported cuDNN major version is only
# reported, not treated as fatal.
function install_nvidia_cudnn() {
  [[ -f "${workdir}/cudnn-complete" ]] && return 0

  local major_version="${CUDNN_VERSION%%.*}"
  # install_local_cudnn8_repo assigns a variable of this name too; the
  # cuDNN 8 package pins below are therefore built only after it has run.
  local cudnn_pkg_version="${CUDNN_VERSION}-1+cuda${CUDA_VERSION}"
  local cuda_major="${CUDA_VERSION%%.*}"

  if is_rocky ; then
    # collect the package names first, then install them in one call
    local -a rpm_pkgs=()
    if is_cudnn8 ; then
      rpm_pkgs=("libcudnn${major_version}" "libcudnn${major_version}-devel")
    elif is_cudnn9 ; then
      rpm_pkgs=("libcudnn9-static-cuda-${cuda_major}" "libcudnn9-devel-cuda-${cuda_major}")
    else
      echo "Unsupported cudnn version: '${major_version}'"
    fi
    if (( ${#rpm_pkgs[@]} > 0 )) ; then
      execute_with_retries dnf -y -q install "${rpm_pkgs[@]}"
      sync
    fi
  elif is_debuntu; then
    if ge_debian12 && is_src_os ; then
      # OS-provided packaging
      apt-get -y install nvidia-cudnn
    elif is_cudnn8 ; then
      # cuDNN 8 comes from a temporary local repository
      install_local_cudnn8_repo

      apt-get update -qq

      execute_with_retries \
        apt-get -y install --no-install-recommends \
          "libcudnn8=${cudnn_pkg_version}" \
          "libcudnn8-dev=${cudnn_pkg_version}"

      uninstall_local_cudnn8_repo
      sync
    elif is_cudnn9 ; then
      # cuDNN 9 is installed after setting up the cuda keyring package
      install_cuda_keyring_pkg

      apt-get update -qq

      execute_with_retries \
        apt-get -y install --no-install-recommends \
          "libcudnn9-cuda-${cuda_major}" \
          "libcudnn9-dev-cuda-${cuda_major}" \
          "libcudnn9-static-cuda-${cuda_major}"
      sync
    else
      echo "Unsupported cudnn version: [${CUDNN_VERSION}]"
    fi
  else
    echo "Unsupported OS: '${_shortname}'"
    exit 1
  fi

  ldconfig

  echo "NVIDIA cuDNN successfully installed for ${_shortname}."
  touch "${workdir}/cudnn-complete"
}

# Enable the contrib / non-free apt components on Debian when the GPU driver
# is not taken from NVIDIA.  Does nothing when is_src_nvidia holds.
function add_nonfree_components() {
  is_src_nvidia && return 0

  if ge_debian12 ; then
    # Include in sources file components on which nvidia-open-kernel-dkms depends
    local sources_file="/etc/apt/sources.list.d/debian.sources"
    local wanted_components="main contrib non-free non-free-firmware"

    # rewrite every "Components:" line of the sources file
    sed -i -e "s/Components: .*$/Components: ${wanted_components}/" "${sources_file}"
  elif is_debian ; then
    # older releases: extend lines of sources.list that end in " main"
    sed -i -e 's/ main$/ main contrib non-free/' /etc/apt/sources.list
  fi
}

# Register the NVIDIA container toolkit package repository.
# On debian/ubuntu the signing key and the apt list are fetched only when the
# respective file does not exist yet, followed by `apt-get update`; on other
# systems the yum repo file is downloaded unconditionally.
function add_repo_nvidia_container_toolkit() {
  if ! is_debuntu ; then
    curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo | \
      tee /etc/yum.repos.d/nvidia-container-toolkit.repo
    return
  fi

  local keyring_file=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
  local apt_list_file=/etc/apt/sources.list.d/nvidia-container-toolkit.list

  # https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html
  if ! test -f "${keyring_file}" ; then
    curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
      | gpg --dearmor -o "${keyring_file}"
  fi

  if ! test -f "${apt_list_file}" ; then
    # tie every repository entry to the keyring installed above
    curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \
      | perl -pe "s#deb https://#deb [signed-by=${keyring_file}] https://#g" \
      | tee "${apt_list_file}"
  fi

  apt-get update
}

# Make NVIDIA's CUDA package repository available: via the cuda-keyring
# package on debian/ubuntu, via `dnf config-manager` on rocky.
function add_repo_cuda() {
  if is_debuntu ; then
    install_cuda_keyring_pkg # 11.7+, 12.0+
    return
  fi

  if is_rocky ; then
    # the whole command is handed over as one string
    execute_with_retries "dnf config-manager --add-repo ${NVIDIA_ROCKY_REPO_URL}"
  fi
}

#######################################
# Build the NVIDIA open kernel modules for the running kernel from the
# open-gpu-kernel-modules GitHub release matching DRIVER_VERSION, or restore a
# previously built set from the GCS package cache.
# Globals read: workdir, DRIVER_VERSION, _shortname, uname_r, pkg_bucket,
#               modulus_md5sum (optional), PSN, mok_key, mok_der
# Returns:      0 immediately on rocky8; nothing is built when an nvidia.ko
#               is already present for the running kernel
# Exits:        1 when a build is required for CUDA 11 on ubuntu22
#######################################
function build_driver_from_github() {
  # non-GPL driver will have been built on rocky8
  if is_rocky8 ; then return 0 ; fi
  pushd "${workdir}"

  # Fetch and unpack the source tree unless one is already present
  test -d "${workdir}/open-gpu-kernel-modules" || {
    local tarball_fn="${DRIVER_VERSION}.tar.gz"
    curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
      "https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/${tarball_fn}" \
      | tar xz
    mv "open-gpu-kernel-modules-${DRIVER_VERSION}" open-gpu-kernel-modules
  }

  # `local` deliberately wraps the substitution: a failing find (for example a
  # missing modules directory) then yields an empty result instead of an error
  local nvidia_ko_path="$(find /lib/modules/$(uname -r)/ -name 'nvidia.ko')"
  test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || {
    local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}.tar.gz"
    local local_tarball="${workdir}/${build_tarball}"
    # cached builds are kept apart by signing key (or "unsigned")
    local build_dir
    if test -v modulus_md5sum && [[ -n "${modulus_md5sum}" ]]
      then build_dir="${modulus_md5sum}"
      else build_dir="unsigned" ; fi

    local gcs_tarball="${pkg_bucket}/${_shortname}/${uname_r}/${build_dir}/${build_tarball}"

    if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then
      echo "cache hit"
    else
      # build the kernel modules
      pushd open-gpu-kernel-modules
      install_build_dependencies
      if is_cuda11 && is_ubuntu22 ; then
        echo "Kernel modules cannot be compiled for CUDA 11 on ${_shortname}"
        exit 1
      fi
      execute_with_retries make -j$(nproc) modules \
        >  kernel-open/build.log \
        2> kernel-open/build_error.log
      # Sign kernel modules
      if [[ -n "${PSN}" ]]; then
        # The working directory is now inside the source tree, so a path
        # relative to ${workdir} would not resolve and no module would be
        # signed; locate the freshly built modules by absolute path.
        local module
        for module in $(find "${workdir}/open-gpu-kernel-modules/kernel-open" -name '*.ko'); do
          "/lib/modules/${uname_r}/build/scripts/sign-file" sha256 \
          "${mok_key}" \
          "${mok_der}" \
          "${module}"
        done
      fi
      make modules_install \
        >>  kernel-open/build.log \
        2>> kernel-open/build_error.log
      # Collect build logs and installed binaries
      tar czvf "${local_tarball}" \
        "${workdir}/open-gpu-kernel-modules/kernel-open/"*.log \
        $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko')
      gcloud storage cp "${local_tarball}" "${gcs_tarball}"
      rm "${local_tarball}"
      make clean
      popd
    fi
    # in both cases the modules are installed from the cached archive
    gcloud storage cat "${gcs_tarball}" | tar -C / -xzv
    depmod -a
  }

  popd
}

# Install the open GPU kernel driver from distribution / repository packages:
# apt packages (with dkms) on debian and ubuntu, a dnf module on rocky.
function build_driver_from_packages() {
  if is_debuntu ; then
    # prefer the -server-open flavour when the package index knows it
    local server_pkg="nvidia-driver-${DRIVER}-server-open"
    local -a pkglist
    if [[ -z "$(apt-cache search -n "${server_pkg}")" ]] ; then
      pkglist=("nvidia-driver-${DRIVER}-open")
    else
      pkglist=("${server_pkg}")
    fi

    if is_debian ; then
      # debian installs individually pinned packages instead
      local pin="${DRIVER_VERSION}-1"
      pkglist=(
        "firmware-nvidia-gsp=${pin}"
        "nvidia-smi=${pin}"
        "nvidia-alternative=${pin}"
        "nvidia-kernel-open-dkms=${pin}"
        "nvidia-kernel-support=${pin}"
        "nvidia-modprobe=${pin}"
        "libnvidia-ml1=${pin}"
      )
    fi

    add_contrib_component
    apt-get update -qq
    execute_with_retries apt-get install -y -qq --no-install-recommends dkms
    #configure_dkms_certs
    execute_with_retries apt-get install -y -qq --no-install-recommends "${pkglist[@]}"
    sync

  elif is_rocky ; then
    #configure_dkms_certs
    # try the version-specific dkms module stream, then fall back to 'latest'
    if ! execute_with_retries dnf -y -q module install "nvidia-driver:${DRIVER}-dkms" ; then
      execute_with_retries dnf -y -q module install 'nvidia-driver:latest'
    else
      echo "nvidia-driver:${DRIVER}-dkms installed successfully"
    fi
    sync
  fi
  #clear_dkms_key
}

#######################################
# Run NVIDIA's driver .run installer to install the userspace components.
# On rocky8 the installer also builds the kernel modules unless a cached
# build exists in the GCS package cache; on every other OS it is run with
# --no-kernel-modules.
# Globals read: workdir, tmpdir, USERSPACE_URL, USERSPACE_FILENAME, pkg_bucket,
#               _shortname, DRIVER_VERSION, uname_r, modulus_md5sum (optional),
#               PSN, mok_key, mok_der
# Skipped when ${workdir}/userspace-complete exists.
#######################################
function install_nvidia_userspace_runfile() {

  # This .run file contains NV's OpenGL implementation as well as
  # nvidia optimized implementations of the gtk+ 2,3 stack(s) not
  # including glib (https://docs.gtk.org/glib/), and what appears to
  # be a copy of the source from the kernel-open directory of for
  # example DRIVER_VERSION=560.35.03
  #
  # https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/560.35.03.tar.gz
  #
  # wget https://us.download.nvidia.com/XFree86/Linux-x86_64/560.35.03/NVIDIA-Linux-x86_64-560.35.03.run
  # sh ./NVIDIA-Linux-x86_64-560.35.03.run -x # this will allow you to review the contents of the package without installing it.
  if test -f "${workdir}/userspace-complete" ; then return ; fi
  local local_fn="${tmpdir}/userspace.run"

  cache_fetched_package "${USERSPACE_URL}" \
                        "${pkg_bucket}/${USERSPACE_FILENAME}" \
                        "${local_fn}"

  local runfile_args
  runfile_args=""
  local cache_hit="0"
  # Both stay empty when no kernel-module cache entry is involved (modules
  # already present on rocky8); the caching step below checks for that
  # instead of operating on unset variables.
  local local_tarball=""
  local gcs_tarball=""

  if is_rocky8 ; then
    # `local` deliberately wraps the substitution so that a failing find
    # yields an empty result instead of an error
    local nvidia_ko_path="$(find /lib/modules/$(uname -r)/ -name 'nvidia.ko')"
    test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || {
      local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}.tar.gz"
      local_tarball="${workdir}/${build_tarball}"
      # cached builds are kept apart by signing key (or "unsigned")
      local build_dir
      if test -v modulus_md5sum && [[ -n "${modulus_md5sum}" ]]
        then build_dir="${modulus_md5sum}"
        else build_dir="unsigned" ; fi

      gcs_tarball="${pkg_bucket}/${_shortname}/${uname_r}/${build_dir}/${build_tarball}"

      if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then
        # modules will be restored from the cache after the installer ran
        cache_hit="1"
        runfile_args="--no-kernel-modules"
        echo "cache hit"
      else
        install_build_dependencies

        # NOTE(review): the escaped quotes are kept on purpose - the argument
        # list appears to be flattened and re-parsed by execute_with_retries;
        # confirm before converting this to an array.
        local signing_options
        signing_options=""
        if [[ -n "${PSN}" ]]; then
          signing_options="--module-signing-hash sha256 \
          --module-signing-x509-hash sha256 \
          --module-signing-secret-key \"${mok_key}\" \
          --module-signing-public-key \"${mok_der}\" \
          --module-signing-script \"/lib/modules/${uname_r}/build/scripts/sign-file\" \
          "
        fi

        runfile_args="--no-dkms ${signing_options}"
      fi
    }
  else
    runfile_args="--no-kernel-modules"
  fi

  execute_with_retries bash "${local_fn}" -e -q \
    ${runfile_args} \
    --ui=none \
    --install-libglvnd \
    --tmpdir="${tmpdir}"

  if is_rocky8 ; then
    if [[ "${cache_hit}" == "1" ]] ; then
      gcloud storage cat "${gcs_tarball}" | tar -C / -xzv
      depmod -a
    elif [[ -n "${local_tarball}" ]] ; then
      # modules were built by the installer just now: archive and cache them
      tar czvf "${local_tarball}" \
        /var/log/nvidia-installer.log \
        $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko')
      gcloud storage cp "${local_tarball}" "${gcs_tarball}"
    fi
  fi

  rm -f "${local_fn}"
  touch "${workdir}/userspace-complete"
  sync
}

# Fetch the CUDA runfile (through cache_fetched_package) and run it to install
# the toolkit only.  Skipped when ${workdir}/cuda-complete exists.
function install_cuda_runfile() {
  [[ -f "${workdir}/cuda-complete" ]] && return 0

  local installer="${tmpdir}/cuda.run"

  cache_fetched_package \
    "${NVIDIA_CUDA_URL}" \
    "${pkg_bucket}/${CUDA_RUNFILE}" \
    "${installer}"

  execute_with_retries bash "${installer}" --toolkit --no-opengl-libs --silent --tmpdir="${tmpdir}"

  # the runfile is not needed after installation
  rm -f "${installer}"
  touch "${workdir}/cuda-complete"
  sync
}

# Install the CUDA toolkit from packages.
# debian/ubuntu install both the pinned `cuda` meta package and the toolkit
# package; rocky installs the toolkit package only.
# cuda_package is assigned without `local` and so remains set afterwards.
function install_cuda_toolkit() {
  local cudatk_package
  if ge_debian12 && is_src_os ; then
    # OS-provided packaging: pin the plain toolkit package to the full version
    cudatk_package="cuda-toolkit=${CUDA_FULL_VERSION}-1"
  elif [[ -z "${CUDA_VERSION}" ]]; then
    cudatk_package="cuda-toolkit"
  else
    # versioned package name, dots replaced by dashes
    cudatk_package="cuda-toolkit-${CUDA_VERSION//./-}"
  fi
  cuda_package="cuda=${CUDA_FULL_VERSION}-1"
  readonly cudatk_package

  if is_debuntu ; then
#    if is_ubuntu ; then execute_with_retries "apt-get install -y -qq --no-install-recommends cuda-drivers-${DRIVER}=${DRIVER_VERSION}-1" ; fi
    execute_with_retries apt-get install -y -qq --no-install-recommends ${cuda_package} ${cudatk_package}
  elif is_rocky ; then
    # rocky9: cuda-11-[7,8], cuda-12-[1..6]
    execute_with_retries dnf -y -q install "${cudatk_package}"
  fi
  sync
}

# Unload any loaded nvidia kernel modules (best effort), refresh the module
# dependency data and load the nvidia modules again.
function load_kernel_module() {
  local mod

  # for some use cases, the kernel module needs to be removed before first use of nvidia-smi
  for mod in nvidia_uvm nvidia_drm nvidia_modeset nvidia ; do
    if ! rmmod ${mod} > /dev/null 2>&1 ; then
      echo "unable to rmmod ${mod}"
    fi
  done

  depmod -a

  # base module first, then the modules layered on top of it
  for mod in nvidia nvidia-uvm nvidia-modeset nvidia-drm ; do
    modprobe "${mod}"
  done
  # TODO: if peermem is available, also modprobe nvidia-peermem
}

# Install CUDA from the runfile and register NVIDIA's CUDA package repository.
# Skipped when ${workdir}/cuda-repo-complete exists, and on debian12+ with the
# OS driver provider (a message is printed instead).
function install_cuda(){
  [[ -f "${workdir}/cuda-repo-complete" ]] && return 0

  if ge_debian12 && is_src_os ; then
    echo "installed with the driver on ${_shortname}"
    return 0
  fi

  # The OS package distributions are unreliable
  install_cuda_runfile

  # Includes CUDA packages
  add_repo_cuda

  touch "${workdir}/cuda-repo-complete"
}

# Install the NVIDIA container toolkit and configure it for the container
# runtime named by the 'container-runtime' metadata attribute.  The default is
# the first of docker, containerd, crio found on PATH; nothing is installed
# when the resulting runtime name is empty.  Sets the global CONTAINER_RUNTIME.
function install_nvidia_container_toolkit() {
  local container_runtime_default=''
  local candidate
  for candidate in docker containerd crio ; do
    if command -v "${candidate}" ; then
      container_runtime_default="${candidate}"
      break
    fi
  done
  CONTAINER_RUNTIME=$(get_metadata_attribute 'container-runtime' "${container_runtime_default}")

  [[ -z "${CONTAINER_RUNTIME}" ]] && return 0

  add_repo_nvidia_container_toolkit

  # same package name with either package manager
  local pkg_tool=dnf
  if is_debuntu ; then pkg_tool=apt-get ; fi
  execute_with_retries "${pkg_tool}" install -y -q nvidia-container-toolkit

  nvidia-ctk runtime configure --runtime="${CONTAINER_RUNTIME}"
  systemctl restart "${CONTAINER_RUNTIME}"
}

# Install the NVIDIA GPU driver.
# On debian12+ with the OS driver provider the distribution packages are used
# (no completion marker is written on that path); otherwise the userspace
# runfile is installed and the kernel modules are built from the GitHub
# sources.  Skipped when ${workdir}/gpu-driver-complete exists.
function install_nvidia_gpu_driver() {
  [[ -f "${workdir}/gpu-driver-complete" ]] && return 0

  if ge_debian12 && is_src_os ; then
    local -a os_driver_pkgs=(
      dkms
      nvidia-open-kernel-dkms
      nvidia-open-kernel-support
      nvidia-smi
      libglvnd0
      libcuda1
    )
    add_nonfree_components
    apt-get update -qq
    apt-get -yq install "${os_driver_pkgs[@]}"
    echo "NVIDIA GPU driver provided by ${_shortname} was installed successfully"
    return 0
  fi

  # OS driver packages do not produce reliable driver ; use runfile
  install_nvidia_userspace_runfile

  build_driver_from_github

  echo "NVIDIA GPU driver provided by NVIDIA was installed successfully"
  touch "${workdir}/gpu-driver-complete"
}

# Download Google's ops-agent repository script into /opt/google and run it
# with --also-install.  Skipped when ${workdir}/ops-agent-complete exists.
# Note: the working directory is left at /opt/google.
function install_ops_agent(){
  local done_marker="${workdir}/ops-agent-complete"
  [[ -f "${done_marker}" ]] && return 0

  local agent_dir=/opt/google
  local repo_script=add-google-cloud-ops-agent-repo.sh

  mkdir -p "${agent_dir}"
  cd "${agent_dir}"
  # https://cloud.google.com/stackdriver/docs/solutions/agents/ops-agent/installation
  curl -sSO "https://dl.google.com/cloudagents/${repo_script}"
  execute_with_retries bash "${repo_script}" --also-install

  touch "${done_marker}"
}

# Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics
# Installs the GPU utilization reporting script into /opt/gpu-utilization-agent
# with its own Python virtual environment and registers it as the systemd
# service gpu-utilization-agent.service, which is enabled and started.
function install_gpu_agent() {
  # Stackdriver GPU agent parameters
#  local -r GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/master/dlvm/gcp-gpu-utilization-metrics'
  local -r GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/refs/heads/master/dlvm/gcp-gpu-utilization-metrics'
  # install pip from the OS packages on debian/ubuntu when no pip is on PATH
  if ( ! command -v pip && is_debuntu ) ; then
    execute_with_retries "apt-get install -y -qq python3-pip"
  fi
  local install_dir=/opt/gpu-utilization-agent
  mkdir -p "${install_dir}"
  curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
    "${GPU_AGENT_REPO_URL}/requirements.txt" -o "${install_dir}/requirements.txt"
  # the script is patched while downloading: "-u --format=" becomes "--format="
  curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
    "${GPU_AGENT_REPO_URL}/report_gpu_metrics.py" \
    | sed -e 's/-u --format=/--format=/' \
    | dd status=none of="${install_dir}/report_gpu_metrics.py"
  local venv="${install_dir}/venv"
  python3 -m venv "${venv}"
# the subshell keeps the activated virtual environment out of the calling shell
(
  source "${venv}/bin/activate"
  python3 -m pip install --upgrade pip
  execute_with_retries python3 -m pip install -r "${install_dir}/requirements.txt"
)
  sync

  # Generate GPU service.
  # (unquoted heredoc: ${venv} and ${install_dir} are expanded when the unit
  # file is written)
  cat <<EOF >/lib/systemd/system/gpu-utilization-agent.service
[Unit]
Description=GPU Utilization Metric Agent

[Service]
Type=simple
PIDFile=/run/gpu_agent.pid
ExecStart=/bin/bash --login -c '. ${venv}/bin/activate ; python3 "${install_dir}/report_gpu_metrics.py"'
User=root
Group=root
WorkingDirectory=/
Restart=always

[Install]
WantedBy=multi-user.target
EOF
  # Reload systemd manager configuration
  systemctl daemon-reload
  # Enable gpu-utilization-agent service
  systemctl --no-reload --now enable gpu-utilization-agent.service
}

# Put the GPU into EXCLUSIVE_PROCESS compute mode unless the Spark version
# reported by `spark-submit --version` starts with "3.".
function configure_gpu_exclusive_mode() {
  local spark_version
  # Extract "<major>.<minor-digit>" from the first line mentioning "version"
  spark_version=$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)

  # Spark 3: leave the compute mode untouched
  if [[ ${spark_version} == 3.* ]]; then
    return 0
  fi

  # Anything else: include exclusive mode on GPU
  nvidia-smi -c EXCLUSIVE_PROCESS
}

# Download the MIG helper scripts (nvidia-smi wrapper and mig2gpu.sh) from the
# spark-rapids-examples repository into /usr/local/yarn-mig-scripts and make
# the directory and its contents mode 755.
function fetch_mig_scripts() {
  local -r mig_script_dir="/usr/local/yarn-mig-scripts"
  local -r mig_script_url="https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts"
  local script_name

  mkdir -p "${mig_script_dir}"
  sudo chmod 755 "${mig_script_dir}"

  for script_name in nvidia-smi mig2gpu.sh ; do
    wget -P "${mig_script_dir}/" "${mig_script_url}/${script_name}"
  done

  sudo chmod 755 "${mig_script_dir}"/*
}

# Install the Spark GPU discovery script and, for Spark >= 3.0, append
# RAPIDS-related properties to spark-defaults.conf.
#
# Globals read: SPARK_VERSION
# Relies on version_ge being defined elsewhere.
# Side effects: writes /usr/lib/spark/scripts/gpu/getGpusResources.sh and
#   appends to /etc/spark/conf.dist/spark-defaults.conf.
function configure_gpu_script() {
  # Download GPU discovery script
  local -r spark_gpu_script_dir='/usr/lib/spark/scripts/gpu'
  mkdir -p "${spark_gpu_script_dir}"
  # need to update the getGpusResources.sh script to look for MIG devices since if multiple GPUs nvidia-smi still
  # lists those because we only disable the specific GIs via CGROUPs. Here we just create it based off of:
  # https://raw.githubusercontent.com/apache/spark/master/examples/src/main/scripts/getGpusResources.sh
  local -r gpus_resources_script="${spark_gpu_script_dir}/getGpusResources.sh"
  # Quoted delimiter: the script body is written literally, with no expansion
  cat > "${gpus_resources_script}" <<'EOF'
#!/usr/bin/env bash

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

ADDRS=$(nvidia-smi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}<STDIN>))')

echo {\"name\": \"gpu\", \"addresses\":[${ADDRS}]}
EOF

  chmod a+rx "${gpus_resources_script}"

  local spark_defaults_conf="/etc/spark/conf.dist/spark-defaults.conf"
  if version_ge "${SPARK_VERSION}" "3.0" ; then
    # Number of PCI devices whose lspci line mentions NVIDIA
    local gpu_count
    gpu_count="$(lspci | grep NVIDIA | wc -l)"
    # 75% of the CPUs, rounded down, then reduced by one if odd
    local executor_cores
    executor_cores="$(nproc | perl -MPOSIX -pe '$_ = POSIX::floor( $_ * 0.75 ); $_-- if $_ % 2')"
    # 75% of MemFree (reported in kB), converted to whole GB.
    # Declared under the name that is actually assigned, so it no longer
    # leaks into the global scope.
    local executor_memory_gb
    executor_memory_gb="$(awk '/^MemFree/ {print $2}' /proc/meminfo | perl -MPOSIX -pe '$_ *= 0.75; $_ = POSIX::floor( $_ / (1024*1024) )')"
    local task_cpus=2
    # GPU share per task: gpu_count / (executor_cores / task_cpus).
    # NOTE(review): executor_cores evaluates to 0 when nproc <= 2, which makes
    # this division fail - confirm such machine shapes are never used here.
    local gpu_amount
    gpu_amount="$(echo "${executor_cores}" | perl -pe "\$_ = ( ${gpu_count} / (\$_ / ${task_cpus}) )")"

    # Unquoted delimiter: the ${...} references below are expanded on write
    cat >>"${spark_defaults_conf}" <<EOF
###### BEGIN : RAPIDS properties for Spark ${SPARK_VERSION} ######
# Rapids Accelerator for Spark can utilize AQE, but when the plan is not finalized,
# query explain output won't show GPU operator, if the user has doubts
# they can uncomment the line before seeing the GPU plan explain;
# having AQE enabled gives user the best performance.
spark.executor.resource.gpu.discoveryScript=${gpus_resources_script}
spark.executor.resource.gpu.amount=${gpu_count}
spark.executor.cores=${executor_cores}
spark.executor.memory=${executor_memory_gb}G
spark.dynamicAllocation.enabled=false
# please update this config according to your application
spark.task.resource.gpu.amount=${gpu_amount}
spark.task.cpus=${task_cpus}
spark.yarn.unmanagedAM.enabled=false
###### END   : RAPIDS properties for Spark ${SPARK_VERSION} ######
EOF
  fi
}

# Configure YARN's container-executor for GPU use and install a systemd unit
# that opens up permissions on the cgroup directories.
#
# Globals read: HADOOP_CONF_DIR, IS_MIG_ENABLED, MIG_MAJOR_CAPS
# Side effects: edits/appends container-executor.cfg and (when MIG is enabled)
#   yarn-env.sh under HADOOP_CONF_DIR; writes, enables and starts
#   dataproc-cgroup-device-permissions.service.
function configure_gpu_isolation() {
  # enable GPU isolation
  # Force the container-executor group setting to "yarn"
  sed -i "s/yarn\.nodemanager\.linux\-container\-executor\.group\=.*$/yarn\.nodemanager\.linux\-container\-executor\.group\=yarn/g" "${HADOOP_CONF_DIR}/container-executor.cfg"
  if [[ $IS_MIG_ENABLED -ne 0 ]]; then
    # configure the container-executor.cfg to have major caps
    printf '\n[gpu]\nmodule.enabled=true\ngpu.major-device-number=%s\n\n[cgroups]\nroot=/sys/fs/cgroup\nyarn-hierarchy=yarn\n' $MIG_MAJOR_CAPS >> "${HADOOP_CONF_DIR}/container-executor.cfg"
    # Export the MIG-related switches from yarn-env.sh
    printf 'export MIG_AS_GPU_ENABLED=1\n' >> "${HADOOP_CONF_DIR}/yarn-env.sh"
    printf 'export ENABLE_MIG_GPUS_FOR_CGROUPS=1\n' >> "${HADOOP_CONF_DIR}/yarn-env.sh"
  else
    # Without MIG: same [gpu]/[cgroups] sections, no major device number
    printf '\n[gpu]\nmodule.enabled=true\n[cgroups]\nroot=/sys/fs/cgroup\nyarn-hierarchy=yarn\n' >> "${HADOOP_CONF_DIR}/container-executor.cfg"
  fi

  # Configure a systemd unit to ensure that permissions are set on restart
  cat >/etc/systemd/system/dataproc-cgroup-device-permissions.service<<EOF
[Unit]
Description=Set permissions to allow YARN to access device directories

[Service]
ExecStart=/bin/bash -c "chmod a+rwx -R /sys/fs/cgroup/cpu,cpuacct; chmod a+rwx -R /sys/fs/cgroup/devices"

[Install]
WantedBy=multi-user.target
EOF

  systemctl enable dataproc-cgroup-device-permissions
  systemctl start dataproc-cgroup-device-permissions
}

# Wrapper around /usr/bin/nvidia-smi.
#
# Arguments: passed through to nvidia-smi unchanged, one word per argument.
# Globals:   nvsmi_works (read/written) - set to "1" once nvidia-smi has been
#            seen to run successfully, so later calls skip the probe. Treated
#            as "0" when unset.
# Outputs:   nvidia-smi output on stdout; for "-L" the output is cached in
#            /var/run/nvidia-smi_-L.txt and replayed from there on later calls.
# Returns:   0 (with a message on stderr) when nvidia-smi is missing or fails
#            its probe; 0 for "-L"; otherwise nvidia-smi's own exit status.
function nvsmi() {
  local nvsmi="/usr/bin/nvidia-smi"
  if   [[ "${nvsmi_works:-0}" == "1" ]] ; then :
  elif [[ ! -f "${nvsmi}" ]]            ; then echo "nvidia-smi not installed" >&2 ; return 0
  elif ! "${nvsmi}" > /dev/null         ; then echo "nvidia-smi fails" >&2 ; return 0
  else nvsmi_works="1" ; fi

  # ${1:-} keeps a no-argument call safe under `set -u`
  if [[ "${1:-}" == "-L" ]] ; then
    local NV_SMI_L_CACHE_FILE="/var/run/nvidia-smi_-L.txt"
    if [[ -f "${NV_SMI_L_CACHE_FILE}" ]]; then cat "${NV_SMI_L_CACHE_FILE}"
    else "${nvsmi}" "$@" | tee "${NV_SMI_L_CACHE_FILE}" ; fi

    return 0
  fi

  # "$@" forwards each argument intact (no re-splitting, no globbing)
  "${nvsmi}" "$@"
}

# Install the compiler / kernel-devel packages needed to build kernel modules.
#
# Globals read: workdir, uname_r, install_log
# Relies on is_debuntu, is_ubuntu22, is_cuda12, is_rocky and
# execute_with_retries being defined elsewhere.
# Idempotent: does nothing once ${workdir}/build-dependencies-complete exists,
# and writes that sentinel on every successful path.
function install_build_dependencies() {
  if test -f "${workdir}/build-dependencies-complete" ; then return ; fi

  if is_debuntu ; then
    if is_ubuntu22 && is_cuda12 ; then
      # On ubuntu22, the default compiler does not build some kernel module versions
      # https://forums.developer.nvidia.com/t/linux-new-kernel-6-5-0-14-ubuntu-22-04-can-not-compile-nvidia-display-card-driver/278553/11
      execute_with_retries apt-get install -y -qq gcc-12
      update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11
      update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 12
      update-alternatives --set gcc /usr/bin/gcc-12
    fi

  elif is_rocky ; then
    execute_with_retries dnf -y -q install gcc

    local dnf_cmd="dnf -y -q install kernel-devel-${uname_r}"
    # Capture the status without tripping errexit; the `||` list suppresses it
    local retval=0
    eval "${dnf_cmd}" > "${install_log}" 2>&1 || retval="$?"

    # Only fall back / retry when the first attempt failed. Success falls
    # through to the sentinel below instead of returning early without it.
    if [[ "${retval}" != "0" ]] ; then
      if grep -q 'Unable to find a match: kernel-devel-' "${install_log}" ; then
        # this kernel-devel may have been migrated to the vault
        local os_ver
        os_ver="$(echo "${uname_r}" | perl -pe 's/.*el(\d+_\d+)\..*/$1/; s/_/./')"
        local vault="https://download.rockylinux.org/vault/rocky/${os_ver}"
        dnf_cmd="$(echo dnf -y -q --setopt=localpkg_gpgcheck=1 install \
          "${vault}/BaseOS/x86_64/os/Packages/k/kernel-${uname_r}.rpm" \
          "${vault}/BaseOS/x86_64/os/Packages/k/kernel-core-${uname_r}.rpm" \
          "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-${uname_r}.rpm" \
          "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-core-${uname_r}.rpm" \
          "${vault}/AppStream/x86_64/os/Packages/k/kernel-devel-${uname_r}.rpm"
         )"
      fi

      execute_with_retries "${dnf_cmd}"
    fi
  fi
  touch "${workdir}/build-dependencies-complete"
}

# Install the small set of utility packages (pciutils, screen) with the
# package manager matching the OS family.
# Relies on is_debuntu, is_rocky and execute_with_retries defined elsewhere.
function install_dependencies() {
  pkg_list="pciutils screen"

  if is_debuntu ; then
    # ${pkg_list} is deliberately unquoted: one argument per package
    execute_with_retries apt-get -y -q install ${pkg_list}
    return
  fi

  if is_rocky ; then
    execute_with_retries dnf     -y -q install ${pkg_list}
  fi
}

# Initialise the global settings used by the GPU installation steps:
# RAPIDS_RUNTIME, DEFAULT_XGBOOST_VERSION (readonly), nvsmi_works and, when the
# CUDA major version is 11 or 12, gcc_ver.
# Relies on get_metadata_attribute, is_cuda11 and is_cuda12 defined elsewhere.
function prepare_gpu_env(){
  # Verify SPARK compatibility
  RAPIDS_RUNTIME="$(get_metadata_attribute 'rapids-runtime' 'SPARK')"

  readonly DEFAULT_XGBOOST_VERSION="1.7.6" # try 2.1.1
  nvsmi_works="0"

  # Compiler major version tracks the CUDA major version
  if is_cuda11 ; then
    gcc_ver="11"
  elif is_cuda12 ; then
    gcc_ver="12"
  fi
}

# Hold all NVIDIA-related packages from being upgraded unintentionally, e.g. by
# services like unattended-upgrades.
# Users should run apt-mark unhold before they wish to upgrade these packages.
#
# The package patterns are quoted so that they reach apt-mark verbatim; left
# unquoted, the shell would expand them against files in the current
# directory whenever a file name happens to match.
function hold_nvidia_packages() {
  apt-mark hold 'nvidia-*'
  apt-mark hold 'libnvidia-*'
  # Only hold the X.org NVIDIA driver when dpkg lists it
  if dpkg -l | grep -q "xserver-xorg-video-nvidia"; then
    apt-mark hold 'xserver-xorg-video-nvidia*'
  fi
}

# Create MIG GPU instances (and their compute instances, -C) using the profile
# list from the MIG_CGI metadata attribute, or "9,9" when that attribute
# cannot be read.
function configure_mig_cgi() {
  # Probe the attribute in a subshell; on failure use the default split
  if ! (/usr/share/google/get_metadata_value attributes/MIG_CGI); then
    # Dataproc only supports A100's right now split in 2 if not specified
    nvidia-smi mig -cgi 9,9  -C
    return
  fi

  META_MIG_CGI_VALUE=$(/usr/share/google/get_metadata_value attributes/MIG_CGI)
  nvidia-smi mig -cgi $META_MIG_CGI_VALUE -C
}

# Request MIG mode on the GPUs by running `nvidia-smi -mig 1`.
function enable_mig() {
  nvidia-smi -mig 1
}


# Make the module-signing key pair available at ${mok_key} / ${mok_der}.
#
# Globals read:    PSN, CA_TMPDIR, mok_key, mok_der
# Globals written: modulus_md5sum
# Relies on get_metadata_attribute being defined elsewhere.
#
# When ${CA_TMPDIR}/db.rsa already exists, it is checked against the expected
# modulus md5sum (if one is provided in metadata) and re-linked. Otherwise the
# private and public material is fetched from Secret Manager with gcloud,
# written under ${CA_TMPDIR}, and linked/copied into place.
function configure_dkms_certs() {
  # PSN set but empty: nothing to sign with
  if test -v PSN && [[ -z "${PSN}" ]]; then
      echo "No signing secret provided.  skipping";
      return 0
  fi

  mkdir -p "${CA_TMPDIR}"

  # If the private key exists, verify it
  if [[ -f "${CA_TMPDIR}/db.rsa" ]]; then
    echo "Private key material exists"

    local expected_modulus_md5sum
    expected_modulus_md5sum=$(get_metadata_attribute modulus_md5sum)
    if [[ -n "${expected_modulus_md5sum}" ]]; then
      modulus_md5sum="${expected_modulus_md5sum}"

      # Verify that the private key's modulus md5sum matches the expected md5sum
      # (a mismatch is only reported, it does not abort)
      if [[ "${modulus_md5sum}" != "$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')" ]]; then
        echo "unmatched rsa key"
      fi

      # Verify that the x509 certificate's modulus md5sum matches the expected md5sum
      # (a mismatch is only reported, it does not abort)
      if [[ "${modulus_md5sum}" != "$(openssl x509 -noout -modulus -in ${mok_der} | openssl md5 | awk '{print $2}')" ]]; then
        echo "unmatched x509 cert"
      fi
    else
      # No expected value supplied: record the md5sum of the key we have
      modulus_md5sum="$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')"
    fi
    ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}"

    return
  fi

  # Retrieve cloud secrets keys
  local sig_priv_secret_name
  sig_priv_secret_name="${PSN}"
  local sig_pub_secret_name
  sig_pub_secret_name="$(get_metadata_attribute public_secret_name)"
  local sig_secret_project
  sig_secret_project="$(get_metadata_attribute secret_project)"
  local sig_secret_version
  sig_secret_version="$(get_metadata_attribute secret_version)"

  # If metadata values are not set, do not write mok keys
  if [[ -z "${sig_priv_secret_name}" ]]; then return 0 ; fi

  # Write private material to volatile storage
  gcloud secrets versions access "${sig_secret_version}" \
         --project="${sig_secret_project}" \
         --secret="${sig_priv_secret_name}" \
      | dd status=none of="${CA_TMPDIR}/db.rsa"

  # Write public material to volatile storage
  # (the public secret is stored base64-encoded and is decoded on the way out)
  gcloud secrets versions access "${sig_secret_version}" \
         --project="${sig_secret_project}" \
         --secret="${sig_pub_secret_name}" \
      | base64 --decode \
      | dd status=none of="${CA_TMPDIR}/db.der"

  local mok_directory="$(dirname "${mok_key}")"
  mkdir -p "${mok_directory}"

  # symlink private key and copy public cert from volatile storage to DKMS directory
  ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}"
  cp  -f "${CA_TMPDIR}/db.der" "${mok_der}"

  # Record the md5sum of the freshly installed key's modulus
  modulus_md5sum="$(openssl rsa -noout -modulus -in "${mok_key}" | openssl md5 | awk '{print $2}')"
}

# Remove the private signing material: the ${CA_TMPDIR} directory and the
# ${mok_key} link. Does nothing (beyond a note on stderr) when PSN is empty.
function clear_dkms_key {
  if [[ -n "${PSN}" ]]; then
    rm -rf "${CA_TMPDIR}" "${mok_key}"
    return
  fi

  echo "No signing secret provided.  skipping" >&2
  return 0
}

# Inspect the Secure Boot state and prepare module-signing configuration.
#
# Globals written: PSN (readonly), CA_TMPDIR (readonly), mok_key, mok_der
# Relies on get_metadata_attribute, le_debian11, is_ubuntu and
# configure_dkms_certs being defined elsewhere.
# Exits 1 when Secure Boot is enabled on Debian 11 or older; returns 1 when
# Secure Boot is enabled but no private secret name was supplied.
function check_secure_boot() {
  local SECURE_BOOT="disabled"
  SECURE_BOOT=$(mokutil --sb-state|awk '{print $2}')

  PSN="$(get_metadata_attribute private_secret_name)"
  readonly PSN

  if [[ "${SECURE_BOOT}" == "enabled" ]] ; then
    if le_debian11 ; then
      echo "Error: Secure Boot is not supported on Debian before image 2.2. Please disable Secure Boot while creating the cluster."
      exit 1
    fi
    if [[ -z "${PSN}" ]]; then
      echo "Secure boot is enabled, but no signing material provided."
      echo "Please either disable secure boot or provide signing material as per"
      echo "https://github.com/GoogleCloudDataproc/custom-images/tree/master/examples/secure-boot"
      return 1
    fi
  fi

  # -u: only a name is generated here; the directory itself is not created
  CA_TMPDIR="$(mktemp -u -d -p /run/tmp -t ca_dir-XXXX)"
  readonly CA_TMPDIR

  # Key and certificate locations differ between Ubuntu and everything else
  if is_ubuntu ; then
    mok_key=/var/lib/shim-signed/mok/MOK.priv
    mok_der=/var/lib/shim-signed/mok/MOK.der
  else
    mok_key=/var/lib/dkms/mok.key
    mok_der=/var/lib/dkms/mok.pub
  fi

  configure_dkms_certs
}


# EXIT trap handler: purge signing keys, clean caches and temporary mounts,
# report disk usage, and optionally zero free disk space.
#
# Globals read: tmpdir
# Relies on clear_dkms_key, is_debuntu, is_ubuntu, is_debian, ge_debian12,
# hold_nvidia_packages and get_metadata_attribute being defined elsewhere.
# Always returns 0.
function exit_handler() {
  # Purge private key material until next grant
  clear_dkms_key

  # From here on, failures are tolerated and tracing is turned off
  set +ex
  echo "Exit handler invoked"

  # Clear pip cache
  pip cache purge || echo "unable to purge pip cache"

  # If system memory was sufficient to mount memory-backed filesystems
  if [[ "${tmpdir}" == "/mnt/shm" ]] ; then
    # remove the tmpfs pip cache-dir
    pip config unset global.cache-dir || echo "unable to unset global pip cache"

    # Clean up shared memory mounts
    # (only tmpfs mounts that are not also declared in /etc/fstab)
    for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm /tmp /var/cudnn-local ; do
      if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then
        umount -f "${shmdir}"
      fi
    done

    # restart services stopped during preparation stage
    # systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/'
  fi

  if is_debuntu ; then
    # Clean up OS package cache
    apt-get -y -qq clean
    apt-get -y -qq -o DPkg::Lock::Timeout=60 autoremove
    # re-hold systemd package
    if ge_debian12 ; then
    apt-mark hold systemd libsystemd0 ; fi
    hold_nvidia_packages
  else
    dnf clean all
  fi

  # print disk usage statistics for large components
  if is_ubuntu ; then
    du -hs \
      /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \
      /usr/lib \
      /opt/nvidia/* \
      /usr/local/cuda-1?.? \
      /opt/conda/miniconda3 | sort -h
  elif is_debian ; then
    du -x -hs \
      /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \
      /var/lib/{docker,mysql,} \
      /usr/lib \
      /opt/nvidia/* \
      /usr/local/cuda-1?.? \
      /opt/{conda,google-cloud-ops-agent,install-nvidia,} \
      /usr/bin \
      /usr \
      /var \
      / 2>/dev/null | sort -h
  else
    du -hs \
      /var/lib/docker \
      /usr/lib/{pig,hive,hadoop,firmware,jvm,spark,atlas} \
      /usr/lib64/google-cloud-sdk \
      /usr/lib \
      /opt/nvidia/* \
      /usr/local/cuda-1?.? \
      /opt/conda/miniconda3
  fi

  # Process disk usage logs from installation period
  # Removing the flag file lets the background df sampling loop finish
  rm -f /run/keep-running-df
  sync
  sleep 5.01s
  # compute maximum size of disk during installation
  # Log file contains logs like the following (minus the preceding #):
#Filesystem     1K-blocks    Used Available Use% Mounted on
#/dev/vda2        7096908 2611344   4182932  39% /
  df / | tee -a "/run/disk-usage.log"

  # Sort the "Used" column numerically, largest first, so that $siz[0] is the
  # maximum and $siz[-1] the minimum. The comparator must be `<=>`: the
  # previous `$a => $b` was a fat comma, not a comparison, so the list was
  # never actually ordered.
  perl -e '@siz=( sort { $b <=> $a }
                   map { (split)[2] =~ /^(\d+)/ }
                  grep { m:^/: } <STDIN> );
$max=$siz[0]; $min=$siz[-1]; $inc=$max-$min;
print( "    samples-taken: ", scalar @siz, $/,
       "maximum-disk-used: $max", $/,
       "minimum-disk-used: $min", $/,
       "     increased-by: $inc", $/ )' < "/run/disk-usage.log"

  echo "exit_handler has completed"

  # zero free disk space
  # (only when the creating-image metadata attribute is non-empty)
  if [[ -n "$(get_metadata_attribute creating-image)" ]]; then
    dd if=/dev/zero of=/zero
    sync
    sleep 3s
    rm -f /zero
  fi

  return 0
}

# One-time environment preparation run before main.
#
# Checks the OS and Secure Boot state, initialises global settings (OS_NAME,
# ROLE, workdir, tmpdir, temp_bucket, pkg_bucket, uname_r, bdcfg, install_log),
# installs the EXIT trap, and - unless ${workdir}/prepare-complete already
# exists - cleans package caches, installs utility packages and starts the
# background disk-usage sampler.
# Relies on check_os, set_proxy, mount_ramdisk, repair_old_backports,
# clean_up_sources_lists, is_debuntu, ge_debian12 and get_metadata_attribute
# being defined elsewhere.
function prepare_to_install(){
  # Verify OS compatibility and Secure boot state
  check_os
  check_secure_boot

  prepare_gpu_env

  OS_NAME="$(lsb_release -is | tr '[:upper:]' '[:lower:]')"
  readonly OS_NAME

  # node role
  ROLE="$(get_metadata_attribute dataproc-role)"
  readonly ROLE

  workdir=/opt/install-dpgce
  tmpdir=/tmp/
  temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)"
  readonly temp_bucket
  readonly pkg_bucket="gs://${temp_bucket}/dpgce-packages"
  uname_r=$(uname -r)
  readonly uname_r
  readonly bdcfg="/usr/local/bin/bdconfig"
  export DEBIAN_FRONTEND=noninteractive

  mkdir -p "${workdir}"
  # From this point on, exit_handler runs whenever the script exits
  trap exit_handler EXIT
  set_proxy
  mount_ramdisk

  # NOTE(review): exit_handler compares tmpdir against /mnt/shm, so
  # mount_ramdisk presumably may reassign tmpdir - confirm against its
  # definition. install_log is derived from tmpdir only after that call.
  readonly install_log="${tmpdir}/install.log"

  # Everything below runs only once per machine
  if test -f "${workdir}/prepare-complete" ; then return ; fi

  repair_old_backports

  if is_debuntu ; then
    clean_up_sources_lists
    apt-get update -qq
    apt-get -y clean
    apt-get -o DPkg::Lock::Timeout=60 -y autoremove
    # Release the systemd hold for the duration of the install; exit_handler
    # puts it back
    if ge_debian12 ; then
    apt-mark unhold systemd libsystemd0 ; fi
    hold_nvidia_packages
  else
    dnf clean all
  fi

  # zero free disk space
  # (subshell with errexit off: dd is expected to stop when the disk is full)
  if [[ -n "$(get_metadata_attribute creating-image)" ]]; then ( set +e
    time dd if=/dev/zero of=/zero status=none ; sync ; sleep 3s ; rm -f /zero
  ) fi

  install_dependencies

  # Monitor disk usage in a screen session
  # The loop samples `df /` every 5 seconds until /run/keep-running-df is removed
  df / > "/run/disk-usage.log"
  touch "/run/keep-running-df"
  screen -d -m -LUS keep-running-df \
    bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done"

  touch "${workdir}/prepare-complete"
}

# Install the NVIDIA driver and, unless disabled through the ENABLE_MIG
# metadata attribute (default "1"), enable MIG and configure MIG instances.
#
# Globals written: META_MIG_VALUE, NUM_GPUS_WITH_DIFF_MIG_MODES
# Relies on get_metadata_attribute and install_nvidia_gpu_driver being
# defined elsewhere.
# Exits 0 early when MIG is already enabled on all GPUs; exits 1 when MIG
# could not be enabled consistently after the driver install.
function main() {
  local -a mig_mode_query=(/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader)

  # default MIG to on when this script is used
  META_MIG_VALUE=$(get_metadata_attribute 'ENABLE_MIG' "1")

  # Nothing to do on machines without an NVIDIA device
  if ! (lspci | grep -q NVIDIA); then
    return 0
  fi

  # if the first invocation, the NVIDIA drivers and tools are not installed
  if [[ $META_MIG_VALUE -ne 0 && -f "/usr/bin/nvidia-smi" ]]; then
    # check to see if we already enabled mig mode and rebooted so we don't end
    # up in infinite reboot loop
    NUM_GPUS_WITH_DIFF_MIG_MODES="$("${mig_mode_query[@]}" | uniq | wc -l)"
    if [[ $NUM_GPUS_WITH_DIFF_MIG_MODES -ne 1 ]]; then
      echo "More than 1 GPU with MIG configured differently between them"
    elif ("${mig_mode_query[@]}" | grep Enabled); then
      echo "MIG is enabled on all GPUs, configuring instances"
      configure_mig_cgi
      exit 0
    else
      echo "GPUs present but MIG is not enabled"
    fi
  fi

  install_nvidia_gpu_driver

  if [[ ${META_MIG_VALUE} -eq 0 ]]; then
    echo "Not enabling MIG"
    return
  fi

  enable_mig
  NUM_GPUS_WITH_DIFF_MIG_MODES="$("${mig_mode_query[@]}" | uniq | wc -l)"
  if [[ "${NUM_GPUS_WITH_DIFF_MIG_MODES}" -ne 1 ]]; then
    echo "MIG is NOT enabled all on GPUs.  Failing"
    exit 1
  fi
  if ! ("${mig_mode_query[@]}" | grep Enabled); then
    echo "MIG is configured on but NOT enabled.  Failing"
    exit 1
  fi

  echo "MIG is fully enabled, we don't need to reboot"
  configure_mig_cgi
}

# Script entry point: prepare the environment (this also installs the EXIT
# trap), then run the driver / MIG workflow.
prepare_to_install

main
====== Filename: ./examples/secure-boot/build-current-images.sh ======
#!/bin/bash

# Copyright 2024 Google LLC and contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This script creates a custom image pre-loaded with
#
# GPU drivers + cuda + rapids + cuDNN + nccl + tensorflow + pytorch + ipykernel + numba

# To run the script, the following will bootstrap
#
# git clone git@github.com:GoogleCloudDataproc/custom-images
# cd custom-images
# git checkout 2025.02
# cp examples/secure-boot/env.json.sample env.json
# vi env.json
# docker build -f Dockerfile -t custom-images-builder:latest .
# time docker run -it custom-images-builder:latest bash examples/secure-boot/build-current-images.sh


# -e: abort on the first failing command; -x: trace each command as it runs
set -ex

# Run a command up to three times, pausing 5 seconds after each failure.
#
# Arguments: the command and its arguments; they are joined into one string
#            and evaluated, so the shell re-parses them.
# Globals:   tmpdir (read) - output of each attempt goes to
#            ${tmpdir}/install.log and is printed when the attempt fails.
# Returns:   0 as soon as one attempt succeeds, 1 after three failures.
# The body is a subshell, so option changes and variables stay contained.
function execute_with_retries() (
  set +x
  local -r cmd="$*"
  local install_log="${tmpdir}/install.log"
  local attempt

  for attempt in 1 2 3 ; do
    set -x
    if eval "$cmd" > "${install_log}" 2>&1 ; then
      retval=0
    else
      retval=$?
      cat "${install_log}"
    fi
    set +x
    if [[ $retval == 0 ]] ; then return 0 ; fi
    sleep 5
  done
  return 1
)

# Create the customization service account if it is missing, generate the
# signing key pair, and grant the account the roles it needs.
#
# Globals read: GSA, SA_NAME, PROJECT_ID, public_secret_name,
#   private_secret_name
# NOTE(review): the two secret names are not assigned in this block -
#   presumably the evaluated output of create-key-pair.sh sets them; confirm.
function configure_service_account() {
  local gsa_member="serviceAccount:${GSA}"

  # Create service account
  if gcloud iam service-accounts list --filter email="${GSA}" 2>&1 | grep -q 'Listed 0 items.' ; then
    # Create service account for this purpose
    echo "creating pre-init customization service account ${GSA}"
    gcloud iam service-accounts create "${SA_NAME}" \
      --description="Service account for pre-init customization" \
      --display-name="${SA_NAME}"
  fi

  # Move any existing tls directory aside under a timestamped name, then
  # evaluate whatever the key-pair script prints
  if [[ -d tls ]] ; then
    mv tls "tls-$(date +%s)"
  fi
  eval "$(bash examples/secure-boot/create-key-pair.sh)"

  # Project-level Dataproc worker role
  execute_with_retries gcloud projects add-iam-policy-binding "${PROJECT_ID}" \
    --member="${gsa_member}" --role="roles/dataproc.worker" --condition=None

  # Grant the service account access to buckets in this project
  # TODO: this is over-broad and should be limited only to the buckets
  # used by these clusters
  for storage_object_role in 'User' 'Creator' 'Viewer' ; do
    execute_with_retries gcloud projects add-iam-policy-binding "${PROJECT_ID}" \
      --member="${gsa_member}" --role="roles/storage.object${storage_object_role}" --condition=None
  done

  # Grant the service account permission to list and access each secret
  for secret in "${public_secret_name}" "${private_secret_name}" ; do
    for sm_role in 'viewer' 'secretAccessor' ; do
      execute_with_retries gcloud secrets -q add-iam-policy-binding "${secret}" \
        --member="${gsa_member}" --role="roles/secretmanager.${sm_role}" --condition=None
    done
  done

  # Compute instance administration on the project
  execute_with_retries gcloud projects add-iam-policy-binding "${PROJECT_ID}" \
    --member="${gsa_member}" --role=roles/compute.instanceAdmin.v1 --condition=None

  # Allow the service account to act as itself
  execute_with_retries gcloud iam service-accounts add-iam-policy-binding "${GSA}" \
    --member="${gsa_member}" --role=roles/iam.serviceAccountUser --condition=None
}

# Remove every IAM binding granted to the customization service account, in
# the same order configure_service_account adds them.
#
# Globals read: GSA, PROJECT_ID, public_secret_name, private_secret_name
function revoke_bindings() {
  local gsa_member="serviceAccount:${GSA}"

  # Project-level Dataproc worker role
  execute_with_retries gcloud projects remove-iam-policy-binding "${PROJECT_ID}" \
    --member="${gsa_member}" --role="roles/dataproc.worker" --condition=None

  # Revoke the service account's access to buckets in this project
  for storage_object_role in 'User' 'Creator' 'Viewer' ; do
    execute_with_retries gcloud projects remove-iam-policy-binding "${PROJECT_ID}" \
      --member="${gsa_member}" --role="roles/storage.object${storage_object_role}" --condition=None
  done

  # Revoke the service account's permission to list and access the secret
  for secret in "${public_secret_name}" "${private_secret_name}" ; do
    for sm_role in 'viewer' 'secretAccessor' ; do
      execute_with_retries gcloud secrets -q remove-iam-policy-binding "${secret}" \
        --member="${gsa_member}" --role="roles/secretmanager.${sm_role}" --condition=None
    done
  done

  # Compute instance administration on the project
  execute_with_retries gcloud projects remove-iam-policy-binding "${PROJECT_ID}" \
    --member="${gsa_member}" --role=roles/compute.instanceAdmin.v1 --condition=None

  # The account's right to act as itself
  execute_with_retries gcloud iam service-accounts remove-iam-policy-binding "${GSA}" \
    --member="${gsa_member}" --role=roles/iam.serviceAccountUser --condition=None
}


# Load build settings from env.json in the current directory.
# Both DOMAIN and PRINCIPAL_DOMAIN are read from the same .DOMAIN key.
export DOMAIN="$(jq           -r .DOMAIN        env.json)"
export PROJECT_ID="$(jq       -r .PROJECT_ID    env.json)"
export PURPOSE="$(jq          -r .PURPOSE       env.json)"
export BUCKET="$(jq           -r .BUCKET        env.json)"
export SECRET_NAME="$(jq      -r .SECRET_NAME   env.json)"
export REGION="$(jq           -r .REGION        env.json)"
export ZONE="$(jq             -r .ZONE          env.json)"
export PRINCIPAL_USER="$(jq   -r .PRINCIPAL     env.json)"
export PRINCIPAL_DOMAIN="$(jq -r .DOMAIN        env.json)"
export PRINCIPAL="${PRINCIPAL_USER}@${PRINCIPAL_DOMAIN}"

# Point gcloud at the configured project and account, then authenticate
echo -n "setting gcloud config..."
gcloud config set project "${PROJECT_ID}"
gcloud config set account "${PRINCIPAL}"
gcloud auth login

# Update the gcloud region/zone settings only where they differ from env.json
CURRENT_COMPUTE_REGION="$(gcloud config get compute/region)"
if [[ "${CURRENT_COMPUTE_REGION}" != "${REGION}" ]]; then
    echo "setting compute region"
    gcloud config set compute/region "${REGION}"
fi
CURRENT_DATAPROC_REGION="$(gcloud config get dataproc/region)"
if [[ "${CURRENT_DATAPROC_REGION}" != "${REGION}" ]]; then
    echo "setting dataproc region"
    gcloud config set dataproc/region "${REGION}"
fi
CURRENT_COMPUTE_ZONE="$(gcloud config get compute/zone)"
if [[ "${CURRENT_COMPUTE_ZONE}" != "${ZONE}" ]]; then
    echo "setting compute zone"
    gcloud config set compute/zone "${ZONE}"
fi
SA_NAME="sa-${PURPOSE}"

# Build the service account e-mail. A PROJECT_ID of the form "prefix:name"
# is rewritten to "name.prefix" for the host part of the address.
if [[ "${PROJECT_ID}" =~ ":" ]] ; then
  GSA="${SA_NAME}@${PROJECT_ID#*:}.${PROJECT_ID%:*}.iam.gserviceaccount.com"
else
  GSA="${SA_NAME}@${PROJECT_ID}.iam.gserviceaccount.com"
fi

# One timestamp per run; it also names the per-run scratch directory
readonly timestamp="$(date "+%Y%m%d-%H%M%S")"
export timestamp

export tmpdir=/tmp/${timestamp};
mkdir -p ${tmpdir}

configure_service_account

# screen session name
session_name="build-current-images"

# Snapshot the existing instances (in this zone) and images as JSON files
# under the scratch directory
export ZONE="$(jq -r .ZONE env.json)"
gcloud compute instances list --zones "${ZONE}" --format json > ${tmpdir}/instances.json
gcloud compute images    list                   --format json > ${tmpdir}/images.json

# Run generation scripts simultaneously for each dataproc image version
screen -L -US "${session_name}" -c examples/secure-boot/pre-init.screenrc

# Summarise disk usage from the logs of finished custom-image builds.
#
# Prints the 'Customization script' lines found in
# /tmp/custom-image-*/logs/workflow.log, then a header row, then one row per
# build produced by examples/secure-boot/genline.pl from the tail of the
# matching startup-script.log.
# NOTE(review): not invoked anywhere in the visible part of this script.
function find_disk_usage() {
  #  grep maximum-disk-used /tmp/custom-image-*/logs/startup-script.log
  grep -H 'Customization script' /tmp/custom-image-*/logs/workflow.log
  echo '# DP_IMG_VER       RECOMMENDED_DISK_SIZE   DSK_SZ  D_USED   D_FREE  D%F     PURPOSE'
# workflow_log=/tmp/custom-image-dataproc-2-0-deb10-20250424-232955-tf-20250425-230559/logs/workflow.log
  # Iterates over the word-split output of grep -l, so log paths are assumed
  # to contain no whitespace
  for workflow_log in $(grep -Hl "Customization script" /tmp/custom-image-*/logs/workflow.log) ; do
    # The startup log has the same path with "workflow" replaced (first match)
    startup_log="${workflow_log/workflow/startup-script}"
    # Drop lines starting with "[", keep the last 20 lines of the df-style
    # table, and hand them to the formatter
    grep -v '^\['  "${startup_log}" \
      | grep -A20 'Filesystem.*Avail' | tail -20 \
      | perl examples/secure-boot/genline.pl "${startup_log}"
  done
}

# Once the screen session has returned, remove the IAM bindings granted by
# configure_service_account
revoke_bindings
====== Filename: ./examples/secure-boot/no-customization.sh ======
#!/bin/bash

# EXIT trap handler: stop the disk-usage sampler, summarise the samples and
# optionally zero free disk space. Always returns 0.
# NOTE(review): get_metadata_attribute is not defined in the visible part of
# this script - confirm it is provided by the environment it runs in.
function exit_handler() {
  # Failures are tolerated and tracing is off while cleaning up
  set +ex
  echo "Exit handler invoked"

  # Process disk usage logs from installation period
  # Removing the flag file lets the background df sampling loop finish
  rm -f /run/keep-running-df
  sync
  sleep 5.01s
  # compute maximum size of disk during installation
  # Log file contains logs like the following (minus the preceding #):
#Filesystem     1K-blocks    Used Available Use% Mounted on
#/dev/vda2        7096908 2611344   4182932  39% /
  df / | tee -a "/run/disk-usage.log"

  # The first and last samples give starting/ending usage. All samples are
  # then sorted numerically, largest first, so that $siz[0] is the maximum and
  # $siz[-1] the minimum. The comparator must be `<=>`: the previous
  # `$a => $b` was a fat comma, not a comparison, so the list was never
  # actually ordered.
  perl -e '($first, @samples) = grep { m:^/: } <STDIN>;
           unshift(@samples,$first); $final=$samples[-1];
           ($starting)=(split(/\s+/,$first))[2] =~ /^(\d+)/;
             ($ending)=(split(/\s+/,$final))[2] =~ /^(\d+)/;
           @siz=( sort { $b <=> $a }
                   map { (split)[2] =~ /^(\d+)/ } @samples );
$max=$siz[0]; $min=$siz[-1]; $inc=$max-$starting;
print( "     samples-taken: ", scalar @siz, $/,
       "starting-disk-used: $starting", $/,
       "  ending-disk-used: $ending", $/,
       " maximum-disk-used: $max", $/,
       " minimum-disk-used: $min", $/,
       "      increased-by: $inc", $/ )' < "/run/disk-usage.log"

  # zero free disk space
  # (only when the creating-image metadata attribute is non-empty)
  if [[ -n "$(get_metadata_attribute creating-image)" ]]; then
    dd if=/dev/zero of=/zero
    sync
    sleep 3s
    rm -f /zero
  fi

  echo "exit_handler has completed"
  return 0
}

# Monitor disk usage in a screen session
# The first sample starts the log; the detached loop appends a new `df /`
# sample every 5 seconds until /run/keep-running-df is removed.
df / | tee "/run/disk-usage.log"
touch "/run/keep-running-df"
screen -d -m -LUS keep-running-df \
  bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done"

# Summarise the samples when the script exits
trap exit_handler EXIT

# No customization is performed; wait long enough to collect a few samples
sleep 30s

echo "exit handler will be triggered after this operation."
====== Filename: ./examples/secure-boot/pre-init.sh ======
#!/bin/bash
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This script creates a custom image with the script specified loaded
#
# pre-init.sh <dataproc version>

# Version comparison helpers built on `sort -V`.
# Each takes two version strings and returns 0 when the relation holds.

# $1 >= $2 : $1 is the last entry once both are version-sorted
function version_ge(){
  local newest
  newest="$(echo -e "$1\n$2"|sort -V|tail -n1)"
  [[ "$1" = "${newest}" ]]
}

# $1 > $2 : not equal, and $1 >= $2
function version_gt(){
  if [[ "$1" = "$2" ]] ; then return 1 ; fi
  version_ge "$1" "$2"
}

# $1 <= $2 : $1 is the first entry once both are version-sorted
function version_le(){
  local oldest
  oldest="$(echo -e "$1\n$2"|sort -V|head -n1)"
  [[ "$1" = "${oldest}" ]]
}

# $1 < $2 : not equal, and $1 <= $2
function version_lt(){
  if [[ "$1" = "$2" ]] ; then return 1 ; fi
  version_le "$1" "$2"
}

# Abort on the first failing command
set -e

# Image version: first command-line argument, falling back to env.json
IMAGE_VERSION="$1"
if [[ -z "${IMAGE_VERSION}" ]] ; then
IMAGE_VERSION="$(jq    -r .IMAGE_VERSION        env.json)" ; fi
# Remaining settings come from env.json in the current directory
PROJECT_ID="$(jq       -r .PROJECT_ID           env.json)"
PURPOSE="$(jq          -r .PURPOSE              env.json)"
BUCKET="$(jq           -r .BUCKET               env.json)"
TEMP_BUCKET="$(jq      -r .TEMP_BUCKET          env.json)"
ZONE="$(jq             -r .ZONE                 env.json)"
SUBNET="$(jq           -r .SUBNET               env.json)"
HIVE_NAME="$(jq        -r .HIVE_INSTANCE_NAME   env.json)"
HIVEDB_PW_URI="$(jq    -r .DB_HIVE_PASSWORD_URI env.json)"
SECRET_NAME="$(jq      -r .SECRET_NAME          env.json)"
KMS_KEY_URI="$(jq      -r .KMS_KEY_URI          env.json)"

PRINCIPAL_USER="$(jq   -r .PRINCIPAL            env.json)"
PRINCIPAL_DOMAIN="$(jq -r .DOMAIN               env.json)"
PRINCIPAL="${PRINCIPAL_USER}@${PRINCIPAL_DOMAIN}"
gcloud config set project "${PROJECT_ID}"
gcloud config set account "${PRINCIPAL}"

# Region is the zone with its trailing "-<letters>" suffix removed
region="$(echo "${ZONE}" | perl -pe 's/-[a-z]+$//')"

custom_image_zone="${ZONE}"
disk_size_gb="30" # greater than or equal to 30 (32 for rocky8)

SA_NAME="sa-${PURPOSE}"
# NOTE(review): unlike build-current-images.sh, a PROJECT_ID containing ":"
# is not rewritten here - confirm whether such project ids must be supported.
GSA="${SA_NAME}@${PROJECT_ID}.iam.gserviceaccount.com"

# If no OS family specified, default to debian
# NOTE(review): a bare version not listed below leaves dataproc_version unset.
if [[ "${IMAGE_VERSION}" != *-* ]] ; then
  case "${IMAGE_VERSION}" in
    "2.3" ) dataproc_version="${IMAGE_VERSION}-debian12" ;;
    "2.2" ) dataproc_version="${IMAGE_VERSION}-debian12" ;;
    "2.1" ) dataproc_version="${IMAGE_VERSION}-debian11" ;;
    "2.0" ) dataproc_version="${IMAGE_VERSION}-debian10" ;;
    "1.5" ) dataproc_version="${IMAGE_VERSION}-debian10" ;;
  esac
else
  dataproc_version="${IMAGE_VERSION}"
fi

# Per-image settings: CUDA version, short version tag used in image names,
# and disk size where the default of 30 is overridden.
# NOTE(review): an unlisted dataproc_version keeps the default CUDA_VERSION
# and leaves short_dp_ver unset.
CUDA_VERSION="12.4.1"
case "${dataproc_version}" in
  "1.5-debian10"     ) CUDA_VERSION="11.5.2" ; short_dp_ver=1.5-deb10 ; disk_size_gb="20";;
  "2.0-debian10"     ) CUDA_VERSION="12.1.1" ; short_dp_ver=2.0-deb10 ;;
  "2.0-rocky8"       ) CUDA_VERSION="12.1.1" ; short_dp_ver=2.0-roc8 ; disk_size_gb="32";;
  "2.0-ubuntu18"     ) CUDA_VERSION="12.1.1" ; short_dp_ver=2.0-ubu18 ;;
  "2.1-debian11"     ) CUDA_VERSION="12.4.1" ; short_dp_ver=2.1-deb11 ;;
  "2.1-rocky8"       ) CUDA_VERSION="12.4.1" ; short_dp_ver=2.1-roc8 ;;
  "2.1-ubuntu20"     ) CUDA_VERSION="12.4.1" ; short_dp_ver=2.1-ubu20 ;;
  "2.1-ubuntu20-arm" ) CUDA_VERSION="12.4.1" ; short_dp_ver=2.1-ubu20-arm ;;
  "2.2-debian12"     ) CUDA_VERSION="12.6.3" ; short_dp_ver=2.2-deb12 ;;
  "2.2-rocky9"       ) CUDA_VERSION="12.6.3" ; short_dp_ver=2.2-roc9 ;;
  "2.2-ubuntu22"     ) CUDA_VERSION="12.6.3" ; short_dp_ver=2.2-ubu22 ;;
  "2.3-debian12"     ) CUDA_VERSION="12.6.3" ; short_dp_ver=2.3-deb12 ;;
  "2.3-rocky9"       ) CUDA_VERSION="12.6.3" ; short_dp_ver=2.3-roc9 ;;
  "2.3-ubuntu22"     ) CUDA_VERSION="12.6.3" ; short_dp_ver=2.3-ubu22 ;;
  "2.3-ml-ubuntu22"  ) CUDA_VERSION="12.6.3" ; short_dp_ver=2.3-ml-ubu22 ; disk_size_gb="50";;
esac

#######################################
# Run generate_custom_image.py on an a3-highgpu-2g machine with two
# nvidia-h100-80gb accelerators.
# Arguments: additional generate_custom_image.py options.  They are forwarded
#   with "$@" so that values containing whitespace or glob characters, and
#   empty values, reach python exactly as the caller wrote them.
# Returns:   exit status of generate_custom_image.py
#######################################
function create_h100_instance() {
  python generate_custom_image.py \
    --machine-type         "a3-highgpu-2g" \
    --accelerator          "type=nvidia-h100-80gb,count=2" \
    "$@"
}

#######################################
# Run generate_custom_image.py on an n1-standard-32 machine with one
# nvidia-tesla-t4 accelerator.
# Arguments: additional generate_custom_image.py options.  They are forwarded
#   with "$@" so that values containing whitespace or glob characters, and
#   empty values, reach python exactly as the caller wrote them.
# Returns:   exit status of generate_custom_image.py
#######################################
function create_t4_instance() {
  python generate_custom_image.py \
    --machine-type         "n1-standard-32" \
    --accelerator          "type=nvidia-tesla-t4,count=1" \
    "$@"
}

#######################################
# Run generate_custom_image.py on an n1-standard-2 machine without any
# accelerator.
# Arguments: additional generate_custom_image.py options.  They are forwarded
#   with "$@" so that values containing whitespace or glob characters, and
#   empty values, reach python exactly as the caller wrote them.
# Returns:   exit status of generate_custom_image.py
#######################################
function create_unaccelerated_instance() {
  python generate_custom_image.py \
    --machine-type         "n1-standard-2" \
    "$@"
}

# Extra "--optional-components=..." option for generate(); empty means none.
OPTIONAL_COMPONENTS_ARG=""

#######################################
# Build the custom image "dataproc-<short version>-<timestamp>-<PURPOSE>".
#
# Does nothing when an image of that name is already listed in
# ${tmpdir}/images.json.  A leftover "<name>-install" image or instance from
# an aborted run is deleted first.  Metadata and the instance shape are chosen
# from the name of ${customization_script}.  The build is retried for as long
# as it fails without having produced a non-empty startup-script.log.
#
# Globals read:    short_dp_ver, timestamp, PURPOSE, tmpdir, TEMP_BUCKET, ZONE,
#                  customization_script, PROJECT_ID, region, HIVE_NAME,
#                  HIVEDB_PW_URI, KMS_KEY_URI, CUDA_VERSION, GSA,
#                  custom_image_zone, disk_size_gb, BUCKET, SUBNET,
#                  OPTIONAL_COMPONENTS_ARG
# Globals written: create_function, plus whatever create-key-pair.sh emits
# Side effects:    turns errexit off (it is not restored) and toggles xtrace
# Arguments:       extra generate_custom_image.py options, forwarded verbatim
# Returns:         0 when the image exists or was built; exits 1 when a build
#                  fails after writing a non-empty startup-script.log
#######################################
function generate() {
#  local image_name="${PURPOSE}-${timestamp}-${dataproc_version//\./-}"
  local image_name="dataproc-${short_dp_ver//\./-}-${timestamp}-${PURPOSE}"

  local image="$(jq -r ".[] | select(.name == \"${image_name}\").name" "${tmpdir}/images.json")"

  if [[ -n "${image}" ]] ; then
    echo "Image already exists"
    return
  fi

  local metadata="invocation-type=custom-images"
  metadata="${metadata},dataproc-temp-bucket=${TEMP_BUCKET}"

  local install_image="$(jq -r ".[] | select(.name == \"${image_name}-install\").name" "${tmpdir}/images.json")"
  if [[ -n "${install_image}" ]] ; then
    echo "Install image already exists.  Cleaning up after aborted run."
    gcloud -q compute images delete "${image_name}-install"
  fi

  local instance="$(jq -r ".[] | select(.name == \"${image_name}-install\").name" "${tmpdir}/instances.json")"

  if [[ -n "${instance}" ]]; then
    # if previous run ended without cleanup...
    echo "cleaning up instance from previous run"
    gcloud -q compute instances delete "${image_name}-install" --zone "${ZONE}"
  fi

  create_function="create_unaccelerated_instance"

  if [[ "${customization_script}" =~ "cloud-sql-proxy.sh"  ]] ; then
    metadata="${metadata},hive-metastore-instance=${PROJECT_ID}:${region}:${HIVE_NAME}"
    metadata="${metadata},db-hive-password-uri=${HIVEDB_PW_URI}"
    metadata="${metadata},kms-key-uri=${KMS_KEY_URI}"
  fi

  # For actions requiring access to the MOK during runtime, pass the requisite
  # metadata to extract the signing material
  if [[ "${customization_script}" =~ "install_gpu_driver.sh" ]] ; then
    eval "$(bash examples/secure-boot/create-key-pair.sh)"
    metadata="${metadata},public_secret_name=${public_secret_name}"
    metadata="${metadata},private_secret_name=${private_secret_name}"
    metadata="${metadata},secret_project=${secret_project}"
    metadata="${metadata},secret_version=${secret_version}"
    metadata="${metadata},modulus_md5sum=${modulus_md5sum}"
  fi

  if [[ "${customization_script}" =~ "install_gpu_driver.sh" ]] ; then
    metadata="${metadata},cuda-version=${CUDA_VERSION}"
    metadata="${metadata},include-pytorch=1"
    create_function="create_t4_instance"
  fi

  if [[ "${customization_script}" =~ "spark-rapids.sh" ]] ; then
    metadata="${metadata},rapids-runtime=SPARK"
    create_function="create_t4_instance"
  elif [[ "${customization_script}" =~ "rapids.sh" ]] ; then
    # elif, not a second if: "spark-rapids.sh" also contains "rapids.sh", and
    # two independent tests appended both SPARK and DASK to the metadata.
    metadata="${metadata},rapids-runtime=DASK"
    create_function="create_t4_instance"
  fi

  # check for known retry-able errors after failed completion
  local do_retry=1
  set -x
  while [[ "${do_retry}" == "1" ]]; do
    do_retry=0
    # errexit off so that a failed build reaches the retry logic below
    set +e
    # "$@" keeps the caller's extra options as separate, unsplit words.
    # OPTIONAL_COMPONENTS_ARG is deliberately unquoted: empty means "no option".
    "${create_function}" \
      --image-name           "${image_name}" \
      --customization-script "${customization_script}" \
      --service-account      "${GSA}" \
      --metadata             "${metadata}" \
      --zone                 "${custom_image_zone}" \
      --disk-size            "${disk_size_gb}" \
      --gcs-bucket           "${BUCKET}" \
      --subnet               "${SUBNET}" \
      ${OPTIONAL_COMPONENTS_ARG} \
      --shutdown-instance-timer-sec=30 \
      --no-smoke-test \
      "$@"
    if [[ "$?" != "0" ]]; then
      local img_build_dir="$(ls -d /tmp/custom-image-${image_name}-*)"
      # retry if the startup-script.log file does not exist or is empty
      local startup_script_log="${img_build_dir}/logs/startup-script.log"
      # The line count is read from ${startup_script_log}; the previous
      # "$startup-script.log" expanded an unset variable and never matched.
      if [[ ! -f "${startup_script_log}" ]] || [[ "$(wc -l < "${startup_script_log}")" -eq 0 ]]; then
        do_retry=1
        # park the failed build directory so the next attempt starts clean
        mkdir -p /tmp/old
        mv "${img_build_dir}" /tmp/old
      else
        exit 1
      fi
    fi
  done
  set +x
}

# Build the image starting from a released Dataproc image version.
# Arguments: $1 - value handed to generate as --dataproc-version
function generate_from_dataproc_version() {
  generate --dataproc-version "$1"
}

#######################################
# Build the image on top of a prerelease ("-rc01") Dataproc base image taken
# from the cloud-dataproc project.
#
# Globals read:    dataproc_version
# Globals written: image_uri
# Arguments:       none are used
# Returns:         status of generate; 1 (without calling generate) when no
#                  prerelease image is listed for ${dataproc_version}.
#                  Previously an unknown version fell through the case and
#                  generate received an unset or stale image_uri.
#######################################
function generate_from_prerelease_version() {
  # base image -> tensorflow
  local img_pfx="https://www.googleapis.com/compute/v1/projects/cloud-dataproc/global/images"
#  local src_timestamp="20250410-165100"
  local src_timestamp="20250505-045100"
  case "${dataproc_version}" in
    # 1.5 is pinned to a fixed build instead of ${src_timestamp}
    "1.5-debian10"     ) image_uri="${img_pfx}/dataproc-1-5-deb10-20230909-165100-rc01" ;;
    "2.0-debian10"     ) image_uri="${img_pfx}/dataproc-2-0-deb10-${src_timestamp}-rc01"  ;;
    "2.0-rocky8"       ) image_uri="${img_pfx}/dataproc-2-0-roc8-${src_timestamp}-rc01"   ;;
    "2.0-ubuntu18"     ) image_uri="${img_pfx}/dataproc-2-0-ubu18-${src_timestamp}-rc01"  ;;
    "2.1-debian11"     ) image_uri="${img_pfx}/dataproc-2-1-deb11-${src_timestamp}-rc01"  ;;
    "2.1-rocky8"       ) image_uri="${img_pfx}/dataproc-2-1-roc8-${src_timestamp}-rc01"   ;;
    "2.1-ubuntu20"     ) image_uri="${img_pfx}/dataproc-2-1-ubu20-${src_timestamp}-rc01"  ;;
    "2.1-ubuntu20-arm" ) image_uri="${img_pfx}/dataproc-2-1-ubu20-arm-${src_timestamp}-rc01"  ;;
    "2.2-debian12"     ) image_uri="${img_pfx}/dataproc-2-2-deb12-${src_timestamp}-rc01"  ;;
    "2.2-rocky9"       ) image_uri="${img_pfx}/dataproc-2-2-roc9-${src_timestamp}-rc01"   ;;
    "2.2-ubuntu22"     ) image_uri="${img_pfx}/dataproc-2-2-ubu22-${src_timestamp}-rc01"  ;;
    "2.3-debian12"     ) image_uri="${img_pfx}/dataproc-2-3-deb12-${src_timestamp}-rc01"  ;;
    "2.3-rocky9"       ) image_uri="${img_pfx}/dataproc-2-3-roc9-${src_timestamp}-rc01"   ;;
    "2.3-ubuntu22"     ) image_uri="${img_pfx}/dataproc-2-3-ubu22-${src_timestamp}-rc01"  ;;
    "2.3-ml-ubuntu22"  ) image_uri="${img_pfx}/dataproc-2-3-ml-ubu22-${src_timestamp}-rc01"  ;;
    * )
      echo "No prerelease base image is known for dataproc version '${dataproc_version}'" >&2
      return 1
      ;;
  esac
  generate --base-image-uri "${image_uri}"
}

#######################################
# Build the image on top of an image produced by an earlier generate run in
# the same project: dataproc-<short version>-<timestamp>-<purpose>.
#
# Every dot in short_dp_ver is replaced (//), matching how generate derives
# the name of the image it creates, so the two names always agree.
#
# Globals read: PROJECT_ID, short_dp_ver, timestamp
# Arguments:    $1 - PURPOSE suffix of the image to use as the base
# Returns:      status of generate
#######################################
function generate_from_base_purpose() {
  local img_pfx="https://www.googleapis.com/compute/v1/projects/${PROJECT_ID}/global/images"
  generate --base-image-uri "${img_pfx}/dataproc-${short_dp_ver//\./-}-${timestamp}-${1}"
}

# Build sequence.  Each step sets PURPOSE (the image-name suffix) and
# customization_script, then calls one of the generate_from_* helpers.
# disk_size_gb is a global read by generate, so the value assigned by a case
# statement stays in effect for every later step until it is reassigned.
# Steps written as "echo time ..." only print the command; they do not build.

# base image -> secure-boot

# Install secure-boot certs without customization
PURPOSE="secure-boot"
customization_script="examples/secure-boot/no-customization.sh"
time generate_from_dataproc_version "${dataproc_version}"

#time generate_from_prerelease_version "${dataproc_version}"

# NOTE(review): version_ge is not defined in this part of the file; presumably
# it is true when IMAGE_VERSION is at least 2.3 - confirm against its definition.
if version_ge "${IMAGE_VERSION}" "2.3" ; then

  ## run the installer for the DOCKER optional component on the secure-boot image
  PURPOSE="docker"
  OPTIONAL_COMPONENTS_ARG='--optional-components=DOCKER'
  customization_script="examples/secure-boot/no-customization.sh"
  time generate_from_base_purpose "secure-boot"

  ## run the installer for the ZEPPELIN optional component on the secure-boot image
  PURPOSE="zeppelin"
  OPTIONAL_COMPONENTS_ARG='--optional-components=ZEPPELIN'
  customization_script="examples/secure-boot/no-customization.sh"
  time generate_from_base_purpose "secure-boot"

  ## run the installer for the PIG optional component on top of the "docker"
  ## image built above, giving an image with both DOCKER and PIG
  PURPOSE="docker-pig"
  OPTIONAL_COMPONENTS_ARG='--optional-components=PIG'
  customization_script="examples/secure-boot/no-customization.sh"
  time generate_from_base_purpose "docker"

fi

# No optional components for the remaining steps.
OPTIONAL_COMPONENTS_ARG=""

## cloud-sql-proxy.sh on the secure-boot image (printed only, not run)
PURPOSE="cloud-sql-proxy"
customization_script="examples/secure-boot/cloud-sql-proxy.sh"
echo time generate_from_base_purpose "secure-boot"

# secure-boot -> tensorflow

# Disk size for the "tf" image and the steps that follow it.
case "${dataproc_version}" in
# DP_IMG_VER       RECOMMENDED_DISK_SIZE   DSK_SZ  D_USED   D_FREE  D%F     DATE_SAMPLED

  "2.0-debian10"     ) disk_size_gb="36" ;; #  35.20G  30.74G    2.91G  92% / # 20250507-083009-tf
  "2.0-rocky8"       ) disk_size_gb="43" ;; #  48.79G  36.34G   12.45G  75% / # 20250507-083009-tf
  "2.0-ubuntu18"     ) disk_size_gb="38" ;; #  36.65G  32.24G    4.39G  89% / # 20250507-083009-tf

  "2.1-debian11"     ) disk_size_gb="42" ;; #  41.11G  35.82G    3.50G  92% / # 20250507-083009-tf
  "2.1-rocky8"       ) disk_size_gb="45" ;; #  59.79G  38.41G   21.39G  65% / # 20250429-193537-tf
  "2.1-ubuntu20"     ) disk_size_gb="42" ;; #  47.31G  36.02G   11.27G  77% / # 20250507-083009-tf
  "2.1-ubuntu20-arm" ) disk_size_gb="45" ;; # 39.55G 39.39G   0.15G 100% / # pre-init-2-1-ubuntu20

  "2.2-debian12"     ) disk_size_gb="51" ;; #  58.82G  43.88G   12.44G  78% / # 20250429-193537-tf
  "2.2-rocky9"       ) disk_size_gb="51" ;; #  49.79G  43.51G    6.28G  88% / # 20250429-193537-tf
  "2.2-ubuntu22"     ) disk_size_gb="50" ;; #  48.28G  43.32G    4.94G  90% / # 20250429-193537-tf

  "2.3-debian12"     ) disk_size_gb="42" ;; #  41.11G  36.20G    3.12G  93% / # 20250507-083009-tf
  "2.3-rocky9"       ) disk_size_gb="44" ;; #  49.79G  37.82G   11.98G  76% / # 20250507-083009-tf
  "2.3-ubuntu22"     ) disk_size_gb="42" ;; #  40.52G  36.18G    4.33G  90% / # 20250507-083009-tf
  "2.3-ml-ubuntu22"  ) disk_size_gb="70" ;; #  40.52G  36.18G    4.33G  90% / # 20250507-083009-tf

esac

# install_gpu_driver.sh on the secure-boot image.  For this script generate
# adds the signing-secret, cuda-version and include-pytorch metadata and
# builds on a T4 instance.
PURPOSE="tf"
customization_script="examples/secure-boot/install_gpu_driver.sh"
time generate_from_base_purpose "secure-boot"

## spark-rapids.sh on the "tf" image
PURPOSE="spark"
customization_script="examples/secure-boot/spark-rapids.sh"
time generate_from_base_purpose "tf"

## mig.sh on the "tf" image (printed only, not run)
PURPOSE="mig-pre-init"
customization_script="examples/secure-boot/mig.sh"
echo time generate_from_base_purpose "tf"

# tf image -> rapids
# Versions without an arm here (2.3-*) keep the disk size chosen above.
case "${dataproc_version}" in
  "2.0-debian10" ) disk_size_gb="41" ;; # 40.12G 37.51G   0.86G  98% / # rapids-pre-init-2-0-debian10
  "2.0-rocky8"   ) disk_size_gb="41" ;; # 38.79G 38.04G   0.76G  99% / # rapids-pre-init-2-0-rocky8
  "2.0-ubuntu18" ) disk_size_gb="40" ;; # 37.62G 36.69G   0.91G  98% / # rapids-pre-init-2-0-ubuntu18
  "2.1-debian11" ) disk_size_gb="44" ;; # 42.09G 39.77G   0.49G  99% / # rapids-pre-init-2-1-debian11
  "2.1-rocky8"   ) disk_size_gb="44" ;; # 43.79G 41.11G   2.68G  94% / # rapids-pre-init-2-1-rocky8
  "2.1-ubuntu20" ) disk_size_gb="45" ;; # 39.55G 39.39G   0.15G 100% / # rapids-pre-init-2-1-ubuntu20
  "2.1-ubuntu20-arm" ) disk_size_gb="45" ;; # 39.55G 39.39G   0.15G 100% / # pre-init-2-1-ubuntu20
  "2.2-debian12" ) disk_size_gb="46" ;; # 44.06G 41.73G   0.41G 100% / # rapids-pre-init-2-2-debian12
  "2.2-rocky9"   ) disk_size_gb="45" ;; # 44.79G 42.29G   2.51G  95% / # rapids-pre-init-2-2-rocky9
  "2.2-ubuntu22" ) disk_size_gb="46" ;; # 42.46G 41.97G   0.48G  99% / # rapids-pre-init-2-2-ubuntu22
esac

#disk_size_gb="45"

# rapids.sh (dask with rapids) on the "tf" image (printed only, not run)
PURPOSE="rapids"
customization_script="examples/secure-boot/rapids.sh"
echo time generate_from_base_purpose "tf"
#time generate_from_base_purpose "cuda-pre-init"

## dask.sh (dask without rapids) on the secure-boot image (printed only, not run)
PURPOSE="dask"
customization_script="examples/secure-boot/dask.sh"
echo time generate_from_base_purpose "secure-boot"
#time generate_from_base_purpose "cuda-pre-init"

# tf image -> pytorch
case "${dataproc_version}" in
  "2.0-debian10"     ) disk_size_gb="44" ;; # 40.12G 37.51G   0.86G  98% / # pre-init-2-0-debian10
  "2.0-rocky8"       ) disk_size_gb="41" ;; # 38.79G 38.04G   0.76G  99% / # pre-init-2-0-rocky8
  "2.0-ubuntu18"     ) disk_size_gb="44" ;; # 37.62G 36.69G   0.91G  98% / # pre-init-2-0-ubuntu18
  "2.1-debian11"     ) disk_size_gb="44" ;; # 42.09G 39.77G   0.49G  99% / # pre-init-2-1-debian11
  "2.1-rocky8"       ) disk_size_gb="44" ;; # 43.79G 41.11G   2.68G  94% / # pre-init-2-1-rocky8
  "2.1-ubuntu20"     ) disk_size_gb="45" ;; # 39.55G 39.39G   0.15G 100% / # pre-init-2-1-ubuntu20
  "2.1-ubuntu20-arm" ) disk_size_gb="45" ;; # 39.55G 39.39G   0.15G 100% / # pre-init-2-1-ubuntu20
  "2.2-debian12"     ) disk_size_gb="48" ;; # 44.06G 41.73G   0.41G 100% / # pre-init-2-2-debian12
  "2.2-rocky9"       ) disk_size_gb="48" ;; # 44.79G 42.29G   2.51G  95% / # pre-init-2-2-rocky9
  "2.2-ubuntu22"     ) disk_size_gb="46" ;; # 42.46G 41.97G   0.48G  99% / # pre-init-2-2-ubuntu22
  "2.3-debian12"     ) disk_size_gb="42" ;; # 41.11G 36.20G   3.12G  93% / # 20250507-083009-tf
  "2.3-rocky9"       ) disk_size_gb="44" ;; # 49.79G 37.82G  11.98G  76% / # 20250507-083009-tf
  "2.3-ubuntu22"     ) disk_size_gb="42" ;; # 40.52G 36.18G   4.33G  90% / # 20250507-083009-tf
  "2.3-ml-ubuntu22"  ) disk_size_gb="60" ;; # 40.52G 36.18G   4.33G  90% / # 20250507-083009-tf
esac

## pytorch.sh on the "tf" image (printed only, not run)
PURPOSE="pytorch"
customization_script="examples/secure-boot/pytorch.sh"
echo time generate_from_base_purpose "tf"

====== Filename: ./examples/secure-boot/install-nvidia-driver-debian11.sh ======
#!/bin/bash
#
# Installs the NVIDIA driver user-space components, builds the open GPU kernel
# modules, signs them with the key pair fetched from Secret Manager, and then
# installs the CUDA toolkit.
#
# Instance metadata attributes read: public_secret_name, private_secret_name,
# secret_project, secret_version, modulus_md5sum.

# pipefail: without it a failed "gcloud secrets ... | dd" would silently write
# an empty key file and the failure would only surface later, at signing time.
set -xeuo pipefail

WORKDIR=/opt/install-nvidia-driver
mkdir -p "${WORKDIR}"
cd "${WORKDIR}"

nv_driver_ver="550.54.14"
nv_cuda_ver="12.4.0"

# read secret name, project, version
sig_pub_secret_name="$(/usr/share/google/get_metadata_value attributes/public_secret_name)"
sig_priv_secret_name="$(/usr/share/google/get_metadata_value attributes/private_secret_name)"
sig_secret_project="$(/usr/share/google/get_metadata_value attributes/secret_project)"
sig_secret_version="$(/usr/share/google/get_metadata_value attributes/secret_version)"
expected_modulus_md5sum="$(/usr/share/google/get_metadata_value attributes/modulus_md5sum)"
readonly expected_modulus_md5sum

# Private directory for the signing material.  mktemp -d creates it atomically
# (no "-u" name-then-mkdir race), and without "-t" the directory is always
# placed under /run/tmp instead of under $TMPDIR when that happens to be set.
mkdir -p /run/tmp
ca_tmpdir="$(mktemp -d -p /run/tmp ca_dir-XXXXXX)"
readonly ca_tmpdir
# Do not leave the private key behind, whether the script succeeds or fails.
trap 'rm -rf -- "${ca_tmpdir}"' EXIT

# The Microsoft Corporation UEFI CA 2011
ms_uefi_ca="${ca_tmpdir}/MicCorUEFCA2011_2011-06-27.crt"
if [[ ! -f "${ms_uefi_ca}" ]]; then
  curl -L -o "${ms_uefi_ca}" "https://go.microsoft.com/fwlink/p/?linkid=321194"
fi

# Write private material to volatile storage
gcloud secrets versions access "${sig_secret_version}" \
       --project="${sig_secret_project}" \
       --secret="${sig_priv_secret_name}" \
    | dd of="${ca_tmpdir}/db.rsa"

readonly cacert_der="${ca_tmpdir}/db.der"
gcloud secrets versions access "${sig_secret_version}" \
       --project="${sig_secret_project}" \
       --secret="${sig_pub_secret_name}" \
    | base64 --decode \
    | dd of="${cacert_der}"

mokutil --sb-state

# configure the nvidia-container-toolkit package source
# https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
  | gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg

curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \
  | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \
  | tee /etc/apt/sources.list.d/nvidia-container-toolkit.list

# add non-free components
sed -i -e 's/ main$/ main contrib non-free/' /etc/apt/sources.list

# update package cache
apt-get update -qq

# install nvidia-container-toolkit and kernel headers
apt-get --no-install-recommends -qq -y install \
     nvidia-container-toolkit \
     "linux-headers-$(uname -r)"

apt-get clean
apt-get autoremove -y

# fetch .run file; -f makes an HTTP error fail here instead of saving an
# error page that would then be executed by bash
curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
  "https://download.nvidia.com/XFree86/Linux-x86_64/${nv_driver_ver}/NVIDIA-Linux-x86_64-${nv_driver_ver}.run" \
  -o driver.run
# Install all but kernel driver
bash driver.run --no-kernel-modules --silent --install-libglvnd
rm driver.run

# Fetch open source kernel module with corresponding tag
git clone https://github.com/NVIDIA/open-gpu-kernel-modules.git --branch "${nv_driver_ver}" --single-branch
cd "${WORKDIR}/open-gpu-kernel-modules"
#
# build kernel modules
#
make -j"$(nproc)" modules > /var/log/open-gpu-kernel-modules-build.log
# sign every built module; NUL-delimited so no path is ever word-split
while IFS= read -r -d '' module; do
    "/lib/modules/$(uname -r)/build/scripts/sign-file" sha256 \
      "${ca_tmpdir}/db.rsa" \
      "${cacert_der}" \
      "${module}"
done < <(find kernel-open -name '*.ko' -print0)
# install
make modules_install >> /var/log/open-gpu-kernel-modules-build.log
# rebuild module index
depmod -a
cd "${WORKDIR}"

#
# Install CUDA
#
cuda_runfile="cuda_${nv_cuda_ver}_${nv_driver_ver}_linux.run"
curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
     "https://developer.download.nvidia.com/compute/cuda/${nv_cuda_ver}/local_installers/${cuda_runfile}" \
     -o cuda.run
bash cuda.run --silent --toolkit --no-opengl-libs
rm cuda.run
====== Filename: ./examples/secure-boot/cuda.sh ======
#!/bin/bash
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This script creates a custom image pre-loaded with cuda
#
# Configuration (PROJECT_ID, PURPOSE, BUCKET, IMAGE_VERSION, ZONE) is read
# from env.json in the current directory.  The build service account is given
# temporary access to the bucket and the signing secrets; that access is
# revoked again when the script exits, whether or not the build succeeded.

set -ex

export PROJECT_ID="$(jq    -r .PROJECT_ID    env.json)"
export PURPOSE="$(jq       -r .PURPOSE       env.json)"
export BUCKET="$(jq        -r .BUCKET        env.json)"
export IMAGE_VERSION="$(jq -r .IMAGE_VERSION env.json)"
export ZONE="$(jq          -r .ZONE          env.json)"

custom_image_zone="${ZONE}"
disk_size_gb="50" # greater than or equal to 30

SA_NAME="sa-${PURPOSE}"
GSA="${SA_NAME}@${PROJECT_ID}.iam.gserviceaccount.com"

gcloud config set project "${PROJECT_ID}"

gcloud auth login

# Keep any previous certificate store out of the way, then (re)create the key
# pair; the helper prints the variable assignments that are eval'd here.
if [[ -d tls ]] ; then mv tls "tls-$(date +%s)" ; fi
eval "$(bash examples/secure-boot/create-key-pair.sh)"

metadata="public_secret_name=${public_secret_name}"
metadata="${metadata},private_secret_name=${private_secret_name}"
metadata="${metadata},secret_project=${secret_project}"
metadata="${metadata},secret_version=${secret_version}"
# Pass the modulus checksum too when create-key-pair.sh provided one.
if [[ -n "${modulus_md5sum:-}" ]] ; then
  metadata="${metadata},modulus_md5sum=${modulus_md5sum}"
fi

#######################################
# Revoke the temporary IAM bindings granted below.
# Installed as an EXIT trap: with "set -e" a failed image build ends the
# script immediately, and revoking only at the end of the script left the
# service account with access to the private signing secret.
# Globals read: private_secret_name, BUCKET, PROJECT_ID, GSA
#######################################
function revoke_bindings() {
  set +x
  # Best effort: keep revoking even when one binding is already gone.
  set +e

  # Revoke permission to access the private secret
  gcloud secrets remove-iam-policy-binding "${private_secret_name}" \
    --member="serviceAccount:${GSA}" \
    --role="roles/secretmanager.secretAccessor" > /dev/null 2>&1

  # Revoke access to bucket
  gcloud storage buckets remove-iam-policy-binding "gs://${BUCKET}" \
    --member="serviceAccount:${GSA}" \
    --role="roles/storage.objectViewer" > /dev/null 2>&1

  # Revoke access to list secrets for the project
  gcloud projects remove-iam-policy-binding "${PROJECT_ID}" \
    --member="serviceAccount:${GSA}" \
    --role="roles/secretmanager.viewer" > /dev/null 2>&1
}
trap revoke_bindings EXIT

if gcloud iam service-accounts list --filter email="${GSA}" 2>&1 | grep 'Listed 0 items.' ; then
  # Create service account for this purpose
  echo "creating pre-init customization service account ${GSA}"
  gcloud iam service-accounts create "${SA_NAME}" \
    --description="Service account for pre-init customization" \
    --display-name="${SA_NAME}"
fi

# Grant service account access to bucket
gcloud storage buckets add-iam-policy-binding "gs://${BUCKET}" \
  --member="serviceAccount:${GSA}" \
  --role="roles/storage.objectViewer"

# Grant the service account access to list secrets for the project
gcloud projects add-iam-policy-binding "${PROJECT_ID}" \
  --member="serviceAccount:${GSA}" \
  --role="roles/secretmanager.viewer"

# Grant service account permission to access the private secret
gcloud secrets add-iam-policy-binding "${private_secret_name}" \
  --member="serviceAccount:${GSA}" \
  --role="roles/secretmanager.secretAccessor"

# Grant service account permission to access the public secret
gcloud secrets add-iam-policy-binding "${public_secret_name}" \
  --member="serviceAccount:${GSA}" \
  --role="roles/secretmanager.secretAccessor"

# If no OS family specified, default to debian
if [[ "${IMAGE_VERSION}" != *-* ]] ; then
  case "${IMAGE_VERSION}" in
    "2.3" | "2.2" ) dataproc_version="${IMAGE_VERSION}-debian12" ;;
    "2.1"         ) dataproc_version="${IMAGE_VERSION}-debian11" ;;
    "2.0"         ) dataproc_version="${IMAGE_VERSION}-debian10" ;;
    * )
      # Without this arm dataproc_version stayed unset and the image name and
      # --dataproc-version below were empty.
      echo "Unsupported IMAGE_VERSION '${IMAGE_VERSION}': no default OS family is known" >&2
      exit 1
      ;;
  esac
else
  dataproc_version="${IMAGE_VERSION}"
fi

customization_script="examples/secure-boot/install_gpu_driver.sh"
image_name="cuda-${dataproc_version/\./-}-$(date +%F-%H-%M)"

python generate_custom_image.py \
    --accelerator "type=nvidia-tesla-t4" \
    --image-name "${image_name}" \
    --dataproc-version "${dataproc_version}" \
    --trusted-cert "tls/db.der" \
    --customization-script "${customization_script}" \
    --service-account "${GSA}" \
    --metadata "${metadata}" \
    --zone "${custom_image_zone}" \
    --disk-size "${disk_size_gb}" \
    --no-smoke-test \
    --gcs-bucket "${BUCKET}" \
    --shutdown-instance-timer-sec=30
====== Filename: ./CONTRIBUTING.md ======
# How to become a contributor and submit your own code

## Contributor License Agreements

We'd love to accept your patches! Before we can take them, we
have to jump a couple of legal hurdles.

Please fill out either the individual or corporate Contributor License Agreement
(CLA).

  * If you are an individual writing original source code and you're sure you
    own the intellectual property, then you'll need to sign an
    [individual CLA](https://developers.google.com/open-source/cla/individual).
  * If you work for a company that wants to allow you to contribute your work,
    then you'll need to sign a
    [corporate CLA](https://developers.google.com/open-source/cla/corporate).

Follow either of the two links above to access the appropriate CLA and
instructions for how to sign and return it. Once we receive it, we'll be able to
accept your pull requests.

## Coding Practices
1. [Write small PRs](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/getting-started/best-practices-for-pull-requests#write-small-prs). This helps reviewers give feedback and understand why something was changed.
1. [Provide context and guidance](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/getting-started/best-practices-for-pull-requests#provide-context-and-guidance) in the title/description.
1. Squash commits into a single final commit when merging, so that intermediate changes are removed and the commit history stays clean.
1. Ensure that your code adheres to the existing style in the sample to which you are contributing. Shell scripts should follow the [Google shell style guide](https://google.github.io/styleguide/shell.xml).
1. Ensure that your code has an appropriate set of unit/integration tests which all pass.
   
## Contributing A Patch

1. Submit an issue describing your proposed change to the repo in question.
1. The repo owner will respond to your issue promptly.
1. If your proposed change is accepted, and you haven't already done so, sign a
   Contributor License Agreement (see details above).
1. Fork the desired repo, develop and test your code changes.
1. Ensure that your code has an appropriate set of unit tests which all pass.
1. Submit a pull request.
====== Filename: ./.gitignore ======
.vscode
__pycache__
*.pyc

# Ignore IntelliJ files.
.idea/
*.iml
*.ipr
*.iws

# MacOS folder files
.DS_Store

# Emacs
*~
\#*#
.\#*

# secure-boot certificate store
tls/
tls-*/

# Screen session logs
screenlog.*

# environment configuration file
env.json

# failed patches
*.orig
*.rej
====== Filename: ./startup_script/run.more ======
#!/bin/bash

# Copyright 2019 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#            http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# run.sh will be used by image build workflow to run custom initialization
# script when creating a custom image.
#
# Immediately after image build workflow creates an GCE instance, it will
# execute run.sh on the GCE instance that it just created:
# 1. Download user's custom init action script from cloud Storage bucket.
# 2. Run the custom init action script.
# 3. Check for init action script output, and print success or failure
#    message.
# 4. Shutdown GCE instance.

# Trace every command into the startup-script output.
set -x

# Location the customization sources are copied from.
CUSTOM_SOURCES_PATH="$(/usr/share/google/get_metadata_value attributes/custom-sources-path)"
# Seconds to wait before shutdown so that stdout can flush.
SHUTDOWN_TIMER_IN_SEC="$(/usr/share/google/get_metadata_value attributes/shutdown-timer-in-sec)"

# Requested optional components: lower-cased, with every '.' turned into ','.
USER_DATAPROC_COMPONENTS="$(
  /usr/share/google/get_metadata_value attributes/optional-components \
    | tr '[:upper:]' '[:lower:]' \
    | tr '.' ',' \
    || echo ""
)"
BDUTIL_DIR="/usr/local/share/google/dataproc/bdutil"
# First three characters of the dataproc-version attribute, '-' turned into '.'.
DATAPROC_VERSION="$(
  /usr/share/google/get_metadata_value attributes/dataproc-version \
    | cut -c1-3 \
    | tr '-' '.' \
    || echo ""
)"

# Set to "true" by wait_until_ready once customization may start.
ready=""

# True when version $1 sorts at or before version $2 under `sort -V`.
function version_le() {
  local lowest
  lowest="$(echo -e "$1\n$2" | sort -V | head -n1)"
  [[ "$1" = "${lowest}" ]]
}
# True when version $1 is strictly lower than version $2.
function version_lt() {
  if [[ "$1" = "$2" ]]; then
    return 1
  fi
  version_le "$1" "$2"
}

# `gcloud storage` is a more performant replacement for `gsutil`, available
# from the 402.0.0 release of the gcloud sdk; older SDKs fall back to gsutil.
gcloud_sdk_version="$(gcloud --version | awk -F'SDK ' '/Google Cloud SDK/ {print $2}')"

if version_lt "${gcloud_sdk_version}" "402.0.0" ; then
  gsutil_cmd="gsutil"
  gsutil_cp_cmd="gsutil -m cp"
else
  gsutil_cmd="gcloud storage"
  gsutil_cp_cmd="gcloud storage cp"
fi

# Set ready="true" once the copy tool can be used.
# On Ubuntu the tool may only appear after /snap is mounted, so poll for the
# first word of ${gsutil_cmd} up to 11 times, 5 seconds apart; on timeout a
# BuildFailed line is printed and ready is left unchanged.  On every other
# distribution ready is set immediately.
function wait_until_ready() {
  local os_id
  os_id="$(. /etc/os-release && echo "${ID}")"

  if [[ "${os_id}" != ubuntu ]]; then
    ready="true"
    return
  fi

  for i in {0..10}; do
    sleep 5

    if command -v "${gsutil_cmd/ *}" >/dev/null; then
      ready="true"
      return
    fi

    if [[ "${i}" -eq 10 ]]; then
      echo "BuildFailed: timed out waiting for gsutil to be available on Ubuntu."
    fi
  done
}

# Recursively copy everything under ${CUSTOM_SOURCES_PATH} into the current
# directory with the copy command held in ${gsutil_cp_cmd}.
# Returns: exit status of the copy command
function download_scripts() {
  local -a copy_cmd
  # the command is stored as one string; split it into its words
  read -r -a copy_cmd <<< "${gsutil_cp_cmd}"
  "${copy_cmd[@]}" -r "${CUSTOM_SOURCES_PATH}/*" ./
}

#######################################
# Download the customization sources and run ./init_actions.sh, then print
# the BuildSucceeded / BuildFailed marker line.
#
# The customization script is actually executed here.  Before, nothing ran
# between the download and "RET_CODE=$?", so RET_CODE held the status of
# "set -x" and the function always reported BuildSucceeded.
#
# Globals read:    CUSTOM_SOURCES_PATH
# Globals written: RET_CODE (exit status of init_actions.sh)
# Returns:         1 when the download fails; otherwise the status of the
#                  final echo
#######################################
function run_custom_script() {
  if ! download_scripts; then
    echo "BuildFailed: failed to download scripts from ${CUSTOM_SOURCES_PATH}."
    return 1
  fi

  # run the customization script with tracing
  bash -x ./init_actions.sh

  # get return code
  RET_CODE=$?

  # print failure message if install fails
  if [[ $RET_CODE -ne 0 ]]; then
    echo "BuildFailed: Dataproc Initialization Actions Failed. Please check your initialization script."
  else
    echo "BuildSucceeded: Dataproc Initialization Actions Succeeded."
  fi
}

# Remove files that must not end up in the image: the ~/.config and ~/.gsutil
# directories created by the copy tool (they hold transient authentication
# keys for the gcs bucket), and the customization and bootstrap scripts
# init_actions.sh and run.sh in the current directory.
function cleanup() {
  local -a credential_dirs=(~/.config/ ~/.gsutil/)
  local -a build_scripts=(./init_actions.sh ./run.sh)

  rm -rf "${credential_dirs[@]}"
  rm "${build_scripts[@]}"
}

#######################################
# Check whether DATAPROC_VERSION is at least the given version.
#
# Versions are compared component-wise with `sort -V`, so "2.10" counts as
# newer than "2.3"; the earlier decimal comparison through bc treated 2.10 as
# 2.1 and also required bc to be installed.
#
# Globals read: DATAPROC_VERSION
# Arguments:    $1 - minimum version
# Returns:      0 when DATAPROC_VERSION >= $1, 1 otherwise (including when
#               DATAPROC_VERSION is empty)
#######################################
function is_version_at_least() {
  local -r VERSION=$1
  if [[ -z "${DATAPROC_VERSION}" ]]; then
    return 1
  fi
  local lowest
  lowest="$(printf '%s\n' "${VERSION}" "${DATAPROC_VERSION}" | sort -V | head -n1)"
  # the minimum sorts first (or ties) exactly when DATAPROC_VERSION >= VERSION
  if [[ "${lowest}" == "${VERSION}" ]]; then
    return 0
  else
    return 1
  fi
}

# Source ${BDUTIL_DIR}/install_optional_components.sh, but only when the
# Dataproc version is at least 2.3 and optional components were requested.
function run_install_optional_components_script() {
  if is_version_at_least "2.3" && [[ -n "$USER_DATAPROC_COMPONENTS" ]]; then
    source "${BDUTIL_DIR}/install_optional_components.sh"
  fi
}

# Entry point: wait for the copy tool, run the customization steps when it is
# ready, then pause for SHUTDOWN_TIMER_IN_SEC seconds and halt the instance.
# The shutdown happens whether or not customization ran.
function main() {
  wait_until_ready

  case "${ready}" in
    true)
      run_install_optional_components_script
      run_custom_script
      cleanup
      ;;
  esac

  echo "Sleep ${SHUTDOWN_TIMER_IN_SEC}s before shutting down..."
  echo "You can change the timeout value with --shutdown-instance-timer-sec"
  # give stdout time to flush before the machine goes down
  sleep "${SHUTDOWN_TIMER_IN_SEC}"
  shutdown -h now
}

# Run the workflow, passing along the script's own arguments.
main "$@"
====== Filename: ./startup_script/README.md ======
[GCE VM startup script](https://cloud.google.com/compute/docs/startupscript)
which downloads and runs the user-provided customization script.
====== Filename: ./startup_script/run.ng.sh ======
#!/bin/bash

# Copyright 2019 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#            http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# run.sh will be used by image build workflow to run custom initialization
# script when creating a custom image.
#
# Immediately after image build workflow creates an GCE instance, it will
# execute run.sh on the GCE instance that it just created:
# 1. Download user's custom init action script from cloud Storage bucket.
# 2. Run the custom init action script.
# 3. Check for init action script output, and print success or failure
#    message.
# 4. Shutdown GCE instance.

# Trace every command into the startup-script output.
set -x

# Location the customization sources are copied from.
CUSTOM_SOURCES_PATH="$(/usr/share/google/get_metadata_value attributes/custom-sources-path)"
# Seconds to wait before shutdown so that stdout can flush.
SHUTDOWN_TIMER_IN_SEC="$(/usr/share/google/get_metadata_value attributes/shutdown-timer-in-sec)"

# Requested optional components: lower-cased, with every '.' turned into ','.
USER_DATAPROC_COMPONENTS="$(
  /usr/share/google/get_metadata_value attributes/optional-components \
    | tr '[:upper:]' '[:lower:]' \
    | tr '.' ',' \
    || echo ""
)"
BDUTIL_DIR="/usr/local/share/google/dataproc/bdutil"
# First three characters of the dataproc-version attribute, '-' turned into '.'.
DATAPROC_VERSION="$(
  /usr/share/google/get_metadata_value attributes/dataproc-version \
    | cut -c1-3 \
    | tr '-' '.' \
    || echo ""
)"

# Set to "true" by wait_until_ready once customization may start.
ready=""

# True when version $1 sorts at or before version $2 under `sort -V`.
function version_le() {
  local lowest
  lowest="$(echo -e "$1\n$2" | sort -V | head -n1)"
  [[ "$1" = "${lowest}" ]]
}
function version_lt(){ [[ "$1" = "$2" ]]&& return 1 || version_le "$1" "$2";}

# Pick the storage CLI. Starting with release 402.0.0 of the gcloud SDK,
# `gcloud storage` is a more performant replacement for `gsutil`; anything
# older keeps using `gsutil` (with -m for the copy command).
gcloud_sdk_version="$(gcloud --version | awk -F'SDK ' '/Google Cloud SDK/ {print $2}')"

if version_lt "${gcloud_sdk_version}" "402.0.0" ; then
  gsutil_cmd="gsutil"
  gsutil_cp_cmd="${gsutil_cmd} -m cp"
else
  gsutil_cmd="gcloud storage"
  gsutil_cp_cmd="${gsutil_cmd} cp"
fi

#######################################
# Waits until the storage CLI can be used and records the outcome.
# On Ubuntu, wait until /snap is mounted, so that gsutil is available: the
# command is probed up to 11 times, 5 seconds apart. Other distributions are
# considered ready immediately.
# Globals:
#   gsutil_cmd (read)  - only its first word (the executable) is probed
#   ready      (write) - set to "true" once the command is found
# Outputs:
#   A "BuildFailed:" line on stdout when the wait times out.
# Returns:
#   0 in every case; callers inspect ${ready}.
#######################################
function wait_until_ready() {
  local os_id
  os_id="$(. /etc/os-release && echo "${ID}")"

  if [[ "${os_id}" != ubuntu ]]; then
    ready="true"
    return 0
  fi

  local -r max_attempts=11
  local attempt
  for ((attempt = 1; attempt <= max_attempts; attempt++)); do
    # Probe first so that an already usable CLI costs no delay at all.
    if command -v "${gsutil_cmd/ *}" >/dev/null; then
      ready="true"
      return 0
    fi

    # Sleeping after the last failed probe would only postpone the failure.
    if ((attempt < max_attempts)); then
      sleep 5
    fi
  done

  echo "BuildFailed: timed out waiting for gsutil to be available on Ubuntu."
}

# Recursively copies everything under ${CUSTOM_SOURCES_PATH} into the current
# directory with the selected storage copy command.
# Returns the exit status of that copy command.
function download_scripts() {
  local -r sources_glob="${CUSTOM_SOURCES_PATH}/*"

  # gsutil_cp_cmd is left unquoted on purpose: it holds a command plus its
  # arguments and has to be split into separate words.
  ${gsutil_cp_cmd} -r "${sources_glob}" ./
}

#######################################
# Downloads the customization sources and runs ./init_actions.sh under screen.
# Globals:
#   CUSTOM_SOURCES_PATH (read)  - only used in the download failure message
#   BUILD_TMP_DIR, screen_dir, log_dir, screen_rcfile, screen_success,
#   screen_logfile, RET_CODE (written)
# Outputs:
#   Progress messages and a final "BuildSucceeded:" / "BuildFailed:" line on
#   stdout.
# Returns:
#   1 when the download fails; otherwise the status of the final echo.
# Side effects:
#   Enables `set -x` and `set -euo pipefail`; these shell options stay active
#   for the rest of the script after this function returns. Exits the script
#   with status 1 when screen cannot be installed.
#######################################
function run_custom_script() {
  if ! download_scripts; then
    echo "BuildFailed: failed to download scripts from ${CUSTOM_SOURCES_PATH}."
    return 1
  fi

  set -x

  # Start-up script wrapper that installs screen and writes screenrc

  set -euo pipefail

  echo "preparing screenrc"

  # configure screen in which build-image.sh will run
  BUILD_TMP_DIR="/tmp/dataproc/custom-images"
  mkdir -p "${BUILD_TMP_DIR}"
  # Mount a tmpfs on /tmp unless one is already mounted there. The directories
  # are (re)created afterwards because a fresh mount hides what was made above.
  grep -q "tmpfs /tmp tmpfs" /proc/mounts || \
      mount -t tmpfs tmpfs /tmp
  screen_dir="${BUILD_TMP_DIR}/screen"
  log_dir="${BUILD_TMP_DIR}/log"
  mkdir -p "${BUILD_TMP_DIR}/"{screen,log}

  screen_rcfile="${screen_dir}/build-image.screenrc"
  screen_success="${screen_dir}/build-image.success"
  screen_logfile="${log_dir}/build-image.log"

  # The heredoc delimiter is unquoted, so ${screen_logfile} and
  # ${screen_success} are expanded while the file is written.
  # NOTE(review): this screenrc is not passed to the screen invocation below
  # (no -c option), and /root/nested_startup_script.sh is not created by this
  # script — confirm whether that is intended.
  cat > "${screen_rcfile}" <<EOSRC
deflog on
defscrollback 100000
log on
caption always
caption string "%{= kw}%-w%{wr}%n %t%{-}%+w"
logfile "${screen_logfile}"

screen -LU -t build-image 0 /bin/bash -c \
  "bash -e /root/nested_startup_script.sh && touch ${screen_success} && exit 0 || exit 1"
EOSRC

  echo "screenrc prepared."

  # make sure screen is available
  if command -v screen ; then
    echo "screen installed"
  else
    echo -n "installing screen..."
    if command -v dnf ; then
      # installed sequentially because screen is not in RHEL package index without epel
      local pkg
      for pkg in epel-release screen ; do
        # The package name must be expanded here; an escaped \$pkg would hand
        # the literal string '$pkg' to dnf.
        dnf install -y -qq "${pkg}" > /dev/null 2>&1
      done
    elif command -v apt-get ; then
      apt-get install -y -qq screen > /dev/null 2>&1
    else
      echo "unable to install screen"
      exit 1
    fi
    echo "done"
  fi

  # run init actions
  # errexit is active, so the status is captured with `||`; otherwise a failure
  # would terminate the script before the BuildFailed message is printed.
  RET_CODE=0
  screen -US bash -x ./init_actions.sh || RET_CODE=$?

  # print failure message if install fails
  if [[ $RET_CODE -ne 0 ]]; then
    echo "BuildFailed: Dataproc Initialization Actions Failed. Please check your initialization script."
  else
    echo "BuildSucceeded: Dataproc Initialization Actions Succeeded."
  fi
}

# Removes what must not end up in the image:
#   - ~/.config and ~/.gsutil, which the gsutil command creates and which hold
#     transient authentication keys used to access the GCS bucket;
#   - init_actions.sh and run.sh, the customization and bootstrap scripts.
# Returns the status of the final rm (non-zero if either script is missing).
function cleanup() {
  local -r credential_dirs=(~/.config/ ~/.gsutil/)
  local -r bootstrap_files=(./init_actions.sh ./run.sh)

  rm -rf "${credential_dirs[@]}"
  rm "${bootstrap_files[@]}"
}

#######################################
# Checks whether the image version is at least a given version.
# The comparison is component-wise (`sort -V`), so 2.10 counts as newer than
# 2.3, and it needs no `bc` binary.
# Globals:
#   DATAPROC_VERSION (read)
# Arguments:
#   $1 - minimum required version, e.g. "2.3"
# Returns:
#   0 when DATAPROC_VERSION >= $1; 1 otherwise, including when
#   DATAPROC_VERSION is empty or not a dotted number.
#######################################
function is_version_at_least() {
  local -r VERSION=$1
  # Tolerate a trailing dot (e.g. "10." when the version was cut short).
  local -r current="${DATAPROC_VERSION%.}"

  # An unknown or malformed version never satisfies a minimum.
  if [[ ! "${current}" =~ ^[0-9]+(\.[0-9]+)*$ ]]; then
    return 1
  fi

  local lowest
  lowest="$(printf '%s\n' "${VERSION}" "${current}" | sort -V | head -n1)"
  [[ "${lowest}" == "${VERSION}" ]]
}

# Sources ${BDUTIL_DIR}/install_optional_components.sh, but only when the image
# version is at least 2.3 and optional components were requested.
# Returns 0 when skipped, otherwise the status of the sourced script.
function run_install_optional_components_script() {
  if is_version_at_least "2.3" && [[ -n "$USER_DATAPROC_COMPONENTS" ]]; then
    source "${BDUTIL_DIR}/install_optional_components.sh"
  fi
}

# Entry point: waits for the storage CLI, customizes the image when it became
# available, then sleeps ${SHUTDOWN_TIMER_IN_SEC} seconds and halts the
# instance.
function main() {
  wait_until_ready

  # Customization is skipped entirely when the wait did not succeed.
  case "${ready}" in
    true)
      run_install_optional_components_script
      run_custom_script
      cleanup
      ;;
  esac

  printf '%s\n' \
    "Sleep ${SHUTDOWN_TIMER_IN_SEC}s before shutting down..." \
    "You can change the timeout value with --shutdown-instance-timer-sec"
  # wait for stdout to flush
  sleep "${SHUTDOWN_TIMER_IN_SEC}"
  shutdown -h now
}

main "$@"
====== Filename: ./startup_script/run.sh ======
#!/bin/bash

# Copyright 2019 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#            http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# run.sh will be used by image build workflow to run custom initialization
# script when creating a custom image.
#
# Immediately after the image build workflow creates a GCE instance, it will
# execute run.sh on the GCE instance that it just created:
# 1. Download user's custom init action script from a Cloud Storage bucket.
# 2. Run the custom init action script.
# 3. Check for init action script output, and print success or failure
#    message.
# 4. Shutdown GCE instance.

# Trace every command that the script runs.
set -x

# Settings below are read once from the instance metadata attributes through
# the /usr/share/google/get_metadata_value helper.

# get custom-sources-path
# (download_scripts copies "${CUSTOM_SOURCES_PATH}/*" into the working dir)
CUSTOM_SOURCES_PATH=$(/usr/share/google/get_metadata_value attributes/custom-sources-path)
# get time to wait for stdout to flush
# (main sleeps this many seconds right before shutting the instance down)
SHUTDOWN_TIMER_IN_SEC=$(/usr/share/google/get_metadata_value attributes/shutdown-timer-in-sec)

# Requested optional components: lower-cased, with every '.' turned into a
# space, so the value can later be split into an array of component names.
# NOTE(review): pipefail is not enabled at this point, so the `|| echo ""`
# fallbacks on the pipelines below are only reached when the last pipeline
# stage fails, not when the metadata lookup itself fails.
USER_DATAPROC_COMPONENTS=$( /usr/share/google/get_metadata_value attributes/optional-components | tr '[:upper:]' '[:lower:]' | tr '.' ' ' || echo "")
# First three characters of the dataproc_dataproc_version attribute with '-'
# replaced by '.'; is_version_at_least compares minimum versions against it.
DATAPROC_IMAGE_VERSION=$(/usr/share/google/get_metadata_value attributes/dataproc_dataproc_version | cut -c1-3 | tr '-' '.' || echo "")
# Image type; "standard" is used when the lookup command fails.
DATAPROC_IMAGE_TYPE=$(/usr/share/google/get_metadata_value attributes/dataproc_image_type || echo "standard")
# Exported so that it is visible to the scripts started from here.
export REGION=$(/usr/share/google/get_metadata_value attributes/dataproc-region)
# NOTE(review): errexit is not enabled (only `set -x` above), so the two
# "Sanity validation" tests below are evaluated but their result is never
# acted upon — confirm whether a failure was meant to stop the script.
[[ -n "${DATAPROC_IMAGE_TYPE}" ]] # Sanity validation
export DATAPROC_IMAGE_TYPE
[[ "${DATAPROC_IMAGE_VERSION}" =~ ^[0-9]+\.[0-9]+$ ]] # Sanity validation
export DATAPROC_IMAGE_VERSION
# Startup script that performs first boot configuration for Dataproc cluster.

# Set to "true" by wait_until_ready; main only customizes the image when it is.
ready=""

# version_le A B: succeeds when version A sorts at or before version B
# (version-aware ordering via `sort -V`).
function version_le() {
  local first_sorted
  first_sorted="$(echo -e "$1\n$2" | sort -V | head -n1)"
  [[ "$1" = "${first_sorted}" ]]
}

# version_lt A B: succeeds when version A sorts strictly before version B.
function version_lt() {
  if [[ "$1" = "$2" ]]; then
    return 1
  fi
  version_le "$1" "$2"
}

# Pick the storage CLI. Starting with release 402.0.0 of the gcloud SDK,
# `gcloud storage` is a more performant replacement for `gsutil`; anything
# older keeps using `gsutil` (with -m for the copy command).
gcloud_sdk_version="$(gcloud --version | awk -F'SDK ' '/Google Cloud SDK/ {print $2}')"

if version_lt "${gcloud_sdk_version}" "402.0.0" ; then
  gsutil_cmd="gsutil"
  gsutil_cp_cmd="${gsutil_cmd} -m cp"
else
  gsutil_cmd="gcloud storage"
  gsutil_cp_cmd="${gsutil_cmd} cp"
fi

#######################################
# Waits until the storage CLI can be used and records the outcome.
# On Ubuntu, wait until /snap is mounted, so that gsutil is available: the
# command is probed up to 11 times, 5 seconds apart. Other distributions are
# considered ready immediately.
# Globals:
#   gsutil_cmd (read)  - only its first word (the executable) is probed
#   ready      (write) - set to "true" once the command is found
# Outputs:
#   A "BuildFailed:" line on stdout when the wait times out.
# Returns:
#   0 in every case; callers inspect ${ready}.
#######################################
function wait_until_ready() {
  local os_id
  os_id="$(. /etc/os-release && echo "${ID}")"

  if [[ "${os_id}" != ubuntu ]]; then
    ready="true"
    return 0
  fi

  local -r max_attempts=11
  local attempt
  for ((attempt = 1; attempt <= max_attempts; attempt++)); do
    if command -v "${gsutil_cmd/ *}" >/dev/null; then
      ready="true"
      return 0
    fi

    # Sleeping after the last failed probe would only postpone the failure.
    if ((attempt < max_attempts)); then
      sleep 5
    fi
  done

  echo "BuildFailed: timed out waiting for gsutil to be available on Ubuntu."
}

# Recursively copies everything under ${CUSTOM_SOURCES_PATH} into the current
# directory with the selected storage copy command.
# Returns the exit status of that copy command.
function download_scripts() {
  local -r remote_glob="${CUSTOM_SOURCES_PATH}/*"

  # gsutil_cp_cmd is left unquoted on purpose: it holds a command plus its
  # arguments and has to be split into separate words.
  ${gsutil_cp_cmd} -r "${remote_glob}" ./
}

# Downloads the customization sources and runs ./init_actions.sh with tracing.
# Globals: CUSTOM_SOURCES_PATH (read, for the failure message),
#          RET_CODE (written: exit status of init_actions.sh).
# Outputs: a "BuildSucceeded:" or "BuildFailed:" line on stdout.
# Returns: 1 when the download fails, otherwise the status of the final echo.
function run_custom_script() {
  download_scripts || {
    echo "BuildFailed: failed to download scripts from ${CUSTOM_SOURCES_PATH}."
    return 1
  }

  # run init actions and remember how they ended
  bash -x ./init_actions.sh
  RET_CODE=$?

  # report the outcome of the init actions
  case "${RET_CODE}" in
    0)
      echo "BuildSucceeded: Dataproc Initialization Actions Succeeded."
      ;;
    *)
      echo "BuildFailed: Dataproc Initialization Actions Failed. Please check your initialization script."
      ;;
  esac
}

# Removes what must not end up in the image:
#   - ~/.config and ~/.gsutil, which the gsutil command creates and which hold
#     transient authentication keys used to access the GCS bucket;
#   - init_actions.sh and run.sh, the customization and bootstrap scripts.
# Returns the status of the final rm (non-zero if either script is missing).
function cleanup() {
  local -r credential_dirs=(~/.config/ ~/.gsutil/)
  local -r bootstrap_files=(./init_actions.sh ./run.sh)

  rm -rf "${credential_dirs[@]}"
  rm "${bootstrap_files[@]}"
}

#######################################
# Checks whether the image version is at least a given version.
# The comparison is component-wise (`sort -V`), so 2.10 counts as newer than
# 2.3, and it needs no `bc` binary.
# Globals:
#   DATAPROC_IMAGE_VERSION (read)
# Arguments:
#   $1 - minimum required version, e.g. "2.3"
# Returns:
#   0 when DATAPROC_IMAGE_VERSION >= $1; 1 otherwise, including when
#   DATAPROC_IMAGE_VERSION is empty or not a dotted number.
#######################################
function is_version_at_least() {
  local -r VERSION=$1
  # Tolerate a trailing dot (e.g. "10." when the version was cut short).
  local -r current="${DATAPROC_IMAGE_VERSION%.}"

  # An unknown or malformed version never satisfies a minimum.
  if [[ ! "${current}" =~ ^[0-9]+(\.[0-9]+)*$ ]]; then
    return 1
  fi

  local lowest
  lowest="$(printf '%s\n' "${VERSION}" "${current}" | sort -V | head -n1)"
  [[ "${lowest}" == "${VERSION}" ]]
}

# Installs the requested optional components by sourcing the bdutil helpers
# and install_optional_components.sh inside a subshell.
# Skipped (returning 0) when the image version is below 2.3 or when
# USER_DATAPROC_COMPONENTS is empty.
# Outputs: a "BuildSucceeded:" or "BuildFailed:" line on stdout.
function run_install_optional_components_script() {
  is_version_at_least "2.3" || return 0
  [[ -n "$USER_DATAPROC_COMPONENTS" ]] || return 0

  # The subshell keeps the sourced environment and the shell options set
  # below away from the rest of this script. It must stay a standalone
  # command: as part of an `||` list its errexit setting would be ignored.
  (
    export BDUTIL_DIR="/usr/local/share/google/dataproc/bdutil"
    # Install Optional components
    set -Ee
    source /etc/environment
    source "${BDUTIL_DIR}/bdutil_env.sh"
    source "${BDUTIL_DIR}/bdutil_helpers.sh"
    source "${BDUTIL_DIR}/bdutil_metadata.sh"
    source "${BDUTIL_DIR}/bdutil_misc.sh"
    source "${BDUTIL_DIR}/components/components-helpers.sh"
    set -x

    # Word-split the space separated list into an array of component names.
    export USER_DATAPROC_COMPONENTS=(${USER_DATAPROC_COMPONENTS})
    source "${BDUTIL_DIR}/install_optional_components.sh"
  )
  # exit status of the subshell above
  local -r install_status=$?

  if ((install_status == 0)); then
    echo "BuildSucceeded: Dataproc optional component installation Succeeded."
  else
    echo "BuildFailed: Dataproc optional component installation Failed. Please check logs."
  fi
}

# Entry point: waits for the storage CLI, customizes the image when it became
# available, then sleeps ${SHUTDOWN_TIMER_IN_SEC} seconds and halts the
# instance.
function main() {
  wait_until_ready

  # Customization is skipped entirely when the wait did not succeed.
  case "${ready}" in
    true)
      run_install_optional_components_script
      run_custom_script
      cleanup
      ;;
  esac

  printf '%s\n' \
    "Sleep ${SHUTDOWN_TIMER_IN_SEC}s before shutting down..." \
    "You can change the timeout value with --shutdown-instance-timer-sec"
  # wait for stdout to flush
  sleep "${SHUTDOWN_TIMER_IN_SEC}"
  shutdown -h now
}

main "$@"
====== Filename: ./Dockerfile ======
# Image with the Google Cloud CLI and a few editors, used to run the custom
# image tooling that is copied in from the build context.
FROM python:slim

# To build: docker build -t dataproc-custom-images:latest .
# To run: docker run -it dataproc-custom-images:latest /bin/bash

# Then from the docker bash shell, run examples/secure-boot/cuda.sh

WORKDIR /custom-images

# Tools needed to add the Google Cloud apt repository, plus jq, less and screen.
RUN apt-get -qq update \
  && apt-get -y -qq install \
     apt-transport-https ca-certificates gnupg curl jq less screen
# Register the Google Cloud package signing key and apt source.
RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg \
    | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg
RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" \
    | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list

RUN apt-get -y -qq update && apt-get -y -qq install google-cloud-cli && apt-get clean

RUN apt-get -y -qq install emacs-nox vim libmime-base64-perl && apt-get clean

# The WORKDIR instruction does not define a ${WORKDIR} variable, so the
# destination is spelled out instead of relying on an unset variable.
COPY . /custom-images/

CMD ["/bin/bash"]