====== Filename: ./cloud_build/presubmit.sh ====== #!/bin/bash # Copyright 2019 Google Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the 'License'); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an 'AS IS' BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # Run all tests python2 -m unittest discover ====== Filename: ./cloud_build/cloudbuild.yaml ====== steps: - name: 'gcr.io/cloud-builders/gcloud' id: 'presubmit' entrypoint: 'bash' args: ['cloud_build/presubmit.sh'] ====== Filename: ./custom_image_utils/__init__.py ====== ====== Filename: ./custom_image_utils/shell_script_generator.py.orig ====== # Copyright 2019,2020,2024 Google LLC. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the 'License'); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an 'AS IS' BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Shell script based image creation workflow generator. """ from datetime import datetime _template = """#!/usr/bin/env bash # Script for creating Dataproc custom image. 
set -euo pipefail RED='\\e[0;31m' GREEN='\\e[0;32m' NC='\\e[0m' base_obj_type="images" function execute_with_retries() ( set +x local -r cmd="$*" for ((i = 0; i < 3; i++)); do set -x time eval "$cmd" > "/tmp/{run_id}/install.log" 2>&1 && retval=$? || {{ retval=$? ; cat "/tmp/{run_id}/install.log" ; }} set +x if [[ $retval == 0 ]] ; then return 0 ; fi sleep 5 done return 1 ) function exit_handler() {{ echo 'Cleaning up before exiting.' if [[ -f /tmp/{run_id}/vm_created ]]; then ( set +e echo 'Deleting VM instance.' execute_with_retries \ gcloud compute instances delete {image_name}-install --project={project_id} --zone={zone} -q ) elif [[ -f /tmp/{run_id}/disk_created ]]; then echo 'Deleting disk.' execute_with_retries \ gcloud compute ${{base_obj_type}} delete {image_name}-install --project={project_id} --zone={zone} -q fi echo 'Uploading local logs to GCS bucket.' gsutil -m rsync -r {log_dir}/ {gcs_log_dir}/ if [[ -f /tmp/{run_id}/image_created ]]; then echo -e "${{GREEN}}Workflow succeeded${{NC}}, check logs at {log_dir}/ or {gcs_log_dir}/" exit 0 else echo -e "${{RED}}Workflow failed${{NC}}, check logs at {log_dir}/ or {gcs_log_dir}/" exit 1 fi }} function test_element_in_array {{ local test_element="$1" ; shift local -a test_array=("$@") for item in "${{test_array[@]}}"; do if [[ "${{item}}" == "${{test_element}}" ]]; then return 0 ; fi done return 1 }} function print_modulus_md5sum {{ local derfile="$1" openssl x509 -noout -modulus -in "${{derfile}}" | openssl md5 | awk '{{print $2}}' }} function print_img_dbs_modulus_md5sums() {{ local long_img_name="$1" local img_name="$(echo ${{long_img_name}} | sed -e 's:^.*/::')" local json_tmpfile="/tmp/{run_id}/${{img_name}}.json" gcloud compute images describe ${{long_img_name}} --format json > "${{json_tmpfile}}" local -a db_certs=() mapfile -t db_certs < <( cat ${{json_tmpfile}} | jq -r 'try .shieldedInstanceInitialState.dbs[].content' ) local -a modulus_md5sums=() for key in "${{!db_certs[@]}}" ; do local 
derfile="/tmp/{run_id}/${{img_name}}.${{key}}.der" echo "${{db_certs[${{key}}]}}" | \ perl -M'MIME::Base64(decode_base64url)' -ne 'chomp; print( decode_base64url($_) )' \ > "${{derfile}}" modulus_md5sums+=( $(print_modulus_md5sum "${{derfile}}") ) done echo "${{modulus_md5sums[@]}}" }} function main() {{ echo 'Uploading files to GCS bucket.' declare -a sources_k=({sources_map_k}) declare -a sources_v=({sources_map_v}) for i in "${{!sources_k[@]}}"; do gsutil cp "${{sources_v[i]}}" "{custom_sources_path}/${{sources_k[i]}}" > /dev/null 2>&1 done local cert_args="" local num_src_certs="0" metadata_arg="{metadata_flag}" if [[ -n '{trusted_cert}' ]] && [[ -f '{trusted_cert}' ]]; then # build tls/ directory from variables defined near the header of # the examples/secure-boot/create-key-pair.sh file eval "$(bash examples/secure-boot/create-key-pair.sh)" metadata_arg="${{metadata_arg}},public_secret_name=${{public_secret_name}},private_secret_name=${{private_secret_name}},secret_project=${{secret_project}},secret_version=${{secret_version}}" # by default, a gcloud secret with the name of efi-db-pub-key-042 is # created in the current project to store the certificate installed # as the signature database file for this disk image # The MS UEFI CA is a reasonable base from which to build trust. 
We # will trust code signed by this CA as well as code signed by # trusted_cert (tls/db.der) # The Microsoft Corporation UEFI CA 2011 local -r MS_UEFI_CA="tls/MicCorUEFCA2011_2011-06-27.crt" test -f "${{MS_UEFI_CA}}" || \ curl -L -o ${{MS_UEFI_CA}} 'https://go.microsoft.com/fwlink/p/?linkid=321194' local -a cert_list=() local -a default_cert_list default_cert_list=("{trusted_cert}" "${{MS_UEFI_CA}}") local -a src_img_modulus_md5sums=() mapfile -t src_img_modulus_md5sums < <(print_img_dbs_modulus_md5sums {dataproc_base_image}) num_src_certs="${{#src_img_modulus_md5sums[@]}}" echo "debug - num_src_certs: [${{#src_img_modulus_md5sums[*]}}]" echo "value of src_img_modulus_md5sums: [${{src_img_modulus_md5sums}}]" if [[ -z "${{src_img_modulus_md5sums}}" ]]; then num_src_certs=0 echo "no db certificates in source image" cert_list=( "${{default_cert_list[@]}}" ) else echo "${{num_src_certs}} db certificates attached to source image" echo "db certs exist in source image" for cert in ${{default_cert_list[*]}}; do if test_element_in_array "$(print_modulus_md5sum ${{cert}})" ${{src_img_modulus_md5sums[@]}} ; then echo "cert ${{cert}} is already in source image's db list" else cert_list+=("${{cert}}") fi done # append source image's cert list local img_name="$(echo {dataproc_base_image} | sed -e 's:^.*/::')" if [[ ${{#cert_list[@]}} -ne 0 ]] && compgen -G "/tmp/{run_id}/${{img_name}}.*.der" > /dev/null ; then cert_list+=(/tmp/{run_id}/${{img_name}}.*.der) fi fi if [[ ${{#cert_list[@]}} -eq 0 ]]; then echo "all certificates already included in source image's db list" else cert_args="--signature-database-file=$(IFS=, ; echo "${{cert_list[*]}}") --guest-os-features=UEFI_COMPATIBLE" fi fi date if [[ -z "${{cert_args}}" && "${{num_src_certs}}" -ne "0" ]]; then echo 'Re-using base image' base_obj_type="reuse" instance_disk_args='--image-project={project_id} --image={dataproc_base_image} --boot-disk-size={disk_size}G --boot-disk-type=pd-ssd' elif [[ -n "${{cert_args}}" ]] ; then echo 
'Creating image.' base_obj_type="images" instance_disk_args='--image-project={project_id} --image={image_name}-install --boot-disk-size={disk_size}G --boot-disk-type=pd-ssd' execute_with_retries \ gcloud compute images create {image_name}-install \ --project={project_id} \ --source-image={dataproc_base_image} \ ${{cert_args}} \ {storage_location_flag} \ --family={family} touch "/tmp/{run_id}/disk_created" else echo 'Creating disk.' base_obj_type="disks" instance_disk_args='--disk=auto-delete=yes,boot=yes,mode=rw,name={image_name}-install' execute_with_retries gcloud compute disks create {image_name}-install \ --project={project_id} \ --zone={zone} \ --image={dataproc_base_image} \ --type=pd-ssd \ --size={disk_size}GB touch "/tmp/{run_id}/disk_created" fi date echo 'Creating VM instance to run customization script.' execute_with_retries gcloud compute instances create {image_name}-install \ --project={project_id} \ --zone={zone} \ {network_flag} \ {subnetwork_flag} \ {no_external_ip_flag} \ --machine-type={machine_type} \ ${{instance_disk_args}} \ {accelerator_flag} \ {service_account_flag} \ --scopes=cloud-platform \ "${{metadata_arg}}" \ --metadata-from-file startup-script=startup_script/run.sh touch /tmp/{run_id}/vm_created # clean up intermediate install image if [[ "${{base_obj_type}}" == "images" ]] ; then ( set +e # This sometimes returns an API error but deletes the image despite the failure gcloud compute images delete -q {image_name}-install --project={project_id} ) fi echo 'Waiting for customization script to finish and VM shutdown.' execute_with_retries gcloud compute instances tail-serial-port-output {image_name}-install \ --project={project_id} \ --zone={zone} \ --port=1 2>&1 \ | grep 'startup-script' \ | sed -e 's/ {image_name}-install.*startup-script://g' \ | dd status=none bs=1 of={log_dir}/startup-script.log \ || true echo 'Checking customization script result.' 
date if grep -q 'BuildFailed:' {log_dir}/startup-script.log; then echo -e "${{RED}}Customization script failed.${{NC}}" echo "See {log_dir}/startup-script.log for details" exit 1 elif grep -q 'BuildSucceeded:' {log_dir}/startup-script.log; then echo -e "${{GREEN}}Customization script succeeded.${{NC}}" else echo 'Unable to determine the customization script result.' exit 1 fi date echo 'Creating custom image.' execute_with_retries gcloud compute images create {image_name} \ --project={project_id} \ --source-disk-zone={zone} \ --source-disk={image_name}-install \ {storage_location_flag} \ --family={family} touch /tmp/{run_id}/image_created }} trap exit_handler EXIT mkdir -p {log_dir} main "$@" 2>&1 | tee {log_dir}/workflow.log """ class Generator: """Shell script based image creation workflow generator.""" def _init_args(self, args): self.args = args if "run_id" not in self.args: self.args["run_id"] = "custom-image-{image_name}-{timestamp}".format( timestamp=datetime.now().strftime("%Y%m%d-%H%M%S"), **self.args) self.args["bucket_name"] = self.args["gcs_bucket"].replace("gs://", "") self.args["custom_sources_path"] = "gs://{bucket_name}/{run_id}/sources".format(**self.args) all_sources = { "run.sh": "startup_script/run.sh", "init_actions.sh": self.args["customization_script"] } all_sources.update(self.args["extra_sources"]) sources_map_items = tuple(enumerate(all_sources.items())) self.args["sources_map_k"] = " ".join([ "[{}]='{}'".format(i, kv[0].replace("'", "'\\''")) for i, kv in sources_map_items]) self.args["sources_map_v"] = " ".join([ "[{}]='{}'".format(i, kv[1].replace("'", "'\\''")) for i, kv in sources_map_items]) self.args["log_dir"] = "/tmp/{run_id}/logs".format(**self.args) self.args["gcs_log_dir"] = "gs://{bucket_name}/{run_id}/logs".format( **self.args) if self.args["subnetwork"]: self.args["subnetwork_flag"] = "--subnet={subnetwork}".format(**self.args) self.args["network_flag"] = "" elif self.args["network"]: self.args["network_flag"] = 
"--network={network}".format(**self.args) self.args["subnetwork_flag"] = "" if self.args["service_account"]: self.args[ "service_account_flag"] = "--service-account={service_account}".format( **self.args) self.args["no_external_ip_flag"] = "--no-address" if self.args[ "no_external_ip"] else "" self.args[ "accelerator_flag"] = "--accelerator={accelerator} --maintenance-policy terminate".format( **self.args) if self.args["accelerator"] else "" self.args[ "storage_location_flag"] = "--storage-location={storage_location}".format( **self.args) if self.args["storage_location"] else "" metadata_flag_template = ( "--metadata=shutdown-timer-in-sec={shutdown_timer_in_sec}," "custom-sources-path={custom_sources_path}") if self.args["metadata"]: metadata_flag_template += ",{metadata}" self.args["metadata_flag"] = metadata_flag_template.format(**self.args) def generate(self, args): self._init_args(args) return _template.format(**args) ====== Filename: ./custom_image_utils/shell_script_generator.py ====== # Copyright 2019,2020,2024 Google LLC. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the 'License'); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an 'AS IS' BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Shell script based image creation workflow generator. """ from datetime import datetime _template = """#!/usr/bin/env bash # Script for creating Dataproc custom image. 
set -euo pipefail RED='\\e[0;31m' GREEN='\\e[0;32m' NC='\\e[0m' base_obj_type="images" function execute_with_retries() ( set +x local -r cmd="$*" for ((i = 0; i < 3; i++)); do if eval "$cmd"; then return 0 ; fi sleep 12 done return 1 ) function version_le(){{ [[ "$1" = "$(echo -e "$1\n$2"|sort -V|head -n1)" ]]; }} function version_lt(){{ [[ "$1" = "$2" ]]&& return 1 || version_le "$1" "$2";}} function prepare() {{ # With the 402.0.0 release of gcloud sdk, `gcloud storage` can be # used as a more performant replacement for `gsutil` gsutil_cmd="gcloud storage" rsync_cmd="${{gsutil_cmd}} rsync" gcloud_sdk_version="$(gcloud --version | awk -F'SDK ' '/Google Cloud SDK/ {{print $2}}')" if version_lt "${{gcloud_sdk_version}}" "402.0.0" ; then gsutil_cmd="$(which gsutil) -o GSUtil:check_hashes=never" rsync_cmd="${{gsutil_cmd}} -m rsync" fi }} function exit_handler() {{ echo 'Cleaning up before exiting.' if [[ -f /tmp/{run_id}/vm_created ]]; then ( set +e echo 'Deleting VM instance.' execute_with_retries \ gcloud compute instances delete {image_name}-install --project={project_id} --zone={zone} -q ) elif [[ -f /tmp/{run_id}/disk_created ]]; then echo 'Deleting disk.' execute_with_retries gcloud compute ${{base_obj_type}} delete {image_name}-install --project={project_id} -q fi echo 'Uploading local logs to GCS bucket.' 
${{rsync_cmd}} -r {log_dir}/ {gcs_log_dir}/ if [[ -f /tmp/{run_id}/image_created ]]; then echo -e "${{GREEN}}Workflow succeeded${{NC}}, check logs at {log_dir}/ or {gcs_log_dir}/" exit 0 else echo -e "${{RED}}Workflow failed${{NC}}, check logs at {log_dir}/ or {gcs_log_dir}/" exit 1 fi }} function test_element_in_array {{ local test_element="$1" ; shift local -a test_array=("$@") for item in "${{test_array[@]}}"; do if [[ "${{item}}" == "${{test_element}}" ]]; then return 0 ; fi done return 1 }} function print_modulus_md5sum {{ local derfile="$1" openssl x509 -noout -modulus -in "${{derfile}}" | openssl md5 | awk '{{print $2}}' }} function print_img_dbs_modulus_md5sums() {{ local long_img_name="$1" local img_name="$(echo ${{long_img_name}} | sed -e 's:^.*/::')" local json_tmpfile="/tmp/{run_id}/${{img_name}}.json" gcloud compute images describe ${{long_img_name}} --format json > "${{json_tmpfile}}" local -a db_certs=() mapfile -t db_certs < <( cat ${{json_tmpfile}} | jq -r 'try .shieldedInstanceInitialState.dbs[].content' ) local -a modulus_md5sums=() for key in "${{!db_certs[@]}}" ; do local derfile="/tmp/{run_id}/${{img_name}}.${{key}}.der" echo "${{db_certs[${{key}}]}}" | \ perl -M'MIME::Base64(decode_base64url)' -ne 'chomp; print( decode_base64url($_) )' \ > "${{derfile}}" modulus_md5sums+=( $(print_modulus_md5sum "${{derfile}}") ) done echo "${{modulus_md5sums[@]}}" }} function main() {{ echo 'Uploading files to GCS bucket.' 
declare -a sources_k=({sources_map_k}) declare -a sources_v=({sources_map_v}) for i in "${{!sources_k[@]}}"; do ${{gsutil_cmd}} cp "${{sources_v[i]}}" "{custom_sources_path}/${{sources_k[i]}}" > /dev/null 2>&1 done local cert_args="" local num_src_certs="0" metadata_arg="{metadata_flag}" if [[ -n '{trusted_cert}' ]] && [[ -f '{trusted_cert}' ]]; then # build tls/ directory from variables defined near the header of # the examples/secure-boot/create-key-pair.sh file eval "$(bash examples/secure-boot/create-key-pair.sh)" metadata_arg="${{metadata_arg}},public_secret_name=${{public_secret_name}},private_secret_name=${{private_secret_name}},secret_project=${{secret_project}},secret_version=${{secret_version}}" # by default, a gcloud secret with the name of efi-db-pub-key-042 is # created in the current project to store the certificate installed # as the signature database file for this disk image # The MS UEFI CA is a reasonable base from which to build trust. We # will trust code signed by this CA as well as code signed by # trusted_cert (tls/db.der) # The Microsoft Corporation UEFI CA 2011 local -r MS_UEFI_CA="tls/MicCorUEFCA2011_2011-06-27.crt" test -f "${{MS_UEFI_CA}}" || \ curl -L -o ${{MS_UEFI_CA}} 'https://go.microsoft.com/fwlink/p/?linkid=321194' local -a cert_list=() local -a default_cert_list default_cert_list=("{trusted_cert}" "${{MS_UEFI_CA}}") local -a src_img_modulus_md5sums=() mapfile -t src_img_modulus_md5sums < <(print_img_dbs_modulus_md5sums {dataproc_base_image}) num_src_certs="${{#src_img_modulus_md5sums[@]}}" echo "debug - num_src_certs: [${{#src_img_modulus_md5sums[*]}}]" echo "value of src_img_modulus_md5sums: [${{src_img_modulus_md5sums}}]" if [[ -z "${{src_img_modulus_md5sums}}" ]]; then num_src_certs=0 echo "no db certificates in source image" cert_list=( "${{default_cert_list[@]}}" ) else echo "${{num_src_certs}} db certificates attached to source image" echo "db certs exist in source image" for cert in ${{default_cert_list[*]}}; do if 
test_element_in_array "$(print_modulus_md5sum ${{cert}})" ${{src_img_modulus_md5sums[@]}} ; then echo "cert ${{cert}} is already in source image's db list" else cert_list+=("${{cert}}") fi done # append source image's cert list local img_name="$(echo {dataproc_base_image} | sed -e 's:^.*/::')" if [[ ${{#cert_list[@]}} -ne 0 ]] && compgen -G "/tmp/{run_id}/${{img_name}}.*.der" > /dev/null ; then cert_list+=(/tmp/{run_id}/${{img_name}}.*.der) fi fi if [[ ${{#cert_list[@]}} -eq 0 ]]; then echo "all certificates already included in source image's db list" else cert_args="--signature-database-file=$(IFS=, ; echo "${{cert_list[*]}}") --guest-os-features=UEFI_COMPATIBLE" fi fi date if [[ -z "${{cert_args}}" && "${{num_src_certs}}" -ne "0" ]]; then echo 'Re-using base image' base_obj_type="reuse" instance_disk_args='--image-project={project_id} --image={dataproc_base_image} --boot-disk-size={disk_size}G --boot-disk-type=pd-ssd' elif [[ -n "${{cert_args}}" ]] ; then echo 'Creating image.' base_obj_type="images" instance_disk_args='--image-project={project_id} --image={image_name}-install --boot-disk-size={disk_size}G --boot-disk-type=pd-ssd' execute_with_retries \ gcloud compute images create {image_name}-install \ --project={project_id} \ --source-image={dataproc_base_image} \ ${{cert_args}} \ {storage_location_flag} \ --family={family} touch "/tmp/{run_id}/disk_created" else echo 'Creating disk.' base_obj_type="disks" instance_disk_args='--disk=auto-delete=yes,boot=yes,mode=rw,name={image_name}-install' execute_with_retries gcloud compute disks create {image_name}-install \ --project={project_id} \ --zone={zone} \ --image={dataproc_base_image} \ --type=pd-ssd \ --size={disk_size}GB touch "/tmp/{run_id}/disk_created" fi date echo 'Creating VM instance to run customization script.' 
execute_with_retries gcloud compute instances create {image_name}-install \ --project={project_id} \ --zone={zone} \ {network_flag} \ {subnetwork_flag} \ {no_external_ip_flag} \ --machine-type={machine_type} \ ${{instance_disk_args}} \ {accelerator_flag} \ {service_account_flag} \ --scopes=cloud-platform \ "${{metadata_arg}}" \ --metadata-from-file startup-script=startup_script/run.sh touch /tmp/{run_id}/vm_created # clean up intermediate install image if [[ "${{base_obj_type}}" == "images" ]] ; then ( set +e # This sometimes returns an API error but deletes the image despite the failure gcloud compute images delete -q {image_name}-install --project={project_id} ) fi echo "Monitor startup logs in {log_dir}/startup-script.log" echo 'Waiting for customization script to finish and VM shutdown.' set -x # too many serial port output requests per minute occur if they all occur at once sleep $(( ( RANDOM % 60 ) + 20 )) gcloud compute instances describe --format json {image_name}-install --zone {zone} | tee {log_dir}/instance.json execute_with_retries gcloud compute instances tail-serial-port-output {image_name}-install \ --project={project_id} \ --zone={zone} \ --port=1 2>&1 \ | grep 'startup-script' | grep -v '^\\[' \ | sed -e 's/ {image_name}-install.*startup-script://g' \ | dd bs=1 status=none of={log_dir}/startup-script.log \ || true echo 'Checking customization script result.' date if grep -q 'BuildSucceeded:' {log_dir}/startup-script.log; then echo -e "${{GREEN}}Customization script succeeded.${{NC}}" else echo -e "${{RED}}Customization script failed.${{NC}}" echo "See {log_dir}/startup-script.log for details" exit 1 fi date echo 'Creating custom image.' 
execute_with_retries gcloud compute images create {image_name} \ --project={project_id} \ --source-disk-zone={zone} \ --source-disk={image_name}-install \ {storage_location_flag} \ --family={family} touch /tmp/{run_id}/image_created }} prepare trap exit_handler EXIT mkdir -p {log_dir} main "$@" 2>&1 | tee {log_dir}/workflow.log """ class Generator: """Shell script based image creation workflow generator.""" def _init_args(self, args): self.args = args if "run_id" not in self.args: self.args["run_id"] = "custom-image-{image_name}-{timestamp}".format( timestamp=datetime.now().strftime("%Y%m%d-%H%M%S"), **self.args) self.args["bucket_name"] = self.args["gcs_bucket"].replace("gs://", "") self.args["custom_sources_path"] = "gs://{bucket_name}/{run_id}/sources".format(**self.args) all_sources = { "run.sh": "startup_script/run.sh", "init_actions.sh": self.args["customization_script"] } all_sources.update(self.args["extra_sources"]) sources_map_items = tuple(enumerate(all_sources.items())) self.args["sources_map_k"] = " ".join([ "[{}]='{}'".format(i, kv[0].replace("'", "'\\''")) for i, kv in sources_map_items]) self.args["sources_map_v"] = " ".join([ "[{}]='{}'".format(i, kv[1].replace("'", "'\\''")) for i, kv in sources_map_items]) self.args["log_dir"] = "/tmp/{run_id}/logs".format(**self.args) self.args["gcs_log_dir"] = "gs://{bucket_name}/{run_id}/logs".format( **self.args) if self.args["subnetwork"]: self.args["subnetwork_flag"] = "--subnet={subnetwork}".format(**self.args) self.args["network_flag"] = "" elif self.args["network"]: self.args["network_flag"] = "--network={network}".format(**self.args) self.args["subnetwork_flag"] = "" if self.args["service_account"]: self.args[ "service_account_flag"] = "--service-account={service_account}".format( **self.args) self.args["no_external_ip_flag"] = "--no-address" if self.args[ "no_external_ip"] else "" self.args[ "accelerator_flag"] = "--accelerator={accelerator} --maintenance-policy terminate".format( **self.args) if 
self.args["accelerator"] else "" self.args[ "storage_location_flag"] = "--storage-location={storage_location}".format( **self.args) if self.args["storage_location"] else "" metadata_flag_template = ( "--metadata=shutdown-timer-in-sec={shutdown_timer_in_sec}," "custom-sources-path={custom_sources_path}" ) if self.args["zone"]: region = "-".join(self.args["zone"].split("-")[:-1]) metadata_flag_template += ',dataproc-region="{}"'.format(region) if self.args["optional_components"]: optional_components = self.args["optional_components"].split(',') # convert to component names used inside image and join to set as metadata value optional_image_components = '.'.join(self._get_optional_to_image_components(optional_components)) metadata_flag_template += ',optional-components="{}"'.format(optional_image_components) if self.args["dataproc_version"]: dataproc_version = self.args["dataproc_version"] metadata_flag_template += ',dataproc_dataproc_version="{}"'.format(dataproc_version) if self.args["metadata"]: metadata_flag_template += ",{metadata}" self.args["metadata_flag"] = metadata_flag_template.format(**self.args) def _get_optional_to_image_components(self, optional_components): """Get the equivalent component names in the image for user provided optional components.""" # Add new component here, if component name inside image scripts is different. optional_to_image_component_map = { "DOCKER": "DOCKER-CE", "HIVE_WEBHCAT": "HIVE-WEBHCAT-SERVER", "SOLR": "SOLR-SERVER", } optional_image_components = [] for component in optional_components: image_component = optional_to_image_component_map.get(component, component) optional_image_components.append(image_component) return optional_image_components def generate(self, args): self._init_args(args) return _template.format(**args) ====== Filename: ./custom_image_utils/args_inferer.py ====== # Copyright 2019 Google Inc. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the 'License'); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an 'AS IS' BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Infer arguments for Dataproc custom image build. """ import logging import os import re import subprocess import tempfile _IMAGE_PATH = "projects/{}/global/images/{}" _IMAGE_URI = re.compile( r"^(https://www\.googleapis\.com/compute/([^/]+)/)?projects/([^/]+)/global/images/([^/]+)$" ) _IMAGE_FAMILY_PATH = "projects/{}/global/images/family/{}" _IMAGE_FAMILY_URI = re.compile( r"^(https://www\.googleapis\.com/compute/([^/]+)/)?projects/([^/]+)/global/images/family/([^/]+)$" ) logging.basicConfig() _LOG = logging.getLogger(__name__) _LOG.setLevel(logging.WARN) def _get_project_id(): """Get project id from gcloud config.""" gcloud_command = ["gcloud", "config", "get-value", "project"] with tempfile.NamedTemporaryFile() as temp_file: pipe = subprocess.Popen(gcloud_command, stdout=temp_file) pipe.wait() if pipe.returncode != 0: raise RuntimeError("Cannot find gcloud project ID. 
" "Please setup the project ID in gcloud SDK") # get project id temp_file.seek(0) stdout = temp_file.read() return stdout.decode('utf-8').strip() def _extract_image_name_and_project(image_uri): """Get Dataproc image name and project.""" m = _IMAGE_URI.match(image_uri) return m.group(3), m.group(4) # project, image_name def _extract_image_name_and_project_from_family_uri(image_uri): """Get Dataproc image family name and project.""" m = _IMAGE_FAMILY_URI.match(image_uri) return m.group(3), m.group(4) # project, image_name def _get_dataproc_image_version(image_uri): """Get Dataproc image version from image URI.""" project, image_name = _extract_image_name_and_project(image_uri) command = [ "gcloud", "compute", "images", "describe", image_name, "--project", project, "--format=value(labels.goog-dataproc-version)" ] # get stdout from compute images list --filters with tempfile.NamedTemporaryFile() as temp_file: pipe = subprocess.Popen(command, stdout=temp_file) pipe.wait() if pipe.returncode != 0: raise RuntimeError( "Cannot find dataproc base image, please check and verify " "the base image URI.") temp_file.seek(0) # go to start of the stdout stdout = temp_file.read() # parse the first ready image with the dataproc version attached in labels if stdout: parsed_line = stdout.decode('utf-8').strip() # should be just one value return parsed_line raise RuntimeError("Cannot find dataproc base image: %s", image_uri) def _get_dataproc_version_from_image_family(image_family_uri): """Get Dataproc image family version from family name.""" project, image_family_name = _extract_image_name_and_project_from_family_uri(image_family_uri) command = [ "gcloud", "compute", "images", "describe-from-family", image_family_name, "--project", project, "--format=value(labels.goog-dataproc-version)" ] # get stdout from compute images list --filters with tempfile.NamedTemporaryFile() as temp_file: pipe = subprocess.Popen(command, stdout=temp_file) pipe.wait() if pipe.returncode != 0: raise 
RuntimeError( "Cannot find dataproc base family image, please check and verify " "the family URI.") temp_file.seek(0) # go to start of the stdout stdout = temp_file.read() # parse the first ready image with the dataproc version attached in labels if stdout: dataproc_version = stdout.decode('utf-8').strip() # should be just one value return dataproc_version raise RuntimeError("Cannot find dataproc base image family: %s" % image_family_uri) def _extract_image_path(image_uri): """Get the partial image URI from the full image URI.""" project, image_name = _extract_image_name_and_project(image_uri) return _IMAGE_PATH.format(project, image_name) def _extract_image_family_path(image_family_uri): """Get the partial image family URI from the full image family URI.""" project, image_name = _extract_image_name_and_project_from_family_uri(image_family_uri) return _IMAGE_FAMILY_PATH.format(project, image_name) def _get_dataproc_image_path_by_version(version): """Get Dataproc base image name from version.""" # version regex already checked in arg parser parsed_version = version.split(".") major_version = parsed_version[0] if len(parsed_version) == 2: # The input version must be of format 1.5-debian10 in which case we need to # expand it to 1-5-\d+-debian10 so we can do a regexp on the minor version minor_version = parsed_version[1].split("-")[0] parsed_version[1] = parsed_version[1].replace("-", "-\d+-") filter_arg = ("labels.goog-dataproc-version ~ ^{}-{} AND NOT name ~ -eap$" " AND status = READY").format(parsed_version[0], parsed_version[1]) else: major_version = parsed_version[0] minor_version = parsed_version[1] # Moreover, push the filter of READY status and name not containing 'eap' to # gcloud command so we don't have to iterate the list filter_arg = ("labels.goog-dataproc-version = {}-{}-{} AND NOT name ~ -eap$" " AND status = READY").format(parsed_version[0], parsed_version[1], parsed_version[2]) command = [ "gcloud", "compute", "images", "list", "--project", 
"cloud-dataproc", "--filter", filter_arg, "--format", "csv[no-heading=true](name,labels.goog-dataproc-version)", "--sort-by=~creationTimestamp" ] _LOG.info("Executing command: {}".format(command)) # get stdout from compute images list --filters with tempfile.NamedTemporaryFile() as temp_file: pipe = subprocess.Popen(command, stdout=temp_file) pipe.wait() if pipe.returncode != 0: raise RuntimeError( "Cannot find dataproc base image, please check and verify " "[--dataproc-version]") temp_file.seek(0) # go to start of the stdout stdout = temp_file.read() # parse the first ready image with the dataproc version attached in labels if stdout: # in case there are multiple images parsed_lines = stdout.decode('utf-8').strip().split('\n') expected_prefix = "dataproc-{}-{}".format(major_version, minor_version) _LOG.info("Filtering images : %s", expected_prefix) image_versions=[] all_images_for_version = {} for line in parsed_lines: parsed_image = line.split(",") if len(parsed_image) == 2: parsed_image_name = parsed_image[0] if not parsed_image_name.startswith(expected_prefix): _LOG.info("Skipping non-release image %s", parsed_image_name) # Not a regular dataproc release image. Maybe a custom image with same label. continue parsed_image_version = parsed_image[1] if parsed_image_version not in all_images_for_version: all_images_for_version[parsed_image_version] = [_IMAGE_PATH.format("cloud-dataproc", parsed_image_name)] image_versions.append(parsed_image_version) else: all_images_for_version[parsed_image_version].append(_IMAGE_PATH.format("cloud-dataproc", parsed_image_name)) _LOG.info("All Images : %s", all_images_for_version) _LOG.info("All Image-Versions : %s", image_versions) latest_available_version = image_versions[0] if (len(all_images_for_version[latest_available_version]) > 1): raise RuntimeError( "Found more than one images for latest dataproc-version={}. 
Images: {}".format( latest_available_version, str(all_images_for_version[latest_available_version]))) _LOG.info("Choosing image %s with version %s", all_images_for_version[image_versions[0]][0], image_versions[0]) return all_images_for_version[image_versions[0]][0], image_versions[0] raise RuntimeError( "Cannot find dataproc base image with dataproc-version=%s." % version) def _infer_project_id(args): if not args.project_id: args.project_id = _get_project_id() def _infer_base_image(args): # get dataproc base image from dataproc version _LOG.info("Getting Dataproc base image name...") if args.base_image_uri: args.dataproc_base_image = _extract_image_path(args.base_image_uri) args.dataproc_version = _get_dataproc_image_version(args.base_image_uri) elif args.dataproc_version: args.dataproc_base_image, args.dataproc_version = _get_dataproc_image_path_by_version( args.dataproc_version) elif args.base_image_family: args.dataproc_base_image = _extract_image_family_path(args.base_image_family) args.dataproc_version = _get_dataproc_version_from_image_family(args.base_image_family) else: raise RuntimeError( "Neither --dataproc-version nor --base-image-uri nor --source-image-family-uri is specified.") _LOG.info("Returned Dataproc base image: %s", args.dataproc_base_image) _LOG.info("Returned Dataproc version : %s", args.dataproc_version) def _infer_oauth(args): if args.oauth: args.oauth = "\n \"OAuthPath\": \"{}\",".format( os.path.abspath(args.oauth)) else: args.oauth = "" def _infer_network(args): # When the user wants to create a VM in a shared VPC, # only the subnetwork argument has to be provided whereas # the network one has to be left empty. if not args.network and not args.subnetwork: args.network = 'global/networks/default' # The --network flag requires format global/networks/, # which does not work for gcloud, here we convert it to # projects//global/networks/. 
if args.network.startswith('global/networks/'): args.network = 'projects/{}/{}'.format(args.project_id, args.network) def infer_args(args): _infer_project_id(args) _infer_base_image(args) _infer_oauth(args) _infer_network(args) args.shutdown_timer_in_sec = args.shutdown_instance_timer_sec ====== Filename: ./custom_image_utils/image_labeller.py ====== # Copyright 2019 Google Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the 'License'); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an 'AS IS' BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Add label to Dataproc custom images. """ import logging import subprocess logging.basicConfig() _LOG = logging.getLogger(__name__) _LOG.setLevel(logging.WARN) def _set_custom_image_label(image_name, version, project_id): """Sets Dataproc version label in the custom image.""" # Convert `1.5.0-RC1-debian9` version to `1-5-0-rc1-debian9` label version_label = version.replace('.', '-').lower() label_flag = "--labels=goog-dataproc-version={}".format(version_label) command = [ "gcloud", "compute", "images", "add-labels", image_name, "--project", project_id, label_flag ] _LOG.info("Running: {}".format(" ".join(command))) # get stdout from compute images list --filters pipe = subprocess.Popen(command) pipe.wait() if pipe.returncode != 0: raise RuntimeError("Cannot set dataproc version to image label.") def add_label(args): """Sets Dataproc version label in the custom image.""" if not args.dry_run: _LOG.info("Setting label on custom image...") _set_custom_image_label(args.image_name, args.dataproc_version, args.project_id) 
def run(shell_script):
  """Executes the given script text with `bash` and waits for it to finish.

  The script is written (UTF-8 encoded) to a temporary file that is removed
  again whether or not the script succeeds. The child's stdout and stderr are
  wired to this process's `sys.stdout` and `sys.stderr`.

  Args:
    shell_script: The script source as a text string.

  Raises:
    RuntimeError: If bash exits with a non-zero status.
  """
  # delete=False: the file has to outlive its handle so bash can read it.
  script_file = tempfile.NamedTemporaryFile(delete=False)
  script_path = script_file.name
  try:
    # Leaving the `with` block flushes and closes the handle, not the file.
    with script_file:
      script_file.write(shell_script.encode("utf-8"))

    child = subprocess.Popen(
        ['bash', script_path],
        stdout=sys.stdout,
        stderr=sys.stderr
    )
    child.wait()
    if child.returncode != 0:
      raise RuntimeError("Error building custom image.")
  finally:
    # Best-effort cleanup; a file that is already gone is not an error.
    try:
      os.remove(script_path)
    except OSError:
      pass
def _version_regex_type(s):
  """Argparse type: accepts `s` if it is a full or minor-version string.

  Raises:
    argparse.ArgumentTypeError: If `s` matches neither `_VERSION_REGEX` nor
      `_LATEST_FROM_MINOR_VERSION`.
  """
  if not _VERSION_REGEX.match(s) and not _LATEST_FROM_MINOR_VERSION.match(s):
    raise argparse.ArgumentTypeError("Invalid version: {}.".format(s))
  return s


def _full_image_uri_regex_type(s):
  """Argparse type: accepts `s` if it matches `_FULL_IMAGE_URI`.

  Raises:
    argparse.ArgumentTypeError: If `s` does not match.
  """
  if not _FULL_IMAGE_URI.match(s):
    raise argparse.ArgumentTypeError("Invalid image URI: {}.".format(s))
  return s


def _full_image_family_uri_regex_type(s):
  """Argparse type: accepts `s` if it matches `_FULL_IMAGE_FAMILY_URI`.

  Raises:
    argparse.ArgumentTypeError: If `s` does not match.
  """
  if not _FULL_IMAGE_FAMILY_URI.match(s):
    raise argparse.ArgumentTypeError("Invalid image family URI: {}.".format(s))
  return s


def _validate_components(optional_components):
  """Argparse type: checks a comma-separated list of optional components.

  Every comma-separated item must appear verbatim in
  `_VALID_OPTIONAL_COMPONENTS`; the input string is returned unchanged.

  Raises:
    argparse.ArgumentTypeError: If any item is not a known component.
  """
  components = optional_components.split(',')
  for component in components:
    if component not in _VALID_OPTIONAL_COMPONENTS:
      raise argparse.ArgumentTypeError("Invalid optional component selected.")
  return optional_components


def parse_args(args):
  """Parses command-line arguments.

  Args:
    args: List of argument strings, without the program name.

  Returns:
    The `argparse.Namespace` produced by the parser.
  """
  parser = argparse.ArgumentParser()
  required_args = parser.add_argument_group("required named arguments")
  required_args.add_argument(
      "--image-name",
      type=str,
      required=True,
      help="""The image name for the Dataproc custom image.""")

  # At most one of the three base-image selectors may be given.
  image_args = required_args.add_mutually_exclusive_group()
  image_args.add_argument(
      "--dataproc-version",
      type=_version_regex_type,
      help=constants.version_help_text)
  image_args.add_argument(
      "--base-image-uri",
      type=_full_image_uri_regex_type,
      help="""The full image URI for the base Dataproc image. The
      customiziation script will be executed on top of this image instead of
      an out-of-the-box Dataproc image. This image must be a valid Dataproc
      image.
      """)
  image_args.add_argument(
      "--base-image-family",
      type=_full_image_family_uri_regex_type,
      help="""The source image family URI. The latest non-depracated image
      associated with the family will be used.
      """)
  required_args.add_argument(
      "--customization-script",
      type=str,
      required=True,
      help="""User's script to install custom packages.""")
  required_args.add_argument(
      "--metadata",
      type=str,
      required=False,
      help="""VM metadata which can be read by the customization script
      with `/usr/share/google/get_metadata_value attributes/` at runtime.
      The value of this flag takes the form of `key1=value1,key2=value2,...`.
      If the value includes special characters (e.g., `=`, `,` or spaces)
      which needs to be escaped, consider encoding the value, then decode it
      back in the customization script. See more information about VM
      metadata on
      https://cloud.google.com/sdk/gcloud/reference/compute/instances/create.
      """)
  required_args.add_argument(
      "--zone",
      type=str,
      required=True,
      help="""GCE zone used to build the custom image.""")
  required_args.add_argument(
      "--gcs-bucket",
      type=str,
      required=True,
      help="""GCS bucket used to store files and logs when building custom
      image.""")
  parser.add_argument(
      "--family",
      type=str,
      required=False,
      default='dataproc-custom-image',
      help="""(Optional) The family of the image.""")
  parser.add_argument(
      "--project-id",
      type=str,
      required=False,
      help="""The project Id of the project where the custom image will be
      created and saved. The default value will be set to the project id
      specified by `gcloud config get-value project`.""")
  parser.add_argument(
      "--oauth",
      type=str,
      required=False,
      help="""A local path to JSON credentials for your GCE project.
      The default oauth is the application-default credentials from gcloud.""")
  parser.add_argument(
      "--machine-type",
      type=str,
      required=False,
      default="n1-standard-1",
      help="""(Optional) Machine type used to build custom image.
      Default machine type is n1-standard-1.""")
  parser.add_argument(
      "--no-smoke-test",
      action="store_true",
      help="""(Optional) Disables smoke test to verify if the custom image
      can create a functional Dataproc cluster.""")
  parser.add_argument(
      "--network",
      type=str,
      required=False,
      default="",
      help="""(Optional) Network interface used to launch the VM instance that
      builds the custom image. Default network is 'global/networks/default'
      when no network and subnetwork arguments are provided.
      If the default network does not exist in your project, please specify
      a valid network interface.""")
  parser.add_argument(
      "--subnetwork",
      type=str,
      required=False,
      default="",
      help="""(Optional) The subnetwork that is used to launch the VM instance
      that builds the custom image. A full subnetwork URL is required.
      Default subnetwork is None. For shared VPC only provide this parameter
      and do not use the --network argument.""")
  parser.add_argument(
      "--no-external-ip",
      action="store_true",
      help="""(Optional) Disables external IP for the image build VM. The VM
      will not be able to access the internet, but if Private Google Access
      is enabled for the subnetwork, it can still access Google services
      (e.g., GCS) through internal IP of the VPC.""")
  parser.add_argument(
      "--service-account",
      type=str,
      required=False,
      default="default",
      help=
      """(Optional) The service account that is used to launch the VM instance
      that builds the custom image. If not specified, the default service
      account under the GCE project will be used. The scope of this service
      account is defaulted to /auth/cloud-platform.""")
  parser.add_argument(
      "--extra-sources",
      type=json.loads,
      required=False,
      default={},
      help=
      """(Optional) Additional files/directories uploaded along with
      customization script. This argument is evaluated to a json dictionary.
      For example:
      '--extra-sources "{\\"notes.txt\\": \\"/path/to/notes.txt\\"}"'
      """)
  # The help text below must state the same number as `default`.
  parser.add_argument(
      "--disk-size",
      type=int,
      required=False,
      default=30,
      help=
      """(Optional) The size in GB of the disk attached to the VM instance
      that builds the custom image. If not specified, the default value of
      30 GB will be used.""")
  parser.add_argument(
      "--accelerator",
      type=str,
      required=False,
      default=None,
      help=
      """(Optional) The accelerators (e.g. GPUs) attached to the VM instance
      that builds the custom image. If not specified, no accelerators are
      attached.""")
  parser.add_argument(
      "--storage-location",
      type=str,
      required=False,
      default=None,
      help=
      """(Optional) The storage location (e.g. US, us-central1) of the custom
      GCE image. If not specified, the default GCE image storage location is
      used.""")
  parser.add_argument(
      "--shutdown-instance-timer-sec",
      type=int,
      required=False,
      default=300,
      help=
      """(Optional) The time to wait in seconds before shutting down the VM
      instance. This value may need to be increased if your init script
      generates a lot of output on stdout. If not specified, the default value
      of 300 seconds will be used.""")
  parser.add_argument(
      "--dry-run",
      action="store_true",
      help="""(Optional) Only generates script without creating image.""")
  parser.add_argument(
      "--trusted-cert",
      type=str,
      required=False,
      default="tls/db.der",
      help="""(Optional) Inserts the specified DER-format certificate into
      the custom image's EFI boot sector for use with secure boot.""")
  parser.add_argument(
      "--optional-components",
      type=_validate_components,
      required=False,
      help="""Optional Components to be installed with the image.
      Can be a comma-separated list of components, e.g., TRINO,ZEPPELIN.
      (Only supported for Dataproc Images 2.3 and above)"""
  )

  return parser.parse_args(args)
def _version_regex_type(s):
  """Argparse type: accepts `s` if it is a full or minor-version string.

  Raises:
    argparse.ArgumentTypeError: If `s` matches neither `_VERSION_REGEX` nor
      `_LATEST_FROM_MINOR_VERSION`.
  """
  if not _VERSION_REGEX.match(s) and not _LATEST_FROM_MINOR_VERSION.match(s):
    raise argparse.ArgumentTypeError("Invalid version: {}.".format(s))
  return s


def _full_image_uri_regex_type(s):
  """Argparse type: accepts `s` if it matches `_FULL_IMAGE_URI`.

  Raises:
    argparse.ArgumentTypeError: If `s` does not match.
  """
  if not _FULL_IMAGE_URI.match(s):
    raise argparse.ArgumentTypeError("Invalid image URI: {}.".format(s))
  return s


def _full_image_family_uri_regex_type(s):
  """Argparse type: accepts `s` if it matches `_FULL_IMAGE_FAMILY_URI`.

  Raises:
    argparse.ArgumentTypeError: If `s` does not match.
  """
  if not _FULL_IMAGE_FAMILY_URI.match(s):
    raise argparse.ArgumentTypeError("Invalid image family URI: {}.".format(s))
  return s


def parse_args(args):
  """Parses command-line arguments.

  Args:
    args: List of argument strings, without the program name.

  Returns:
    The `argparse.Namespace` produced by the parser.
  """
  parser = argparse.ArgumentParser()
  required_args = parser.add_argument_group("required named arguments")
  required_args.add_argument(
      "--image-name",
      type=str,
      required=True,
      help="""The image name for the Dataproc custom image.""")

  # At most one of the three base-image selectors may be given.
  image_args = required_args.add_mutually_exclusive_group()
  image_args.add_argument(
      "--dataproc-version",
      type=_version_regex_type,
      help=constants.version_help_text)
  image_args.add_argument(
      "--base-image-uri",
      type=_full_image_uri_regex_type,
      help="""The full image URI for the base Dataproc image. The
      customiziation script will be executed on top of this image instead of
      an out-of-the-box Dataproc image. This image must be a valid Dataproc
      image.
      """)
  image_args.add_argument(
      "--base-image-family",
      type=_full_image_family_uri_regex_type,
      help="""The source image family URI. The latest non-depracated image
      associated with the family will be used.
      """)
  required_args.add_argument(
      "--customization-script",
      type=str,
      required=True,
      help="""User's script to install custom packages.""")
  required_args.add_argument(
      "--metadata",
      type=str,
      required=False,
      help="""VM metadata which can be read by the customization script
      with `/usr/share/google/get_metadata_value attributes/` at runtime.
      The value of this flag takes the form of `key1=value1,key2=value2,...`.
      If the value includes special characters (e.g., `=`, `,` or spaces)
      which needs to be escaped, consider encoding the value, then decode it
      back in the customization script. See more information about VM
      metadata on
      https://cloud.google.com/sdk/gcloud/reference/compute/instances/create.
      """)
  required_args.add_argument(
      "--zone",
      type=str,
      required=True,
      help="""GCE zone used to build the custom image.""")
  required_args.add_argument(
      "--gcs-bucket",
      type=str,
      required=True,
      help="""GCS bucket used to store files and logs when building custom
      image.""")
  parser.add_argument(
      "--family",
      type=str,
      required=False,
      default='dataproc-custom-image',
      help="""(Optional) The family of the image.""")
  parser.add_argument(
      "--project-id",
      type=str,
      required=False,
      help="""The project Id of the project where the custom image will be
      created and saved. The default value will be set to the project id
      specified by `gcloud config get-value project`.""")
  parser.add_argument(
      "--oauth",
      type=str,
      required=False,
      help="""A local path to JSON credentials for your GCE project.
      The default oauth is the application-default credentials from gcloud.""")
  parser.add_argument(
      "--machine-type",
      type=str,
      required=False,
      default="n1-standard-1",
      help="""(Optional) Machine type used to build custom image.
      Default machine type is n1-standard-1.""")
  parser.add_argument(
      "--no-smoke-test",
      action="store_true",
      help="""(Optional) Disables smoke test to verify if the custom image
      can create a functional Dataproc cluster.""")
  parser.add_argument(
      "--network",
      type=str,
      required=False,
      default="",
      help="""(Optional) Network interface used to launch the VM instance that
      builds the custom image. Default network is 'global/networks/default'
      when no network and subnetwork arguments are provided.
      If the default network does not exist in your project, please specify
      a valid network interface.""")
  parser.add_argument(
      "--subnetwork",
      type=str,
      required=False,
      default="",
      help="""(Optional) The subnetwork that is used to launch the VM instance
      that builds the custom image. A full subnetwork URL is required.
      Default subnetwork is None. For shared VPC only provide this parameter
      and do not use the --network argument.""")
  parser.add_argument(
      "--no-external-ip",
      action="store_true",
      help="""(Optional) Disables external IP for the image build VM. The VM
      will not be able to access the internet, but if Private Google Access
      is enabled for the subnetwork, it can still access Google services
      (e.g., GCS) through internal IP of the VPC.""")
  parser.add_argument(
      "--service-account",
      type=str,
      required=False,
      default="default",
      help=
      """(Optional) The service account that is used to launch the VM instance
      that builds the custom image. If not specified, the default service
      account under the GCE project will be used. The scope of this service
      account is defaulted to /auth/cloud-platform.""")
  parser.add_argument(
      "--extra-sources",
      type=json.loads,
      required=False,
      default={},
      help=
      """(Optional) Additional files/directories uploaded along with
      customization script. This argument is evaluated to a json dictionary.
      For example:
      '--extra-sources "{\\"notes.txt\\": \\"/path/to/notes.txt\\"}"'
      """)
  # The help text below must state the same number as `default`.
  parser.add_argument(
      "--disk-size",
      type=int,
      required=False,
      default=30,
      help=
      """(Optional) The size in GB of the disk attached to the VM instance
      that builds the custom image. If not specified, the default value of
      30 GB will be used.""")
  parser.add_argument(
      "--accelerator",
      type=str,
      required=False,
      default=None,
      help=
      """(Optional) The accelerators (e.g. GPUs) attached to the VM instance
      that builds the custom image. If not specified, no accelerators are
      attached.""")
  parser.add_argument(
      "--storage-location",
      type=str,
      required=False,
      default=None,
      help=
      """(Optional) The storage location (e.g. US, us-central1) of the custom
      GCE image. If not specified, the default GCE image storage location is
      used.""")
  parser.add_argument(
      "--shutdown-instance-timer-sec",
      type=int,
      required=False,
      default=300,
      help=
      """(Optional) The time to wait in seconds before shutting down the VM
      instance. This value may need to be increased if your init script
      generates a lot of output on stdout. If not specified, the default value
      of 300 seconds will be used.""")
  parser.add_argument(
      "--dry-run",
      action="store_true",
      help="""(Optional) Only generates script without creating image.""")
  parser.add_argument(
      "--trusted-cert",
      type=str,
      required=False,
      default="tls/db.der",
      help="""(Optional) Pass an empty string to this argument to disable
      support for shielded-secure-boot.""")

  return parser.parse_args(args)
# # Licensed under the Apache License, Version 2.0 (the 'License'); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an 'AS IS' BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Run smoke test for Dataproc custom images. """ import datetime import logging import subprocess import uuid logging.basicConfig() _LOG = logging.getLogger(__name__) _LOG.setLevel(logging.WARN) def _create_workflow_template(workflow_name, image_name, project_id, zone, region, network, subnet, no_external_ip): """Create a Dataproc workflow template for testing.""" create_command = [ "gcloud", "dataproc", "workflow-templates", "create", workflow_name, "--project", project_id, "--region", region ] set_cluster_command = [ "gcloud", "dataproc", "workflow-templates", "set-managed-cluster", workflow_name, "--project", project_id, "--image", image_name, "--zone", zone, "--region", region ] if network and not subnet: set_cluster_command.extend(["--network", network]) else: set_cluster_command.extend(["--subnet", subnet]) if no_external_ip: set_cluster_command.extend(["--no-address"]) add_job_command = [ "gcloud", "dataproc", "workflow-templates", "add-job", "spark", "--workflow-template", workflow_name, "--project", project_id, "--region", region, "--step-id", "001", "--class", "org.apache.spark.examples.SparkPi", "--jars", "file:///usr/lib/spark/examples/jars/spark-examples.jar", "--", "1000" ] pipe = subprocess.Popen(create_command) pipe.wait() if pipe.returncode != 0: raise RuntimeError("Error creating Dataproc workflow template '%s'.", workflow_name) pipe = subprocess.Popen(set_cluster_command) pipe.wait() if pipe.returncode != 0: raise 
RuntimeError( "Error setting cluster for Dataproc workflow template '%s'.", workflow_name) pipe = subprocess.Popen(add_job_command) pipe.wait() if pipe.returncode != 0: raise RuntimeError("Error adding job to Dataproc workflow template '%s'.", workflow_name) def _instantiate_workflow_template(workflow_name, project_id, region): """Run a Dataproc workflow template to test the newly built custom image.""" command = [ "gcloud", "dataproc", "workflow-templates", "instantiate", workflow_name, "--project", project_id, "--region", region ] pipe = subprocess.Popen(command) pipe.wait() if pipe.returncode != 0: raise RuntimeError("Unable to instantiate workflow template.") def _delete_workflow_template(workflow_name, project_id, region): """Delete a Dataproc workflow template.""" command = [ "gcloud", "dataproc", "workflow-templates", "delete", workflow_name, "-q", "--project", project_id, "--region", region ] pipe = subprocess.Popen(command) pipe.wait() if pipe.returncode != 0: raise RuntimeError("Error deleting workfloe template %s.", workflow_name) def _verify_custom_image(image_name, project_id, zone, network, subnetwork, no_external_ip): """Verifies if custom image works with Dataproc.""" region = zone[:-2] date = datetime.datetime.now().strftime("%Y%m%d%H%M%S") # Note: workflow_name can collide if the script runs more than 10000 # times/second. 
workflow_name = "verify-image-{}-{}".format(date, uuid.uuid4().hex[-8:]) try: _LOG.info("Creating Dataproc workflow-template %s with image %s...", workflow_name, image_name) _create_workflow_template(workflow_name, image_name, project_id, zone, region, network, subnetwork, no_external_ip) _LOG.info( "Successfully created Dataproc workflow-template %s with image %s...", workflow_name, image_name) _LOG.info("Smoke testing Dataproc workflow-template %s...") _instantiate_workflow_template(workflow_name, project_id, region) _LOG.info("Successfully smoke tested Dataproc workflow-template %s...", workflow_name) except RuntimeError as e: err_msg = "Verification of custom image {} failed: {}".format( image_name, e) _LOG.error(err_msg) raise RuntimeError(err_msg) finally: try: _LOG.info("Deleting Dataproc workflow-template %s...", workflow_name) _delete_workflow_template(workflow_name, project_id, region) _LOG.info("Successfully deleted Dataproc workflow-template %s...", workflow_name) except RuntimeError: pass def run(args): """Runs smoke test.""" if not args.dry_run: if not args.no_smoke_test: _LOG.info("Verifying the custom image...") _verify_custom_image(args.image_name, args.project_id, args.zone, args.network, args.subnetwork, args.no_external_ip) _LOG.info("Successfully verified the custom image...") else: _LOG.info("Skip running smoke test (dry run).") ====== Filename: ./custom_image_utils/shell_script_generator.py.rej ====== --- custom_image_utils/shell_script_generator.py +++ custom_image_utils/shell_script_generator.py @@ -111,11 +111,13 @@ function main() {{ local cert_args="" local num_src_certs="0" + metadata_arg="{metadata_flag}" if [[ -n '{trusted_cert}' ]] && [[ -f '{trusted_cert}' ]]; then # build tls/ directory from variables defined near the header of # the examples/secure-boot/create-key-pair.sh file eval "$(bash examples/secure-boot/create-key-pair.sh)" + 
metadata_arg="${{metadata_arg}},public_secret_name=${{public_secret_name}},private_secret_name=${{private_secret_name}},secret_project=${{secret_project}},secret_version=${{secret_version}}" # by default, a gcloud secret with the name of efi-db-pub-key-042 is # created in the current project to store the certificate installed @@ -209,7 +211,7 @@ function main() {{ {accelerator_flag} \ {service_account_flag} \ --scopes=cloud-platform \ - {metadata_flag} \ + ${{metadata_arg}} \ --metadata-from-file startup-script=startup_script/run.sh ) touch /tmp/{run_id}/vm_created ====== Filename: ./custom_image_utils/expiration_notifier.py ====== # Copyright 2019 Google Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the 'License'); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an 'AS IS' BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Notify expiration for Dataproc custom images. """ import datetime import logging import subprocess import tempfile logging.basicConfig() _LOG = logging.getLogger(__name__) _LOG.setLevel(logging.WARN) _expiration_notification_text = """\ ##################################################################### WARNING: DATAPROC CUSTOM IMAGE '{}' WILL EXPIRE ON {}. 
##################################################################### """ def _parse_date_time(timestamp_string): """Parses a timestamp string (RFC3339) to datetime format.""" return datetime.datetime.strptime(timestamp_string[:-6], "%Y-%m-%dT%H:%M:%S.%f") def _get_image_creation_timestamp(image_name, project_id): """Gets the creation timestamp of the custom image.""" # version regex already checked in arg parser command = [ "gcloud", "compute", "images", "describe", image_name, "--project", project_id, "--format=csv[no-heading=true](creationTimestamp)" ] with tempfile.NamedTemporaryFile() as temp_file: pipe = subprocess.Popen(command, stdout=temp_file) pipe.wait() if pipe.returncode != 0: raise RuntimeError("Cannot get custom image creation timestamp.") # get creation timestamp temp_file.seek(0) stdout = temp_file.read() return stdout.decode('utf-8').strip() def notify(args): """Notifies when the image will expire.""" if not args.dry_run: _LOG.info("Successfully built Dataproc custom image: %s", args.image_name) creation_date = _parse_date_time( _get_image_creation_timestamp(args.image_name, args.project_id)) expiration_date = creation_date + datetime.timedelta(days=365) _LOG.info( _expiration_notification_text.format(args.image_name, str(expiration_date))) else: _LOG.info("Dry run succeeded.") ====== Filename: ./custom_image_utils/constants.py ====== # Copyright 2017 Google Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the 'License'); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an 'AS IS' BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
"""Constant variables for building custom image.""" version_help_text = """\ The dataproc image version to be used for building the custom Dataproc image. The image version is in the format of: version_major.version_minor.version_patch Example: 1.2.13 Please refer to https://cloud.google.com/dataproc/docs/concepts/versioning/overview for more information on image versions. """ ====== Filename: ./custom_image_utils/__pycache__/shell_image_creator.cpython-311.pyc ====== § ýдgµãó¢—dZddlZddlmZddlmZej¦«eje¦«Ze  ej ¦«d„Z dS)z* Shell script based custom image creator. éN)Úshell_script_executor)Úshell_script_generatorcóF—t d¦«tj¦« t |¦«¦«}t d¦«t |¦«t d¦«t d¦«|jsJt d¦«tj|¦«t d¦«dSt d¦«dS)z3Creates a custom image with generated Shell script.zGenerating Shell script...z<############################################################z&Successfully generated Shell script...zCreating custom image...z$Successfully created custom image...z%Skip creating custom image (dry run).N) Ú_LOGÚinforÚ GeneratorÚgenerateÚvarsÚdry_runrÚrun)ÚargsÚscripts úw/usr/local/google/home/cjac/src/github/LLC-Technologies-Collier/custom-images/custom_image_utils/shell_image_creator.pyÚcreatersä€õ‡)‚)Ð (Ñ)Ô)Ð)Ý !Ô +Ñ -Ô -× 6Ò 6µt¸D±z´zÑ BÔ B€&݇)‚)ˆHÑÔÐ݇)‚)ˆFÑÔÐ݇)‚)ˆHÑÔÐ݇)‚)Ð 4Ñ5Ô5Ð5ð Œð7݇I‚IÐ(Ñ)Ô)Ð)ÝÔ˜fÑ%Ô%Ð%݇I‚IÐ4Ñ5Ô5Ð5Ð5Ð5å‡I‚IÐ5Ñ6Ô6Ð6Ð6Ð6ó) Ú__doc__ÚloggingÚcustom_image_utilsrrÚ basicConfigÚ getLoggerÚ__name__rÚsetLevelÚWARNr©rrúrsŽðððð€€€à4Ð4Ð4Ð4Ð4Ð4Ø5Ð5Ð5Ð5Ð5Ð5à€ÔÑÔÐØ€wÔ˜Ñ"Ô"€Ø‡ ‚ ˆgŒlÑÔÐð7ð7ð7ð7ð7r====== Filename: ./custom_image_utils/__pycache__/image_labeller.cpython-311.pyc ====== § ýдg=ãó˜—dZddlZddlZej¦«eje¦«Ze ej¦«d„Z d„Z dS)z%Add label to Dataproc custom images. 
éNcó˜—| dd¦« ¦«}d |¦«}dddd|d||g}t d  d  |¦«¦«¦«t j|¦«}| ¦«|j d krtd ¦«‚d S)ú0Sets Dataproc version label in the custom image.ú.ú-z!--labels=goog-dataproc-version={}ÚgcloudÚcomputeÚimagesz add-labelsz --projectz Running: {}ú rz+Cannot set dataproc version to image label.N) ÚreplaceÚlowerÚformatÚ_LOGÚinfoÚjoinÚ subprocessÚPopenÚwaitÚ returncodeÚ RuntimeError)Ú image_nameÚversionÚ project_idÚ version_labelÚ label_flagÚcommandÚpipes úr/usr/local/google/home/cjac/src/github/LLC-Technologies-Collier/custom-images/custom_image_utils/image_labeller.pyÚ_set_custom_image_labelrsÀ€ð—/’/ # sÑ+Ô+×1Ò1Ñ3Ô3€-Ø2×9Ò9¸-ÑHÔH€*à ˜8 \°:¸{Ø*ð €'õ‡)‚)ˆM× Ò  §¢¨'Ñ!2Ô!2Ñ 3Ô 3Ñ4Ô4Ð4õ Ô ˜'Ñ "Ô "€$؇)‚)+„+€+Ø „_˜ÒÐÝ ÐDÑ EÔ EÐEðÐócóô—|jsVt d¦«t|j|j|j¦«t d¦«dSt d¦«dS)rz Setting label on custom image...z)Successfully set label on custom image...z-Skip setting label on custom image (dry run).N)Údry_runrrrrÚdataproc_versionr)Úargss rÚ add_labelr$,ss€ð Œð?݇I‚IÐ0Ñ1Ô1Ð1ݘDœO¨TÔ-BØ œOñ-ô-ð-å‡I‚IÐ9Ñ:Ô:Ð:Ð:Ð:å‡I‚IÐ=Ñ>Ô>Ð>Ð>Ð>r) Ú__doc__ÚloggingrÚ basicConfigÚ getLoggerÚ__name__rÚsetLevelÚWARNrr$©rrúr-sˆðððð€€€ØÐÐÐà€ÔÑÔÐØ€wÔ˜Ñ"Ô"€Ø‡ ‚ ˆgŒlÑÔÐðFðFðFð& ?ð ?ð ?ð ?ð ?r====== Filename: ./custom_image_utils/__pycache__/constants.cpython-311.pyc ====== § Ó·pfãó—dZdZdS)z-Constant variables for building custom image.aV The dataproc image version to be used for building the custom Dataproc image. The image version is in the format of: version_major.version_minor.version_patch Example: 1.2.13 Please refer to https://cloud.google.com/dataproc/docs/concepts/versioning/overview for more information on image versions. N)Ú__doc__Úversion_help_text©óúY/usr/local/google/home/cjac/src/github/cjac/custom-images/custom_image_utils/constants.pyúrsðð4Ð3ð ÐÐÐr====== Filename: ./custom_image_utils/__pycache__/smoke_test_runner.cpython-311.pyc ====== § ýдgÌãóº—dZddlZddlZddlZddlZej¦«eje¦«Ze  ej ¦«d„Z d„Z d„Z d„Zd„ZdS)z+Run smoke test for Dataproc custom images. 
éNcóŒ—dddd|d|d|g }dddd|d|d|d |d|g } |r|s|  d |g¦«n|  d |g¦«|r|  d g¦«dddd dd|d|d|ddddddddg} tj|¦«} |  ¦«| jdkrt d|¦«‚tj| ¦«} |  ¦«| jdkrt d|¦«‚tj| ¦«} |  ¦«| jdkrt d|¦«‚dS)z0Create a Dataproc workflow template for testing.ÚgcloudÚdataprocúworkflow-templatesÚcreateú --projectú--regionzset-managed-clusterz--imagez--zonez --networkz--subnetz --no-addresszadd-jobÚsparkz--workflow-templatez --step-idÚ001z--classz!org.apache.spark.examples.SparkPiz--jarsz6file:///usr/lib/spark/examples/jars/spark-examples.jarz--Ú1000rz/Error creating Dataproc workflow template '%s'.z:Error setting cluster for Dataproc workflow template '%s'.z4Error adding job to Dataproc workflow template '%s'.N)ÚextendÚ subprocessÚPopenÚwaitÚ returncodeÚ RuntimeError) Ú workflow_nameÚ image_nameÚ project_idÚzoneÚregionÚnetworkÚsubnetÚno_external_ipÚcreate_commandÚset_cluster_commandÚadd_job_commandÚpipes úu/usr/local/google/home/cjac/src/github/LLC-Technologies-Collier/custom-images/custom_image_utils/smoke_test_runner.pyÚ_create_workflow_templater s¸€ð Ð0°(Ø[ *¨j¸&ð€.ð  Ð0ؘ]¨K¸ÀYØ(˜D *¨fðÐð  ð5Vð5Ø×Ò  ¨WÐ5Ñ6Ô6Ð6Ð6à×Ò  ¨FÐ3Ñ4Ô4Ð4Øð1Ø×Ò Ð/Ñ0Ô0Ð0à Ð0°)¸Wؘ]¨K¸ÀZÐQWØ5˜)Ð%HØÐHÈ$Ø ð €/õ Ô ˜.Ñ )Ô )€$؇)‚)+„+€+Ø „_˜ÒÐÝ ÐHØ$ñ &ô &ð&õ Ô Ð-Ñ .Ô .€$؇)‚)+„+€+Ø „_˜ÒÐÝ ØDØñ ô ðõ Ô ˜/Ñ *Ô *€$؇)‚)+„+€+Ø „_˜ÒÐÝ ÐMØ$ñ &ô &ð&ðÐóc ó —dddd|d|d|g }tj|¦«}| ¦«|jdkrt d¦«‚d S) zFRun a Dataproc workflow template to test the newly built custom image.rrrÚ instantiaterr rz(Unable to instantiate workflow template.N©rrrrr©rrrÚcommandrs rÚ_instantiate_workflow_templater'Gsc€ð Ð0°-Ø[ *¨j¸&ð €'õ Ô ˜'Ñ "Ô "€$؇)‚)+„+€+Ø „_˜ÒÐÝ ÐAÑ BÔ BÐBðÐr!c ó¤—dddd|dd|d|g }tj|¦«}| ¦«|jdkrt d |¦«‚d S) z$Delete a Dataproc workflow template.rrrÚdeletez-qrr rz$Error deleting workfloe template %s.Nr$r%s rÚ_delete_workflow_templater*Ssg€ð Ð0°(ØT˜;¨ °JÀð €'õ Ô ˜'Ñ "Ô "€$؇)‚)+„+€+Ø „_˜ÒÐÝ Ð=¸}Ñ MÔ MÐMðÐr!c ó —|dd…}tj ¦« d¦«}d |t j¦«jdd…¦«} t d||¦«t||||||||¦«t d||¦«t d¦«t|||¦«t d |¦«nQ#t$rD} d  || ¦«} t  | ¦«t| ¦«‚d} ~ wwxYw t d |¦«t|||¦«t d |¦«dS#t$rYdSwxYw# t d 
|¦«t|||¦«t d |¦«w#t$rYwwxYwxYw) z-Verifies if custom image works with Dataproc.Néþÿÿÿz %Y%m%d%H%M%Szverify-image-{}-{}iøÿÿÿz7Creating Dataproc workflow-template %s with image %s...zCSuccessfully created Dataproc workflow-template %s with image %s...z.Smoke testing Dataproc workflow-template %s...z:Successfully smoke tested Dataproc workflow-template %s...z*Verification of custom image {} failed: {}z)Deleting Dataproc workflow-template %s...z5Successfully deleted Dataproc workflow-template %s...)ÚdatetimeÚnowÚstrftimeÚformatÚuuidÚuuid4ÚhexÚ_LOGÚinfor r'rÚerrorr*) rrrrÚ subnetworkrrÚdaterÚeÚerr_msgs rÚ_verify_custom_imager;_sL€à Œ9€&Ý Ô × Ò Ñ Ô × )Ò )¨.Ñ 9Ô 9€$ð'×-Ò-¨dµD´J±L´LÔ4DÀRÀSÀSÔ4IÑJÔJ€-ð ݇I‚IÐGؘZñ)ô)ð)å˜m¨Z¸ÀTÈ6Ø% z°>ñCôCðCå‡I‚IØMØzñ#ô#ð#õ ‡I‚IÐ>Ñ?Ô?Ð?Ý" =°*¸fÑEÔEÐE݇I‚IÐJØñôððøå ð ð ð Ø:×AÒAØAñô€Gå‡J‚JˆwÑÔÐÝ wÑ Ô Ðøøøøð  øøøðð Ý ‡i‚iÐ;¸]ÑKÔKÐKÝ  ¨z¸6ÑBÔBÐBÝ ‡i‚iÐGØñôðððøå ð ð ð Ø €d€dð øøøøð  Ý ‡i‚iÐ;¸]ÑKÔKÐKÝ  ¨z¸6ÑBÔBÐBÝ ‡i‚iÐGØñôððøå ð ð ð Ø €dð øøøøøøscÁ1BDÄF3Ä EÄ?EÅEÅF3ÅAF"Æ" F0Æ/F0Æ3H Æ5AG=Ç<H Ç= H ÈH È H È H có*—|jsq|jsht d¦«t |j|j|j|j|j |j ¦«t d¦«dSdSt d¦«dS)zRuns smoke test.zVerifying the custom image...z)Successfully verified the custom image...z"Skip running smoke test (dry run).N) Údry_runÚ no_smoke_testr4r5r;rrrrr7r)Úargss rÚrunr@s™€ð Œð4Ø Ô ð=Ý ‡i‚iÐ/Ñ0Ô0Ð0ݘ4œ?¨D¬O¸T¼YØœ<¨¬¸$Ô:MñOôOðOå ‡i‚iÐ;Ñ<Ô<Ð<Ð<Ð<ð =ð=õ  ‡I‚IÐ2Ñ3Ô3Ð3Ð3Ð3r!)Ú__doc__r-Úloggingrr1Ú basicConfigÚ getLoggerÚ__name__r4ÚsetLevelÚWARNr r'r*r;r@©r!rúrIsÌðððð€€€Ø€€€ØÐÐÐØ € € € à€ÔÑÔÐØ€wÔ˜Ñ"Ô"€Ø‡ ‚ ˆgŒlÑÔÐð*&ð*&ð*&ðZ Cð Cð Cð Nð Nð Nð ð ð ðD 4ð 4ð 4ð 4ð 4r!====== Filename: ./custom_image_utils/__pycache__/expiration_notifier.cpython-311.pyc ====== § ýдg ãó²—dZddlZddlZddlZddlZej¦«eje¦«Ze  ej ¦«dZ d„Z d„Z d„ZdS)z/ Notify expiration for Dataproc custom images. éNzÒ ##################################################################### WARNING: DATAPROC CUSTOM IMAGE '{}' WILL EXPIRE ON {}. 
##################################################################### cóR—tj |dd…d¦«S)z7Parses a timestamp string (RFC3339) to datetime format.Niúÿÿÿz%Y-%m-%dT%H:%M:%S.%f)ÚdatetimeÚstrptime)Útimestamp_strings úw/usr/local/google/home/cjac/src/github/LLC-Technologies-Collier/custom-images/custom_image_utils/expiration_notifier.pyÚ_parse_date_timer%s.€õ Ô × #Ò #Ð$4°S°b°SÔ$9Ø$:ñ <ô <ð<ócó˜—dddd|d|dg}tj¦«5}tj||¬¦«}| ¦«|jdkrt d ¦«‚| d¦«| ¦«}|  d ¦«  ¦«cd d d ¦«S#1swxYwYd S) z0Gets the creation timestamp of the custom image.ÚgcloudÚcomputeÚimagesÚdescribez --projectz0--format=csv[no-heading=true](creationTimestamp))Ústdoutrz+Cannot get custom image creation timestamp.zutf-8N) ÚtempfileÚNamedTemporaryFileÚ subprocessÚPopenÚwaitÚ returncodeÚ RuntimeErrorÚseekÚreadÚdecodeÚstrip)Ú image_nameÚ project_idÚcommandÚ temp_fileÚpipers rÚ_get_image_creation_timestampr ,s €ð  ˜8 Z°¸[ØÐDð €'õ Ô"Ñ$Ô$ð *¨ Ý Ô ˜G¨IÐ 6Ñ 6Ô 6€D؇I‚IK„K€KØ „˜!ÒÐÝ ÐFÑ GÔ GÐGð‡N‚N1ÑÔÐØ ^Š^Ñ Ô €FØ =Š=˜Ñ !Ô !× 'Ò 'Ñ )Ô )ð *ð *ð *ð *ñ *ô *ð *ð *ð *ð *ð *ð *øøøð *ð *ð *ð *ð *ð *sžBB?Â?CÃCcó”—|js¦t d|j¦«t t |j|j¦«¦«}|tjd¬¦«z}t t  |jt|¦«¦«¦«dSt d¦«dS)z$Notifies when the image will expire.z,Successfully built Dataproc custom image: %sim)ÚdayszDry run succeeded.N) Údry_runÚ_LOGÚinforrr rrÚ timedeltaÚ_expiration_notification_textÚformatÚstr)ÚargsÚ creation_dateÚexpiration_dates rÚnotifyr-AsÀð Œð $݇I‚IÐ<¸d¼oÑNÔNÐNÝ$Ý% d¤o°t´ÑGÔGñIôI€Mà#¥hÔ&8¸cÐ&BÑ&BÔ&BÑB€O݇I‚IÝ%×,Ò,¨T¬_Ý-0°Ñ-AÔ-Añ Cô CñDôDðDðDðDõ ‡I‚IÐ"Ñ#Ô#Ð#Ð#Ð#r )Ú__doc__rÚloggingrrÚ basicConfigÚ getLoggerÚ__name__r$ÚsetLevelÚWARNr'rr r-©r rúr6s®ðððð€€€Ø€€€ØÐÐÐØ€€€à€ÔÑÔÐØ€wÔ˜Ñ"Ô"€Ø‡ ‚ ˆgŒlÑÔÐð!Ðð<ð<ð<ð*ð*ð*ð* $ð $ð $ð $ð $r ====== Filename: ./custom_image_utils/__pycache__/args_inferer.cpython-311.pyc ====== § ýдgn)ãó:—dZddlZddlZddlZddlZddlZdZejd¦«ZdZ ejd¦«Z ej ¦«ej e ¦«Ze ej¦«d„Zd„Zd „Zd „Zd „Zd „Zd „Zd„Zd„Zd„Zd„Zd„Zd„ZdS)z2 Infer arguments for Dataproc custom image build. 
éNzprojects/{}/global/images/{}zX^(https://www\.googleapis\.com/compute/([^/]+)/)?projects/([^/]+)/global/images/([^/]+)$z#projects/{}/global/images/family/{}z_^(https://www\.googleapis\.com/compute/([^/]+)/)?projects/([^/]+)/global/images/family/([^/]+)$cóŒ—gd¢}tj¦«5}tj||¬¦«}| ¦«|jdkrt d¦«‚| d¦«| ¦«}|  d¦«  ¦«cddd¦«S#1swxYwYdS)z"Get project id from gcloud config.)ÚgcloudÚconfigz get-valueÚproject©ÚstdoutrzHCannot find gcloud project ID. Please setup the project ID in gcloud SDKúutf-8N) ÚtempfileÚNamedTemporaryFileÚ subprocessÚPopenÚwaitÚ returncodeÚ RuntimeErrorÚseekÚreadÚdecodeÚstrip)Úgcloud_commandÚ temp_fileÚpipers úp/usr/local/google/home/cjac/src/github/LLC-Technologies-Collier/custom-images/custom_image_utils/args_inferer.pyÚ_get_project_idr%s€à?Ð?Ð?€.ÝÔ"Ñ$Ô$ð *¨ Ý Ô ˜N°9Ð =Ñ =Ô =€D؇I‚IK„K€KØ „˜!ÒÐÝ ðEñ Fô FðFð‡N‚N1ÑÔÐØ ^Š^Ñ Ô €FØ =Š=˜Ñ !Ô !× 'Ò 'Ñ )Ô )ð *ð *ð *ð *ñ *ô *ð *ð *ð *ð *ð *ð *øøøð *ð *ð *ð *ð *ð *s˜BB9Â9B=ÃB=cóŠ—t |¦«}| d¦«| d¦«fS)z$Get Dataproc image name and project.éé)Ú _IMAGE_URIÚmatchÚgroup©Ú image_uriÚms rÚ_extract_image_name_and_projectr#4s4€å×ÒyÑ!Ô!€!Ø Š‰ŒQ—W’W˜Q‘Z”ZÐ ÐócóŠ—t |¦«}| d¦«| d¦«fS)z+Get Dataproc image family name and project.rr)Ú_IMAGE_FAMILY_URIrrr s rÚ/_extract_image_name_and_project_from_family_urir':s4€å×Ò˜iÑ(Ô(€!Ø Š‰ŒQ—W’W˜Q‘Z”ZÐ Ðr$cóú—t|¦«\}}dddd|d|dg}tj¦«5}tj||¬¦«}| ¦«|jdkrtd ¦«‚| d¦«|  ¦«}|r5|  d ¦«  ¦«}|cd d d ¦«S d d d ¦«n #1swxYwYtd |¦«‚) z*Get Dataproc image version from image URI.rÚcomputeÚimagesÚdescribeú --projectú,--format=value(labels.goog-dataproc-version)rrzLCannot find dataproc base image, please check and verify the base image URI.r Nz#Cannot find dataproc base image: %s) r#r r r r rrrrrrr)r!rÚ image_nameÚcommandrrrÚ parsed_lines rÚ_get_dataproc_image_versionr1@sb€å7¸ ÑBÔBÑ€'ˆ:à ˜8 Z°¸[Ø Ð=ð €'õ Ô"Ñ$Ô$ð ¨ Ý Ô ˜G¨IÐ 6Ñ 6Ô 6€D؇I‚IK„K€KØ „˜!ÒÐÝ ð ñ !ô !ð!ð‡N‚N1ÑÔÐØ ^Š^Ñ Ô €Fà ðØ—M’M 'Ñ*Ô*×0Ò0Ñ2Ô2€kØ ð ð ð ð ñ ô ð ð ðð ð ð ñ ô ð ð ð ð ð ð øøøð ð ð ð õ 
Ð:¸IÑFÔFÐFó°BC"Ã"C&Ã)C&cóþ—t|¦«\}}dddd|d|dg}tj¦«5}tj||¬¦«}| ¦«|jdkrtd ¦«‚| d¦«|  ¦«}|r5|  d ¦«  ¦«}|cd d d ¦«S d d d ¦«n #1swxYwYtd |z¦«‚) z3Get Dataproc image family version from family name.rr)r*zdescribe-from-familyr,r-rrzOCannot find dataproc base family image, please check and verify the family URI.r Nz*Cannot find dataproc base image family: %s) r'r r r r rrrrrrr)Úimage_family_urirÚimage_family_namer/rrrÚdataproc_versions rÚ'_get_dataproc_version_from_image_familyr7[st€åNÐO_Ñ`Ô`Ñ€'Ð à ˜8Ð%;Ð=NÐP[Ø Ð=ð €'õ Ô"Ñ$Ô$ð ¨ Ý Ô ˜G¨IÐ 6Ñ 6Ô 6€D؇I‚IK„K€KØ „˜!ÒÐÝ ð ñ ô ðð‡N‚N1ÑÔÐØ ^Š^Ñ Ô €Fà ðØŸš wÑ/Ô/×5Ò5Ñ7Ô7ÐØ ð ð ð ð ñ ô ð ð ðð ð ð ñ ô ð ð ð ð ð ð øøøð ð ð ð õ ÐAØ%ñ&ñ 'ô 'ð'r2có\—t|¦«\}}t ||¦«S)z2Get the partial image URI from the full image URI.)r#Ú _IMAGE_PATHÚformat)r!rr.s rÚ_extract_image_pathr;vs*€å7¸ ÑBÔBÑ€'ˆ:Ý × Ò ˜G ZÑ 0Ô 0Ð0r$có\—t|¦«\}}t ||¦«S)z@Get the partial image family URI from the full image family URI.)r'Ú_IMAGE_FAMILY_PATHr:)r4rr.s rÚ_extract_image_family_pathr>{s+€åGÐHXÑYÔYÑ€'ˆ:Ý × "Ò " 7¨JÑ 7Ô 7Ð7r$c óЗ| d¦«}|d}t|¦«dkrc|d d¦«d}|d dd¦«|d<d |d|d¦«}n9|d}|d}d |d|d|d¦«}d d d d d dd|dddg }t d |¦«¦«t j¦«5}tj ||¬¦«}|  ¦«|j dkrtd¦«‚|  d¦«| ¦«}|r3| d¦« ¦« d¦«} d ||¦«} t d| ¦«g} i} | D]Ø} |  d¦«}t|¦«dkr®|d}| | ¦«st d|¦«Œc|d}|| vr5t$ d|¦«g| |<|  |¦«Œ¤| | t$ d|¦«¦«ŒÙt d| ¦«t d| ¦«| d}t| |¦«dkr6td |t)| |¦«¦«¦«‚t d| | dd| d¦«| | dd| dfcd d d ¦«S d d d ¦«n #1swxYwYtd!|z¦«‚)"z*Get Dataproc base image name from version.ú.rééú-z-\d+-zMlabels.goog-dataproc-version ~ ^{}-{} AND NOT name ~ -eap$ AND status = READYzOlabels.goog-dataproc-version = {}-{}-{} AND NOT name ~ -eap$ AND status = READYrr)r*Úlistr,zcloud-dataprocz--filterz--formatz7csv[no-heading=true](name,labels.goog-dataproc-version)z--sort-by=~creationTimestampzExecuting command: {}rzMCannot find dataproc base image, please check and verify [--dataproc-version]r ú zdataproc-{}-{}zFiltering images : %sú,zSkipping non-release image %szAll Images : %szAll Image-Versions : %szEFound more than one images for 
latest dataproc-version={}. Images: {}z!Choosing image %s with version %sNz9Cannot find dataproc base image with dataproc-version=%s.)ÚsplitÚlenÚreplacer:Ú_LOGÚinfor r r r rrrrrrrÚ startswithr9ÚappendÚstr)ÚversionÚparsed_versionÚ major_versionÚ minor_versionÚ filter_argr/rrrÚ parsed_linesÚexpected_prefixÚimage_versionsÚall_images_for_versionÚlineÚ parsed_imageÚparsed_image_nameÚparsed_image_versionÚlatest_available_versions rÚ#_get_dataproc_image_path_by_versionr]€sŠ€ð—=’= Ñ%Ô%€.Ø  Ô#€-݈ÑÔ˜AÒÐð# 1Ô%×+Ò+¨CÑ0Ô0°Ô3€MØ& qÔ)×1Ò1°#°wÑ?Ô?€N1Ñð(ß)/ª°¸qÔ0AØ0>¸qÔ0Añ*Cô*Cð€Jð# 1Ô%€MØ" 1Ô%€Mð(ß)/ª°¸qÔ0AØ0>¸qÔ0AØ0>¸qÔ0Añ*Cô*Cðð  ˆi˜ 6¨;Ð8HØ ˜JØ=Ø"ð  €'õ‡)‚)Ð #× *Ò *¨7Ñ 3Ô 3Ñ4Ô4Ð4åÔ"Ñ$Ô$ð,M¨ Ý Ô ˜G¨IÐ 6Ñ 6Ô 6€D؇I‚IK„K€KØ „˜!ÒÐÝ ð ñ ô ð ð‡N‚N1ÑÔÐØ ^Š^Ñ Ô €Fà ñ!Mà—]’] 7Ñ+Ô+×1Ò1Ñ3Ô3×9Ò9¸$Ñ?Ô?€lØ(×/Ò/° ¸}ÑMÔM€oÝ ‡i‚iÐ'¨Ñ9Ô9Ð9Ø€nØ!ÐØð yð yˆ$Ø—z’z #‘”ˆ Ý ˆ|Ñ Ô  Ò !Ð !Ø*¨1œoÐ Ø"×-Ò-¨oÑ>Ô>ð Ý IŠIÐ5Ð7HÑ IÔ IÐ Ià Ø!-¨a¤Ð Ø !Ð)?Ð ?Ð ?ÝÑ:Ô:Ð:à!/°Ô!2ÐÝ Ð$Ð%=Ô>Ñ ?Ô ?À!Ò CÐ CÝØ Q× XÒ XØ $Ý Ð&Ð'?Ô@Ñ AÔ Añ Cô CñDôDð Dõ  ‡i‚iÐ3Ð5KÈNÐ[\ÔL]Ô5^Ð_`Ô5aÐcqÐrsÔctÑuÔuÐuØ # N°1Ô$5Ô 6°qÔ 9¸>È!Ô;LÐ LðY,Mð,Mð,Mð,Mñ,Mô,Mð,Mð,Mð!Mð,Mð,Mð,Mñ,Mô,Mð,Mð,Mð,Mð,Mð,Mð,Møøøð,Mð,Mð,Mð,Mõ\ Ø?À'ÑIñ Kô KðKsÄJO Ï OÏOcó>—|jst¦«|_dSdS©N)Ú project_idr©Úargss rÚ_infer_project_idrcÑs'€Ø Œð(Ý%Ñ'Ô'€D„O€O€Oð(ð(r$có—t d¦«|jr3t|j¦«|_t |j¦«|_nr|jr"t|j¦«\|_|_nI|jr3t|j¦«|_t|j¦«|_ntd¦«‚t d|j¦«t d|j¦«dS)Nz#Getting Dataproc base image name...z[Neither --dataproc-version nor --base-image-uri nor --source-image-family-uri is specified.z Returned Dataproc base image: %sz Returned Dataproc version : %s) rJrKÚbase_image_urir;Údataproc_base_imager1r6r]Úbase_image_familyr>r7rras rÚ_infer_base_imagerhÖsü€å‡)‚)Ð 1Ñ2Ô2Ð2Ø Ôð gÝ2°4Ô3FÑGÔG€DÔÝ7¸Ô8KÑLÔL€DÔÐØ ÔðgÝ6YØ Ôñ7ô7Ñ3€DÔ˜dÔ3Ð3à ÔðgÝ9¸$Ô:PÑQÔQ€DÔÝCÀDÔDZÑ[Ô[€DÔÐå Øeñ gô gðgå‡)‚)Ð .°Ô0HÑIÔIÐI݇)‚)Ð .°Ô0EÑFÔFÐFÐFÐFr$cóž—|jr>d tj |j¦«¦«|_dSd|_dS)Nz "OAuthPath": "{}",Ú)Úoauthr:ÚosÚpathÚabspathras rÚ _infer_oauthroésG€Ø „ZðØ/×6Ò6Ý ŒŠ˜œ Ñ#Ô#ñ%ô%€D„J€J€Jð€D„J€J€Jr$có²—|js|jsd|_|j d¦«r'd 
|j|j¦«|_dSdS)Nzglobal/networks/defaultzglobal/networks/zprojects/{}/{})ÚnetworkÚ subnetworkrLr:r`ras rÚ_infer_networkrsñse€ð Œð-˜dœoð-Ø,€D„Lð „\×ÒÐ/Ñ0Ô0ðJØ#×*Ò*¨4¬?¸D¼LÑIÔI€D„L€L€LðJðJr$có–—t|¦«t|¦«t|¦«t|¦«|j|_dSr_)rcrhrorsÚshutdown_instance_timer_secÚshutdown_timer_in_secras rÚ infer_argsrwþsK€ÝDÑÔÐÝDÑÔÐ݈tÑÔÐÝÑÔÐØ#Ô?€$ÔÐÐr$)Ú__doc__ÚloggingrlÚrer r r9Úcompilerr=r&Ú basicConfigÚ getLoggerÚ__name__rJÚsetLevelÚWARNrr#r'r1r7r;r>r]rcrhrorsrw©r$rúr‚s–ðððð€€€Ø € € € Ø € € € ØÐÐÐØ€€€à,€ Ø ˆRŒZØ_ñô€ ð;ÐØB”JØfñôÐð€ÔÑÔÐØ€wÔ˜Ñ"Ô"€Ø‡ ‚ ˆgŒlÑÔÐð *ð *ð *ð ð ð ð  ð ð ð GðGðGð6'ð'ð'ð61ð1ð1ð 8ð8ð8ð NKðNKðNKðb(ð(ð(ð GðGðGð&ððð Jð Jð Jð@ð@ð@ð@ð@r$====== Filename: ./custom_image_utils/__pycache__/shell_script_generator.cpython-311.pyc ====== § ãµgX3ãó4—dZddlmZdZGd„d¦«ZdS)z7 Shell script based image creation workflow generator. é)Údatetimea%#!/usr/bin/env bash # Script for creating Dataproc custom image. set -euo pipefail RED='\e[0;31m' GREEN='\e[0;32m' NC='\e[0m' base_obj_type="images" function execute_with_retries() ( set +x local -r cmd="$*" for ((i = 0; i < 3; i++)); do time eval "$cmd" > "/tmp/{run_id}/install.log" 2>&1 && retval=$? || {{ retval=$? 
; cat "/tmp/{run_id}/install.log" ; }} if [[ $retval == 0 ]] ; then return 0 ; fi sleep 5 done return 1 ) function gsutil() {{ ${{gsutil_cmd}} "$*" ; }} function version_ge() ( set +x ; [ "$1" = "$(echo -e "$1 $2" | sort -V | tail -n1)" ] ; ) function version_gt() ( set +x ; [ "$1" = "$2" ] && return 1 || version_ge $1 $2 ; ) function version_le() ( set +x ; [ "$1" = "$(echo -e "$1 $2" | sort -V | head -n1)" ] ; ) function version_lt() ( set +x ; [ "$1" = "$2" ] && return 1 || version_le $1 $2 ; ) function prepare() {{ # With the 402.0.0 release of gcloud sdk, `gcloud storage` can be # used as a more performant replacement for `gsutil` gsutil_cmd="gcloud storage" gcloud_sdk_version="$(gcloud --version | awk -F'SDK ' '/Google Cloud SDK/ {{print $2}}')" if version_lt "${{gcloud_sdk_version}}" "402.0.0" ; then gsutil_cmd="$(which gsutil) -o GSUtil:check_hashes=never" fi }} function exit_handler() {{ echo 'Cleaning up before exiting.' if [[ -f /tmp/{run_id}/vm_created ]]; then ( set +e echo 'Deleting VM instance.' execute_with_retries gcloud compute instances delete {image_name}-install --project={project_id} --zone={zone} -q ) elif [[ -f /tmp/{run_id}/disk_created ]]; then echo 'Deleting disk.' execute_with_retries gcloud compute ${{base_obj_type}} delete {image_name}-install --project={project_id} --zone={zone} -q fi echo 'Uploading local logs to GCS bucket.' 
gsutil -m rsync -r {log_dir}/ {gcs_log_dir}/ if [[ -f /tmp/{run_id}/image_created ]]; then echo -e "${{GREEN}}Workflow succeeded${{NC}}, check logs at {log_dir}/ or {gcs_log_dir}/" exit 0 else echo -e "${{RED}}Workflow failed${{NC}}, check logs at {log_dir}/ or {gcs_log_dir}/" exit 1 fi }} function test_element_in_array {{ local test_element="$1" ; shift local -a test_array=("$@") for item in "${{test_array[@]}}"; do if [[ "${{item}}" == "${{test_element}}" ]]; then return 0 ; fi done return 1 }} function print_modulus_md5sum {{ local derfile="$1" openssl x509 -noout -modulus -in "${{derfile}}" | openssl md5 | awk '{{print $2}}' }} function print_img_dbs_modulus_md5sums() {{ local long_img_name="$1" local img_name="$(echo ${{long_img_name}} | sed -e 's:^.*/::')" local json_tmpfile="/tmp/{run_id}/${{img_name}}.json" gcloud compute images describe ${{long_img_name}} --format json > "${{json_tmpfile}}" local -a db_certs=() mapfile -t db_certs < <( cat ${{json_tmpfile}} | jq -r 'try .shieldedInstanceInitialState.dbs[].content' ) local -a modulus_md5sums=() for key in "${{!db_certs[@]}}" ; do local derfile="/tmp/{run_id}/${{img_name}}.${{key}}.der" echo "${{db_certs[${{key}}]}}" | perl -M'MIME::Base64(decode_base64url)' -ne 'chomp; print( decode_base64url($_) )' > "${{derfile}}" modulus_md5sums+=( $(print_modulus_md5sum "${{derfile}}") ) done echo "${{modulus_md5sums[@]}}" }} function main() {{ echo 'Uploading files to GCS bucket.' 
declare -a sources_k=({sources_map_k}) declare -a sources_v=({sources_map_v}) for i in "${{!sources_k[@]}}"; do gsutil cp "${{sources_v[i]}}" "{custom_sources_path}/${{sources_k[i]}}" > /dev/null 2>&1 done local cert_args="" local num_src_certs="0" metadata_arg="{metadata_flag}" if [[ -n '{trusted_cert}' ]] && [[ -f '{trusted_cert}' ]]; then # build tls/ directory from variables defined near the header of # the examples/secure-boot/create-key-pair.sh file eval "$(bash examples/secure-boot/create-key-pair.sh)" metadata_arg="${{metadata_arg}},public_secret_name=${{public_secret_name}},private_secret_name=${{private_secret_name}},secret_project=${{secret_project}},secret_version=${{secret_version}}" # by default, a gcloud secret with the name of efi-db-pub-key-042 is # created in the current project to store the certificate installed # as the signature database file for this disk image # The MS UEFI CA is a reasonable base from which to build trust. We # will trust code signed by this CA as well as code signed by # trusted_cert (tls/db.der) # The Microsoft Corporation UEFI CA 2011 local -r MS_UEFI_CA="tls/MicCorUEFCA2011_2011-06-27.crt" test -f "${{MS_UEFI_CA}}" || curl -L -o ${{MS_UEFI_CA}} 'https://go.microsoft.com/fwlink/p/?linkid=321194' local -a cert_list=() local -a default_cert_list default_cert_list=("{trusted_cert}" "${{MS_UEFI_CA}}") local -a src_img_modulus_md5sums=() mapfile -t src_img_modulus_md5sums < <(print_img_dbs_modulus_md5sums {dataproc_base_image}) num_src_certs="${{#src_img_modulus_md5sums[@]}}" echo "debug - num_src_certs: [${{#src_img_modulus_md5sums[*]}}]" echo "value of src_img_modulus_md5sums: [${{src_img_modulus_md5sums}}]" if [[ -z "${{src_img_modulus_md5sums}}" ]]; then num_src_certs=0 echo "no db certificates in source image" cert_list=( "${{default_cert_list[@]}}" ) else echo "${{num_src_certs}} db certificates attached to source image" echo "db certs exist in source image" for cert in ${{default_cert_list[*]}}; do if 
test_element_in_array "$(print_modulus_md5sum ${{cert}})" ${{src_img_modulus_md5sums[@]}} ; then echo "cert ${{cert}} is already in source image's db list" else cert_list+=("${{cert}}") fi done # append source image's cert list local img_name="$(echo {dataproc_base_image} | sed -e 's:^.*/::')" if [[ ${{#cert_list[@]}} -ne 0 ]] && compgen -G "/tmp/{run_id}/${{img_name}}.*.der" > /dev/null ; then cert_list+=(/tmp/{run_id}/${{img_name}}.*.der) fi fi if [[ ${{#cert_list[@]}} -eq 0 ]]; then echo "all certificates already included in source image's db list" else cert_args="--signature-database-file=$(IFS=, ; echo "${{cert_list[*]}}") --guest-os-features=UEFI_COMPATIBLE" fi fi date if [[ -z "${{cert_args}}" && "${{num_src_certs}}" -ne "0" ]]; then echo 'Re-using base image' base_obj_type="reuse" instance_disk_args='--image-project={project_id} --image={dataproc_base_image} --boot-disk-size={disk_size}G --boot-disk-type=pd-ssd' elif [[ -n "${{cert_args}}" ]] ; then echo 'Creating image.' base_obj_type="images" instance_disk_args='--image-project={project_id} --image={image_name}-install --boot-disk-size={disk_size}G --boot-disk-type=pd-ssd' execute_with_retries gcloud compute images create {image_name}-install --project={project_id} --source-image={dataproc_base_image} ${{cert_args}} {storage_location_flag} --family={family} touch "/tmp/{run_id}/disk_created" else echo 'Creating disk.' base_obj_type="disks" instance_disk_args='--disk=auto-delete=yes,boot=yes,mode=rw,name={image_name}-install' execute_with_retries gcloud compute disks create {image_name}-install --project={project_id} --zone={zone} --image={dataproc_base_image} --type=pd-ssd --size={disk_size}GB touch "/tmp/{run_id}/disk_created" fi date echo 'Creating VM instance to run customization script.' 
execute_with_retries gcloud compute instances create {image_name}-install --project={project_id} --zone={zone} {network_flag} {subnetwork_flag} {no_external_ip_flag} --machine-type={machine_type} ${{instance_disk_args}} {accelerator_flag} {service_account_flag} --scopes=cloud-platform "${{metadata_arg}}" --metadata-from-file startup-script=startup_script/run.sh touch /tmp/{run_id}/vm_created # clean up intermediate install image if [[ "${{base_obj_type}}" == "images" ]] ; then ( set +e # This sometimes returns an API error but deletes the image despite the failure gcloud compute images delete -q {image_name}-install --project={project_id} ) fi echo 'Waiting for customization script to finish and VM shutdown.' execute_with_retries gcloud compute instances tail-serial-port-output {image_name}-install --project={project_id} --zone={zone} --port=1 2>&1 | grep 'startup-script' | sed -e 's/ {image_name}-install.*startup-script://g' | dd status=none bs=1 of={log_dir}/startup-script.log || true echo 'Checking customization script result.' date if grep -q 'BuildFailed:' {log_dir}/startup-script.log; then echo -e "${{RED}}Customization script failed.${{NC}}" echo "See {log_dir}/startup-script.log for details" exit 1 elif grep -q 'BuildSucceeded:' {log_dir}/startup-script.log; then echo -e "${{GREEN}}Customization script succeeded.${{NC}}" else echo 'Unable to determine the customization script result.' exit 1 fi date echo 'Creating custom image.' 
execute_with_retries gcloud compute images create {image_name} --project={project_id} --source-disk-zone={zone} --source-disk={image_name}-install {storage_location_flag} --family={family} touch /tmp/{run_id}/image_created }} prepare trap exit_handler EXIT mkdir -p {log_dir} main "$@" 2>&1 | tee {log_dir}/workflow.log có—eZdZdZd„Zd„ZdS)Ú Generatorz5Shell script based image creation workflow generator.có"—||_d|jvr@djd4dtj¦« d¦«i|j¤Ž|jd<|jd dd¦«|jd<d jd4i|j¤Ž|jd <d |jd d œ}| |jd¦«tt|  ¦«¦«¦«}d  d„|D¦«¦«|jd<d  d„|D¦«¦«|jd<djd4i|j¤Ž|jd<djd4i|j¤Ž|jd<|jdr%djd4i|j¤Ž|jd<d|jd<n1|jdr$djd4i|j¤Ž|jd<d|jd<|jdrdjd4i|j¤Ž|jd <|jd!rd"nd|jd#<|jd$rd%jd4i|j¤Žnd|jd&<|jd'rd(jd4i|j¤Žnd|jd)<d*}|jd+r9|jd+ d,d-¦«}|d. |¦«z }|jd/r%|jd/}|d0 |¦«z }|jd1r|d2z }|jd4i|j¤Ž|jd3<dS)5NÚrun_idz%custom-image-{image_name}-{timestamp}Ú timestampz %Y%m%d-%H%M%SÚ gcs_bucketzgs://ÚÚ bucket_namez#gs://{bucket_name}/{run_id}/sourcesÚcustom_sources_pathzstartup_script/run.shÚcustomization_script)zrun.shzinit_actions.shÚ extra_sourcesú c ót—g|]5\}}d ||d dd¦«¦«‘Œ6S)ú [{}]='{}'rú'ú'\''©ÚformatÚreplace©Ú.0ÚiÚkvs úz/usr/local/google/home/cjac/src/github/LLC-Technologies-Collier/custom-images/custom_image_utils/shell_script_generator.pyú z(Generator._init_args..2óQ€ð+[ð+[ð+[Ù?D¸qÀ"ˆ ×Ò˜1˜b œeŸmšm¨C°Ñ9Ô9Ñ:Ô:ð+[ð+[ð+[óÚ sources_map_kc ót—g|]5\}}d ||d dd¦«¦«‘Œ6S)rérrrrs rrz(Generator._init_args..4rrÚ sources_map_vz/tmp/{run_id}/logsÚlog_dirz gs://{bucket_name}/{run_id}/logsÚ gcs_log_dirÚ subnetworkz--subnet={subnetwork}Úsubnetwork_flagÚ network_flagÚnetworkz--network={network}Úservice_accountz#--service-account={service_account}Úservice_account_flagÚno_external_ipz --no-addressÚno_external_ip_flagÚ acceleratorz:--accelerator={accelerator} --maintenance-policy 
terminateÚaccelerator_flagÚstorage_locationz%--storage-location={storage_location}Ústorage_location_flagzb--metadata=shutdown-timer-in-sec={shutdown_timer_in_sec},custom-sources-path={custom_sources_path}Úoptional_componentsú,ú.z,optional-components="{}"Údataproc_versionz,dataproc-version="{}"Úmetadataz ,{metadata}Ú metadata_flag©) ÚargsrrÚnowÚstrftimerÚupdateÚtupleÚ enumerateÚitemsÚjoin)Úselfr8Ú all_sourcesÚsources_map_itemsÚmetadata_flag_templater1r4s rÚ _init_argszGenerator._init_args#s€Ø€D„IØt”yÐ Ð ØJÐCÔJðKðKÝ”L‘N”N×+Ò+¨OÑ<Ô<ðKØ@DÄ ðKðK€d„iÑà#œy¨Ô6×>Ò>¸wÈÑKÔK€D„IˆmÑØ'SÐ'LÔ'SÐ'`Ð'`ÐVZÔV_Ð'`Ð'`€D„IÐ#Ñ$ð*Øœ9Ð%;Ô<ðð€Kð×Òt”y Ô1Ñ2Ô2Ð2åi¨ ×(9Ò(9Ñ(;Ô(;Ñ<Ô<Ñ=Ô=ÐØ!$§¢ð+[ð+[ØHYð+[ñ+[ô+[ñ"\ô"\€D„IˆoÑà!$§¢ð+[ð+[ØHYð+[ñ+[ô+[ñ"\ô"\€D„IˆoÑð7Ð/Ô6ÐCÐC¸¼ÐCÐC€D„IˆiÑØHÐAÔHð ð Ø Œ ð ð €D„IˆmÑà „yÔð(Ø%CÐ%<Ô%CÐ%PÐ%PÀdÄiÐ%PÐ%P€d„iÐ!Ñ"Ø"$€d„iÑÐØ Œ9Ô ð(Ø">Ð"7Ô">Ð"KÐ"KÀÄÐ"KÐ"K€d„iÑØ%'€d„iÐ!Ñ"Ø „yÐ"Ô#ðà"NÐ"GÔ"Nð#ð#Ø Œ)ð#ð#ð „iØñ ð:>¼Øô:ð(  ~ ~Øð „IÐ#Ñ$𜠠-Ô0ð9Ð_ÐXÔ_ððØ Œ)ðððØ68ð „IØñð œ Ð"4Ô5ð">Ð!OÐ!HÔ!Oð"ð"Ø Œ)ð"ð"ð"Ø;=ð „IØñð 4ðð „yÐ&Ô'ðXØ œIÐ&;Ô<×DÒDÀSÈ#ÑNÔNÐØÐ ;× BÒ BÐCVÑ WÔ WÑWÐØ „yÐ#Ô$ðRØœÐ#5Ô6ÐØÐ 8× ?Ò ?Ð@PÑ QÔ QÑQÐØ „yÔð.Ø  Ñ-ÐØ!>Ð!7Ô!>Ð!KÐ!KÀÄÐ!KÐ!K€D„IˆoÑÐÐrcóN—| |¦«tjdi|¤ŽS)Nr7)rDÚ _templater)r@r8s rÚgeneratezGenerator.generateZs+€Ø‡O‚ODÑÔÐÝ Ô Ð #Ð #˜dÐ #Ð #Ð#rN)Ú__name__Ú __module__Ú __qualname__Ú__doc__rDrGr7rrrr s<€€€€€Ø=Ð=ð5Lð5Lð5Lðn$ð$ð$ð$ð$rrN)rKrrFrr7rrúrLs^ððððÐÐÐÐÐðI € ðV<$ð<$ð<$ð<$ð<$ñ<$ô<$ð<$ð<$ð<$r====== Filename: ./custom_image_utils/__pycache__/args_parser.cpython-311.pyc ====== § #à´g &ãóÔ—dZddlZddlZddlZddlmZejd¦«Zejd¦«Zejd¦«Z ejd¦«Z gd¢Z d „Z d „Z d „Zd „Zd „ZdS)zw This is a utility module which defines and parses the command-line arguments for the generate_custom_image.py script. 
éN)Ú constantsz%^\d+\.\d+\.\d+(-RC\d+)?(-[a-z]+\d+)?$zX^(https://www\.googleapis\.com/compute/([^/]+)/)?projects/([^/]+)/global/images/([^/]+)$z_^(https://www\.googleapis\.com/compute/([^/]+)/)?projects/([^/]+)/global/images/family/([^/]+)$z+^(\d+)\.(\d+)-((?:debian|ubuntu|rocky)\d+)$) Ú HIVE_WEBHCATÚZEPPELINÚTRINOÚRANGERÚSOLRÚFLINKÚDOCKERÚHUDIÚICEBERGÚPIGcó¼—t |¦«sAt |¦«s'tjd |¦«¦«‚|S)z&Check if version string matches regex.zInvalid version: {}.)Ú_VERSION_REGEXÚmatchÚ_LATEST_FROM_MINOR_VERSIONÚargparseÚArgumentTypeErrorÚformat©Úss úo/usr/local/google/home/cjac/src/github/LLC-Technologies-Collier/custom-images/custom_image_utils/args_parser.pyÚ_version_regex_typer#sU€å × Ò ˜aÑ Ô ðGÕ)C×)IÒ)IÈ!Ñ)LÔ)LðGÝ Ô $Ð%;×%BÒ%BÀ1Ñ%EÔ%EÑ FÔ FÐFØ €(ócóˆ—t |¦«s'tjd |¦«¦«‚|S)z4Check if the partial image uri string matches regex.zInvalid image URI: {}.)Ú_FULL_IMAGE_URIrrrrrs rÚ_full_image_uri_regex_typer)s>€å × Ò ˜qÑ !Ô !ðIÝ Ô $Ð%=×%DÒ%DÀQÑ%GÔ%GÑ HÔ HÐHØ €(rcóˆ—t |¦«s'tjd |¦«¦«‚|S)z;Check if the partial image family uri string matches regex.zInvalid image family URI: {}.)Ú_FULL_IMAGE_FAMILY_URIrrrrrs rÚ!_full_image_family_uri_regex_typer/s>€å × %Ò % aÑ (Ô (ðPÝ Ô $Ð%D×%KÒ%KÈAÑ%NÔ%NÑ OÔ OÐOØ €(rcót—| d¦«}|D]}|tvrtjd¦«‚Œ |S)Nú,z$Invalid optional component selected.)ÚsplitÚ_VALID_OPTIONAL_COMPONENTSrr)Úoptional_componentsÚ componentsÚ components rÚ_validate_componentsr'5sO€Ø$×*Ò*¨3Ñ/Ô/€JØðUðUˆ Ø Õ6Ð 6Ð 6ÝÔ,Ð-SÑTÔTÐ Tð 7à ÐrcóŠ—tj¦«}| d¦«}| dtdd¬¦«| ¦«}| dt tj¬¦«| dtd ¬¦«| d td ¬¦«| d tdd ¬¦«| dtdd¬¦«| dtdd¬¦«| dtdd¬¦«| dtddd¬¦«| dtdd¬¦«| dtdd¬¦«| dtddd¬¦«| d d!d"¬#¦«| d$tdd%d&¬¦«| d'tdd%d(¬¦«| d)d!d*¬#¦«| d+tdd,d-¬¦«| d.tj did/¬¦«| d0tdd1d2¬¦«| d3tdd4d5¬¦«| d6tdd4d7¬¦«| d8tdd9d:¬¦«| d;d!d<¬#¦«| d=tdd>d?¬¦«| d@tddA¬¦«| |¦«S)BzParses command-line arguments.zrequired named argumentsz --image-nameTz-The image name for the Dataproc custom image.)ÚtypeÚrequiredÚhelpz--dataproc-version)r)r+z--base-image-urizàThe full image URI for the base Dataproc image. 
The customiziation script will be executed on top of this image instead of an out-of-the-box Dataproc image. This image must be a valid Dataproc image. z--base-image-familyzlThe source image family URI. The latest non-depracated image associated with the family will be used. z--customization-scriptz)User's script to install custom packages.z --metadataFaVM metadata which can be read by the customization script with `/usr/share/google/get_metadata_value attributes/` at runtime. The value of this flag takes the form of `key1=value1,key2=value2,...`. If the value includes special characters (e.g., `=`, `,` or spaces) which needs to be escaped, consider encoding the value, then decode it back in the customization script. See more information about VM metadata on https://cloud.google.com/sdk/gcloud/reference/compute/instances/create. z--zonez(GCE zone used to build the custom image.z --gcs-bucketzIGCS bucket used to store files and logs when building custom image.z--familyzdataproc-custom-imagez#(Optional) The family of the image.)r)r*Údefaultr+z --project-idz»The project Id of the project where the custom image will be created and saved. The default value will be set to the project id specified by `gcloud config get-value project`.z--oauthz‚A local path to JSON credentials for your GCE project. The default oauth is the application-default credentials from gcloud.z--machine-typez n1-standard-1z`(Optional) Machine type used to build custom image. Default machine type is n1-standard-1.z--no-smoke-testÚ store_truezl(Optional) Disables smoke test to verify if the custom image can create a functional Dataproc cluster.)Úactionr+z --networkÚa6(Optional) Network interface used to launch the VM instance that builds the custom image. Default network is 'global/networks/default' when no network and subnetwork arguments are provided. 
If the default network does not exist in your project, please specify a valid network interface.z --subnetworka(Optional) The subnetwork that is used to launch the VM instance that builds the custom image. A full subnetwork URL is required. Default subnetwork is None. For shared VPC only provide this parameter and do not use the --network argument.z--no-external-ipa(Optional) Disables external IP for the image build VM. The VM will not be able to access the internet, but if Private Google Access is enabled for the subnetwork, it can still access Google services (e.g., GCS) through internal IP of the VPC.z--service-accountr,a(Optional) The service account that is used to launch the VM instance that builds the custom image. If not specified, the default service account under the GCE project will be used. The scope of this service account is defaulted to /auth/cloud-platform.z--extra-sourceszä(Optional) Additional files/directories uploaded along with customization script. This argument is evaluated to a json dictionary. For example: '--extra-sources "{\"notes.txt\": \"/path/to/notes.txt\"}"' z --disk-sizeéz¦(Optional) The size in GB of the disk attached to the VM instance that builds the custom image. If not specified, the default value of 15 GB will be used.z --acceleratorNz(Optional) The accelerators (e.g. GPUs) attached to the VM instance that builds the custom image. If not specified, no accelerators are attached.z--storage-locationz(Optional) The storage location (e.g. US, us-central1) of the custom GCE image. If not specified, the default GCE image storage location is used.z--shutdown-instance-timer-seci,zü(Optional) The time to wait in seconds before shutting down the VM instance. This value may need to be increased if your init script generates a lot of output on stdout. 
If not specified, the default value of 300 seconds will be used.z --dry-runz8(Optional) Only generates script without creating image.z--trusted-certz tls/db.derz(Optional) Inserts the specified DER-format certificate into the custom image's EFI boot sector for use with secure boot.z--optional-componentszµOptional Components to be installed with the image. Can be a comma-separated list of components, e.g., TRINO,ZEPPELIN. (Only supported for Dataproc Images 2.3 and above))rÚArgumentParserÚadd_argument_groupÚ add_argumentÚstrÚadd_mutually_exclusive_grouprrÚversion_help_textrrÚjsonÚloadsÚintr'Ú parse_args)ÚargsÚparserÚ required_argsÚ image_argss rr:r:<sÄ€å Ô "Ñ $Ô $€&Ø×+Ò+Ð,FÑGÔG€-Ø×ÒØÝ ØØ >ð ñ@ô@ð@ð ×9Ò9Ñ;Ô;€*Ø ×ÒØÝ Ý Ô &ðñ(ô(ð(ð ×ÒØÝ %ð ðñ ô ð ð ×ÒØÝ ,ð ðñ ô ð ð ×ÒØÝ ØØ :ð ñ<ô<ð<ð ×ÒØÝ Øð ð ñ  ô  ð  ð×ÒØÝ ØØ 9ð ñ;ô;ð;ð ×ÒØÝ Øð ð ñ!ô!ð!ð  ×ÒØÝ ØØ%Ø 4ð ñ6ô6ð6ð  ×ÒØÝ Øð 9ð ñ:ô:ð:ð ×ÒØÝ Øð Oð ñPôPðPð  ×ÒØÝ ØØð 0ð ñ1ô1ð1ð ×ÒØØ ð 3ðñ4ô4ð4ð  ×ÒØÝ ØØð $ð ñ %ô %ð %ð ×ÒØÝ ØØð ,ð ñ-ô-ð-ð ×ÒØØ ð 5ðñ6ô6ð6ð ×ÒØÝ ØØð7ð ñ 8ô 8ð 8ð ×ÒØÝ Œ:ØØð ð ñ  ô  ð  ð ×ÒØÝ ØØðð ñôðð ×ÒØÝ ØØðð ñôðð ×ÒØÝ ØØðð ñôðð ×ÒØ%Ý ØØð&ð ñ 'ô 'ð 'ð ×ÒØØ Ø IðñKôKðKð ×ÒØÝ ØØð Fð ñGôGðGð ×ÒØÝ Øð <ð ñôðð × Ò ˜4Ñ Ô Ð r)Ú__doc__rr7ÚreÚcustom_image_utilsrÚcompilerrrrr#rrrr'r:©rrúrDsðððð €€€Ø € € € Ø € € € à(Ð(Ð(Ð(Ð(Ð(ð ”ÐDÑEÔE€Ø"”*ÐxÑyÔy€Ø#˜œð%GñHôHÐØ'˜RœZÐ(VÑWÔWÐðBðBðBÐð ð ð ð  ð ð ð  ð ð ð ððð!ð!ð!ð!ð!r====== Filename: ./custom_image_utils/__pycache__/__init__.cpython-311.pyc ====== § Ó·pfãó—dS)N©róúX/usr/local/google/home/cjac/src/github/cjac/custom-images/custom_image_utils/__init__.pyúrsðððr====== Filename: ./custom_image_utils/__pycache__/shell_script_executor.cpython-311.pyc ====== § ×@gÆãó0—dZddlZddlZddlZddlZd„ZdS)z Shell script executor. 
éNcóH—tjd¬¦«} | | d¦«¦«| ¦«| ¦«t jd|jgtj tj ¬¦«}|  ¦«|j dkrtd¦«‚ tj|j¦«dS#t"$rYdSwxYw# tj|j¦«w#t"$rYwwxYwxYw) zRuns a Shell script.F)Údeletezutf-8Úbash)ÚstdoutÚstderrrzError building custom image.N)ÚtempfileÚNamedTemporaryFileÚwriteÚencodeÚflushÚcloseÚ subprocessÚPopenÚnameÚsysrrÚwaitÚ returncodeÚ RuntimeErrorÚosÚremoveÚOSError)Ú shell_scriptÚ temp_fileÚpipes úy/usr/local/google/home/cjac/src/github/LLC-Technologies-Collier/custom-images/custom_image_utils/shell_script_executor.pyÚrunrs6€õÔ)°Ð7Ñ7Ô7€)ð Ø ‡O‚OL×'Ò'¨Ñ0Ô0Ñ1Ô1Ð1Ø ‡O‚OÑÔÐØ ‡O‚OÑÔÐõ Ô Ø ”Ð ÝŒzÝŒzð ñ ô €Dð ‡I‚IK„K€KØ „˜!ÒÐÝ Ð7Ñ 8Ô 8Ð8ðð Ý„i ”ÑÔÐÐÐøÝ ð ð ð Ø €d€dð øøøøð Ý„i ”ÑÔÐÐøÝ ð ð ð Ø €dð øøøøøøsB—B0C5à C$Ã$ C2Ã1C2Ã5D!Ã7DÄD!Ä DÄD!ÄDÄD!)Ú__doc__rrrrr©órúr sQðððð € € € ØÐÐÐØ € € € Ø€€€ð ð ð ð ð r====== Filename: ./env.json.bz2 ====== BZh91AY&SYmÒBãb߀Pÿð?¿ß0¾ÿßê0˜Öµ¶‘‰<“bž“jh4! mž)½MFÐÑÐÓMB#QM&Ò=M4ÓOPz€ÄcPÄ1ùâ÷"d$OíU v>V’Jš«™ôå›ÆÔ ÙÑRǪ# AŒ2ï;µ·\[o:VËÁy®a^W,´2¾ e e9¯È‹Ñ)jdí);ƒÃÑèûã©MÇ{­](eJ‡GîÜ£D‰žwn¥m%tŸô¦‚ð&„& Ä.Sv‘œ5’]ª©K€´ìU¼×Üš6ZíY1©>µBô1JÓa'Y À"¼U±÷HRqÁ•€>´Åj=ìvUµh/š™>³Ì"`Šà÷˜¾zfŒI×½ÚGdFÀs¶’¤’å¾ÑhAËÆbë ÙU.2„(¢i┥Ìä 8~k´o)ÇÕtYÜ/*Ÿwb¾zdv?uVóÒ¾J H¡IHš}vL윊 qÀqd4 aHª²<^æ™þÀ¤`xàbˆ p ÄU+*¶ $PÅ4’a„ºq?Áþ.äŠp¡ Û¤…Æ====== Filename: ./README.md ====== # Build Dataproc custom images This page describes how to generate a custom Dataproc image. ## Important notes To help ensure that clusters receive the latest service updates and bug fixes, the creation of clusters with a custom image is limited to **365 days** from the image creation date, but existing custom-image clusters can run indefinitely. Automation to continuously build a custom image may be necessary if you wish to create clusters with a custom image for a period greater than 365 days. 
Creating clusters with expired custom images is possible by following these [instructions](https://cloud.google.com/dataproc/docs/guides/dataproc-images#how_to_create_a_cluster_with_an_expired_custom_image), but Cloud Dataproc cannot guarantee support of issues that arise with these clusters. ## Requirements 1. Python 2. gcloud 3. Bash 3.0. 4. A GCE project with billing, Google Cloud Dataproc API, Google Compute Engine API, Google Secret Manager API, and Google Cloud Storage APIs enabled. 5. Use `gcloud config set project PROJECT_ID` to specify which project to use to create and save your custom image. ## Generate custom image To generate a custom image, you can run the following command: ```shell python generate_custom_image.py \ --image-name 'IMAGE_NAME' \ --dataproc-version 'DATAPROC_VERSION' \ --customization-script 'CUSTOMIZATION_SCRIPT_PATH' \ --zone 'ZONE' \ --gcs-bucket 'gs://BUCKET_NAME' ``` ### Arguments * **--image-name**: The name for the custom image. * **--dataproc-version**: The Dataproc version for this custom image to build on. Examples: `2.2.32-debian12`, `2.2.31-debian12`, `2.2.31-ubuntu22`. If the sub-minor version is unspecified, the latest available one will be used. Examples: `2.2-rocky9`, `2.2-debian12`. For a complete list of Dataproc image versions, please review the output of `gcloud compute images list --project cloud-dataproc`. To understand Dataproc versioning, please refer to [documentation](https://cloud.google.com/dataproc/docs/concepts/versioning/overview). **This argument is mutually exclusive with `--base-image-uri` and `--base-image-family`**. * **--base-image-uri**: The full image URI for the base Dataproc image. The customization script will be executed on top of this image instead of an out-of-the-box Dataproc image. This image must be a valid Dataproc image. **This argument is mutually exclusive with `--dataproc-version` and `--base-image-family`**. * **--base-image-family**: The image family that the boot disk will be initialized with. The latest non-deprecated image from the family will be used.
An example base image family URI is `projects/PROJECT_NAME/global/images/family/FAMILY_NAME`. To get the list of image families (and the associated image), run `gcloud compute images list [--project PROJECT_NAME]`. **This argument is mutually exclusive with `--dataproc-version` and `--base-image-uri`**. * **--customization-script**: The script used to install custom packages on the image. * **--zone**: The GCE zone for running your GCE instance. * **--gcs-bucket**: A GCS bucket to store the logs of building the custom image. #### Optional Arguments * **--family**: The image family that the newly created custom image is assigned to. The default is `dataproc-custom-image`. * **--project-id**: The project Id of the project where the custom image is created and saved. The default project Id is the current project id specified in `gcloud config get-value project`. * **--oauth**: The OAuth credential file used to call Google Cloud APIs. The default OAuth is the application-default credentials from gcloud. * **--machine-type**: The machine type used to build the custom image. The default is `n1-standard-1`. * **--no-smoke-test**: This parameter is used to disable smoke testing the newly built custom image. The smoke test is used to verify if the newly built custom image can create a functional Dataproc cluster. Disabling this step will speed up the custom image build process; however, it is not advised. Note: The smoke test will create a Dataproc cluster with the newly built image, run a short job, and delete the cluster at the end. * **--network**: This parameter specifies the GCE network to be used to launch the GCE VM instance which builds the custom Dataproc image. The default network is 'global/networks/default'. If the default network does not exist in your project, please specify a valid network interface. For more information on network interfaces, please refer to [GCE VPC documentation](https://cloud.google.com/vpc/docs/vpc).
* **--subnetwork**: This parameter specifies the subnetwork that is used to launch the VM instance that builds the custom Dataproc image. A full subnetwork URL is required. The default subnetwork is None. For more information, please refer to [GCE VPC documentation](https://cloud.google.com/vpc/docs/vpc). * **--no-external-ip**: This parameter disables the external IP for the image build VM. The VM will not be able to access the internet, but if [Private Google Access](https://cloud.google.com/vpc/docs/configure-private-google-access) is enabled for the subnetwork, it can still access Google services (e.g., GCS) through internal IP of the VPC. * **--service-account**: The service account that is used to launch the VM instance that builds the custom Dataproc image. The scope of this service account is defaulted to "/auth/cloud-platform", which authorizes the VM instance to access all Cloud Platform services that are granted by IAM roles. Note: The IAM role must allow the VM instance to access the GCS bucket in order to access scripts and write logs. * **--extra-sources**: Additional files/directories uploaded along with customization script. This argument is evaluated to a json dictionary. * **--disk-size**: The size in GB of the disk attached to the VM instance used to build the custom image. The default is `30` GB. * **--accelerator**: The accelerators (e.g. GPUs) attached to the VM instance used to build the custom image. This flag supports the same [values](https://cloud.google.com/sdk/gcloud/reference/compute/instances/create#--accelerator) as `gcloud compute instances create --accelerator` flag. By default no accelerators are attached. * **--base-image-uri**: The partial image URI for the base Dataproc image. The customization script will be executed on top of this image instead of an out-of-the-box Dataproc image. This image must be a valid Dataproc image. The format of the partial image URI is the following: `projects/PROJECT_NAME/global/images/IMAGE_NAME`.
* **--storage-location**: The storage location (e.g. US, us-central1) of the custom GCE image. This flag supports the same [values](https://cloud.google.com/sdk/gcloud/reference/compute/images/create#--storage-location) as `gcloud compute images create --storage-location` flag. If not specified, the default GCE image storage location is used. * **--shutdown-instance-timer-sec**: The time to wait in seconds before shutting down the VM instance. This value may need to be increased if your init script generates a lot of output on stdout. If not specified, the default value of 300 seconds will be used. * **--dry-run**: Dry run mode which only validates input and generates workflow script without creating image. Disabled by default. * **--trusted-cert**: A certificate in DER format to be inserted into the custom image's EFI boot sector. Can be generated by following examples/secure-boot/README.md. This argument is mutually exclusive with `--base-image-family`. * **--metadata**: VM metadata which can be read by the customization script with `/usr/share/google/get_metadata_value attributes/KEY` at runtime. The value of this flag takes the form of `key1=value1,key2=value2,...`. If the value includes special characters (e.g., `=`, `,` or spaces) which needs to be escaped, consider encoding the value, then decode it back in the customization script. See more information about VM metadata on https://cloud.google.com/sdk/gcloud/reference/compute/instances/create. * **--optional-components**: Comma-separated list of optional components for 2.3+ DPGCE Images. This will install the optional components in the image. For example, `SOLR,RANGER,TRINO,DOCKER,FLINK,HIVE_WEBHCAT,ZEPPELIN,HUDI,ICEBERG,PIG` is the full list of valid optional components. #### Overriding cluster properties with a custom image You can use custom images to overwrite any [cluster properties](https://cloud.google.com/dataproc/docs/concepts/configuring-clusters/cluster-properties) set during cluster creation.
If a user creates a cluster with your custom image but sets cluster properties different from those you set with your custom image, your custom image cluster property settings will take precedence. To set cluster properties with your custom image: In your custom image [customization script](https://cloud.google.com/dataproc/docs/guides/dataproc-images#running_the_code), create a `dataproc.custom.properties` file in `/etc/google-dataproc`, then set cluster property values in the file. * Sample `dataproc.custom.properties` file contents: ```shell dataproc.conscrypt.provider.enable=true dataproc.logging.stackdriver.enable=false ``` * Sample customization script file-creation snippet to override two cluster properties: ```shell cat <<EOF >/etc/google-dataproc/dataproc.custom.properties dataproc.conscrypt.provider.enable=true dataproc.logging.stackdriver.enable=false EOF ``` ### Examples #### Create a custom image Create a custom image with name `custom-image-1-5-9` with Dataproc version `1.5.9-debian10`: ```shell python generate_custom_image.py \ --image-name custom-image-1-5-9 \ --dataproc-version 1.5.9-debian10 \ --customization-script ~/custom-script.sh \ --metadata 'key1=value1,key2=value2' \ --zone us-central1-f \ --gcs-bucket gs://my-test-bucket ``` #### Create a custom image without running smoke test ```shell python generate_custom_image.py \ --image-name custom-image-1-5-9 \ --dataproc-version 1.5.9-debian10 \ --customization-script ~/custom-script.sh \ --zone us-central1-f \ --gcs-bucket gs://my-test-bucket \ --no-smoke-test ``` ====== Filename: ./generate_custom_image.py ====== # Copyright 2017 Google Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the 'License'); # you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an 'AS IS' BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Generate custom Dataproc image. This python script is used to generate a custom Dataproc image for the user. With the required arguments such as custom install packages script and Dataproc version, this script will run the following steps in order: 1. Get user's gcloud project ID. 2. Get Dataproc's base image name with Dataproc version. 3. Run Shell script to create a custom Dataproc image. 1. Create a disk with Dataproc's base image. 2. Create a GCE instance with the disk. 3. Run custom install packages script to install custom packages. 4. Shut down the instance. 5. Create custom Dataproc image from the disk. 4. Set the custom image label (required for launching custom Dataproc image). 5. Run a Dataproc workflow to smoke test the custom image. Once this script is completed, the custom Dataproc image should be ready to use.
""" import logging import os import subprocess import sys from custom_image_utils import args_inferer from custom_image_utils import args_parser from custom_image_utils import expiration_notifier from custom_image_utils import image_labeller from custom_image_utils import shell_image_creator from custom_image_utils import smoke_test_runner logging.basicConfig() _LOG = logging.getLogger(__name__) _LOG.setLevel(logging.WARN) def parse_args(raw_args): """Parses and infers command line arguments.""" args = args_parser.parse_args(raw_args) _LOG.info("Parsed args: {}".format(args)) args_inferer.infer_args(args) _LOG.info("Inferred args: {}".format(args)) return args def perform_sanity_checks(args): _LOG.info("Performing sanity checks...") # Customization script if not os.path.isfile(args.customization_script): raise Exception("Invalid path to customization script: '{}' is not a file.".format( args.customization_script)) # Check the image doesn't already exist. command = "gcloud compute images describe {} --project={}".format( args.image_name, args.project_id) with open(os.devnull, 'w') as devnull: pipe = subprocess.Popen( [command], stdout=devnull, stderr=devnull, shell=True) pipe.wait() if pipe.returncode == 0: raise RuntimeError("Image {} already exists.".format(args.image_name)) _LOG.info("Passed sanity checks...") def main(): """Generates custom image.""" args = parse_args(sys.argv[1:]) perform_sanity_checks(args) shell_image_creator.create(args) image_labeller.add_label(args) smoke_test_runner.run(args) expiration_notifier.notify(args) if __name__ == "__main__": main() ====== Filename: ./env.json.zst ====== (µ/ýdÇí æ™U#kîŽÿmâËõ’˜Z ö1õT€„˜Ð‰«å£2R+P€KKK‹”g¹VÊ)|«êJoK@ï}_øc#ôº-:BÎÑý‡/µòćæÐ‘d_WWŒã|7=¸VÞXV».|PWç-Žœrœh@«+) P±ðz6¸´…Û* h>¢N>(ÔŰŸY¹% Ř3€9 ka¤ð¢§†ºI3¶ª@Òè‡Îˆ7ô7ƒ9\#߈¥NrBAš6 N|SÀ†©‡Óà¯;TôŒ€> À¬ ====== Filename: ./scripts/customize_conda.sh ====== #!/usr/bin/env bash # Copyright 2019 Google Inc. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the 'License'); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an 'AS IS' BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. set -euxo pipefail # This customization-script can be used to customize the conda environment. # It expects the following metadata: # # conda-component: (Required) Must be either ANACONDA or MINICONDA3. Please # make sure the base image supports the component passed here, else the # script will fail. Anaconda is not supported on 2.0 images. For information # on Anaconda vs Miniconda, refer to Miniconda's latest documentation # https://docs.conda.io/en/latest/miniconda.html # # conda-env-config-uri: (Optional) Must be a GCS URI to the yaml config # file. # # conda-packages: (Optional) A list of conda packages with versions to be # installed in the base environment. Must be of the format # :#:... # # pip-packages: (Optional) A list of pip packages with versions to be # installed in the base environment. Must be of the format # :#:... # # conda-env-config-uri is mutually exclusive with conda-packages and # pip-packages. If both are provided, the script will fail. # If environment config file does not contain name of the environment, the name # "custom" will be used by default. # # # Examples # # The following example extracts config file from your environment, copies it to # your GCS bucket and uses it to create a cluster. 
# # For gcloud SDK < 402, use `gsutil` instead of `gcloud storage` # # conda env export --name= > environment.yaml # gcloud storage cp environment.yaml gs:///environment.yaml # python generate_custom_image.py \ # --image-name \ # --dataproc-version "1.5.34-debian10" \ # --customization-script scripts/customize_conda.sh \ # --zone \ # --gcs-bucket gs:// \ # --metadata 'conda-component=MINICONDA3,dataproc:conda.env.config.uri=gs:///environment.yaml' # # # The following example installs the specified conda and pip packages into the # base environment. # python generate_custom_image.py \ # --image-name \ # --dataproc-version "1.5.34-debian10" \ # --customization-script scripts/customize_conda.sh \ # --zone \ # --gcs-bucket gs:// \ # --metadata 'conda-component=MINICONDA3,conda-packages=pytorch:1.4.0#visions:0.7.1,pip-packages=tokenizers:0.10.1#numpy:1.19.2' function customize_conda() { local conda_component local conda_env_config_uri local conda_packages local pip_packages local conda_bin_dir conda_component=$(/usr/share/google/get_metadata_value attributes/conda-component || true) conda_env_config_uri=$(/usr/share/google/get_metadata_value attributes/conda-env-config-uri || true) conda_packages=$(/usr/share/google/get_metadata_value attributes/conda-packages || true) pip_packages=$(/usr/share/google/get_metadata_value attributes/pip-packages || true) validate_conda_component "${conda_component}" if [[ -n "${conda_env_config_uri}" && (( -n "${conda_packages}" || -n "${pip_packages}" )) ]]; then echo "conda-env-config-uri is mutually exclusive with conda-packages and pip-packages." 
exit 1 fi if [[ "${conda_component}" == 'ANACONDA' ]]; then conda_bin_dir="/opt/conda/anaconda/bin" elif [[ "${conda_component}" == 'MINICONDA3' ]]; then conda_bin_dir="/opt/conda/miniconda3/bin" fi if [[ -n "${conda_env_config_uri}" ]]; then customize_with_config_file "${conda_bin_dir}" "${conda_env_config_uri}" else customize_with_package_list "${conda_bin_dir}" "${conda_packages}" "${pip_packages}" fi } function validate_conda_component() { local -r conda_component=$1 if [[ -z "${conda_component}" ]]; then echo "Expected metadata conda-component not found" exit 1 fi if [[ "${conda_component}" != 'ANACONDA' && "${conda_component}" != 'MINICONDA3' ]]; then echo "Metadata conda-component should either be ANACONDA or MINICONDA3" exit 1 fi } function version_le(){ [[ "$1" = "$(echo -e "$1\n$2"|sort -V|head -n1)" ]]; } function version_lt(){ [[ "$1" = "$2" ]]&& return 1 || version_le "$1" "$2";} # With the 402.0.0 release of gcloud sdk, `gcloud storage` can be # used as a more performant replacement for `gsutil` gsutil_cmd="gcloud storage" gcloud_sdk_version="$(gcloud --version | awk -F'SDK ' '/Google Cloud SDK/ {print $2}')" if version_lt "${gcloud_sdk_version}" "402.0.0" ; then gsutil_cmd="$(which gsutil) -o GSUtil:check_hashes=never" fi function customize_with_config_file() { local -r conda_bin_dir=$1 local -r conda_env_config_uri=$2 local temp_config_file temp_config_file=$(mktemp /tmp/conda_env_XXX.yaml) ${gsutil_cmd} cp "${conda_env_config_uri}" "${temp_config_file}" conda_env_name="$(grep 'name: ' "${temp_config_file}" | awk '{print $2}')" if [[ -z "${conda_env_name}" ]]; then conda_env_name="custom" fi create_and_activate_environment "${conda_bin_dir}" "${conda_env_name}" "${temp_config_file}" } function create_and_activate_environment() { local -r conda_bin_dir=$1 local -r conda_env_name=$2 local -r conda_env_config=$3 "${conda_bin_dir}/conda" env create --quiet --name="${conda_env_name}" --file="${conda_env_config}" source "${conda_bin_dir}/activate" 
"${conda_env_name}" # Set property conda.env, which can be used during activate of the conda # component to activate the right environment. local -r conda_properties_path=/etc/google-dataproc/conda.properties echo "conda.env=$conda_env_name" >> "${conda_properties_path}" } function customize_with_package_list() { local -r conda_bin_dir=$1 local conda_packages=$2 local pip_packages=$3 if [[ -n "${conda_packages}" ]]; then local -a packages conda_packages=$(echo "${conda_packages}" | sed -r 's/:/==/g') IFS='#' read -r -a packages <<< "${conda_packages}" validate_package_formats "${packages[@]}" # Conda will upgrade dependencies only if required, and fail if conflict # resolution with existing packages is not possible. "${conda_bin_dir}/conda" install "${packages[@]}" --yes fi if [[ -n "${pip_packages}" ]]; then local -a packages pip_packages=$(echo "${pip_packages}" | sed -r 's/:/==/g') IFS='#' read -r -a packages <<< "${pip_packages}" validate_package_formats "${packages[@]}" # Pip will upgrade dependencies only if required. Pip does not check for # conflicts and may result in inconsistent environment. "${conda_bin_dir}/pip" install -U --upgrade-strategy only-if-needed "${packages[@]}" fi } function validate_package_formats() { local -r packages=("$@") local -r regex='.+==[0-9]+[\\.[0-9]+]*' for package in "${packages[@]}"; do if ! [[ "${package}" =~ $regex ]]; then echo "Invalid package format ${package}" exit 1 fi done } customize_conda ====== Filename: ./tests/__init__.py ====== ====== Filename: ./tests/test_create_custom_image.sh ====== #!/usr/bin/env bash # Copyright 2019 Google Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the 'License'); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an 'AS IS' BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. set -euxo pipefail readonly CURRENT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd) readonly REPO_DIR=$(realpath "${CURRENT_DIR}/..") readonly TEST_SUFFIX=$(tr -dc 'a-z0-9' /dev/null 2>&1 && pwd) readonly REPO_DIR=$(realpath "${CURRENT_DIR}/..") readonly TEST_SUFFIX=$(tr -dc 'a-z0-9' &1 | grep 'startup-script' | tee /tmp/custom-image-my-image-20190611-160823/logs/startup-script.log || true echo 'Checking customization script result.' if grep 'BuildFailed:' /tmp/custom-image-my-image-20190611-160823/logs/startup-script.log; then echo -e "${RED}Customization script failed.${NC}" exit 1 elif grep 'BuildSucceeded:' /tmp/custom-image-my-image-20190611-160823/logs/startup-script.log; then echo -e "${GREEN}Customization script succeeded.${NC}" else echo 'Unable to determine the customization script result.' exit 1 fi echo 'Creating custom image.' 
gcloud compute images create my-image --project=my-project --source-disk-zone=us-west1-a --source-disk=my-image-install --storage-location=us-east1 --family=debian9 touch /tmp/custom-image-my-image-20190611-160823/image_created } trap exit_handler EXIT mkdir -p /tmp/custom-image-my-image-20190611-160823/logs main "$@" 2>&1 | tee /tmp/custom-image-my-image-20190611-160823/logs/workflow.log """ class TestShellScriptGenerator(unittest.TestCase): def test_generate_shell_script(self): args = { 'run_id': 'custom-image-my-image-20190611-160823', 'family': 'debian9', 'image_name': 'my-image', 'customization_script': '/tmp/my-script.sh', 'metadata': 'key1=value1,key2=value2', 'extra_sources': {"ext'ra_src.txt": "/path/to/extra.txt"}, 'machine_type': 'n1-standard-2', 'disk_size': 40, 'accelerator': 'type=nvidia-tesla-v100,count=2', 'gcs_bucket': 'gs://my-bucket', 'network': 'my-network', 'subnetwork': 'my-subnet', 'no_external_ip': True, 'zone': 'us-west1-a', 'dataproc_base_image': 'projects/cloud-dataproc/global/images/dataproc-1-4-deb9-20190510-000000-rc01', 'service_account': 'my-service-account', 'oauth': '', 'project_id': 'my-project', 'storage_location': 'us-east1', 'shutdown_timer_in_sec': 500, 'base_image_family': 'projects/my-dataproc-project/global/images/family/debian-10' } script = shell_script_generator.Generator().generate(args) self.assertEqual(script, _expected_script) if __name__ == '__main__': unittest.main() ====== Filename: ./tests/data/customization_script_with_extra_sources.sh ====== #!/usr/bin/env bash # Copyright 2019 Google Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the 'License'); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an 'AS IS' BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. cat extra/source.txt ====== Filename: ./tests/data/extra_source.txt ====== # Copyright 2019 Google Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the 'License'); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an 'AS IS' BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. Example extra source file ====== Filename: ./tests/test_infer_subminor_version.sh ====== #!/usr/bin/env bash # Copyright 2019 Google Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the 'License'); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an 'AS IS' BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
set -euxo pipefail readonly CURRENT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd) readonly REPO_DIR=$(realpath "${CURRENT_DIR}/..") readonly TEST_SUFFIX=$(tr -dc 'a-z0-9' "${install_log}" 2>&1 && retval=$? || { retval=$? ; cat "${install_log}" ; } set +x if [[ $retval == 0 ]] ; then return 0 ; fi sleep 5 done return 1 ) function configure_service_account() { # Create service account if gcloud iam service-accounts list --filter email="${GSA}" 2>&1 | grep -q 'Listed 0 items.' ; then # Create service account for this purpose echo "creating pre-init customization service account ${GSA}" gcloud iam service-accounts create "${SA_NAME}" \ --description="Service account for pre-init customization" \ --display-name="${SA_NAME}" fi if [[ -d tls ]] ; then mv tls "tls-$(date +%s)" ; fi eval "$(bash examples/secure-boot/create-key-pair.sh)" execute_with_retries gcloud projects add-iam-policy-binding "${PROJECT_ID}" \ --member="serviceAccount:${GSA}" \ --role="roles/dataproc.worker" \ --condition=None # Grant the service account access to buckets in this project # TODO: this is over-broad and should be limited only to the buckets # used by these clusters gsutil iam ch "serviceAccount:${GSA}:roles/storage.objectViewer" "gs://${BUCKET}" # KMS_KEY_URI =~ m:projects/.../locations/.../keyRings/.../cryptoKeys/...: ( eval "$(echo "${KMS_KEY_URI}" | perl -e '$l=; $l =~ m:([^/]+)/([^/]+)/([^/]+)/([^/]+)/([^/]+)/([^/]+)/([^/]+)/([^/]+):; print(join($/, ("$1=$2", "$3=$4", "$5=$6", "$7=$8")), $/)')" gcloud kms keys add-iam-policy-binding "${cryptoKeys}" \ --location "${locations}" \ --keyring "${keyRings}" \ --member "serviceAccount:${GSA}" \ --role "roles/cloudkms.cryptoKeyEncrypterDecrypter" ) gcloud projects add-iam-policy-binding "${PROJECT_ID}" \ --member="serviceAccount:${GSA}" \ --role=roles/cloudkms.cryptoKeyDecrypter \ for storage_object_role in 'User' 'Creator' 'Viewer' ; do execute_with_retries gcloud projects add-iam-policy-binding "${PROJECT_ID}" \ 
--member="serviceAccount:${GSA}" \ --role="roles/storage.object${storage_object_role}" \ --condition=None done for secret in "${public_secret_name}" "${private_secret_name}" ; do for sm_role in 'viewer' 'secretAccessor' ; do # Grant the service account permission to list the secret execute_with_retries gcloud secrets -q add-iam-policy-binding "${secret}" \ --member="serviceAccount:${GSA}" \ --role="roles/secretmanager.${sm_role}" \ --condition=None done done execute_with_retries gcloud projects add-iam-policy-binding "${PROJECT_ID}" \ --member="serviceAccount:${GSA}" \ --role=roles/compute.instanceAdmin.v1 \ --condition=None execute_with_retries gcloud iam service-accounts add-iam-policy-binding "${GSA}" \ --member="serviceAccount:${GSA}" \ --role=roles/iam.serviceAccountUser \ --condition=None } function revoke_bindings() { execute_with_retries gcloud projects remove-iam-policy-binding "${PROJECT_ID}" \ --member="serviceAccount:${GSA}" \ --role="roles/dataproc.worker" # Revoke the service account's access to buckets in this project for storage_object_role in 'User' 'Creator' 'Viewer' ; do execute_with_retries gcloud projects remove-iam-policy-binding "${PROJECT_ID}" \ --member="serviceAccount:${GSA}" \ --role="roles/storage.object${storage_object_role}" done for secret in "${public_secret_name}" "${private_secret_name}" ; do # Revoke the service account's permission to list and access the secret for sm_role in 'viewer' 'secretAccessor' ; do execute_with_retries gcloud secrets -q remove-iam-policy-binding "${secret}" \ --member="serviceAccount:${GSA}" \ --role="roles/secretmanager.${sm_role}" \ --condition=None done done execute_with_retries gcloud projects remove-iam-policy-binding "${PROJECT_ID}" \ --member="serviceAccount:${GSA}" \ --role=roles/compute.instanceAdmin.v1 execute_with_retries gcloud iam service-accounts remove-iam-policy-binding "${GSA}" \ --member="serviceAccount:${GSA}" \ --role=roles/iam.serviceAccountUser } export PROJECT_ID="$(jq -r 
.PROJECT_ID env.json)" export PURPOSE="$(jq -r .PURPOSE env.json)" export BUCKET="$(jq -r .BUCKET env.json)" export KMS_KEY_URI="$(jq -r .KMS_KEY_URI env.json)" SA_NAME="sa-${PURPOSE}" if [[ "${PROJECT_ID}" =~ ":" ]] ; then GSA="${SA_NAME}@${PROJECT_ID#*:}.${PROJECT_ID%:*}.iam.gserviceaccount.com" else GSA="${SA_NAME}@${PROJECT_ID}.iam.gserviceaccount.com" fi gcloud config set project "${PROJECT_ID}" gcloud auth login configure_service_account # screen session name session_name="build-current-images" #readonly timestamp="$(date +%F-%H-%M)" #readonly timestamp="2025-02-15-03-29" readonly timestamp="2025-03-20-19-43" export timestamp export tmpdir=/tmp/${timestamp}; mkdir -p ${tmpdir} export ZONE="$(jq -r .ZONE env.json)" gcloud compute instances list --zones "${ZONE}" --format json > ${tmpdir}/instances.json gcloud compute images list --format json > ${tmpdir}/images.json # Run generation scripts simultaneously for each dataproc image version screen -L -US "${session_name}" -c examples/secure-boot/pre-init.screenrc function find_disk_usage() { # grep maximum-disk-used /tmp/custom-image-*/logs/startup-script.log grep -H 'Customization script' /tmp/custom-image-*/logs/workflow.log for workflow_log in $(grep -Hl "Customization script" /tmp/custom-image-*/logs/workflow.log) ; do startup_log=$(echo "${workflow_log}" | sed -e 's/workflow.log/startup-script.log/') grep -v '^\[' "${startup_log}" \ | grep -A7 'Filesystem.*Avail' \ | perl examples/secure-boot/genline.pl "${workflow_log}" done } revoke_bindings ====== Filename: ./examples/secure-boot/dask.sh ====== #!/bin/bash # Copyright 2020,2021,2023,2024 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # This initialization action script will install Dask and other relevant # libraries on a Dataproc cluster. This is supported for either "yarn" or # "standalone" runtimes Please see dask.org and yarn.dask.org for more # information. set -euxo pipefail function os_id() { grep '^ID=' /etc/os-release | cut -d= -f2 | xargs ; } function is_ubuntu() { [[ "$(os_id)" == 'ubuntu' ]] ; } function is_ubuntu18() { is_ubuntu && [[ "$(os_version)" == '18.04'* ]] ; } function is_debian() { [[ "$(os_id)" == 'debian' ]] ; } function is_debuntu() { is_debian || is_ubuntu ; } function print_metadata_value() { local readonly tmpfile=$(mktemp) http_code=$(curl -f "${1}" -H "Metadata-Flavor: Google" -w "%{http_code}" \ -s -o ${tmpfile} 2>/dev/null) local readonly return_code=$? # If the command completed successfully, print the metadata value to stdout. if [[ ${return_code} == 0 && ${http_code} == 200 ]]; then cat ${tmpfile} fi rm -f ${tmpfile} return ${return_code} } function print_metadata_value_if_exists() { local return_code=1 local readonly url=$1 print_metadata_value ${url} return_code=$? return ${return_code} } function get_metadata_value() { set +x local readonly varname=$1 local -r MDS_PREFIX=http://metadata.google.internal/computeMetadata/v1 # Print the instance metadata value. print_metadata_value_if_exists ${MDS_PREFIX}/instance/${varname} return_code=$? # If the instance doesn't have the value, try the project. if [[ ${return_code} != 0 ]]; then print_metadata_value_if_exists ${MDS_PREFIX}/project/${varname} return_code=$? 
fi set -x return ${return_code} } function get_metadata_attribute() ( set +x local -r attribute_name="$1" local -r default_value="${2:-}" get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}" ) function is_cuda12() { [[ "${CUDA_VERSION%%.*}" == "12" ]] ; } function is_cuda11() { [[ "${CUDA_VERSION%%.*}" == "11" ]] ; } function execute_with_retries() { local -r cmd="$*" for i in {0..9} ; do if eval "$cmd"; then return 0 ; fi sleep 5 done echo "Cmd '${cmd}' failed." return 1 } function configure_dask_yarn() { readonly DASK_YARN_CONFIG_DIR=/etc/dask/ readonly DASK_YARN_CONFIG_FILE=${DASK_YARN_CONFIG_DIR}/config.yaml # Minimal custom configuration is required for this # setup. Please see https://yarn.dask.org/en/latest/quickstart.html#usage # for information on tuning Dask-Yarn environments. mkdir -p "${DASK_YARN_CONFIG_DIR}" cat <"${DASK_YARN_CONFIG_FILE}" # Config file for Dask Yarn. # # These values are joined on top of the default config, found at # https://yarn.dask.org/en/latest/configuration.html#default-configuration yarn: environment: python://${DASK_CONDA_ENV}/bin/python worker: count: 2 EOF } function install_systemd_dask_worker() { echo "Installing systemd Dask Worker service..." 
local -r dask_worker_local_dir="/tmp/${DASK_WORKER_SERVICE}" mkdir -p "${dask_worker_local_dir}" local DASK_WORKER_LAUNCHER="/usr/local/bin/${DASK_WORKER_SERVICE}-launcher.sh" cat <"${DASK_WORKER_LAUNCHER}" #!/bin/bash LOGFILE="/var/log/${DASK_WORKER_SERVICE}.log" echo "dask worker starting, logging to \${LOGFILE}" ${DASK_CONDA_ENV}/bin/dask worker "${MASTER}:8786" --local-directory="${dask_worker_local_dir}" --memory-limit=auto >> "\${LOGFILE}" 2>&1 EOF chmod 750 "${DASK_WORKER_LAUNCHER}" local -r dask_service_file="/usr/lib/systemd/system/${DASK_WORKER_SERVICE}.service" cat <"${dask_service_file}" [Unit] Description=Dask Worker Service [Service] Type=simple Restart=on-failure ExecStart=/bin/bash -c 'exec ${DASK_WORKER_LAUNCHER}' [Install] WantedBy=multi-user.target EOF chmod a+r "${dask_service_file}" systemctl daemon-reload # Enable the service if [[ "${ROLE}" != "Master" ]]; then enable_worker_service="1" else local RUN_WORKER_ON_MASTER="$(get_metadata_attribute dask-worker-on-master 'true')" # Enable service on single-node cluster (no workers) local worker_count="$(get_metadata_attribute dataproc-worker-count)" if [[ "${worker_count}" == "0" || "${RUN_WORKER_ON_MASTER}" == "true" ]]; then enable_worker_service="1" fi fi if [[ "${enable_worker_service}" == "1" ]]; then systemctl enable "${DASK_WORKER_SERVICE}" systemctl restart "${DASK_WORKER_SERVICE}" fi } function install_systemd_dask_scheduler() { # only run scheduler on primary master if [[ "$(hostname -s)" != "${MASTER}" ]]; then return ; fi echo "Installing systemd Dask Scheduler service..." 
local -r dask_scheduler_local_dir="/tmp/${DASK_SCHEDULER_SERVICE}" mkdir -p "${dask_scheduler_local_dir}" local DASK_SCHEDULER_LAUNCHER="/usr/local/bin/${DASK_SCHEDULER_SERVICE}-launcher.sh" cat <"${DASK_SCHEDULER_LAUNCHER}" #!/bin/bash LOGFILE="/var/log/${DASK_SCHEDULER_SERVICE}.log" echo "dask scheduler starting, logging to \${LOGFILE}" ${DASK_CONDA_ENV}/bin/dask scheduler >> "\${LOGFILE}" 2>&1 EOF chmod 750 "${DASK_SCHEDULER_LAUNCHER}" local -r dask_service_file="/usr/lib/systemd/system/${DASK_SCHEDULER_SERVICE}.service" cat <"${dask_service_file}" [Unit] Description=Dask Scheduler Service [Service] Type=simple Restart=on-failure ExecStart=/bin/bash -c 'exec ${DASK_SCHEDULER_LAUNCHER}' [Install] WantedBy=multi-user.target EOF chmod a+r "${dask_service_file}" systemctl daemon-reload # Enable the service systemctl enable "${DASK_SCHEDULER_SERVICE}" } function install_systemd_dask_service() { install_systemd_dask_scheduler install_systemd_dask_worker } function restart_knox() { systemctl stop knox rm -rf "${KNOX_HOME}/data/deployments/*" systemctl start knox } function configure_knox_for_dask() { if [[ ! 
-d "${KNOX_HOME}" ]]; then echo "Skip configuring Knox rules for Dask" return 0 fi local DASK_UI_PORT=8787 if [[ -f /etc/knox/conf/topologies/default.xml ]]; then sed -i \ "/<\/topology>/i DASK<\/role>http://localhost:${DASK_UI_PORT}<\/url><\/service> DASKWS<\/role>ws:\/\/${MASTER}:${DASK_UI_PORT}<\/url><\/service>" \ /etc/knox/conf/topologies/default.xml fi mkdir -p "${KNOX_DASK_DIR}" cat >"${KNOX_DASK_DIR}/service.xml" <<'EOF' EOF cat >"${KNOX_DASK_DIR}/rewrite.xml" <<'EOF' EOF mkdir -p "${KNOX_DASKWS_DIR}" cat >"${KNOX_DASKWS_DIR}/service.xml" <<'EOF' EOF cat >"${KNOX_DASKWS_DIR}/rewrite.xml" <<'EOF' EOF chown -R knox:knox "${KNOX_DASK_DIR}" "${KNOX_DASKWS_DIR}" # Do not restart knox during pre-init script run if [[ -n "${ROLE}" ]]; then restart_knox fi } function configure_fluentd_for_dask() { if [[ "$(hostname -s)" == "${MASTER}" ]]; then cat >/etc/google-fluentd/config.d/dataproc-dask.conf < @type tail path /var/log/dask-scheduler.log pos_file /var/tmp/fluentd.dataproc.dask.scheduler.pos read_from_head true tag google.dataproc.dask-scheduler @type none @type record_transformer filename dask-scheduler.log EOF fi if [[ "${enable_worker_service}" == "1" ]]; then cat >>/etc/google-fluentd/config.d/dataproc-dask.conf < @type tail path /var/log/dask-worker.log pos_file /var/tmp/fluentd.dataproc.dask.worker.pos read_from_head true tag google.dataproc.dask-worker @type none @type record_transformer filename dask-worker.log EOF fi systemctl restart google-fluentd } function install_dask() { if is_cuda12 ; then local python_spec="python>=3.11" local cuda_spec="cuda-version>=12,<13" local dask_spec="dask>=2024.5" elif is_cuda11 ; then local python_spec="python>=3.9" local cuda_spec="cuda-version>=11,<12.0a0" local dask_spec="dask" fi CONDA_PACKAGES=() if [[ "${DASK_RUNTIME}" == 'yarn' ]]; then # Pin `distributed` and `dask` package versions to old release # because `dask-yarn` 0.9 uses skein in a way which # is not compatible with `distributed` package 2022.2 and newer: 
# https://github.com/dask/dask-yarn/issues/155 dask_spec="dask<2022.2" python_spec="python>=3.7,<3.8.0a0" if is_ubuntu18 ; then # the libuuid.so.1 distributed with fiona 1.8.22 dumps core when calling uuid_generate_time_generic CONDA_PACKAGES+=("fiona<1.8.22") fi CONDA_PACKAGES+=('dask-yarn=0.9' "distributed<2022.2") fi CONDA_PACKAGES+=( "${cuda_spec}" "${dask_spec}" "dask-bigquery" "dask-ml" "dask-sql" ) # Install dask local is_installed="0" mamba="/opt/conda/miniconda3/bin/mamba" conda="/opt/conda/miniconda3/bin/conda" ( set +e for installer in "${mamba}" "${conda}" ; do test -d "${DASK_CONDA_ENV}" || \ time "${installer}" "create" -m -n "dask" -y --no-channel-priority \ -c 'conda-forge' -c 'nvidia' \ ${CONDA_PACKAGES[*]} \ "${python_spec}" > /dev/null 2>&1 local retval=$? sync if [[ "$retval" == "0" ]] ; then is_installed="1" break fi "${conda}" config --set channel_priority flexible done if [[ "${is_installed}" == "0" ]]; then echo "failed to install dask" return 1 fi ) } function main() { # Install Dask install_dask # In "standalone" mode, Dask relies on a systemd unit to launch. # In "yarn" mode, it relies a config.yaml file. if [[ "${DASK_RUNTIME}" == "yarn" ]]; then # Create Dask YARN config file configure_dask_yarn elif [[ "${DASK_RUNTIME}" == "standalone" ]]; then # Create Dask service install_systemd_dask_service if [[ "$(hostname -s)" == "${MASTER}" ]]; then systemctl start "${DASK_SCHEDULER_SERVICE}" systemctl status "${DASK_SCHEDULER_SERVICE}" fi echo "Starting Dask 'standalone' cluster..." if [[ "${enable_worker_service}" == "1" ]]; then systemctl start "${DASK_WORKER_SERVICE}" systemctl status "${DASK_WORKER_SERVICE}" fi configure_knox_for_dask local DASK_CLOUD_LOGGING="$(get_metadata_attribute dask-cloud-logging || echo 'false')" if [[ "${DASK_CLOUD_LOGGING}" == "true" ]]; then configure_fluentd_for_dask fi else echo "Unsupported Dask Runtime: ${DASK_RUNTIME}" exit 1 fi echo "Dask for ${DASK_RUNTIME} successfully initialized." 
}

# EXIT trap: release caches and tmpfs mounts, report the peak disk usage
# sampled during installation, and optionally zero free space.
# Runs in a subshell with `set +e` so every cleanup step is best-effort.
function exit_handler() (
  set +e
  echo "Exit handler invoked"

  # Free conda cache
  /opt/conda/miniconda3/bin/conda clean -a > /dev/null 2>&1

  # Clear pip cache
  pip cache purge || echo "unable to purge pip cache"

  # remove the tmpfs conda pkgs_dirs
  if [[ -d /mnt/shm ]] ; then
    /opt/conda/miniconda3/bin/conda config --remove pkgs_dirs /mnt/shm
  fi

  # Clean up shared memory mounts
  local shmdir
  for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm ; do
    if grep -q "^tmpfs ${shmdir}" /proc/mounts ; then
      # :? guards against ever expanding to `rm -rf /*`
      rm -rf "${shmdir:?}"/*
      umount -f "${shmdir}"
    fi
  done

  # Clean up OS package cache ; re-hold systemd package
  if is_debuntu ; then
    apt-get -y -qq clean
    apt-get -y -qq autoremove
  else
    dnf clean all
  fi

  # print disk usage statistics
  if is_debuntu ; then
    # Rocky doesn't have sort -h and fails when the argument is passed
    du --max-depth 3 -hx / | sort -h | tail -10
  fi

  # Process disk usage logs from installation period
  # Removing the flag file stops the sampling loop started by
  # prepare_to_install; wait one sample period (5s) plus margin.
  rm -f /tmp/keep-running-df
  sleep 6s
  # compute maximum size of disk during installation
  # Log file contains logs like the following (minus the preceding #):
  #Filesystem      Size  Used Avail Use% Mounted on
  #/dev/vda2       6.8G  2.5G  4.0G  39% /
  df --si
  # Numeric sort: a plain `sort` compares strings and would rank 9 above 10.
  perl -e '$max=( sort { $a <=> $b }
                   map { (split)[2] =~ /^(\d+)/ }
                  grep { m:^/: } <STDIN> )[-1];
print( "maximum-disk-used: $max", $/ );' < /tmp/disk-usage.log

  echo "exit_handler has completed"

  # zero free disk space
  if [[ -n "$(get_metadata_attribute creating-image)" ]]; then
    dd if=/dev/zero of=/zero ; sync ; rm -f /zero
  fi

  return 0
)

trap exit_handler EXIT

# Read cluster metadata into readonly globals, optionally redirect package
# caches to tmpfs, and start a detached screen session that samples `df`
# every 5 seconds into /tmp/disk-usage.log.
function prepare_to_install() {
  readonly DEFAULT_CUDA_VERSION="12.4"
  CUDA_VERSION="$(get_metadata_attribute 'cuda-version' "${DEFAULT_CUDA_VERSION}")"
  readonly CUDA_VERSION

  ROLE="$(get_metadata_attribute dataproc-role)"
  readonly ROLE
  MASTER="$(get_metadata_attribute dataproc-master)"
  readonly MASTER

  # Dask config
  # get_metadata_attribute always exits 0, so a `|| echo` fallback can never
  # fire; the default has to be passed as its second argument.
  DASK_RUNTIME="$(get_metadata_attribute dask-runtime 'standalone')"
  readonly DASK_RUNTIME
  readonly DASK_SERVICE=dask-cluster
  readonly DASK_WORKER_SERVICE=dask-worker
  readonly DASK_SCHEDULER_SERVICE=dask-scheduler
  readonly DASK_CONDA_ENV="/opt/conda/miniconda3/envs/dask"

  # Knox config
  readonly KNOX_HOME=/usr/lib/knox
  readonly KNOX_DASK_DIR="${KNOX_HOME}/data/services/dask/0.1.0"
  readonly KNOX_DASKWS_DIR="${KNOX_HOME}/data/services/daskws/0.1.0"
  enable_worker_service="0"

  free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)"
  # Write to a ramdisk instead of churning the persistent disk
  if [[ ${free_mem} -ge 5250000 ]]; then
    mkdir -p /mnt/shm
    # Mounted exactly once: exit_handler unmounts each path a single time, so
    # a second stacked tmpfs on /mnt/shm would be left behind.
    mount -t tmpfs tmpfs /mnt/shm

    # Download conda packages to tmpfs
    /opt/conda/miniconda3/bin/conda config --add pkgs_dirs /mnt/shm

    # Download pip packages to tmpfs
    pip config set global.cache-dir /mnt/shm || echo "unable to set global.cache-dir"

    # Download OS packages to tmpfs
    if is_debuntu ; then
      mount -t tmpfs tmpfs /var/cache/apt/archives
    else
      mount -t tmpfs tmpfs /var/cache/dnf
    fi
  fi

  # Monitor disk usage in a screen session
  if is_debuntu ; then
    apt-get install -y -qq screen
  elif [[ "$(os_id)" == 'rocky' ]] ; then
    # this script defines no is_rocky helper, so test the os id directly
    dnf -y -q install screen
  fi
  rm -f /tmp/disk-usage.log
  touch /tmp/keep-running-df
  screen -d -m -US keep-running-df \
    bash -c 'while [[ -f /tmp/keep-running-df ]] ; do df --si / | tee -a /tmp/disk-usage.log ; sleep 5s ; done'
}

prepare_to_install

main
====== Filename: ./examples/secure-boot/README.md ======
## Secure Boot

Secure Boot is a security technology implemented in UEFI firmware that verifies the integrity of the boot process of a computer system. It ensures that only trusted software, such as the operating system, firmware, and drivers, are loaded during startup. This helps prevent malicious software from gaining control of the system before security measures can be implemented. Secure Boot achieves this by verifying the digital signature of drivers and other software against a recognized root of trust. The EFI DB variable stores the cryptographic keys and certificates used for this verification process.
How Secure Boot impacts VPC Service Controls (VPC SC):

Enhanced Security Perimeter: By verifying the integrity of the boot process, Secure Boot strengthens the foundation of the security perimeter created by VPC SC. This reduces the risk of unauthorized access or data exfiltration due to compromised host systems.

Improved Trust in Service Perimeter Resources: VPC SC relies on the trust that the resources within a service perimeter are secure. Secure Boot helps to establish and maintain this trust by ensuring that these resources are protected from malicious boot-time attacks.

Compliance and Regulatory Requirements: Many security compliance standards, such as PCI DSS and HIPAA, require specific measures to protect sensitive data. Secure Boot can be a valuable component of meeting these requirements by providing additional assurance of system integrity.

Reduced Attack Surface: By preventing unauthorized software from loading during startup, Secure Boot reduces the potential attack surface for malicious actors. This can help to mitigate the risk of successful cyberattacks.

In summary, Secure Boot provides a crucial layer of protection for VPC SC by ensuring that the underlying infrastructure is secure and trusted. This helps to strengthen the overall security posture of Google Cloud Platform environments and protect sensitive data.

## Examples

To create a custom image with a self-signed, trusted certificate inserted into the boot sector, and then run a script to install CUDA on a Dataproc image, the commands from cuda.sh can be run from the root of the custom-images git repository or from a Docker container.

First, write an env.json to the directory from which you will run the customization script. There is a sample which you can copy and edit in the file examples/secure-boot/env.json.sample.

```bash
cp examples/secure-boot/env.json.sample env.json
vi env.json
docker build -t dataproc-cuda-pre-init:latest .
docker run -it dataproc-cuda-pre-init:latest /bin/bash examples/secure-boot/cuda.sh ``` To do the same, but for all dataproc variants including supported versions and image families, the same env.json steps as above should be executed, and then the examples/secure-boot/build-current-images.sh script can be run in docker: ```bash cp examples/secure-boot/env.json.sample env.json vi env.json docker build -t dataproc-dask-rapids-pre-init:latest . docker run -it dataproc-dask-rapids-pre-init:latest /bin/bash examples/secure-boot/build-current-images.sh ``` ====== Filename: ./examples/secure-boot/genline.pl ====== #!/usr/bin/perl -w use strict; use POSIX qw(ceil); # /tmp/custom-image-cuda-pre-init-2-0-debian10-2024-11-14-20-00-20241114-200043/logs/workflow.log # /tmp/custom-image-dataproc-2-0-deb10-20250422-193049-secure-boot-20250422-193247 my $fn = $ARGV[0]; my( @matches ) = ( $fn =~ m{custom-image-dataproc- ( \d+-\d+-(?:deb|roc|ubu)\d+ )- (\d{8}-\d{6})-(.+)-(\d{8}-\d{6}) }x ); #print "matches: @matches\n"; my($short_dp_ver, $timestamp, $purpose, $another_timestamp)=@matches; $short_dp_ver =~ s/-/./; my $dp_version = $short_dp_ver; $dp_version =~ s/deb/debian/; $dp_version =~ s/roc/rocky/; $dp_version =~ s/ubu/ubuntu/; my @raw_lines = ; my( $l ) = grep { m: /dev/.*/\s*$: } @raw_lines; exit 0 unless $l; my( $stats ) = ( $l =~ m:\s*/dev/\S+\s+(.*?)\s*$: ); $stats =~ s:(\d{4,}):sprintf(q{%7s}, sprintf(q{%.2fG},($1/1024)/1024)):eg; my $max_regex = qr/ maximum-disk-used:\s+(\d+)/; my($max) = map { /$max_regex/ ; $1 } grep { /$max_regex/ } @raw_lines; my($gbmax) = ceil((($max / 1024) / 1024) * 1.15); $gbmax = 30 if $gbmax < 30; my $i_dp_version = sprintf(q{%-15s}, qq{"$dp_version"}); print( qq{ $i_dp_version) disk_size_gb="$gbmax" ;; # $stats # $timestamp-$purpose}, $/ ); ====== Filename: ./examples/secure-boot/install_gpu_driver.sh ====== #!/bin/bash # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the 
License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS-IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # This script installs NVIDIA GPU drivers and collects GPU utilization metrics. set -xeuo pipefail function os_id() { grep '^ID=' /etc/os-release | cut -d= -f2 | xargs ; } function os_version() { grep '^VERSION_ID=' /etc/os-release | cut -d= -f2 | xargs ; } function os_codename() { grep '^VERSION_CODENAME=' /etc/os-release | cut -d= -f2 | xargs ; } function version_ge(){ [[ "$1" = "$(echo -e "$1\n$2"|sort -V|tail -n1)" ]]; } function version_gt(){ [[ "$1" = "$2" ]]&& return 1 || version_ge "$1" "$2";} function version_le(){ [[ "$1" = "$(echo -e "$1\n$2"|sort -V|head -n1)" ]]; } function version_lt(){ [[ "$1" = "$2" ]]&& return 1 || version_le "$1" "$2";} readonly -A supported_os=( ['debian']="10 11 12" ['rocky']="8 9" ['ubuntu']="18.04 20.04 22.04" ) # dynamically define OS version test utility functions if [[ "$(os_id)" == "rocky" ]]; then _os_version=$(os_version | sed -e 's/[^0-9].*$//g') else _os_version="$(os_version)" fi for os_id_val in 'rocky' 'ubuntu' 'debian' ; do eval "function is_${os_id_val}() { [[ \"$(os_id)\" == '${os_id_val}' ]] ; }" for osver in $(echo "${supported_os["${os_id_val}"]}") ; do eval "function is_${os_id_val}${osver%%.*}() { is_${os_id_val} && [[ \"${_os_version}\" == \"${osver}\" ]] ; }" eval "function ge_${os_id_val}${osver%%.*}() { is_${os_id_val} && version_ge \"${_os_version}\" \"${osver}\" ; }" eval "function le_${os_id_val}${osver%%.*}() { is_${os_id_val} && version_le \"${_os_version}\" \"${osver}\" ; }" done done function is_debuntu() { is_debian || is_ubuntu ; } function os_vercat() { if is_ubuntu ; then os_version 
| sed -e 's/[^0-9]//g' elif is_rocky ; then os_version | sed -e 's/[^0-9].*$//g' else os_version ; fi ; } function repair_old_backports { if ! is_debuntu ; then return ; fi # This script uses 'apt-get update' and is therefore potentially dependent on # backports repositories which have been archived. In order to mitigate this # problem, we will use archive.debian.org for the oldoldstable repo # https://github.com/GoogleCloudDataproc/initialization-actions/issues/1157 debdists="https://deb.debian.org/debian/dists" oldoldstable=$(curl ${curl_retry_args} "${debdists}/oldoldstable/Release" | awk '/^Codename/ {print $2}'); oldstable=$( curl ${curl_retry_args} "${debdists}/oldstable/Release" | awk '/^Codename/ {print $2}'); stable=$( curl ${curl_retry_args} "${debdists}/stable/Release" | awk '/^Codename/ {print $2}'); matched_files=( $(test -d /etc/apt && grep -rsil '\-backports' /etc/apt/sources.list*||:) ) for filename in "${matched_files[@]}"; do # Fetch from archive.debian.org for ${oldoldstable}-backports perl -pi -e "s{^(deb[^\s]*) https?://[^/]+/debian ${oldoldstable}-backports } {\$1 https://archive.debian.org/debian ${oldoldstable}-backports }g" "${filename}" done } function print_metadata_value() { local readonly tmpfile=$(mktemp) http_code=$(curl -f "${1}" -H "Metadata-Flavor: Google" -w "%{http_code}" \ -s -o ${tmpfile} 2>/dev/null) local readonly return_code=$? # If the command completed successfully, print the metadata value to stdout. if [[ ${return_code} == 0 && ${http_code} == 200 ]]; then cat ${tmpfile} fi rm -f ${tmpfile} return ${return_code} } function print_metadata_value_if_exists() { local return_code=1 local readonly url=$1 print_metadata_value ${url} return_code=$? return ${return_code} } # replicates /usr/share/google/get_metadata_value function get_metadata_value() { local readonly varname=$1 local -r MDS_PREFIX=http://metadata.google.internal/computeMetadata/v1 # Print the instance metadata value. 
print_metadata_value_if_exists ${MDS_PREFIX}/instance/${varname} return_code=$? # If the instance doesn't have the value, try the project. if [[ ${return_code} != 0 ]]; then print_metadata_value_if_exists ${MDS_PREFIX}/project/${varname} return_code=$? fi return ${return_code} } function get_metadata_attribute() { local -r attribute_name="$1" local -r default_value="${2:-}" set +e get_metadata_value "attributes/${attribute_name}" || echo -n "${default_value}" set -e } OS_NAME="$(lsb_release -is | tr '[:upper:]' '[:lower:]')" distribution=$(. /etc/os-release;echo $ID$VERSION_ID) readonly OS_NAME # node role ROLE="$(get_metadata_attribute dataproc-role)" readonly ROLE # CUDA version and Driver version # https://docs.nvidia.com/deploy/cuda-compatibility/ # https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html # https://developer.nvidia.com/cuda-downloads # Minimum supported version for open kernel driver is 515.43.04 # https://github.com/NVIDIA/open-gpu-kernel-modules/tags readonly -A DRIVER_FOR_CUDA=( ["10.0"]="410.48" ["10.1"]="418.87.00" ["10.2"]="440.33.01" ["11.1"]="455.45.01" ["11.2"]="460.91.03" ["11.3"]="465.31" ["11.4"]="470.256.02" ["11.5"]="495.46" ["11.6"]="510.108.03" ["11.7"]="515.65.01" ["11.8"]="525.147.05" ["12.0"]="525.147.05" ["12.1"]="530.30.02" ["12.2"]="535.216.01" ["12.3"]="545.29.06" ["12.4"]="550.135" ["12.5"]="550.142" ["12.6"]="550.142" ) readonly -A DRIVER_SUBVER=( ["410"]="410.104" ["415"]="415.27" ["418"]="418.113" ["430"]="430.64" ["435"]="435.21" ["440"]="440.100" ["450"]="450.119.03" ["455"]="455.45.01" ["460"]="460.91.03" ["465"]="465.31" ["470"]="470.256.02" ["495"]="495.46" ["510"]="510.108.03" ["515"]="515.48.07" ["520"]="525.147.05" ["525"]="525.147.05" ["535"]="535.216.01" ["545"]="545.29.06" ["550"]="550.142" ["555"]="555.58.02" ["560"]="560.35.03" ["565"]="565.77" ) # https://developer.nvidia.com/cudnn-downloads readonly -A CUDNN_FOR_CUDA=( ["10.0"]="7.4.1" ["10.1"]="7.6.4" ["10.2"]="7.6.5" ["11.0"]="8.0.4" 
["11.1"]="8.0.5" ["11.2"]="8.1.1" ["11.3"]="8.2.1" ["11.4"]="8.2.4.15" ["11.5"]="8.3.1.22" ["11.6"]="8.4.0.27" ["11.7"]="8.9.7.29" ["11.8"]="9.5.1.17" ["12.0"]="8.8.1.3" ["12.1"]="8.9.3.28" ["12.2"]="8.9.5" ["12.3"]="9.0.0.306" ["12.4"]="9.1.0.70" ["12.5"]="9.2.1.18" ["12.6"]="9.6.0.74" ) # https://developer.nvidia.com/nccl/nccl-download readonly -A NCCL_FOR_CUDA=( ["10.0"]="2.3.7" ["10.1"]= ["11.0"]="2.7.8" ["11.1"]="2.8.3" ["11.2"]="2.8.4" ["11.3"]="2.9.9" ["11.4"]="2.11.4" ["11.5"]="2.11.4" ["11.6"]="2.12.10" ["11.7"]="2.12.12" ["11.8"]="2.21.5" ["12.0"]="2.16.5" ["12.1"]="2.18.3" ["12.2"]="2.19.3" ["12.3"]="2.19.4" ["12.4"]="2.23.4" ["12.5"]="2.22.3" ["12.6"]="2.23.4" ) readonly -A CUDA_SUBVER=( ["10.0"]="10.0.130" ["10.1"]="10.1.234" ["10.2"]="10.2.89" ["11.0"]="11.0.3" ["11.1"]="11.1.1" ["11.2"]="11.2.2" ["11.3"]="11.3.1" ["11.4"]="11.4.4" ["11.5"]="11.5.2" ["11.6"]="11.6.2" ["11.7"]="11.7.1" ["11.8"]="11.8.0" ["12.0"]="12.0.1" ["12.1"]="12.1.1" ["12.2"]="12.2.2" ["12.3"]="12.3.2" ["12.4"]="12.4.1" ["12.5"]="12.5.1" ["12.6"]="12.6.3" ) function set_cuda_version() { case "${DATAPROC_IMAGE_VERSION}" in "1.5" ) DEFAULT_CUDA_VERSION="11.6.2" ;; "2.0" ) DEFAULT_CUDA_VERSION="12.1.1" ;; # Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18) "2.1" ) DEFAULT_CUDA_VERSION="12.4.1" ;; "2.2" ) DEFAULT_CUDA_VERSION="12.6.3" ;; "2.3" ) DEFAULT_CUDA_VERSION="12.6.3" ;; * ) echo "unrecognized Dataproc image version: ${DATAPROC_IMAGE_VERSION}" exit 1 ;; esac local cuda_url cuda_url=$(get_metadata_attribute 'cuda-url' '') if [[ -n "${cuda_url}" ]] ; then # if cuda-url metadata variable has been passed, extract default version from url local CUDA_URL_VERSION CUDA_URL_VERSION="$(echo "${cuda_url}" | perl -pe 's{^.*/cuda_(\d+\.\d+\.\d+)_\d+\.\d+\.\d+_linux.run$}{$1}')" if [[ "${CUDA_URL_VERSION}" =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]] ; then DEFAULT_CUDA_VERSION="${CUDA_URL_VERSION}" fi fi readonly DEFAULT_CUDA_VERSION CUDA_VERSION=$(get_metadata_attribute 
'cuda-version' "${DEFAULT_CUDA_VERSION}") if test -n "$(echo "${CUDA_VERSION}" | perl -ne 'print if /\d+\.\d+\.\d+/')" ; then CUDA_FULL_VERSION="${CUDA_VERSION}" CUDA_VERSION="${CUDA_VERSION%.*}" fi readonly CUDA_VERSION if ( ! test -v CUDA_FULL_VERSION ) ; then CUDA_FULL_VERSION=${CUDA_SUBVER["${CUDA_VERSION}"]} fi readonly CUDA_FULL_VERSION } function is_cuda12() { [[ "${CUDA_VERSION%%.*}" == "12" ]] ; } function le_cuda12() { version_le "${CUDA_VERSION%%.*}" "12" ; } function ge_cuda12() { version_ge "${CUDA_VERSION%%.*}" "12" ; } function is_cuda11() { [[ "${CUDA_VERSION%%.*}" == "11" ]] ; } function le_cuda11() { version_le "${CUDA_VERSION%%.*}" "11" ; } function ge_cuda11() { version_ge "${CUDA_VERSION%%.*}" "11" ; } function set_driver_version() { local gpu_driver_url gpu_driver_url=$(get_metadata_attribute 'gpu-driver-url' '') local cuda_url cuda_url=$(get_metadata_attribute 'cuda-url' '') local nv_xf86_x64_base="https://us.download.nvidia.com/XFree86/Linux-x86_64" local DEFAULT_DRIVER # Take default from gpu-driver-url metadata value if [[ -n "${gpu_driver_url}" ]] ; then DRIVER_URL_DRIVER_VERSION="$(echo "${gpu_driver_url}" | perl -pe 's{^.*/NVIDIA-Linux-x86_64-(\d+\.\d+\.\d+).run$}{$1}')" if [[ "${DRIVER_URL_DRIVER_VERSION}" =~ ^[0-9]+.*[0-9]$ ]] ; then DEFAULT_DRIVER="${DRIVER_URL_DRIVER_VERSION}" ; fi # Take default from cuda-url metadata value as a backup elif [[ -n "${cuda_url}" ]] ; then local CUDA_URL_DRIVER_VERSION="$(echo "${cuda_url}" | perl -pe 's{^.*/cuda_\d+\.\d+\.\d+_(\d+\.\d+\.\d+)_linux.run$}{$1}')" if [[ "${CUDA_URL_DRIVER_VERSION}" =~ ^[0-9]+.*[0-9]$ ]] ; then major_driver_version="${CUDA_URL_DRIVER_VERSION%%.*}" driver_max_maj_version=${DRIVER_SUBVER["${major_driver_version}"]} if curl ${curl_retry_args} --head "${nv_xf86_x64_base}/${CUDA_URL_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${CUDA_URL_DRIVER_VERSION}.run" | grep -E -q 'HTTP.*200' ; then # use the version indicated by the cuda url as the default if it exists 
DEFAULT_DRIVER="${CUDA_URL_DRIVER_VERSION}" elif curl ${curl_retry_args} --head "${nv_xf86_x64_base}/${driver_max_maj_version}/NVIDIA-Linux-x86_64-${driver_max_maj_version}.run" | grep -E -q 'HTTP.*200' ; then # use the maximum sub-version available for the major version indicated in cuda url as the default DEFAULT_DRIVER="${driver_max_maj_version}" fi fi fi if ( ! test -v DEFAULT_DRIVER ) ; then # If a default driver version has not been extracted, use the default for this version of CUDA DEFAULT_DRIVER=${DRIVER_FOR_CUDA["${CUDA_VERSION}"]} fi DRIVER_VERSION=$(get_metadata_attribute 'gpu-driver-version' "${DEFAULT_DRIVER}") readonly DRIVER_VERSION readonly DRIVER="${DRIVER_VERSION%%.*}" export DRIVER_VERSION DRIVER gpu_driver_url="${nv_xf86_x64_base}/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run" if ! curl ${curl_retry_args} --head "${gpu_driver_url}" | grep -E -q 'HTTP.*200' ; then echo "No NVIDIA driver exists for DRIVER_VERSION=${DRIVER_VERSION}" exit 1 fi } function set_cudnn_version() { readonly MIN_ROCKY8_CUDNN8_VERSION="8.0.5.39" readonly DEFAULT_CUDNN8_VERSION="8.3.1.22" readonly DEFAULT_CUDNN9_VERSION="9.1.0.70" # Parameters for NVIDIA-provided cuDNN library readonly DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]} CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}") # The minimum cuDNN version supported by rocky is ${MIN_ROCKY8_CUDNN8_VERSION} if ( is_rocky && version_lt "${CUDNN_VERSION}" "${MIN_ROCKY8_CUDNN8_VERSION}" ) ; then CUDNN_VERSION="${MIN_ROCKY8_CUDNN8_VERSION}" elif (ge_ubuntu20 || ge_debian12) && is_cudnn8 ; then # cuDNN v8 is not distribution for ubuntu20+, debian12 CUDNN_VERSION="${DEFAULT_CUDNN9_VERSION}" elif (le_ubuntu18 || le_debian11) && is_cudnn9 ; then # cuDNN v9 is not distributed for ubuntu18, debian10, debian11 ; fall back to 8 CUDNN_VERSION="8.8.0.121" fi readonly CUDNN_VERSION } function is_cudnn8() { [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; } function is_cudnn9() { [[ 
"${CUDNN_VERSION%%.*}" == "9" ]] ; } # Short name for urls if is_ubuntu22 ; then # at the time of writing 20241125 there is no ubuntu2204 in the index of repos at # https://developer.download.nvidia.com/compute/machine-learning/repos/ # use packages from previous release until such time as nvidia # release ubuntu2204 builds nccl_shortname="ubuntu2004" shortname="$(os_id)$(os_vercat)" elif ge_rocky9 ; then # use packages from previous release until such time as nvidia # release rhel9 builds nccl_shortname="rhel8" shortname="rhel9" elif is_rocky ; then shortname="$(os_id | sed -e 's/rocky/rhel/')$(os_vercat)" nccl_shortname="${shortname}" else shortname="$(os_id)$(os_vercat)" nccl_shortname="${shortname}" fi function set_nv_urls() { # Parameters for NVIDIA-provided package repositories readonly NVIDIA_BASE_DL_URL='https://developer.download.nvidia.com/compute' readonly NVIDIA_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64" # Parameter for NVIDIA-provided Rocky Linux GPU driver readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_REPO_URL}/cuda-${shortname}.repo" } function set_cuda_runfile_url() { local MAX_DRIVER_VERSION local MAX_CUDA_VERSION MIN_OPEN_DRIVER_VER="515.43.04" local MIN_DRIVER_VERSION="${MIN_OPEN_DRIVER_VER}" local MIN_CUDA_VERSION="11.7.1" # matches MIN_OPEN_DRIVER_VER if is_cuda12 ; then if is_debian12 ; then MIN_DRIVER_VERSION="545.23.06" MIN_CUDA_VERSION="12.3.0" elif is_debian10 ; then MAX_DRIVER_VERSION="555.42.02" MAX_CUDA_VERSION="12.5.0" elif is_ubuntu18 ; then MAX_DRIVER_VERSION="530.30.02" MAX_CUDA_VERSION="12.1.1" fi elif version_ge "${CUDA_VERSION}" "${MIN_CUDA_VERSION}" ; then if le_debian10 ; then # cuda 11 is not supported for <= debian10 MAX_CUDA_VERSION="0" MAX_DRIVER_VERSION="0" fi else echo "Minimum CUDA version supported is ${MIN_CUDA_VERSION}. Specified: ${CUDA_VERSION}" fi if version_lt "${CUDA_VERSION}" "${MIN_CUDA_VERSION}" ; then echo "Minimum CUDA version for ${shortname} is ${MIN_CUDA_VERSION}. 
Specified: ${CUDA_VERSION}"
  elif ( test -v MAX_CUDA_VERSION && version_gt "${CUDA_VERSION}" "${MAX_CUDA_VERSION}" ) ; then
    echo "Maximum CUDA version for ${shortname} is ${MAX_CUDA_VERSION}. Specified: ${CUDA_VERSION}"
  fi

  # Same advisory range check for the kernel driver version.
  if version_lt "${DRIVER_VERSION}" "${MIN_DRIVER_VERSION}" ; then
    echo "Minimum kernel driver version for ${shortname} is ${MIN_DRIVER_VERSION}. Specified: ${DRIVER_VERSION}"
  elif ( test -v MAX_DRIVER_VERSION && version_gt "${DRIVER_VERSION}" "${MAX_DRIVER_VERSION}" ) ; then
    echo "Maximum kernel driver version for ${shortname} is ${MAX_DRIVER_VERSION}. Specified: ${DRIVER_VERSION}"
  fi

  # driver version named in cuda runfile filename
  # (these may not be actual driver versions - see https://us.download.nvidia.com/XFree86/Linux-x86_64/)
  readonly -A drv_for_cuda=(
    ["10.0.130"]="410.48"
    ["10.1.234"]="418.87.00"
    ["10.2.89"]="440.33.01"
    ["11.0.3"]="450.51.06"
    ["11.1.1"]="455.32.00"
    ["11.2.2"]="460.32.03"
    ["11.3.1"]="465.19.01"
    ["11.4.4"]="470.82.01"
    ["11.5.2"]="495.29.05"
    ["11.6.2"]="510.47.03"
    ["11.7.0"]="515.43.04"
    ["11.7.1"]="515.65.01"
    ["11.8.0"]="520.61.05"
    ["12.0.0"]="525.60.13"
    ["12.0.1"]="525.85.12"
    ["12.1.0"]="530.30.02"
    ["12.1.1"]="530.30.02"
    ["12.2.0"]="535.54.03"
    ["12.2.1"]="535.86.10"
    ["12.2.2"]="535.104.05"
    ["12.3.0"]="545.23.06"
    ["12.3.1"]="545.23.08"
    ["12.3.2"]="545.23.08"
    ["12.4.0"]="550.54.14"
    ["12.4.1"]="550.54.15" # 550.54.15 is not a driver indexed at https://us.download.nvidia.com/XFree86/Linux-x86_64/
    ["12.5.0"]="555.42.02"
    ["12.5.1"]="555.42.06" # 555.42.02 is indexed, 555.42.06 is not
    ["12.6.0"]="560.28.03"
    ["12.6.1"]="560.35.03"
    ["12.6.2"]="560.35.03"
    ["12.6.3"]="560.35.05"
  )

  # Verify that the file with the indicated combination exists
  local drv_ver=${drv_for_cuda["${CUDA_FULL_VERSION}"]}
  CUDA_RUNFILE="cuda_${CUDA_FULL_VERSION}_${drv_ver}_linux.run"
  local CUDA_RELEASE_BASE_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_FULL_VERSION}"
  local DEFAULT_NVIDIA_CUDA_URL="${CUDA_RELEASE_BASE_URL}/local_installers/${CUDA_RUNFILE}"
NVIDIA_CUDA_URL=$(get_metadata_attribute 'cuda-url' "${DEFAULT_NVIDIA_CUDA_URL}") if ! curl ${curl_retry_args} --head "${NVIDIA_CUDA_URL}" | grep -E -q 'HTTP.*200' ; then echo "No CUDA distribution exists for this combination of DRIVER_VERSION=${drv_ver}, CUDA_VERSION=${CUDA_FULL_VERSION}" if [[ "${DEFAULT_NVIDIA_CUDA_URL}" != "${NVIDIA_CUDA_URL}" ]]; then echo "consider [${DEFAULT_NVIDIA_CUDA_URL}] instead" fi exit 1 fi readonly NVIDIA_CUDA_URL CUDA_RUNFILE="$(echo ${NVIDIA_CUDA_URL} | perl -pe 's{^.+/}{}')" readonly CUDA_RUNFILE if ( version_lt "${CUDA_FULL_VERSION}" "12.3.0" && ge_debian12 ) ; then echo "CUDA 12.3.0 is the minimum CUDA 12 version supported on Debian 12" elif ( version_gt "${CUDA_VERSION}" "12.1.1" && is_ubuntu18 ) ; then echo "CUDA 12.1.1 is the maximum CUDA version supported on ubuntu18. Requested version: ${CUDA_VERSION}" elif ( version_lt "${CUDA_VERSION%%.*}" "12" && ge_debian12 ) ; then echo "CUDA 11 not supported on Debian 12. Requested version: ${CUDA_VERSION}" elif ( version_lt "${CUDA_VERSION}" "11.8" && is_rocky9 ) ; then echo "CUDA 11.8.0 is the minimum version for Rocky 9. 
Requested version: ${CUDA_VERSION}"
  fi
}

# Compute CUDNN_TARBALL and CUDNN_TARBALL_URL (both readonly on return) from
# CUDNN_VERSION and CUDA_VERSION. Later rules override earlier ones.
function set_cudnn_tarball_url() {
  # Oldest naming scheme: .tgz keyed by the CUDA version.
  CUDNN_TARBALL="cudnn-${CUDA_VERSION}-linux-x64-v${CUDNN_VERSION}.tgz"
  CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/${CUDNN_TARBALL}"
  if ( version_ge "${CUDNN_VERSION}" "8.3.1.22" ); then
    # cuDNN 8.3.1.22 and later: archive name carries CUDA_VERSION with its
    # last dotted component removed
    CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%.*}-archive.tar.xz"
    if ( version_le "${CUDNN_VERSION}" "8.4.1.50" ); then
      # cuDNN 8.3.1.22 through 8.4.1.50 (inclusive): archive name carries the
      # complete CUDA_VERSION instead
      CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION}-archive.tar.xz"
    fi
    # Use legacy url format with one of the tarball name formats depending on version as above
    CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/local_installers/${CUDA_VERSION}/${CUDNN_TARBALL}"
  fi
  if ( version_ge "${CUDA_VERSION}" "12.0" ); then
    # Use modern url format When cuda version is greater than or equal to 12.0
    CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%%.*}-archive.tar.xz"
    CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/cudnn/redist/cudnn/linux-x86_64/${CUDNN_TARBALL}"
  fi
  readonly CUDNN_TARBALL
  readonly CUDNN_TARBALL_URL
}

# Whether to install NVIDIA-provided or OS-provided GPU driver
GPU_DRIVER_PROVIDER=$(get_metadata_attribute 'gpu-driver-provider' 'NVIDIA')
readonly GPU_DRIVER_PROVIDER

# Whether to install GPU monitoring agent that sends GPU metrics to Stackdriver
INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'true')
readonly INSTALL_GPU_AGENT

# Dataproc configurations
readonly HADOOP_CONF_DIR='/etc/hadoop/conf'
readonly HIVE_CONF_DIR='/etc/hive/conf'
readonly SPARK_CONF_DIR='/etc/spark/conf'

NVIDIA_SMI_PATH='/usr/bin'
MIG_MAJOR_CAPS=0
IS_MIG_ENABLED=0

IS_CUSTOM_IMAGE_BUILD="false" # Default

# Run a command line up to three times, five seconds apart.
# Arguments: the command and its arguments; they are joined into a single
#            string and run through `eval`, so callers may embed shell syntax
#            such as an escaped pipe.
# Outputs:   command output goes to ${install_log}; the log is printed when an
#            attempt fails.
# Returns:   0 as soon as one attempt succeeds, 1 after three failures.
# The body is a subshell, so variables set here do not leak to the caller.
function execute_with_retries() (
  local -r cmd="$*"

  # Prefix match on the command string. The previous test used a quoted
  # regex ([[ ... =~ "^apt-get install" ]]); bash matches a quoted right-hand
  # side as a literal string, so the "^" was never treated as an anchor and
  # this cleanup branch could not run.
  if [[ "${cmd}" == "apt-get install"* ]] ; then
    apt-get -y clean
    apt-get -o
DPkg::Lock::Timeout=60 -y autoremove fi for ((i = 0; i < 3; i++)); do time eval "$cmd" > "${install_log}" 2>&1 && retval=$? || { retval=$? ; cat "${install_log}" ; } if [[ $retval == 0 ]] ; then return 0 ; fi sleep 5 done return 1 ) function install_cuda_keyring_pkg() { is_complete cuda-keyring-installed && return local kr_ver=1.1 curl ${curl_retry_args} \ "${NVIDIA_REPO_URL}/cuda-keyring_${kr_ver}-1_all.deb" \ -o "${tmpdir}/cuda-keyring.deb" dpkg -i "${tmpdir}/cuda-keyring.deb" rm -f "${tmpdir}/cuda-keyring.deb" mark_complete cuda-keyring-installed } function uninstall_cuda_keyring_pkg() { apt-get purge -yq cuda-keyring mark_incomplete cuda-keyring-installed } function install_local_cuda_repo() { is_complete install-local-cuda-repo && return pkgname="cuda-repo-${shortname}-${CUDA_VERSION//./-}-local" CUDA_LOCAL_REPO_PKG_NAME="${pkgname}" readonly LOCAL_INSTALLER_DEB="${pkgname}_${CUDA_FULL_VERSION}-${DRIVER_VERSION}-1_amd64.deb" readonly LOCAL_DEB_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_FULL_VERSION}/local_installers/${LOCAL_INSTALLER_DEB}" readonly DIST_KEYRING_DIR="/var/${pkgname}" curl ${curl_retry_args} \ "${LOCAL_DEB_URL}" -o "${tmpdir}/${LOCAL_INSTALLER_DEB}" dpkg -i "${tmpdir}/${LOCAL_INSTALLER_DEB}" rm "${tmpdir}/${LOCAL_INSTALLER_DEB}" cp ${DIST_KEYRING_DIR}/cuda-*-keyring.gpg /usr/share/keyrings/ if is_ubuntu ; then curl ${curl_retry_args} \ "${NVIDIA_REPO_URL}/cuda-${shortname}.pin" \ -o /etc/apt/preferences.d/cuda-repository-pin-600 fi mark_complete install-local-cuda-repo } function uninstall_local_cuda_repo(){ apt-get purge -yq "${CUDA_LOCAL_REPO_PKG_NAME}" mark_incomplete install-local-cuda-repo } function install_local_cudnn_repo() { is_complete install-local-cudnn-repo && return pkgname="cudnn-local-repo-${shortname}-${CUDNN_VERSION%.*}" CUDNN_PKG_NAME="${pkgname}" local_deb_fn="${pkgname}_1.0-1_amd64.deb" local_deb_url="${NVIDIA_BASE_DL_URL}/cudnn/${CUDNN_VERSION%.*}/local_installers/${local_deb_fn}" # 
${NVIDIA_BASE_DL_URL}/redist/cudnn/v8.6.0/local_installers/11.8/cudnn-linux-x86_64-8.6.0.163_cuda11-archive.tar.xz curl ${curl_retry_args} \ "${local_deb_url}" -o "${tmpdir}/local-installer.deb" dpkg -i "${tmpdir}/local-installer.deb" rm -f "${tmpdir}/local-installer.deb" cp /var/cudnn-local-repo-*-${CUDNN_VERSION%.*}*/cudnn-local-*-keyring.gpg /usr/share/keyrings mark_complete install-local-cudnn-repo } function uninstall_local_cudnn_repo() { apt-get purge -yq "${CUDNN_PKG_NAME}" mark_incomplete install-local-cudnn-repo } function install_local_cudnn8_repo() { is_complete install-local-cudnn8-repo && return if is_ubuntu ; then cudnn8_shortname="ubuntu2004" elif is_debian ; then cudnn8_shortname="debian11" else return 0 ; fi if is_cuda12 ; then CUDNN8_CUDA_VER=12.0 elif is_cuda11 ; then CUDNN8_CUDA_VER=11.8 else CUDNN8_CUDA_VER="${CUDA_VERSION}" ; fi cudnn_pkg_version="${CUDNN_VERSION}-1+cuda${CUDNN8_CUDA_VER}" pkgname="cudnn-local-repo-${cudnn8_shortname}-${CUDNN_VERSION}" CUDNN8_PKG_NAME="${pkgname}" deb_fn="${pkgname}_1.0-1_amd64.deb" local_deb_fn="${tmpdir}/${deb_fn}" local_deb_url="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/local_installers/${CUDNN8_CUDA_VER}/${deb_fn}" # cache the cudnn package cache_fetched_package "${local_deb_url}" \ "${pkg_bucket}/nvidia/cudnn/${CUDNN8_CUDA_VER}/${deb_fn}" \ "${local_deb_fn}" local cudnn_path="$(dpkg -c ${local_deb_fn} | perl -ne 'if(m{(/var/cudnn-local-repo-.*)/\s*$}){print $1}')" # If we are using a ram disk, mount another where we will unpack the cudnn local installer if [[ "${tmpdir}" == "/mnt/shm" ]] && ! 
grep -q '/var/cudnn-local-repo' /proc/mounts ; then
    mkdir -p "${cudnn_path}"
    mount -t tmpfs tmpfs "${cudnn_path}"
  fi

  dpkg -i "${local_deb_fn}"

  rm -f "${local_deb_fn}"

  cp "${cudnn_path}"/cudnn-local-*-keyring.gpg /usr/share/keyrings
  mark_complete install-local-cudnn8-repo
}

# Purge the cuDNN 8 local repo package named by CUDNN8_PKG_NAME.
function uninstall_local_cudnn8_repo() {
  apt-get purge -yq "${CUDNN8_PKG_NAME}"
  mark_incomplete install-local-cudnn8-repo
}

# Build NCCL packages from the NVIDIA/nccl GitHub tag (or restore a previously
# built tarball from ${pkg_bucket}) and install them.
# Globals read: NCCL_FOR_CUDA, CUDA_VERSION, workdir, pkg_bucket, _shortname,
#               gsutil_cmd, gsutil_stat_cmd, curl_retry_args
# Globals set:  DEFAULT_NCCL_VERSION, NCCL_VERSION (readonly), building_file
function install_nvidia_nccl() {
  readonly DEFAULT_NCCL_VERSION=${NCCL_FOR_CUDA["${CUDA_VERSION}"]}
  # Quote the default so an empty value is still passed as an (empty) argument.
  readonly NCCL_VERSION=$(get_metadata_attribute 'nccl-version' "${DEFAULT_NCCL_VERSION}")

  is_complete nccl && return

  if is_cuda11 && is_debian12 ; then
    echo "NCCL cannot be compiled for CUDA 11 on ${_shortname}"
    return
  fi

  local -r nccl_version="${NCCL_VERSION}-1+cuda${CUDA_VERSION}"

  mkdir -p "${workdir}"
  pushd "${workdir}"

  # Fetch and unpack the source tree unless it is already present.
  test -d "${workdir}/nccl" || {
    local tarball_fn="v${NCCL_VERSION}-1.tar.gz"
    curl ${curl_retry_args} \
      "https://github.com/NVIDIA/nccl/archive/refs/tags/${tarball_fn}" \
      | tar xz
    mv "nccl-${NCCL_VERSION}-1" nccl
  }

  local build_path
  if is_debuntu ; then
    build_path="nccl/build/pkg/deb"
  else
    build_path="nccl/build/pkg/rpm/x86_64"
  fi

  test -d "${workdir}/nccl/build" || {
    local build_tarball="nccl-build_${_shortname}_${nccl_version}.tar.gz"
    local local_tarball="${workdir}/${build_tarball}"
    local gcs_tarball="${pkg_bucket}/nvidia/nccl/${_shortname}/${build_tarball}"

    # -lt compares numerically. The previous `"$(nproc)" < 32` was a string
    # comparison inside [[ ]], which orders "4" and "8" after "32" and "128"
    # before it.
    if [[ "$(hostname -s)" =~ ^test-gpu && "$(nproc)" -lt 32 ]] ; then
      # when running with fewer than 32 cores, yield to in-progress build
      sleep $(( ( RANDOM % 11 ) + 10 ))
      # Record the pipeline status explicitly: after `local var="$(cmd)"`, $?
      # is the status of `local` itself and is therefore always 0.
      local output building_marker_rc=0
      output="$(${gsutil_stat_cmd} "${gcs_tarball}.building"|grep '.reation.time')" || building_marker_rc="$?"
      if [[ "${building_marker_rc}"
== "0" ]] ; then
        # A ".building" marker exists: wait for the peer build to finish, but
        # give up 45 minutes after the marker's creation time.
        local build_start_time build_start_epoch timeout_epoch
        build_start_time="$(echo ${output} | awk -F': +' '{print $2}')"
        build_start_epoch="$(date -u -d "${build_start_time}" +%s)"
        timeout_epoch=$((build_start_epoch + 2700)) # 45 minutes
        while ${gsutil_stat_cmd} "${gcs_tarball}.building" ; do
          local now_epoch="$(date -u +%s)"
          if (( now_epoch > timeout_epoch )) ; then
            # detect unexpected build failure after 45m
            ${gsutil_cmd} rm "${gcs_tarball}.building"
            break
          fi
          sleep 5m
        done
      fi
    fi

    if ${gsutil_stat_cmd} "${gcs_tarball}" ; then
      # cache hit - unpack from cache
      echo "cache hit"
      ${gsutil_cmd} cat "${gcs_tarball}" | tar xvz
    else
      # build and cache
      # Publish a ".building" marker so concurrent nodes wait instead of rebuilding.
      touch "${local_tarball}.building"
      ${gsutil_cmd} cp "${local_tarball}.building" "${gcs_tarball}.building"
      building_file="${gcs_tarball}.building"
      pushd nccl
      # https://github.com/NVIDIA/nccl?tab=readme-ov-file#install
      install_build_dependencies

      # https://github.com/NVIDIA/nccl/blob/master/README.md
      # https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
      # Fermi: SM_20, compute_30
      # Kepler: SM_30,SM_35,SM_37, compute_30,compute_35,compute_37
      # Maxwell: SM_50,SM_52,SM_53, compute_50,compute_52,compute_53
      # Pascal: SM_60,SM_61,SM_62, compute_60,compute_61,compute_62

      # The following architectures are supported by open kernel driver
      # Volta: SM_70,SM_72, compute_70,compute_72
      # Ampere: SM_80,SM_86,SM_87, compute_80,compute_86,compute_87

      # The following architectures are supported by CUDA v11.8+
      # Ada: SM_89, compute_89
      # Hopper: SM_90,SM_90a compute_90,compute_90a
      # Blackwell: SM_100, compute_100
      NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72"
      NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86"
      if version_gt "${CUDA_VERSION}" "11.6" ; then
        NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_87,code=sm_87" ; fi
      if version_ge "${CUDA_VERSION}" "11.8" ; then
        NVCC_GENCODE="${NVCC_GENCODE}
-gencode=arch=compute_89,code=sm_89" ; fi
      if version_ge "${CUDA_VERSION}" "12.0" ; then
        NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a" ; fi

      if is_debuntu ; then
        # These packages are required to build .deb packages from source
        execute_with_retries \
          apt-get install -y -qq build-essential devscripts debhelper fakeroot
        export NVCC_GENCODE
        execute_with_retries make -j$(nproc) pkg.debian.build
      elif is_rocky ; then
        # These packages are required to build .rpm packages from source
        execute_with_retries \
          dnf -y -q install rpm-build rpmdevtools
        export NVCC_GENCODE
        execute_with_retries make -j$(nproc) pkg.redhat.build
      fi

      # Archive the built packages, clean the tree, then unpack the archive
      # from ${workdir} so the packages are back under ${build_path}.
      tar czvf "${local_tarball}" "../${build_path}"
      make clean
      popd
      tar xzvf "${local_tarball}"

      # Publish the build and remove the ".building" marker.
      ${gsutil_cmd} cp "${local_tarball}" "${gcs_tarball}"
      if ${gsutil_stat_cmd} "${gcs_tarball}.building" ; then
        ${gsutil_cmd} rm "${gcs_tarball}.building" || true
      fi
      building_file=""
      rm "${local_tarball}"
    fi
  }

  # Install the runtime and development packages produced (or restored) above.
  if is_debuntu ; then
    dpkg -i "${build_path}/libnccl${NCCL_VERSION%%.*}_${nccl_version}_amd64.deb" "${build_path}/libnccl-dev_${nccl_version}_amd64.deb"
  elif is_rocky ; then
    rpm -ivh "${build_path}/libnccl-${nccl_version}.x86_64.rpm" "${build_path}/libnccl-devel-${nccl_version}.x86_64.rpm"
  fi

  popd
  mark_complete nccl
}

# True when GPU_DRIVER_PROVIDER is "NVIDIA".
function is_src_nvidia() { [[ "${GPU_DRIVER_PROVIDER}" == "NVIDIA" ]] ; }
# True when GPU_DRIVER_PROVIDER is "OS".
function is_src_os() { [[ "${GPU_DRIVER_PROVIDER}" == "OS" ]] ; }

# Install cuDNN packages matching CUDNN_VERSION / CUDA_VERSION from the package
# manager. Does nothing on Debian 10 and older; exits 1 on an OS that is
# neither Rocky nor Debian/Ubuntu.
function install_nvidia_cudnn() {
  is_complete cudnn && return
  if le_debian10 ; then return ; fi

  local major_version
  major_version="${CUDNN_VERSION%%.*}"
  local cudnn_pkg_version
  cudnn_pkg_version="${CUDNN_VERSION}-1+cuda${CUDA_VERSION}"

  if is_rocky ; then
    if is_cudnn8 ; then
      execute_with_retries dnf -y -q install \
        "libcudnn${major_version}" \
        "libcudnn${major_version}-devel"
      sync
    elif is_cudnn9 ; then
      execute_with_retries dnf -y -q install \
        "libcudnn9-static-cuda-${CUDA_VERSION%%.*}" \
        "libcudnn9-devel-cuda-${CUDA_VERSION%%.*}"
      sync
    else
      echo
"Unsupported cudnn version: '${major_version}'" fi elif is_debuntu; then if ge_debian12 && is_src_os ; then apt-get -y install nvidia-cudnn else if is_cudnn8 ; then add_repo_cuda apt-get update -qq # Ignore version requested and use the latest version in the package index cudnn_pkg_version="$(apt-cache show libcudnn8 | awk "/^Ver.*cuda${CUDA_VERSION%%.*}.*/ {print \$2}" | sort -V | tail -1)" execute_with_retries \ apt-get -y install --no-install-recommends \ "libcudnn8=${cudnn_pkg_version}" \ "libcudnn8-dev=${cudnn_pkg_version}" sync elif is_cudnn9 ; then install_cuda_keyring_pkg apt-get update -qq execute_with_retries \ apt-get -y install --no-install-recommends \ "libcudnn9-cuda-${CUDA_VERSION%%.*}" \ "libcudnn9-dev-cuda-${CUDA_VERSION%%.*}" \ "libcudnn9-static-cuda-${CUDA_VERSION%%.*}" sync else echo "Unsupported cudnn version: [${CUDNN_VERSION}]" fi fi else echo "Unsupported OS: '${OS_NAME}'" exit 1 fi ldconfig echo "NVIDIA cuDNN successfully installed for ${OS_NAME}." mark_complete cudnn } function install_pytorch() { is_complete pytorch && return local env env=$(get_metadata_attribute 'gpu-conda-env' 'dpgce') local conda_root_path if version_lt "${DATAPROC_IMAGE_VERSION}" "2.3" ; then conda_root_path="/opt/conda/miniconda3" else conda_root_path="/opt/conda" fi [[ -d ${conda_root_path} ]] || return local envpath="${conda_root_path}/envs/${env}" if [[ "${env}" == "base" ]]; then echo "WARNING: installing to base environment known to cause solve issues" ; envpath="${conda_root_path}" ; fi # Set numa node to 0 for all GPUs for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 > ${f} ; done local build_tarball="pytorch_${env}_${_shortname}_cuda${CUDA_VERSION}.tar.gz" local local_tarball="${workdir}/${build_tarball}" local gcs_tarball="${pkg_bucket}/conda/${_shortname}/${build_tarball}" if [[ "$(hostname -s)" =~ ^test && "$(nproc)" < 32 ]] ; then # when running with fewer than 32 cores, yield to in-progress build sleep $(( ( RANDOM % 11 ) + 
10 ))
    # Test the pipeline status directly: after `local output="$(cmd)"`, $? is
    # the status of `local` itself (always 0), so the old `"$?" == "0"` check
    # could never be false.
    local output
    if output="$(${gsutil_stat_cmd} "${gcs_tarball}.building"|grep '.reation.time')" ; then
      # A ".building" marker exists: wait for the peer build to finish, but
      # give up 45 minutes after the marker's creation time.
      local build_start_time build_start_epoch timeout_epoch
      build_start_time="$(echo ${output} | awk -F': +' '{print $2}')"
      build_start_epoch="$(date -u -d "${build_start_time}" +%s)"
      timeout_epoch=$((build_start_epoch + 2700)) # 45 minutes
      while ${gsutil_stat_cmd} "${gcs_tarball}.building" ; do
        local now_epoch="$(date -u +%s)"
        if (( now_epoch > timeout_epoch )) ; then
          # detect unexpected build failure after 45m
          ${gsutil_cmd} rm "${gcs_tarball}.building"
          break
        fi
        sleep 5m
      done
    fi
  fi

  if ${gsutil_stat_cmd} "${gcs_tarball}" ; then
    # cache hit - unpack from cache
    echo "cache hit"
    mkdir -p "${envpath}"
    ${gsutil_cmd} cat "${gcs_tarball}" | tar -C "${envpath}" -xz
  else
    # Publish a ".building" marker so concurrent nodes wait instead of rebuilding.
    touch "${local_tarball}.building"
    ${gsutil_cmd} cp "${local_tarball}.building" "${gcs_tarball}.building"
    building_file="${gcs_tarball}.building"

    # `create` a new environment, or `install` into one that already exists.
    local verb=create
    if test -d "${envpath}" ; then verb=install ; fi

    local cudart_spec="cuda-cudart"
    if le_cuda11 ; then cudart_spec="cudatoolkit" ; fi

    # Install pytorch and company to this environment
    # 'tensorflow[and-cuda]' is quoted so the brackets are never treated as a
    # filename glob.
    "${conda_root_path}/bin/mamba" "${verb}" -n "${env}" \
      -c conda-forge -c nvidia -c rapidsai \
      numba pytorch 'tensorflow[and-cuda]' rapids pyspark \
      "cuda-version<=${CUDA_VERSION}" "${cudart_spec}"

    # Install jupyter kernel in this environment
    "${envpath}/bin/python3" -m pip install ipykernel

    # package environment and cache in GCS
    pushd "${envpath}"
    tar czf "${local_tarball}" .
    popd
    ${gsutil_cmd} cp "${local_tarball}" "${gcs_tarball}"
    if ${gsutil_stat_cmd} "${gcs_tarball}.building" ; then
      ${gsutil_cmd} rm "${gcs_tarball}.building" || true
    fi
    building_file=""
  fi

  # register the environment as a selectable kernel
  "${envpath}/bin/python3" -m ipykernel install --name "${env}" --display-name "Python (${env})"

  mark_complete pytorch
}

# Stage the module-signing key pair under ${CA_TMPDIR} and link it to
# ${mok_key} / ${mok_der}; sets modulus_md5sum. Returns 0 without doing
# anything when PSN is set but empty.
function configure_dkms_certs() {
  if test -v PSN && [[ -z "${PSN}" ]]; then
    echo "No signing secret provided.
skipping";
    return 0
  fi

  mkdir -p "${CA_TMPDIR}"

  # If the private key exists, verify it
  if [[ -f "${CA_TMPDIR}/db.rsa" ]]; then
    echo "Private key material exists"

    local expected_modulus_md5sum
    expected_modulus_md5sum=$(get_metadata_attribute modulus_md5sum)
    if [[ -n "${expected_modulus_md5sum}" ]]; then
      modulus_md5sum="${expected_modulus_md5sum}"

      # Verify that the private key's modulus md5sum matches the expected md5sum
      # (a mismatch is reported but is not fatal)
      if [[ "${modulus_md5sum}" != "$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')" ]]; then
        echo "unmatched rsa key"
      fi

      # Verify that the x509 cert's modulus md5sum matches the expected md5sum
      # (a mismatch is reported but is not fatal)
      if [[ "${modulus_md5sum}" != "$(openssl x509 -noout -modulus -in ${mok_der} | openssl md5 | awk '{print $2}')" ]]; then
        echo "unmatched x509 cert"
      fi
    else
      # No expected value supplied: derive it from the key on disk.
      modulus_md5sum="$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')"
    fi
    ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}"

    return
  fi

  # Retrieve cloud secrets keys
  local sig_priv_secret_name
  sig_priv_secret_name="${PSN}"
  local sig_pub_secret_name
  sig_pub_secret_name="$(get_metadata_attribute public_secret_name)"
  local sig_secret_project
  sig_secret_project="$(get_metadata_attribute secret_project)"
  local sig_secret_version
  sig_secret_version="$(get_metadata_attribute secret_version)"

  # If metadata values are not set, do not write mok keys
  if [[ -z "${sig_priv_secret_name}" ]]; then return 0 ; fi

  # Write private material to volatile storage
  gcloud secrets versions access "${sig_secret_version}" \
    --project="${sig_secret_project}" \
    --secret="${sig_priv_secret_name}" \
    | dd status=none of="${CA_TMPDIR}/db.rsa"

  # Write public material to volatile storage
  gcloud secrets versions access "${sig_secret_version}" \
    --project="${sig_secret_project}" \
    --secret="${sig_pub_secret_name}" \
    | base64 --decode \
    | dd status=none of="${CA_TMPDIR}/db.der"

  local mok_directory="$(dirname "${mok_key}")"
  mkdir -p "${mok_directory}"

  # symlink private key and copy public cert from volatile storage to DKMS directory
ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}" cp -f "${CA_TMPDIR}/db.der" "${mok_der}" modulus_md5sum="$(openssl rsa -noout -modulus -in "${mok_key}" | openssl md5 | awk '{print $2}')" } function clear_dkms_key { if [[ -z "${PSN}" ]]; then echo "No signing secret provided. skipping" >&2 return 0 fi rm -rf "${CA_TMPDIR}" "${mok_key}" } function add_contrib_component() { if ! is_debuntu ; then return ; fi if ge_debian12 ; then # Include in sources file components on which nvidia-kernel-open-dkms depends local -r debian_sources="/etc/apt/sources.list.d/debian.sources" local components="main contrib" sed -i -e "s/Components: .*$/Components: ${components}/" "${debian_sources}" elif is_debian ; then sed -i -e 's/ main$/ main contrib/' /etc/apt/sources.list fi } function add_nonfree_components() { if is_src_nvidia ; then return; fi if ge_debian12 ; then # Include in sources file components on which nvidia-open-kernel-dkms depends local -r debian_sources="/etc/apt/sources.list.d/debian.sources" local components="main contrib non-free non-free-firmware" sed -i -e "s/Components: .*$/Components: ${components}/" "${debian_sources}" elif is_debian ; then sed -i -e 's/ main$/ main contrib non-free/' /etc/apt/sources.list fi } # # Install package signing key and add corresponding repository # https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html function add_repo_nvidia_container_toolkit() { local nvctk_root="https://nvidia.github.io/libnvidia-container" local signing_key_url="${nvctk_root}/gpgkey" local repo_data # Since there are more than one keys to go into this keychain, we can't call os_add_repo, which only works with one if is_debuntu ; then # "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}" local -r repo_name="nvidia-container-toolkit" local -r kr_path="/usr/share/keyrings/${repo_name}.gpg" execute_with_retries gpg --keyserver keyserver.ubuntu.com \ --no-default-keyring --keyring "${kr_path}" \ 
--recv-keys "0xae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80" "0xeb693b3035cd5710e231e123a4b469963bf863cc" "0xc95b321b61e88c1809c4f759ddcae044f796ecb0" local -r repo_data="${nvctk_root}/stable/deb/\$(ARCH) /" local -r repo_path="/etc/apt/sources.list.d/${repo_name}.list" echo "deb [signed-by=${kr_path}] ${repo_data}" > "${repo_path}" echo "deb-src [signed-by=${kr_path}] ${repo_data}" >> "${repo_path}" execute_with_retries apt-get update else repo_data="${nvctk_root}/stable/rpm/nvidia-container-toolkit.repo" os_add_repo nvidia-container-toolkit \ "${signing_key_url}" \ "${repo_data}" \ "no" fi } function add_repo_cuda() { if is_debuntu ; then if version_le "${CUDA_VERSION}" 11.6 ; then local kr_path=/usr/share/keyrings/cuda-archive-keyring.gpg local sources_list_path="/etc/apt/sources.list.d/cuda-${shortname}-x86_64.list" echo "deb [signed-by=${kr_path}] https://developer.download.nvidia.com/compute/cuda/repos/${shortname}/x86_64/ /" \ | sudo tee "${sources_list_path}" gpg --keyserver keyserver.ubuntu.com \ --no-default-keyring --keyring "${kr_path}" \ --recv-keys "0xae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80" "0xeb693b3035cd5710e231e123a4b469963bf863cc" else install_cuda_keyring_pkg # 11.7+, 12.0+ fi elif is_rocky ; then execute_with_retries "dnf config-manager --add-repo ${NVIDIA_ROCKY_REPO_URL}" fi } function build_driver_from_github() { # non-GPL driver will have been built on rocky8, or when driver # version is prior to open driver min, or GPU architecture is prior # to Turing if ( is_rocky8 \ || version_lt "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" \ || [[ "$((16#${pci_device_id}))" < "$((16#1E00))" ]] ) ; then return 0 ; fi pushd "${workdir}" test -d "${workdir}/open-gpu-kernel-modules" || { tarball_fn="${DRIVER_VERSION}.tar.gz" execute_with_retries curl ${curl_retry_args} \ "https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/${tarball_fn}" \ \| tar xz mv "open-gpu-kernel-modules-${DRIVER_VERSION}" open-gpu-kernel-modules } local 
nvidia_ko_path="$(find /lib/modules/$(uname -r)/ -name 'nvidia.ko')" test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || { local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}.tar.gz" local local_tarball="${workdir}/${build_tarball}" local build_dir if test -v modulus_md5sum && [[ -n "${modulus_md5sum}" ]] then build_dir="${modulus_md5sum}" else build_dir="unsigned" ; fi local gcs_tarball="${pkg_bucket}/nvidia/kmod/${_shortname}/${uname_r}/${build_dir}/${build_tarball}" if [[ "$(hostname -s)" =~ ^test && "$(nproc)" < 32 ]] ; then # when running with fewer than 32 cores, yield to in-progress build sleep $(( ( RANDOM % 11 ) + 10 )) local output="$(${gsutil_stat_cmd} "${gcs_tarball}.building"|grep '.reation.time')" if [[ "$?" == "0" ]] ; then local build_start_time build_start_epoch timeout_epoch build_start_time="$(echo ${output} | awk -F': +' '{print $2}')" build_start_epoch="$(date -u -d "${build_start_time}" +%s)" timeout_epoch=$((build_start_epoch + 2700)) # 45 minutes while ${gsutil_stat_cmd} "${gcs_tarball}.building" ; do local now_epoch="$(date -u +%s)" if (( now_epoch > timeout_epoch )) ; then # detect unexpected build failure after 45m ${gsutil_cmd} rm "${gcs_tarball}.building" || echo "might have been deleted by a peer" break fi sleep 5m done fi fi if ${gsutil_stat_cmd} "${gcs_tarball}" 2>&1 ; then echo "cache hit" else # build the kernel modules touch "${local_tarball}.building" ${gsutil_cmd} cp "${local_tarball}.building" "${gcs_tarball}.building" building_file="${gcs_tarball}.building" pushd open-gpu-kernel-modules install_build_dependencies if ( is_cuda11 && is_ubuntu22 ) ; then echo "Kernel modules cannot be compiled for CUDA 11 on ${_shortname}" exit 1 fi execute_with_retries make -j$(nproc) modules \ > kernel-open/build.log \ 2> kernel-open/build_error.log # Sign kernel modules if [[ -n "${PSN}" ]]; then configure_dkms_certs for module in $(find open-gpu-kernel-modules/kernel-open -name '*.ko'); do 
"/lib/modules/${uname_r}/build/scripts/sign-file" sha256 \ "${mok_key}" \ "${mok_der}" \ "${module}" done clear_dkms_key fi make modules_install \ >> kernel-open/build.log \ 2>> kernel-open/build_error.log # Collect build logs and installed binaries tar czvf "${local_tarball}" \ "${workdir}/open-gpu-kernel-modules/kernel-open/"*.log \ $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko') ${gsutil_cmd} cp "${local_tarball}" "${gcs_tarball}" if ${gsutil_stat_cmd} "${gcs_tarball}.building" ; then ${gsutil_cmd} rm "${gcs_tarball}.building" || true ; fi building_file="" rm "${local_tarball}" make clean popd fi ${gsutil_cmd} cat "${gcs_tarball}" | tar -C / -xzv depmod -a } popd } function build_driver_from_packages() { if is_debuntu ; then if [[ -n "$(apt-cache search -n "nvidia-driver-${DRIVER}-server-open")" ]] ; then local pkglist=("nvidia-driver-${DRIVER}-server-open") ; else local pkglist=("nvidia-driver-${DRIVER}-open") ; fi if is_debian ; then pkglist=( "firmware-nvidia-gsp=${DRIVER_VERSION}-1" "nvidia-smi=${DRIVER_VERSION}-1" "nvidia-alternative=${DRIVER_VERSION}-1" "nvidia-kernel-open-dkms=${DRIVER_VERSION}-1" "nvidia-kernel-support=${DRIVER_VERSION}-1" "nvidia-modprobe=${DRIVER_VERSION}-1" "libnvidia-ml1=${DRIVER_VERSION}-1" ) fi add_contrib_component apt-get update -qq execute_with_retries apt-get install -y -qq --no-install-recommends dkms configure_dkms_certs execute_with_retries apt-get install -y -qq --no-install-recommends "${pkglist[@]}" sync elif is_rocky ; then configure_dkms_certs if execute_with_retries dnf -y -q module install "nvidia-driver:${DRIVER}-dkms" ; then echo "nvidia-driver:${DRIVER}-dkms installed successfully" else execute_with_retries dnf -y -q module install 'nvidia-driver:latest' fi sync fi clear_dkms_key } function install_nvidia_userspace_runfile() { # Parameters for NVIDIA-provided Debian GPU driver readonly 
DEFAULT_USERSPACE_URL="https://us.download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run" readonly USERSPACE_URL=$(get_metadata_attribute 'gpu-driver-url' "${DEFAULT_USERSPACE_URL}") USERSPACE_FILENAME="$(echo ${USERSPACE_URL} | perl -pe 's{^.+/}{}')" readonly USERSPACE_FILENAME # This .run file contains NV's OpenGL implementation as well as # nvidia optimized implementations of the gtk+ 2,3 stack(s) not # including glib (https://docs.gtk.org/glib/), and what appears to # be a copy of the source from the kernel-open directory of for # example DRIVER_VERSION=560.35.03 # # https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/560.35.03.tar.gz # # wget https://us.download.nvidia.com/XFree86/Linux-x86_64/560.35.03/NVIDIA-Linux-x86_64-560.35.03.run # sh ./NVIDIA-Linux-x86_64-560.35.03.run -x # this will allow you to review the contents of the package without installing it. is_complete userspace && return local local_fn="${tmpdir}/userspace.run" cache_fetched_package "${USERSPACE_URL}" \ "${pkg_bucket}/nvidia/${USERSPACE_FILENAME}" \ "${local_fn}" local runfile_args runfile_args="" local cache_hit="0" local local_tarball # Build nonfree driver on rocky8, or when driver version is prior to # open driver min, or when GPU architecture is prior to Turing if ( is_rocky8 \ || version_lt "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" \ || [[ "$((16#${pci_device_id}))" < "$((16#1E00))" ]] ) then local nvidia_ko_path="$(find /lib/modules/$(uname -r)/ -name 'nvidia.ko')" test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || { local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}_nonfree.tar.gz" local_tarball="${workdir}/${build_tarball}" local build_dir if test -v modulus_md5sum && [[ -n "${modulus_md5sum}" ]] then build_dir="${modulus_md5sum}" else build_dir="unsigned" ; fi local gcs_tarball="${pkg_bucket}/nvidia/kmod/${_shortname}/${uname_r}/${build_dir}/${build_tarball}" if [[ "$(hostname -s)" =~ ^test && 
"$(nproc)" -lt 32 ]] ; then
        # (-lt compares numerically; the previous `<` inside [[ ]] compared
        # the core count as a string, ordering "4" and "8" after "32")
        # when running with fewer than 32 cores, yield to in-progress build
        sleep $(( ( RANDOM % 11 ) + 10 ))
        # Test the pipeline status directly: after `local output="$(cmd)"`, $?
        # is the status of `local` itself and is always 0.
        local output
        if output="$(${gsutil_stat_cmd} "${gcs_tarball}.building"|grep '.reation.time')" ; then
          local build_start_time build_start_epoch timeout_epoch
          build_start_time="$(echo ${output} | awk -F': +' '{print $2}')"
          build_start_epoch="$(date -u -d "${build_start_time}" +%s)"
          timeout_epoch=$((build_start_epoch + 2700)) # 45 minutes
          while ${gsutil_stat_cmd} "${gcs_tarball}.building" ; do
            local now_epoch="$(date -u +%s)"
            if (( now_epoch > timeout_epoch )) ; then
              # detect unexpected build failure after 45m
              ${gsutil_cmd} rm "${gcs_tarball}.building"
              break
            fi
            sleep 5m
          done
        fi
      fi

      if ${gsutil_stat_cmd} "${gcs_tarball}" ; then
        cache_hit="1"
        if version_ge "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" ; then
          runfile_args="${runfile_args} --no-kernel-modules"
        fi
        echo "cache hit"
      else
        # build the kernel modules
        # Publish a ".building" marker so concurrent nodes wait instead of rebuilding.
        touch "${local_tarball}.building"
        ${gsutil_cmd} cp "${local_tarball}.building" "${gcs_tarball}.building"
        building_file="${gcs_tarball}.building"
        install_build_dependencies
        configure_dkms_certs
        # The embedded \" quotes survive as literal characters here and are
        # interpreted later, when execute_with_retries evals the joined
        # command string.
        local signing_options
        signing_options=""
        if [[ -n "${PSN}" ]]; then
          signing_options="--module-signing-hash sha256 \
          --module-signing-x509-hash sha256 \
          --module-signing-secret-key \"${mok_key}\" \
          --module-signing-public-key \"${mok_der}\" \
          --module-signing-script \"/lib/modules/${uname_r}/build/scripts/sign-file\" \
          "
        fi
        runfile_args="${signing_options}"
        if version_ge "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" ; then
          runfile_args="${runfile_args} --no-dkms"
        fi
      fi
    }
  elif version_ge "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" ; then
    runfile_args="--no-kernel-modules"
  fi

  # ${runfile_args} is intentionally unquoted so it splits into separate options.
  execute_with_retries bash "${local_fn}" -e -q \
    ${runfile_args} \
    --ui=none \
    --install-libglvnd \
    --tmpdir="${tmpdir}"

  # On rocky8, or when driver version is prior to open driver min, or when GPU architecture is prior to Turing
  if ( is_rocky8 \
    || version_lt
"${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" \ || [[ "$((16#${pci_device_id}))" < "$((16#1E00))" ]] ) ; then if [[ "${cache_hit}" == "1" ]] ; then ${gsutil_cmd} cat "${gcs_tarball}" | tar -C / -xzv depmod -a else clear_dkms_key tar czvf "${local_tarball}" \ /var/log/nvidia-installer.log \ $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko') ${gsutil_cmd} cp "${local_tarball}" "${gcs_tarball}" if ${gsutil_stat_cmd} "${gcs_tarball}.building" ; then ${gsutil_cmd} rm "${gcs_tarball}.building" || true ; fi building_file="" fi fi rm -f "${local_fn}" mark_complete userspace sync } function install_cuda_runfile() { is_complete cuda && return local local_fn="${tmpdir}/cuda.run" cache_fetched_package "${NVIDIA_CUDA_URL}" \ "${pkg_bucket}/nvidia/${CUDA_RUNFILE}" \ "${local_fn}" execute_with_retries bash "${local_fn}" --toolkit --no-opengl-libs --silent --tmpdir="${tmpdir}" rm -f "${local_fn}" mark_complete cuda sync } function install_cuda_toolkit() { local cudatk_package=cuda-toolkit if ge_debian12 && is_src_os ; then cudatk_package="${cudatk_package}=${CUDA_FULL_VERSION}-1" elif [[ -n "${CUDA_VERSION}" ]]; then cudatk_package="${cudatk_package}-${CUDA_VERSION//./-}" fi cuda_package="cuda=${CUDA_FULL_VERSION}-1" readonly cudatk_package if is_debuntu ; then # if is_ubuntu ; then execute_with_retries "apt-get install -y -qq --no-install-recommends cuda-drivers-${DRIVER}=${DRIVER_VERSION}-1" ; fi execute_with_retries apt-get install -y -qq --no-install-recommends ${cuda_package} ${cudatk_package} elif is_rocky ; then # rocky9: cuda-11-[7,8], cuda-12-[1..6] execute_with_retries dnf -y -q install "${cudatk_package}" fi sync } function load_kernel_module() { # for some use cases, the kernel module needs to be removed before first use of nvidia-smi for module in nvidia_uvm nvidia_drm nvidia_modeset nvidia ; do ( set +e rmmod ${module} > /dev/null 2>&1 || echo "unable to rmmod ${module}" ) done depmod -a modprobe nvidia for suffix in uvm modeset drm; do modprobe "nvidia-${suffix}" 
# Install the CUDA toolkit on hosts that have a GPU.
#
# Globals read: gpu_count, _shortname.
# Returns early when the "cuda-repo" phase marker exists or when gpu_count
# is "0".  When ge_debian12 and is_src_os both succeed, only a message is
# printed and the function returns 0 without installing anything here.
function install_cuda(){
  is_complete cuda-repo && return
  if [[ "${gpu_count}" == "0" ]] ; then return ; fi

  if ( ge_debian12 && is_src_os ) ; then
    echo "installed with the driver on ${_shortname}"
    return 0
  fi

  # The OS package distributions are unreliable
  install_cuda_runfile

  # Includes CUDA packages
  add_repo_cuda

  mark_complete cuda-repo
}

# Install nvidia-container-toolkit and wire it into the container runtime.
#
# Globals written: CONTAINER_RUNTIME (from the 'container-runtime' metadata
#                  attribute, defaulting to the first of docker, containerd,
#                  crio found by `command -v`).
# Returns early when the "install-nvctk" phase marker exists, or when no
# runtime is configured and none is detected.
function install_nvidia_container_toolkit() {
  is_complete install-nvctk && return

  # Probe silently: `command -v` prints the resolved path, which is noise here.
  local container_runtime_default=''
  local candidate
  for candidate in docker containerd crio ; do
    if command -v "${candidate}" > /dev/null 2>&1 ; then
      container_runtime_default="${candidate}"
      break
    fi
  done
  CONTAINER_RUNTIME=$(get_metadata_attribute 'container-runtime' "${container_runtime_default}")
  if test -z "${CONTAINER_RUNTIME}" ; then return ; fi

  add_repo_nvidia_container_toolkit
  if is_debuntu ; then
    execute_with_retries apt-get install -y -q nvidia-container-toolkit
  else
    execute_with_retries dnf install -y -q nvidia-container-toolkit
  fi
  nvidia-ctk runtime configure --runtime="${CONTAINER_RUNTIME}"
  systemctl restart "${CONTAINER_RUNTIME}"

  mark_complete install-nvctk
}

# Install the NVIDIA GPU driver.
#
# Globals read: gpu_count, _shortname.
# Returns early when the "gpu-driver" phase marker exists or when gpu_count
# is "0".  When ge_debian12 and is_src_os both succeed, the driver comes from
# distribution packages; otherwise it comes from the NVIDIA runfile plus
# build_driver_from_github.  Both successful paths write the "gpu-driver"
# marker so that the guard at the top can skip the work on a later run.
function install_nvidia_gpu_driver() {
  is_complete gpu-driver && return
  if [[ "${gpu_count}" == "0" ]] ; then return ; fi

  if ( ge_debian12 && is_src_os ) ; then
    add_nonfree_components
    apt-get update -qq
    apt-get -yq install \
        dkms \
        nvidia-open-kernel-dkms \
        nvidia-open-kernel-support \
        nvidia-smi \
        libglvnd0 \
        libcuda1
    echo "NVIDIA GPU driver provided by ${_shortname} was installed successfully"
    # Previously this path returned without recording completion.
    mark_complete gpu-driver
    return 0
  fi

  # OS driver packages do not produce reliable driver ; use runfile
  install_nvidia_userspace_runfile

  build_driver_from_github

  echo "NVIDIA GPU driver provided by NVIDIA was installed successfully"
  mark_complete gpu-driver
}
/opt/google cd /opt/google # https://cloud.google.com/stackdriver/docs/solutions/agents/ops-agent/installation curl ${curl_retry_args} -O https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh local expected="038d98644e4c4a7969d26da790946720d278c8d49bb82b677f550c2a2b858411 add-google-cloud-ops-agent-repo.sh" execute_with_retries bash add-google-cloud-ops-agent-repo.sh --also-install mark_complete ops-agent } # Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics function install_gpu_agent() { # Stackdriver GPU agent parameters # local -r GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/master/dlvm/gcp-gpu-utilization-metrics' local -r GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/refs/heads/master/dlvm/gcp-gpu-utilization-metrics' if ( ! command -v pip && is_debuntu ) ; then execute_with_retries "apt-get install -y -qq python3-pip" fi local install_dir=/opt/gpu-utilization-agent mkdir -p "${install_dir}" curl ${curl_retry_args} \ "${GPU_AGENT_REPO_URL}/requirements.txt" -o "${install_dir}/requirements.txt" curl ${curl_retry_args} \ "${GPU_AGENT_REPO_URL}/report_gpu_metrics.py" \ | sed -e 's/-u --format=/--format=/' \ | dd status=none of="${install_dir}/report_gpu_metrics.py" local venv="${install_dir}/venv" python_interpreter="/opt/conda/miniconda3/bin/python3" [[ -f "${python_interpreter}" ]] || python_interpreter="$(command -v python3)" if version_ge "${DATAPROC_IMAGE_VERSION}" "2.2" && is_debuntu ; then execute_with_retries "apt-get install -y -qq python3-venv" fi "${python_interpreter}" -m venv "${venv}" ( source "${venv}/bin/activate" python3 -m pip install --upgrade pip execute_with_retries python3 -m pip install -r "${install_dir}/requirements.txt" ) sync # Generate GPU service. cat </lib/systemd/system/gpu-utilization-agent.service [Unit] Description=GPU Utilization Metric Agent [Service] Type=simple PIDFile=/run/gpu_agent.pid ExecStart=/bin/bash --login -c '. 
# Set one property in a Hadoop XML configuration file, overwriting any
# existing value (--clobber).
#
# Arguments: $1 - configuration file name, relative to HADOOP_CONF_DIR
#            $2 - property name
#            $3 - property value
# Globals read: bdcfg (the command that is invoked), HADOOP_CONF_DIR.
function set_hadoop_property() {
  local -r config_file=$1
  local -r property=$2
  local -r value=$3
  "${bdcfg}" set_property \
    --configuration_file "${HADOOP_CONF_DIR}/${config_file}" \
    --name "${property}" --value "${value}" \
    --clobber
}

# Declare yarn.io/gpu as a YARN resource type and select the dominant
# resource calculator for the capacity scheduler.
#
# Globals read: HADOOP_CONF_DIR.
# Returns 0 without doing anything when HADOOP_CONF_DIR does not exist.
function configure_yarn_resources() {
  if [[ ! -d "${HADOOP_CONF_DIR}" ]] ; then
    # TODO: when running this script to customize an image, this file
    # needs to be written *after* bdutil completes
    return 0
  fi # pre-init scripts

  local -r resource_types_xml="${HADOOP_CONF_DIR}/resource-types.xml"
  if [[ ! -f "${resource_types_xml}" ]]; then
    # Seed an empty but well-formed configuration document so that a property
    # can be added to it.  The text as received wrote only a newline here.
    # NOTE(review): markup appears to have been stripped from that literal;
    # an empty <configuration/> document is assumed - confirm against upstream.
    printf '<?xml version="1.0" ?>\n<configuration/>\n' >"${resource_types_xml}"
  fi
  set_hadoop_property 'resource-types.xml' 'yarn.resource-types' 'yarn.io/gpu'

  set_hadoop_property 'capacity-scheduler.xml' \
    'yarn.scheduler.capacity.resource-calculator' \
    'org.apache.hadoop.yarn.util.resource.DominantResourceCalculator'

  set_hadoop_property 'yarn-site.xml' 'yarn.resource-types' 'yarn.io/gpu'
}
# Put the GPU into EXCLUSIVE_PROCESS compute mode.
#
# Globals read: SPARK_VERSION.
# Returns 0 without touching the GPU when SPARK_VERSION is 3.0 or newer.
function configure_gpu_exclusive_mode() {
  # only run this function when spark < 3.0
  if version_ge "${SPARK_VERSION}" "3.0" ; then return 0 ; fi
  # include exclusive mode on GPU
  nvsmi -c EXCLUSIVE_PROCESS
}

# Download the MIG helper scripts (nvidia-smi wrapper and mig2gpu.sh) into
# /usr/local/yarn-mig-scripts and make them executable.
#
# Each file is written to an explicit -O target.  With `wget -P`, a retry or
# a second invocation saves the new copy as "<name>.1" and leaves whatever
# was already at the canonical name in place.
function fetch_mig_scripts() {
  local -r dest_dir='/usr/local/yarn-mig-scripts'
  local -r base_url='https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts'
  local script_name

  mkdir -p "${dest_dir}"
  sudo chmod 755 "${dest_dir}"
  for script_name in nvidia-smi mig2gpu.sh ; do
    execute_with_retries wget -O "${dest_dir}/${script_name}" "${base_url}/${script_name}"
  done
  sudo chmod 755 "${dest_dir}"/*
}
Here we just create it based off of: # https://raw.githubusercontent.com/apache/spark/master/examples/src/main/scripts/getGpusResources.sh local -r gpus_resources_script="${spark_gpu_script_dir}/getGpusResources.sh" cat > "${gpus_resources_script}" <<'EOF' #!/usr/bin/env bash # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # Example output: {"name": "gpu", "addresses":["0","1","2","3","4","5","6","7"]} set -e resources_json="/dev/shm/nvidia/gpusResources.json" if test -f "${resources_json}" ; then cat "${resources_json}" ; exit 0 ; fi mkdir -p "$(dirname ${resources_json})" ADDRS=$(nvidia-smi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}))') echo {\"name\": \"gpu\", \"addresses\":[${ADDRS}]} | tee "${resources_json}" EOF chmod a+rx "${gpus_resources_script}" if version_lt "${SPARK_VERSION}" "3.0" ; then return ; fi local spark_defaults_conf="/etc/spark/conf.dist/spark-defaults.conf" local spark_defaults_dir="$(dirname "${spark_defaults_conf}")" if ! 
grep spark.executor.resource.gpu.discoveryScript "${spark_defaults_conf}" ; then echo "spark.executor.resource.gpu.discoveryScript=${gpus_resources_script}" >> "${spark_defaults_conf}" fi local executor_cores executor_cores="$(nproc | perl -MPOSIX -pe '$_ = POSIX::floor( $_ * 0.75 ); $_-- if $_ % 2')" [[ "${executor_cores}" == "0" ]] && executor_cores=1 local executor_memory executor_memory_gb="$(awk '/^MemFree/ {print $2}' /proc/meminfo | perl -MPOSIX -pe '$_ *= 0.75; $_ = POSIX::floor( $_ / (1024*1024) )')" local task_cpus=2 [[ "${task_cpus}" -gt "${executor_cores}" ]] && task_cpus="${executor_cores}" local gpu_amount # gpu_amount="$(echo $executor_cores | perl -pe "\$_ = ( ${gpu_count} / (\$_ / ${task_cpus}) )")" gpu_amount="$(perl -e "print 1 / ${executor_cores}")" # the gpu.amount properties are not appropriate for the version of # spark shipped with 1.5 images using the capacity scheduler. TODO: # In order to get spark rapids GPU accelerated SQL working on 1.5 # images, we must configure the Fair scheduler version_ge "${DATAPROC_IMAGE_VERSION}" "2.0" || return if ! grep -q "BEGIN : RAPIDS properties" "${spark_defaults_conf}"; then cat >>"${spark_defaults_conf}" <> "${HADOOP_CONF_DIR}/container-executor.cfg" printf 'export MIG_AS_GPU_ENABLED=1\n' >> "${HADOOP_CONF_DIR}/yarn-env.sh" printf 'export ENABLE_MIG_GPUS_FOR_CGROUPS=1\n' >> "${HADOOP_CONF_DIR}/yarn-env.sh" else printf '\n[gpu]\nmodule.enabled=true\n[cgroups]\nroot=/sys/fs/cgroup\nyarn-hierarchy=yarn\n' >> "${HADOOP_CONF_DIR}/container-executor.cfg" fi # Configure a systemd unit to ensure that permissions are set on restart cat >/etc/systemd/system/dataproc-cgroup-device-permissions.service<&2 ; return 0 elif ! 
eval "${nvsmi} > /dev/null" ; then echo "nvidia-smi fails" >&2 ; return 0 else nvsmi_works="1" ; fi if test -v 1 && [[ "$1" == "-L" ]] ; then local NV_SMI_L_CACHE_FILE="/var/run/nvidia-smi_-L.txt" if [[ -f "${NV_SMI_L_CACHE_FILE}" ]]; then cat "${NV_SMI_L_CACHE_FILE}" else "${nvsmi}" $* | tee "${NV_SMI_L_CACHE_FILE}" ; fi return 0 fi "${nvsmi}" $* } function install_build_dependencies() { is_complete build-dependencies && return if is_debuntu ; then if is_ubuntu22 && is_cuda12 ; then # On ubuntu22, the default compiler does not build some kernel module versions # https://forums.developer.nvidia.com/t/linux-new-kernel-6-5-0-14-ubuntu-22-04-can-not-compile-nvidia-display-card-driver/278553/11 execute_with_retries apt-get install -y -qq gcc-12 update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11 update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 12 update-alternatives --set gcc /usr/bin/gcc-12 elif is_ubuntu22 && version_lt "${CUDA_VERSION}" "11.7" ; then # On cuda less than 11.7, the kernel driver does not build on ubuntu22 # https://forums.developer.nvidia.com/t/latest-nvidia-driver-470-63-01-installation-fails-with-latest-linux-kernel-5-16-5-100/202972 echo "N.B.: Older CUDA 11 known bad on ${_shortname}" fi elif is_rocky ; then execute_with_retries dnf -y -q install gcc local dnf_cmd="dnf -y -q install kernel-devel-${uname_r}" set +e eval "${dnf_cmd}" > "${install_log}" 2>&1 local retval="$?" 
# Phase markers: each completed installation phase is recorded as an empty
# file named after the phase under "${workdir}/complete".

# Succeeds when the marker file for phase $1 exists.
function is_complete() {
  local phase="$1"
  test -f "${workdir}/complete/${phase}"
}

# Create the marker file for phase $1.  The marker directory is created on
# demand so that the first call cannot fail on a missing directory.
function mark_complete() {
  local phase="$1"
  mkdir -p "${workdir}/complete"
  touch "${workdir}/complete/${phase}"
}

# Remove the marker file for phase $1; a missing marker is not an error.
function mark_incomplete() {
  local phase="$1"
  rm -f "${workdir}/complete/${phase}"
}

# Install the small set of OS packages this script depends on (pciutils and
# screen), using apt-get or dnf depending on is_debuntu / is_rocky.
# Skipped when the "install-dependencies" phase marker exists.
function install_dependencies() {
  is_complete install-dependencies && return 0

  local -r pkg_list="pciutils screen"
  # ${pkg_list} is intentionally unquoted: it must split into package names.
  if is_debuntu ; then
    execute_with_retries apt-get -y -q install ${pkg_list}
  elif is_rocky ; then
    execute_with_retries dnf -y -q install ${pkg_list}
  fi
  mark_complete install-dependencies
}
# Hold all NVIDIA-related packages so that they are not upgraded
# unintentionally, for example by services like unattended-upgrades.
# Users should run apt-mark unhold before they wish to upgrade these packages.
#
# Does nothing unless is_debuntu succeeds.  The package patterns are quoted
# so that apt-mark receives them verbatim; unquoted, the shell would first
# expand them against files in the current directory.
function hold_nvidia_packages() {
  if ! is_debuntu ; then return ; fi

  apt-mark hold 'nvidia-*'    > /dev/null 2>&1
  apt-mark hold 'libnvidia-*' > /dev/null 2>&1
  if dpkg -l | grep -q "xserver-xorg-video-nvidia"; then
    apt-mark hold 'xserver-xorg-video-nvidia*'
  fi
}

# Validate the Secure Boot configuration and choose the MOK key locations.
#
# Globals written: PSN (readonly, 'private_secret_name' metadata attribute),
#                  CA_TMPDIR (readonly), mok_key, mok_der.
# Exits the shell with status 1 when Secure Boot is enabled and le_debian11
# succeeds; returns 1 when Secure Boot is enabled but PSN is empty; returns 0
# otherwise.
function check_secure_boot() {
  local SECURE_BOOT="disabled"
  # Probe silently: `command -v` would otherwise print the resolved path.
  if command -v mokutil > /dev/null 2>&1 ; then
    SECURE_BOOT=$(mokutil --sb-state|awk '{print $2}')
  fi

  PSN="$(get_metadata_attribute private_secret_name)"
  readonly PSN

  if [[ "${SECURE_BOOT}" == "enabled" ]] && le_debian11 ; then
    echo "Error: Secure Boot is not supported on Debian before image 2.2. Please disable Secure Boot while creating the cluster."
    exit 1
  elif [[ "${SECURE_BOOT}" == "enabled" ]] && [[ -z "${PSN}" ]]; then
    echo "Error: Secure boot is enabled, but no signing material provided."
    echo "Please either disable secure boot or provide signing material as per"
    echo "https://github.com/GoogleCloudDataproc/custom-images/tree/master/examples/secure-boot"
    return 1
  fi

  # -u only generates a name; the directory itself is not created here.
  CA_TMPDIR="$(mktemp -u -d -p /run/tmp -t ca_dir-XXXX)"
  readonly CA_TMPDIR

  if is_ubuntu ; then
    mok_key=/var/lib/shim-signed/mok/MOK.priv
    mok_der=/var/lib/shim-signed/mok/MOK.der
  else
    mok_key=/var/lib/dkms/mok.key
    mok_der=/var/lib/dkms/mok.pub
  fi
  return 0
}
-v SPARK_VERSION || "${SPARK_VERSION}" == "0.0" ]]; then SPARK_VERSION="$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1 || echo "0.0")" fi # Re-check GPU count set +e gpu_count="$(grep -i PCI_ID=10DE /sys/bus/pci/devices/*/uevent | wc -l)" set -e # Re-check MIG status IS_MIG_ENABLED=0 NVIDIA_SMI_PATH='/usr/bin' # Reset default path MIG_MAJOR_CAPS=0 if [[ "${gpu_count}" -gt "0" ]] && nvsmi >/dev/null 2>&1; then # Check if nvsmi works before querying migquery_result="$(nvsmi --query-gpu=mig.mode.current --format=csv,noheader || echo '[N/A]')" if [[ "${migquery_result}" != "[N/A]" && "${migquery_result}" != "" ]]; then NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)" if [[ "${NUM_MIG_GPUS}" -eq "1" ]] && (echo "${migquery_result}" | grep -q Enabled); then IS_MIG_ENABLED=1 NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/' # Set MIG path MIG_MAJOR_CAPS=$(grep nvidia-caps /proc/devices | cut -d ' ' -f 1 || echo 0) if [[ ! -d "/usr/local/yarn-mig-scripts" ]]; then fetch_mig_scripts || echo "WARN: Failed to fetch MIG scripts." >&2; fi fi fi fi # Ensure config directories exist if [[ ! -d "${HADOOP_CONF_DIR}" || ! -d "${SPARK_CONF_DIR}" ]]; then echo "ERROR: Config directories (${HADOOP_CONF_DIR}, ${SPARK_CONF_DIR}) not found. Cannot apply configuration." return 1 # Use return instead of exit in a function fi # Run config applicable to all nodes configure_yarn_resources # Run node-specific config if [[ "${gpu_count}" -gt 0 ]]; then configure_yarn_nodemanager install_spark_rapids # Installs JARs configure_gpu_script configure_gpu_isolation configure_gpu_exclusive_mode # Call this here, it checks Spark version internally elif [[ "${ROLE}" == "Master" ]]; then # Master node without GPU still needs some config configure_yarn_nodemanager install_spark_rapids # Still need JARs on Master configure_gpu_script else # Worker node without GPU, skip node-specific YARN/Spark config. 
: fi # Restart services after config for svc in resourcemanager nodemanager; do if (systemctl is-active --quiet hadoop-yarn-${svc}.service); then systemctl stop hadoop-yarn-${svc}.service || echo "WARN: Failed to stop ${svc}" systemctl start hadoop-yarn-${svc}.service || echo "WARN: Failed to start ${svc}" fi done return 0 # Explicitly return success } # This function now ONLY generates the script and service file. # It does NOT enable the service here. function create_deferred_config_files() { local -r service_name="dataproc-gpu-config" local -r service_file="/etc/systemd/system/${service_name}.service" # This is the script that will contain the config logic local -r config_script_path="/usr/local/sbin/apply-dataproc-gpu-config.sh" # Use 'declare -f' to extract function definitions needed by the config logic # and write them, along with the config logic itself, into the new script. cat < "${config_script_path}" #!/bin/bash # Deferred configuration script generated by install_gpu_driver.sh set -xeuo pipefail # --- Minimal necessary functions and variables --- # Define constants readonly HADOOP_CONF_DIR='/etc/hadoop/conf' readonly SPARK_CONF_DIR='/etc/spark/conf' readonly bdcfg="/usr/local/bin/bdconfig" readonly workdir=/opt/install-dpgce # Needed for cache_fetched_package # --- Define Necessary Global Arrays --- # These need to be explicitly defined here as they are not functions. 
$(declare -p DRIVER_FOR_CUDA) $(declare -p DRIVER_SUBVER) $(declare -p CUDNN_FOR_CUDA) $(declare -p NCCL_FOR_CUDA) $(declare -p CUDA_SUBVER) # drv_for_cuda is defined within set_cuda_runfile_url, which is included below # Define minimal metadata functions $(declare -f print_metadata_value) $(declare -f print_metadata_value_if_exists) $(declare -f get_metadata_value) $(declare -f get_metadata_attribute) # Define nvsmi wrapper $(declare -f nvsmi) nvsmi_works="0" # Initialize variable used by nvsmi # Define version comparison $(declare -f version_ge) $(declare -f version_gt) $(declare -f version_le) $(declare -f version_lt) # Define OS check functions $(declare -f os_id) $(declare -f os_version) $(declare -f os_codename) # Added os_codename as it's used by clean_up_sources_lists indirectly via os_add_repo $(declare -f is_debian) $(declare -f is_ubuntu) $(declare -f is_rocky) $(declare -f is_debuntu) $(declare -f is_debian10) $(declare -f is_debian11) $(declare -f is_debian12) $(declare -f is_rocky8) $(declare -f is_rocky9) $(declare -f is_ubuntu18) $(declare -f is_ubuntu20) $(declare -f is_ubuntu22) $(declare -f ge_debian12) $(declare -f le_debian10) $(declare -f le_debian11) $(declare -f ge_ubuntu20) $(declare -f le_ubuntu18) $(declare -f ge_rocky9) $(declare -f os_vercat) # Added os_vercat as it's used by set_nv_urls/set_cuda_runfile_url # Define _shortname (needed by install_spark_rapids -> cache_fetched_package and others) readonly _shortname="\$(os_id)\$(os_version|perl -pe 's/(\\d+).*/\$1/')" # Define shortname and nccl_shortname (needed by set_nv_urls) if is_ubuntu22 ; then nccl_shortname="ubuntu2004" shortname="\$(os_id)\$(os_vercat)" elif ge_rocky9 ; then nccl_shortname="rhel8" shortname="rhel9" elif is_rocky ; then shortname="\$(os_id | sed -e 's/rocky/rhel/')\$(os_vercat)" nccl_shortname="\${shortname}" else shortname="\$(os_id)\$(os_vercat)" nccl_shortname="\${shortname}" fi readonly shortname nccl_shortname # Define prepare_gpu_env and its dependencies 
$(declare -f prepare_gpu_env) $(declare -f set_cuda_version) $(declare -f set_driver_version) $(declare -f set_nv_urls) $(declare -f set_cuda_runfile_url) $(declare -f set_cudnn_version) $(declare -f set_cudnn_tarball_url) $(declare -f is_cuda11) $(declare -f is_cuda12) $(declare -f le_cuda11) $(declare -f le_cuda12) $(declare -f ge_cuda11) $(declare -f ge_cuda12) $(declare -f is_cudnn8) $(declare -f is_cudnn9) # Define DATAPROC_IMAGE_VERSION (re-evaluate) SPARK_VERSION="\$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1 || echo "0.0")" if version_lt "\${SPARK_VERSION}" "2.5" ; then DATAPROC_IMAGE_VERSION="1.5" elif version_lt "\${SPARK_VERSION}" "3.2" ; then DATAPROC_IMAGE_VERSION="2.0" elif version_lt "\${SPARK_VERSION}" "3.4" ; then DATAPROC_IMAGE_VERSION="2.1" elif version_lt "\${SPARK_VERSION}" "3.6" ; then if [[ -f /etc/environment ]] ; then eval "\$(grep '^DATAPROC_IMAGE_VERSION' /etc/environment)" || DATAPROC_IMAGE_VERSION="2.2" else DATAPROC_IMAGE_VERSION="2.2" fi else DATAPROC_IMAGE_VERSION="2.3" ; fi # Default to latest known version readonly DATAPROC_IMAGE_VERSION # Define set_hadoop_property $(declare -f set_hadoop_property) # --- Include definitions of functions called by the config logic --- $(declare -f configure_yarn_resources) $(declare -f configure_yarn_nodemanager) $(declare -f install_spark_rapids) $(declare -f configure_gpu_script) $(declare -f configure_gpu_isolation) $(declare -f configure_gpu_exclusive_mode) $(declare -f fetch_mig_scripts) $(declare -f cache_fetched_package) $(declare -f execute_with_retries) # --- Define gsutil/gcloud commands and curl args --- gsutil_cmd="gcloud storage" gsutil_stat_cmd="gcloud storage objects describe" gcloud_sdk_version="\$(gcloud --version | awk -F'SDK ' '/Google Cloud SDK/ {print \$2}' || echo '0.0.0')" if version_lt "\${gcloud_sdk_version}" "402.0.0" ; then gsutil_cmd="gsutil -o GSUtil:check_hashes=never" gsutil_stat_cmd="gsutil stat" fi 
# Entry point: install the GPU software stack when an NVIDIA device is
# present, then generate the deferred Hadoop/Spark configuration script and
# either enable it for first boot or run it immediately.
#
# Globals read:    IS_MIG_ENABLED, CUDNN_VERSION, INCLUDE_PYTORCH,
#                  INSTALL_GPU_AGENT, CUDA_VERSION, ROLE,
#                  IS_CUSTOM_IMAGE_BUILD.
# Globals written: migquery_result, NUM_MIG_GPUS, IS_MIG_ENABLED, ADDRS,
#                  MIG_GPU_LIST, NVIDIA_SMI_PATH, MIG_MAJOR_CAPS.
# Exits with status 1 when the generated config script is missing or not
# executable in init-action mode.
function main() {
  # Perform installations (these are generally safe during image build)
  if (lspci | grep -q NVIDIA); then

    # Check MIG status early, primarily for driver installation logic
    migquery_result="$(nvsmi --query-gpu=mig.mode.current --format=csv,noheader || echo '[N/A]')" # Use || for safety
    if [[ "${migquery_result}" == "[N/A]" ]] ; then migquery_result="" ; fi
    # The unquoted expansion joins all lines into one, so this count is
    # always 1 and the test below reduces to "some GPU reports Enabled".
    # That effective behaviour is kept unchanged here.
    NUM_MIG_GPUS="$(echo ${migquery_result} | uniq | wc -l)"

    if [[ "${NUM_MIG_GPUS}" -gt 0 ]] ; then
      if [[ "${NUM_MIG_GPUS}" -eq "1" ]]; then
        if (echo "${migquery_result}" | grep Enabled); then
          IS_MIG_ENABLED=1
          # Fetch MIG scripts early if needed by driver install/check
          if [[ ! -d "/usr/local/yarn-mig-scripts" ]]; then fetch_mig_scripts || echo "WARN: Failed to fetch MIG scripts." >&2; fi
        fi
      fi
    fi

    # Install core components if MIG is not already enabled (MIG setup implies drivers exist)
    # Default to 0 so that an unset IS_MIG_ENABLED cannot abort under `set -u`.
    if [[ "${IS_MIG_ENABLED:-0}" -eq 0 ]]; then
      install_nvidia_gpu_driver
      install_nvidia_container_toolkit
      install_cuda
      load_kernel_module # Load modules after driver install

      if [[ -n ${CUDNN_VERSION} ]]; then
        install_nvidia_nccl
        install_nvidia_cudnn
      fi
      case "${INCLUDE_PYTORCH^^}" in
        "1" | "YES" | "TRUE" ) install_pytorch ;;
      esac
      #Install GPU metrics collection in Stackdriver if needed
      if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then
        #install_ops_agent
        install_gpu_agent
        echo 'GPU metrics agent successfully deployed.'
      else
        echo 'GPU metrics agent will not be installed.'
      fi

      # for some use cases, the kernel module needs to be removed before first use of nvidia-smi
      for module in nvidia_uvm nvidia_drm nvidia_modeset nvidia ; do
        rmmod ${module} > /dev/null 2>&1 || echo "unable to rmmod ${module}"
      done

      if test -n "$(nvsmi -L)" ; then
        # cache the result of the gpu query
        # The map needs an input list; <STDIN> supplies the piped index lines.
        ADDRS=$(nvsmi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}<STDIN>))')
        echo "{\"name\": \"gpu\", \"addresses\":[$ADDRS]}" | tee "/var/run/nvidia-gpu-index.txt"
        chmod a+r "/var/run/nvidia-gpu-index.txt"
      fi
      MIG_GPU_LIST="$(nvsmi -L | grep -E '(MIG|[PVAH]100)' || echo -n "")"
      NUM_MIG_GPUS="$(test -n "${MIG_GPU_LIST}" && echo "${MIG_GPU_LIST}" | wc -l || echo "0")"
      if [[ "${NUM_MIG_GPUS}" -gt "0" ]] ; then
        # enable MIG on every GPU
        # Feed the list to awk line by line.  Unquoted, the list collapsed to
        # a single line and only the first GPU id was extracted.
        for GPU_ID in $(awk -F'[: ]' '{print $2}' <<< "${MIG_GPU_LIST}") ; do
          if version_le "${CUDA_VERSION}" "11.6" ; then
            nvsmi -i "${GPU_ID}" --multi-instance-gpu=1
          else
            nvsmi -i "${GPU_ID}" --multi-instance-gpu 1
          fi
        done

        NVIDIA_SMI_PATH='/usr/local/yarn-mig-scripts/'
        MIG_MAJOR_CAPS="$(grep nvidia-caps /proc/devices | cut -d ' ' -f 1)"
        fetch_mig_scripts
      else
        configure_gpu_exclusive_mode
      fi
    fi

    configure_yarn_nodemanager
    install_spark_rapids
    configure_gpu_script
    configure_gpu_isolation
  elif [[ "${ROLE}" == "Master" ]]; then
    # Master node without GPU detected.
    :
  else
    # Worker node without GPU detected.
    :
  fi # End GPU detection

  # --- Generate Config Script and Service File ---
  # This happens in both modes now
  create_deferred_config_files

  # --- Apply or Defer Configuration ---
  if [[ "${IS_CUSTOM_IMAGE_BUILD}" == "true" ]]; then
    # Enable the systemd service for first boot
    systemctl enable "dataproc-gpu-config.service"
  else
    # Running as a standard init action: execute the generated script immediately
    local -r config_script_path="/usr/local/sbin/apply-dataproc-gpu-config.sh"
    if [[ -x "${config_script_path}" ]]; then
      bash -x "${config_script_path}"
    else
      echo "ERROR: Generated config script ${config_script_path} not found or not executable."
      exit 1
    fi
    # The config script handles its own cleanup and service disabling on success
  fi
  # --- End Apply or Defer ---
}

# Fetch a package through a GCS cache.
#
# Arguments: $1 - upstream URL
#            $2 - GCS object path used as the cache
#            $3 - local destination file
# Globals read: gsutil_stat_cmd, gsutil_cmd, curl_retry_args (all expanded
#               unquoted on purpose: each holds a command plus its options).
# When the GCS object exists it is copied to $3; otherwise $1 is downloaded
# to $3 and then uploaded to $2.
function cache_fetched_package() {
  local src_url="$1"
  local gcs_fn="$2"
  local local_fn="$3"

  if ${gsutil_stat_cmd} "${gcs_fn}" 2>&1 ; then
    execute_with_retries ${gsutil_cmd} cp "${gcs_fn}" "${local_fn}"
  else
    time ( curl ${curl_retry_args} "${src_url}" -o "${local_fn}" && \
           execute_with_retries ${gsutil_cmd} cp "${local_fn}" "${gcs_fn}" ; )
  fi
}
grep -q signed-by "${dataproc_repo_file}" ; then region="$(get_metadata_value zone | perl -p -e 's:.*/:: ; s:-[a-z]+$::')" local regional_bigtop_repo_uri regional_bigtop_repo_uri=$(cat ${dataproc_repo_file} | sed -E "s#/dataproc-bigtop-repo(-dev)?/#/goog-dataproc-bigtop-repo\\1-${region}/#" | grep -E "deb .*goog-dataproc-bigtop-repo(-dev)?-${region}.* dataproc contrib" | cut -d ' ' -f 2 | head -1) if [[ "${regional_bigtop_repo_uri}" == */ ]]; then local -r bigtop_key_uri="${regional_bigtop_repo_uri}archive.key" else local -r bigtop_key_uri="${regional_bigtop_repo_uri}/archive.key" fi local -r bigtop_kr_path="/usr/share/keyrings/bigtop-keyring.gpg" rm -f "${bigtop_kr_path}" curl ${curl_retry_args} \ "${bigtop_key_uri}" | gpg --dearmor -o "${bigtop_kr_path}" sed -i -e "s:deb https:deb [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}" sed -i -e "s:deb-src https:deb-src [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}" fi # # adoptium # # https://adoptium.net/installation/linux/#_deb_installation_on_debian_or_ubuntu local -r key_url="https://packages.adoptium.net/artifactory/api/gpg/key/public" local -r adoptium_kr_path="/usr/share/keyrings/adoptium.gpg" rm -f "${adoptium_kr_path}" local -r old_adoptium_list="/etc/apt/sources.list.d/adoptopenjdk.list" if test -f "${old_adoptium_list}" ; then rm -f "${old_adoptium_list}" fi for keyid in "0x3b04d753c9050d9a5d343f39843c48a565f8f04b" "0x35baa0b33e9eb396f59ca838c0ba5ce6dc6315a3" ; do curl ${curl_retry_args} "https://keyserver.ubuntu.com/pks/lookup?op=get&search=${keyid}" \ | gpg --import --no-default-keyring --keyring "${adoptium_kr_path}" done echo "deb [signed-by=${adoptium_kr_path}] https://packages.adoptium.net/artifactory/deb/ $(os_codename) main" \ > /etc/apt/sources.list.d/adoptium.list # # docker # local docker_kr_path="/usr/share/keyrings/docker-keyring.gpg" local docker_repo_file="/etc/apt/sources.list.d/docker.list" local -r docker_key_url="https://download.docker.com/linux/$(os_id)/gpg" 
rm -f "${docker_kr_path}" curl ${curl_retry_args} "${docker_key_url}" \ | gpg --import --no-default-keyring --keyring "${docker_kr_path}" echo "deb [signed-by=${docker_kr_path}] https://download.docker.com/linux/$(os_id) $(os_codename) stable" \ > ${docker_repo_file} # # google cloud + logging/monitoring # local gcloud_kr_path="/usr/share/keyrings/cloud.google.gpg" if ls /etc/apt/sources.list.d/google-clou*.list ; then rm -f "${gcloud_kr_path}" curl ${curl_retry_args} https://packages.cloud.google.com/apt/doc/apt-key.gpg \ | gpg --import --no-default-keyring --keyring "${gcloud_kr_path}" for list in google-cloud google-cloud-logging google-cloud-monitoring ; do list_file="/etc/apt/sources.list.d/${list}.list" if [[ -f "${list_file}" ]]; then sed -i -e "s:deb https:deb [signed-by=${gcloud_kr_path}] https:g" "${list_file}" fi done fi # # cran-r # if [[ -f /etc/apt/sources.list.d/cran-r.list ]]; then local cranr_kr_path="/usr/share/keyrings/cran-r.gpg" rm -f "${cranr_kr_path}" for keyid in "0x95c0faf38db3ccad0c080a7bdc78b2ddeabc47b7" "0xe298a3a825c0d65dfd57cbb651716619e084dab9" ; do curl ${curl_retry_args} "https://keyserver.ubuntu.com/pks/lookup?op=get&search=${keyid}" \ | gpg --import --no-default-keyring --keyring "${cranr_kr_path}" done sed -i -e "s:deb http:deb [signed-by=${cranr_kr_path}] http:g" /etc/apt/sources.list.d/cran-r.list fi # # mysql # if [[ -f /etc/apt/sources.list.d/mysql.list ]]; then rm -f /usr/share/keyrings/mysql.gpg curl ${curl_retry_args} 'https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xBCA43417C3B485DD128EC6D4B7B3B788A8D3785C' | \ gpg --dearmor -o /usr/share/keyrings/mysql.gpg sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/mysql.gpg] https:g' /etc/apt/sources.list.d/mysql.list fi if [[ -f /etc/apt/trusted.gpg ]] ; then mv /etc/apt/trusted.gpg /etc/apt/old-trusted.gpg ; fi } function exit_handler() { # Purge private key material until next grant clear_dkms_key # clean up incomplete build indicators if test -n 
"${building_file}" ; then if ${gsutil_stat_cmd} "${building_file}" ; then ${gsutil_cmd} rm "${building_file}" || true ; fi fi set +e # Allow cleanup commands to fail without exiting script echo "Exit handler invoked" # Clear pip cache # TODO: make this conditional on which OSs have pip without cache purge pip cache purge || echo "unable to purge pip cache" # If system memory was sufficient to mount memory-backed filesystems if [[ "${tmpdir}" == "/mnt/shm" ]] ; then # remove the tmpfs pip cache-dir pip config unset global.cache-dir || echo "unable to unset global pip cache" # Clean up shared memory mounts for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm /tmp /var/cudnn-local ; do if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then umount -f ${shmdir} fi done # restart services stopped during preparation stage # systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/' fi if is_debuntu ; then # Clean up OS package cache apt-get -y -qq clean apt-get -y -qq -o DPkg::Lock::Timeout=60 autoremove # re-hold systemd package if ge_debian12 ; then apt-mark hold systemd libsystemd0 ; fi hold_nvidia_packages else dnf clean all fi # print disk usage statistics for large components if is_ubuntu ; then du -hs \ /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \ /usr/lib \ /opt/nvidia/* \ /opt/conda/miniconda3 2>/dev/null | sort -h elif is_debian ; then du -x -hs \ /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu,} \ /var/lib/{docker,mysql,} \ /opt/nvidia/* \ /opt/{conda,google-cloud-ops-agent,install-nvidia,} \ /usr/bin \ /usr \ /var \ / 2>/dev/null | sort -h else # Rocky du -hs \ /var/lib/docker \ /usr/lib/{pig,hive,hadoop,firmware,jvm,spark,atlas,} \ /usr/lib64/google-cloud-sdk \ /opt/nvidia/* \ /opt/conda/miniconda3 2>/dev/null | sort -h fi # Process disk usage logs from installation period rm -f /run/keep-running-df 
sync sleep 5.01s # compute maximum size of disk during installation # Log file contains logs like the following (minus the preceeding #): #Filesystem 1K-blocks Used Available Use% Mounted on #/dev/vda2 7096908 2611344 4182932 39% / df / | tee -a "/run/disk-usage.log" perl -e '($first, @samples) = grep { m:^/: } ; unshift(@samples,$first); $final=$samples[-1]; ($starting)=(split(/\s+/,$first))[2] =~ /^(\d+)/; ($ending)=(split(/\s+/,$final))[2] =~ /^(\d+)/; @siz=( sort { $a <= $b } map { (split)[2] =~ /^(\d+)/ } @samples ); $max=$siz[0]; $min=$siz[-1]; $inc=$max-$starting; print( " samples-taken: ", scalar @siz, $/, "starting-disk-used: $starting", $/, " ending-disk-used: $ending", $/, " maximum-disk-used: $max", $/, " minimum-disk-used: $min", $/, " increased-by: $inc", $/ )' < "/run/disk-usage.log" echo "exit_handler has completed" # zero free disk space (only if creating image) if [[ "${IS_CUSTOM_IMAGE_BUILD}" == "true" ]]; then dd if=/dev/zero of=/zero status=progress || true sync sleep 3s rm -f /zero || true fi return 0 } function set_proxy(){ METADATA_HTTP_PROXY="$(get_metadata_attribute http-proxy '')" if [[ -z "${METADATA_HTTP_PROXY}" ]] ; then return ; fi export http_proxy="${METADATA_HTTP_PROXY}" export https_proxy="${METADATA_HTTP_PROXY}" export HTTP_PROXY="${METADATA_HTTP_PROXY}" export HTTPS_PROXY="${METADATA_HTTP_PROXY}" no_proxy="localhost,127.0.0.0/8,::1,metadata.google.internal,169.254.169.254" local no_proxy_svc for no_proxy_svc in compute secretmanager dns servicedirectory logging \ bigquery composer pubsub bigquerydatatransfer dataflow \ storage datafusion ; do no_proxy="${no_proxy},${no_proxy_svc}.googleapis.com" done export NO_PROXY="${no_proxy}" } function mount_ramdisk(){ local free_mem free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)" if [[ ${free_mem} -lt 20500000 ]]; then return 0 ; fi # Write to a ramdisk instead of churning the persistent disk tmpdir="/mnt/shm" mkdir -p "${tmpdir}/pkgs_dirs" mount -t tmpfs tmpfs "${tmpdir}" # 
Download conda packages to tmpfs if [[ -f /opt/conda/miniconda3/bin/conda ]] ; then /opt/conda/miniconda3/bin/conda config --add pkgs_dirs "${tmpdir}" fi # Clear pip cache # TODO: make this conditional on which OSs have pip without cache purge pip cache purge || echo "unable to purge pip cache" # Download pip packages to tmpfs pip config set global.cache-dir "${tmpdir}" || echo "unable to set global.cache-dir" # Download OS packages to tmpfs if is_debuntu ; then mount -t tmpfs tmpfs /var/cache/apt/archives else mount -t tmpfs tmpfs /var/cache/dnf fi } function harden_sshd_config() { # disable sha1 and md5 use in kex and kex-gss features declare -A feature_map=(["kex"]="kexalgorithms") if ( is_rocky || version_ge "${DATAPROC_IMAGE_VERSION}" "2.1" ) ; then feature_map["kex-gss"]="gssapikexalgorithms" fi for ftr in "${!feature_map[@]}" ; do local feature=${feature_map[$ftr]} local sshd_config_line sshd_config_line="${feature} $( (sshd -T | awk "/^${feature} / {print \$2}" | sed -e 's/,/\n/g'; ssh -Q "${ftr}" ) \ | sort -u | grep -v -ie sha1 -e md5 | paste -sd "," -)" grep -iv "^${feature} " /etc/ssh/sshd_config > /tmp/sshd_config_new echo "$sshd_config_line" >> /tmp/sshd_config_new # TODO: test whether sshd will reload with this change before mv mv -f /tmp/sshd_config_new /etc/ssh/sshd_config done local svc=ssh if is_rocky ; then svc="sshd" ; fi systemctl reload "${svc}" } function prepare_to_install(){ readonly uname_r=$(uname -r) # Verify OS compatability and Secure boot state check_os check_secure_boot # --- Detect Image Build Context --- # Use 'initialization-actions' as the default name for clarity INVOCATION_TYPE="$(get_metadata_attribute invocation-type "initialization-actions")" if [[ "${INVOCATION_TYPE}" == "custom-images" ]]; then IS_CUSTOM_IMAGE_BUILD="true" # echo "Detected custom image build context (invocation-type=custom-images). Configuration will be deferred." 
# Keep silent else IS_CUSTOM_IMAGE_BUILD="false" # Ensure it's explicitly false otherwise # echo "Running in initialization action mode (invocation-type=${INVOCATION_TYPE})." # Keep silent fi # With the 402.0.0 release of gcloud sdk, `gcloud storage` can be # used as a more performant replacement for `gsutil` gsutil_cmd="gcloud storage" gsutil_stat_cmd="gcloud storage objects describe" gcloud_sdk_version="$(gcloud --version | awk -F'SDK ' '/Google Cloud SDK/ {print $2}')" if version_lt "${gcloud_sdk_version}" "402.0.0" ; then gsutil_cmd="gsutil -o GSUtil:check_hashes=never" gsutil_stat_cmd="gsutil stat" fi curl_retry_args="-fsSL --retry-connrefused --retry 10 --retry-max-time 30" # Setup temporary directories (potentially on RAM disk) tmpdir=/tmp/ # Default mount_ramdisk # Updates tmpdir if successful install_log="${tmpdir}/install.log" # Set install log path based on final tmpdir # Prepare GPU environment variables (versions, URLs, counts) prepare_gpu_env workdir=/opt/install-dpgce # Set GCS bucket for caching temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)" readonly temp_bucket readonly pkg_bucket="gs://${temp_bucket}/dpgce-packages" readonly bdcfg="/usr/local/bin/bdconfig" export DEBIAN_FRONTEND=noninteractive mkdir -p "${workdir}/complete" trap exit_handler EXIT set_proxy is_complete prepare.common && return harden_sshd_config if is_debuntu ; then repair_old_backports clean_up_sources_lists apt-get update -qq --allow-releaseinfo-change apt-get -y clean apt-get -o DPkg::Lock::Timeout=60 -y autoremove if ge_debian12 ; then apt-mark unhold systemd libsystemd0 ; fi if is_ubuntu ; then # Wait for gcloud to be available on Ubuntu while ! 
command -v gcloud ; do sleep 5s ; done fi else # Rocky dnf clean all fi # zero free disk space (only if creating image) if [[ "${IS_CUSTOM_IMAGE_BUILD}" == "true" ]]; then ( set +e time dd if=/dev/zero of=/zero status=none ; sync ; sleep 3s ; rm -f /zero ) fi install_dependencies # Monitor disk usage in a screen session df / > "/run/disk-usage.log" touch "/run/keep-running-df" screen -d -m -LUS keep-running-df \ bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done" mark_complete prepare.common } function check_os() { if is_debian && ( ! is_debian10 && ! is_debian11 && ! is_debian12 ) ; then echo "Error: The Debian version ($(os_version)) is not supported. Please use a compatible Debian version." exit 1 elif is_ubuntu && ( ! is_ubuntu18 && ! is_ubuntu20 && ! is_ubuntu22 ) ; then echo "Error: The Ubuntu version ($(os_version)) is not supported. Please use a compatible Ubuntu version." exit 1 elif is_rocky && ( ! is_rocky8 && ! is_rocky9 ) ; then echo "Error: The Rocky Linux version ($(os_version)) is not supported. Please use a compatible Rocky Linux version." exit 1 fi SPARK_VERSION="$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)" readonly SPARK_VERSION if version_lt "${SPARK_VERSION}" "2.4" || \ version_ge "${SPARK_VERSION}" "4.0" ; then echo "Error: Your Spark version (${SPARK_VERSION}) is not supported. Please use a supported version." exit 1 fi # Detect dataproc image version if (! 
test -v DATAPROC_IMAGE_VERSION || [[ -z "${DATAPROC_IMAGE_VERSION}" ]]) ; then
    if test -v DATAPROC_VERSION ; then
      DATAPROC_IMAGE_VERSION="${DATAPROC_VERSION}"
    else
      # When building custom-images, neither of the above variables
      # are defined and we need to make a reasonable guess
      if version_lt "${SPARK_VERSION}" "2.5" ; then DATAPROC_IMAGE_VERSION="1.5"
      elif version_lt "${SPARK_VERSION}" "3.2" ; then DATAPROC_IMAGE_VERSION="2.0"
      elif version_lt "${SPARK_VERSION}" "3.4" ; then DATAPROC_IMAGE_VERSION="2.1"
      elif version_lt "${SPARK_VERSION}" "3.6" ; then
        if [[ -f /etc/environment ]] ; then
          # eval of an empty grep result succeeds, so a bare "|| fallback"
          # never fires when the variable is absent from the file; discard a
          # failed eval and apply the default explicitly below.
          eval "$(grep '^DATAPROC_IMAGE_VERSION' /etc/environment)" || DATAPROC_IMAGE_VERSION=""
        fi
        DATAPROC_IMAGE_VERSION="${DATAPROC_IMAGE_VERSION:-2.2}"
      else DATAPROC_IMAGE_VERSION="2.3" ; fi # Default to latest known version
    fi
  fi
}

#
# Generate repo file under /etc/apt/sources.list.d/
#
# Arguments:
#   $1 - repo name (used for the default keyring and list file names)
#   $2 - signing key URL (not used here)
#   $3 - repo data: "http(s)://host/path/uri argument0 .. argumentN"
#   $4 - "yes" (default) to also emit a deb-src line
#   $5 - keyring path (default /usr/share/keyrings/<name>.gpg)
#   $6 - list file path (default /etc/apt/sources.list.d/<name>.list)
#
function apt_add_repo() {
  local -r repo_name="$1"
  local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. argumentN"
  local -r include_src="${4:-yes}"
  local -r kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}"
  local -r repo_path="${6:-/etc/apt/sources.list.d/${repo_name}.list}"

  echo "deb [signed-by=${kr_path}] ${repo_data}" > "${repo_path}"
  if [[ "${include_src}" == "yes" ]] ; then
    echo "deb-src [signed-by=${kr_path}] ${repo_data}" >> "${repo_path}"
  fi

  apt-get update -qq
}

#
# Generate repo file under /etc/yum.repos.d/
#
# Arguments mirror apt_add_repo; $3 is the URL of a ready-made .repo file,
# which is downloaded to the repo path as-is.
#
function dnf_add_repo() {
  local -r repo_name="$1"
  local -r repo_url="$3" # "http(s)://host/path/filename.repo"
  local -r kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}"
  local -r repo_path="${6:-/etc/yum.repos.d/${repo_name}.repo}"

  curl ${curl_retry_args} "${repo_url}" \
    | dd of="${repo_path}" status=progress
}

#
# Keyrings default to
# /usr/share/keyrings/${repo_name}.gpg (debian/ubuntu) or
# /etc/pki/rpm-gpg/${repo_name}.gpg    (rocky/RHEL)
#
# Imports the signing key, then delegates to apt_add_repo or dnf_add_repo
# with the same positional arguments.
#
function os_add_repo() {
  local -r repo_name="$1"
  local -r signing_key_url="$2"
  local -r repo_data="$3" # "http(s)://host/path/uri argument0 .. argumentN"
  local kr_path
  if is_debuntu ; then kr_path="${5:-/usr/share/keyrings/${repo_name}.gpg}"
                  else kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}" ; fi

  mkdir -p "$(dirname "${kr_path}")"

  curl ${curl_retry_args} "${signing_key_url}" \
    | gpg --import --no-default-keyring --keyring "${kr_path}"

  if is_debuntu ; then apt_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}"
                  else dnf_add_repo "${repo_name}" "${signing_key_url}" "${repo_data}" "${4:-yes}" "${kr_path}" "${6:-}" ; fi
}

# OS id followed by the leading digits of the OS version
readonly _shortname="$(os_id)$(os_version|perl -pe 's/(\d+).*/$1/')"

#
# Install the Spark RAPIDS and XGBoost jars into the Spark jars directory.
# Only runs when RAPIDS_RUNTIME is "SPARK".  Versions come from the
# `spark-rapids-version` / `xgboost-version` metadata attributes, falling back
# to defaults selected by DATAPROC_IMAGE_VERSION.
#
function install_spark_rapids() {
  if [[ "${RAPIDS_RUNTIME}" != "SPARK" ]]; then return ; fi

  # Update SPARK RAPIDS config
  local DEFAULT_SPARK_RAPIDS_VERSION
  DEFAULT_SPARK_RAPIDS_VERSION="24.08.1"
  if version_ge "${DATAPROC_IMAGE_VERSION}" "2.2" ; then
    DEFAULT_SPARK_RAPIDS_VERSION="25.02.1"
  fi
  local DEFAULT_XGBOOST_VERSION="1.7.6" # 2.1.3

  # https://mvnrepository.com/artifact/ml.dmlc/xgboost4j-spark-gpu
  local -r scala_ver="2.12"

  if [[ "${DATAPROC_IMAGE_VERSION}" == "2.0" ]] ; then
    DEFAULT_SPARK_RAPIDS_VERSION="23.08.2" # Final release to support spark 3.1.3
  fi

  readonly SPARK_RAPIDS_VERSION=$(get_metadata_attribute 'spark-rapids-version' "${DEFAULT_SPARK_RAPIDS_VERSION}")
  readonly XGBOOST_VERSION=$(get_metadata_attribute 'xgboost-version' "${DEFAULT_XGBOOST_VERSION}")

  local -r rapids_repo_url='https://repo1.maven.org/maven2/ai/rapids'
  local -r nvidia_repo_url='https://repo1.maven.org/maven2/com/nvidia'
  local -r dmlc_repo_url='https://repo.maven.apache.org/maven2/ml/dmlc'

  local jar_basename
  local spark_jars_dir="/usr/lib/spark/jars"
  mkdir -p "${spark_jars_dir}"

  jar_basename="xgboost4j-spark-gpu_${scala_ver}-${XGBOOST_VERSION}.jar"
  cache_fetched_package "${dmlc_repo_url}/xgboost4j-spark-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \
                        "${pkg_bucket}/xgboost4j-spark-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \
                        "${spark_jars_dir}/${jar_basename}"

  jar_basename="xgboost4j-gpu_${scala_ver}-${XGBOOST_VERSION}.jar"
  cache_fetched_package "${dmlc_repo_url}/xgboost4j-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \
                        "${pkg_bucket}/xgboost4j-gpu_${scala_ver}/${XGBOOST_VERSION}/${jar_basename}" \
                        "${spark_jars_dir}/${jar_basename}"

  jar_basename="rapids-4-spark_${scala_ver}-${SPARK_RAPIDS_VERSION}.jar"
  cache_fetched_package "${nvidia_repo_url}/rapids-4-spark_${scala_ver}/${SPARK_RAPIDS_VERSION}/${jar_basename}" \
                        "${pkg_bucket}/rapids-4-spark_${scala_ver}/${SPARK_RAPIDS_VERSION}/${jar_basename}" \
                        "${spark_jars_dir}/${jar_basename}"
}

# --- Script Entry Point ---
prepare_to_install # Run preparation steps first
main # Call main logic
====== Filename: ./examples/secure-boot/cjac.sh ======
#!/bin/bash
set -e
set -x
PROJECT_ID=cjac-2021-00
CLUSTER_NAME="cluster-1718310842"
my_bucket="kerberos-bucket-000"
custom_image_zone="us-west4-a"
disk_size_gb="50" # greater than or equal to 30

SA_NAME="sa-${CLUSTER_NAME}"
GSA="${SA_NAME}@${PROJECT_ID}.iam.gserviceaccount.com"

gcloud config set project "${PROJECT_ID}"
gcloud auth login

# Keep any previous key material out of the way, then create or fetch the pair.
# The helper prints name=value lines on stdout, which are evaluated here.
if [[ -d tls ]] ; then mv tls "tls-$(date +%s)" ; fi
eval "$(bash examples/secure-boot/create-key-pair.sh)"
metadata="public_secret_name=${public_secret_name}"
metadata="${metadata},private_secret_name=${private_secret_name}"
metadata="${metadata},secret_project=${secret_project}"
metadata="${metadata},secret_version=${secret_version}"

# Grant the service account access to list secrets for the project
gcloud projects add-iam-policy-binding "${PROJECT_ID}" \
  --member="serviceAccount:${GSA}" \
  --role="roles/secretmanager.viewer"

# grant service account permission to access the private secret
gcloud secrets add-iam-policy-binding "${private_secret_name}" \
  --member="serviceAccount:${GSA}" \
  --role="roles/secretmanager.secretAccessor"

# grant service account permission to access the public secret
gcloud secrets add-iam-policy-binding "${public_secret_name}" \
  --member="serviceAccount:${GSA}" \
--role="roles/secretmanager.secretAccessor"

dataproc_version=2.2-debian12

# printf interprets \n; bash's builtin echo without -e wrote it literally,
# leaving empty.sh as a single comment line.
printf '#!/bin/bash\necho no op\n' > empty.sh
#customization_script=empty.sh
customization_script="examples/secure-boot/install_gpu_driver.sh"
image_name="cuda-12-4-${dataproc_version/\./-}-$(date +%F-%H-%M)"

python generate_custom_image.py \
    --accelerator "type=nvidia-tesla-t4" \
    --image-name "${image_name}" \
    --dataproc-version "${dataproc_version}" \
    --trusted-cert "tls/db.der" \
    --customization-script "${customization_script}" \
    --service-account "${GSA}" \
    --metadata "${metadata}" \
    --zone "${custom_image_zone}" \
    --disk-size "${disk_size_gb}" \
    --no-smoke-test \
    --gcs-bucket "${my_bucket}" \
    --shutdown-instance-timer-sec=30

set +x
====== Filename: ./examples/secure-boot/install-nvidia-driver-debian12.sh ======
#!/bin/bash
#
# Install the NVIDIA open kernel driver (DKMS, signed with a key pair fetched
# from Secret Manager), the container toolkit and the CUDA toolkit on Debian 12.
#
set -xeu

readonly install_dir="/opt/install-nvidia-driver"
mkdir -p "${install_dir}"
cd "${install_dir}"

nv_driver_ver="550.54.14"
nv_cuda_ver="12.4.0"

# read secret name, project, version
sig_pub_secret_name="$(/usr/share/google/get_metadata_value attributes/public_secret_name)"
sig_priv_secret_name="$(/usr/share/google/get_metadata_value attributes/private_secret_name)"
sig_secret_project="$(/usr/share/google/get_metadata_value attributes/secret_project)"
sig_secret_version="$(/usr/share/google/get_metadata_value attributes/secret_version)"

readonly expected_modulus_md5sum="bd40cf5905c7bba4225d330136fdbfd3"

# Let mktemp create the directory itself: the name cannot be raced and the
# directory is created private to its owner, which matters because it holds
# the private signing key.
mkdir -p /run/tmp
ca_tmpdir="$(mktemp -d -p /run/tmp ca_dir-XXXX)"

# The Microsoft Corporation UEFI CA 2011
ms_uefi_ca="${ca_tmpdir}/MicCorUEFCA2011_2011-06-27.crt"
if [[ !
-f "${ms_uefi_ca}" ]]; then
  curl -L -o "${ms_uefi_ca}" "https://go.microsoft.com/fwlink/p/?linkid=321194"
fi

# Write private material to volatile storage
gcloud secrets versions access "${sig_secret_version}" \
       --project="${sig_secret_project}" \
       --secret="${sig_priv_secret_name}" \
    | dd of="${ca_tmpdir}/db.rsa"

# The public certificate is stored base64-encoded in the secret
readonly cacert_der="${ca_tmpdir}/db.der"
gcloud secrets versions access "${sig_secret_version}" \
       --project="${sig_secret_project}" \
       --secret="${sig_pub_secret_name}" \
    | base64 --decode \
    | dd of="${cacert_der}"

mokutil --sb-state

# configure the nvidia-container-toolkit package source
# https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
  | gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \
  | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \
  | tee /etc/apt/sources.list.d/nvidia-container-toolkit.list

# enable non-free and non-free-firmware components, update cache
DEBIAN_SOURCES="/etc/apt/sources.list.d/debian.sources"
COMPONENTS="main contrib non-free non-free-firmware"
sed -i -e "s/Components: .*$/Components: ${COMPONENTS}/" "${DEBIAN_SOURCES}"

apt-get -qq update

# install DKMS
apt-get --no-install-recommends -qq -y install dkms

# Prepare DKMS to use the certificates retrieved from cloud secrets
ln -sf "${ca_tmpdir}/db.rsa" /var/lib/dkms/mok.key
cp "${ca_tmpdir}/db.der" /var/lib/dkms/mok.pub

# install dkms and nvidia support packages
apt-get --no-install-recommends -qq -y install \
  dkms \
  "linux-headers-$(uname -r)" \
  nvidia-container-toolkit \
  nvidia-open-kernel-support \
  nvidia-smi \
  libglvnd0 \
  libcuda1

# install the driver itself
apt-get --no-install-recommends -qq -y install \
  nvidia-open-kernel-dkms

apt-get clean
apt-get autoremove -y

# Install CUDA
cuda_runfile="cuda_${nv_cuda_ver}_${nv_driver_ver}_linux.run"
curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
  "https://developer.download.nvidia.com/compute/cuda/${nv_cuda_ver}/local_installers/${cuda_runfile}" \
  -o cuda.run

bash ./cuda.run --silent --toolkit --no-opengl-libs

rm cuda.run
====== Filename: ./examples/secure-boot/dask.screenrc ======
#
# For debugging, uncomment the following line
#

# screen -L -t monitor 0 /bin/bash

screen -L -t 2.2-debian12 1 /bin/bash -x examples/secure-boot/pre-init.sh 2.2-debian12 examples/secure-boot/dask.sh
screen -L -t 2.1-debian11 2 /bin/bash -x examples/secure-boot/pre-init.sh 2.1-debian11 examples/secure-boot/dask.sh
screen -L -t 2.0-debian10 3 /bin/bash -x examples/secure-boot/pre-init.sh 2.0-debian10 examples/secure-boot/dask.sh
screen -L -t 2.2-ubuntu22 4 /bin/bash -x examples/secure-boot/pre-init.sh 2.2-ubuntu22 examples/secure-boot/dask.sh
screen -L -t 2.1-ubuntu20 5 /bin/bash -x examples/secure-boot/pre-init.sh 2.1-ubuntu20 examples/secure-boot/dask.sh
screen -L -t 2.0-ubuntu18 6 /bin/bash -x examples/secure-boot/pre-init.sh 2.0-ubuntu18 examples/secure-boot/dask.sh
screen -L -t 2.2-rocky9 7 /bin/bash -x examples/secure-boot/pre-init.sh 2.2-rocky9 examples/secure-boot/dask.sh
screen -L -t 2.1-rocky8 8 /bin/bash -x examples/secure-boot/pre-init.sh 2.1-rocky8 examples/secure-boot/dask.sh
screen -L -t 2.0-rocky8 9 /bin/bash -x examples/secure-boot/pre-init.sh 2.0-rocky8 examples/secure-boot/dask.sh
====== Filename: ./examples/secure-boot/create-key-pair.sh ======
#!/bin/bash
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This script creates a key pair and publishes to cloud secrets or
# fetches an already published key pair from cloud secrets
#
# stdout carries only name=value lines (the caller may eval them); all
# diagnostics go to stderr.

set -e

# https://github.com/glevand/secure-boot-utils

# https://cloud.google.com/compute/shielded-vm/docs/creating-shielded-images#adding-shielded-image
# https://cloud.google.com/compute/shielded-vm/docs/creating-shielded-images#generating-security-keys-certificates

# https://wiki.archlinux.org/title/Unified_Extensible_Firmware_Interface/Secure_Boot#Creating_keys

ITERATION=042

CURRENT_PROJECT_ID="$(gcloud config get project)"

if [[ -z "${CURRENT_PROJECT_ID}" ]]; then
    echo 'project is not set. please set with `gcloud config set project ${PROJECT_ID}`' >&2
    # exit statuses are 0-255; use a plain failure code instead of -1
    exit 1
fi
PROJECT_ID="${CURRENT_PROJECT_ID}"

#
# Create, or fetch from Secret Manager, the key pair for one EFI variable.
#
# Arguments:
#   $1 - EFI variable name (used in file and secret names)
#   $2 - common name for a newly generated certificate
# Globals written: CA_KEY_SECRET_NAME, CA_CERT_SECRET_NAME, modulus_md5sum
# Files written under tls/: <name>.rsa, <name>.pem, <name>.der and the
#   *-secret-name.txt / modulus-md5sum.txt bookkeeping files.
#
function create_key () {
    local EFI_VAR_NAME="$1"
    local CN_VAL="$2"
    local PRIVATE_KEY="tls/${EFI_VAR_NAME}.rsa"
    local CACERT="tls/${EFI_VAR_NAME}.pem"
    local CACERT_DER="tls/${EFI_VAR_NAME}.der"
    CA_KEY_SECRET_NAME="efi-${EFI_VAR_NAME}-priv-key-${ITERATION}"
    CA_CERT_SECRET_NAME="efi-${EFI_VAR_NAME}-pub-key-${ITERATION}"
    # If the secrets exist in secret manager, populate the tls/ directory
    if [[ !
-f "${PRIVATE_KEY}" ]] && gcloud secrets describe "${CA_CERT_SECRET_NAME}" --project="${PROJECT_ID}" > /dev/null ; then
        mkdir -p tls

        gcloud secrets versions access "1" \
               --project="${PROJECT_ID}" \
               --secret="${CA_KEY_SECRET_NAME}" \
            | dd of="${PRIVATE_KEY}" status=none
        # The private key must not be readable by other users
        chmod 600 "${PRIVATE_KEY}"

        # The certificate is stored base64-encoded in the secret
        gcloud secrets versions access "1" \
               --project="${PROJECT_ID}" \
               --secret="${CA_CERT_SECRET_NAME}" \
            | base64 --decode \
            | dd of="${CACERT_DER}" status=none

        # Create a PEM-format version of the cert
        openssl x509 \
                -inform DER \
                -in "${CACERT_DER}" \
                -outform PEM \
                -out "${CACERT}"

        MS_UEFI_CA="tls/MicCorUEFCA2011_2011-06-27.crt"
        curl -s -L -o "${MS_UEFI_CA}" 'https://go.microsoft.com/fwlink/p/?linkid=321194'

        echo "${CA_KEY_SECRET_NAME}" > tls/private-key-secret-name.txt
        echo "${CA_CERT_SECRET_NAME}" > tls/public-key-secret-name.txt
        modulus_md5sum="$(openssl rsa -noout -modulus -in "${PRIVATE_KEY}" | openssl md5 | awk '{print $2}' | tee tls/modulus-md5sum.txt)"
        return
    fi

    if [[ -f "${PRIVATE_KEY}" ]]; then
        echo "key already exists. Skipping generation." >&2
        modulus_md5sum="$(cat tls/modulus-md5sum.txt)"
        return
    fi
    mkdir -p tls

    echo "generating '${CN_VAL}' '${CACERT}', '${CACERT_DER}' and '${PRIVATE_KEY}'" >&2
    # Generate new x.509 key and cert
    openssl req \
            -newkey rsa:3072 \
            -nodes \
            -keyout "${PRIVATE_KEY}" \
            -new \
            -x509 \
            -sha256 \
            -days 3650 \
            -subj "/CN=${CN_VAL}/" \
            -out "${CACERT}"
    chmod 600 "${PRIVATE_KEY}"

    # Create a DER-format version of the cert
    openssl x509 \
            -outform DER \
            -in "${CACERT}" \
            -out "${CACERT_DER}"

    # Create a new secret containing private key
    gcloud secrets create "${CA_KEY_SECRET_NAME}" \
           --project="${PROJECT_ID}" \
           --replication-policy="automatic" \
           --data-file="${PRIVATE_KEY}"

    echo "Private key secret name: '${CA_KEY_SECRET_NAME}'" >&2
    echo "${CA_KEY_SECRET_NAME}" > tls/private-key-secret-name.txt

    # Create a new secret containing public key
    base64 < "${CACERT_DER}" > "${CACERT_DER}.base64"
    gcloud secrets create "${CA_CERT_SECRET_NAME}" \
           --project="${PROJECT_ID}" \
           --replication-policy="automatic" \
           --data-file="${CACERT_DER}.base64"

    modulus_md5sum="$(openssl x509 -noout -modulus -in "${CACERT}" | openssl md5 | awk '{print $2}')"
    echo "modulus-md5sum: ${modulus_md5sum}" >&2
    echo "${modulus_md5sum}" > tls/modulus-md5sum.txt
    echo "Public key secret name: '${CA_CERT_SECRET_NAME}'" >&2
    echo "${CA_CERT_SECRET_NAME}" > tls/public-key-secret-name.txt
}

EFI_VAR_NAME=db

create_key "${EFI_VAR_NAME}" "Cloud Dataproc Custom Image CA ${ITERATION}"

# name=value lines consumed by the caller
echo "modulus_md5sum=${modulus_md5sum}"
echo "private_secret_name=${CA_KEY_SECRET_NAME}"
echo "public_secret_name=${CA_CERT_SECRET_NAME}"
echo "secret_project=${PROJECT_ID}"
echo "secret_version=1"
====== Filename: ./examples/secure-boot/pre-init.screenrc ======
#
# For debugging, uncomment the following line
#

# screen -L -t monitor 0 /bin/bash

#screen -L -t 1.5-debian10 1 /bin/bash -x examples/secure-boot/pre-init.sh 1.5-debian10
screen -L -t 2.0-debian10 2 /bin/bash -x examples/secure-boot/pre-init.sh 2.0-debian10
screen -L -t 2.0-rocky8 3
/bin/bash -x examples/secure-boot/pre-init.sh 2.0-rocky8 screen -L -t 2.0-ubuntu18 4 /bin/bash -x examples/secure-boot/pre-init.sh 2.0-ubuntu18 screen -L -t 2.1-debian11 5 /bin/bash -x examples/secure-boot/pre-init.sh 2.1-debian11 screen -L -t 2.1-rocky8 6 /bin/bash -x examples/secure-boot/pre-init.sh 2.1-rocky8 screen -L -t 2.1-ubuntu20 7 /bin/bash -x examples/secure-boot/pre-init.sh 2.1-ubuntu20 #screen -L -t 2.1-ubuntu20-arm 11 /bin/bash -x examples/secure-boot/pre-init.sh 2.1-ubuntu20-arm screen -L -t 2.2-debian12 8 /bin/bash -x examples/secure-boot/pre-init.sh 2.2-debian12 screen -L -t 2.2-rocky9 9 /bin/bash -x examples/secure-boot/pre-init.sh 2.2-rocky9 screen -L -t 2.2-ubuntu22 10 /bin/bash -x examples/secure-boot/pre-init.sh 2.2-ubuntu22 screen -L -t 2.3-debian12 12 /bin/bash -x examples/secure-boot/pre-init.sh 2.3-debian12 screen -L -t 2.3-rocky9 13 /bin/bash -x examples/secure-boot/pre-init.sh 2.3-rocky9 screen -L -t 2.3-ubuntu22 14 /bin/bash -x examples/secure-boot/pre-init.sh 2.3-ubuntu22 #screen -L -t 2.3-ml-ubuntu22 15 /bin/bash -x examples/secure-boot/pre-init.sh 2.3-ml-ubuntu22 ====== Filename: ./examples/secure-boot/env.json.sample ====== { "PROJECT_ID":"example-yyyy-nn", "PURPOSE":"cuda-pre-init", "BUCKET":"my-bucket-name", "IMAGE_VERSION":"2.2-debian12", "ZONE":"us-west4-a" } ====== Filename: ./examples/secure-boot/cloud-sql-proxy.sh ====== #!/bin/bash # Copyright 2016 Google LLC and contributors # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# This init script installs a cloud-sql-proxy on each node in the cluster, and # uses that proxy to expose TCP proxies of one or more CloudSQL instances. # One of these instances is used for the clusters Hive Metastore. # Do not use "set -x" to avoid printing passwords in clear in the logs set -euo pipefail function os_id() ( set +x ; grep '^ID=' /etc/os-release | cut -d= -f2 | xargs ; ) function os_version() ( set +x ; grep '^VERSION_ID=' /etc/os-release | cut -d= -f2 | xargs ; ) function os_codename() ( set +x ; grep '^VERSION_CODENAME=' /etc/os-release | cut -d= -f2 | xargs ; ) function version_ge(){ [[ "$1" = "$(echo -e "$1\n$2"|sort -V|tail -n1)" ]]; } function version_gt(){ [[ "$1" = "$2" ]]&& return 1 || version_ge "$1" "$2";} function version_le(){ [[ "$1" = "$(echo -e "$1\n$2"|sort -V|head -n1)" ]]; } function version_lt(){ [[ "$1" = "$2" ]]&& return 1 || version_le "$1" "$2";} readonly -A supported_os=( ['debian']="10 11 12" ['rocky']="8 9" ['ubuntu']="18.04 20.04 22.04" ) # dynamically define OS version test utility functions if [[ "$(os_id)" == "rocky" ]]; then _os_version=$(os_version | sed -e 's/[^0-9].*$//g') else _os_version="$(os_version)"; fi for os_id_val in 'rocky' 'ubuntu' 'debian' ; do eval "function is_${os_id_val}() ( set +x ; [[ \"$(os_id)\" == '${os_id_val}' ]] ; )" for osver in $(echo "${supported_os["${os_id_val}"]}") ; do eval "function is_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && [[ \"${_os_version}\" == \"${osver}\" ]] ; )" eval "function ge_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_ge \"${_os_version}\" \"${osver}\" ; )" eval "function le_${os_id_val}${osver%%.*}() ( set +x ; is_${os_id_val} && version_le \"${_os_version}\" \"${osver}\" ; )" done done function is_debuntu() ( set +x ; is_debian || is_ubuntu ; ) function print_metadata_value() { local readonly tmpfile=$(mktemp) http_code=$(curl -f "${1}" -H "Metadata-Flavor: Google" -w "%{http_code}" \ -s -o ${tmpfile} 2>/dev/null) local readonly 
# Print a value from the GCE metadata server.
#
# Looks up instance/<varname> first; if that lookup fails, retries with
# project/<varname>.
#   $1 - metadata path relative to instance/ or project/
#        (for example "attributes/foo" or "zone")
# Outputs: whatever print_metadata_value_if_exists prints for the lookup
#          that succeeded.
# Returns: the status of the last lookup attempted.
#
# The body runs in a subshell, so `set +x` and the variables below do not
# affect the caller.
function get_metadata_value() (
  set +x
  # `local -r` (not `local readonly name=...`, which declares an extra
  # variable literally named "readonly" and leaves `name` writable).
  local -r varname="$1"
  local -r MDS_PREFIX=http://metadata.google.internal/computeMetadata/v1
  local return_code=0

  # Capture the status with `||` so that an inherited `set -e` does not
  # terminate the subshell before the project-level fallback is tried.
  print_metadata_value_if_exists "${MDS_PREFIX}/instance/${varname}" \
    || return_code=$?

  # If the instance doesn't have the value, try the project.
  if [[ "${return_code}" != 0 ]]; then
    return_code=0
    print_metadata_value_if_exists "${MDS_PREFIX}/project/${varname}" \
      || return_code=$?
  fi

  return "${return_code}"
)
# Write a timestamped, host-tagged ERROR line to stderr and return 1, so
# that `some_command || err 'message'` reports the problem and fails.
#   $* - message text
function err() {
  printf '[%s] [%s]: ERROR: %s\n' \
    "$(date +'%Y-%m-%dT%H:%M:%S%z')" "$(hostname)" "$*" >&2
  return 1
}
# Get the Cloud SQL database type of an instance; the result is one of
# MYSQL, POSTGRES, SQLSERVER, or empty when it cannot be determined.
#   $1 - comma-separated list of instance connection names; only the first
#        entry is inspected. Each entry has the form
#        project:region:instance and may carry a "=tcp:PORT" suffix.
# Outputs: the upper-cased type (or an empty line) on stdout; diagnostics
#          go through log().
function get_cloudsql_instance_type() {
  local instance
  instance="$(echo "$1" | cut -d "," -f 1)"
  # Drop an optional "=tcp:PORT" suffix. Without this the port is parsed as
  # the instance name and "project:region" as the project.
  instance="${instance%%=*}"
  local database=''

  if [[ -z "${instance}" ]]; then
    log 'cloudsql instance VM metadata not specified'
  elif ! [[ "${instance}" =~ .+:.+:.+ ]]; then
    log 'cloudsql instance not of form project:region:instance'
  else
    # Everything before the last two colon-separated fields is the project;
    # the last field is the instance name.
    local project="${instance%:*:*}"
    instance="${instance##*:}"
    # A failed describe/grep leaves ${database} empty, which is reported
    # just below; `|| true` keeps errexit/pipefail from aborting first.
    database="$(gcloud sql instances describe --project="${project}" "${instance}" \
      | grep 'databaseVersion')" || true
    if [[ -z "${database}" ]]; then
      log 'Unable to describe metastore_instance'
    else
      # Trim off version and whitespaces and use upper case
      # databaseVersion: MYSQL_8_0
      # databaseVersion: POSTGRES_12
      # databaseVersion: SQLSERVER_2019_STANDARD
      database="${database##*:}"
      database="${database%%_*}"
      database="${database#"${database%%[![:space:]]*}"}"
    fi
  fi
  echo "${database^^}"
}
DB_HIVE_USER="$(/usr/share/google/get_metadata_value attributes/db-hive-user || echo 'hive')" readonly DB_HIVE_USER DB_ADMIN_USER="$(/usr/share/google/get_metadata_value attributes/db-admin-user || echo '')" if [[ -z ${DB_ADMIN_USER} ]]; then DB_ADMIN_USER=${DEFAULT_DB_ADMIN_USER["${CLOUDSQL_INSTANCE_TYPE}"]} fi readonly DB_ADMIN_USER KMS_KEY_URI="$(/usr/share/google/get_metadata_value attributes/kms-key-uri || echo '')" readonly KMS_KEY_URI # Database admin user password used to create the metastore database and user. DB_ADMIN_PASSWORD_URI="$(/usr/share/google/get_metadata_value attributes/db-admin-password-uri || echo '')" readonly DB_ADMIN_PASSWORD_URI DB_ADMIN_PASSWORD='' if [[ -n "${DB_ADMIN_PASSWORD_URI}" ]]; then # Decrypt password DB_ADMIN_PASSWORD="$(gsutil cat "${DB_ADMIN_PASSWORD_URI}" | gcloud kms decrypt \ --ciphertext-file - \ --plaintext-file - \ --key "${KMS_KEY_URI}")" fi if [[ "${CLOUDSQL_INSTANCE_TYPE}" == "POSTGRES" && -z "${DB_ADMIN_PASSWORD}" ]]; then log 'POSTGRES DB admin password is not set' fi readonly DB_ADMIN_PASSWORD # Database password used to access metastore. DB_HIVE_PASSWORD_URI="$(/usr/share/google/get_metadata_value attributes/db-hive-password-uri || echo '')" readonly DB_HIVE_PASSWORD_URI if [[ -n "${DB_HIVE_PASSWORD_URI}" ]]; then # Decrypt password DB_HIVE_PASSWORD="$(gsutil cat "${DB_HIVE_PASSWORD_URI}" | gcloud kms decrypt \ --ciphertext-file - \ --plaintext-file - \ --key "${KMS_KEY_URI}")" readonly DB_HIVE_PASSWORD else db_hive_pwd=$(bdconfig get_property_value \ --configuration_file "/etc/hive/conf/hive-site.xml" \ --name "javax.jdo.option.ConnectionPassword" 2>/dev/null) if [[ "${db_hive_pwd}" == "None" ]]; then db_hive_pwd="hive-password" fi readonly DB_HIVE_PASSWORD=${db_hive_pwd} fi # Name of MySQL database to use for the metastore. # Will be created if it doesn't exist. 
# Print the value of a key from a Java-style .properties file.
#   $1 - path to the properties file
#   $2 - property name, matched literally at the start of a line
# Outputs: the text after the first '=' of the last matching line, with the
#          backslash escapes \# \! \= \: reduced to the bare character. An
#          empty line is printed when the key is not found.
function get_java_property() {
  local property_file=$1
  local property_name=$2
  local property_pattern
  local property_value
  # Escape regex metacharacters so that, for example, the dots in
  # "dataproc.components.activate" match only literal dots.
  property_pattern=$(printf '%s' "${property_name}" | sed -e 's|[]\$*.^[]|\\&|g')
  property_value=$(grep "^${property_pattern}=" "${property_file}" \
    | tail -n 1 \
    | cut -d '=' -f 2- \
    | sed -r 's/\\([#!=:])/\1/g')
  echo "${property_value}"
}
# Print METASTORE_INSTANCE in "<name>=tcp:<port>" form.
# A value that already ends in "=tcp:<digits>" is printed unchanged;
# otherwise "=tcp:${METASTORE_PROXY_PORT}" is appended.
function get_metastore_instance() {
  if [[ "${METASTORE_INSTANCE}" =~ =tcp:[0-9]+$ ]]; then
    echo "${METASTORE_INSTANCE}"
    return
  fi
  echo "${METASTORE_INSTANCE}=tcp:${METASTORE_PROXY_PORT}"
}
cat <${INIT_SCRIPT} [Unit] Description=Google Cloud SQL Proxy After=local-fs.target network-online.target After=google.service Before=shutdown.target [Service] Type=simple ExecStart=/bin/sh -c '${PROXY_BIN} \ -dir=${PROXY_DIR} \ ${proxy_flags} >> /var/log/cloud-sql-proxy/cloud-sql-proxy.log 2>&1' [Install] WantedBy=multi-user.target EOF chmod a+rw ${INIT_SCRIPT} if [[ $ENABLE_CLOUD_SQL_METASTORE == "true" ]]; then local db_url=jdbc:${DEFAULT_DB_PROTO["${CLOUDSQL_INSTANCE_TYPE}"]}://localhost:${METASTORE_PROXY_PORT}/${METASTORE_DB} local db_driver=${DEFAULT_DB_DRIVER["${CLOUDSQL_INSTANCE_TYPE}"]} # Update hive-site.xml cat <hive-template.xml javax.jdo.option.ConnectionURL ${db_url} the URL of the MySQL database javax.jdo.option.ConnectionDriverName ${db_driver} javax.jdo.option.ConnectionUserName ${DB_HIVE_USER} javax.jdo.option.ConnectionPassword ${db_hive_password_xml_escaped} EOF bdconfig merge_configurations \ --configuration_file /etc/hive/conf/hive-site.xml \ --source_configuration_file hive-template.xml \ --clobber fi log 'Cloud SQL Proxy installation succeeded' } function initialize_mysql_metastore_db() { log 'Initialzing MYSQL DB for Hive metastore ...' local db_password_param='--password=' if [[ -n ${DB_ADMIN_PASSWORD} ]]; then db_password_param+=${DB_ADMIN_PASSWORD} fi local db_hive_password_param='' if [[ -n ${DB_HIVE_PASSWORD} ]]; then db_hive_password_param+="-p${DB_HIVE_PASSWORD}" fi # Check if metastore is initialized. if ! mysql -h 127.0.0.1 -P "${METASTORE_PROXY_PORT}" -u "${DB_HIVE_USER}" "${db_hive_password_param}" -e ''; then mysql -h 127.0.0.1 -P "${METASTORE_PROXY_PORT}" -u "${DB_ADMIN_USER}" "${db_password_param}" -e \ "CREATE USER '${DB_HIVE_USER}' IDENTIFIED BY '${DB_HIVE_PASSWORD}';" fi if ! 
# Create the Hive user, the metastore database and its schema on a
# PostgreSQL instance reached on 127.0.0.1:${METASTORE_PROXY_PORT}, if they
# are not already usable.
# Globals (read): DB_ADMIN_USER, DB_ADMIN_PASSWORD, DB_HIVE_USER,
#                 DB_HIVE_PASSWORD, METASTORE_PROXY_PORT, METASTORE_DB
function initialize_postgres_metastore_db() {
  log 'Initialzing POSTGRES DB for Hive metastore ...'
  local admin_connection=postgresql://"${DB_ADMIN_USER}":"${DB_ADMIN_PASSWORD}"@127.0.0.1:"${METASTORE_PROXY_PORT}"/
  local hive_connection=postgresql://"${DB_HIVE_USER}":"${DB_HIVE_PASSWORD}"@127.0.0.1:"${METASTORE_PROXY_PORT}"/postgres
  # psql meta-command that switches to the metastore database. It must be
  # built with double quotes: in single quotes the text "${METASTORE_DB}"
  # reaches psql unexpanded and the check below can never succeed.
  local -r connect_metastore_db="\\c \"${METASTORE_DB}\""

  # Check if metastore is initialized.
  if ! psql "${hive_connection}" -c ''; then
    log 'Create DB Hive user...'
    psql "${admin_connection}" -c "CREATE USER ${DB_HIVE_USER} WITH PASSWORD '${DB_HIVE_PASSWORD}';"
  fi
  if ! psql "${hive_connection}" -c "${connect_metastore_db}"; then
    log 'Create Hive Metastore database...'
    psql "${admin_connection}" -c "CREATE DATABASE ${METASTORE_DB};"
    # Confirm the Hive user can now reach the new database.
    psql "${hive_connection}" -c "${connect_metastore_db}"
    psql "${admin_connection}" -c "GRANT ALL PRIVILEGES ON DATABASE ${METASTORE_DB} TO ${DB_HIVE_USER} ;"

    log 'Create Hive Metastore schema...'
    /usr/lib/hive/bin/schematool -dbType postgres -initSchema \
      || err 'Failed to set postgres schema.'
  fi
  log 'POSTGRES DB initialized for Hive metastore'
}
# Install the `mysql` command-line client unless it is already on PATH.
# Uses apt-get when available, otherwise yum.
# Returns: 0 when the client was already present or the install commands
#          succeeded; 1 (via err) when installation failed or no supported
#          package manager was found.
function install_mysql_cli() {
  if command -v mysql >/dev/null; then
    log "MySQL CLI is already installed"
    return
  fi

  log "Installing MySQL CLI ..."
  if command -v apt-get >/dev/null; then
    # apt-get rather than apt: the apt front end does not promise a stable
    # command-line interface for scripts.
    if ! { apt-get update && apt-get install -y mysql-client; }; then
      err 'Failed to install MySQL CLI with apt-get' || return 1
    fi
  elif command -v yum >/dev/null; then
    if ! { yum -y update && yum -y install mysql; }; then
      err 'Failed to install MySQL CLI with yum' || return 1
    fi
  else
    err 'No supported package manager (apt-get or yum) found for MySQL CLI' || return 1
  fi
  log "MySQL CLI installed"
}
# Stop hive-server2 and then hive-metastore, each only if systemd reports
# the unit as enabled; otherwise print a note that it is not enabled.
function stop_hive_services() {
  if ! systemctl is-enabled --quiet hive-server2; then
    echo "Service Hive server2 is not enabled"
  else
    log 'Stopping Hive server2 ...'
    systemctl stop hive-server2
    log 'Hive server2 stopped'
  fi

  if ! systemctl is-enabled --quiet hive-metastore; then
    echo "Service Hive metastore is not enabled"
  else
    log 'Stopping Hive metastore ...'
    systemctl stop hive-metastore
    log 'Hive metastore stopped'
  fi
}
# Fail (via err) when there is nothing for the proxy to serve: the Cloud
# SQL metastore is not enabled and no additional instances were given.
function validate() {
  if [[ "${ENABLE_CLOUD_SQL_METASTORE}" == "true" || -n "${ADDITIONAL_INSTANCES}" ]]; then
    return 0
  fi
  err 'No Cloud SQL instances to proxy'
}
grep -q signed-by "${dataproc_repo_file}" ; then region="$(get_metadata_value zone | perl -p -e 's:.*/:: ; s:-[a-z]+$::')" local regional_bigtop_repo_uri regional_bigtop_repo_uri=$(cat ${dataproc_repo_file} | sed -E "s#/dataproc-bigtop-repo(-dev)?/#/goog-dataproc-bigtop-repo\\1-${region}/#" | grep -E "deb .*goog-dataproc-bigtop-repo(-dev)?-${region}.* dataproc contrib" | cut -d ' ' -f 2 | head -1) if [[ "${regional_bigtop_repo_uri}" == */ ]]; then local -r bigtop_key_uri="${regional_bigtop_repo_uri}archive.key" else local -r bigtop_key_uri="${regional_bigtop_repo_uri}/archive.key" fi local -r bigtop_kr_path="/usr/share/keyrings/bigtop-keyring.gpg" rm -f "${bigtop_kr_path}" curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 \ "${bigtop_key_uri}" | gpg --dearmor -o "${bigtop_kr_path}" sed -i -e "s:deb https:deb [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}" sed -i -e "s:deb-src https:deb-src [signed-by=${bigtop_kr_path}] https:g" "${dataproc_repo_file}" fi # # adoptium # # https://adoptium.net/installation/linux/#_deb_installation_on_debian_or_ubuntu local -r key_url="https://packages.adoptium.net/artifactory/api/gpg/key/public" local -r adoptium_kr_path="/usr/share/keyrings/adoptium.gpg" rm -f "${adoptium_kr_path}" curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${key_url}" \ | gpg --dearmor -o "${adoptium_kr_path}" echo "deb [signed-by=${adoptium_kr_path}] https://packages.adoptium.net/artifactory/deb/ $(os_codename) main" \ > /etc/apt/sources.list.d/adoptium.list # # docker # local docker_kr_path="/usr/share/keyrings/docker-keyring.gpg" local docker_repo_file="/etc/apt/sources.list.d/docker.list" local -r docker_key_url="https://download.docker.com/linux/$(os_id)/gpg" rm -f "${docker_kr_path}" curl -fsS --retry-connrefused --retry 10 --retry-max-time 30 "${docker_key_url}" \ | gpg --dearmor -o "${docker_kr_path}" echo "deb [signed-by=${docker_kr_path}] https://download.docker.com/linux/$(os_id) $(os_codename) stable" \ > 
# Entry point: validate the configuration, repair the apt sources, then
# run the master or the worker flow depending on the dataproc-role
# metadata attribute.
function main() {
  local role
  role="$(/usr/share/google/get_metadata_value attributes/dataproc-role)"

  validate
  repair_old_backports
  clean_up_sources_lists

  # Only the exact role "Master" takes the master path; anything else is
  # handled as a worker.
  case "${role}" in
    Master) update_master ;;
    *) update_worker ;;
  esac

  log 'All done'
}
# Fetch a URL from the metadata server and print the response body.
#   $1 - full metadata URL
# Outputs: the body on stdout, only when curl succeeded and the HTTP status
#          was 200.
# Returns: curl's exit status.
function print_metadata_value() {
  local -r url="$1"
  # Declared separately from the assignments: `local readonly name=...`
  # would create a variable named "readonly" and leave `name` writable.
  local tmpfile
  local http_code
  local return_code=0

  tmpfile="$(mktemp)" || return

  # `|| return_code=$?` records the failure without letting `set -e` abort
  # before the temporary file is removed.
  http_code="$(curl -f "${url}" -H "Metadata-Flavor: Google" -w "%{http_code}" \
    -s -o "${tmpfile}" 2>/dev/null)" || return_code=$?

  # If the command completed successfully, print the metadata value to stdout.
  if [[ "${return_code}" == 0 && "${http_code}" == 200 ]]; then
    cat "${tmpfile}"
  fi

  rm -f "${tmpfile}"
  return "${return_code}"
}
# Run a command line up to 10 times, pausing 5 seconds between attempts.
#   $* - the command line; the arguments are joined with spaces and passed
#        to `eval`, so callers control quoting.
# Outputs: on final failure, prints "Cmd '<command>' failed." to stdout.
# Returns: 0 as soon as one attempt succeeds, 1 after the last attempt fails.
function execute_with_retries() {
  local -r cmd="$*"
  local -r max_attempts=10
  # Local loop counter so the caller's variables are not clobbered.
  local attempt
  for ((attempt = 1; attempt <= max_attempts; attempt++)); do
    if eval "$cmd"; then return 0; fi
    # No point in sleeping once there is no further attempt to wait for.
    if ((attempt < max_attempts)); then sleep 5; fi
  done
  echo "Cmd '${cmd}' failed."
  return 1
}
local -r dask_worker_local_dir="/tmp/${DASK_WORKER_SERVICE}" mkdir -p "${dask_worker_local_dir}" local DASK_WORKER_LAUNCHER="/usr/local/bin/${DASK_WORKER_SERVICE}-launcher.sh" cat <"${DASK_WORKER_LAUNCHER}" #!/bin/bash LOGFILE="/var/log/${DASK_WORKER_SERVICE}.log" nvidia-smi -c DEFAULT echo "dask-cuda-worker starting, logging to \${LOGFILE}" ${DASK_CONDA_ENV}/bin/dask-cuda-worker "${MASTER}:8786" --local-directory="${dask_worker_local_dir}" --memory-limit=auto >> "\${LOGFILE}" 2>&1 EOF chmod 750 "${DASK_WORKER_LAUNCHER}" local -r dask_service_file="/usr/lib/systemd/system/${DASK_WORKER_SERVICE}.service" cat <"${dask_service_file}" [Unit] Description=Dask Worker Service [Service] Type=simple Restart=on-failure ExecStart=/bin/bash -c 'exec ${DASK_WORKER_LAUNCHER}' [Install] WantedBy=multi-user.target EOF chmod a+r "${dask_service_file}" systemctl daemon-reload # Enable the service if [[ "${ROLE}" != "Master" ]]; then enable_worker_service="1" else local RUN_WORKER_ON_MASTER=$(get_metadata_attribute dask-cuda-worker-on-master 'true') # Enable service on single-node cluster (no workers) local worker_count="$(get_metadata_attribute dataproc-worker-count)" if [[ "${worker_count}" == "0" || "${RUN_WORKER_ON_MASTER}" == "true" ]]; then enable_worker_service="1" fi fi if [[ "${enable_worker_service}" == "1" ]]; then systemctl enable "${DASK_WORKER_SERVICE}" systemctl restart "${DASK_WORKER_SERVICE}" fi } function install_systemd_dask_scheduler() { # only run scheduler on primary master if [[ "$(hostname -s)" != "${MASTER}" ]]; then return ; fi echo "Installing systemd Dask Scheduler service..." 
# Restart the knox service, deleting everything under
# ${KNOX_HOME}/data/deployments/ while it is stopped.
# Globals (read): KNOX_HOME
function restart_knox() {
  systemctl stop knox
  # The glob must sit outside the quotes to expand; a quoted "*" only names
  # a file literally called "*". `:?` aborts instead of expanding to
  # /data/deployments/* should KNOX_HOME be empty or unset.
  rm -rf "${KNOX_HOME:?}/data/deployments/"*
  systemctl start knox
}
# Create the 'dask-rapids' conda environment with CUDA, RAPIDS and Dask.
#
# Package specs are chosen from the CUDA major version (is_cuda12 /
# is_cuda11) and tightened when DASK_RUNTIME is 'yarn'. Installation is
# attempted with mamba first and conda second; it is skipped entirely when
# ${DASK_CONDA_ENV} already exists as a directory.
# Globals (read):    RAPIDS_VERSION, DASK_RUNTIME, DASK_CONDA_ENV,
#                    install_log
# Globals (written): rapids_spec, CONDA_PACKAGES, mamba, conda
# Returns: 1 when the CUDA version is unsupported or both installers fail.
function install_dask_rapids() {
  if is_cuda12 ; then
    local python_spec="python>=3.11"
    local cuda_spec="cuda-version>=12,<13"
    local dask_spec="dask>=2024.7"
    local numba_spec="numba"
  elif is_cuda11 ; then
    local python_spec="python>=3.9"
    local cuda_spec="cuda-version>=11,<12.0a0"
    local dask_spec="dask"
    local numba_spec="numba"
  else
    # Without this branch the specs stay unset and the function dies later
    # on an unbound variable with no hint about the cause.
    echo "unsupported CUDA version '${CUDA_VERSION:-}': expected 11.x or 12.x" >&2
    return 1
  fi

  rapids_spec="rapids>=${RAPIDS_VERSION}"
  CONDA_PACKAGES=()
  if [[ "${DASK_RUNTIME}" == 'yarn' ]]; then
    # Pin `distributed` and `dask` package versions to old release
    # because `dask-yarn` 0.9 uses skein in a way which
    # is not compatible with `distributed` package 2022.2 and newer:
    # https://github.com/dask/dask-yarn/issues/155

    dask_spec="dask<2022.2"
    python_spec="python>=3.7,<3.8.0a0"
    rapids_spec="rapids<=24.05"
    if is_ubuntu18 ; then
      # the libuuid.so.1 distributed with fiona 1.8.22 dumps core when calling uuid_generate_time_generic
      CONDA_PACKAGES+=("fiona<1.8.22")
    fi
    CONDA_PACKAGES+=('dask-yarn=0.9' "distributed<2022.2")
  fi

  CONDA_PACKAGES+=(
    "${cuda_spec}"
    "${rapids_spec}"
    "${dask_spec}"
    "dask-bigquery"
    "dask-ml"
    "dask-sql"
    "cudf"
    "${numba_spec}"
  )

  # Install cuda, rapids, dask
  mamba="/opt/conda/miniconda3/bin/mamba"
  conda="/opt/conda/miniconda3/bin/conda"

  "${conda}" remove -n dask --all || echo "unable to remove conda environment [dask]"

  # Subshell: errexit is turned off only for the install attempts, and the
  # loop variables stay private.
  (
    set +e
    local is_installed="0"
    for installer in "${mamba}" "${conda}" ; do
      if test -d "${DASK_CONDA_ENV}" ; then
        retval=0
      elif time "${installer}" "create" -m -n 'dask-rapids' -y --no-channel-priority \
        -c 'conda-forge' -c 'nvidia' -c 'rapidsai' \
        "${CONDA_PACKAGES[@]}" \
        "${python_spec}" \
        > "${install_log}" 2>&1 ; then
        retval=0
      else
        retval=$?
        # Show the installer output only when the attempt failed.
        cat "${install_log}"
      fi
      sync
      if [[ "$retval" == "0" ]] ; then
        is_installed="1"
        break
      fi
      # Relax channel priority before the next installer is tried.
      "${conda}" config --set channel_priority flexible
    done
    if [[ "${is_installed}" == "0" ]]; then
      echo "failed to install dask"
      return 1
    fi
  )
}
if [[ "${enable_worker_service}" == "1" ]]; then systemctl start "${DASK_WORKER_SERVICE}" systemctl status "${DASK_WORKER_SERVICE}" fi configure_knox_for_dask local DASK_CLOUD_LOGGING="$(get_metadata_attribute dask-cloud-logging || echo 'false')" if [[ "${DASK_CLOUD_LOGGING}" == "true" ]]; then configure_fluentd_for_dask fi fi echo "Dask RAPIDS for ${DASK_RUNTIME} successfully initialized." if [[ "${ROLE}" == "Master" ]]; then systemctl restart hadoop-yarn-resourcemanager.service # Restart NodeManager on Master as well if this is a single-node-cluster. if systemctl list-units | grep hadoop-yarn-nodemanager; then systemctl restart hadoop-yarn-nodemanager.service fi else systemctl restart hadoop-yarn-nodemanager.service fi } function exit_handler() ( set +e echo "Exit handler invoked" # Free conda cache /opt/conda/miniconda3/bin/conda clean -a > /dev/null 2>&1 # Clear pip cache pip cache purge || echo "unable to purge pip cache" # remove the tmpfs conda pkgs_dirs if [[ -d /mnt/shm ]] ; then /opt/conda/miniconda3/bin/conda config --remove pkgs_dirs /mnt/shm ; fi # Clean up shared memory mounts for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm ; do if grep -q "^tmpfs ${shmdir}" /proc/mounts ; then rm -rf ${shmdir}/* umount -f ${shmdir} fi done # Clean up OS package cache ; re-hold systemd package if is_debuntu ; then apt-get -y -qq clean apt-get -y -qq autoremove else dnf clean all fi # print disk usage statistics if is_debuntu ; then # Rocky doesn't have sort -h and fails when the argument is passed du --max-depth 3 -hx / | sort -h | tail -10 fi # Process disk usage logs from installation period rm -f "${tmpdir}/keep-running-df" sleep 6s # compute maximum size of disk during installation # Log file contains logs like the following (minus the preceeding #): #Filesystem Size Used Avail Use% Mounted on #/dev/vda2 6.8G 2.5G 4.0G 39% / df -h / | tee -a "${tmpdir}/disk-usage.log" perl -e '$max=( sort map { (split)[2] =~ /^(\d+)/ } grep { m:^/: } )[-1]; print( 
#######################################
# Read cluster metadata into read-only globals, optionally move package
# caches onto tmpfs, register the exit handler and start a background
# disk-usage monitor.
#
# Globals written:
#   DEFAULT_CUDA_VERSION, CUDA_VERSION, ROLE, MASTER, RAPIDS_RUNTIME,
#   DEFAULT_DASK_RAPIDS_VERSION, RAPIDS_VERSION, DASK_RUNTIME, DASK_SERVICE,
#   DASK_WORKER_SERVICE, DASK_SCHEDULER_SERVICE, DASK_CONDA_ENV, KNOX_HOME,
#   KNOX_DASK_DIR, KNOX_DASKWS_DIR (all readonly),
#   enable_worker_service, free_mem, tmpdir, install_log
# Arguments:
#   None
#######################################
function prepare_to_install(){
  readonly DEFAULT_CUDA_VERSION="12.4"
  CUDA_VERSION=$(get_metadata_attribute 'cuda-version' "${DEFAULT_CUDA_VERSION}")
  readonly CUDA_VERSION

  # `readonly NAME=$(...)` discards the status of the substitution, so a
  # missing attribute leaves the variable empty instead of failing.
  # NOTE(review): callers test for an empty ROLE, so this looks intentional.
  readonly ROLE=$(get_metadata_attribute dataproc-role)
  readonly MASTER=$(get_metadata_attribute dataproc-master)

  # RAPIDS config
  RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'DASK')
  readonly RAPIDS_RUNTIME

  readonly DEFAULT_DASK_RAPIDS_VERSION="24.08"
  readonly RAPIDS_VERSION=$(get_metadata_attribute 'rapids-version' "${DEFAULT_DASK_RAPIDS_VERSION}")

  # Dask config
  DASK_RUNTIME="$(get_metadata_attribute dask-runtime || echo 'standalone')"
  readonly DASK_RUNTIME
  readonly DASK_SERVICE=dask-cluster
  readonly DASK_WORKER_SERVICE=dask-worker
  readonly DASK_SCHEDULER_SERVICE=dask-scheduler
  readonly DASK_CONDA_ENV="/opt/conda/miniconda3/envs/dask-rapids"

  # Knox config
  readonly KNOX_HOME=/usr/lib/knox
  readonly KNOX_DASK_DIR="${KNOX_HOME}/data/services/dask/0.1.0"
  readonly KNOX_DASKWS_DIR="${KNOX_HOME}/data/services/daskws/0.1.0"
  enable_worker_service="0"

  free_mem="$(awk '/^MemFree/ {print $2}' /proc/meminfo)"
  # Write to a ramdisk instead of churning the persistent disk
  if [[ ${free_mem} -ge 5250000 ]]; then
    tmpdir=/mnt/shm
    mkdir -p /mnt/shm
    # Mount exactly once.  The exit handler unmounts each tmpfs path a single
    # time, so a second stacked mount on /mnt/shm would be left behind.
    mount -t tmpfs tmpfs /mnt/shm

    # Download conda packages to tmpfs
    /opt/conda/miniconda3/bin/conda config --add pkgs_dirs /mnt/shm

    # Download pip packages to tmpfs
    pip config set global.cache-dir /mnt/shm || echo "unable to set global.cache-dir"

    # Download OS packages to tmpfs
    if is_debuntu ; then
      mount -t tmpfs tmpfs /var/cache/apt/archives
    else
      mount -t tmpfs tmpfs /var/cache/dnf
    fi
  else
    tmpdir=/tmp
  fi
  install_log="${tmpdir}/install.log"
  trap exit_handler EXIT

  # Monitor disk usage in a screen session
  if is_debuntu ; then
    apt-get install -y -qq screen
  else
    dnf -y -q install screen
  fi
  df -h / | tee "${tmpdir}/disk-usage.log"
  # The monitor loop keeps appending samples until this marker file is removed.
  touch "${tmpdir}/keep-running-df"
  screen -d -m -US keep-running-df \
    bash -c "while [[ -f ${tmpdir}/keep-running-df ]] ; do df -h / | tee -a ${tmpdir}/disk-usage.log ; sleep 5s ; done"
}
; then # Create service account for this purpose echo "creating pre-init customization service account ${GSA}" gcloud iam service-accounts create "${SA_NAME}" \ --description="Service account for pre-init customization" \ --display-name="${SA_NAME}" fi # Grant service account access to bucket gcloud storage buckets add-iam-policy-binding "gs://${BUCKET}" \ --member="serviceAccount:${GSA}" \ --role="roles/storage.objectViewer" # Grant the service account access to list secrets for the project gcloud projects add-iam-policy-binding "${PROJECT_ID}" \ --member="serviceAccount:${GSA}" \ --role="roles/secretmanager.viewer" # Grant service account permission to access the private secret gcloud secrets add-iam-policy-binding "${private_secret_name}" \ --member="serviceAccount:${GSA}" \ --role="roles/secretmanager.secretAccessor" # Grant service account permission to access the public secret gcloud secrets add-iam-policy-binding "${public_secret_name}" \ --member="serviceAccount:${GSA}" \ --role="roles/secretmanager.secretAccessor" # If no OS family specified, default to debian if [[ "${IMAGE_VERSION}" != *-* ]] ; then case "${IMAGE_VERSION}" in "2.2" ) dataproc_version="${IMAGE_VERSION}-debian12" ;; "2.1" ) dataproc_version="${IMAGE_VERSION}-debian11" ;; "2.0" ) dataproc_version="${IMAGE_VERSION}-debian10" ;; esac else dataproc_version="${IMAGE_VERSION}" fi #dataproc_version="${IMAGE_VERSION}-ubuntu22" #dataproc_version="${IMAGE_VERSION}-rocky9" #customization_script="examples/secure-boot/install-nvidia-driver-debian11.sh" #customization_script="examples/secure-boot/install-nvidia-driver-debian12.sh" customization_script="examples/secure-boot/install_gpu_driver.sh" #echo "#!/bin/bash\necho no op" | dd of=empty.sh #customization_script=empty.sh #image_name="nvidia-open-kernel-2.2-ubuntu22-$(date +%F)" #image_name="nvidia-open-kernel-2.2-rocky9-$(date +%F)" #image_name="nvidia-open-kernel-2.2-debian12-$(date +%F)" #image_name="nvidia-open-kernel-${dataproc_version}-$(date 
+%F)" image_name="cuda-${dataproc_version/\./-}-$(date +%F-%H-%M)" python generate_custom_image.py \ --accelerator "type=nvidia-tesla-t4" \ --image-name "${image_name}" \ --dataproc-version "${dataproc_version}" \ --trusted-cert "tls/db.der" \ --customization-script "${customization_script}" \ --service-account "${GSA}" \ --metadata "${metadata}" \ --zone "${custom_image_zone}" \ --disk-size "${disk_size_gb}" \ --no-smoke-test \ --gcs-bucket "${BUCKET}" \ --shutdown-instance-timer-sec=30 set +x # Revoke permission to access the private secret gcloud secrets remove-iam-policy-binding "${private_secret_name}" \ --member="serviceAccount:${GSA}" \ --role="roles/secretmanager.secretAccessor" > /dev/null 2>&1 # Revoke access to bucket gcloud storage buckets remove-iam-policy-binding "gs://${BUCKET}" \ --member="serviceAccount:${GSA}" \ --role="roles/storage.objectViewer" > /dev/null 2>&1 # Revoke access to list secrets for the project gcloud projects remove-iam-policy-binding "${PROJECT_ID}" \ --member="serviceAccount:${GSA}" \ --role="roles/secretmanager.viewer" > /dev/null 2>&1 ====== Filename: ./examples/secure-boot/mig.sh ====== #!/bin/bash # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS-IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # This script installs NVIDIA GPU drivers and enables MIG on Amphere GPU architectures. # # This script should be specified in --metadata=startup-script-url= option and # --metadata=ENABLE_MIG can be used to enable or disable MIG. The default is to enable it. 
# Version comparison helpers built on `sort -V`.
# Each takes two version strings ($1, $2) and returns 0 when the relation
# holds, non-zero otherwise.  printf '%s\n' is used to feed sort so that
# backslashes in the arguments are never interpreted as escape sequences.

# version_ge A B: true when A sorts last (or equal) in version order.
function version_ge() {
  [[ "$1" == "$(printf '%s\n' "$1" "$2" | sort -V | tail -n1)" ]]
}

# version_gt A B: true when A is strictly newer than B.
function version_gt() {
  if [[ "$1" == "$2" ]]; then return 1; fi
  version_ge "$1" "$2"
}

# version_le A B: true when A sorts first (or equal) in version order.
function version_le() {
  [[ "$1" == "$(printf '%s\n' "$1" "$2" | sort -V | head -n1)" ]]
}

# version_lt A B: true when A is strictly older than B.
function version_lt() {
  if [[ "$1" == "$2" ]]; then return 1; fi
  version_le "$1" "$2"
}
#######################################
# Fetch one metadata URL and print its body when the request succeeds.
# Arguments:
#   $1 - full URL to request (sent with the "Metadata-Flavor: Google" header)
# Outputs:
#   The response body on stdout, only when curl exits 0 and reports HTTP 200.
# Returns:
#   curl's exit status (non-zero for HTTP errors because of -f), or 1 when
#   the temporary file cannot be created.
#######################################
function print_metadata_value() {
  # All three are plain locals: `local readonly x=...` would declare a
  # variable literally named "readonly" instead of making x read-only.
  local tmpfile http_code return_code=0
  tmpfile="$(mktemp)" || return 1
  # The status is captured with `||` so a failed request is reported to the
  # caller rather than tripping errexit inside this function.
  http_code="$(curl -f "${1}" -H "Metadata-Flavor: Google" -w "%{http_code}" \
    -s -o "${tmpfile}" 2>/dev/null)" || return_code=$?
  # If the command completed successfully, print the metadata value to stdout.
  if [[ ${return_code} == 0 && ${http_code} == 200 ]]; then
    cat "${tmpfile}"
  fi
  rm -f "${tmpfile}"
  return ${return_code}
}
#######################################
# Run a command line up to three times, pausing 5 seconds after each failure.
# The function body is a subshell, so its `set +x`/`set -x` toggling does not
# leak into the caller.
# Globals:
#   install_log (read) - file that receives the command's stdout and stderr
# Arguments:
#   $@ - the command and its arguments; joined into one string and eval'd
# Outputs:
#   The contents of ${install_log} on stdout after every failed attempt.
# Returns:
#   0 as soon as one attempt succeeds, 1 after three failures.
#######################################
function execute_with_retries() (
  set +x
  local -r cmd="$*"
  local retval
  local -i attempt

  # Plain prefix match.  The earlier quoted regex ("^apt-get install") was
  # compared literally, caret included, so this cleanup never ran.
  if [[ "${cmd}" == "apt-get install"* ]] ; then
    apt-get -y clean
    apt-get -o DPkg::Lock::Timeout=60 -y autoremove
  fi
  for ((attempt = 0; attempt < 3; attempt++)); do
    set -x
    time eval "$cmd" > "${install_log}" 2>&1 && retval=$? || { retval=$? ; cat "${install_log}" ; }
    set +x
    if [[ $retval == 0 ]] ; then return 0 ; fi
    sleep 5
  done
  return 1
)
#######################################
# Write the GPU resource-plugin and LinuxContainerExecutor settings into
# yarn-site.xml, then give the yarn user ownership of the NodeManager
# local directories.
# This configuration should be applied only if GPU is attached to the node.
# Globals:
#   NVIDIA_SMI_PATH, HADOOP_CONF_DIR, bdcfg (read)
# Arguments:
#   None
#######################################
function configure_yarn_nodemanager() {
  set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.resource-plugins' 'yarn.io/gpu'
  set_hadoop_property 'yarn-site.xml' \
    'yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices' 'auto'
  # Quoted so the path always arrives as exactly one argument.
  set_hadoop_property 'yarn-site.xml' \
    'yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables' "${NVIDIA_SMI_PATH}"
  set_hadoop_property 'yarn-site.xml' \
    'yarn.nodemanager.linux-container-executor.cgroups.mount' 'true'
  set_hadoop_property 'yarn-site.xml' \
    'yarn.nodemanager.linux-container-executor.cgroups.mount-path' '/sys/fs/cgroup'
  set_hadoop_property 'yarn-site.xml' \
    'yarn.nodemanager.linux-container-executor.cgroups.hierarchy' 'yarn'
  set_hadoop_property 'yarn-site.xml' \
    'yarn.nodemanager.container-executor.class' \
    'org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor'
  set_hadoop_property 'yarn-site.xml' 'yarn.nodemanager.linux-container-executor.group' 'yarn'

  # Fix local dirs access permissions
  local yarn_local_dirs=()

  # Split the comma-separated property value.  Without -t each element keeps
  # its trailing ',' which is stripped again in the chown call below.
  readarray -d ',' yarn_local_dirs < <("${bdcfg}" get_property_value \
    --configuration_file "${HADOOP_CONF_DIR}/yarn-site.xml" \
    --name "yarn.nodemanager.local-dirs" 2>/dev/null | tr -d '\n')

  # [*] joins the array into the single string that is being compared.
  if [[ "${#yarn_local_dirs[@]}" -ne "0" && "${yarn_local_dirs[*]}" != "None" ]]; then
    chown yarn:yarn -R "${yarn_local_dirs[@]/,/}"
  fi
}
#######################################
# Export proxy settings taken from the 'http-proxy' metadata attribute.
# Does nothing when the attribute is empty.
# Globals (written and exported):
#   METADATA_HTTP_PROXY, http_proxy, https_proxy, HTTP_PROXY, HTTPS_PROXY,
#   no_proxy, NO_PROXY
# Arguments:
#   None
#######################################
function set_proxy(){
  METADATA_HTTP_PROXY="$(get_metadata_attribute http-proxy '')"

  if [[ -z "${METADATA_HTTP_PROXY}" ]] ; then return ; fi

  export METADATA_HTTP_PROXY
  export http_proxy="${METADATA_HTTP_PROXY}"
  export https_proxy="${METADATA_HTTP_PROXY}"
  export HTTP_PROXY="${METADATA_HTTP_PROXY}"
  export HTTPS_PROXY="${METADATA_HTTP_PROXY}"

  # Hosts that must bypass the proxy: loopback, the metadata server and the
  # Google API endpoints listed below.
  no_proxy="localhost,127.0.0.0/8,::1,metadata.google.internal,169.254.169.254"
  local no_proxy_svc
  for no_proxy_svc in compute secretmanager dns servicedirectory logging \
                      bigquery composer pubsub bigquerydatatransfer dataflow \
                      storage datafusion ; do
    no_proxy="${no_proxy},${no_proxy_svc}.googleapis.com"
  done

  # Export both spellings, as is done for http(s)_proxy above; some tools
  # only consult the lowercase name.
  export no_proxy
  export NO_PROXY="${no_proxy}"
}
#######################################
# Define the read-only lookup tables that map CUDA and driver versions to
# the component versions used by this script.
#
# Globals written (readonly associative arrays):
#   DRIVER_FOR_CUDA - CUDA "major.minor"  -> default NVIDIA driver version
#   DRIVER_SUBVER   - driver major        -> full driver version
#   CUDNN_FOR_CUDA  - CUDA "major.minor"  -> cuDNN version (per OS family;
#                     left undefined when neither is_debuntu nor is_rocky)
#   NCCL_FOR_CUDA   - CUDA "major.minor"  -> NCCL version
#   CUDA_SUBVER     - CUDA "major.minor"  -> full CUDA version
# Arguments:
#   None
#
# No network access happens here.  An earlier lookup of NVIDIA's latest.txt
# stored its result in an unused local and could abort the script under
# `set -e -o pipefail` when the download failed; it has been removed.
#######################################
function set_support_matrix() {
  # CUDA version and Driver version
  # https://docs.nvidia.com/deploy/cuda-compatibility/
  # https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html
  # https://developer.nvidia.com/cuda-downloads

  # Minimum supported version for open kernel driver is 515.43.04
  # https://github.com/NVIDIA/open-gpu-kernel-modules/tags
  # Rocky8: 12.0: 525.147.05
  readonly -A DRIVER_FOR_CUDA=(
    ["11.7"]="515.65.01" ["11.8"]="525.147.05"
    ["12.0"]="525.147.05" ["12.1"]="530.30.02" ["12.4"]="550.135" ["12.5"]="555.42.02" ["12.6"]="560.35.03"
  )
  readonly -A DRIVER_SUBVER=(
    ["515"]="515.48.07" ["520"]="525.147.05" ["525"]="525.147.05" ["530"]="530.41.03" ["535"]="535.216.01"
    ["545"]="545.29.06" ["550"]="550.135" ["555"]="555.58.02" ["560"]="560.35.03" ["565"]="565.57.01"
  )
  # https://developer.nvidia.com/cudnn-downloads
  if is_debuntu ; then
    readonly -A CUDNN_FOR_CUDA=(
      ["11.7"]="9.5.1.17" ["11.8"]="9.5.1.17"
      ["12.0"]="9.5.1.17" ["12.1"]="9.5.1.17" ["12.4"]="9.5.1.17" ["12.5"]="9.5.1.17" ["12.6"]="9.5.1.17"
    )
  elif is_rocky ; then
    # rocky:
    #  12.0: 8.8.1.3
    #  12.1: 8.9.3.28
    #  12.2: 8.9.7.29
    #  12.3: 9.0.0.312
    #  12.4: 9.1.1.17
    #  12.5: 9.2.1.18
    #  12.6: 9.5.1.17
    readonly -A CUDNN_FOR_CUDA=(
      ["11.7"]="8.9.7.29" ["11.8"]="9.5.1.17"
      ["12.0"]="8.8.1.3" ["12.1"]="8.9.3.28" ["12.4"]="9.1.1.17" ["12.5"]="9.2.1.18" ["12.6"]="9.5.1.17"
    )
  fi
  # https://developer.nvidia.com/nccl/nccl-download
  # 12.2: 2.19.3, 12.5: 2.21.5
  readonly -A NCCL_FOR_CUDA=(
    ["11.7"]="2.21.5" ["11.8"]="2.21.5"
    ["12.0"]="2.16.5" ["12.1"]="2.18.3" ["12.4"]="2.23.4" ["12.5"]="2.21.5" ["12.6"]="2.23.4"
  )
  readonly -A CUDA_SUBVER=(
    ["11.7"]="11.7.1" ["11.8"]="11.8.0"
    ["12.0"]="12.0.1" ["12.1"]="12.1.1" ["12.2"]="12.2.2" ["12.3"]="12.3.2" ["12.4"]="12.4.1" ["12.5"]="12.5.1" ["12.6"]="12.6.2"
  )
}
# CUDA major-version predicates.
# Each runs in a subshell with tracing switched off, looks only at the part
# of CUDA_VERSION before the first dot, and returns 0 when the relation to
# the named major version holds.

function is_cuda12() (
  set +x
  local -r cuda_major="${CUDA_VERSION%%.*}"
  [[ "${cuda_major}" == "12" ]]
)

function le_cuda12() (
  set +x
  local -r cuda_major="${CUDA_VERSION%%.*}"
  version_le "${cuda_major}" "12"
)

function ge_cuda12() (
  set +x
  local -r cuda_major="${CUDA_VERSION%%.*}"
  version_ge "${cuda_major}" "12"
)

function is_cuda11() (
  set +x
  local -r cuda_major="${CUDA_VERSION%%.*}"
  [[ "${cuda_major}" == "11" ]]
)

function le_cuda11() (
  set +x
  local -r cuda_major="${CUDA_VERSION%%.*}"
  version_le "${cuda_major}" "11"
)

function ge_cuda11() (
  set +x
  local -r cuda_major="${CUDA_VERSION%%.*}"
  version_ge "${cuda_major}" "11"
)
maximum sub-version available for the major version indicated in cuda url as the default DEFAULT_DRIVER="${driver_max_maj_version}" fi fi fi if ( ! test -v DEFAULT_DRIVER ) ; then # If a default driver version has not been extracted, use the default for this version of CUDA DEFAULT_DRIVER=${DRIVER_FOR_CUDA["${CUDA_VERSION}"]} fi DRIVER_VERSION=$(get_metadata_attribute 'gpu-driver-version' "${DEFAULT_DRIVER}") readonly DRIVER_VERSION readonly DRIVER="${DRIVER_VERSION%%.*}" export DRIVER_VERSION DRIVER gpu_driver_url="https://download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run" if ! curl -s --head "${gpu_driver_url}" | grep -E -q '^HTTP.*200\s*$' ; then echo "No NVIDIA driver exists for DRIVER_VERSION=${DRIVER_VERSION}" exit 1 fi } set_driver_version readonly DEFAULT_CUDNN8_VERSION="8.0.5.39" readonly DEFAULT_CUDNN9_VERSION="9.1.0.70" # Parameters for NVIDIA-provided cuDNN library readonly DEFAULT_CUDNN_VERSION=${CUDNN_FOR_CUDA["${CUDA_VERSION}"]} CUDNN_VERSION=$(get_metadata_attribute 'cudnn-version' "${DEFAULT_CUDNN_VERSION}") function is_cudnn8() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "8" ]] ; ) function is_cudnn9() ( set +x ; [[ "${CUDNN_VERSION%%.*}" == "9" ]] ; ) # The minimum cuDNN version supported by rocky is ${DEFAULT_CUDNN8_VERSION} if is_rocky && (version_le "${CUDNN_VERSION}" "${DEFAULT_CUDNN8_VERSION}") ; then CUDNN_VERSION="${DEFAULT_CUDNN8_VERSION}" elif (ge_ubuntu20 || ge_debian12) && is_cudnn8 ; then # cuDNN v8 is not distribution for ubuntu20+, debian12 CUDNN_VERSION="${DEFAULT_CUDNN9_VERSION}" elif (le_ubuntu18 || le_debian11) && is_cudnn9 ; then # cuDNN v9 is not distributed for ubuntu18, debian10, debian11 ; fall back to 8 CUDNN_VERSION="8.8.0.121" fi readonly CUDNN_VERSION readonly DEFAULT_NCCL_VERSION=${NCCL_FOR_CUDA["${CUDA_VERSION}"]} readonly NCCL_VERSION=$(get_metadata_attribute 'nccl-version' ${DEFAULT_NCCL_VERSION}) # Parameters for NVIDIA-provided Debian GPU driver readonly 
DEFAULT_USERSPACE_URL="https://download.nvidia.com/XFree86/Linux-x86_64/${DRIVER_VERSION}/NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run" readonly USERSPACE_URL=$(get_metadata_attribute 'gpu-driver-url' "${DEFAULT_USERSPACE_URL}") USERSPACE_FILENAME="$(echo ${USERSPACE_URL} | perl -pe 's{^.+/}{}')" readonly USERSPACE_FILENAME # Short name for urls if is_ubuntu22 ; then # at the time of writing 20241125 there is no ubuntu2204 in the index of repos at # https://developer.download.nvidia.com/compute/machine-learning/repos/ # use packages from previous release until such time as nvidia # release ubuntu2204 builds shortname="$(os_id)$(os_vercat)" nccl_shortname="ubuntu2004" elif ge_rocky9 ; then # use packages from previous release until such time as nvidia # release rhel9 builds shortname="rhel9" nccl_shortname="rhel8" elif is_rocky ; then shortname="$(os_id | sed -e 's/rocky/rhel/')$(os_vercat)" nccl_shortname="${shortname}" else shortname="$(os_id)$(os_vercat)" nccl_shortname="${shortname}" fi # Parameters for NVIDIA-provided package repositories readonly NVIDIA_BASE_DL_URL='https://developer.download.nvidia.com/compute' readonly NVIDIA_REPO_URL="${NVIDIA_BASE_DL_URL}/cuda/repos/${shortname}/x86_64" # Parameters for NVIDIA-provided NCCL library readonly DEFAULT_NCCL_REPO_URL="${NVIDIA_BASE_DL_URL}/machine-learning/repos/${nccl_shortname}/x86_64/nvidia-machine-learning-repo-${nccl_shortname}_1.0.0-1_amd64.deb" NCCL_REPO_URL=$(get_metadata_attribute 'nccl-repo-url' "${DEFAULT_NCCL_REPO_URL}") readonly NCCL_REPO_URL readonly NCCL_REPO_KEY="${NVIDIA_BASE_DL_URL}/machine-learning/repos/${nccl_shortname}/x86_64/7fa2af80.pub" # 3bf863cc.pub function set_cuda_runfile_url() { local MAX_DRIVER_VERSION local MAX_CUDA_VERSION local MIN_OPEN_DRIVER_VER="515.48.07" local MIN_DRIVER_VERSION="${MIN_OPEN_DRIVER_VER}" local MIN_CUDA_VERSION="11.7.1" # matches MIN_OPEN_DRIVER_VER if is_cuda12 ; then if is_debian12 ; then MIN_DRIVER_VERSION="545.23.06" MIN_CUDA_VERSION="12.3.0" elif 
is_debian10 ; then MAX_DRIVER_VERSION="555.42.02" MAX_CUDA_VERSION="12.5.0" elif is_ubuntu18 ; then MAX_DRIVER_VERSION="530.30.02" MAX_CUDA_VERSION="12.1.1" fi elif version_ge "${CUDA_VERSION}" "${MIN_CUDA_VERSION}" ; then if le_debian10 ; then # cuda 11 is not supported for <= debian10 MAX_CUDA_VERSION="0" MAX_DRIVER_VERSION="0" fi else echo "Minimum CUDA version supported is ${MIN_CUDA_VERSION}. Specified: ${CUDA_VERSION}" fi if version_lt "${CUDA_VERSION}" "${MIN_CUDA_VERSION}" ; then echo "Minimum CUDA version for ${shortname} is ${MIN_CUDA_VERSION}. Specified: ${CUDA_VERSION}" elif ( test -v MAX_CUDA_VERSION && version_gt "${CUDA_VERSION}" "${MAX_CUDA_VERSION}" ) ; then echo "Maximum CUDA version for ${shortname} is ${MAX_CUDA_VERSION}. Specified: ${CUDA_VERSION}" fi if version_lt "${DRIVER_VERSION}" "${MIN_DRIVER_VERSION}" ; then echo "Minimum kernel driver version for ${shortname} is ${MIN_DRIVER_VERSION}. Specified: ${DRIVER_VERSION}" elif ( test -v MAX_DRIVER_VERSION && version_gt "${DRIVER_VERSION}" "${MAX_DRIVER_VERSION}" ) ; then echo "Maximum kernel driver version for ${shortname} is ${MAX_DRIVER_VERSION}. 
Specified: ${DRIVER_VERSION}" fi # driver version named in cuda runfile filename # (these may not be actual driver versions - see https://download.nvidia.com/XFree86/Linux-x86_64/) readonly -A drv_for_cuda=( ["11.7.0"]="515.43.04" ["11.7.1"]="515.65.01" ["11.8.0"]="520.61.05" ["12.0.0"]="525.60.13" ["12.0.1"]="525.85.12" ["12.1.0"]="530.30.02" ["12.1.1"]="530.30.02" ["12.2.0"]="535.54.03" ["12.2.1"]="535.86.10" ["12.2.2"]="535.104.05" ["12.3.0"]="545.23.06" ["12.3.1"]="545.23.08" ["12.3.2"]="545.23.08" ["12.4.0"]="550.54.15" ["12.4.1"]="550.54.15" # 550.54.15 is not a driver indexed at https://download.nvidia.com/XFree86/Linux-x86_64/ ["12.5.0"]="555.42.02" ["12.5.1"]="555.42.06" # 555.42.02 is indexed, 555.41.06 is not ["12.6.0"]="560.28.03" ["12.6.1"]="560.35.03" ["12.6.2"]="560.35.03" ) # Verify that the file with the indicated combination exists local drv_ver=${drv_for_cuda["${CUDA_FULL_VERSION}"]} CUDA_RUNFILE="cuda_${CUDA_FULL_VERSION}_${drv_ver}_linux.run" local CUDA_RELEASE_BASE_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_FULL_VERSION}" local DEFAULT_NVIDIA_CUDA_URL="${CUDA_RELEASE_BASE_URL}/local_installers/${CUDA_RUNFILE}" NVIDIA_CUDA_URL=$(get_metadata_attribute 'cuda-url' "${DEFAULT_NVIDIA_CUDA_URL}") readonly NVIDIA_CUDA_URL CUDA_RUNFILE="$(echo ${NVIDIA_CUDA_URL} | perl -pe 's{^.+/}{}')" readonly CUDA_RUNFILE if ! curl -s --head "${NVIDIA_CUDA_URL}" | grep -E -q '^HTTP.*200\s*$' ; then echo "No CUDA distribution exists for this combination of DRIVER_VERSION=${drv_ver}, CUDA_VERSION=${CUDA_FULL_VERSION}" exit 1 fi if ( version_lt "${CUDA_FULL_VERSION}" "12.3.0" && ge_debian12 ) ; then echo "CUDA 12.3.0 is the minimum CUDA 12 version supported on Debian 12" elif ( version_gt "${CUDA_VERSION}" "12.1.1" && is_ubuntu18 ) ; then echo "CUDA 12.1.1 is the maximum CUDA version supported on ubuntu18. Requested version: ${CUDA_VERSION}" elif ( version_lt "${CUDA_VERSION%%.*}" "12" && ge_debian12 ) ; then echo "CUDA 11 not supported on Debian 12. 
Requested version: ${CUDA_VERSION}" elif ( version_lt "${CUDA_VERSION}" "11.8" && is_rocky9 ) ; then echo "CUDA 11.8.0 is the minimum version for Rocky 9. Requested version: ${CUDA_VERSION}" fi } set_cuda_runfile_url # Parameter for NVIDIA-provided Rocky Linux GPU driver readonly NVIDIA_ROCKY_REPO_URL="${NVIDIA_REPO_URL}/cuda-${shortname}.repo" CUDNN_TARBALL="cudnn-${CUDA_VERSION}-linux-x64-v${CUDNN_VERSION}.tgz" CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/${CUDNN_TARBALL}" if ( version_ge "${CUDNN_VERSION}" "8.3.1.22" ); then # When version is greater than or equal to 8.3.1.22 but less than 8.4.1.50 use this format CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%.*}-archive.tar.xz" if ( version_le "${CUDNN_VERSION}" "8.4.1.50" ); then # When cuDNN version is greater than or equal to 8.4.1.50 use this format CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION}-archive.tar.xz" fi # Use legacy url format with one of the tarball name formats depending on version as above CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/local_installers/${CUDA_VERSION}/${CUDNN_TARBALL}" fi if ( version_ge "${CUDA_VERSION}" "12.0" ); then # Use modern url format When cuda version is greater than or equal to 12.0 CUDNN_TARBALL="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%%.*}-archive.tar.xz" CUDNN_TARBALL_URL="${NVIDIA_BASE_DL_URL}/cudnn/redist/cudnn/linux-x86_64/${CUDNN_TARBALL}" fi readonly CUDNN_TARBALL readonly CUDNN_TARBALL_URL # Whether to install NVIDIA-provided or OS-provided GPU driver GPU_DRIVER_PROVIDER=$(get_metadata_attribute 'gpu-driver-provider' 'NVIDIA') readonly GPU_DRIVER_PROVIDER # Whether to install GPU monitoring agent that sends GPU metrics to Stackdriver INSTALL_GPU_AGENT=$(get_metadata_attribute 'install-gpu-agent' 'false') readonly INSTALL_GPU_AGENT NVIDIA_SMI_PATH='/usr/bin' MIG_MAJOR_CAPS=0 IS_MIG_ENABLED=0 CUDA_KEYRING_PKG_INSTALLED="0" function 
install_cuda_keyring_pkg() { if [[ "${CUDA_KEYRING_PKG_INSTALLED}" == "1" ]]; then return ; fi local kr_ver=1.1 curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ "${NVIDIA_REPO_URL}/cuda-keyring_${kr_ver}-1_all.deb" \ -o "${tmpdir}/cuda-keyring.deb" dpkg -i "${tmpdir}/cuda-keyring.deb" rm -f "${tmpdir}/cuda-keyring.deb" CUDA_KEYRING_PKG_INSTALLED="1" } function uninstall_cuda_keyring_pkg() { apt-get purge -yq cuda-keyring CUDA_KEYRING_PKG_INSTALLED="0" } function install_local_cuda_repo() { if test -f "${workdir}/install-local-cuda-repo-complete" ; then return ; fi if [[ "${CUDA_LOCAL_REPO_INSTALLED}" == "1" ]]; then return ; fi CUDA_LOCAL_REPO_INSTALLED="1" pkgname="cuda-repo-${shortname}-${CUDA_VERSION//./-}-local" CUDA_LOCAL_REPO_PKG_NAME="${pkgname}" readonly LOCAL_INSTALLER_DEB="${pkgname}_${CUDA_FULL_VERSION}-${DRIVER_VERSION}-1_amd64.deb" readonly LOCAL_DEB_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_FULL_VERSION}/local_installers/${LOCAL_INSTALLER_DEB}" readonly DIST_KEYRING_DIR="/var/${pkgname}" curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \ "${LOCAL_DEB_URL}" -o "${tmpdir}/${LOCAL_INSTALLER_DEB}" dpkg -i "${tmpdir}/${LOCAL_INSTALLER_DEB}" rm "${tmpdir}/${LOCAL_INSTALLER_DEB}" cp ${DIST_KEYRING_DIR}/cuda-*-keyring.gpg /usr/share/keyrings/ if is_ubuntu ; then curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ "${NVIDIA_REPO_URL}/cuda-${shortname}.pin" \ -o /etc/apt/preferences.d/cuda-repository-pin-600 fi touch "${workdir}/install-local-cuda-repo-complete" } function uninstall_local_cuda_repo(){ apt-get purge -yq "${CUDA_LOCAL_REPO_PKG_NAME}" rm -f "${workdir}/install-local-cuda-repo-complete" } CUDNN_PKG_NAME="" function install_local_cudnn_repo() { if test -f "${workdir}/install-local-cudnn-repo-complete" ; then return ; fi pkgname="cudnn-local-repo-${shortname}-${CUDNN_VERSION%.*}" CUDNN_PKG_NAME="${pkgname}" local_deb_fn="${pkgname}_1.0-1_amd64.deb" 
local_deb_url="${NVIDIA_BASE_DL_URL}/cudnn/${CUDNN_VERSION%.*}/local_installers/${local_deb_fn}" # ${NVIDIA_BASE_DL_URL}/redist/cudnn/v8.6.0/local_installers/11.8/cudnn-linux-x86_64-8.6.0.163_cuda11-archive.tar.xz curl -fsSL --retry-connrefused --retry 3 --retry-max-time 5 \ "${local_deb_url}" -o "${tmpdir}/local-installer.deb" dpkg -i "${tmpdir}/local-installer.deb" rm -f "${tmpdir}/local-installer.deb" cp /var/cudnn-local-repo-*-${CUDNN_VERSION%.*}*/cudnn-local-*-keyring.gpg /usr/share/keyrings touch "${workdir}/install-local-cudnn-repo-complete" } function uninstall_local_cudnn_repo() { apt-get purge -yq "${CUDNN_PKG_NAME}" rm -f "${workdir}/install-local-cudnn-repo-complete" } CUDNN8_LOCAL_REPO_INSTALLED="0" CUDNN8_PKG_NAME="" function install_local_cudnn8_repo() { if test -f "${workdir}/install-local-cudnn8-repo-complete" ; then return ; fi if is_ubuntu ; then cudnn8_shortname="ubuntu2004" elif is_debian ; then cudnn8_shortname="debian11" else return 0 ; fi if is_cuda12 ; then CUDNN8_CUDA_VER=12.0 elif is_cuda11 ; then CUDNN8_CUDA_VER=11.8 else CUDNN8_CUDA_VER="${CUDA_VERSION}" ; fi cudnn_pkg_version="${CUDNN_VERSION}-1+cuda${CUDNN8_CUDA_VER}" pkgname="cudnn-local-repo-${cudnn8_shortname}-${CUDNN_VERSION}" CUDNN8_PKG_NAME="${pkgname}" deb_fn="${pkgname}_1.0-1_amd64.deb" local_deb_fn="${tmpdir}/${deb_fn}" local_deb_url="${NVIDIA_BASE_DL_URL}/redist/cudnn/v${CUDNN_VERSION%.*}/local_installers/${CUDNN8_CUDA_VER}/${deb_fn}" # cache the cudnn package cache_fetched_package "${local_deb_url}" \ "${pkg_bucket}/${CUDNN8_CUDA_VER}/${deb_fn}" \ "${local_deb_fn}" local cudnn_path="$(dpkg -c ${local_deb_fn} | perl -ne 'if(m{(/var/cudnn-local-repo-.*)/\s*$}){print $1}')" # If we are using a ram disk, mount another where we will unpack the cudnn local installer if [[ "${tmpdir}" == "/mnt/shm" ]] && ! 
grep -q '/var/cudnn-local-repo' /proc/mounts ; then mkdir -p "${cudnn_path}" mount -t tmpfs tmpfs "${cudnn_path}" fi dpkg -i "${local_deb_fn}" rm -f "${local_deb_fn}" cp "${cudnn_path}"/cudnn-local-*-keyring.gpg /usr/share/keyrings touch "${workdir}/install-local-cudnn8-repo-complete" } function uninstall_local_cudnn8_repo() { apt-get purge -yq "${CUDNN8_PKG_NAME}" rm -f "${workdir}/install-local-cudnn8-repo-complete" } function install_nvidia_nccl() { if test -f "${workdir}/nccl-complete" ; then return ; fi if is_cuda11 && is_debian12 ; then echo "NCCL cannot be compiled for CUDA 11 on ${_shortname}" return fi local -r nccl_version="${NCCL_VERSION}-1+cuda${CUDA_VERSION}" # https://github.com/NVIDIA/nccl/blob/master/README.md # https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ # Fermi: SM_20, compute_30 # Kepler: SM_30,SM_35,SM_37, compute_30,compute_35,compute_37 # Maxwell: SM_50,SM_52,SM_53, compute_50,compute_52,compute_53 # Pascal: SM_60,SM_61,SM_62, compute_60,compute_61,compute_62 # The following architectures are suppored by open kernel driver # Volta: SM_70,SM_72, compute_70,compute_72 # Ampere: SM_80,SM_86,SM_87, compute_80,compute_86,compute_87 # The following architectures are supported by CUDA v11.8+ # Ada: SM_89, compute_89 # Hopper: SM_90,SM_90a compute_90,compute_90a # Blackwell: SM_100, compute_100 NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72" NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_87,code=sm_87" if version_ge "${CUDA_VERSION}" "11.8" ; then NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_89,code=sm_89" fi if version_ge "${CUDA_VERSION}" "12.0" ; then NVCC_GENCODE="${NVCC_GENCODE} -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90a,code=compute_90a" fi mkdir -p "${workdir}" pushd "${workdir}" test -d "${workdir}/nccl" || { local tarball_fn="v${NCCL_VERSION}-1.tar.gz" curl 
-fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ "https://github.com/NVIDIA/nccl/archive/refs/tags/${tarball_fn}" \ | tar xz mv "nccl-${NCCL_VERSION}-1" nccl } local build_path if is_debuntu ; then build_path="nccl/build/pkg/deb" ; else build_path="nccl/build/pkg/rpm/x86_64" ; fi test -d "${workdir}/nccl/build" || { local build_tarball="nccl-build_${_shortname}_${nccl_version}.tar.gz" local local_tarball="${workdir}/${build_tarball}" local gcs_tarball="${pkg_bucket}/${_shortname}/${build_tarball}" output=$(gsutil ls "${gcs_tarball}" 2>&1 || echo '') if echo "${output}" | grep -q "${gcs_tarball}" ; then # cache hit - unpack from cache echo "cache hit" else # build and cache pushd nccl # https://github.com/NVIDIA/nccl?tab=readme-ov-file#install install_build_dependencies if is_debuntu ; then # These packages are required to build .deb packages from source execute_with_retries \ apt-get install -y -qq build-essential devscripts debhelper fakeroot export NVCC_GENCODE execute_with_retries make -j$(nproc) pkg.debian.build elif is_rocky ; then # These packages are required to build .rpm packages from source execute_with_retries \ dnf -y -q install rpm-build rpmdevtools export NVCC_GENCODE execute_with_retries make -j$(nproc) pkg.redhat.build fi tar czvf "/${local_tarball}" "../${build_path}" gcloud storage cp "${local_tarball}" "${gcs_tarball}" rm "${local_tarball}" make clean popd fi gcloud storage cat "${gcs_tarball}" | tar xz } if is_debuntu ; then dpkg -i "${build_path}/libnccl${NCCL_VERSION%%.*}_${nccl_version}_amd64.deb" "${build_path}/libnccl-dev_${nccl_version}_amd64.deb" elif is_rocky ; then rpm -ivh "${build_path}/libnccl-${nccl_version}.x86_64.rpm" "${build_path}/libnccl-devel-${nccl_version}.x86_64.rpm" fi popd touch "${workdir}/nccl-complete" } function is_src_nvidia() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "NVIDIA" ]] ; ) function is_src_os() ( set +x ; [[ "${GPU_DRIVER_PROVIDER}" == "OS" ]] ; ) function install_nvidia_cudnn() { if test -f 
"${workdir}/cudnn-complete" ; then return ; fi local major_version major_version="${CUDNN_VERSION%%.*}" local cudnn_pkg_version cudnn_pkg_version="${CUDNN_VERSION}-1+cuda${CUDA_VERSION}" if is_rocky ; then if is_cudnn8 ; then execute_with_retries dnf -y -q install \ "libcudnn${major_version}" \ "libcudnn${major_version}-devel" sync elif is_cudnn9 ; then execute_with_retries dnf -y -q install \ "libcudnn9-static-cuda-${CUDA_VERSION%%.*}" \ "libcudnn9-devel-cuda-${CUDA_VERSION%%.*}" sync else echo "Unsupported cudnn version: '${major_version}'" fi elif is_debuntu; then if ge_debian12 && is_src_os ; then apt-get -y install nvidia-cudnn else if is_cudnn8 ; then install_local_cudnn8_repo apt-get update -qq execute_with_retries \ apt-get -y install --no-install-recommends \ "libcudnn8=${cudnn_pkg_version}" \ "libcudnn8-dev=${cudnn_pkg_version}" uninstall_local_cudnn8_repo sync elif is_cudnn9 ; then install_cuda_keyring_pkg apt-get update -qq execute_with_retries \ apt-get -y install --no-install-recommends \ "libcudnn9-cuda-${CUDA_VERSION%%.*}" \ "libcudnn9-dev-cuda-${CUDA_VERSION%%.*}" \ "libcudnn9-static-cuda-${CUDA_VERSION%%.*}" sync else echo "Unsupported cudnn version: [${CUDNN_VERSION}]" fi fi else echo "Unsupported OS: '${_shortname}'" exit 1 fi ldconfig echo "NVIDIA cuDNN successfully installed for ${_shortname}." 
touch "${workdir}/cudnn-complete" } function add_nonfree_components() { if is_src_nvidia ; then return; fi if ge_debian12 ; then # Include in sources file components on which nvidia-open-kernel-dkms depends local -r debian_sources="/etc/apt/sources.list.d/debian.sources" local components="main contrib non-free non-free-firmware" sed -i -e "s/Components: .*$/Components: ${components}/" "${debian_sources}" elif is_debian ; then sed -i -e 's/ main$/ main contrib non-free/' /etc/apt/sources.list fi } function add_repo_nvidia_container_toolkit() { if is_debuntu ; then local kr_path=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg local sources_list_path=/etc/apt/sources.list.d/nvidia-container-toolkit.list # https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html test -f "${kr_path}" || curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \ | gpg --dearmor -o "${kr_path}" test -f "${sources_list_path}" || curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \ | perl -pe "s#deb https://#deb [signed-by=${kr_path}] https://#g" \ | tee "${sources_list_path}" apt-get update else curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo | \ tee /etc/yum.repos.d/nvidia-container-toolkit.repo fi } function add_repo_cuda() { if is_debuntu ; then install_cuda_keyring_pkg # 11.7+, 12.0+ elif is_rocky ; then execute_with_retries "dnf config-manager --add-repo ${NVIDIA_ROCKY_REPO_URL}" fi } function build_driver_from_github() { # non-GPL driver will have been built on rocky8 if is_rocky8 ; then return 0 ; fi pushd "${workdir}" test -d "${workdir}/open-gpu-kernel-modules" || { local tarball_fn="${DRIVER_VERSION}.tar.gz" curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \ "https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/${tarball_fn}" \ | tar xz mv "open-gpu-kernel-modules-${DRIVER_VERSION}" open-gpu-kernel-modules } 
local nvidia_ko_path="$(find /lib/modules/$(uname -r)/ -name 'nvidia.ko')" test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || { local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}.tar.gz" local local_tarball="${workdir}/${build_tarball}" local build_dir if test -v modulus_md5sum && [[ -n "${modulus_md5sum}" ]] then build_dir="${modulus_md5sum}" else build_dir="unsigned" ; fi local gcs_tarball="${pkg_bucket}/${_shortname}/${uname_r}/${build_dir}/${build_tarball}" if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then echo "cache hit" else # build the kernel modules pushd open-gpu-kernel-modules install_build_dependencies if is_cuda11 && is_ubuntu22 ; then echo "Kernel modules cannot be compiled for CUDA 11 on ${_shortname}" exit 1 fi execute_with_retries make -j$(nproc) modules \ > kernel-open/build.log \ 2> kernel-open/build_error.log # Sign kernel modules if [[ -n "${PSN}" ]]; then for module in $(find open-gpu-kernel-modules/kernel-open -name '*.ko'); do "/lib/modules/${uname_r}/build/scripts/sign-file" sha256 \ "${mok_key}" \ "${mok_der}" \ "${module}" done fi make modules_install \ >> kernel-open/build.log \ 2>> kernel-open/build_error.log # Collect build logs and installed binaries tar czvf "${local_tarball}" \ "${workdir}/open-gpu-kernel-modules/kernel-open/"*.log \ $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko') gcloud storage cp "${local_tarball}" "${gcs_tarball}" rm "${local_tarball}" make clean popd fi gcloud storage cat "${gcs_tarball}" | tar -C / -xzv depmod -a } popd } function build_driver_from_packages() { if is_debuntu ; then if [[ -n "$(apt-cache search -n "nvidia-driver-${DRIVER}-server-open")" ]] ; then local pkglist=("nvidia-driver-${DRIVER}-server-open") ; else local pkglist=("nvidia-driver-${DRIVER}-open") ; fi if is_debian ; then pkglist=( "firmware-nvidia-gsp=${DRIVER_VERSION}-1" "nvidia-smi=${DRIVER_VERSION}-1" "nvidia-alternative=${DRIVER_VERSION}-1" "nvidia-kernel-open-dkms=${DRIVER_VERSION}-1" 
"nvidia-kernel-support=${DRIVER_VERSION}-1" "nvidia-modprobe=${DRIVER_VERSION}-1" "libnvidia-ml1=${DRIVER_VERSION}-1" ) fi add_contrib_component apt-get update -qq execute_with_retries apt-get install -y -qq --no-install-recommends dkms #configure_dkms_certs execute_with_retries apt-get install -y -qq --no-install-recommends "${pkglist[@]}" sync elif is_rocky ; then #configure_dkms_certs if execute_with_retries dnf -y -q module install "nvidia-driver:${DRIVER}-dkms" ; then echo "nvidia-driver:${DRIVER}-dkms installed successfully" else execute_with_retries dnf -y -q module install 'nvidia-driver:latest' fi sync fi #clear_dkms_key } function install_nvidia_userspace_runfile() { # This .run file contains NV's OpenGL implementation as well as # nvidia optimized implementations of the gtk+ 2,3 stack(s) not # including glib (https://docs.gtk.org/glib/), and what appears to # be a copy of the source from the kernel-open directory of for # example DRIVER_VERSION=560.35.03 # # https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/560.35.03.tar.gz # # wget https://us.download.nvidia.com/XFree86/Linux-x86_64/560.35.03/NVIDIA-Linux-x86_64-560.35.03.run # sh ./NVIDIA-Linux-x86_64-560.35.03.run -x # this will allow you to review the contents of the package without installing it. 
if test -f "${workdir}/userspace-complete" ; then return ; fi local local_fn="${tmpdir}/userspace.run" cache_fetched_package "${USERSPACE_URL}" \ "${pkg_bucket}/${USERSPACE_FILENAME}" \ "${local_fn}" local runfile_args runfile_args="" local cache_hit="0" local local_tarball if is_rocky8 ; then local nvidia_ko_path="$(find /lib/modules/$(uname -r)/ -name 'nvidia.ko')" test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || { local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}.tar.gz" local_tarball="${workdir}/${build_tarball}" local build_dir if test -v modulus_md5sum && [[ -n "${modulus_md5sum}" ]] then build_dir="${modulus_md5sum}" else build_dir="unsigned" ; fi local gcs_tarball="${pkg_bucket}/${_shortname}/${uname_r}/${build_dir}/${build_tarball}" if gsutil ls "${gcs_tarball}" 2>&1 | grep -q "${gcs_tarball}" ; then cache_hit="1" runfile_args="--no-kernel-modules" echo "cache hit" else install_build_dependencies local signing_options signing_options="" if [[ -n "${PSN}" ]]; then signing_options="--module-signing-hash sha256 \ --module-signing-x509-hash sha256 \ --module-signing-secret-key \"${mok_key}\" \ --module-signing-public-key \"${mok_der}\" \ --module-signing-script \"/lib/modules/${uname_r}/build/scripts/sign-file\" \ " fi runfile_args="--no-dkms ${signing_options}" fi } else runfile_args="--no-kernel-modules" fi execute_with_retries bash "${local_fn}" -e -q \ ${runfile_args} \ --ui=none \ --install-libglvnd \ --tmpdir="${tmpdir}" if is_rocky8 ; then if [[ "${cache_hit}" == "1" ]] ; then gcloud storage cat "${gcs_tarball}" | tar -C / -xzv depmod -a else tar czvf "${local_tarball}" \ /var/log/nvidia-installer.log \ $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko') gcloud storage cp "${local_tarball}" "${gcs_tarball}" fi fi rm -f "${local_fn}" touch "${workdir}/userspace-complete" sync } function install_cuda_runfile() { if test -f "${workdir}/cuda-complete" ; then return ; fi local local_fn="${tmpdir}/cuda.run" cache_fetched_package 
"${NVIDIA_CUDA_URL}" \ "${pkg_bucket}/${CUDA_RUNFILE}" \ "${local_fn}" execute_with_retries bash "${local_fn}" --toolkit --no-opengl-libs --silent --tmpdir="${tmpdir}" rm -f "${local_fn}" touch "${workdir}/cuda-complete" sync } function install_cuda_toolkit() { local cudatk_package=cuda-toolkit if ge_debian12 && is_src_os ; then cudatk_package="${cudatk_package}=${CUDA_FULL_VERSION}-1" elif [[ -n "${CUDA_VERSION}" ]]; then cudatk_package="${cudatk_package}-${CUDA_VERSION//./-}" fi cuda_package="cuda=${CUDA_FULL_VERSION}-1" readonly cudatk_package if is_debuntu ; then # if is_ubuntu ; then execute_with_retries "apt-get install -y -qq --no-install-recommends cuda-drivers-${DRIVER}=${DRIVER_VERSION}-1" ; fi execute_with_retries apt-get install -y -qq --no-install-recommends ${cuda_package} ${cudatk_package} elif is_rocky ; then # rocky9: cuda-11-[7,8], cuda-12-[1..6] execute_with_retries dnf -y -q install "${cudatk_package}" fi sync } function load_kernel_module() { # for some use cases, the kernel module needs to be removed before first use of nvidia-smi for module in nvidia_uvm nvidia_drm nvidia_modeset nvidia ; do rmmod ${module} > /dev/null 2>&1 || echo "unable to rmmod ${module}" done depmod -a modprobe nvidia for suffix in uvm modeset drm; do modprobe "nvidia-${suffix}" done # TODO: if peermem is available, also modprobe nvidia-peermem } function install_cuda(){ if test -f "${workdir}/cuda-repo-complete" ; then return ; fi if ( ge_debian12 && is_src_os ) ; then echo "installed with the driver on ${_shortname}" return 0 fi # The OS package distributions are unreliable install_cuda_runfile # Includes CUDA packages add_repo_cuda touch "${workdir}/cuda-repo-complete" } function install_nvidia_container_toolkit() { local container_runtime_default if command -v docker ; then container_runtime_default='docker' elif command -v containerd ; then container_runtime_default='containerd' elif command -v crio ; then container_runtime_default='crio' else 
container_runtime_default='' ; fi
  CONTAINER_RUNTIME=$(get_metadata_attribute 'container-runtime' "${container_runtime_default}")
  # Nothing to configure when no container runtime is present or requested
  if test -z "${CONTAINER_RUNTIME}" ; then return ; fi

  add_repo_nvidia_container_toolkit
  if is_debuntu ; then
    execute_with_retries apt-get install -y -q nvidia-container-toolkit ; else
    execute_with_retries dnf     install -y -q nvidia-container-toolkit ; fi
  nvidia-ctk runtime configure --runtime="${CONTAINER_RUNTIME}"
  systemctl restart "${CONTAINER_RUNTIME}"
}

# Install NVIDIA GPU driver provided by NVIDIA
# On Debian 12+ with the OS provider the distribution packages are installed
# instead; otherwise the userspace runfile is installed and the kernel
# modules are built from the GitHub sources.
function install_nvidia_gpu_driver() {
  if test -f "${workdir}/gpu-driver-complete" ; then return ; fi
  if ( ge_debian12 && is_src_os ) ; then
    add_nonfree_components
    apt-get update -qq
    apt-get -yq install \
        dkms \
        nvidia-open-kernel-dkms \
        nvidia-open-kernel-support \
        nvidia-smi \
        libglvnd0 \
        libcuda1
    echo "NVIDIA GPU driver provided by ${_shortname} was installed successfully"
    return 0
  fi

  # OS driver packages do not produce reliable driver ; use runfile
  install_nvidia_userspace_runfile

  build_driver_from_github

  echo "NVIDIA GPU driver provided by NVIDIA was installed successfully"
  touch "${workdir}/gpu-driver-complete"
}

# Download and run Google's Ops Agent repository/installation script.
# The script is downloaded into /opt/google; the caller's working directory
# is restored afterwards.
function install_ops_agent(){
  if test -f "${workdir}/ops-agent-complete" ; then return ; fi

  mkdir -p /opt/google
  # pushd/popd rather than cd, so the directory change does not leak out
  pushd /opt/google
  # https://cloud.google.com/stackdriver/docs/solutions/agents/ops-agent/installation
  curl -sSO https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh
  execute_with_retries bash add-google-cloud-ops-agent-repo.sh --also-install
  popd

  touch "${workdir}/ops-agent-complete"
}

# Collects 'gpu_utilization' and 'gpu_memory_utilization' metrics
# Installs the reporting script and its requirements into a virtualenv under
# /opt/gpu-utilization-agent and registers it as a systemd service.
function install_gpu_agent() {
  # Stackdriver GPU agent parameters
#  local -r GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/master/dlvm/gcp-gpu-utilization-metrics'
  local -r GPU_AGENT_REPO_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/ml-on-gcp/refs/heads/master/dlvm/gcp-gpu-utilization-metrics'
  if ( ! command -v pip && is_debuntu ) ; then
    execute_with_retries "apt-get install -y -qq python3-pip"
  fi
  local install_dir=/opt/gpu-utilization-agent
  mkdir -p "${install_dir}"
  curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
    "${GPU_AGENT_REPO_URL}/requirements.txt" -o "${install_dir}/requirements.txt"
  curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
    "${GPU_AGENT_REPO_URL}/report_gpu_metrics.py" \
    | sed -e 's/-u --format=/--format=/' \
    | dd status=none of="${install_dir}/report_gpu_metrics.py"
  local venv="${install_dir}/venv"
  python3 -m venv "${venv}"
(
  # Subshell keeps the virtualenv activation out of the calling shell
  source "${venv}/bin/activate"
  python3 -m pip install --upgrade pip
  execute_with_retries python3 -m pip install -r "${install_dir}/requirements.txt"
)
  sync

  # Generate GPU service.
  # The delimiter is unquoted on purpose: ${venv} and ${install_dir} must be
  # expanded into the unit file when it is written.
  cat <<EOF >/lib/systemd/system/gpu-utilization-agent.service
[Unit]
Description=GPU Utilization Metric Agent

[Service]
Type=simple
PIDFile=/run/gpu_agent.pid
ExecStart=/bin/bash --login -c '. ${venv}/bin/activate ; python3 "${install_dir}/report_gpu_metrics.py"'
User=root
Group=root
WorkingDirectory=/
Restart=always

[Install]
WantedBy=multi-user.target
EOF
  # Reload systemd manager configuration
  systemctl daemon-reload
  # Enable gpu-utilization-agent service
  systemctl --no-reload --now enable gpu-utilization-agent.service
}

# Put the GPUs into EXCLUSIVE_PROCESS compute mode unless the Spark version
# reported by spark-submit starts with "3.".
function configure_gpu_exclusive_mode() {
  # check if running spark 3, if not, enable GPU exclusive mode
  local spark_version
  spark_version=$(spark-submit --version 2>&1 | sed -n 's/.*version[[:blank:]]\+\([0-9]\+\.[0-9]\).*/\1/p' | head -n1)
  if [[ ${spark_version} != 3.* ]]; then
    # include exclusive mode on GPU
    nvidia-smi -c EXCLUSIVE_PROCESS
  fi
}

function fetch_mig_scripts() {
  mkdir -p /usr/local/yarn-mig-scripts
  sudo chmod 755 /usr/local/yarn-mig-scripts
  wget -P /usr/local/yarn-mig-scripts/ https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/nvidia-smi
  wget -P /usr/local/yarn-mig-scripts/
https://raw.githubusercontent.com/NVIDIA/spark-rapids-examples/branch-22.10/examples/MIG-Support/yarn-unpatched/scripts/mig2gpu.sh sudo chmod 755 /usr/local/yarn-mig-scripts/* } function configure_gpu_script() { # Download GPU discovery script local -r spark_gpu_script_dir='/usr/lib/spark/scripts/gpu' mkdir -p ${spark_gpu_script_dir} # need to update the getGpusResources.sh script to look for MIG devices since if multiple GPUs nvidia-smi still # lists those because we only disable the specific GIs via CGROUPs. Here we just create it based off of: # https://raw.githubusercontent.com/apache/spark/master/examples/src/main/scripts/getGpusResources.sh local -r gpus_resources_script="${spark_gpu_script_dir}/getGpusResources.sh" cat > "${gpus_resources_script}" <<'EOF' #!/usr/bin/env bash # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# ADDRS=$(nvidia-smi --query-gpu=index --format=csv,noheader | perl -e 'print(join(q{,},map{chomp; qq{"$_"}}))') echo {\"name\": \"gpu\", \"addresses\":[${ADDRS}]} EOF chmod a+rx "${gpus_resources_script}" local spark_defaults_conf="/etc/spark/conf.dist/spark-defaults.conf" if version_ge "${SPARK_VERSION}" "3.0" ; then local gpu_count gpu_count="$(lspci | grep NVIDIA | wc -l)" local executor_cores executor_cores="$(nproc | perl -MPOSIX -pe '$_ = POSIX::floor( $_ * 0.75 ); $_-- if $_ % 2')" local executor_memory executor_memory_gb="$(awk '/^MemFree/ {print $2}' /proc/meminfo | perl -MPOSIX -pe '$_ *= 0.75; $_ = POSIX::floor( $_ / (1024*1024) )')" local task_cpus=2 local gpu_amount gpu_amount="$(echo $executor_cores | perl -pe "\$_ = ( ${gpu_count} / (\$_ / ${task_cpus}) )")" cat >>"${spark_defaults_conf}" <> "${HADOOP_CONF_DIR}/container-executor.cfg" printf 'export MIG_AS_GPU_ENABLED=1\n' >> "${HADOOP_CONF_DIR}/yarn-env.sh" printf 'export ENABLE_MIG_GPUS_FOR_CGROUPS=1\n' >> "${HADOOP_CONF_DIR}/yarn-env.sh" else printf '\n[gpu]\nmodule.enabled=true\n[cgroups]\nroot=/sys/fs/cgroup\nyarn-hierarchy=yarn\n' >> "${HADOOP_CONF_DIR}/container-executor.cfg" fi # Configure a systemd unit to ensure that permissions are set on restart cat >/etc/systemd/system/dataproc-cgroup-device-permissions.service<&2 ; return 0 elif ! 
eval "${nvsmi} > /dev/null" ; then echo "nvidia-smi fails" >&2 ; return 0
  else nvsmi_works="1" ; fi

  # (tail of the nvidia-smi wrapper; its opening lines precede this span)
  # The output of `-L` is cached in a file and replayed on later calls.
  if [[ "$1" == "-L" ]] ; then
    local NV_SMI_L_CACHE_FILE="/var/run/nvidia-smi_-L.txt"
    if [[ -f "${NV_SMI_L_CACHE_FILE}" ]]; then cat "${NV_SMI_L_CACHE_FILE}"
    else "${nvsmi}" $* | tee "${NV_SMI_L_CACHE_FILE}" ; fi

    return 0
  fi

  "${nvsmi}" $*
}

# Install the compiler / kernel headers needed to build kernel modules.
# Globals:  workdir, uname_r, install_log (read)
# Effects:  writes ${workdir}/build-dependencies-complete once the step has
#           succeeded, so later invocations return immediately.
function install_build_dependencies() {
  if test -f "${workdir}/build-dependencies-complete" ; then return ; fi

  if is_debuntu ; then
    if is_ubuntu22 && is_cuda12 ; then
      # On ubuntu22, the default compiler does not build some kernel module versions
      # https://forums.developer.nvidia.com/t/linux-new-kernel-6-5-0-14-ubuntu-22-04-can-not-compile-nvidia-display-card-driver/278553/11
      execute_with_retries apt-get install -y -qq gcc-12
      update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11
      update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 12
      update-alternatives --set gcc /usr/bin/gcc-12
    fi

  elif is_rocky ; then
    execute_with_retries dnf -y -q install gcc

    # First attempt: plain install, with errexit suspended so the failure
    # output can be inspected.
    local dnf_cmd="dnf -y -q install kernel-devel-${uname_r}"
    set +e
    eval "${dnf_cmd}" > "${install_log}" 2>&1
    local retval="$?"
    set -e

    # Only fall back / retry when the first attempt failed.  On success we
    # fall through to the completion marker below (the previous early
    # `return` skipped the marker, so the step was repeated on every call).
    if [[ "${retval}" != "0" ]] ; then
      if grep -q 'Unable to find a match: kernel-devel-' "${install_log}" ; then
        # this kernel-devel may have been migrated to the vault
        local os_ver
        os_ver="$(echo "${uname_r}" | perl -pe 's/.*el(\d+_\d+)\..*/$1/; s/_/./')"
        local vault="https://download.rockylinux.org/vault/rocky/${os_ver}"
        dnf_cmd="$(echo dnf -y -q --setopt=localpkg_gpgcheck=1 install \
          "${vault}/BaseOS/x86_64/os/Packages/k/kernel-${uname_r}.rpm" \
          "${vault}/BaseOS/x86_64/os/Packages/k/kernel-core-${uname_r}.rpm" \
          "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-${uname_r}.rpm" \
          "${vault}/BaseOS/x86_64/os/Packages/k/kernel-modules-core-${uname_r}.rpm" \
          "${vault}/AppStream/x86_64/os/Packages/k/kernel-devel-${uname_r}.rpm"
        )"
      fi

      execute_with_retries "${dnf_cmd}"
    fi
  fi
  touch "${workdir}/build-dependencies-complete"
}

# Install the small set of OS packages this script relies on.
function install_dependencies() {
  pkg_list="pciutils screen"
  # ${pkg_list} is intentionally unquoted: it is a space-separated list.
  if is_debuntu ; then execute_with_retries apt-get -y -q install ${pkg_list}
  elif is_rocky ; then execute_with_retries dnf     -y -q install ${pkg_list} ; fi
}

# Initialise GPU-related globals: RAPIDS_RUNTIME (from metadata, default
# SPARK), DEFAULT_XGBOOST_VERSION, nvsmi_works and, when a CUDA 11/12 check
# matches, gcc_ver.
function prepare_gpu_env(){
  # Verify SPARK compatibility
  RAPIDS_RUNTIME=$(get_metadata_attribute 'rapids-runtime' 'SPARK')

  readonly DEFAULT_XGBOOST_VERSION="1.7.6" # try 2.1.1
  nvsmi_works="0"

  if   is_cuda11 ; then gcc_ver="11"
  elif is_cuda12 ; then gcc_ver="12" ; fi
}

# Hold all NVIDIA-related packages from upgrading unintentionally or through
# services like unattended-upgrades.
# Users should run apt-mark unhold before they wish to upgrade these packages.
function hold_nvidia_packages() {
  # The patterns are quoted so that apt-mark receives them verbatim; unquoted,
  # the shell would expand them against files in the current directory.
  apt-mark hold 'nvidia-*'
  apt-mark hold 'libnvidia-*'
  if dpkg -l | grep -q "xserver-xorg-video-nvidia"; then
    apt-mark hold 'xserver-xorg-video-nvidia*'
  fi
}

# Create MIG GPU instances (and compute instances, -C) using the profile
# list from the MIG_CGI metadata attribute, or 9,9 when it is not set.
function configure_mig_cgi() {
  if (/usr/share/google/get_metadata_value attributes/MIG_CGI); then
    META_MIG_CGI_VALUE=$(/usr/share/google/get_metadata_value attributes/MIG_CGI)
    nvidia-smi mig -cgi "${META_MIG_CGI_VALUE}" -C
  else
    # Dataproc only supports A100's right now split in 2 if not specified
    nvidia-smi mig -cgi 9,9 -C
  fi
}

# Switch MIG mode on via nvidia-smi.
function enable_mig() {
  nvidia-smi -mig 1
}

# Stage the module-signing key pair for DKMS.
# Globals:  PSN, CA_TMPDIR, mok_key, mok_der (read); modulus_md5sum (written)
# When ${CA_TMPDIR}/db.rsa already exists it is checked against the
# modulus_md5sum metadata attribute (mismatches are only reported) and
# re-linked; otherwise the key and certificate are fetched with
# `gcloud secrets versions access`.
function configure_dkms_certs() {
  if test -v PSN && [[ -z "${PSN}" ]]; then
    echo "No signing secret provided. skipping";
    return 0
  fi

  mkdir -p "${CA_TMPDIR}"

  # If the private key exists, verify it
  if [[ -f "${CA_TMPDIR}/db.rsa" ]]; then
    echo "Private key material exists"

    local expected_modulus_md5sum
    expected_modulus_md5sum=$(get_metadata_attribute modulus_md5sum)
    if [[ -n "${expected_modulus_md5sum}" ]]; then
      modulus_md5sum="${expected_modulus_md5sum}"

      # Verify that the private key's modulus md5sum matches the expected md5sum
      if [[ "${modulus_md5sum}" != "$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')" ]]; then
        echo "unmatched rsa key"
      fi

      # Verify that the certificate's modulus md5sum matches the expected md5sum
      if [[ "${modulus_md5sum}" != "$(openssl x509 -noout -modulus -in "${mok_der}" | openssl md5 | awk '{print $2}')" ]]; then
        echo "unmatched x509 cert"
      fi
    else
      modulus_md5sum="$(openssl rsa -noout -modulus -in "${CA_TMPDIR}/db.rsa" | openssl md5 | awk '{print $2}')"
    fi
    ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}"

    return
  fi

  # Retrieve cloud secrets keys
  local sig_priv_secret_name
  sig_priv_secret_name="${PSN}"
  local sig_pub_secret_name
  sig_pub_secret_name="$(get_metadata_attribute public_secret_name)"
  local sig_secret_project
  sig_secret_project="$(get_metadata_attribute secret_project)"
  local sig_secret_version
  sig_secret_version="$(get_metadata_attribute secret_version)"

  # If metadata values are not set, do not write mok keys
  if [[ -z "${sig_priv_secret_name}" ]]; then return 0 ; fi

  # Write private material to volatile storage
  gcloud secrets versions access "${sig_secret_version}" \
         --project="${sig_secret_project}" \
         --secret="${sig_priv_secret_name}" \
      | dd status=none of="${CA_TMPDIR}/db.rsa"

  # Write public material to volatile storage
  gcloud secrets versions access "${sig_secret_version}" \
         --project="${sig_secret_project}" \
         --secret="${sig_pub_secret_name}" \
      | base64 --decode \
      | dd status=none of="${CA_TMPDIR}/db.der"

  local mok_directory
  mok_directory="$(dirname "${mok_key}")"
  mkdir -p "${mok_directory}"

  # symlink private key and copy public cert from volatile storage to DKMS directory
  ln -sf "${CA_TMPDIR}/db.rsa" "${mok_key}"
  cp  -f "${CA_TMPDIR}/db.der" "${mok_der}"

  modulus_md5sum="$(openssl rsa -noout -modulus -in "${mok_key}" | openssl md5 | awk '{print $2}')"
}

# Remove the staged signing material (temporary CA directory and key link).
# Does nothing when no private secret name was provided.
function clear_dkms_key {
  # ${PSN:-} keeps this safe if it runs before PSN has been assigned.
  if [[ -z "${PSN:-}" ]]; then
    echo "No signing secret provided. skipping" >&2
    return 0
  fi
  rm -rf "${CA_TMPDIR}" "${mok_key}"
}

# Read the Secure Boot state and the private_secret_name metadata attribute,
# refuse unsupported combinations, then stage the signing certificates.
function check_secure_boot() {
  local SECURE_BOOT="disabled"
  SECURE_BOOT=$(mokutil --sb-state|awk '{print $2}')

  PSN="$(get_metadata_attribute private_secret_name)"
  readonly PSN

  if [[ "${SECURE_BOOT}" == "enabled" ]] && le_debian11 ; then
    echo "Error: Secure Boot is not supported on Debian before image 2.2. Please disable Secure Boot while creating the cluster."
    exit 1
  elif [[ "${SECURE_BOOT}" == "enabled" ]] && [[ -z "${PSN}" ]]; then
    echo "Secure boot is enabled, but no signing material provided."
echo "Please either disable secure boot or provide signing material as per" echo "https://github.com/GoogleCloudDataproc/custom-images/tree/master/examples/secure-boot" return 1 fi CA_TMPDIR="$(mktemp -u -d -p /run/tmp -t ca_dir-XXXX)" readonly CA_TMPDIR if is_ubuntu ; then mok_key=/var/lib/shim-signed/mok/MOK.priv mok_der=/var/lib/shim-signed/mok/MOK.der else mok_key=/var/lib/dkms/mok.key mok_der=/var/lib/dkms/mok.pub ; fi configure_dkms_certs } function exit_handler() { # Purge private key material until next grant clear_dkms_key set +ex echo "Exit handler invoked" # Clear pip cache pip cache purge || echo "unable to purge pip cache" # If system memory was sufficient to mount memory-backed filesystems if [[ "${tmpdir}" == "/mnt/shm" ]] ; then # remove the tmpfs pip cache-dir pip config unset global.cache-dir || echo "unable to unset global pip cache" # Clean up shared memory mounts for shmdir in /var/cache/apt/archives /var/cache/dnf /mnt/shm /tmp /var/cudnn-local ; do if ( grep -q "^tmpfs ${shmdir}" /proc/mounts && ! grep -q "^tmpfs ${shmdir}" /etc/fstab ) ; then umount -f ${shmdir} fi done # restart services stopped during preparation stage # systemctl list-units | perl -n -e 'qx(systemctl start $1) if /^.*? ((hadoop|knox|hive|mapred|yarn|hdfs)\S*).service/' fi if is_debuntu ; then # Clean up OS package cache apt-get -y -qq clean apt-get -y -qq -o DPkg::Lock::Timeout=60 autoremove # re-hold systemd package if ge_debian12 ; then apt-mark hold systemd libsystemd0 ; fi hold_nvidia_packages else dnf clean all fi # print disk usage statistics for large components if is_ubuntu ; then du -hs \ /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \ /usr/lib \ /opt/nvidia/* \ /usr/local/cuda-1?.? \ /opt/conda/miniconda3 | sort -h elif is_debian ; then du -x -hs \ /usr/lib/{pig,hive,hadoop,jvm,spark,google-cloud-sdk,x86_64-linux-gnu} \ /var/lib/{docker,mysql,} \ /usr/lib \ /opt/nvidia/* \ /usr/local/cuda-1?.? 
\ /opt/{conda,google-cloud-ops-agent,install-nvidia,} \ /usr/bin \ /usr \ /var \ / 2>/dev/null | sort -h else du -hs \ /var/lib/docker \ /usr/lib/{pig,hive,hadoop,firmware,jvm,spark,atlas} \ /usr/lib64/google-cloud-sdk \ /usr/lib \ /opt/nvidia/* \ /usr/local/cuda-1?.? \ /opt/conda/miniconda3 fi # Process disk usage logs from installation period rm -f /run/keep-running-df sync sleep 5.01s # compute maximum size of disk during installation # Log file contains logs like the following (minus the preceeding #): #Filesystem 1K-blocks Used Available Use% Mounted on #/dev/vda2 7096908 2611344 4182932 39% / df / | tee -a "/run/disk-usage.log" perl -e '@siz=( sort { $a => $b } map { (split)[2] =~ /^(\d+)/ } grep { m:^/: } ); $max=$siz[0]; $min=$siz[-1]; $inc=$max-$min; print( " samples-taken: ", scalar @siz, $/, "maximum-disk-used: $max", $/, "minimum-disk-used: $min", $/, " increased-by: $inc", $/ )' < "/run/disk-usage.log" echo "exit_handler has completed" # zero free disk space if [[ -n "$(get_metadata_attribute creating-image)" ]]; then dd if=/dev/zero of=/zero sync sleep 3s rm -f /zero fi return 0 } function prepare_to_install(){ # Verify OS compatability and Secure boot state check_os check_secure_boot prepare_gpu_env OS_NAME="$(lsb_release -is | tr '[:upper:]' '[:lower:]')" readonly OS_NAME # node role ROLE="$(get_metadata_attribute dataproc-role)" readonly ROLE workdir=/opt/install-dpgce tmpdir=/tmp/ temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)" readonly temp_bucket readonly pkg_bucket="gs://${temp_bucket}/dpgce-packages" uname_r=$(uname -r) readonly uname_r readonly bdcfg="/usr/local/bin/bdconfig" export DEBIAN_FRONTEND=noninteractive mkdir -p "${workdir}" trap exit_handler EXIT set_proxy mount_ramdisk readonly install_log="${tmpdir}/install.log" if test -f "${workdir}/prepare-complete" ; then return ; fi repair_old_backports if is_debuntu ; then clean_up_sources_lists apt-get update -qq apt-get -y clean apt-get -o DPkg::Lock::Timeout=60 -y autoremove 
if ge_debian12 ; then apt-mark unhold systemd libsystemd0 ; fi hold_nvidia_packages else dnf clean all fi # zero free disk space if [[ -n "$(get_metadata_attribute creating-image)" ]]; then ( set +e time dd if=/dev/zero of=/zero status=none ; sync ; sleep 3s ; rm -f /zero ) fi install_dependencies # Monitor disk usage in a screen session df / > "/run/disk-usage.log" touch "/run/keep-running-df" screen -d -m -LUS keep-running-df \ bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done" touch "${workdir}/prepare-complete" } function main() { # default MIG to on when this script is used META_MIG_VALUE=$(get_metadata_attribute 'ENABLE_MIG' "1") if (lspci | grep -q NVIDIA); then if [[ $META_MIG_VALUE -ne 0 ]]; then # if the first invocation, the NVIDIA drivers and tools are not installed if [[ -f "/usr/bin/nvidia-smi" ]]; then # check to see if we already enabled mig mode and rebooted so we don't end # up in infinite reboot loop NUM_GPUS_WITH_DIFF_MIG_MODES=`/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | uniq | wc -l` if [[ $NUM_GPUS_WITH_DIFF_MIG_MODES -eq 1 ]]; then if (/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | grep Enabled); then echo "MIG is enabled on all GPUs, configuring instances" configure_mig_cgi exit 0 else echo "GPUs present but MIG is not enabled" fi else echo "More than 1 GPU with MIG configured differently between them" fi fi fi install_nvidia_gpu_driver if [[ ${META_MIG_VALUE} -ne 0 ]]; then enable_mig NUM_GPUS_WITH_DIFF_MIG_MODES="$(/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | uniq | wc -l)" if [[ NUM_GPUS_WITH_DIFF_MIG_MODES -eq 1 ]]; then if (/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | grep Enabled); then echo "MIG is fully enabled, we don't need to reboot" configure_mig_cgi else echo "MIG is configured on but NOT enabled. Failing" exit 1 fi else echo "MIG is NOT enabled all on GPUs. 
Failing" exit 1 fi else echo "Not enabling MIG" fi fi } prepare_to_install main ====== Filename: ./examples/secure-boot/build-current-images.sh ====== #!/bin/bash # Copyright 2024 Google LLC and contributors # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS-IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # This script creates a custom image pre-loaded with # # GPU drivers + cuda + rapids + cuDNN + nccl + tensorflow + pytorch + ipykernel + numba # To run the script, the following will bootstrap # # git clone git@github.com:GoogleCloudDataproc/custom-images # cd custom-images # git checkout 2025.02 # cp examples/secure-boot/env.json.sample env.json # vi env.json # docker build -f Dockerfile -t custom-images-builder:latest . # time docker run -it custom-images-builder:latest bash examples/secure-boot/build-current-images.sh set -ex function execute_with_retries() ( set +x local -r cmd="$*" local install_log="${tmpdir}/install.log" for ((i = 0; i < 3; i++)); do set -x eval "$cmd" > "${install_log}" 2>&1 && retval=$? || { retval=$? ; cat "${install_log}" ; } set +x if [[ $retval == 0 ]] ; then return 0 ; fi sleep 5 done return 1 ) function configure_service_account() { # Create service account if gcloud iam service-accounts list --filter email="${GSA}" 2>&1 | grep -q 'Listed 0 items.' 
; then # Create service account for this purpose echo "creating pre-init customization service account ${GSA}" gcloud iam service-accounts create "${SA_NAME}" \ --description="Service account for pre-init customization" \ --display-name="${SA_NAME}" fi if [[ -d tls ]] ; then mv tls "tls-$(date +%s)" ; fi eval "$(bash examples/secure-boot/create-key-pair.sh)" execute_with_retries gcloud projects add-iam-policy-binding "${PROJECT_ID}" \ --member="serviceAccount:${GSA}" \ --role="roles/dataproc.worker" \ --condition=None # Grant the service account access to buckets in this project # TODO: this is over-broad and should be limited only to the buckets # used by these clusters for storage_object_role in 'User' 'Creator' 'Viewer' ; do execute_with_retries gcloud projects add-iam-policy-binding "${PROJECT_ID}" \ --member="serviceAccount:${GSA}" \ --role="roles/storage.object${storage_object_role}" \ --condition=None done for secret in "${public_secret_name}" "${private_secret_name}" ; do for sm_role in 'viewer' 'secretAccessor' ; do # Grant the service account permission to list the secret execute_with_retries gcloud secrets -q add-iam-policy-binding "${secret}" \ --member="serviceAccount:${GSA}" \ --role="roles/secretmanager.${sm_role}" \ --condition=None done done execute_with_retries gcloud projects add-iam-policy-binding "${PROJECT_ID}" \ --member="serviceAccount:${GSA}" \ --role=roles/compute.instanceAdmin.v1 \ --condition=None execute_with_retries gcloud iam service-accounts add-iam-policy-binding "${GSA}" \ --member="serviceAccount:${GSA}" \ --role=roles/iam.serviceAccountUser \ --condition=None } function revoke_bindings() { execute_with_retries gcloud projects remove-iam-policy-binding "${PROJECT_ID}" \ --member="serviceAccount:${GSA}" \ --role="roles/dataproc.worker" \ --condition=None # Revoke the service account's access to buckets in this project for storage_object_role in 'User' 'Creator' 'Viewer' ; do execute_with_retries gcloud projects 
remove-iam-policy-binding "${PROJECT_ID}" \ --member="serviceAccount:${GSA}" \ --role="roles/storage.object${storage_object_role}" \ --condition=None done for secret in "${public_secret_name}" "${private_secret_name}" ; do # Revoke the service account's permission to list and access the secret for sm_role in 'viewer' 'secretAccessor' ; do execute_with_retries gcloud secrets -q remove-iam-policy-binding "${secret}" \ --member="serviceAccount:${GSA}" \ --role="roles/secretmanager.${sm_role}" \ --condition=None done done execute_with_retries gcloud projects remove-iam-policy-binding "${PROJECT_ID}" \ --member="serviceAccount:${GSA}" \ --role=roles/compute.instanceAdmin.v1 \ --condition=None execute_with_retries gcloud iam service-accounts remove-iam-policy-binding "${GSA}" \ --member="serviceAccount:${GSA}" \ --role=roles/iam.serviceAccountUser \ --condition=None } export DOMAIN="$(jq -r .DOMAIN env.json)" export PROJECT_ID="$(jq -r .PROJECT_ID env.json)" export PURPOSE="$(jq -r .PURPOSE env.json)" export BUCKET="$(jq -r .BUCKET env.json)" export SECRET_NAME="$(jq -r .SECRET_NAME env.json)" export REGION="$(jq -r .REGION env.json)" export ZONE="$(jq -r .ZONE env.json)" export PRINCIPAL_USER="$(jq -r .PRINCIPAL env.json)" export PRINCIPAL_DOMAIN="$(jq -r .DOMAIN env.json)" export PRINCIPAL="${PRINCIPAL_USER}@${PRINCIPAL_DOMAIN}" echo -n "setting gcloud config..." 
gcloud config set project "${PROJECT_ID}"
gcloud config set account "${PRINCIPAL}"
gcloud auth login

# Only touch the gcloud region / zone settings when they differ from env.json
CURRENT_COMPUTE_REGION="$(gcloud config get compute/region)"
if [[ "${CURRENT_COMPUTE_REGION}" != "${REGION}" ]]; then
  echo "setting compute region"
  gcloud config set compute/region "${REGION}"
fi
CURRENT_DATAPROC_REGION="$(gcloud config get dataproc/region)"
if [[ "${CURRENT_DATAPROC_REGION}" != "${REGION}" ]]; then
  echo "setting dataproc region"
  gcloud config set dataproc/region "${REGION}"
fi
CURRENT_COMPUTE_ZONE="$(gcloud config get compute/zone)"
if [[ "${CURRENT_COMPUTE_ZONE}" != "${ZONE}" ]]; then
  echo "setting compute zone"
  gcloud config set compute/zone "${ZONE}"
fi

SA_NAME="sa-${PURPOSE}"

# A project id of the form "<prefix>:<name>" yields "<name>.<prefix>" in the
# service-account e-mail domain
if [[ "${PROJECT_ID}" =~ ":" ]] ; then
  GSA="${SA_NAME}@${PROJECT_ID#*:}.${PROJECT_ID%:*}.iam.gserviceaccount.com"
else
  GSA="${SA_NAME}@${PROJECT_ID}.iam.gserviceaccount.com"
fi

readonly timestamp="$(date "+%Y%m%d-%H%M%S")"
export timestamp

export tmpdir=/tmp/${timestamp};
mkdir -p "${tmpdir}"
configure_service_account

# screen session name
session_name="build-current-images"

export ZONE="$(jq -r .ZONE env.json)"
# Snapshots of existing instances / images, written to ${tmpdir}
gcloud compute instances list --zones "${ZONE}" --format json > "${tmpdir}/instances.json"
gcloud compute images    list                   --format json > "${tmpdir}/images.json"

# Run generation scripts simultaneously for each dataproc image version
screen -L -US "${session_name}" -c examples/secure-boot/pre-init.screenrc

# Print one summary line per finished build, produced by genline.pl from the
# df table found in each build's startup-script.log.
function find_disk_usage() {
  #  grep maximum-disk-used /tmp/custom-image-*/logs/startup-script.log
  grep -H 'Customization script' /tmp/custom-image-*/logs/workflow.log
  echo '# DP_IMG_VER RECOMMENDED_DISK_SIZE DSK_SZ D_USED D_FREE D%F PURPOSE'
  #  workflow_log=/tmp/custom-image-dataproc-2-0-deb10-20250424-232955-tf-20250425-230559/logs/workflow.log
  for workflow_log in $(grep -Hl "Customization script" /tmp/custom-image-*/logs/workflow.log) ; do
    startup_log="${workflow_log/workflow/startup-script}"
    grep -v '^\[' "${startup_log}" \
      | grep -A20 'Filesystem.*Avail' | tail -20 \
      | perl examples/secure-boot/genline.pl "${startup_log}"
  done
}

revoke_bindings
====== Filename: ./examples/secure-boot/no-customization.sh ======
#!/bin/bash

# EXIT trap: stop the background df sampler, summarise the samples collected
# in /run/disk-usage.log and, when the creating-image metadata attribute is
# set, zero the free disk space.  Always returns 0.
function exit_handler() {
  set +ex
  echo "Exit handler invoked"

  # Process disk usage logs from installation period
  rm -f /run/keep-running-df
  sync
  sleep 5.01s
  # compute maximum size of disk during installation
  # Log file contains logs like the following (minus the preceding #):
#Filesystem     1K-blocks    Used Available Use% Mounted on
#/dev/vda2        7096908 2611344   4182932  39% /
  df / | tee -a "/run/disk-usage.log"

  # The device rows (starting with "/") are read from STDIN, which is
  # redirected from the log.  "Used" values are sorted numerically in
  # DESCENDING order so $siz[0] is the maximum and $siz[-1] the minimum; the
  # comparator must be `<=>` (`=>` is the fat comma and does not compare).
  perl -e '($first, @samples) = grep { m:^/: } <STDIN>;
           unshift(@samples,$first); $final=$samples[-1];
           ($starting)=(split(/\s+/,$first))[2] =~ /^(\d+)/;
             ($ending)=(split(/\s+/,$final))[2] =~ /^(\d+)/;
           @siz=( sort { $b <=> $a }
                   map { (split)[2] =~ /^(\d+)/ } @samples );
$max=$siz[0]; $min=$siz[-1]; $inc=$max-$starting;
print( " samples-taken: ", scalar @siz, $/,
       "starting-disk-used: $starting", $/,
       " ending-disk-used: $ending", $/,
       " maximum-disk-used: $max", $/,
       " minimum-disk-used: $min", $/,
       " increased-by: $inc", $/ )' < "/run/disk-usage.log"

  # zero free disk space
  if [[ -n "$(get_metadata_attribute creating-image)" ]]; then
    dd if=/dev/zero of=/zero
    sync
    sleep 3s
    rm -f /zero
  fi

  echo "exit_handler has completed"
  return 0
}

# Monitor disk usage in a screen session
df / | tee "/run/disk-usage.log"
touch "/run/keep-running-df"
screen -d -m -LUS keep-running-df \
  bash -c "while [[ -f /run/keep-running-df ]] ; do df / | tee -a /run/disk-usage.log ; sleep 5s ; done"

trap exit_handler EXIT

sleep 30s

echo "exit handler will be triggered after this operation."
====== Filename: ./examples/secure-boot/pre-init.sh ======
#!/bin/bash
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS-IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # This script creates a custom image with the script specified loaded # # pre-init.sh function version_ge(){ [[ "$1" = "$(echo -e "$1\n$2"|sort -V|tail -n1)" ]]; } function version_gt(){ [[ "$1" = "$2" ]]&& return 1 || version_ge "$1" "$2";} function version_le(){ [[ "$1" = "$(echo -e "$1\n$2"|sort -V|head -n1)" ]]; } function version_lt(){ [[ "$1" = "$2" ]]&& return 1 || version_le "$1" "$2";} set -e IMAGE_VERSION="$1" if [[ -z "${IMAGE_VERSION}" ]] ; then IMAGE_VERSION="$(jq -r .IMAGE_VERSION env.json)" ; fi PROJECT_ID="$(jq -r .PROJECT_ID env.json)" PURPOSE="$(jq -r .PURPOSE env.json)" BUCKET="$(jq -r .BUCKET env.json)" TEMP_BUCKET="$(jq -r .TEMP_BUCKET env.json)" ZONE="$(jq -r .ZONE env.json)" SUBNET="$(jq -r .SUBNET env.json)" HIVE_NAME="$(jq -r .HIVE_INSTANCE_NAME env.json)" HIVEDB_PW_URI="$(jq -r .DB_HIVE_PASSWORD_URI env.json)" SECRET_NAME="$(jq -r .SECRET_NAME env.json)" KMS_KEY_URI="$(jq -r .KMS_KEY_URI env.json)" PRINCIPAL_USER="$(jq -r .PRINCIPAL env.json)" PRINCIPAL_DOMAIN="$(jq -r .DOMAIN env.json)" PRINCIPAL="${PRINCIPAL_USER}@${PRINCIPAL_DOMAIN}" gcloud config set project "${PROJECT_ID}" gcloud config set account "${PRINCIPAL}" region="$(echo "${ZONE}" | perl -pe 's/-[a-z]+$//')" custom_image_zone="${ZONE}" disk_size_gb="30" # greater than or equal to 30 (32 for rocky8) SA_NAME="sa-${PURPOSE}" GSA="${SA_NAME}@${PROJECT_ID}.iam.gserviceaccount.com" # If no OS family specified, default to debian if [[ "${IMAGE_VERSION}" != *-* ]] ; then case "${IMAGE_VERSION}" in "2.3" ) dataproc_version="${IMAGE_VERSION}-debian12" ;; "2.2" ) 
dataproc_version="${IMAGE_VERSION}-debian12" ;; "2.1" ) dataproc_version="${IMAGE_VERSION}-debian11" ;; "2.0" ) dataproc_version="${IMAGE_VERSION}-debian10" ;; "1.5" ) dataproc_version="${IMAGE_VERSION}-debian10" ;; esac else dataproc_version="${IMAGE_VERSION}" fi CUDA_VERSION="12.4.1" case "${dataproc_version}" in "1.5-debian10" ) CUDA_VERSION="11.5.2" ; short_dp_ver=1.5-deb10 ; disk_size_gb="20";; "2.0-debian10" ) CUDA_VERSION="12.1.1" ; short_dp_ver=2.0-deb10 ;; "2.0-rocky8" ) CUDA_VERSION="12.1.1" ; short_dp_ver=2.0-roc8 ; disk_size_gb="32";; "2.0-ubuntu18" ) CUDA_VERSION="12.1.1" ; short_dp_ver=2.0-ubu18 ;; "2.1-debian11" ) CUDA_VERSION="12.4.1" ; short_dp_ver=2.1-deb11 ;; "2.1-rocky8" ) CUDA_VERSION="12.4.1" ; short_dp_ver=2.1-roc8 ;; "2.1-ubuntu20" ) CUDA_VERSION="12.4.1" ; short_dp_ver=2.1-ubu20 ;; "2.1-ubuntu20-arm" ) CUDA_VERSION="12.4.1" ; short_dp_ver=2.1-ubu20-arm ;; "2.2-debian12" ) CUDA_VERSION="12.6.3" ; short_dp_ver=2.2-deb12 ;; "2.2-rocky9" ) CUDA_VERSION="12.6.3" ; short_dp_ver=2.2-roc9 ;; "2.2-ubuntu22" ) CUDA_VERSION="12.6.3" ; short_dp_ver=2.2-ubu22 ;; "2.3-debian12" ) CUDA_VERSION="12.6.3" ; short_dp_ver=2.3-deb12 ;; "2.3-rocky9" ) CUDA_VERSION="12.6.3" ; short_dp_ver=2.3-roc9 ;; "2.3-ubuntu22" ) CUDA_VERSION="12.6.3" ; short_dp_ver=2.3-ubu22 ;; "2.3-ml-ubuntu22" ) CUDA_VERSION="12.6.3" ; short_dp_ver=2.3-ml-ubu22 ; disk_size_gb="50";; esac function create_h100_instance() { python generate_custom_image.py \ --machine-type "a3-highgpu-2g" \ --accelerator "type=nvidia-h100-80gb,count=2" \ $* } function create_t4_instance() { python generate_custom_image.py \ --machine-type "n1-standard-32" \ --accelerator "type=nvidia-tesla-t4,count=1" \ $* } function create_unaccelerated_instance() { python generate_custom_image.py \ --machine-type "n1-standard-2" \ $* } OPTIONAL_COMPONENTS_ARG="" function generate() { local extra_args="$*" # local image_name="${PURPOSE}-${timestamp}-${dataproc_version//\./-}" local 
image_name="dataproc-${short_dp_ver//\./-}-${timestamp}-${PURPOSE}" local image="$(jq -r ".[] | select(.name == \"${image_name}\").name" "${tmpdir}/images.json")" if [[ -n "${image}" ]] ; then echo "Image already exists" return fi local metadata="invocation-type=custom-images" metadata="${metadata},dataproc-temp-bucket=${TEMP_BUCKET}" local install_image="$(jq -r ".[] | select(.name == \"${image_name}-install\").name" "${tmpdir}/images.json")" if [[ -n "${install_image}" ]] ; then echo "Install image already exists. Cleaning up after aborted run." gcloud -q compute images delete "${image_name}-install" fi local instance="$(jq -r ".[] | select(.name == \"${image_name}-install\").name" "${tmpdir}/instances.json")" if [[ -n "${instance}" ]]; then # if previous run ended without cleanup... echo "cleaning up instance from previous run" gcloud -q compute instances delete "${image_name}-install" --zone "${ZONE}" fi create_function="create_unaccelerated_instance" if [[ "${customization_script}" =~ "cloud-sql-proxy.sh" ]] ; then metadata="${metadata},hive-metastore-instance=${PROJECT_ID}:${region}:${HIVE_NAME}" metadata="${metadata},db-hive-password-uri=${HIVEDB_PW_URI}" metadata="${metadata},kms-key-uri=${KMS_KEY_URI}" fi # For actions requiring access to the MOK during runtime, pass the requisite # metadata to extract the signing material if [[ "${customization_script}" =~ "install_gpu_driver.sh" ]] ; then eval "$(bash examples/secure-boot/create-key-pair.sh)" metadata="${metadata},public_secret_name=${public_secret_name}" metadata="${metadata},private_secret_name=${private_secret_name}" metadata="${metadata},secret_project=${secret_project}" metadata="${metadata},secret_version=${secret_version}" metadata="${metadata},modulus_md5sum=${modulus_md5sum}" fi if [[ "${customization_script}" =~ "install_gpu_driver.sh" ]] ; then metadata="${metadata},cuda-version=${CUDA_VERSION}" metadata="${metadata},include-pytorch=1" create_function="create_t4_instance" fi if [[ 
"${customization_script}" =~ "spark-rapids.sh" ]] ; then metadata="${metadata},rapids-runtime=SPARK" create_function="create_t4_instance" fi if [[ "${customization_script}" =~ "rapids.sh" ]] ; then metadata="${metadata},rapids-runtime=DASK" create_function="create_t4_instance" fi # check for known retry-able errors after failed completion local do_retry=1 set -x while [[ "${do_retry}" == "1" ]]; do do_retry=0 set +e "${create_function}" \ --image-name "${image_name}" \ --customization-script "${customization_script}" \ --service-account "${GSA}" \ --metadata "${metadata}" \ --zone "${custom_image_zone}" \ --disk-size "${disk_size_gb}" \ --gcs-bucket "${BUCKET}" \ --subnet "${SUBNET}" \ ${OPTIONAL_COMPONENTS_ARG} \ --shutdown-instance-timer-sec=30 \ --no-smoke-test \ ${extra_args} if [[ "$?" != "0" ]]; then local img_build_dir="$(ls -d /tmp/custom-image-${image_name}-*)" # retry if the startup-script.log file does not exist or is empty local startup_script_log="${img_build_dir}/logs/startup-script.log" if [[ ! 
-f "${startup_script_log}" ]] || [[ "$(wc -l < $startup-script.log)" == "0" ]]; then do_retry=1 mkdir -p /tmp/old mv "${img_build_dir}" /tmp/old else exit 1 fi fi done set +x } function generate_from_dataproc_version() { generate --dataproc-version "$1" ; } function generate_from_prerelease_version() { # base image -> tensorflow local img_pfx="https://www.googleapis.com/compute/v1/projects/cloud-dataproc/global/images" # local src_timestamp="20250410-165100" local src_timestamp="20250505-045100" case "${dataproc_version}" in # "1.5-debian10" ) image_uri="${img_pfx}/dataproc-1-5-deb10-${src_timestamp}-rc01" ;; # "1.5-debian10" ) image_uri="${img_pfx}/dataproc-1-5-deb10-20200820-160220-rc01" ;; # "1.5-debian10" ) image_uri="https://www.googleapis.com/compute/v1/projects/cloud-dataproc-ci/global/images/dataproc-1-5-deb10-20230909-165100-rc01" ;; "1.5-debian10" ) image_uri="https://www.googleapis.com/compute/v1/projects/cloud-dataproc/global/images/dataproc-1-5-deb10-20230909-165100-rc01" ;; "2.0-debian10" ) image_uri="${img_pfx}/dataproc-2-0-deb10-${src_timestamp}-rc01" ;; "2.0-rocky8" ) image_uri="${img_pfx}/dataproc-2-0-roc8-${src_timestamp}-rc01" ;; "2.0-ubuntu18" ) image_uri="${img_pfx}/dataproc-2-0-ubu18-${src_timestamp}-rc01" ;; "2.1-debian11" ) image_uri="${img_pfx}/dataproc-2-1-deb11-${src_timestamp}-rc01" ;; "2.1-rocky8" ) image_uri="${img_pfx}/dataproc-2-1-roc8-${src_timestamp}-rc01" ;; "2.1-ubuntu20" ) image_uri="${img_pfx}/dataproc-2-1-ubu20-${src_timestamp}-rc01" ;; "2.1-ubuntu20-arm" ) image_uri="${img_pfx}/dataproc-2-1-ubu20-arm-${src_timestamp}-rc01" ;; "2.2-debian12" ) image_uri="${img_pfx}/dataproc-2-2-deb12-${src_timestamp}-rc01" ;; "2.2-rocky9" ) image_uri="${img_pfx}/dataproc-2-2-roc9-${src_timestamp}-rc01" ;; "2.2-ubuntu22" ) image_uri="${img_pfx}/dataproc-2-2-ubu22-${src_timestamp}-rc01" ;; "2.3-debian12" ) image_uri="${img_pfx}/dataproc-2-3-deb12-${src_timestamp}-rc01" ;; "2.3-rocky9" ) 
image_uri="${img_pfx}/dataproc-2-3-roc9-${src_timestamp}-rc01" ;; "2.3-ubuntu22" ) image_uri="${img_pfx}/dataproc-2-3-ubu22-${src_timestamp}-rc01" ;; "2.3-ml-ubuntu22" ) image_uri="${img_pfx}/dataproc-2-3-ml-ubu22-${src_timestamp}-rc01" ;; esac generate --base-image-uri "${image_uri}" } function generate_from_base_purpose() { # local image_name="dataproc-${short_dp_ver//\./-}-${timestamp}-${PURPOSE}" # https://pantheon.corp.google.com/compute/imagesDetail/projects/cloud-dataproc-ci/global/images/dataproc-2-0-deb10-20250422-193049-secure-boot?project=cloud-dataproc-ci # https://www.googleapis.com/compute/v1/projects/dataproc-cloud-dataproc-ci/global/images/dataproc-2-0-deb10-20250422-193049-secure-boot # https://www.googleapis.com/compute/v1/projects/cloud-dataproc-ci/global/images/dataproc-2-0-deb10-20250422-193049-secure-bootprojects/dataproc-${PROJECT_ID}/global/images" # local img_pfx="https://www.googleapis.com/compute/v1/projects/dataproc-${PROJECT_ID}/global/images" local img_pfx="https://www.googleapis.com/compute/v1/projects/${PROJECT_ID}/global/images" generate --base-image-uri "${img_pfx}/dataproc-${short_dp_ver/\./-}-${timestamp}-${1}" # generate --base-image-uri "${img_pfx}/${1}-${dataproc_version/\./-}-${timestamp}" } # base image -> secure-boot # Install secure-boot certs without customization PURPOSE="secure-boot" customization_script="examples/secure-boot/no-customization.sh" time generate_from_dataproc_version "${dataproc_version}" #time generate_from_prerelease_version "${dataproc_version}" if version_ge "${IMAGE_VERSION}" "2.3" ; then ## run the installer for the DOCKER optional component PURPOSE="docker" OPTIONAL_COMPONENTS_ARG='--optional-components=DOCKER' customization_script="examples/secure-boot/no-customization.sh" time generate_from_base_purpose "secure-boot" ## run the installer for the ZEPPELIN optional component PURPOSE="zeppelin" OPTIONAL_COMPONENTS_ARG='--optional-components=ZEPPELIN' 
customization_script="examples/secure-boot/no-customization.sh" time generate_from_base_purpose "secure-boot" ## run the installer for the DOCKER,PIG optional components PURPOSE="docker-pig" OPTIONAL_COMPONENTS_ARG='--optional-components=PIG' customization_script="examples/secure-boot/no-customization.sh" time generate_from_base_purpose "docker" fi OPTIONAL_COMPONENTS_ARG="" ## Execute spark-rapids/spark-rapids.sh init action on base image PURPOSE="cloud-sql-proxy" customization_script="examples/secure-boot/cloud-sql-proxy.sh" echo time generate_from_base_purpose "secure-boot" # secure-boot -> tensorflow case "${dataproc_version}" in # DP_IMG_VER RECOMMENDED_DISK_SIZE DSK_SZ D_USED D_FREE D%F DATE_SAMPLED "2.0-debian10" ) disk_size_gb="36" ;; # 35.20G 30.74G 2.91G 92% / # 20250507-083009-tf "2.0-rocky8" ) disk_size_gb="43" ;; # 48.79G 36.34G 12.45G 75% / # 20250507-083009-tf "2.0-ubuntu18" ) disk_size_gb="38" ;; # 36.65G 32.24G 4.39G 89% / # 20250507-083009-tf "2.1-debian11" ) disk_size_gb="42" ;; # 41.11G 35.82G 3.50G 92% / # 20250507-083009-tf "2.1-rocky8" ) disk_size_gb="45" ;; # 59.79G 38.41G 21.39G 65% / # 20250429-193537-tf "2.1-ubuntu20" ) disk_size_gb="42" ;; # 47.31G 36.02G 11.27G 77% / # 20250507-083009-tf "2.1-ubuntu20-arm" ) disk_size_gb="45" ;; # 39.55G 39.39G 0.15G 100% / # pre-init-2-1-ubuntu20 "2.2-debian12" ) disk_size_gb="51" ;; # 58.82G 43.88G 12.44G 78% / # 20250429-193537-tf "2.2-rocky9" ) disk_size_gb="51" ;; # 49.79G 43.51G 6.28G 88% / # 20250429-193537-tf "2.2-ubuntu22" ) disk_size_gb="50" ;; # 48.28G 43.32G 4.94G 90% / # 20250429-193537-tf "2.3-debian12" ) disk_size_gb="42" ;; # 41.11G 36.20G 3.12G 93% / # 20250507-083009-tf "2.3-rocky9" ) disk_size_gb="44" ;; # 49.79G 37.82G 11.98G 76% / # 20250507-083009-tf "2.3-ubuntu22" ) disk_size_gb="42" ;; # 40.52G 36.18G 4.33G 90% / # 20250507-083009-tf "2.3-ml-ubuntu22" ) disk_size_gb="70" ;; # 40.52G 36.18G 4.33G 90% / # 20250507-083009-tf esac # Install GPU drivers + cuda + rapids + cuDNN + nccl 
+ tensorflow + pytorch on dataproc base image PURPOSE="tf" customization_script="examples/secure-boot/install_gpu_driver.sh" time generate_from_base_purpose "secure-boot" ## Execute spark-rapids/spark-rapids.sh init action on base image PURPOSE="spark" customization_script="examples/secure-boot/spark-rapids.sh" time generate_from_base_purpose "tf" ## Execute spark-rapids/mig.sh init action on base image PURPOSE="mig-pre-init" customization_script="examples/secure-boot/mig.sh" echo time generate_from_base_purpose "tf" # tf image -> rapids case "${dataproc_version}" in "2.0-debian10" ) disk_size_gb="41" ;; # 40.12G 37.51G 0.86G 98% / # rapids-pre-init-2-0-debian10 "2.0-rocky8" ) disk_size_gb="41" ;; # 38.79G 38.04G 0.76G 99% / # rapids-pre-init-2-0-rocky8 "2.0-ubuntu18" ) disk_size_gb="40" ;; # 37.62G 36.69G 0.91G 98% / # rapids-pre-init-2-0-ubuntu18 "2.1-debian11" ) disk_size_gb="44" ;; # 42.09G 39.77G 0.49G 99% / # rapids-pre-init-2-1-debian11 "2.1-rocky8" ) disk_size_gb="44" ;; # 43.79G 41.11G 2.68G 94% / # rapids-pre-init-2-1-rocky8 "2.1-ubuntu20" ) disk_size_gb="45" ;; # 39.55G 39.39G 0.15G 100% / # rapids-pre-init-2-1-ubuntu20 "2.1-ubuntu20-arm" ) disk_size_gb="45" ;; # 39.55G 39.39G 0.15G 100% / # pre-init-2-1-ubuntu20 "2.2-debian12" ) disk_size_gb="46" ;; # 44.06G 41.73G 0.41G 100% / # rapids-pre-init-2-2-debian12 "2.2-rocky9" ) disk_size_gb="45" ;; # 44.79G 42.29G 2.51G 95% / # rapids-pre-init-2-2-rocky9 "2.2-ubuntu22" ) disk_size_gb="46" ;; # 42.46G 41.97G 0.48G 99% / # rapids-pre-init-2-2-ubuntu22 esac #disk_size_gb="45" # Install dask with rapids on base image PURPOSE="rapids" customization_script="examples/secure-boot/rapids.sh" echo time generate_from_base_purpose "tf" #time generate_from_base_purpose "cuda-pre-init" ## Install dask without rapids on base image PURPOSE="dask" customization_script="examples/secure-boot/dask.sh" echo time generate_from_base_purpose "secure-boot" #time generate_from_base_purpose "cuda-pre-init" # cuda image -> pytorch case 
"${dataproc_version}" in "2.0-debian10" ) disk_size_gb="44" ;; # 40.12G 37.51G 0.86G 98% / # pre-init-2-0-debian10 "2.0-rocky8" ) disk_size_gb="41" ;; # 38.79G 38.04G 0.76G 99% / # pre-init-2-0-rocky8 "2.0-ubuntu18" ) disk_size_gb="44" ;; # 37.62G 36.69G 0.91G 98% / # pre-init-2-0-ubuntu18 "2.1-debian11" ) disk_size_gb="44" ;; # 42.09G 39.77G 0.49G 99% / # pre-init-2-1-debian11 "2.1-rocky8" ) disk_size_gb="44" ;; # 43.79G 41.11G 2.68G 94% / # pre-init-2-1-rocky8 "2.1-ubuntu20" ) disk_size_gb="45" ;; # 39.55G 39.39G 0.15G 100% / # pre-init-2-1-ubuntu20 "2.1-ubuntu20-arm" ) disk_size_gb="45" ;; # 39.55G 39.39G 0.15G 100% / # pre-init-2-1-ubuntu20 "2.2-debian12" ) disk_size_gb="48" ;; # 44.06G 41.73G 0.41G 100% / # pre-init-2-2-debian12 "2.2-rocky9" ) disk_size_gb="48" ;; # 44.79G 42.29G 2.51G 95% / # pre-init-2-2-rocky9 "2.2-ubuntu22" ) disk_size_gb="46" ;; # 42.46G 41.97G 0.48G 99% / # pre-init-2-2-ubuntu22 "2.3-debian12" ) disk_size_gb="42" ;; # 41.11G 36.20G 3.12G 93% / # 20250507-083009-tf "2.3-rocky9" ) disk_size_gb="44" ;; # 49.79G 37.82G 11.98G 76% / # 20250507-083009-tf "2.3-ubuntu22" ) disk_size_gb="42" ;; # 40.52G 36.18G 4.33G 90% / # 20250507-083009-tf "2.3-ml-ubuntu22" ) disk_size_gb="60" ;; # 40.52G 36.18G 4.33G 90% / # 20250507-083009-tf esac ## Install pytorch on base image PURPOSE="pytorch" customization_script="examples/secure-boot/pytorch.sh" echo time generate_from_base_purpose "tf" ====== Filename: ./examples/secure-boot/install-nvidia-driver-debian11.sh ====== #!/bin/bash set -xeu WORKDIR=/opt/install-nvidia-driver mkdir -p ${WORKDIR} cd $_ nv_driver_ver="550.54.14" nv_cuda_ver="12.4.0" # read secret name, project, version sig_pub_secret_name="$(/usr/share/google/get_metadata_value attributes/public_secret_name)" sig_priv_secret_name="$(/usr/share/google/get_metadata_value attributes/private_secret_name)" sig_secret_project="$(/usr/share/google/get_metadata_value attributes/secret_project)" 
sig_secret_version="$(/usr/share/google/get_metadata_value attributes/secret_version)"
# NOTE(review): expected_modulus_md5sum is read and frozen here but is not
# referenced again in the remainder of this script -- confirm whether a
# verification step against the fetched certificate is missing.
expected_modulus_md5sum="$(/usr/share/google/get_metadata_value attributes/modulus_md5sum)"
readonly expected_modulus_md5sum

# Scratch directory for the signing key pair.  It is created atomically by
# mktemp (mode 0700, guaranteed not to pre-exist) rather than generating a name
# with `mktemp -u` and creating it afterwards with `mkdir -p`, which is racy
# and leaves the directory with umask-dependent permissions.
mkdir -p /run/tmp
ca_tmpdir="$(mktemp -d -p /run/tmp ca_dir-XXXX)"

# The Microsoft Corporation UEFI CA 2011
ms_uefi_ca="${ca_tmpdir}/MicCorUEFCA2011_2011-06-27.crt"
if [[ ! -f "${ms_uefi_ca}" ]]; then
  curl -L -o "${ms_uefi_ca}" "https://go.microsoft.com/fwlink/p/?linkid=321194"
fi

# Write private material to volatile storage
gcloud secrets versions access "${sig_secret_version}" \
       --project="${sig_secret_project}" \
       --secret="${sig_priv_secret_name}" \
    | dd of="${ca_tmpdir}/db.rsa"

# The public certificate is stored base64-encoded in the secret; decode it to
# DER form next to the private key.
readonly cacert_der="${ca_tmpdir}/db.der"
gcloud secrets versions access "${sig_secret_version}" \
       --project="${sig_secret_project}" \
       --secret="${sig_pub_secret_name}" \
    | base64 --decode \
    | dd of="${cacert_der}"

# Log the current secure-boot state
mokutil --sb-state

# configure the nvidia-container-toolkit package source
# https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \
  | gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \
  | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \
  | tee /etc/apt/sources.list.d/nvidia-container-toolkit.list

# add non-free components
sed -i -e 's/ main$/ main contrib non-free/' /etc/apt/sources.list

# update package cache
apt-get update -qq

# install nvidia-container-toolkit and kernel headers
apt-get --no-install-recommends -qq -y install \
  nvidia-container-toolkit \
  "linux-headers-$(uname -r)"

apt-get clean
apt-get autoremove -y

# fetch .run file
# -f makes an HTTP error fail the script instead of saving an error page that
# would then be executed by bash below; retry flags match the CUDA download.
curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
  -o driver.run \
  "https://download.nvidia.com/XFree86/Linux-x86_64/${nv_driver_ver}/NVIDIA-Linux-x86_64-${nv_driver_ver}.run"
# Install all but kernel driver
bash driver.run --no-kernel-modules --silent --install-libglvnd
rm driver.run

# Fetch open source kernel module with corresponding tag
git clone https://github.com/NVIDIA/open-gpu-kernel-modules.git --branch "${nv_driver_ver}" --single-branch
cd "${WORKDIR}/open-gpu-kernel-modules"
#
# build kernel modules
#
make -j"$(nproc)" modules > /var/log/open-gpu-kernel-modules-build.log
# sign
# Module paths are read NUL-delimited so that they are never word-split or
# glob-expanded; the loop runs in the current shell, so a failing sign-file
# still aborts the script under `set -e`.
sign_file="/lib/modules/$(uname -r)/build/scripts/sign-file"
while IFS= read -r -d '' module; do
  "${sign_file}" sha256 \
    "${ca_tmpdir}/db.rsa" \
    "${ca_tmpdir}/db.der" \
    "${module}"
done < <(find kernel-open -name '*.ko' -print0)
# install
make modules_install >> /var/log/open-gpu-kernel-modules-build.log
# rebuild module index
depmod -a
cd "${WORKDIR}"

#
# Install CUDA
#
cuda_runfile="cuda_${nv_cuda_ver}_${nv_driver_ver}_linux.run"
curl -fsSL --retry-connrefused --retry 10 --retry-max-time 30 \
  "https://developer.download.nvidia.com/compute/cuda/${nv_cuda_ver}/local_installers/${cuda_runfile}" \
  -o cuda.run
bash cuda.run --silent --toolkit --no-opengl-libs
rm cuda.run
====== Filename: ./examples/secure-boot/cuda.sh ======
#!/bin/bash
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# # This script creates a custom image pre-loaded with cuda set -ex export PROJECT_ID="$(jq -r .PROJECT_ID env.json)" export PURPOSE="$(jq -r .PURPOSE env.json)" export BUCKET="$(jq -r .BUCKET env.json)" export IMAGE_VERSION="$(jq -r .IMAGE_VERSION env.json)" export ZONE="$(jq -r .ZONE env.json)" custom_image_zone="${ZONE}" disk_size_gb="50" # greater than or equal to 30 SA_NAME="sa-${PURPOSE}" GSA="${SA_NAME}@${PROJECT_ID}.iam.gserviceaccount.com" gcloud config set project ${PROJECT_ID} gcloud auth login if [[ -d tls ]] ; then mv tls "tls-$(date +%s)" ; fi eval "$(bash examples/secure-boot/create-key-pair.sh)" metadata="public_secret_name=${public_secret_name}" metadata="${metadata},private_secret_name=${private_secret_name}" metadata="${metadata},secret_project=${secret_project}" metadata="${metadata},secret_version=${secret_version}" if gcloud iam service-accounts list --filter email="${GSA}" 2>&1 | grep 'Listed 0 items.' ; then # Create service account for this purpose echo "creating pre-init customization service account ${GSA}" gcloud iam service-accounts create "${SA_NAME}" \ --description="Service account for pre-init customization" \ --display-name="${SA_NAME}" fi # Grant service account access to bucket gcloud storage buckets add-iam-policy-binding "gs://${BUCKET}" \ --member="serviceAccount:${GSA}" \ --role="roles/storage.objectViewer" # Grant the service account access to list secrets for the project gcloud projects add-iam-policy-binding "${PROJECT_ID}" \ --member="serviceAccount:${GSA}" \ --role="roles/secretmanager.viewer" # Grant service account permission to access the private secret gcloud secrets add-iam-policy-binding "${private_secret_name}" \ --member="serviceAccount:${GSA}" \ --role="roles/secretmanager.secretAccessor" # Grant service account permission to access the public secret gcloud secrets add-iam-policy-binding "${public_secret_name}" \ --member="serviceAccount:${GSA}" \ --role="roles/secretmanager.secretAccessor" # If no OS family 
specified, default to debian if [[ "${IMAGE_VERSION}" != *-* ]] ; then case "${IMAGE_VERSION}" in "2.2" ) dataproc_version="${IMAGE_VERSION}-debian12" ;; "2.1" ) dataproc_version="${IMAGE_VERSION}-debian11" ;; "2.0" ) dataproc_version="${IMAGE_VERSION}-debian10" ;; esac else dataproc_version="${IMAGE_VERSION}" fi #dataproc_version="${IMAGE_VERSION}-ubuntu22" #dataproc_version="${IMAGE_VERSION}-rocky9" #customization_script="examples/secure-boot/install-nvidia-driver-debian11.sh" #customization_script="examples/secure-boot/install-nvidia-driver-debian12.sh" customization_script="examples/secure-boot/install_gpu_driver.sh" #echo "#!/bin/bash\necho no op" | dd of=empty.sh #customization_script=empty.sh #image_name="nvidia-open-kernel-2.2-ubuntu22-$(date +%F)" #image_name="nvidia-open-kernel-2.2-rocky9-$(date +%F)" #image_name="nvidia-open-kernel-2.2-debian12-$(date +%F)" #image_name="nvidia-open-kernel-${dataproc_version}-$(date +%F)" image_name="cuda-${dataproc_version/\./-}-$(date +%F-%H-%M)" python generate_custom_image.py \ --accelerator "type=nvidia-tesla-t4" \ --image-name "${image_name}" \ --dataproc-version "${dataproc_version}" \ --trusted-cert "tls/db.der" \ --customization-script "${customization_script}" \ --service-account "${GSA}" \ --metadata "${metadata}" \ --zone "${custom_image_zone}" \ --disk-size "${disk_size_gb}" \ --no-smoke-test \ --gcs-bucket "${BUCKET}" \ --shutdown-instance-timer-sec=30 set +x # Revoke permission to access the private secret gcloud secrets remove-iam-policy-binding "${private_secret_name}" \ --member="serviceAccount:${GSA}" \ --role="roles/secretmanager.secretAccessor" > /dev/null 2>&1 # Revoke access to bucket gcloud storage buckets remove-iam-policy-binding "gs://${BUCKET}" \ --member="serviceAccount:${GSA}" \ --role="roles/storage.objectViewer" > /dev/null 2>&1 # Revoke access to list secrets for the project gcloud projects remove-iam-policy-binding "${PROJECT_ID}" \ --member="serviceAccount:${GSA}" \ 
--role="roles/secretmanager.viewer" > /dev/null 2>&1 ====== Filename: ./CONTRIBUTING.md ====== # How to become a contributor and submit your own code ## Contributor License Agreements We'd love to accept your patches! Before we can take them, we have to jump a couple of legal hurdles. Please fill out either the individual or corporate Contributor License Agreement (CLA). * If you are an individual writing original source code and you're sure you own the intellectual property, then you'll need to sign an [individual CLA] (https://developers.google.com/open-source/cla/individual). * If you work for a company that wants to allow you to contribute your work, then you'll need to sign a [corporate CLA] (https://developers.google.com/open-source/cla/corporate). Follow either of the two links above to access the appropriate CLA and instructions for how to sign and return it. Once we receive it, we'll be able to accept your pull requests. ## Coding Practices 1. [Write small PRs](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/getting-started/best-practices-for-pull-requests#write-small-prs), this helps reviewers to provide feedback and reason why something is changed. 1. [Provide context and guidance](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/getting-started/best-practices-for-pull-requests#provide-context-and-guidance) in the title/description. 1. Squash commit messages into final one while merging to remove intermediate changes, to keep commit history clean. 1. Ensure that your code adheres to the existing style in the sample to which you are contributing. Shell scripts should follow the [Google shell style guide](https://google.github.io/styleguide/shell.xml). 1. Ensure that your code has an appropriate set of unit/integration tests which all pass. ## Contributing A Patch 1. Submit an issue describing your proposed change to the repo in question. 1. The repo owner will respond to your issue promptly. 1. 
If your proposed change is accepted, and you haven't already done so, sign a Contributor License Agreement (see details above). 1. Fork the desired repo, develop and test your code changes. 1. Ensure that your code has an appropriate set of unit tests which all pass. 1. Submit a pull request. ====== Filename: ./.gitignore ====== .vscode __pycache__ *.pyc # Ignore IntelliJ files. .idea/ *.iml *.ipr *.iws # MacOS folder files .DS_Store # Emacs *~ \#*# .\#* # secure-boot certificate store tls/ tls-*/ # Screen session logs screenlog.* # environment configuration file env.json # failed patches *.orig *.rej ====== Filename: ./startup_script/run.more ====== #!/bin/bash # Copyright 2019 Google Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the 'License'); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an 'AS IS' BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # run.sh will be used by image build workflow to run custom initialization # script when creating a custom image. # # Immediately after image build workflow creates an GCE instance, it will # execute run.sh on the GCE instance that it just created: # 1. Download user's custom init action script from cloud Storage bucket. # 2. Run the custom init action script. # 3. Check for init action script output, and print success or failure # message. # 4. Shutdown GCE instance. 
set -x # get custom-sources-path CUSTOM_SOURCES_PATH=$(/usr/share/google/get_metadata_value attributes/custom-sources-path) # get time to wait for stdout to flush SHUTDOWN_TIMER_IN_SEC=$(/usr/share/google/get_metadata_value attributes/shutdown-timer-in-sec) USER_DATAPROC_COMPONENTS=$( /usr/share/google/get_metadata_value attributes/optional-components | tr '[:upper:]' '[:lower:]' | tr '.' ',' || echo "") BDUTIL_DIR="/usr/local/share/google/dataproc/bdutil" DATAPROC_VERSION=$(/usr/share/google/get_metadata_value attributes/dataproc-version | cut -c1-3 | tr '-' '.' || echo "") ready="" function version_le(){ [[ "$1" = "$(echo -e "$1\n$2"|sort -V|head -n1)" ]]; } function version_lt(){ [[ "$1" = "$2" ]]&& return 1 || version_le "$1" "$2";} # With the 402.0.0 release of gcloud sdk, `gcloud storage` can be # used as a more performant replacement for `gsutil` gsutil_cmd="gcloud storage" gcloud_sdk_version="$(gcloud --version | awk -F'SDK ' '/Google Cloud SDK/ {print $2}')" gsutil_cp_cmd="${gsutil_cmd} cp" if version_lt "${gcloud_sdk_version}" "402.0.0" ; then gsutil_cmd="gsutil" gsutil_cp_cmd="${gsutil_cmd} -m cp" fi function wait_until_ready() { # For Ubuntu, wait until /snap is mounted, so that gsutil is unavailable. if [[ $(. /etc/os-release && echo "${ID}") == ubuntu ]]; then for i in {0..10}; do sleep 5 if command -v "${gsutil_cmd/ *}" >/dev/null; then ready="true" break fi if ((i == 10)); then echo "BuildFailed: timed out waiting for gsutil to be available on Ubuntu." fi done else ready="true" fi } function download_scripts() { ${gsutil_cp_cmd} -r "${CUSTOM_SOURCES_PATH}/*" ./ } function run_custom_script() { if ! download_scripts; then echo "BuildFailed: failed to download scripts from ${CUSTOM_SOURCES_PATH}." return 1 fi set -x # Start-up script wrapper that installs screen and writes screenrc # get return code RET_CODE=$? # print failure message if install fails if [[ $RET_CODE -ne 0 ]]; then echo "BuildFailed: Dataproc Initialization Actions Failed. 
Please check your initialization script." else echo "BuildSucceeded: Dataproc Initialization Actions Succeeded." fi } function cleanup() { # .config and .gsutil dirs are created by the gsutil command. It contains # transient authentication keys to access gcs bucket. The init_actions.sh and # run.sh are your customization and bootstrap scripts (this) which must be # removed after creating the image rm -rf ~/.config/ ~/.gsutil/ rm ./init_actions.sh ./run.sh } function is_version_at_least() { local -r VERSION=$1 if [[ $(echo "$DATAPROC_VERSION >= $VERSION" | bc -l) -eq 1 ]]; then return 0 else return 1 fi } function run_install_optional_components_script() { if ! is_version_at_least "2.3" || [[ -z "$USER_DATAPROC_COMPONENTS" ]]; then return fi source "${BDUTIL_DIR}/install_optional_components.sh" } function main() { wait_until_ready if [[ "${ready}" == "true" ]]; then run_install_optional_components_script run_custom_script cleanup fi echo "Sleep ${SHUTDOWN_TIMER_IN_SEC}s before shutting down..." echo "You can change the timeout value with --shutdown-instance-timer-sec" sleep "${SHUTDOWN_TIMER_IN_SEC}" # wait for stdout to flush shutdown -h now } main "$@" ====== Filename: ./startup_script/README.md ====== [GCE VM startup script](https://cloud.google.com/compute/docs/startupscript) which downloads and runs the user-provided customization script. ====== Filename: ./startup_script/run.ng.sh ====== #!/bin/bash # Copyright 2019 Google Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the 'License'); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an 'AS IS' BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. # run.sh will be used by image build workflow to run custom initialization # script when creating a custom image. # # Immediately after image build workflow creates an GCE instance, it will # execute run.sh on the GCE instance that it just created: # 1. Download user's custom init action script from cloud Storage bucket. # 2. Run the custom init action script. # 3. Check for init action script output, and print success or failure # message. # 4. Shutdown GCE instance. set -x # get custom-sources-path CUSTOM_SOURCES_PATH=$(/usr/share/google/get_metadata_value attributes/custom-sources-path) # get time to wait for stdout to flush SHUTDOWN_TIMER_IN_SEC=$(/usr/share/google/get_metadata_value attributes/shutdown-timer-in-sec) USER_DATAPROC_COMPONENTS=$( /usr/share/google/get_metadata_value attributes/optional-components | tr '[:upper:]' '[:lower:]' | tr '.' ',' || echo "") BDUTIL_DIR="/usr/local/share/google/dataproc/bdutil" DATAPROC_VERSION=$(/usr/share/google/get_metadata_value attributes/dataproc-version | cut -c1-3 | tr '-' '.' || echo "") ready="" function version_le(){ [[ "$1" = "$(echo -e "$1\n$2"|sort -V|head -n1)" ]]; } function version_lt(){ [[ "$1" = "$2" ]]&& return 1 || version_le "$1" "$2";} # With the 402.0.0 release of gcloud sdk, `gcloud storage` can be # used as a more performant replacement for `gsutil` gsutil_cmd="gcloud storage" gcloud_sdk_version="$(gcloud --version | awk -F'SDK ' '/Google Cloud SDK/ {print $2}')" gsutil_cp_cmd="${gsutil_cmd} cp" if version_lt "${gcloud_sdk_version}" "402.0.0" ; then gsutil_cmd="gsutil" gsutil_cp_cmd="${gsutil_cmd} -m cp" fi function wait_until_ready() { # For Ubuntu, wait until /snap is mounted, so that gsutil is unavailable. if [[ $(. 
/etc/os-release && echo "${ID}") == ubuntu ]]; then for i in {0..10}; do sleep 5 if command -v "${gsutil_cmd/ *}" >/dev/null; then ready="true" break fi if ((i == 10)); then echo "BuildFailed: timed out waiting for gsutil to be available on Ubuntu." fi done else ready="true" fi } function download_scripts() { ${gsutil_cp_cmd} -r "${CUSTOM_SOURCES_PATH}/*" ./ } function run_custom_script() { if ! download_scripts; then echo "BuildFailed: failed to download scripts from ${CUSTOM_SOURCES_PATH}." return 1 fi set -x # Start-up script wrapper that installs screen and writes screenrc set -euo pipefail echo "preparing screenrc" # configure screen in which build-image.sh will run BUILD_TMP_DIR="/tmp/dataproc/custom-images" mkdir -p "${BUILD_TMP_DIR}" grep -q "tmpfs /tmp tmpfs" /proc/mounts || \ mount -t tmpfs tmpfs /tmp screen_dir="${BUILD_TMP_DIR}/screen" log_dir="${BUILD_TMP_DIR}/log" mkdir -p "${BUILD_TMP_DIR}/"{screen,log} screen_rcfile="${screen_dir}/build-image.screenrc" screen_success="${screen_dir}/build-image.success" screen_logfile="${log_dir}/build-image.log" cat > "${screen_rcfile}" < /dev/null 2>&1 done elif command -v apt-get ; then apt-get install -y -qq screen > /dev/null 2>&1 else echo "unable to install screen" exit 1 fi echo "done" fi screen -US bash -x ./init_actions.sh # get return code RET_CODE=$? # print failure message if install fails if [[ $RET_CODE -ne 0 ]]; then echo "BuildFailed: Dataproc Initialization Actions Failed. Please check your initialization script." else echo "BuildSucceeded: Dataproc Initialization Actions Succeeded." fi } function cleanup() { # .config and .gsutil dirs are created by the gsutil command. It contains # transient authentication keys to access gcs bucket. 
The init_actions.sh and # run.sh are your customization and bootstrap scripts (this) which must be # removed after creating the image rm -rf ~/.config/ ~/.gsutil/ rm ./init_actions.sh ./run.sh } function is_version_at_least() { local -r VERSION=$1 if [[ $(echo "$DATAPROC_VERSION >= $VERSION" | bc -l) -eq 1 ]]; then return 0 else return 1 fi } function run_install_optional_components_script() { if ! is_version_at_least "2.3" || [[ -z "$USER_DATAPROC_COMPONENTS" ]]; then return fi source "${BDUTIL_DIR}/install_optional_components.sh" } function main() { wait_until_ready if [[ "${ready}" == "true" ]]; then run_install_optional_components_script run_custom_script cleanup fi echo "Sleep ${SHUTDOWN_TIMER_IN_SEC}s before shutting down..." echo "You can change the timeout value with --shutdown-instance-timer-sec" sleep "${SHUTDOWN_TIMER_IN_SEC}" # wait for stdout to flush shutdown -h now } main "$@" ====== Filename: ./startup_script/run.sh ====== #!/bin/bash # Copyright 2019 Google Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the 'License'); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an 'AS IS' BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # run.sh will be used by image build workflow to run custom initialization # script when creating a custom image. # # Immediately after image build workflow creates an GCE instance, it will # execute run.sh on the GCE instance that it just created: # 1. Download user's custom init action script from cloud Storage bucket. # 2. Run the custom init action script. # 3. 
Check for init action script output, and print success or failure # message. # 4. Shutdown GCE instance. set -x # get custom-sources-path CUSTOM_SOURCES_PATH=$(/usr/share/google/get_metadata_value attributes/custom-sources-path) # get time to wait for stdout to flush SHUTDOWN_TIMER_IN_SEC=$(/usr/share/google/get_metadata_value attributes/shutdown-timer-in-sec) USER_DATAPROC_COMPONENTS=$( /usr/share/google/get_metadata_value attributes/optional-components | tr '[:upper:]' '[:lower:]' | tr '.' ' ' || echo "") DATAPROC_IMAGE_VERSION=$(/usr/share/google/get_metadata_value attributes/dataproc_dataproc_version | cut -c1-3 | tr '-' '.' || echo "") DATAPROC_IMAGE_TYPE=$(/usr/share/google/get_metadata_value attributes/dataproc_image_type || echo "standard") export REGION=$(/usr/share/google/get_metadata_value attributes/dataproc-region) [[ -n "${DATAPROC_IMAGE_TYPE}" ]] # Sanity validation export DATAPROC_IMAGE_TYPE [[ "${DATAPROC_IMAGE_VERSION}" =~ ^[0-9]+\.[0-9]+$ ]] # Sanity validation export DATAPROC_IMAGE_VERSION # Startup script that performs first boot configuration for Dataproc cluster. ready="" function version_le(){ [[ "$1" = "$(echo -e "$1\n$2"|sort -V|head -n1)" ]]; } function version_lt(){ [[ "$1" = "$2" ]]&& return 1 || version_le "$1" "$2";} # With the 402.0.0 release of gcloud sdk, `gcloud storage` can be # used as a more performant replacement for `gsutil` gsutil_cmd="gcloud storage" gcloud_sdk_version="$(gcloud --version | awk -F'SDK ' '/Google Cloud SDK/ {print $2}')" gsutil_cp_cmd="${gsutil_cmd} cp" if version_lt "${gcloud_sdk_version}" "402.0.0" ; then gsutil_cmd="gsutil" gsutil_cp_cmd="${gsutil_cmd} -m cp" fi function wait_until_ready() { # For Ubuntu, wait until /snap is mounted, so that gsutil is unavailable. if [[ $(. 
/etc/os-release && echo "${ID}") == ubuntu ]]; then for i in {0..10}; do if command -v "${gsutil_cmd/ *}" >/dev/null; then ready="true" break fi sleep 5 if ((i == 10)); then echo "BuildFailed: timed out waiting for gsutil to be available on Ubuntu." fi done else ready="true" fi } function download_scripts() { ${gsutil_cp_cmd} -r "${CUSTOM_SOURCES_PATH}/*" ./ } function run_custom_script() { if ! download_scripts; then echo "BuildFailed: failed to download scripts from ${CUSTOM_SOURCES_PATH}." return 1 fi # run init actions bash -x ./init_actions.sh # get return code RET_CODE=$? # print failure message if install fails if [[ $RET_CODE -ne 0 ]]; then echo "BuildFailed: Dataproc Initialization Actions Failed. Please check your initialization script." else echo "BuildSucceeded: Dataproc Initialization Actions Succeeded." fi } function cleanup() { # .config and .gsutil dirs are created by the gsutil command. It contains # transient authentication keys to access gcs bucket. The init_actions.sh and # run.sh are your customization and bootstrap scripts (this) which must be # removed after creating the image rm -rf ~/.config/ ~/.gsutil/ rm ./init_actions.sh ./run.sh } function is_version_at_least() { local -r VERSION=$1 if [[ $(echo "$DATAPROC_IMAGE_VERSION >= $VERSION" | bc -l) -eq 1 ]]; then return 0 else return 1 fi } function run_install_optional_components_script() { if ! is_version_at_least "2.3" || [[ -z "$USER_DATAPROC_COMPONENTS" ]]; then return fi ( export BDUTIL_DIR="/usr/local/share/google/dataproc/bdutil" # Install Optional components set -Ee source /etc/environment source "${BDUTIL_DIR}/bdutil_env.sh" source "${BDUTIL_DIR}/bdutil_helpers.sh" source "${BDUTIL_DIR}/bdutil_metadata.sh" source "${BDUTIL_DIR}/bdutil_misc.sh" source "${BDUTIL_DIR}/components/components-helpers.sh" set -x export USER_DATAPROC_COMPONENTS=(${USER_DATAPROC_COMPONENTS}) source "${BDUTIL_DIR}/install_optional_components.sh" ) # get return code local RET_CODE=$? 
# print failure message if install fails if [[ $RET_CODE -ne 0 ]]; then echo "BuildFailed: Dataproc optional component installation Failed. Please check logs." else echo "BuildSucceeded: Dataproc optional component installation Succeeded." fi } function main() { wait_until_ready if [[ "${ready}" == "true" ]]; then run_install_optional_components_script run_custom_script cleanup fi echo "Sleep ${SHUTDOWN_TIMER_IN_SEC}s before shutting down..." echo "You can change the timeout value with --shutdown-instance-timer-sec" sleep "${SHUTDOWN_TIMER_IN_SEC}" # wait for stdout to flush shutdown -h now } main "$@" ====== Filename: ./Dockerfile ====== FROM python:slim # To build: docker build -t dataproc-custom-images:latest . # To run: docker run -it dataproc-custom-images:latest /bin/bash # Then from the docker bash shell, run examples/secure-boot/cuda.sh WORKDIR /custom-images RUN apt-get -qq update \ && apt-get -y -qq install \ apt-transport-https ca-certificates gnupg curl jq less screen RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg \ | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" \ | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list RUN apt-get -y -qq update && apt-get -y -qq install google-cloud-cli && apt-get clean RUN apt-get -y -qq install emacs-nox vim libmime-base64-perl && apt-get clean COPY . ${WORKDIR} CMD ["/bin/bash"]