# Copyright 2013 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Environment variables to be used in the local ghadoop as well as in setup
# scripts running on remote VMs; this file will be used as a preamble to each
# partial setup script being run on each VM.
#
# Edit values here before running ghadoop.
# CONFIGBUCKET and PROJECT are required.

############### REQUIRED ENVIRONMENT VARIABLES (no defaults) ##################

# A GCS bucket used for sharing generated SSH keys and GHFS configuration.
# Required: ships empty here and must be filled in before running ghadoop.
CONFIGBUCKET=""

# The Google Cloud Platform project name which owns the GCE resources.
# Required: ships empty here and must be filled in before running ghadoop.
PROJECT=""

###############################################################################

# GCE settings: presumably the image, machine type, and zone used when the
# cluster's VMs are created -- confirm against the ghadoop deployment script.
GCE_IMAGE='debian-7'
GCE_MACHINE_TYPE='n1-standard-4'
GCE_ZONE='us-central1-a'

# Comma-separated list of service-account scopes to include in the created VMs.
# The list of available scopes can be obtained with 'gcutil help addinstance'
# by looking under the description for "--service_account_scopes".
# Must at least include 'storage-full' for gsutil and the GCS connector to work.
GCE_SERVICE_ACCOUNT_SCOPES='storage-full'

# Number of seconds for gcutil to wait for commands to finish before
# declaring the attempt a failure.
GCUTIL_TIMEOUT_SECONDS=600

# Number of seconds between polling operations from gcutil while waiting for
# addinstance to finish. Should be increased for larger clusters to avoid
# hitting rate quota limits.
GCUTIL_POLL_INTERVAL_SECONDS=10

# If true, strips out external apt-get mirrors from /etc/apt/sources.list
# before apt-get installing the JRE. Should only be used for
# non-critical/non-sensitive deployments because it may omit security
# patches from, e.g., security.debian.org.
# Passed through normalize_boolean by 'evaluate_late_variable_bindings'.
STRIP_EXTERNAL_MIRRORS=false

# Prefix to be shared by all VM instance names in the cluster, as well as for
# SSH configuration between the JobTracker node and the TaskTracker nodes.
# 'evaluate_late_variable_bindings' derives the master name "${PREFIX}-nn" and
# the worker names "${PREFIX}-dn-<index>" from it.
PREFIX='hs-ghfs'

# The number of worker nodes in the cluster.
NUM_WORKERS=2

# List of expanded worker-node names; generally should just be derived from
# $PREFIX and $NUM_WORKERS inside 'evaluate_late_variable_bindings', which
# assigns indices 0 through NUM_WORKERS - 1.
declare -a WORKERS

# Options to be passed to TaskTracker child JVMs.
JAVAOPTS='-Xms1024m -Xmx2048m'

# Complete URL for downloading the GHFS jarfile.
GHFSJAR='https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-1.2.1.jar'

# Complete URL for downloading the GHFS configuration script.
GHCONFIG='https://storage.googleapis.com/hadoop-tools/ghconfig/ghconfig-0.27.1.tar.gz'

# URI of the Hadoop tarball to be deployed. Must begin with gs:// or http(s)://.
# Use 'gsutil ls gs://hadoop-dist/hadoop-*.tar.gz' to list Google-supplied
# options.
HADOOP_TARBALL_URI='gs://hadoop-dist/hadoop-1.2.1-bin.tar.gz'

# Directory where Hadoop is to be installed.
HADOOP_INSTALL_DIR='/home/hadoop/hadoop-install'

# Directory holding config files and scripts for Hadoop.
# NOTE: this is expanded immediately when this file is sourced, not inside
# 'evaluate_late_variable_bindings'; an override file that changes
# HADOOP_INSTALL_DIR must therefore also redefine HADOOP_CONF_DIR.
HADOOP_CONF_DIR="${HADOOP_INSTALL_DIR}/conf"

# Whether or not to configure and start HDFS.
# Must be true if DEFAULT_FS is hdfs.
ENABLE_HDFS=true

# One of [gs|hdfs].
DEFAULT_FS='gs'

# If true, tries to attach the PDs listed in WORKER_ATTACHED_PDS and
# NAMENODE_ATTACHED_PD to their respective VMs as a non-boot volume. By default,
# the PDs will be named after the instance names with a "-pd" suffix.
USE_ATTACHED_PDS=false

# Only applicable if USE_ATTACHED_PDS is true; if so, this variable controls
# whether the PDs should be created explicitly during deployment. The PDs
# must not already exist.
CREATE_ATTACHED_PDS_ON_DEPLOY=true

# Only applicable if USE_ATTACHED_PDS is true; if so, this variable controls
# whether the PDs should be deleted explicitly when deleting the cluster.
DELETE_ATTACHED_PDS_ON_DELETE=true

# Only applicable during deployment if USE_ATTACHED_PDS is true and
# CREATE_ATTACHED_PDS_ON_DEPLOY is true. Specifies the size, in GB, of
# each non-boot PD to create for the worker nodes.
WORKER_ATTACHED_PDS_SIZE_GB=500

# Only applicable during deployment if USE_ATTACHED_PDS is true and
# CREATE_ATTACHED_PDS_ON_DEPLOY is true. Specifies the size, in GB, of
# the non-boot PD to create for the master node.
NAMENODE_ATTACHED_PD_SIZE_GB=500

# List of expanded per-worker-node PD names. Only applicable if USE_ATTACHED_PDS
# is true. By default it is generated inside 'evaluate_late_variable_bindings'
# as "<worker name>-pd" for each entry of WORKERS; the master's PD name
# (NAMENODE_ATTACHED_PD) is generated there too, as "${NAMENODE_HOSTNAME}-pd".
declare -a WORKER_ATTACHED_PDS

# Helper function for normalizing boolean variables to 1/0 instead of
# true/false, respectively. We prefer to use arithmetic [1|0] instead of bash
# "true|false" and use (()) for conditions to avoid inadvertent eval of
# arbitrary strings.
#
# Arguments:
#   $1 - the NAME (not the value) of the variable to normalize in place.
#
# A value of 'true' becomes 1 and 'false' becomes 0. Any other value, including
# an already-normalized 1/0, is left untouched, so calling this repeatedly on
# the same variable is harmless.
function normalize_boolean() {
  local var_name=$1
  # ${!var_name} is indirect expansion: the value of the variable whose name
  # is stored in var_name. Both branches must test that value, never the name.
  local current_value="${!var_name}"
  if [[ "${current_value}" == 'true' ]]; then
    # printf -v assigns by name without routing the string through eval.
    printf -v "${var_name}" '%s' '1'
  elif [[ "${current_value}" == 'false' ]]; then
    printf -v "${var_name}" '%s' '0'
  fi
}

# Overridable function which will be called after sourcing all provided env
# files in sequence; allows environment variables which are derived from other
# variables to reflect overrides introduced in other files. For example, by
# computing WORKERS and NAMENODE_HOSTNAME as a late binding, an override file
# needs only to redefine PREFIX in order to adopt the new WORKERS and
# NAMENODE_HOSTNAME values as well.
#
# Reads:  PREFIX, NUM_WORKERS and the boolean settings listed below.
# Writes: the normalized booleans, WORKERS, WORKER_ATTACHED_PDS,
#         NAMENODE_HOSTNAME, NAMENODE_ATTACHED_PD, NAMENODE_URI and
#         JOB_TRACKER_URI (all as globals).
function evaluate_late_variable_bindings() {
  # This file is sourced into other scripts, so keep the loop counter local
  # instead of clobbering a global 'i' belonging to the sourcing script.
  local i

  normalize_boolean 'STRIP_EXTERNAL_MIRRORS'
  normalize_boolean 'ENABLE_HDFS'
  normalize_boolean 'USE_ATTACHED_PDS'
  normalize_boolean 'CREATE_ATTACHED_PDS_ON_DEPLOY'
  normalize_boolean 'DELETE_ATTACHED_PDS_ON_DELETE'

  # Generate the WORKERS array based on PREFIX and NUM_WORKERS, along with the
  # matching per-worker PD names. Only indices 0..NUM_WORKERS-1 are assigned;
  # any pre-existing entries at higher indices are left as they are.
  for ((i = 0; i < NUM_WORKERS; i++)); do
    WORKERS[${i}]="${PREFIX}-dn-${i}"
    WORKER_ATTACHED_PDS[${i}]="${WORKERS[${i}]}-pd"
  done

  # The instance name of the VM which serves as both the namenode and
  # jobtracker.
  NAMENODE_HOSTNAME="${PREFIX}-nn"

  # Name of the master node's PD. Only applicable if USE_ATTACHED_PDS is true.
  NAMENODE_ATTACHED_PD="${NAMENODE_HOSTNAME}-pd"

  # Fully qualified HDFS URI of the namenode.
  NAMENODE_URI="hdfs://${NAMENODE_HOSTNAME}:8020/"

  # Host and port of the jobtracker.
  JOB_TRACKER_URI="${NAMENODE_HOSTNAME}:9101"
}

# Maps each command step name to the scripts executed in that step, one array
# element per group. Within an element, the first line is the group name
# terminated by a colon; everything after the colon is a whitespace-separated
# list of script files located in the same directory as ghadoop. Group names
# must be suitable for use as a substring inside a filename.

COMMAND_GROUPS=(
  'deploy-ssh-setup:
    setup_namenode_ssh.sh
  '

  'deploy-core-setup:
   install_java.sh
   mount_disks.sh
   setup_hadoop_user.sh
   install_hadoop.sh
   configure_hadoop.sh
   install_and_configure_ghfs.sh
   set_default_fs.sh
  '

  'deploy-ssh-data-setup:
  setup_datanode_ssh.sh
  '

  'deploy-start:
  start_hadoop.sh
  '
)

# Ordered deployment steps. Each element is a comma-separated pair of
# COMMAND_GROUPS names of the form
# <invoke-on-master>,<invoke-on-all-workers>. Within an element, the commands
# are invoked concurrently on all VMs using ssh sessions running in the
# background, and all of those async invocations are awaited before moving on
# to the next element.
#
# A '*' on either side means "nothing to run there", e.g. when a command must
# finish on the master node alone before the next step runs on all workers.
COMMAND_STEPS=(
  'deploy-ssh-setup,*'
  'deploy-core-setup,deploy-core-setup'
  '*,deploy-ssh-data-setup'
  'deploy-start,*'
)
