diff --git a/bin/hadoop b/bin/hadoop index 4b4e6e32..651ec0ce 100755 --- a/bin/hadoop +++ b/bin/hadoop @@ -68,9 +68,11 @@ if [ $# = 0 ]; then echo " raidshell [options] run RAID-shell utility" echo " fs run a generic filesystem user client" echo " balancer run a cluster balancing utility" + echo " avatarbalancer run a avatar cluster balancing utility" echo " jmxget get JMX exported values from NameNode or DataNode." echo " oiv apply the offline fsimage viewer to an fsimage" echo " oev apply the offline edits viewer to an edits file" + echo " oid apply the offline fsimage decompressor to an fsimage" echo " Use -help to see options" echo " jobtracker run the MapReduce job Tracker node" echo " pipes run a Pipes job" @@ -122,6 +124,7 @@ fi # CLASSPATH initially contains $HADOOP_CONF_DIR JMX_OPTS="" CLASSPATH="${HADOOP_CONF_DIR}" +CLASSPATH=${CLASSPATH}:$HADOOP_CLASSPATH CLASSPATH=${CLASSPATH}:$JAVA_HOME/lib/tools.jar # for developers, add Hadoop classes to CLASSPATH @@ -174,12 +177,6 @@ for f in $HADOOP_HOME/build/hadoop-*-tools.jar; do TOOL_PATH=${TOOL_PATH}:$f; done -# add user-specified CLASSPATH before corona so that a newer -# corona jar can be specified to override the deployed one -if [ "$HADOOP_CLASSPATH" != "" ]; then - CLASSPATH=${CLASSPATH}:${HADOOP_CLASSPATH} -fi - # CORONA_PATH for corona daemons if [ -d "$HADOOP_HOME/build/contrib/corona/classes" ]; then CORONA_PATH=${CORONA_PATH}:$HADOOP_HOME/build/contrib/corona/classes @@ -197,6 +194,15 @@ for f in $HADOOP_HOME/contrib/corona/lib/*.jar; do CORONA_LIB_PATH=${CORONA_LIB_PATH}:$f; done +# NOTIFIER_PATH for the namespace notifier server daemon +if [ -d "$HADOOP_HOME/build/contrib/namespace-notifier/classes" ]; then + NOTIFIER_PATH=${NOTIFIER_PATH}:$HADOOP_HOME/build/contrib/namespace-notifier/classes +fi + +for f in $HADOOP_HOME/contrib/namespace-notifier/*.jar; do + NOTIFIER_PATH=${NOTIFIER_PATH}:$f; +done + # default log directory & file if [ "$HADOOP_LOG_DIR" = "" ]; then HADOOP_LOG_DIR="$HADOOP_HOME/logs" @@ -227,9 +233,13 @@ if [ "$COMMAND" = "namenode" ] ; then HADOOP_OPTS="$HADOOP_OPTS $HADOOP_GC_LOG_OPTS $HADOOP_NAMENODE_OPTS" elif [ "$COMMAND" = "avatarshell" ] ; then CLASS='org.apache.hadoop.hdfs.AvatarShell' + HADOOP_LOGFILE='avatarshell.log' + HADOOP_ROOT_LOGGER=INFO,DRFA HADOOP_OPTS="$HADOOP_OPTS $HADOOP_GC_LOG_OPTS $HADOOP_CLIENT_OPTS" elif [ "$COMMAND" = "avatarzk" ] ; then CLASS='org.apache.hadoop.hdfs.AvatarZKShell' + HADOOP_LOGFILE='avatarzkshell.log' + HADOOP_ROOT_LOGGER=INFO,DRFA HADOOP_OPTS="$HADOOP_OPTS $HADOOP_GC_LOG_OPTS $HADOOP_CLIENT_OPTS" elif [ "$COMMAND" = "avatarnode" ] ; then CLASS='org.apache.hadoop.hdfs.server.namenode.AvatarNode' @@ -238,6 +248,31 @@ elif [ "$COMMAND" = "avatarnode" ] ; then elif [ "$COMMAND" = "secondarynamenode" ] ; then CLASS='org.apache.hadoop.hdfs.server.namenode.SecondaryNameNode' HADOOP_OPTS="$HADOOP_OPTS $HADOOP_GC_LOG_OPTS $HADOOP_SECONDARYNAMENODE_OPTS" +elif [ "$COMMAND" = "raidnode" ] ; then + CLASS='org.apache.hadoop.raid.RaidNode' + JMX_OPTS=$HADOOP_JMX_RAIDNODE_OPTS + HADOOP_OPTS="$HADOOP_OPTS $HADOOP_GC_LOG_OPTS" + CLASSPATH=${CORONA_LIB_PATH}:${CLASSPATH} +elif [ "$COMMAND" = "notifier" ] ; then + CLASS='org.apache.hadoop.hdfs.notifier.server.ServerCore' + if [ "$NOTIFIER_PATH" != "" ]; then + CLASSPATH=${CLASSPATH}:${NOTIFIER_PATH} + fi + JMX_OPTS="$JMX_OPTS $NOTIFIER_JMX_OPTS" +elif [ "$COMMAND" = "fsshellservice" ] ; then + CLASS='org.apache.hadoop.hdfs.fsshellservice.FsShellServiceImpl' + if [ -d "$HADOOP_HOME/build/contrib/corona/lib" ]; then + for f in 
$HADOOP_HOME/build/contrib/corona/lib/*.jar; do + CLASSPATH=${CLASSPATH}:$f; + done + fi + if [ -d "$HADOOP_HOME/build/contrib/fsshellservice/" ]; then + CLASSPATH=${CLASSPATH}:$HADOOP_HOME/build/contrib/fsshellservice/classes + fi + for f in $HADOOP_HOME/contrib/fsshellservice/*.jar; do + CLASSPATH=${CLASSPATH}:$f; + done + CLASSPATH=${CORONA_LIB_PATH}:${CLASSPATH} elif [ "$COMMAND" = "avatardatanode" ] ; then CLASS='org.apache.hadoop.hdfs.server.datanode.AvatarDataNode' JMX_OPTS=$HADOOP_JMX_DATANODE_OPTS @@ -280,12 +315,19 @@ elif [ "$COMMAND" = "balancer" ] ; then CLASS=org.apache.hadoop.hdfs.server.balancer.Balancer HADOOP_OPTS="$HADOOP_OPTS $HADOOP_BALANCER_OPTS" CMDLINE_OPTS="$CMDLINE_OPTS $BALANCER_CMDLINE_OPTS" +elif [ "$COMMAND" = "avatarbalancer" ] ; then + CLASS=org.apache.hadoop.hdfs.server.balancer.AvatarBalancer + HADOOP_OPTS="$HADOOP_OPTS $HADOOP_BALANCER_OPTS" + CMDLINE_OPTS="$CMDLINE_OPTS $BALANCER_CMDLINE_OPTS" elif [ "$COMMAND" = "oiv" ] ; then CLASS=org.apache.hadoop.hdfs.tools.offlineImageViewer.OfflineImageViewer HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS" elif [ "$COMMAND" = "oev" ] ; then CLASS=org.apache.hadoop.hdfs.tools.offlineEditsViewer.OfflineEditsViewer HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS" +elif [ "$COMMAND" = "oid" ] ; then + CLASS=org.apache.hadoop.hdfs.tools.offlineImageViewer.OfflineImageDecompressor + HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS" elif [ "$COMMAND" = "jmxget" ] ; then CLASS=org.apache.hadoop.hdfs.tools.JMXGet HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS" @@ -301,11 +343,14 @@ elif [ "$COMMAND" = "coronaclustermanager" ] ; then JMX_OPTS=$HADOOP_JMX_CORONACLUSTERMANAGER_OPTS HADOOP_OPTS="$HADOOP_OPTS $HADOOP_GC_LOG_OPTS $HADOOP_CORONACLUSTERMANAGER_OPTS" # Corona lib path should be first to ensure that it uses the right thrift JAR - CLASSPATH=${CORONA_LIB_PATH}:${CLASSPATH} + CLASSPATH=${CORONA_LIB_PATH}:${CLUSTER_MANAGER_LIB_PATH}:${CLASSPATH} elif [ "$COMMAND" = "coronatasktracker" ] ; then CLASS=org.apache.hadoop.mapred.CoronaTaskTracker JMX_OPTS=$HADOOP_JMX_TASKTRACKER_OPTS HADOOP_OPTS="$HADOOP_OPTS $HADOOP_GC_LOG_OPTS $HADOOP_TASKTRACKER_OPTS" + # For corona task trackers, the tasks should not get the thrift library. + MAPREDUCE_TASK_SYSTEM_CLASSPATH=${CLASSPATH} + export MAPREDUCE_TASK_SYSTEM_CLASSPATH # See coronaclustermanager comment CLASSPATH=${CORONA_LIB_PATH}:${CLASSPATH} elif [ "$COMMAND" = "coronaproxyjobtracker" ] ; then diff --git a/bin/start-corona.sh b/bin/start-corona.sh index b7098690..c4132a83 100755 --- a/bin/start-corona.sh +++ b/bin/start-corona.sh @@ -24,7 +24,8 @@ bin=`cd "$bin"; pwd` . "$bin"/hadoop-config.sh # start corona daemons -# start clustermanager first to minimize connection errors at startup -"$bin"/hadoop-daemon.sh --config $HADOOP_CONF_DIR start coronaclustermanager +# start start-proxyjt.sh first so that clustermanager can be started correctly "$bin"/start-proxyjt.sh --config $HADOOP_CONF_DIR +sleep 1 +"$bin"/hadoop-daemon.sh --config $HADOOP_CONF_DIR start coronaclustermanager "$bin"/hadoop-daemons.sh --config $HADOOP_CONF_DIR start coronatasktracker diff --git a/bin/start-fsshellservice.sh b/bin/start-fsshellservice.sh new file mode 100644 index 00000000..1c1d1ce7 --- /dev/null +++ b/bin/start-fsshellservice.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash + +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +usage="Usage: start-fsshellservice.sh" + +params=$# +bin=`dirname "$0"` +bin=`cd "$bin"; pwd` + +. "$bin"/hadoop-config.sh + +# get arguments +if [ $# -ge 1 ]; then + echo $usage +fi + +"$bin"/hadoop-daemon.sh --config $HADOOP_CONF_DIR start fsshellservice diff --git a/bin/start-namespace-notifier.sh b/bin/start-namespace-notifier.sh new file mode 100644 index 00000000..946bf4d2 --- /dev/null +++ b/bin/start-namespace-notifier.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash + +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +usage="Usage: start-namespace-notifier.sh" + +params=$# +bin=`dirname "$0"` +bin=`cd "$bin"; pwd` + +. "$bin"/hadoop-config.sh +. "$bin"/../conf/hadoop-env.sh + +# get arguments +if [ $# -ge 1 ]; then + echo $usage +fi + +export NOTIFIER_JMX_OPTS=" -Dcom.sun.management.jmxremote.port=$NOTIFIER_JMX_PORT -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false" + +"$bin"/hadoop-daemon.sh --config $HADOOP_CONF_DIR start notifier diff --git a/bin/start-raidnode.sh b/bin/start-raidnode.sh index 72f5cc16..b67fc44d 100755 --- a/bin/start-raidnode.sh +++ b/bin/start-raidnode.sh @@ -37,4 +37,4 @@ if [ -f "${HADOOP_CONF_DIR}/hadoop-env.sh" ]; then fi export HADOOP_DAEMON_OPTS=$HADOOP_RAIDNODE_OPTS -"$bin"/hadoop-daemon.sh --config $HADOOP_CONF_DIR start org.apache.hadoop.raid.RaidNode +"$bin"/hadoop-daemon.sh --config $HADOOP_CONF_DIR start raidnode diff --git a/bin/stop-fsshellservice.sh b/bin/stop-fsshellservice.sh new file mode 100644 index 00000000..271639f3 --- /dev/null +++ b/bin/stop-fsshellservice.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash + +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +usage="Usage: stop-fsshellservice.sh" + +params=$# +bin=`dirname "$0"` +bin=`cd "$bin"; pwd` + +. "$bin"/hadoop-config.sh + +# get arguments +if [ $# -ge 1 ]; then + echo $usage +fi + +"$bin"/hadoop-daemon.sh --config $HADOOP_CONF_DIR stop fsshellservice diff --git a/bin/stop-namespace-notifier.sh b/bin/stop-namespace-notifier.sh new file mode 100644 index 00000000..f29734f1 --- /dev/null +++ b/bin/stop-namespace-notifier.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash + +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +usage="Usage: stop-namespace-notifier.sh" + +params=$# +bin=`dirname "$0"` +bin=`cd "$bin"; pwd` + +. "$bin"/hadoop-config.sh + +# get arguments +if [ $# -ge 1 ]; then + echo $usage +fi + +"$bin"/hadoop-daemon.sh --config $HADOOP_CONF_DIR stop notifier diff --git a/build.xml b/build.xml index 20457e73..6f84bd0b 100644 --- a/build.xml +++ b/build.xml @@ -27,7 +27,7 @@ - + @@ -107,7 +107,7 @@ - + - + @@ -229,6 +229,14 @@ + + + + + + + + @@ -499,6 +507,7 @@ + @@ -559,8 +568,20 @@ - - + + + + + + + @@ -569,8 +590,16 @@ - - + + + + + + + + + + @@ -763,6 +792,7 @@ + @@ -842,7 +872,7 @@ + value="${build.native}/lib:${lib.dir}/native/${build.platform}:${snappy.lib}"/> @@ -905,7 +935,7 @@ - + @@ -1214,6 +1244,8 @@ + + @@ -1306,6 +1338,17 @@ + + + + + + + + + + + @@ -1327,6 +1370,8 @@ + + @@ -1378,6 +1423,11 @@ + + + + + @@ -1388,6 +1438,7 @@ + @@ -1911,7 +1962,7 @@ @@ -2006,5 +2057,9 @@ + + + + diff --git a/conf/hadoop-env.sh b/conf/hadoop-env.sh index ecf3eea8..2d8bb4fe 100644 --- a/conf/hadoop-env.sh +++ b/conf/hadoop-env.sh @@ -1,68 +1,74 @@ -# Set Hadoop-specific environment variables here. +# This if statement insures that this file will be sources only once +if [[ "$IS_HADOOP_ENV_ALREADY_SOURCED" != "true" ]]; then + export IS_HADOOP_ENV_ALREADY_SOURCED="true"; -# The only required environment variable is JAVA_HOME. All others are -# optional. When running a distributed configuration it is best to -# set JAVA_HOME in this file, so that it is correctly defined on -# remote nodes. + # Set Hadoop-specific environment variables here. -# The java implementation to use. Required. -# export JAVA_HOME=/usr/lib/j2sdk1.5-sun + # The only required environment variable is JAVA_HOME. All others are + # optional. When running a distributed configuration it is best to + # set JAVA_HOME in this file, so that it is correctly defined on + # remote nodes. 
-# Extra Java CLASSPATH elements. Optional. -# export HADOOP_CLASSPATH= + # The java implementation to use. Required. + # export JAVA_HOME=/usr/lib/j2sdk1.5-sun -# The maximum amount of heap to use, in MB. Default is 1000. -# export HADOOP_HEAPSIZE=2000 + # Extra Java CLASSPATH elements. Optional. + # export HADOOP_CLASSPATH= -# Extra Java runtime options. Empty by default. -# export HADOOP_OPTS=-server + # The maximum amount of heap to use, in MB. Default is 1000. + # export HADOOP_HEAPSIZE=2000 -# Command specific options appended to HADOOP_OPTS when specified -export HADOOP_NAMENODE_OPTS="-Dcom.sun.management.jmxremote $HADOOP_NAMENODE_OPTS" -export HADOOP_SECONDARYNAMENODE_OPTS="-Dcom.sun.management.jmxremote $HADOOP_SECONDARYNAMENODE_OPTS" -export HADOOP_DATANODE_OPTS="-Dcom.sun.management.jmxremote $HADOOP_DATANODE_OPTS" -export HADOOP_BALANCER_OPTS="-Dcom.sun.management.jmxremote $HADOOP_BALANCER_OPTS" -export HADOOP_JOBTRACKER_OPTS="-Dcom.sun.management.jmxremote $HADOOP_JOBTRACKER_OPTS" -export HADOOP_RAIDNODE_OPTS="-Dcom.sun.management.jmxremote $HADOOP_RAIDNODE_OPTS" -export HADOOP_NAMENODE_OPTS="-Dcom.sun.management.jmxremote.port=8998 -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false" + # Extra Java runtime options. Empty by default. + # export HADOOP_OPTS=-server -# The only user who can start hadoop daemons. -# If this is not set, any user can start hadoop daemons. -export HADOOP_USERNAME="hadoop" + # Command specific options appended to HADOOP_OPTS when specified + export HADOOP_NAMENODE_OPTS="-Dcom.sun.management.jmxremote $HADOOP_NAMENODE_OPTS" + export HADOOP_SECONDARYNAMENODE_OPTS="-Dcom.sun.management.jmxremote $HADOOP_SECONDARYNAMENODE_OPTS" + export HADOOP_DATANODE_OPTS="-Dcom.sun.management.jmxremote $HADOOP_DATANODE_OPTS" + export HADOOP_BALANCER_OPTS="-Dcom.sun.management.jmxremote $HADOOP_BALANCER_OPTS" + export HADOOP_JOBTRACKER_OPTS="-Dcom.sun.management.jmxremote $HADOOP_JOBTRACKER_OPTS" + export HADOOP_RAIDNODE_OPTS="-Dcom.sun.management.jmxremote $HADOOP_RAIDNODE_OPTS" + export HADOOP_NAMENODE_OPTS="-Dcom.sun.management.jmxremote.port=8998 -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false" -# Java Runtime garbage collection options to pass to all Hadoop -# servers (Namenode, Jobtracker, Datanode, Tasktracker). This must end -# with a colon ; to which the dynamically generated gc log filename will -# be appended to. The below defaults work for the Sun JVM, for example -# in IBM GC, use '-Xverbosegclog:'. -#export HADOOP_GC_LOG_OPTS="-XX:+PrintGCDateStamps -XX:+PrintGCDetails -Xloggc:" + # The only user who can start hadoop daemons. + # If this is not set, any user can start hadoop daemons. + export HADOOP_USERNAME="hadoop" -# export HADOOP_TASKTRACKER_OPTS= -# The following applies to multiple commands (fs, dfs, fsck, distcp etc) -# export HADOOP_CLIENT_OPTS + # Java Runtime garbage collection options to pass to all Hadoop + # servers (Namenode, Jobtracker, Datanode, Tasktracker). This must end + # with a colon ; to which the dynamically generated gc log filename will + # be appended to. The below defaults work for the Sun JVM, for example + # in IBM GC, use '-Xverbosegclog:'. + #export HADOOP_GC_LOG_OPTS="-XX:+PrintGCDateStamps -XX:+PrintGCDetails -Xloggc:" -# Extra ssh options. Empty by default. 
-# export HADOOP_SSH_OPTS="-o ConnectTimeout=1 -o SendEnv=HADOOP_CONF_DIR" + # export HADOOP_TASKTRACKER_OPTS= + # The following applies to multiple commands (fs, dfs, fsck, distcp etc) + # export HADOOP_CLIENT_OPTS -# Where log files are stored. $HADOOP_HOME/logs by default. -# export HADOOP_LOG_DIR=${HADOOP_HOME}/logs + # Extra ssh options. Empty by default. + # export HADOOP_SSH_OPTS="-o ConnectTimeout=1 -o SendEnv=HADOOP_CONF_DIR" -# File naming remote slave hosts. $HADOOP_HOME/conf/slaves by default. -# export HADOOP_SLAVES=${HADOOP_HOME}/conf/slaves + # Where log files are stored. $HADOOP_HOME/logs by default. + # export HADOOP_LOG_DIR=${HADOOP_HOME}/logs -# host:path where hadoop code should be rsync'd from. Unset by default. -# export HADOOP_MASTER=master:/home/$USER/src/hadoop + # File naming remote slave hosts. $HADOOP_HOME/conf/slaves by default. + # export HADOOP_SLAVES=${HADOOP_HOME}/conf/slaves -# Seconds to sleep between slave commands. Unset by default. This -# can be useful in large clusters, where, e.g., slave rsyncs can -# otherwise arrive faster than the master can service them. -# export HADOOP_SLAVE_SLEEP=0.1 + # host:path where hadoop code should be rsync'd from. Unset by default. + # export HADOOP_MASTER=master:/home/$USER/src/hadoop -# The directory where pid files are stored. /tmp by default. -# export HADOOP_PID_DIR=/var/hadoop/pids + # Seconds to sleep between slave commands. Unset by default. This + # can be useful in large clusters, where, e.g., slave rsyncs can + # otherwise arrive faster than the master can service them. + # export HADOOP_SLAVE_SLEEP=0.1 -# A string representing this instance of hadoop. $USER by default. -# export HADOOP_IDENT_STRING=$USER + # The directory where pid files are stored. /tmp by default. + # export HADOOP_PID_DIR=/var/hadoop/pids -# The scheduling priority for daemon processes. See 'man nice'. -# export HADOOP_NICENESS=10 + # A string representing this instance of hadoop. $USER by default. + # export HADOOP_IDENT_STRING=$USER + + # The scheduling priority for daemon processes. See 'man nice'. + # export HADOOP_NICENESS=10 + +fi diff --git a/conf/hadoop-env.sh.template b/conf/hadoop-env.sh.template index ecf3eea8..2d8bb4fe 100644 --- a/conf/hadoop-env.sh.template +++ b/conf/hadoop-env.sh.template @@ -1,68 +1,74 @@ -# Set Hadoop-specific environment variables here. +# This if statement insures that this file will be sources only once +if [[ "$IS_HADOOP_ENV_ALREADY_SOURCED" != "true" ]]; then + export IS_HADOOP_ENV_ALREADY_SOURCED="true"; -# The only required environment variable is JAVA_HOME. All others are -# optional. When running a distributed configuration it is best to -# set JAVA_HOME in this file, so that it is correctly defined on -# remote nodes. + # Set Hadoop-specific environment variables here. -# The java implementation to use. Required. -# export JAVA_HOME=/usr/lib/j2sdk1.5-sun + # The only required environment variable is JAVA_HOME. All others are + # optional. When running a distributed configuration it is best to + # set JAVA_HOME in this file, so that it is correctly defined on + # remote nodes. -# Extra Java CLASSPATH elements. Optional. -# export HADOOP_CLASSPATH= + # The java implementation to use. Required. + # export JAVA_HOME=/usr/lib/j2sdk1.5-sun -# The maximum amount of heap to use, in MB. Default is 1000. -# export HADOOP_HEAPSIZE=2000 + # Extra Java CLASSPATH elements. Optional. + # export HADOOP_CLASSPATH= -# Extra Java runtime options. Empty by default. 
-# export HADOOP_OPTS=-server + # The maximum amount of heap to use, in MB. Default is 1000. + # export HADOOP_HEAPSIZE=2000 -# Command specific options appended to HADOOP_OPTS when specified -export HADOOP_NAMENODE_OPTS="-Dcom.sun.management.jmxremote $HADOOP_NAMENODE_OPTS" -export HADOOP_SECONDARYNAMENODE_OPTS="-Dcom.sun.management.jmxremote $HADOOP_SECONDARYNAMENODE_OPTS" -export HADOOP_DATANODE_OPTS="-Dcom.sun.management.jmxremote $HADOOP_DATANODE_OPTS" -export HADOOP_BALANCER_OPTS="-Dcom.sun.management.jmxremote $HADOOP_BALANCER_OPTS" -export HADOOP_JOBTRACKER_OPTS="-Dcom.sun.management.jmxremote $HADOOP_JOBTRACKER_OPTS" -export HADOOP_RAIDNODE_OPTS="-Dcom.sun.management.jmxremote $HADOOP_RAIDNODE_OPTS" -export HADOOP_NAMENODE_OPTS="-Dcom.sun.management.jmxremote.port=8998 -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false" + # Extra Java runtime options. Empty by default. + # export HADOOP_OPTS=-server -# The only user who can start hadoop daemons. -# If this is not set, any user can start hadoop daemons. -export HADOOP_USERNAME="hadoop" + # Command specific options appended to HADOOP_OPTS when specified + export HADOOP_NAMENODE_OPTS="-Dcom.sun.management.jmxremote $HADOOP_NAMENODE_OPTS" + export HADOOP_SECONDARYNAMENODE_OPTS="-Dcom.sun.management.jmxremote $HADOOP_SECONDARYNAMENODE_OPTS" + export HADOOP_DATANODE_OPTS="-Dcom.sun.management.jmxremote $HADOOP_DATANODE_OPTS" + export HADOOP_BALANCER_OPTS="-Dcom.sun.management.jmxremote $HADOOP_BALANCER_OPTS" + export HADOOP_JOBTRACKER_OPTS="-Dcom.sun.management.jmxremote $HADOOP_JOBTRACKER_OPTS" + export HADOOP_RAIDNODE_OPTS="-Dcom.sun.management.jmxremote $HADOOP_RAIDNODE_OPTS" + export HADOOP_NAMENODE_OPTS="-Dcom.sun.management.jmxremote.port=8998 -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false" -# Java Runtime garbage collection options to pass to all Hadoop -# servers (Namenode, Jobtracker, Datanode, Tasktracker). This must end -# with a colon ; to which the dynamically generated gc log filename will -# be appended to. The below defaults work for the Sun JVM, for example -# in IBM GC, use '-Xverbosegclog:'. -#export HADOOP_GC_LOG_OPTS="-XX:+PrintGCDateStamps -XX:+PrintGCDetails -Xloggc:" + # The only user who can start hadoop daemons. + # If this is not set, any user can start hadoop daemons. + export HADOOP_USERNAME="hadoop" -# export HADOOP_TASKTRACKER_OPTS= -# The following applies to multiple commands (fs, dfs, fsck, distcp etc) -# export HADOOP_CLIENT_OPTS + # Java Runtime garbage collection options to pass to all Hadoop + # servers (Namenode, Jobtracker, Datanode, Tasktracker). This must end + # with a colon ; to which the dynamically generated gc log filename will + # be appended to. The below defaults work for the Sun JVM, for example + # in IBM GC, use '-Xverbosegclog:'. + #export HADOOP_GC_LOG_OPTS="-XX:+PrintGCDateStamps -XX:+PrintGCDetails -Xloggc:" -# Extra ssh options. Empty by default. -# export HADOOP_SSH_OPTS="-o ConnectTimeout=1 -o SendEnv=HADOOP_CONF_DIR" + # export HADOOP_TASKTRACKER_OPTS= + # The following applies to multiple commands (fs, dfs, fsck, distcp etc) + # export HADOOP_CLIENT_OPTS -# Where log files are stored. $HADOOP_HOME/logs by default. -# export HADOOP_LOG_DIR=${HADOOP_HOME}/logs + # Extra ssh options. Empty by default. + # export HADOOP_SSH_OPTS="-o ConnectTimeout=1 -o SendEnv=HADOOP_CONF_DIR" -# File naming remote slave hosts. $HADOOP_HOME/conf/slaves by default. 
-# export HADOOP_SLAVES=${HADOOP_HOME}/conf/slaves + # Where log files are stored. $HADOOP_HOME/logs by default. + # export HADOOP_LOG_DIR=${HADOOP_HOME}/logs -# host:path where hadoop code should be rsync'd from. Unset by default. -# export HADOOP_MASTER=master:/home/$USER/src/hadoop + # File naming remote slave hosts. $HADOOP_HOME/conf/slaves by default. + # export HADOOP_SLAVES=${HADOOP_HOME}/conf/slaves -# Seconds to sleep between slave commands. Unset by default. This -# can be useful in large clusters, where, e.g., slave rsyncs can -# otherwise arrive faster than the master can service them. -# export HADOOP_SLAVE_SLEEP=0.1 + # host:path where hadoop code should be rsync'd from. Unset by default. + # export HADOOP_MASTER=master:/home/$USER/src/hadoop -# The directory where pid files are stored. /tmp by default. -# export HADOOP_PID_DIR=/var/hadoop/pids + # Seconds to sleep between slave commands. Unset by default. This + # can be useful in large clusters, where, e.g., slave rsyncs can + # otherwise arrive faster than the master can service them. + # export HADOOP_SLAVE_SLEEP=0.1 -# A string representing this instance of hadoop. $USER by default. -# export HADOOP_IDENT_STRING=$USER + # The directory where pid files are stored. /tmp by default. + # export HADOOP_PID_DIR=/var/hadoop/pids -# The scheduling priority for daemon processes. See 'man nice'. -# export HADOOP_NICENESS=10 + # A string representing this instance of hadoop. $USER by default. + # export HADOOP_IDENT_STRING=$USER + + # The scheduling priority for daemon processes. See 'man nice'. + # export HADOOP_NICENESS=10 + +fi diff --git a/conf/log4j.properties b/conf/log4j.properties index 402f3a71..7e04cbc1 100644 --- a/conf/log4j.properties +++ b/conf/log4j.properties @@ -34,7 +34,7 @@ log4j.appender.DRFA.DatePattern=.yyyy-MM-dd log4j.appender.DRFA.layout=org.apache.log4j.PatternLayout # Pattern format: Date LogLevel LoggerName LogMessage -log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n +log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %p %c{1}: %m%n # Debugging Pattern format #log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n diff --git a/conf/log4j.properties.scribeappender b/conf/log4j.properties.scribeappender new file mode 100644 index 00000000..958b0f22 --- /dev/null +++ b/conf/log4j.properties.scribeappender @@ -0,0 +1,135 @@ +# Define some default values that can be overridden by system properties +hadoop.root.logger=INFO,console +hadoop.log.dir=. +hadoop.log.file=hadoop.log + +# Define the root logger to the system property "hadoop.root.logger". +log4j.rootLogger=${hadoop.root.logger}, EventCounter + +# Logging Threshold +log4j.threshhold=ALL + +# +# Daily Rolling File Appender +# + +log4j.appender.DRFA=org.apache.log4j.DailyRollingFileAppender +log4j.appender.DRFA.File=${hadoop.log.dir}/${hadoop.log.file} + +# Rollver at midnight +#log4j.appender.DRFA.DatePattern=.yyyy-MM-dd + +# Rollver at the top of every hour +log4j.appender.DRFA.DatePattern=.yyyy-MM-dd-HH + +# 30-day backup +#log4j.appender.DRFA.MaxBackupIndex=30 +log4j.appender.DRFA.layout=org.apache.log4j.PatternLayout + +# Pattern format: Date LogLevel LoggerName LogMessage +log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %p %c{1}: %m%n +# Debugging Pattern format +#log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n + +# +# logmonitor understood +# This format is the one that logmonitor can understand. 
It is heavyweight so +# should only be used for WARN and above +# + +log4j.appender.LM=org.apache.log4j.DailyRollingFileAppender +log4j.appender.LM.threshold=WARN +log4j.appender.LM.File=${hadoop.log.dir}/logmonitor-${hadoop.log.file} +log4j.appender.LM.DatePattern=.yyyy-MM-dd-HH +log4j.appender.LM.layout=org.apache.log4j.PatternLayout +log4j.appender.LM.layout.ConversionPattern=[%c{3},%L] [%d{EEE MMM dd HH:mm:ss yyyy}] %p: %m%n + +# +# console +# Add "console" to rootlogger above if you want to use this +# + +log4j.appender.console=org.apache.log4j.ConsoleAppender +log4j.appender.console.target=System.err +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n + +# +# TaskLog Appender +# + +#Default values +hadoop.tasklog.taskid=null +hadoop.tasklog.noKeepSplits=4 +hadoop.tasklog.totalLogFileSize=100 +hadoop.tasklog.purgeLogSplits=true +hadoop.tasklog.logsRetainHours=12 + +log4j.appender.TLA=org.apache.hadoop.mapred.TaskLogAppender +log4j.appender.TLA.taskId=${hadoop.tasklog.taskid} +log4j.appender.TLA.totalLogFileSize=${hadoop.tasklog.totalLogFileSize} + +log4j.appender.TLA.layout=org.apache.log4j.PatternLayout +log4j.appender.TLA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n + +# +# Rolling File Appender +# + +#log4j.appender.RFA=org.apache.log4j.RollingFileAppender +#log4j.appender.RFA.File=${hadoop.log.dir}/${hadoop.log.file} + +# Logfile size and and 30-day backups +#log4j.appender.RFA.MaxFileSize=1MB +#log4j.appender.RFA.MaxBackupIndex=30 + +#log4j.appender.RFA.layout=org.apache.log4j.PatternLayout +#log4j.appender.RFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} - %m%n +#log4j.appender.RFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n + +# +# FSNamesystem Audit logging +# All audit events are logged at INFO level +# +log4j.logger.org.apache.hadoop.hdfs.server.FSNamesystem.audit=INFO + +# Custom Logging levels + +#log4j.logger.org.apache.hadoop.mapred.JobTracker=DEBUG +#log4j.logger.org.apache.hadoop.mapred.TaskTracker=DEBUG +#log4j.logger.org.apache.hadoop.fs.FSNamesystem=DEBUG + +# Jets3t library +log4j.logger.org.jets3t.service.impl.rest.httpclient.RestS3Service=ERROR + +# +# Event Counter Appender +# Sends counts of logging messages at different severity levels to Hadoop Metrics. +# +log4j.appender.EventCounter=org.apache.hadoop.metrics.jvm.EventCounter + +# Special appender for RAID metrics. +log4j.logger.RaidMetrics=INFO,SCRIBE_RAID_METRICS_APPENDER + +# RaidMetrics +log4j.appender.SCRIBE_RAID_METRICS_APPENDER=com.facebook.logging.ScribeAppender +log4j.appender.SCRIBE_RAID_METRICS_APPENDER.tag=${hadoop.tasklog.taskid} +log4j.appender.SCRIBE_RAID_METRICS_APPENDER.application=raid +log4j.appender.SCRIBE_RAID_METRICS_APPENDER.installation=${hadoop.installationid} +log4j.appender.SCRIBE_RAID_METRICS_APPENDER.layout=org.apache.log4j.PatternLayout +log4j.appender.SCRIBE_RAID_METRICS_APPENDER.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n +log4j.appender.SCRIBE_RAID_METRICS_APPENDER.Threshold=INFO + +# +# This is a scribe appender. 
The data will be sent directly to scribe +# +# +log4j.appender.scribe=com.facebook.logging.ScribeAppender +log4j.appender.scribe.tag=${hadoop.tasklog.taskid} +log4j.appender.scribe.application=${hadoop.application} +log4j.appender.scribe.installation=${hadoop.installationid} +log4j.appender.scribe.layout=org.apache.log4j.PatternLayout +log4j.appender.scribe.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n +log4j.appender.scribe.threshold=WARN + +log4j.logger.com.facebook.smc.SmcUtil=INFO,console diff --git a/copy-hdfs-jars-to-maven.sh b/copy-hdfs-jars-to-maven.sh index 7ffdf8f1..45033901 100755 --- a/copy-hdfs-jars-to-maven.sh +++ b/copy-hdfs-jars-to-maven.sh @@ -5,59 +5,68 @@ # and test) built in titan/VENDOR/hadoop-0.20/. # +set -e -u -o pipefail BASEDIR=`dirname $0` cd ${BASEDIR} -if [ ! -f build/hadoop-0.20.1-dev-core.jar ]; then - if [ ! -f build/hadoop-0.20-core.jar ]; then - echo "core jar not found. Running 'ant jar'..." - ant jar | grep BUILD; - fi +VERSION=$( ant -q print-version | head -1 | awk '{print $2}' ) +if [ -z "$VERSION" ]; then + echo "Unable to determine Hadoop version" >&2 + exit 1 fi -if [ ! -f build/hadoop-0.20.1-dev-test.jar ]; then - if [ ! -f build/hadoop-0.20-test.jar ]; then - echo "test jar not found. Running 'ant jar-test'..." - ant jar-test | grep BUILD; - fi +TARGETS="" + +CORE_JAR=build/hadoop-$VERSION-core.jar +if [ ! -f $CORE_JAR ]; then + TARGETS="$TARGETS jar" fi +CORE_POM=build/ivy/maven/generated.pom +if [ ! -f $CORE_POM ]; then + TARGETS="$TARGETS makepom" +fi -# -# The names of core/test jar name depend -# on whether they were generated using -# build_all.sh script or just the vanilla -# simple ant jar/jar-test -# -if [ -f build/hadoop-0.20.1-dev-core.jar ]; then - CORE_JAR=build/hadoop-0.20.1-dev-core.jar -else - CORE_JAR=build/hadoop-0.20-core.jar +TEST_JAR=build/hadoop-$VERSION-test.jar +if [ ! -f $TEST_JAR ]; then + TARGETS="$TARGETS jar-test" fi -if [ -f build/hadoop-0.20.1-dev-test.jar ]; then - TEST_JAR=build/hadoop-0.20.1-dev-test.jar -else - TEST_JAR=build/hadoop-0.20-test.jar +if [ -n "$TARGETS" ]; then + ant $TARGETS fi +# Clear the optional flag on Hadoop dependencies so these dependencies can be +# included transitively in other projects. +CORE_POM_MODIFIED=$CORE_POM.new +./edit_generated_pom.py >$CORE_POM_MODIFIED + echo "** Publishing hadoop* core & test jars " echo "** to " echo "** your local maven repo (~/.m2/repository). " echo "** HBase builds will pick up the HDFS* jars from the local maven repo." 
-mvn install:install-file \ - -DgeneratePom=true \ +# When running under Commander, use the setting.xml file that specifies +# the localRepository for a central mvn repo that can be shared between +# all of the build/test agents +OPTS="" +if [[ -n "${COMMANDER_WORKSPACE:-}" || "$USER" == "svcscm" ]]; then + OPTS="-s /scm/git/electric/hadoop_builds/settings.xml" +fi + +mvn $OPTS install:install-file \ + -DpomFile=$CORE_POM_MODIFIED \ -DgroupId=org.apache.hadoop \ -DartifactId=hadoop-core \ - -Dversion=0.20 \ + -Dversion=$VERSION \ -Dpackaging=jar \ -Dfile=${CORE_JAR} -mvn install:install-file \ +mvn $OPTS install:install-file \ -DgeneratePom=true \ -DgroupId=org.apache.hadoop \ -DartifactId=hadoop-test \ - -Dversion=0.20 \ + -Dversion=$VERSION \ -Dpackaging=jar \ -Dfile=${TEST_JAR} + diff --git a/edit_generated_pom.py b/edit_generated_pom.py new file mode 100644 index 00000000..c2f88d39 --- /dev/null +++ b/edit_generated_pom.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python + +''' +Reads the automatically generated Hadoop pom file, removes the "optional" +flag from dependencies so that they could be included transitively into other +projects such as HBase, and removes certain dependencies that are not required +and could even break the code (e.g. an old version of xerces). Writes the +modified project object model XML to standard output. +''' + +import os +import re +import sys + +from xml.dom.minidom import parse + +NON_TRANSITIVE_DEPS = [ + # Old version, breaks HBase + 'xerces', + + # Not used in production + 'checkstyle', + 'jdiff', + + # A release audit tool, probably not used in prod + 'rat-lib', +] + +POM_FILE = 'build/ivy/maven/generated.pom' +doc = parse(POM_FILE) +deps = doc.getElementsByTagName('dependencies')[0] + +for dep in deps.getElementsByTagName('dependency'): + for c in dep.childNodes: + if (c.nodeName == 'artifactId' and + c.firstChild and + c.firstChild.nodeValue and + c.firstChild.nodeValue.strip() in NON_TRANSITIVE_DEPS): + deps.removeChild(dep) + break + + for o in dep.getElementsByTagName('optional'): + dep.removeChild(o) + +out_lines = doc.toprettyxml(indent=' ' * 2) +lines = [] +for l in out_lines.split('\n'): + l = l.rstrip() + if l: + lines.append(l) +output = '\n'.join(lines) + +# Make sure values stay on the same line: value +output = re.sub( + r'(<([a-zA-Z]+)>)' + r'\s*([^<>]+?)\s*' + r'()', r'\1\3\4', output) + +print output + diff --git a/hdfs-autoconf/README.md b/hdfs-autoconf/README.md new file mode 100644 index 00000000..e47bbfa2 --- /dev/null +++ b/hdfs-autoconf/README.md @@ -0,0 +1,213 @@ +What is this? +============= + +This is autoconfigurator and autolauncher for a local HDFS cluster. +It is supposed to be mainly used for developer purposes, and it provides +you with bunch of scripts for setting everything up in a minute.. or maybe two. +Enjoy! + +DISCLAIMER: The scripts are written and tested on the GNU system and relies +on GNU tools. At least two of them (`sed` & `readlink`) are known +to be incompatible with their BSD implementations. + + + +STARTING CLUSTER +================ + +1. Make sure you have a zookeeper quorum started somewhere and that file + `config-meta/avatar-shared.sed` has a `zookeeper-quorum` entry that points + to the quorum. If not, you can start a local zookeeper via + `zookeeper.sh start` command +2. `./build.sh` - builds all sources needed to start HDFS cluster +3. `./avatar-format` - formats cluster directories +4. `./start-dev-cluster --count 3` - starts local cluster with 3 datanodes. 
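
Taken together, the sequence above boils down to four commands. A minimal sketch of an end-to-end session (assuming you want the local zookeeper from step 1 rather than an existing quorum):

```
./zookeeper.sh start             # skip if config-meta/avatar-shared.sed already points to an existing quorum
./build.sh                       # build HDFS sources
./avatar-format                  # format namenode and shared directories
./start-dev-cluster --count 3    # bring up a local cluster with 3 datanodes
```
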
[OPTIONAL] If you want to change any `core-site.xml` or `hdfs-site.xml` properties,
make the necessary changes in the `config-templates/core-site.xml.template` and
`config-meta/hdfs-site.xml.template` files. If you want to configure cluster
directories, please refer to the FAQ questions "Where do namenodes store their data?"
and "Where do datanodes store their data?".


F.A.Q
=====

Where do I find cluster log files?
----------------------------------

The logs directory is specified by the `$LOGS_DIRECTORY` variable, which defaults to
`$HADOOP_VERSION/logs`.


Where do namenodes store their data?
------------------------------------

1. The directory used as the local directory for the active namenode is
   specified in the `./config-meta/avatar-zero.sed` file.
2. Similarly, the local directory for the standby namenode is
   specified in the `./config-meta/avatar-one.sed` file.
3. The shared namenode directory is specified in the
   `./config-meta/avatar-shared.sed` file.


Where do datanodes store their data?
------------------------------------

Each datanode has a set of volumes, and the autoconf tool maps volumes
to distinct local directories. These directories are specified in the
datanode configuration file, which is only one line long and has the following
entry:

```
s:{{DataNode-volumes}}:<volume-dir>[,<volume-dir>...]:g
```

When the cluster is started with the `./start-dev-cluster --count 5` command,
each of the 5 datanodes is started with a configuration file produced from the
`./config-meta/avatar-datanode.template` template. Consider the following
template:

```
s:{{DataNode-volumes}}:/tmp/hadoop-datanode-XXX-vol0/,/tmp/hadoop-datanode-XXX-vol1/:g
```

This means that the first datanode has two volumes mapped to the
`/tmp/hadoop-datanode-1-vol0/` and `/tmp/hadoop-datanode-1-vol1/` directories, and the
fourth one has `/tmp/hadoop-datanode-4-vol0/` and `/tmp/hadoop-datanode-4-vol1/`.
That is because the "XXX" sequence in the `avatar-datanode.template` file is
substituted with the sequential datanode number to give each datanode unique
directories on the local machine.


What is the format of the files in the `config-meta` directory?
----------------------------------------------------------------

These files are sed (stream editor) scripts. Although sed syntax is not
concise, the autoconf tool uses only the `substitute` command.

The substitution command basically looks like this:

```
s:cat:dog:g
```

This example substitutes every 'cat' with 'dog'. The 's' stands for the
'substitute' command, and the trailing 'g' is a flag that forces sed to substitute
every occurrence of 'cat'; otherwise only the first occurrence of 'cat' per line
would be replaced.

Any symbol can be used as the command delimiter, so the following are fully
equivalent to the previous example:
```
 s_cat_dog_g
 s%cat%dog%g
 s/cat/dog/g
```

This feature can be used to avoid escaping inside sed scripts, as in the
following example:
```
 s:some-folder:/tmp/foo:g
 s_URL_localhost:7777_g
```


How do I add a new datanode configuration file?
------------------------------------------------

1. Create a file whose name matches the format 'avatar-datanode-*.sed'
(the format of the datanode configuration files is specified by the
`$DATANODE_CONFIG_FILES` variable in the `config.sh` file).

2. Fill in the file with the following content:
```
s:{{DataNode-volumes}}:<volume-dir>[,<volume-dir>...]:g
```


What is an example of a datanode config file with multiple volumes?
--------------------------------------------------------------------

A datanode with two volumes, each residing in its own directory, looks like
this:

```
s:{{DataNode-volumes}}:/tmp/mydatanode-volume-1/,/tmp/mydatanode-volume-2/:g
```

The directories are listed one after another, separated by a comma delimiter.
NOTE: Make sure you do not put any spaces!


What exactly does the autoconf tool do?
---------------------------------------

Whenever the autoconf tool starts an HDFS instance, it performs the following
sequence of actions:

1. Picks template files from the `config-templates` directory
2. Runs the `sed` scripts from the `config-meta` directory over them
3. Puts the results of the sed execution into the `hadoop-0.20/bin` directory (the path
   to the `hadoop-0.20` directory is specified via `$HADOOP_VERSION`)
4. Launches the HDFS instance


PRO stuff: multiple hadoop checkouts
------------------------------------

To switch between multiple hadoop checkouts, just edit the `./config.sh` file,
setting the `$HADOOP_VERSION` variable to the path of the checkout you would like.



Files overview
==============

Client scripts
--------------

This is the list of scripts that are designed to be used by the user. For more
information, refer to the source code of each script or just run it with the
`--help` argument.

* `./build.sh` - builds everything
* `./avatar-format` - formats directories for the avatar namenodes (both active and
  standby)
* `./avatar-zero-start` - starts the active avatar
* `./avatar-one-start` - starts the standby avatar
* `./avatar-datanode-start` - lets you choose a config and start a datanode
  instance configured according to it
* `./start-dev-cluster.sh` - starts all the nodes as daemons for the local cluster
* `./stop-dev-cluster.sh` - stops the instantiated developer cluster (simply by killing
  all the processes with `avatar` in the name)
* `./zookeeper.sh` - starts and stops a local zookeeper instance. Zookeeper is
  absolutely necessary for the cluster to function, and it is started and stopped
  automatically with the cluster


Other directory files
---------------------

* `./config-meta` - the directory that contains all the options for the local
  cluster
  - `./config-meta/avatar-shared.sed` - configuration of shared directories, used by
    both the Active and Standby avatar nodes
  - `./config-meta/avatar-zero.sed` - configuration of local directories for node zero
  - `./config-meta/avatar-one.sed` - configuration of local directories for node one
  - `./config-meta/avatar-datanode*.sed` - configuration files for datanodes, one file per
    node
  - `./config-meta/avatar-datanode.template` - configuration file that is used
    to automatically generate datanode configuration files. Read more about this
    file in the FIXME
* `./config-templates` - stores all the files that the substitutions are run over
* `./launchpad` - stores generated scripts; should not be used
  unless you _really_ know what you are doing
* `./scripts` - here you can find the scripts that do the dirty job
* `./README.md` - a markdown README in the best github traditions
* `./config.sh` - this file exports the `$HADOOP_VERSION` variable as well as a
  couple of other variables.
You might refer to the file often if you have + multiple hadoop checkouts + diff --git a/hdfs-autoconf/avatar-datanode-start b/hdfs-autoconf/avatar-datanode-start new file mode 100644 index 00000000..f2fe90aa --- /dev/null +++ b/hdfs-autoconf/avatar-datanode-start @@ -0,0 +1,133 @@ +#!/bin/bash + +# Usage: bash avatar-datanode-start [--conf configFile] [--daemon] +set -e + +usage="USAGE + bash $(basename $0) [--help] [--format] [--conf configFile] [--daemon] + +DESCRIPTION + Starts locally an avatar datanode with one of the configurations. If + the --conf options is not specified, the script brings up a menu listing + all the found datanode configuration files and letting user to make his + choice. + +OPTIONS + --help - shows this help message + --format - forces datanode to format its directories before it starts. If this + option is not given, then datanode does not format directories unless + they do not exist + --conf - specifies which configuration to use for starting datanode. + --daemon - starts datanode as a daemon process. Logs will go to + the directory specified by \$LOGS_DIRECTORY variable +" + +if [[ ${PWD##*/} != "hdfs-autoconf" ]]; then + echo "The script should be launched from ./hdfs-autoconf directory. Exiting.." + exit 1 +fi + +if (( $# >= 1 )); then + if [[ "$1" == "--help" ]]; then + echo "$usage" + exit 0 + fi +fi + +format="false" +if (( $# >= 1 )); then + if [[ "$1" == "--format" ]]; then + format="true" + shift; + fi +fi + +if (( $# >= 2 )); then + if [[ "$1" == "--conf" ]]; then + shift; + datanodeConfig=$1; + shift; + fi +fi + +daemon=false; +if (( $# >= 1 )); then + if [[ "$1" == "--daemon" ]]; then + daemon=true; + shift; + fi +fi + +if (( $# > 0 )); then + echo "$usage" + exit 1; +fi + +source scripts/common.sh + + +function showUserMenu { + echo -e "Searching for configurations ${cWHITE}$DATANODE_CONFIG_FILES${cRESET}..." + echo -e "Select config for this instance of datanode:" + + counter=0; + for i in $(ls -1 $DATANODE_CONFIG_FILES); do + counter=$(expr $counter + 1); + echo -e " ${cWHITE}[$counter]${cRESET} $i" + done; + + amount=$counter + + if (( $amount == 0 )); then + fail "No configuration files found" + fi + + read -p " + Which one to start (1-$amount): " + if [[ $REPLY == "" ]]; then + echo "Exiting..."; + exit 0; + fi + + if ! [[ $REPLY =~ ^[0-9]+$ ]]; then + fail "Command must be a number (no whitespaces!)" + fi + if !(( $REPLY > 0 && $REPLY <= $amount )); then + fail "Wrong command!" 
+ fi + + datanodeConfig=$(ls -1 $DATANODE_CONFIG_FILES | head -$REPLY | tail -1); +} + +if [[ "$daemon" == "true" ]]; then + # HACK: we're removing *.pid files from logs directory so that hadoop + # daemon will allow us to start multiple instances + rm -f ${LOGS_DIRECTORY}/*.pid +fi + +if [[ $datanodeConfig == "" ]]; then + showUserMenu +fi + + +# creating logs subdirectory from the name of config file +datanodeLogsDirectory=${datanodeConfig##*/} +datanodeLogsDirectory=${datanodeLogsDirectory%.*} +export HADOOP_LOG_DIR=${LOGS_DIRECTORY}/$datanodeLogsDirectory +./scripts/gen-datanode $datanodeConfig +if [[ $format == "true" ]]; then + ./$LAUNCHPAD_DIR/dn-format --hard +else + ./$LAUNCHPAD_DIR/dn-format --soft +fi + +runArgs=""; +if [[ "$daemon" == "true" ]]; then + runArgs="$runArgs --daemon"; +fi + +./$LAUNCHPAD_DIR/run $runArgs + +# wait some time to make sure the running instance actually +# read all the config files +sleep 3 diff --git a/hdfs-autoconf/avatar-format b/hdfs-autoconf/avatar-format new file mode 100644 index 00000000..6fd73a02 --- /dev/null +++ b/hdfs-autoconf/avatar-format @@ -0,0 +1,65 @@ +#!/bin/bash + +set -e + +usage="USAGE + bash $(basename $0) [--help] + +DESCRIPTION + Formats directories that are used for both Active and Standby namenodes. + +OPTIONS + --help - show this help message +" + +if [[ ${PWD##*/} != "hdfs-autoconf" ]]; then + echo "The script should be launched from ./hdfs-autoconf directory. Exiting.." + exit 1 +fi + +if (( $# >= 1 )); then + if [[ $1 == "--help" ]]; then + echo "$usage"; + exit 0; + fi +fi + +if (( $# > 0 )); then + echo "$usage"; + exit 1; +fi + +source scripts/common.sh + +# populate config +./scripts/gen-avatar zero + +# creating directory formatters +LOCAL_DIR_FORMATTER="$TEMPLATES_DIR/format-avatarnode-local-dir.sh.template" +SHARED_DIR_FORMATTER="$TEMPLATES_DIR/format-avatarnode-shared-dir.sh.template" + +AVATAR_LOCAL_ZERO="$LAUNCHPAD_DIR/avatar-zero-local-dir.sh" +cp $LOCAL_DIR_FORMATTER $AVATAR_LOCAL_ZERO +patch $AVATAR_LOCAL_ZERO $METACONF_DIR/avatar-zero.sed + +AVATAR_LOCAL_ONE="$LAUNCHPAD_DIR/avatar-one-local-dir.sh" +cp $LOCAL_DIR_FORMATTER $AVATAR_LOCAL_ONE +patch $AVATAR_LOCAL_ONE $METACONF_DIR/avatar-one.sed + +AVATAR_SHARED="$LAUNCHPAD_DIR/avatar-shared-dir.sh" +cp $SHARED_DIR_FORMATTER $AVATAR_SHARED +patch $AVATAR_SHARED $METACONF_DIR/avatar-shared.sed + +echo "Creating avatar directories" +bash $AVATAR_LOCAL_ZERO +bash $AVATAR_LOCAL_ONE +bash $AVATAR_SHARED + + +echo "Formatting avatar..." +source config.sh +cd ${HADOOP_VERSION}/bin +./hadoop avatarzk -updateZK -zero -force +./hadoop avatarnode -format +echo -e "${cGREEN}Done.${cRESET}" + diff --git a/hdfs-autoconf/avatar-one-start b/hdfs-autoconf/avatar-one-start new file mode 100644 index 00000000..d6353f4f --- /dev/null +++ b/hdfs-autoconf/avatar-one-start @@ -0,0 +1,61 @@ +#!/bin/bash + +# Usage: bash avatar-one-start [--daemon] + +set -e +usage="USAGE + bash $(basename $0) [--help] [--daemon] + +DESCRIPTION + Starts locally an avatar namenode which is stand-by default. + +OPTIONS + --help - shows this help message + --daemon - starts avatar as a daemon process. Logs will go to + the directory specified by \$LOGS_DIRECTORY variable +" + +if [[ ${PWD##*/} != "hdfs-autoconf" ]]; then + echo "The script should be launched from ./hdfs-autoconf directory. Exiting.." 
+ exit 1 +fi + +if (( $# >= 1)); then + if [[ $1 == "--help" ]]; then + echo "$usage"; + exit 0; + fi +fi + +daemon="false"; +if (( $# >= 1 )); then + if [[ $1 == "--daemon" ]]; then + daemon="true" + shift; + fi; +fi + +if (( $# > 0 )); then + echo "$usage"; + exit 1; +fi + +source config.sh + +runArgs=""; +if [[ "$daemon" == "true" ]]; then + # HACK: after every launch we should remove `pid` file so that + # `hadoop-daemon.sh` that is actually called in the depth) + # won't complain about instances that are already started + rm -f ${LOGS_DIRECTORY}/*.pid + runArgs="--daemon"; +fi + +export HADOOP_LOG_DIR=${LOGS_DIRECTORY}/avatar-one-logs + +./scripts/gen-avatar one +$LAUNCHPAD_DIR/run $runArgs + +# wait some time to make sure the running instance actually +# read all the config files +sleep 3 diff --git a/hdfs-autoconf/avatar-zero-start b/hdfs-autoconf/avatar-zero-start new file mode 100644 index 00000000..9eaf1642 --- /dev/null +++ b/hdfs-autoconf/avatar-zero-start @@ -0,0 +1,67 @@ +#!/bin/bash + +# Usage: bash avatar-one-start [--daemon] +set -e + +usage="USAGE + bash $(basename $0) [--help] [--daemon] + +DESCRIPTION + Starts locally an avatar namenode which is active by default. + +OPTIONS + --help - shows this help message + --daemon - starts avatar as a daemon process. Logs will go to + the directory specified by \$LOGS_DIRECTORY variable +" + +if [[ ${PWD##*/} != "hdfs-autoconf" ]]; then + echo "The script should be launched from ./hdfs-autoconf directory. Exiting.." + exit 1 +fi + +if (( $# >= 1)); then + if [[ $1 == "--help" ]]; then + echo "$usage"; + exit 0; + fi +fi + +daemon="false"; +if (( $# >= 1 )); then + if [[ $1 == "--daemon" ]]; then + daemon="true"; + shift; + fi +fi + +if (( $# > 0 )); then + echo "$usage" + exit 1 +fi + +source config.sh + +runArgs="" +if [[ "$daemon" == "true" ]]; then + # HACK: after every launch we should remove `pid` file so that + # `hadoop-daemon.sh` that is actually called in the depth) + # won't complain about instances that are already started + rm -f ${LOGS_DIRECTORY}/*.pid + + runArgs="$runArgs --daemon" +fi + +./scripts/gen-avatar zero + +cd ${HADOOP_VERSION}/bin +./hadoop avatarzk -updateZK -zero -force +cd - + +export HADOOP_LOG_DIR=${LOGS_DIRECTORY}/avatar-zero-logs + +$LAUNCHPAD_DIR/run $runArgs + +# wait some time to make sure the running instance actually +# read all the config files +sleep 3 diff --git a/hdfs-autoconf/build.sh b/hdfs-autoconf/build.sh new file mode 100644 index 00000000..79a95444 --- /dev/null +++ b/hdfs-autoconf/build.sh @@ -0,0 +1,48 @@ +#!/bin/bash +set -e + +usage="USAGE + $(basename $0) [--help] [--fast] + +DESCRIPTION + Builds HDFS from sources. + +OPTIONS + --help - shows this help + --fast - EXPERIMENTAL option, does some build 3 times faster than default + build. 
+" + +if (( $# >= 1 )); then + if [[ $1 == "--help" ]]; then + echo "$usage"; + exit 0; + fi +fi + +compile="full" +if (( $# >= 1 )); then + if [[ $1 == "--fast" ]]; then + compile="fast" + shift + fi +fi + +if (( $# > 0 )); then + echo "$usage" + exit 1 +fi + +source config.sh + +cd ${HADOOP_VERSION}; + +if [[ $compile == "full" ]]; then + ant clean compile +elif [[ $compile == "fast" ]]; then + ant clean compile-core + cd src/contrib/highavailability + ant clean compile +fi + + diff --git a/hdfs-autoconf/config-meta/avatar-datanode-1.sed b/hdfs-autoconf/config-meta/avatar-datanode-1.sed new file mode 100644 index 00000000..72c70745 --- /dev/null +++ b/hdfs-autoconf/config-meta/avatar-datanode-1.sed @@ -0,0 +1,2 @@ +# datanode volumes: list without spaces, comma-delimeted +s:{{DataNode-volumes}}:/tmp/hadoop-datanode-0-vol0/,/tmp/hadoop-datanode-0-vol1/:g diff --git a/hdfs-autoconf/config-meta/avatar-datanode-2.sed b/hdfs-autoconf/config-meta/avatar-datanode-2.sed new file mode 100644 index 00000000..b0536216 --- /dev/null +++ b/hdfs-autoconf/config-meta/avatar-datanode-2.sed @@ -0,0 +1,2 @@ +# datanode config +s:{{DataNode-volumes}}:/tmp/hadoop-datanode-1-vol0/,/tmp/hadoop-datanode-1-vol1/:g diff --git a/hdfs-autoconf/config-meta/avatar-datanode.template b/hdfs-autoconf/config-meta/avatar-datanode.template new file mode 100644 index 00000000..79c1f451 --- /dev/null +++ b/hdfs-autoconf/config-meta/avatar-datanode.template @@ -0,0 +1,6 @@ +# This file is used as a template for generating datanode config files +# automatically. Instead of "XXX" it will subsitute sequential number +# of a datanode instance. This way you can specify the format and amount +# of volumes for the automatically generated datanode configuration files +# +s:{{DataNode-volumes}}:/tmp/hadoop-datanode-XXX-vol0/,/tmp/hadoop-datanode-XXX-vol1/:g diff --git a/hdfs-autoconf/config-meta/avatar-one.sed b/hdfs-autoconf/config-meta/avatar-one.sed new file mode 100644 index 00000000..b8898837 --- /dev/null +++ b/hdfs-autoconf/config-meta/avatar-one.sed @@ -0,0 +1,5 @@ +# local avatar 1 config +s:{{NameNode-local}}:/tmp/hadoop-avatar-1-local/:g +s:{{NameNode-local-fsimage}}:/tmp/hadoop-avatar-1-local/fsimage/:g +s:{{NameNode-local-fsedits}}:/tmp/hadoop-avatar-1-local/fsedits/:g + diff --git a/hdfs-autoconf/config-meta/avatar-shared.sed b/hdfs-autoconf/config-meta/avatar-shared.sed new file mode 100644 index 00000000..3bacef0f --- /dev/null +++ b/hdfs-autoconf/config-meta/avatar-shared.sed @@ -0,0 +1,10 @@ +# setting up shared avatar directories +# all these paths will be created relatively to /tmp directory +s:{{NameNode-shared}}:/tmp/hadoop-avatar-shared/:g +s:{{NameNode-shared-fsimage-0}}:/tmp/hadoop-avatar-shared/fsimage-zero/:g +s:{{NameNode-shared-fsedits-0}}:/tmp/hadoop-avatar-shared/fsedits-zero/:g +s:{{NameNode-shared-fsimage-1}}:/tmp/hadoop-avatar-shared/fsimage-one/:g +s:{{NameNode-shared-fsedits-1}}:/tmp/hadoop-avatar-shared/fsedits-one/:g + +# ground may be a separator as well +s_{{zookeeper-quorum}}_localhost_g diff --git a/hdfs-autoconf/config-meta/avatar-zero.sed b/hdfs-autoconf/config-meta/avatar-zero.sed new file mode 100644 index 00000000..e00c462c --- /dev/null +++ b/hdfs-autoconf/config-meta/avatar-zero.sed @@ -0,0 +1,5 @@ +# local avatar 0 config +s:{{NameNode-local}}:/tmp/hadoop-avatar-0-local/:g +s:{{NameNode-local-fsimage}}:/tmp/hadoop-avatar-0-local/fsimage/:g +s:{{NameNode-local-fsedits}}:/tmp/hadoop-avatar-0-local/fsedits/:g + diff --git a/hdfs-autoconf/config-templates/avatar-site.xml.template 
b/hdfs-autoconf/config-templates/avatar-site.xml.template new file mode 100644 index 00000000..531049b7 --- /dev/null +++ b/hdfs-autoconf/config-templates/avatar-site.xml.template @@ -0,0 +1,115 @@ + + + + + + + dfs.http.address0 + localhost:50070 + + The address and the base port where the dfs namenode web ui will listen on. + If the port is 0 then the server will start on a free port. + + + + + dfs.http.address1 + localhost:50080 + + The address and the base port where the dfs namenode web ui will listen on. + If the port is 0 then the server will start on a free port. + + + + + dfs.name.dir + {{NameNode-local-fsimage}} + Determines where on the local filesystem the DFS name node + should store the name table(fsimage). If this is a comma-delimited list + of directories then the name table is replicated in all of the + directories, for redundancy. + + + + dfs.name.edits.dir + {{NameNode-local-fsedits}} + Determines where on the local filesystem the DFS name node + should store the transaction (edits) file. If this is a comma-delimited list of directories then the transaction file is replicated in all of the + directories, for redundancy. Default value is same as dfs.name.dir + + + + + dfs.name.dir.shared0 + {{NameNode-shared-fsimage-0}} + Determines where on the filer the AvatarNode + should store the name table(fsimage). + + + + + dfs.name.dir.shared1 + {{NameNode-shared-fsimage-1}} + Determines where on the filer the other instance of the AvatarNode + should store the name table(fsimage). + + + + + dfs.name.edits.dir.shared0 + {{NameNode-shared-fsedits-0}} + Determines where on the filer the AvatarNode + should store the transaction (edits) file. If this is a comma-delimited list of directories then the transaction file is replicated in all of the + directories, for redundancy. Default value is same as dfs.name.dir + + + + + dfs.name.edits.dir.shared1 + {{NameNode-shared-fsedits-1}} + Determines where on the filer the other instance of the AvatarNode + should store the transaction (edits) file. + + + + + fs.checkpoint.enabled + true + + + + standby.image.copies.tokeep + 5 + The number of backup copies of the image + and fsedits to keep around. + + + + + standby.image.days.tokeep + 2 + How old should the backup image + be to get deleted. + + + + + dfs.namenode.dn-address0 + localhost:9005 + + The address and port to run the RPC server which will be processing + requests from datanodes in the cluster. + + + + + dfs.namenode.dn-address1 + localhost:9006 + + The address and port to run the RPC server which will be processing + requests from datanodes in the cluster. + + + + + diff --git a/hdfs-autoconf/config-templates/core-site.xml.template b/hdfs-autoconf/config-templates/core-site.xml.template new file mode 100644 index 00000000..9d74a635 --- /dev/null +++ b/hdfs-autoconf/config-templates/core-site.xml.template @@ -0,0 +1,95 @@ + + + + + + + + + fs.default.name + hdfs://localhost:9000 + The name of the default file system. A URI whose + scheme and authority determine the FileSystem implementation. The + uri's scheme determines the config property (fs.SCHEME.impl) naming + the FileSystem implementation class. The uri's authority is used to + determine the host, port, etc. for a filesystem. + + + + fs.default.name0 + hdfs://localhost:9000 + The name of the default file system. A URI whose + scheme and authority determine the FileSystem implementation. The + uri's scheme determines the config property (fs.SCHEME.impl) naming + the FileSystem implementation class. 
The uri's authority is used to + determine the host, port, etc. for a filesystem. + + + + fs.default.name1 + hdfs://localhost:9010 + The name of the default file system. A URI whose + scheme and authority determine the FileSystem implementation. The + uri's scheme determines the config property (fs.SCHEME.impl) naming + the FileSystem implementation class. The uri's authority is used to + determine the host, port, etc. for a filesystem. + + + + fs.checkpoint.period + 600 + + The number of seconds between two periodic checkpoints + + + + + fs.checkpoint.size + 10000000 + + Defines the size of the edits log file that forces an urgent checkpoint even + if the maximum checkpoint delay is not reached. + + + + + fs.ha.zookeeper.quorum + {{zookeeper-quorum}} + The list of ZK servers DAFS will be connecting to + + + + ipc.client.connect.max.retries + 10 + + + + ipc.client.connect.timeout + 5 + + + + + fs.hdfs.impl + org.apache.hadoop.hdfs.DistributedAvatarFileSystem + + + + fs.ha.zookeeper.cache + true + + + + fs.ha.zookeeper.timeout + 30000 + Indicates the session timeout for a zookeeper client connection + + + + fs.ha.retrywrites + true + retry writes or not + + + + diff --git a/hdfs-autoconf/config-templates/format-avatardatanode.sh.template b/hdfs-autoconf/config-templates/format-avatardatanode.sh.template new file mode 100644 index 00000000..11f90ab9 --- /dev/null +++ b/hdfs-autoconf/config-templates/format-avatardatanode.sh.template @@ -0,0 +1,53 @@ +#!/bin/bash +set -e + +usage="USAGE + bash $(basename $0) [--help] [--soft] + +DESCRIPTION + Formats all the directories needed for every datanodes' volume. + In case the directory for volume already exists, it recreates it + thus deleting all the underlying data (this is also called HARD mode), + unless --soft option is given + +OPTIONS + --help - shows this help message + --soft - does not recreate directory if it already exists. This + option is used to preserve the data of the datanode +" + +if (( $# >= 1 )); then + if [[ "$1" == "--help" ]]; then + echo "$usage" + exit 0 + fi +fi + +soft="false"; +if (( $# >= 1 )); then + if [[ "$1" == "--soft" ]]; then + soft="true"; + shift; + fi +fi + +volumeDirs=$(echo {{DataNode-volumes}} | tr ',' '\n'); +echo "Volume dirs: $volumeDirs" + +if [[ "$soft" == "true" ]]; then + echo "Datanode is formatted in a SOFT mode" + for i in $volumeDirs; do + if ! [[ -d $i ]]; then + mkdir $i; + fi + done; +elif [[ "$soft" == "false" ]]; then + echo "Datanode is formatted in a HARD mode" + for i in $volumeDirs; do + rm -rf $i; + mkdir $i; + done; +else + echo "This is a bug. 
Local variable \$soft has a bad value of $soft" + exit 1 +fi diff --git a/hdfs-autoconf/config-templates/format-avatarnode-local-dir.sh.template b/hdfs-autoconf/config-templates/format-avatarnode-local-dir.sh.template new file mode 100644 index 00000000..186c566f --- /dev/null +++ b/hdfs-autoconf/config-templates/format-avatarnode-local-dir.sh.template @@ -0,0 +1,6 @@ +#!/bin/bash + +rm -rf {{NameNode-local}}; +mkdir -p {{NameNode-local-fsimage}}; +mkdir -p {{NameNode-local-fsedits}}; + diff --git a/hdfs-autoconf/config-templates/format-avatarnode-shared-dir.sh.template b/hdfs-autoconf/config-templates/format-avatarnode-shared-dir.sh.template new file mode 100644 index 00000000..28263713 --- /dev/null +++ b/hdfs-autoconf/config-templates/format-avatarnode-shared-dir.sh.template @@ -0,0 +1,7 @@ +#!/bin/bash + +rm -rf {{NameNode-shared}}; +mkdir -p {{NameNode-shared-fsimage-0}}; +mkdir -p {{NameNode-shared-fsedits-0}}; +mkdir -p {{NameNode-shared-fsimage-1}}; +mkdir -p {{NameNode-shared-fsedits-1}}; diff --git a/hdfs-autoconf/config-templates/hadoop-env-avatar-one.sh b/hdfs-autoconf/config-templates/hadoop-env-avatar-one.sh new file mode 100644 index 00000000..a051a4d7 --- /dev/null +++ b/hdfs-autoconf/config-templates/hadoop-env-avatar-one.sh @@ -0,0 +1,71 @@ +if [[ "$IS_HADOOP_ENV_ALREADY_SOURCED" != "true" ]]; then + export IS_HADOOP_ENV_ALREADY_SOURCED="true" + # Set Hadoop-specific environment variables here. + + # The only required environment variable is JAVA_HOME. All others are + # optional. When running a distributed configuration it is best to + # set JAVA_HOME in this file, so that it is correctly defined on + # remote nodes. + + # The java implementation to use. Required. + # export JAVA_HOME=/usr/lib/j2sdk1.5-sun + + # Extra Java CLASSPATH elements. Optional. + export HADOOP_CLASSPATH=${HADOOP_TRUNK_MAIN}/VENDOR/hadoop-0.20/lib/ + + # The maximum amount of heap to use, in MB. Default is 1000. + # export HADOOP_HEAPSIZE=2000 + + # Extra Java runtime options. Empty by default. + # export HADOOP_OPTS=-server + + # Command specific options appended to HADOOP_OPTS when specified + export HADOOP_NAMENODE_OPTS="-Dcom.sun.management.jmxremote $HADOOP_NAMENODE_OPTS" + export HADOOP_SECONDARYNAMENODE_OPTS="-Dcom.sun.management.jmxremote $HADOOP_SECONDARYNAMENODE_OPTS" + export HADOOP_DATANODE_OPTS="-Dcom.sun.management.jmxremote $HADOOP_DATANODE_OPTS" + export HADOOP_BALANCER_OPTS="-Dcom.sun.management.jmxremote $HADOOP_BALANCER_OPTS" + export HADOOP_JOBTRACKER_OPTS="-Dcom.sun.management.jmxremote $HADOOP_JOBTRACKER_OPTS" + export HADOOP_RAIDNODE_OPTS="-Dcom.sun.management.jmxremote $HADOOP_RAIDNODE_OPTS" + export HADOOP_NAMENODE_OPTS="-Dcom.sun.management.jmxremote.port=8998 -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false" + + # The only user who can start hadoop daemons. + # If this is not set, any user can start hadoop daemons. + # export HADOOP_USERNAME="hadoop" + + # Java Runtime garbage collection options to pass to all Hadoop + # servers (Namenode, Jobtracker, Datanode, Tasktracker). This must end + # with a colon ; to which the dynamically generated gc log filename will + # be appended to. The below defaults work for the Sun JVM, for example + # in IBM GC, use '-Xverbosegclog:'. + #export HADOOP_GC_LOG_OPTS="-XX:+PrintGCDateStamps -XX:+PrintGCDetails -Xloggc:" + + # export HADOOP_TASKTRACKER_OPTS= + # The following applies to multiple commands (fs, dfs, fsck, distcp etc) + # export HADOOP_CLIENT_OPTS + + # Extra ssh options. 
Empty by default. + # export HADOOP_SSH_OPTS="-o ConnectTimeout=1 -o SendEnv=HADOOP_CONF_DIR" + + # Where log files are stored. $HADOOP_HOME/logs by default. + # export HADOOP_LOG_DIR=${HADOOP_HOME}/logs + + # File naming remote slave hosts. $HADOOP_HOME/conf/slaves by default. + # export HADOOP_SLAVES=${HADOOP_HOME}/conf/slaves + + # host:path where hadoop code should be rsync'd from. Unset by default. + # export HADOOP_MASTER=master:/home/$USER/src/hadoop + + # Seconds to sleep between slave commands. Unset by default. This + # can be useful in large clusters, where, e.g., slave rsyncs can + # otherwise arrive faster than the master can service them. + # export HADOOP_SLAVE_SLEEP=0.1 + + # The directory where pid files are stored. /tmp by default. + # export HADOOP_PID_DIR=/var/hadoop/pids + + # A string representing this instance of hadoop. $USER by default. + # export HADOOP_IDENT_STRING=$USER + + # The scheduling priority for daemon processes. See 'man nice'. + # export HADOOP_NICENESS=10 +fi diff --git a/hdfs-autoconf/config-templates/hadoop-env-avatar-zero.sh b/hdfs-autoconf/config-templates/hadoop-env-avatar-zero.sh new file mode 100644 index 00000000..e1a2d0af --- /dev/null +++ b/hdfs-autoconf/config-templates/hadoop-env-avatar-zero.sh @@ -0,0 +1,71 @@ +if [[ "$IS_HADOOP_ENV_ALREADY_SOURCED" != "true" ]]; then + export IS_HADOOP_ENV_ALREADY_SOURCED="true" + # Set Hadoop-specific environment variables here. + + # The only required environment variable is JAVA_HOME. All others are + # optional. When running a distributed configuration it is best to + # set JAVA_HOME in this file, so that it is correctly defined on + # remote nodes. + + # The java implementation to use. Required. + # export JAVA_HOME=/usr/lib/j2sdk1.5-sun + + # Extra Java CLASSPATH elements. Optional. + #export HADOOP_CLASSPATH=${HADOOP_TRUNK_MAIN}/VENDOR/hadoop-0.20/lib/ + + # The maximum amount of heap to use, in MB. Default is 1000. + export HADOOP_HEAPSIZE=2000 + + # Extra Java runtime options. Empty by default. + # export HADOOP_OPTS=-server + + # Command specific options appended to HADOOP_OPTS when specified + export HADOOP_SECONDARYNAMENODE_OPTS="-Dcom.sun.management.jmxremote $HADOOP_SECONDARYNAMENODE_OPTS" + export HADOOP_DATANODE_OPTS="-Dcom.sun.management.jmxremote $HADOOP_DATANODE_OPTS" + export HADOOP_BALANCER_OPTS="-Dcom.sun.management.jmxremote $HADOOP_BALANCER_OPTS" + export HADOOP_JOBTRACKER_OPTS="-Dcom.sun.management.jmxremote $HADOOP_JOBTRACKER_OPTS" + export HADOOP_RAIDNODE_OPTS="-Dcom.sun.management.jmxremote $HADOOP_RAIDNODE_OPTS" + export HADOOP_NAMENODE_OPTS="-Dcom.sun.management.jmxremote -Xmx3g -Xms3g $HADOOP_NAMENODE_OPTS -Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=9070" + #export HADOOP_NAMENODE_OPTS="-Dcom.sun.management.jmxremote.port=8998 -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false" + + # The only user who can start hadoop daemons. + # If this is not set, any user can start hadoop daemons. + #export HADOOP_USERNAME="hadoop" + + # Java Runtime garbage collection options to pass to all Hadoop + # servers (Namenode, Jobtracker, Datanode, Tasktracker). This must end + # with a colon ; to which the dynamically generated gc log filename will + # be appended to. The below defaults work for the Sun JVM, for example + # in IBM GC, use '-Xverbosegclog:'. 
+ #export HADOOP_GC_LOG_OPTS="-XX:+PrintGCDateStamps -XX:+PrintGCDetails -Xloggc:" + + # export HADOOP_TASKTRACKER_OPTS= + # The following applies to multiple commands (fs, dfs, fsck, distcp etc) + # export HADOOP_CLIENT_OPTS + + # Extra ssh options. Empty by default. + # export HADOOP_SSH_OPTS="-o ConnectTimeout=1 -o SendEnv=HADOOP_CONF_DIR" + + # Where log files are stored. $HADOOP_HOME/logs by default. + # export HADOOP_LOG_DIR=${HADOOP_HOME}/logs + + # File naming remote slave hosts. $HADOOP_HOME/conf/slaves by default. + # export HADOOP_SLAVES=${HADOOP_HOME}/conf/slaves + + # host:path where hadoop code should be rsync'd from. Unset by default. + # export HADOOP_MASTER=master:/home/$USER/src/hadoop + + # Seconds to sleep between slave commands. Unset by default. This + # can be useful in large clusters, where, e.g., slave rsyncs can + # otherwise arrive faster than the master can service them. + # export HADOOP_SLAVE_SLEEP=0.1 + + # The directory where pid files are stored. /tmp by default. + # export HADOOP_PID_DIR=/var/hadoop/pids + + # A string representing this instance of hadoop. $USER by default. + # export HADOOP_IDENT_STRING=$USER + + # The scheduling priority for daemon processes. See 'man nice'. + # export HADOOP_NICENESS=10 +fi diff --git a/hdfs-autoconf/config-templates/hadoop-env-datanode.sh b/hdfs-autoconf/config-templates/hadoop-env-datanode.sh new file mode 100644 index 00000000..81870a56 --- /dev/null +++ b/hdfs-autoconf/config-templates/hadoop-env-datanode.sh @@ -0,0 +1,71 @@ +if [[ "$IS_HADOOP_ENV_ALREADY_SOURCED" != "true" ]]; then + export IS_HADOOP_ENV_ALREADY_SOURCED="true" + # Set Hadoop-specific environment variables here. + + # The only required environment variable is JAVA_HOME. All others are + # optional. When running a distributed configuration it is best to + # set JAVA_HOME in this file, so that it is correctly defined on + # remote nodes. + + # The java implementation to use. Required. + # export JAVA_HOME=/usr/lib/j2sdk1.5-sun + + # Extra Java CLASSPATH elements. Optional. + # export HADOOP_CLASSPATH= + + # The maximum amount of heap to use, in MB. Default is 1000. + # export HADOOP_HEAPSIZE=2000 + + # Extra Java runtime options. Empty by default. + # export HADOOP_OPTS=-server + + # Command specific options appended to HADOOP_OPTS when specified + export HADOOP_NAMENODE_OPTS="-Dcom.sun.management.jmxremote $HADOOP_NAMENODE_OPTS" + export HADOOP_SECONDARYNAMENODE_OPTS="-Dcom.sun.management.jmxremote $HADOOP_SECONDARYNAMENODE_OPTS" + export HADOOP_DATANODE_OPTS="-Dcom.sun.management.jmxremote $HADOOP_DATANODE_OPTS" + export HADOOP_BALANCER_OPTS="-Dcom.sun.management.jmxremote $HADOOP_BALANCER_OPTS" + export HADOOP_JOBTRACKER_OPTS="-Dcom.sun.management.jmxremote $HADOOP_JOBTRACKER_OPTS" + export HADOOP_RAIDNODE_OPTS="-Dcom.sun.management.jmxremote $HADOOP_RAIDNODE_OPTS" + #export HADOOP_NAMENODE_OPTS="-Dcom.sun.management.jmxremote.port=8998 -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false" + + # The only user who can start hadoop daemons. + # If this is not set, any user can start hadoop daemons. + # export HADOOP_USERNAME="hadoop" + + # Java Runtime garbage collection options to pass to all Hadoop + # servers (Namenode, Jobtracker, Datanode, Tasktracker). This must end + # with a colon ; to which the dynamically generated gc log filename will + # be appended to. The below defaults work for the Sun JVM, for example + # in IBM GC, use '-Xverbosegclog:'. 
+ #export HADOOP_GC_LOG_OPTS="-XX:+PrintGCDateStamps -XX:+PrintGCDetails -Xloggc:" + + # export HADOOP_TASKTRACKER_OPTS= + # The following applies to multiple commands (fs, dfs, fsck, distcp etc) + # export HADOOP_CLIENT_OPTS + + # Extra ssh options. Empty by default. + # export HADOOP_SSH_OPTS="-o ConnectTimeout=1 -o SendEnv=HADOOP_CONF_DIR" + + # Where log files are stored. $HADOOP_HOME/logs by default. + # export HADOOP_LOG_DIR=${HADOOP_HOME}/logs + + # File naming remote slave hosts. $HADOOP_HOME/conf/slaves by default. + # export HADOOP_SLAVES=${HADOOP_HOME}/conf/slaves + + # host:path where hadoop code should be rsync'd from. Unset by default. + # export HADOOP_MASTER=master:/home/$USER/src/hadoop + + # Seconds to sleep between slave commands. Unset by default. This + # can be useful in large clusters, where, e.g., slave rsyncs can + # otherwise arrive faster than the master can service them. + # export HADOOP_SLAVE_SLEEP=0.1 + + # The directory where pid files are stored. /tmp by default. + # export HADOOP_PID_DIR=/var/hadoop/pids + + # A string representing this instance of hadoop. $USER by default. + # export HADOOP_IDENT_STRING=$USER + + # The scheduling priority for daemon processes. See 'man nice'. + # export HADOOP_NICENESS=10 +fi diff --git a/hdfs-autoconf/config-templates/hdfs-site.xml.template b/hdfs-autoconf/config-templates/hdfs-site.xml.template new file mode 100644 index 00000000..f7d3d73e --- /dev/null +++ b/hdfs-autoconf/config-templates/hdfs-site.xml.template @@ -0,0 +1,118 @@ + + + + + + + + +dfs.replication +1 + + + + dfs.http.address + 127.0.0.1:50070 + + The address and the base port where the dfs namenode web ui will listen on. + If the port is 0 then the server will start on a free port. + + + + + dfs.secondary.http.address + 0.0.0.0:0 + + The secondary namenode http server address and port. + If the port is 0 then the server will start on a free port. + + + + + dfs.blockreport.intervalMsec + 300000 + Determines block reporting interval in milliseconds. + + + + dfs.fullblockreport.magnifier + 2 + + Determines the full block reporting interval, which is magnifier + times the delete block report interval. + + + + + dfs.datanode.address + 0.0.0.0:0 + + The address where the datanode server will listen to. + If the port is 0 then the server will start on a free port. + + + + + dfs.datanode.http.address + 0.0.0.0:0 + + The datanode http server address and port. + If the port is 0 then the server will start on a free port. + + + + + dfs.datanode.ipc.address + 0.0.0.0:0 + + The datanode ipc server address and port. + If the port is 0 then the server will start on a free port. + + + + + dfs.datanode.handler.count + 3 + The number of server threads for the datanode. + + + + dfs.permissions + false + + + + dfs.data.dir + {{DataNode-volumes}} + Determines where on the local filesystem an DFS data node + should store its blocks. If this is a comma-delimited + list of directories, then data will be stored in all named + directories, typically on different devices. + Directories that do not exist are ignored. + + + + + dfs.block.invalidate.limit + 100 + + + + dfs.safemode.extension + 10000 + + Determines extension of safe mode in milliseconds + after the threshold level is reached. + + + + + dfs.namenode.dn-address + localhost:9015 + + The address and port to run the RPC server which will be processing + requests from datanodes in the cluster. 
+
+
+
+
diff --git a/hdfs-autoconf/config-templates/run-datanode.sh b/hdfs-autoconf/config-templates/run-datanode.sh
new file mode 100644
index 00000000..95e116ca
--- /dev/null
+++ b/hdfs-autoconf/config-templates/run-datanode.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+#Usage: bash $LAUNCHPAD_DIR/run.sh [--daemon]
+
+source config.sh
+
+cd ${HADOOP_VERSION}/bin
+if [[ $# > 0 && $1 == "--daemon" ]]; then
+  export HADOOP_PID_DIR="$LOGS_DIRECTORY" && ./hadoop-daemon.sh start avatardatanode
+else
+  ./hadoop avatardatanode
+fi
+
diff --git a/hdfs-autoconf/config-templates/run-one.template b/hdfs-autoconf/config-templates/run-one.template
new file mode 100644
index 00000000..a24524c5
--- /dev/null
+++ b/hdfs-autoconf/config-templates/run-one.template
@@ -0,0 +1,12 @@
+#!/bin/bash
+#Usage: bash $LAUNCHPAD_DIR/run.sh [--daemon]
+
+source config.sh
+
+cd $HADOOP_VERSION/bin
+if [[ $# > 0 && $1 == "--daemon" ]]; then
+  echo "daemon mode"
+  export HADOOP_PID_DIR="$LOGS_DIRECTORY" && ./hadoop-daemon.sh start avatarnode -one -standby;
+else
+  ./hadoop avatarnode -one -standby;
+fi
diff --git a/hdfs-autoconf/config-templates/run-zero.template b/hdfs-autoconf/config-templates/run-zero.template
new file mode 100644
index 00000000..21491edc
--- /dev/null
+++ b/hdfs-autoconf/config-templates/run-zero.template
@@ -0,0 +1,12 @@
+#!/bin/bash
+#Usage: bash $LAUNCHPAD_DIR/run.sh [--daemon]
+
+source config.sh
+
+cd $HADOOP_VERSION/bin
+
+if [[ $# > 0 && $1 == "--daemon" ]]; then
+  export HADOOP_PID_DIR="$LOGS_DIRECTORY" && ./hadoop-daemon.sh start avatarnode -zero;
+else
+  ./hadoop avatarnode -zero;
+fi
diff --git a/hdfs-autoconf/config-templates/zoo.cfg b/hdfs-autoconf/config-templates/zoo.cfg
new file mode 100644
index 00000000..aafb3247
--- /dev/null
+++ b/hdfs-autoconf/config-templates/zoo.cfg
@@ -0,0 +1,25 @@
+# The number of milliseconds of each tick
+tickTime=2000
+# The number of ticks that the initial
+# synchronization phase can take
+initLimit=10
+# The number of ticks that can pass between
+# sending a request and getting an acknowledgement
+syncLimit=5
+# the directory where the snapshot is stored.
+# do not use /tmp for storage, /tmp here is just
+# example sakes.
+dataDir=/tmp/zookeeper
+# the port at which the clients will connect
+clientPort=2181
+#
+# Be sure to read the maintenance section of the
+# administrator guide before turning on autopurge.
+#
+# http://zookeeper.apache.org/doc/current/zookeeperAdmin.html#sc_maintenance
+#
+# The number of snapshots to retain in dataDir
+#autopurge.snapRetainCount=3
+# Purge task interval in hours
+# Set to "0" to disable auto purge feature
+#autopurge.purgeInterval=1
diff --git a/hdfs-autoconf/config.sh b/hdfs-autoconf/config.sh
new file mode 100644
index 00000000..133e29e0
--- /dev/null
+++ b/hdfs-autoconf/config.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+
+# This script is sourced by every other script.
+
+# let's stop execution when some simple command fails
+set -e
+
+# ==================================================
+# CONFIGURE BEFORE USE
+# ==================================================
+
+# This argument specifies the hadoop checkout. The binaries will be run
+# from the ${HADOOP_VERSION}/bin directory, and configuration files are
+# assumed to be located in the ${HADOOP_VERSION}/conf directory.
+# HADOOP_VERSION=
+if [[ -z $HADOOP_VERSION ]]; then
+  HADOOP_VERSION=$(readlink -f ../)
+fi
+
+# This is the directory that will hold all the log files for different
+# instances.
+# DISCLAIMER: Full path must be specified here!
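+# Note: HADOOP_VERSION above and LOGS_DIRECTORY below are only given their
+# defaults when unset, so both can also be overridden from the environment
+# instead of editing this file. An illustrative invocation (assuming the
+# scripts are run from the hdfs-autoconf directory; the paths are examples
+# only):
+#
+#   export HADOOP_VERSION=/home/$USER/hadoop-20
+#   export LOGS_DIRECTORY=/home/$USER/hdfs-autoconf-logs
+#   bash start-dev-cluster.sh
+#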
+if [[ -z $LOGS_DIRECTORY ]]; then
+  LOGS_DIRECTORY=$HADOOP_VERSION/logs
+fi
+
+# ===================================================
+# ===================================================
+
+
+METACONF_DIR="./config-meta"
+TEMPLATES_DIR="./config-templates"
+LAUNCHPAD_DIR="./launchpad"
+# This is the pattern used to find the datanode configuration files
+DATANODE_CONFIG_FILES="$METACONF_DIR/avatar-datanode*.sed"
+# This is the file that will exist as long as the cluster is running.
+# Used by the start-dev-cluster and stop-dev-cluster scripts
+CLUSTER_IS_RUNNING=$LOGS_DIRECTORY/cluster-is-running-now
+
+
+if ! [[ -d $METACONF_DIR ]]; then
+  echo "Cannot find $METACONF_DIR directory; check config.sh to correct the dir"
+  exit 1
+fi
+
+if ! [[ -d $TEMPLATES_DIR ]]; then
+  echo "Cannot find $TEMPLATES_DIR directory; check config.sh to correct the dir"
+  exit 1
+fi
+
+if ! [[ -d $LAUNCHPAD_DIR ]]; then
+  mkdir -p $LAUNCHPAD_DIR
+fi
+
+if [[ -z $ZOOKEEPER_PATH ]]; then
+  ZOOKEEPER_PATH="`pwd`/../../../VENDOR.zookeeper/fb-trunk/"
+fi
diff --git a/hdfs-autoconf/scripts/common.sh b/hdfs-autoconf/scripts/common.sh
new file mode 100644
index 00000000..9e330b9d
--- /dev/null
+++ b/hdfs-autoconf/scripts/common.sh
@@ -0,0 +1,77 @@
+#!/bin/bash
+
+source config.sh
+
+# Colors!
+# How to use them? See example:
+# echo -e "See the real ${cRED}RED${cRESET} color"
+
+cBLACK='\E[0;30m'
+cRED='\E[0;31m'
+cGREEN='\E[0;32m'
+cYELLOW='\E[0;33m'
+cBLUE='\E[0;34m'
+cMAGENTA='\E[0;35m'
+cCYAN='\E[0;36m'
+cWHITE='\E[1;37m'
+cRESET='\E[00m'
+
+# print a message in red and exit
+function fail {
+  echo -e "${cRED}$1${cRESET}"
+  exit 1
+}
+
+# The script patches a template file with sed scripts. All changes
+# are made in-place.
+#
+# Usage
+# bash patcher.sh
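+#
+# (An illustrative sketch rather than the exact patcher.sh interface.)
+# Every config-meta/*.sed file is just a list of `s:{{placeholder}}:value:g`
+# substitutions and every config-templates/*.template file carries the
+# matching {{placeholder}} tokens, so the in-place patching amounts to
+# something like:
+#
+#   cp $TEMPLATES_DIR/avatar-site.xml.template /tmp/avatar-site.xml
+#   sed -i -f $METACONF_DIR/avatar-zero.sed   /tmp/avatar-site.xml
+#   sed -i -f $METACONF_DIR/avatar-shared.sed /tmp/avatar-site.xml
+#
+# Here /tmp/avatar-site.xml is only an example destination.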