diff --git a/Dockerfile b/Dockerfile
index b1a55d9..9ce5f42 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -10,7 +10,7 @@ USER root
# install dev tools
RUN yum clean all; \
rpm --rebuilddb; \
- yum install -y curl which tar sudo openssh-server openssh-clients rsync
+ yum install -y yum-plugin-ovl curl which tar sudo openssh-server openssh-clients rsync
# update libselinux. see https://github.com/sequenceiq/hadoop-docker/issues/14
RUN yum update -y libselinux
@@ -22,13 +22,8 @@ RUN cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys
# java
-RUN curl -LO 'http://download.oracle.com/otn-pub/java/jdk/7u71-b14/jdk-7u71-linux-x64.rpm' -H 'Cookie: oraclelicense=accept-securebackup-cookie'
-RUN rpm -i jdk-7u71-linux-x64.rpm
-RUN rm jdk-7u71-linux-x64.rpm
-
-ENV JAVA_HOME /usr/java/default
-ENV PATH $PATH:$JAVA_HOME/bin
-RUN rm /usr/bin/java && ln -s $JAVA_HOME/bin/java /usr/bin/java
+RUN yum -y install java-1.8.0-openjdk-devel.x86_64 && yum clean all
+COPY java_env.sh /etc/profile.d/java_env.sh
# download native support
RUN mkdir -p /tmp/native
@@ -46,7 +41,7 @@ ENV HADOOP_YARN_HOME /usr/local/hadoop
ENV HADOOP_CONF_DIR /usr/local/hadoop/etc/hadoop
ENV YARN_CONF_DIR $HADOOP_PREFIX/etc/hadoop
-RUN sed -i '/^export JAVA_HOME/ s:.*:export JAVA_HOME=/usr/java/default\nexport HADOOP_PREFIX=/usr/local/hadoop\nexport HADOOP_HOME=/usr/local/hadoop\n:' $HADOOP_PREFIX/etc/hadoop/hadoop-env.sh
+RUN sed -i '/^export JAVA_HOME/ s:.*:export JAVA_HOME=/etc/alternatives/java_sdk\nexport HADOOP_PREFIX=/usr/local/hadoop\nexport HADOOP_HOME=/usr/local/hadoop\n:' $HADOOP_PREFIX/etc/hadoop/hadoop-env.sh
RUN sed -i '/^export HADOOP_CONF_DIR/ s:.*:export HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop/:' $HADOOP_PREFIX/etc/hadoop/hadoop-env.sh
#RUN . $HADOOP_PREFIX/etc/hadoop/hadoop-env.sh
@@ -61,6 +56,11 @@ ADD hdfs-site.xml $HADOOP_PREFIX/etc/hadoop/hdfs-site.xml
ADD mapred-site.xml $HADOOP_PREFIX/etc/hadoop/mapred-site.xml
ADD yarn-site.xml $HADOOP_PREFIX/etc/hadoop/yarn-site.xml
+# prepare tez installation: fetch the pinned 0.8.5 release from archive.apache.org (keeps all releases; mirrors drop old ones)
+ADD tez-site.xml $HADOOP_PREFIX/etc/hadoop/tez-site.xml
+RUN mkdir -p /root/tez
+RUN curl -fsSL https://archive.apache.org/dist/tez/0.8.5/apache-tez-0.8.5-bin.tar.gz | tar -xz -C /root/tez
+
RUN $HADOOP_PREFIX/bin/hdfs namenode -format
# fixing the libhadoop.so like a boss
diff --git a/README.md b/README.md
index 1d7cace..00bab86 100644
--- a/README.md
+++ b/README.md
@@ -1,43 +1,20 @@
-# Apache Hadoop 2.7.1 Docker image
+[](https://hub.docker.com/r/ouyi/hadoop-docker/)
+[](https://hub.docker.com/r/ouyi/hadoop-docker/)
-[](https://registry.hub.docker.com/u/sequenceiq/hadoop-docker/)
-[](https://registry.hub.docker.com/u/sequenceiq/hadoop-docker/)
+# Apache Hadoop 2.7.1 in a Docker container
+Hadoop in pseudo-distributed mode in a Docker container, forked from [sequenceiq](https://github.com/sequenceiq/hadoop-docker). This fork adds:
+- Bug [fix](https://github.com/sequenceiq/hadoop-docker/pull/75)
+- Use openjdk
+- Tez installation
-_Note: this is the master branch - for a particular Hadoop version always check the related branch_
-
-A few weeks ago we released an Apache Hadoop 2.3 Docker image - this quickly become the most [popular](https://registry.hub.docker.com/search?q=hadoop&s=downloads) Hadoop image in the Docker [registry](https://registry.hub.docker.com/).
-
-
-Following the success of our previous Hadoop Docker [images](https://registry.hub.docker.com/u/sequenceiq/hadoop-docker/), the feedback and feature requests we received, we aligned with the Hadoop release cycle, so we have released an Apache Hadoop 2.7.1 Docker image - same as the previous version, it's available as a trusted and automated build on the official Docker [registry](https://registry.hub.docker.com/).
-
-
-_FYI: All the former Hadoop releases (2.3, 2.4.0, 2.4.1, 2.5.0, 2.5.1, 2.5.2, 2.6.0) are available in the GitHub branches or our [Docker Registry](https://registry.hub.docker.com/u/sequenceiq/hadoop-docker/) - check the tags._
-
-# Build the image
-
-If you'd like to try directly from the Dockerfile you can build the image as:
-
-```
-docker build -t sequenceiq/hadoop-docker:2.7.1 .
-```
-# Pull the image
-
-The image is also released as an official Docker image from Docker's automated build repository - you can always pull or refer the image when launching containers.
-
-```
-docker pull sequenceiq/hadoop-docker:2.7.1
-```
-
-# Start a container
-
-In order to use the Docker image you have just build or pulled use:
+## Start a container
**Make sure that SELinux is disabled on the host. If you are using boot2docker you don't need to do anything.**
```
-docker run -it sequenceiq/hadoop-docker:2.7.1 /etc/bootstrap.sh -bash
+docker run -P -it ouyi/hadoop-docker /etc/bootstrap.sh -bash
```
## Testing
@@ -50,13 +27,5 @@ cd $HADOOP_PREFIX
bin/hadoop jar share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.1.jar grep input output 'dfs[a-z.]+'
# check the output
-bin/hdfs dfs -cat output/*
+bin/hadoop fs -cat output/{*}
```
-
-## Hadoop native libraries, build, Bintray, etc
-
-The Hadoop build process is no easy task - requires lots of libraries and their right version, protobuf, etc and takes some time - we have simplified all these, made the build and released a 64b version of Hadoop nativelibs on this [Bintray repo](https://bintray.com/sequenceiq/sequenceiq-bin/hadoop-native-64bit/2.7.0/view/files). Enjoy.
-
-## Automate everything
-
-As we have mentioned previousely, a Docker file was created and released in the official [Docker repository](https://registry.hub.docker.com/u/sequenceiq/hadoop-docker/)
diff --git a/bootstrap.sh b/bootstrap.sh
index 4cf0e55..02821bd 100755
--- a/bootstrap.sh
+++ b/bootstrap.sh
@@ -18,6 +18,15 @@ $HADOOP_PREFIX/sbin/start-dfs.sh
$HADOOP_PREFIX/sbin/start-yarn.sh
$HADOOP_PREFIX/sbin/mr-jobhistory-daemon.sh start historyserver
+# install Tez
+export PATH=$HADOOP_PREFIX/sbin:$HADOOP_PREFIX/bin:$PATH
+hdfs dfsadmin -safemode wait
+hadoop fs -mkdir -p /apps/tez
+hadoop fs -copyFromLocal /root/tez/apache-tez-0.8.5-bin/share/tez.tar.gz /apps/apache-tez-0.8.5-bin.tar.gz
+export TEZ_CONF_DIR=/usr/local/hadoop/etc/hadoop/
+export TEZ_JARS=/root/tez/apache-tez-0.8.5-bin
+export HADOOP_CLASSPATH=${TEZ_CONF_DIR}:${TEZ_JARS}/*:${TEZ_JARS}/lib/*
+
if [[ $1 == "-d" ]]; then
while true; do sleep 1000; done
fi
diff --git a/java_env.sh b/java_env.sh
new file mode 100644
index 0000000..65410e1
--- /dev/null
+++ b/java_env.sh
@@ -0,0 +1,2 @@
+export JAVA_HOME=/etc/alternatives/java_sdk
+export PATH=${JAVA_HOME}/bin:${PATH}
diff --git a/mapred-site.xml b/mapred-site.xml
index dba582f..e3371b3 100644
--- a/mapred-site.xml
+++ b/mapred-site.xml
@@ -3,4 +3,20 @@
mapreduce.framework.name
yarn
+
+ mapreduce.map.memory.mb
+ 256
+
+
+ mapreduce.reduce.memory.mb
+ 512
+
+
+ mapreduce.map.java.opts
+ -Xmx150m
+
+
+ mapreduce.reduce.java.opts
+ -Xmx300m
+
diff --git a/tez-site.xml b/tez-site.xml
new file mode 100644
index 0000000..116d389
--- /dev/null
+++ b/tez-site.xml
@@ -0,0 +1,1151 @@
+
+
+
+
+
+
+
+
+
+ tez.dag.recovery.enabled
+ true
+ Boolean value. Enable recovery of DAGs. This allows a restarted app master to recover the
+ incomplete DAGs from the previous instance of the app master.
+ boolean
+
+
+
+ tez.dag.recovery.io.buffer.size
+ 8192
+ Int value. Size in bytes for the IO buffer size while processing the recovery file.
+ Expert level setting.
+ integer
+
+
+
+ tez.dag.recovery.flush.interval.secs
+ 30
+ Int value. Interval, in seconds, between flushing recovery data to the recovery log.
+ integer
+
+
+
+ tez.dag.recovery.max.unflushed.events
+ 100
+ Int value. Number of recovery events to buffer before flushing them to the recovery log.
+ integer
+
+
+
+ tez.task.heartbeat.timeout.check-ms
+ 30000
+ Int value. Time interval, in milliseconds, between checks for lost tasks.
+ Expert level setting.
+ integer
+
+
+
+ tez.task.timeout-ms
+ 300000
+ Int value. Time interval, in milliseconds, within which a task must heartbeat to the app master
+ before it is considered lost.
+ Expert level setting.
+ integer
+
+
+
+ tez.am.acls.enabled
+ true
+ Boolean value. Configuration to enable/disable ACL checks.
+ boolean
+
+
+
+ tez.allow.disabled.timeline-domains
+ false
+ Boolean value.
+ Allow disabling of Timeline Domains even if Timeline is being used.
+ boolean
+ true
+
+
+
+ tez.am.client.am.port-range
+ String value. Range of ports that the AM can use when binding for client connections. Leave blank
+ to use all possible ports. Expert level setting. It's hadoop standard range configuration.
+ For example 50000-50050,50100-50200
+ string
+
+
+
+ tez.am.client.am.thread-count
+ 1
+ Int value. Number of threads to handle client RPC requests. Expert level setting.
+ integer
+
+
+
+ tez.am.commit-all-outputs-on-dag-success
+ true
+ Boolean value. Determines when the final outputs to data sinks are committed. Commit is an
+ output specific operation and typically involves making the output visible for consumption.
+ If the config is true, then the outputs are committed at the end of DAG completion after all
+ constituent vertices have completed. If false, outputs for each vertex are committed after that
+ vertex succeeds. Depending on the desired output visibility and downstream consumer dependencies
+ this value must be appropriately chosen. Defaults to the safe choice of true.
+ boolean
+
+
+
+ tez.am.containerlauncher.thread-count-limit
+ 500
+ Int value. Upper limit on the number of threads used to launch containers in the app
+ master. Expert level setting.
+ integer
+
+
+
+ tez.am.container.idle.release-timeout-max.millis
+ 10000
+ Int value. The maximum amount of time to hold on to a container if no task can be
+ assigned to it immediately. Only active when reuse is enabled. The value
+ must be +ve and >=
+ TezConfiguration#TEZ_AM_CONTAINER_IDLE_RELEASE_TIMEOUT_MIN_MILLIS.
+ Containers will have an expire time set to a random value between
+ TezConfiguration#TEZ_AM_CONTAINER_IDLE_RELEASE_TIMEOUT_MIN_MILLIS &&
+ TezConfiguration#TEZ_AM_CONTAINER_IDLE_RELEASE_TIMEOUT_MAX_MILLIS. This
+ creates a graceful reduction in the amount of idle resources held
+ long
+
+
+
+ tez.am.container.idle.release-timeout-min.millis
+ 5000
+ Int value. The minimum amount of time to hold on to a container that is idle. Only active when
+ reuse is enabled. Set to -1 to never release idle containers (not recommended).
+ integer
+
+
+
+ tez.am.container.reuse.enabled
+ true
+ Boolean value. Configuration to specify whether container should be reused across tasks.
+ This improves performance by not incurring recurring launch overheads.
+ boolean
+
+
+
+ tez.am.container.reuse.locality.delay-allocation-millis
+ 250
+ Int value. The amount of time to wait before assigning a container to the next level
+ of locality. NODE -> RACK -> NON_LOCAL. Delay scheduling parameter. Expert level setting.
+ long
+
+
+
+ tez.am.container.reuse.non-local-fallback.enabled
+ false
+ Boolean value. Whether to reuse containers for non-local tasks. Active only if reuse is
+ enabled. Turning this on can severely affect locality and can be bad for jobs with high data
+ volume being read from the primary data sources.
+ boolean
+
+
+
+ tez.am.container.reuse.rack-fallback.enabled
+ true
+ Boolean value. Whether to reuse containers for rack local tasks. Active only if reuse is
+ enabled.
+ boolean
+
+
+
+ tez.am.credentials-merge
+ Boolean value. If true then Tez will add the ApplicationMaster credentials
+ to all task credentials.
+ boolean
+
+
+
+ tez.am.dag.scheduler.class
+ org.apache.tez.dag.app.dag.impl.DAGSchedulerNaturalOrder
+ String value. The class to be used for DAG Scheduling. Expert level setting.
+ string
+
+
+
+ tez.am.disable.client-version-check
+ false
+ Boolean value.
+ Disable version check between client and AM/DAG. Default false.
+ boolean
+ true
+
+
+
+ tez.am.inline.task.execution.enabled
+ false
+ Tez AM Inline Mode flag. Not valid till Tez-684 get checked-in
+ boolean
+ true
+
+
+
+ tez.am.inline.task.execution.max-tasks
+ 1
+ Int value.
+ The maximum number of tasks running in parallel within the app master process.
+ integer
+
+
+
+ tez.am.launch.cluster-default.cmd-opts
+ -server -Djava.net.preferIPv4Stack=true -Dhadoop.metrics.log.level=WARN
+ String value. Command line options which will be prepended to {@link #TEZ_AM_LAUNCH_CMD_OPTS}
+ during the launch of the AppMaster process. This property will typically be configured to
+ include default options meant to be used by all jobs in a cluster. If required, the values can
+ be overridden per job.
+ string
+
+
+
+ tez.am.launch.cluster-default.env
+ String value. Env settings will be merged with {@link #TEZ_AM_LAUNCH_ENV}
+ during the launch of the AppMaster process. This property will typically be configured to
+ include default system env meant to be used by all jobs in a cluster. If required, the values can
+ be appended to per job.
+ string
+
+
+
+ tez.am.launch.cmd-opts
+ -XX:+PrintGCDetails -verbose:gc -XX:+PrintGCTimeStamps -XX:+UseNUMA -XX:+UseParallelGC
+ String value. Command line options provided during the launch of the Tez
+ AppMaster process. It is recommended to not set any Xmx or Xms in these launch opts so that
+ Tez can determine them automatically.
+ string
+
+
+
+ tez.am.launch.env
+ String value. Env settings for the Tez AppMaster process.
+ Should be specified as a comma-separated of key-value pairs where each pair
+ is defined as KEY=VAL
+ e.g. "LD_LIBRARY_PATH=.,USERNAME=foo"
+ These take least precedence compared to other methods of setting env.
+ These get added to the app master environment prior to launching it.
+ This setting will prepend existing settings in the cluster default
+ string
+
+
+
+ tez.am.legacy.speculative.slowtask.threshold
+ Float value. Specifies how many standard deviations away from the mean task execution time
+ should be considered as an outlier/slow task.
+ float
+ true
+
+
+
+ tez.am.log.level
+ INFO
+ Root Logging level passed to the Tez app master.
+
+ Simple configuration: Set the log level for all loggers.
+ e.g. INFO
+ This sets the log level to INFO for all loggers.
+
+ Advanced configuration: Set the log level for all classes, along with a different level for some.
+ e.g. DEBUG;org.apache.hadoop.ipc=INFO;org.apache.hadoop.security=INFO
+ This sets the log level for all loggers to DEBUG, except for the
+ org.apache.hadoop.ipc and org.apache.hadoop.security, which are set to INFO
+
+ Note: The global log level must always be the first parameter.
+ DEBUG;org.apache.hadoop.ipc=INFO;org.apache.hadoop.security=INFO is valid
+ org.apache.hadoop.ipc=INFO;org.apache.hadoop.security=INFO is not valid
+ string
+
+
+
+ tez.am.max.allowed.time-sec.for-read-error
+ 300
+ int value. Represents the maximum time in seconds for which a consumer attempt can report
+ a read error against its producer attempt, after which the producer attempt will be re-run
+ to re-generate the output. There are other heuristics which determine the retry and mainly
+ try to guard against a flurry of re-runs due to intermittent read errors
+ (due to network issues). This configuration puts a time limit on those heuristics to ensure
+ jobs don't hang indefinitely due to lack of closure in those heuristics
+
+ Expert level setting.
+ integer
+
+
+
+ tez.am.max.app.attempts
+ 2
+ Int value. Specifies the number of times the app master can be launched in order to recover
+ from app master failure. Typically app master failures are non-recoverable. This parameter
+ is for cases where the app master is not at fault but is lost due to system errors.
+ Expert level setting.
+ integer
+
+
+
+ tez.am.maxtaskfailures.per.node
+ 10
+ Int value. Specifies the number of task failures on a node before the node is considered faulty.
+ integer
+
+
+
+ tez.am.modify-acls
+ String value.
+ AM modify ACLs. This allows the specified users/groups to run modify operations on the AM
+ such as submitting DAGs, pre-warming the session, killing DAGs or shutting down the session.
+ Comma separated list of users, followed by whitespace, followed by a comma separated list of
+ groups
+ string
+
+
+
+ tez.am.node-blacklisting.enabled
+ true
+ Boolean value. Enable blacklisting of nodes that are considered faulty. These nodes
+ will not be used to execute tasks.
+ boolean
+
+
+
+ tez.am.node-blacklisting.ignore-threshold-node-percent
+ 33
+ Int value. Specifies the percentage of nodes in the cluster that may be considered faulty.
+ This limits the number of nodes that are blacklisted in an effort to minimize the effects of
+ temporary surges in failures (e.g. due to network outages).
+ integer
+
+
+
+ tez.am.node-unhealthy-reschedule-tasks
+ false
+ Boolean value. Enable task rescheduling for node updates.
+ When enabled the task scheduler will reschedule task attempts that
+ are associated with an unhealthy node to avoid potential data transfer
+ errors from downstream tasks.
+ boolean
+
+
+
+ tez.am.preemption.heartbeats-between-preemptions
+ 3
+ Int value. The number of RM heartbeats to wait after preempting running tasks before preempting
+ more running tasks. After preempting a task, we need to wait at least 1 heartbeat so that the
+ RM can act on the released resources and assign new ones to us. Expert level setting.
+ integer
+
+
+
+ tez.am.preemption.max.wait-time-ms
+ 60000
+ Int value. Time (in millisecs) that an unsatisfied request will wait before preempting other
+ resources. In rare cases, the cluster says there are enough free resources but does not end
+ up getting enough on a node to actually assign it to the job. This configuration tries to put
+ a deadline on such wait to prevent indefinite job hangs.
+ integer
+
+
+
+ tez.am.preemption.percentage
+ 10
+ Int value. Specifies the percentage of tasks eligible to be preempted that
+ will actually be preempted in a given round of Tez internal preemption.
+ This slows down preemption and gives more time for free resources to be
+ allocated by the cluster (if any) and gives more time for preemptable tasks
+ to finish. Valid values are 0-100. Higher values will preempt quickly at
+ the cost of losing work. Setting to 0 turns off preemption. Expert level
+ setting.
+ integer
+
+
+
+ tez.am.resource.cpu.vcores
+ 1
+ Int value. The number of virtual cores to be used by the app master
+ integer
+
+
+
+ tez.am.resource.memory.mb
+ 1024
+ Int value. The amount of memory in MB to be used by the AppMaster
+ integer
+
+
+
+ tez.am.am-rm.heartbeat.interval-ms.max
+ 1000
+ Int value. The maximum heartbeat interval between the AM and RM in milliseconds
+ Increasing this reduces the communication between the AM and the RM and can
+ help in scaling up. Expert level setting.
+ integer
+
+
+
+ tez.am.session.min.held-containers
+ 0
+ Int value. The minimum number of containers that will be held in session mode. Not active in
+ non-session mode. Enables an idle session (not running any DAG) to hold on to a minimum number
+ of containers to provide fast response times for the next DAG.
+ integer
+
+
+
+ tez.am.mode.session
+ false
+ Boolean value. Execution mode for the Tez application. True implies session mode. If the client
+ code is written according to best practices then the same code can execute in either mode based
+ on this configuration. Session mode is more aggressive in reserving execution resources and is
+ typically used for interactive applications where multiple DAGs are submitted in quick succession
+ by the same user. For long running applications, one-off executions, batch jobs etc non-session
+ mode is recommended. If session mode is enabled then container reuse is recommended.
+ boolean
+
+
+
+ tez.am.speculation.enabled
+ false
+ boolean
+ true
+
+
+
+ tez.staging-dir
+ String value. Specifies a directory where Tez can create temporary job artifacts.
+ string
+
+
+
+ tez.am.staging.scratch-data.auto-delete
+ true
+ Boolean value. If true then Tez will try to automatically delete temporary job
+ artifacts that it creates within the specified staging dir. Does not affect any user data.
+ boolean
+
+
+
+ tez.am.task.listener.thread-count
+ 30
+ Int value. The number of threads used to listen to task heartbeat requests.
+ Expert level setting.
+ integer
+
+
+
+ tez.am.task.max.failed.attempts
+ 4
+ Int value. The maximum number of attempts that can fail for a particular task before the task is failed.
+ This does not count killed attempts. Task failure results in DAG failure.
+ integer
+
+
+
+ tez.am.tez-ui.history-url.template
+ __HISTORY_URL_BASE__/#/tez-app/__APPLICATION_ID__
+ String value
+ Tez UI URL template for the application.
+ Expert level setting.
+
+ The AM will redirect the user to the Tez UI via this url. Template supports the following
+ parameters to be replaced with the actual runtime information:
+
+ __APPLICATION_ID__ : Replaces this with application ID
+ __HISTORY_URL_BASE__: replaces this with TEZ_HISTORY_URL_BASE
+
+ For example, "http://uihost:9001/#/tez-app/__APPLICATION_ID__/" will be replaced with
+ http://uihost:9001/#/tez-app/application_1421880306565_0001/
+ string
+
+
+
+ tez.am.vertex.max-task-concurrency
+ -1
+ Int value. The maximum number of attempts that can run concurrently for a given vertex.
+ Setting <=0 implies no limit
+ integer
+
+
+
+ tez.am.view-acls
+ String value.
+ AM view ACLs. This allows the specified users/groups to view the status of the AM and all DAGs
+ that run within this AM.
+ Comma separated list of users, followed by whitespace, followed by a comma separated list of
+ groups
+ string
+
+
+
+ tez.am.tez-ui.webservice.enable
+ true
+ String value
+ Allow disabling of the Tez AM webservice. If set to false the Tez-UI won't show progress
+ updates for running application.
+ boolean
+
+
+
+ tez.application.tags
+ String value. Tags for the job that will be passed to YARN at submission
+ time. Queries to YARN for applications can filter on these tags.
+ string
+
+
+
+ tez.aux.uris
+ Auxiliary resources to be localized for the Tez AM and all its containers.
+
+ Value is comma-separated list of fully-resolved directories or file paths. All resources
+ are made available into the working directory of the AM and/or containers i.e. $CWD.
+
+ If directories are specified, they are not traversed recursively. Only files directly under the
+ specified directory are localized.
+
+ All duplicate resources are ignored.
+ string
+ ${fs.defaultFS}/apps/tez
+
+
+
+ tez.cancel.delegation.tokens.on.completion
+ true
+ boolean
+ true
+
+
+
+ tez.client.asynchronous-stop
+ true
+ Boolean value. Backwards compatibility setting. Changes TezClient stop to be a
+ synchronous call waiting until AM is in a final state before returning to the user.
+ Expert level setting.
+ boolean
+
+
+
+ tez.client.diagnostics.wait.timeout-ms
+ 3000
+ Long value
+ Time to wait (in milliseconds) for the YARN app's diagnostics to become available
+ Workaround for YARN-2560
+ long
+ true
+
+
+
+ tez.client.timeout-ms
+ 30000
+ Long value. Time interval, in milliseconds, for client to wait during client-requested
+ AM shutdown before issuing a hard kill to the RM for this application.
+ Expert level setting.
+ long
+
+
+
+ tez.java.opts.checker.class
+ String value.
+ Ability to provide a different implementation to check/verify java opts defined
+ for vertices/tasks.
+ Class has to be an instance of JavaOptsChecker
+ string
+ true
+
+
+
+ tez.java.opts.checker.enabled
+ true
+ Boolean value. Default true.
+ Ability to disable the Java Opts Checker
+ boolean
+ true
+
+
+
+ tez.container.max.java.heap.fraction
+ 0.8
+ Double value. Tez automatically determines the Xmx for the JVMs used to run
+ Tez tasks and app masters. This feature is enabled if the user has not
+ specified Xmx or Xms values in the launch command opts. Doing automatic Xmx
+ calculation is preferred because Tez can determine the best value based on
+ actual allocation of memory to tasks in the cluster. The value is used as a
+ fraction that is applied to the allocated container memory to size Xmx.
+ Value should be greater than 0 and less than 1.
+ float
+
+
+
+ tez.counters.counter-name.max-length
+ 64
+ Int value. Configuration to limit the length of counter names. This can be used to
+ limit the amount of memory being used in the app master to store the
+ counters. Expert level setting.
+ integer
+ true
+
+
+
+ tez.counters.group-name.max-length
+ 256
+ Int value. Configuration to limit the counter group names per app master. This can be used to
+ limit the amount of memory being used in the app master to store the
+ counters. Expert level setting.
+ integer
+ true
+
+
+
+ tez.counters.max
+ 1200
+ Int value. Configuration to limit the counters per dag (AppMaster and Task). This can be used
+ to
+ limit the amount of memory being used in the app master to store the
+ counters. Expert level setting.
+ integer
+ true
+
+
+
+ tez.counters.max.groups
+ 500
+ Int value. Configuration to limit the number of counter groups for a DAG. This can be used to
+ limit the amount of memory being used in the app master to store the
+ counters. Expert level setting.
+ integer
+ true
+
+
+
+ tez.credentials.path
+ String value that is a file path.
+ Path to a credentials file (with serialized credentials) located on the local file system.
+ string
+
+
+
+ tez.dag.status.pollinterval-ms
+ 500
+ Long value
+ Status Poll interval in Milliseconds used when getting DAG status with timeout.
+ long
+
+
+
+ tez.generate.debug.artifacts
+ false
+ boolean
+ true
+
+
+
+ tez.history.logging.log.level
+ Enum value. Config to limit the type of events published to the history logging service.
+ The valid log levels are defined in the enum {@link HistoryLogLevel}. The default value is
+ defined in {@link HistoryLogLevel#DEFAULT}.
+ string
+
+
+
+ tez.history.logging.service.class
+ org.apache.tez.dag.history.logging.impl.SimpleHistoryLoggingService
+ String value that is a class name.
+ Specify the class to use for logging history data.
+ To disable, set this to "org.apache.tez.dag.history.logging.impl.DevNullHistoryLoggingService"
+ string
+
+
+
+ tez.history.logging.taskattempt-filters
+ List of comma separated enum values. Specifies the list of task attempt termination causes,
+ which have to be suppressed from being logged to ATS. The valid filters are defined in the
+ enum TaskAttemptTerminationCause. The filters are applied only if tez.history.logging.log.level
+ is set to TASK_ATTEMPT.
+ string
+
+
+
+ tez.history.logging.timeline-cache-plugin.old-num-dags-per-group
+ Comma separated list of Integers. These are the values that were set for the config value
+ for {@value #TEZ_HISTORY_LOGGING_TIMELINE_NUM_DAGS_PER_GROUP}. The older values are required so
+ that the groupIds generated previously will continue to be generated by the plugin. If an older
+ value is not present then the UI may not show information for DAGs which were created
+ with a different grouping value.
+
+ Note: Do not add too many values here as it will affect the performance of Yarn Timeline
+ Server/Tez UI due to the need to scan for more log files.
+ string
+ true
+ true
+
+
+
+ tez.history.logging.timeline.num-dags-per-group
+ 1
+ Integer value. Number of DAGs to be grouped together. This is used by the history logging
+ service to generate groupIds such that numDagsPerGroup will have same groupId in a given
+ session. If the value is set to 1 then we disable grouping. This config is used to control the
+ number of DAGs written into one log file, and hence controls number of files created in
+ the Filesystem used by YARN Timeline.
+ integer
+ true
+ true
+
+
+
+ tez.tez-ui.history-url.base
+ String value
+ Tez-UI Url base. This gets replaced in the TEZ_AM_TEZ_UI_HISTORY_URL_TEMPLATE
+ ex http://ui-host:9001 or if its hosted with a prefix http://ui-host:9001/~user
+ if the ui is hosted on the default port (80 for http and 443 for https), the port should not
+ be specified.
+ string
+
+
+
+ tez.ignore.lib.uris
+ Boolean value. Allows to ignore 'tez.lib.uris'. Useful during development as well as
+ raw Tez application where classpath is propagated with application
+ via {@link LocalResource}s. This is mainly useful for developer/debugger scenarios.
+ boolean
+ true
+
+
+
+ tez.ipc.payload.reserved.bytes
+ 5242880
+ Int value. SubmitDAGPlanRequest cannot be larger than Max IPC message size minus this number; otherwise, it will
+ be serialized to HDFS and we transfer the path to server. Server will deserialize the request from HDFS.
+ int
+ true
+
+
+
+ tez.tez.jvm.system-properties-to-log
+ String value. Determines what JVM properties will be logged for debugging purposes
+ in the AM and Task runtime logs.
+ string
+
+
+
+ tez.lib.uris
+ String value to a file path.
+ The location of the Tez libraries which will be localized for DAGs.
+ This follows the following semantics
+ <ol>
+ <li> To use .tar.gz or .tgz files (generated by the tez or hadoop builds), the full path to this
+ file (including filename) should be specified. The internal structure of the uncompressed tgz
+ will be defined by 'tez.lib.uris.classpath'</li>
+
+ <li> If a single file is specified without the above mentioned extensions - it will be treated as
+ a regular file. This means it will not be uncompressed during runtime. </li>
+
+ <li> If multiple entries exist
+ <ul>
+ <li> Regular Files: will be treated as regular files (not uncompressed during runtime) </li>
+ <li> Archive Files: will be treated as archives and will be uncompressed during runtime </li>
+ <li> Directories: all files under the directory (non-recursive) will be made available (but not
+ uncompressed during runtime). </li>
+ </ul>
+ </ol>
+ string
+ ${fs.defaultFS}/apps/apache-tez-0.8.5-bin.tar.gz
+
+
+
+ tez.lib.uris.classpath
+ Specify additional user classpath information to be used for Tez AM and all containers.
+ This will be appended to the classpath after PWD
+
+ 'tez.lib.uris.classpath' defines the relative classpath into the archives
+ that are set in 'tez.lib.uris'
+ string
+
+
+
+ tez.local.mode
+ false
+ Boolean value. Enable local mode execution in Tez. Enables tasks to run in the same process as
+ the app master. Primarily used for debugging.
+ boolean
+
+
+
+ tez.queue.name
+ String value. The queue name for all jobs being submitted from a given client.
+ string
+
+
+
+ tez.session.am.dag.submit.timeout.secs
+ 300
+ Int value. Time (in seconds) for which the Tez AM should wait for a DAG to be submitted before
+ shutting down. Only relevant in session mode. Any negative value will disable this check and
+ allow the AM to hang around forever in idle mode.
+ integer
+
+
+
+ tez.session.client.timeout.secs
+ 120
+ Int value. Time (in seconds) to wait for AM to come up when trying to submit a DAG
+ from the client. Only relevant in session mode. If the cluster is busy and cannot launch the
+ AM then this timeout may be hit. In those case, using non-session mode is recommended if
+ applicable. Otherwise increase the timeout (set to -1 for infinity. Not recommended)
+ integer
+
+
+
+ tez.simple.history.logging.dir
+ String value. The directory into which history data will be written. This defaults to the
+ container logging directory. This is relevant only when SimpleHistoryLoggingService is being
+ used for {@link TezConfiguration#TEZ_HISTORY_LOGGING_SERVICE_CLASS}
+ string
+
+
+
+ tez.simple.history.max.errors
+ 10
+ Int value. Maximum errors allowed while logging history data. After crossing this limit history
+ logging gets disabled. The job continues to run after this.
+ integer
+
+
+
+ tez.task.am.heartbeat.counter.interval-ms.max
+ 4000
+ Int value. Interval, in milliseconds, after which counters are sent to AM in heartbeat from
+ tasks. This reduces the amount of network traffic between AM and tasks to send high-volume
+ counters. Improves AM scalability. Expert level setting.
+ integer
+
+
+
+ tez.task.am.heartbeat.interval-ms.max
+ 100
+ Int value. The maximum heartbeat interval, in milliseconds, between the app master and tasks.
+ Increasing this can help improve app master scalability for a large number of concurrent tasks.
+ Expert level setting.
+ integer
+
+
+
+ tez.task.generate.counters.per.io
+ false
+ Whether to generate counters per IO or not. Enabling this will rename
+ CounterGroups / CounterNames to make them unique per Vertex +
+ Src|Destination
+ boolean
+ true
+ true
+
+
+
+ tez.task.get-task.sleep.interval-ms.max
+ 200
+ Int value. The maximum amount of time, in milliseconds, to wait before a task asks an
+ AM for another task. Increasing this can help improve app master scalability for a large
+ number of concurrent tasks. Expert level setting.
+ integer
+
+
+
+ tez.task.initialize-processor-first
+ false
+ Boolean value. Backwards compatibility setting for initializing IO processor before
+ inputs and outputs.
+ Expert level setting.
+ boolean
+
+
+
+ tez.task.initialize-processor-io-serially
+ false
+ Boolean value. Backwards compatibility setting for initializing inputs and outputs
+ serially instead of the parallel default.
+ Expert level setting.
+ boolean
+
+
+
+ tez.task.launch.cluster-default.cmd-opts
+ -server -Djava.net.preferIPv4Stack=true -Dhadoop.metrics.log.level=WARN
+ String value. Command line options which will be prepended to {@link
+ #TEZ_TASK_LAUNCH_CMD_OPTS} during the launch of Tez tasks. This property will typically be configured to
+ include default options meant to be used by all jobs in a cluster. If required, the values can
+ be overridden per job.
+ string
+
+
+
+ tez.task.launch.cluster-default.env
+ String value. Env settings will be merged with {@link #TEZ_TASK_LAUNCH_ENV}
+ during the launch of the task process. This property will typically be configured to
+ include default system env meant to be used by all jobs in a cluster. If required, the values can
+ be appended to per job.
+ string
+
+
+
+ tez.task.launch.cmd-opts
+ -XX:+PrintGCDetails -verbose:gc -XX:+PrintGCTimeStamps -XX:+UseNUMA -XX:+UseParallelGC
+ String value. Command line options provided during the launch of Tez Task
+ processes. It's recommended to not set any Xmx or Xms in these launch opts
+ so that Tez can determine them automatically.
+ string
+
+
+
+ tez.task.launch.env
+ String value. Env settings for the Tez Task processes.
+ Should be specified as a comma-separated list of key-value pairs where each pair
+ is defined as KEY=VAL
+ e.g. "LD_LIBRARY_PATH=.,USERNAME=foo"
+ These take least precedence compared to other methods of setting env
+ These get added to the task environment prior to launching it.
+ This setting will prepend existing settings in the cluster default
+ string
+
+
+
+ tez.task.log.level
+ INFO
+ Root Logging level passed to the Tez tasks.
+
+ Simple configuration: Set the log level for all loggers.
+ e.g. INFO
+ This sets the log level to INFO for all loggers.
+
+ Advanced configuration: Set the log level for all classes, along with a different level for some.
+ e.g. DEBUG;org.apache.hadoop.ipc=INFO;org.apache.hadoop.security=INFO
+ This sets the log level for all loggers to DEBUG, except for the
+ org.apache.hadoop.ipc and org.apache.hadoop.security, which are set to INFO
+
+ Note: The global log level must always be the first parameter.
+ DEBUG;org.apache.hadoop.ipc=INFO;org.apache.hadoop.security=INFO is valid
+ org.apache.hadoop.ipc=INFO;org.apache.hadoop.security=INFO is not valid
+ string
+
+
+
+ tez.task.max-events-per-heartbeat
+ 500
+ Int value. Maximum number of events to fetch from the AM by the tasks in a single heartbeat.
+ Expert level setting.
+ integer
+
+
+
+ tez.task.max-event-backlog
+ 10000
+ Int value. Maximum number of pending task events before a task will stop
+ asking for more events in the task heartbeat.
+ Expert level setting.
+ integer
+
+
+
+ tez.task.progress.stuck.interval-ms
+ -1
+ Long value. Interval, in milliseconds, within which any of the tasks Input/Processor/Output
+ components need to make successive progress notifications. If the progress is not notified
+ for this interval then the task will be considered hung and terminated.
+ The value for this config should be larger than {@link TezConfiguration#TASK_HEARTBEAT_TIMEOUT_MS}
+ and larger than 2 times the value of {@link TezConfiguration#TEZ_TASK_AM_HEARTBEAT_INTERVAL_MS}.
+ A config value <=0 disables this.
+ string
+
+
+
+ tez.task.resource.calculator.process-tree.class
+ string
+ true
+ true
+
+
+
+ tez.task.resource.cpu.vcores
+ 1
+ Int value. The number of virtual cores to be used by tasks.
+ integer
+
+
+
+ tez.task.resource.memory.mb
+ 1024
+ Int value. The amount of memory in MB to be used by tasks. This applies to all tasks across
+ all vertices. Setting it to the same value for all tasks is helpful for container reuse and
+ thus good for performance typically.
+ integer
+
+
+
+ tez.task.scale.memory.additional-reservation.fraction.max
+ float
+ true
+ true
+
+
+
+ tez.task.scale.memory.additional-reservation.fraction.per-io
+ Fraction of available memory to reserve per input/output. This amount is
+ removed from the total available pool before allocation and is for factoring in overheads.
+ float
+ true
+ true
+
+
+
+ tez.task.scale.memory.allocator.class
+ org.apache.tez.runtime.library.resources.WeightedScalingMemoryDistributor
+ The allocator to use for initial memory allocation
+ string
+ true
+ true
+
+
+
+ tez.task.scale.memory.enabled
+ true
+ Whether to scale down memory requested by each component if the total
+ exceeds the available JVM memory
+ boolean
+ true
+ true
+
+
+
+ tez.task.scale.memory.reserve-fraction
+ 0.3
+ The fraction of the JVM memory which will not be considered for allocation.
+ No defaults, since there are pre-existing defaults based on different scenarios.
+ double
+ true
+ true
+
+
+
+ tez.task.scale.memory.ratios
+ string
+ true
+ true
+
+
+
+ tez.task-specific.launch.cmd-opts
+ Additional launch command options to be added for specific tasks.
+ __VERTEX_NAME__ and __TASK_INDEX__ can be specified, which would be replaced at
+ runtime by vertex name and task index.
+ e.g. tez.task-specific.launch.cmd-opts=
+ "-agentpath:libpagent.so,dir=/tmp/__VERTEX_NAME__/__TASK_INDEX__"
+ string
+ true
+
+
+
+ tez.task-specific.launch.cmd-opts.list
+ Set of tasks for which specific launch command options need to be added.
+ Format: "vertexName[csv of task ids];vertexName[csv of task ids].."
+ Valid e.g:
+ v[0,1,2] - Additional launch-cmd options for tasks 0,1,2 of vertex v
+ v[1,2,3];v2[5,6,7] - Additional launch-cmd options specified for tasks of vertices v and v2.
+ v[1:5,20,30];v2[2:5,60,7] - Additional launch-cmd options for 1,2,3,4,5,20,30 of vertex v; 2,
+ 3,4,5,60,7 of vertex v2
+ Partial ranges like :5, 1: are not supported.
+ v[] - Additional launch-cmd options for all tasks in vertex v
+ string
+ true
+
+
+
+ tez.task-specific.log.level
+ Task specific log level.
+
+ Simple configuration: Set the log level for all loggers.
+ e.g. INFO
+ This sets the log level to INFO for all loggers.
+
+ Advanced configuration: Set the log level for all classes, along with a different level for some.
+ e.g. DEBUG;org.apache.hadoop.ipc=INFO;org.apache.hadoop.security=INFO
+ This sets the log level for all loggers to DEBUG, except for the
+ org.apache.hadoop.ipc and org.apache.hadoop.security, which are set to INFO
+
+ Note: The global log level must always be the first parameter.
+ DEBUG;org.apache.hadoop.ipc=INFO;org.apache.hadoop.security=INFO is valid
+ org.apache.hadoop.ipc=INFO;org.apache.hadoop.security=INFO is not valid
+ string
+ true
+
+
+
+ tez.test.minicluster.app.wait.on.shutdown.secs
+ 30
+ Long value.
+ Time to wait (in seconds) for apps to complete on MiniTezCluster shutdown.
+ long
+ true
+
+
+
+ tez.use.cluster.hadoop-libs
+ false
+ Boolean value.
+ Specify whether hadoop libraries required to run Tez should be the ones deployed on the cluster.
+ This is disabled by default - with the expectation being that tez.lib.uris has a complete
+ tez-deployment which contains the hadoop libraries.
+ boolean
+
+
+
+ tez.yarn.ats.acl.domains.auto-create
+ true
+ boolean
+
+
+
+ tez.yarn.ats.event.flush.timeout.millis
+ -1
+ Long value. Time, in milliseconds, to wait while flushing YARN ATS data during shutdown.
+ Expert level setting.
+ long
+
+
+
+ tez.yarn.ats.max.events.per.batch
+ 5
+ Int value. Max no. of events to send in a single batch to ATS.
+ Expert level setting.
+ integer
+
+
+
+ tez.yarn.ats.max.polling.time.per.event.millis
+ 10
+ Int value. Time, in milliseconds, to wait for an event before sending a batch to ATS.
+ Expert level setting.
+ integer
+
+
+
diff --git a/yarn-site.xml b/yarn-site.xml
index 7593388..d9de36e 100644
--- a/yarn-site.xml
+++ b/yarn-site.xml
@@ -29,4 +29,25 @@
600
+
+yarn.nodemanager.resource.memory-mb
+2048
+
+
+
+ yarn.scheduler.minimum-allocation-mb
+ 128
+
+
+
+ yarn.nodemanager.vmem-pmem-ratio
+ 1.5
+
+
+
+ yarn.nodemanager.vmem-check-enabled
+ false
+ Whether virtual memory limits will be enforced for containers
+
+