From 1177bebdc59e8bcc43158a78a3eabbc51ada3e2a Mon Sep 17 00:00:00 2001 From: Raghav Aggarwal Date: Fri, 23 Jan 2026 02:14:34 +0530 Subject: [PATCH 01/10] TEZ-4682: [Cloud] Tez AM docker image --- .../org/apache/tez/dag/app/DAGAppMaster.java | 2 +- tez-dist/pom.xml | 32 ++++ tez-dist/src/docker/Dockerfile | 91 +++++++++++ tez-dist/src/docker/README.md | 65 ++++++++ tez-dist/src/docker/build-docker.sh | 128 +++++++++++++++ tez-dist/src/docker/conf/log4j2.properties | 25 +++ tez-dist/src/docker/conf/tez-site.xml | 61 +++++++ tez-dist/src/docker/entrypoint.sh | 153 ++++++++++++++++++ tez-dist/src/docker/tez.env | 31 ++++ 9 files changed, 587 insertions(+), 1 deletion(-) create mode 100644 tez-dist/src/docker/Dockerfile create mode 100644 tez-dist/src/docker/README.md create mode 100755 tez-dist/src/docker/build-docker.sh create mode 100644 tez-dist/src/docker/conf/log4j2.properties create mode 100644 tez-dist/src/docker/conf/tez-site.xml create mode 100644 tez-dist/src/docker/entrypoint.sh create mode 100644 tez-dist/src/docker/tez.env diff --git a/tez-dag/src/main/java/org/apache/tez/dag/app/DAGAppMaster.java b/tez-dag/src/main/java/org/apache/tez/dag/app/DAGAppMaster.java index c119d572ab..caab2f2a43 100644 --- a/tez-dag/src/main/java/org/apache/tez/dag/app/DAGAppMaster.java +++ b/tez-dag/src/main/java/org/apache/tez/dag/app/DAGAppMaster.java @@ -2425,7 +2425,7 @@ public static void main(String[] args) { Objects.requireNonNull(appSubmitTimeStr, ApplicationConstants.APP_SUBMIT_TIME_ENV + " is null"); - Configuration conf = new Configuration(); + Configuration conf = new TezConfiguration(); ServerFrameworkService frameworkService = getFrameworkService(conf); AMExtensions amExtensions = frameworkService.getAMExtensions(); diff --git a/tez-dist/pom.xml b/tez-dist/pom.xml index 9777d0c0b9..31dae3a28e 100644 --- a/tez-dist/pom.xml +++ b/tez-dist/pom.xml @@ -118,6 +118,38 @@ + + docker + + + + org.codehaus.mojo + exec-maven-plugin + + + build-docker-image + package + + exec 
+ + + /bin/bash + + ${project.basedir}/src/docker/build-docker.sh + -hadoop + ${hadoop.version} + -tez + ${project.version} + -repo + apache + + + + + + + + diff --git a/tez-dist/src/docker/Dockerfile b/tez-dist/src/docker/Dockerfile new file mode 100644 index 0000000000..680da464ff --- /dev/null +++ b/tez-dist/src/docker/Dockerfile @@ -0,0 +1,91 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +ARG BUILD_ENV=unarchive + +# hadolint ignore=DL3006 +FROM ubuntu AS unarchive +# hadolint ignore=DL3010 +ONBUILD COPY hadoop-*.tar.gz /opt +# hadolint ignore=DL3010 +ONBUILD COPY tez-*.tar.gz /opt + +# hadolint ignore=DL3006 +FROM ${BUILD_ENV} AS env +ARG HADOOP_VERSION +ARG TEZ_VERSION + +RUN mkdir -p /opt/hadoop \ + && tar -xzv \ + --exclude="hadoop-$HADOOP_VERSION/share/doc" \ + --exclude="*/jdiff" \ + --exclude="*/sources" \ + --exclude="*tests.jar" \ + --exclude="*/webapps" \ + -f /opt/hadoop-$HADOOP_VERSION.tar.gz \ + -C /opt/hadoop --strip-components 1 \ + && mkdir -p /opt/tez \ + && tar -xzv \ + -f /opt/tez-$TEZ_VERSION.tar.gz \ + -C /opt/tez \ + && rm -rf /opt/hadoop-$HADOOP_VERSION.tar.gz /opt/tez-$TEZ_VERSION.tar.gz + +FROM eclipse-temurin:21.0.3_9-jre-ubi9-minimal AS run + +ARG UID=1000 +ARG HADOOP_VERSION +ARG TEZ_VERSION + +# Install dependencies +# hadolint ignore=DL3041 +RUN set -ex; \ + microdnf update -y; \ + microdnf -y install procps gettext findutils; \ + microdnf clean all; \ + useradd --no-create-home -s /sbin/nologin -c "" --uid $UID tez + +# Set necessary environment variables +ENV HADOOP_HOME=/opt/hadoop \ + TEZ_HOME=/opt/tez \ + TEZ_CONF_DIR=/opt/tez/conf \ + HADOOP_CONF_DIR=/opt/tez/conf + +ENV TEZ_CLIENT_VERSION=$TEZ_VERSION + +ENV PATH=$TEZ_HOME/bin:$HADOOP_HOME/bin:$PATH + +COPY --from=env --chown=tez /opt/hadoop $HADOOP_HOME +# UPDATED: Copy from the normalized directory name created in 'env' stage +COPY --from=env --chown=tez /opt/tez $TEZ_HOME + +RUN mkdir -p $TEZ_CONF_DIR && chown tez:tez $TEZ_CONF_DIR + +COPY --chown=tez entrypoint.sh / +COPY --chown=tez conf $TEZ_CONF_DIR + +# Create Extension Point Directory +RUN mkdir -p /opt/tez/plugins && chown tez:tez /opt/tez/plugins && chmod 755 /opt/tez/plugins + +RUN chmod +x /entrypoint.sh + +USER tez +WORKDIR $TEZ_HOME + +# Expose AM ports via -p flag in docker command +# EXPOSE 10001 10002 10003 8042 + +ENTRYPOINT ["/entrypoint.sh"] diff --git a/tez-dist/src/docker/README.md 
b/tez-dist/src/docker/README.md new file mode 100644 index 0000000000..b055d8b629 --- /dev/null +++ b/tez-dist/src/docker/README.md @@ -0,0 +1,65 @@ + + +# Tez AM Docker + +1. Building the docker image: + + ```bash + mvn clean install -DskipTests -Pdocker,tools + ``` + +2. Install zookeeper in mac by: + + ```bash + brew install zookeeper + zkServer start + ``` + +3. Running the Tez AM container: + + ```bash + docker run \ + -p 10001:10001 -p 8042:8042 \ + --name tez-am \ + apache/tez-am:1.0.0-SNAPSHOT + ``` + +4. Debugging the Tez AM container: +Uncomment the JAVA_TOOL_OPTIONS in tez.env and expose 5005 port using -p flag + + ```bash + docker run --rm \ + -p 10001:10001 -p 8042:8042 -p 5005:5005 \ + -e TEZ_FRAMEWORK_MODE="STANDALONE_ZOOKEEPER" \ + --env-file tez.env \ + --name tez-am \ + apache/tez-am:1.0.0-SNAPSHOT + ``` + +5. To override the tez-site.xml in docker image use: + +```bash + docker run --rm \ + -p 10001:10001 -p 8042:8042 -p 5005:5005 \ + -e TEZ_FRAMEWORK_MODE="STANDALONE_ZOOKEEPER" \ + --env-file tez.env \ + -v "$(pwd)/conf/tez-site.xml:/opt/tez/custom-conf/tez-site.xml" \ + --name tez-am \ + apache/tez-am:1.0.0-SNAPSHOT + ``` diff --git a/tez-dist/src/docker/build-docker.sh b/tez-dist/src/docker/build-docker.sh new file mode 100755 index 0000000000..fabe94ed77 --- /dev/null +++ b/tez-dist/src/docker/build-docker.sh @@ -0,0 +1,128 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +set -xeou pipefail + +HADOOP_VERSION= +TEZ_VERSION= +REPO= + +usage() { + cat <&2 +Usage: $0 [-h] [-hadoop ] [-tez ] [-repo ] +Build the Apache Tez AM Docker image +-help Display help +-hadoop Build image with the specified Hadoop version +-tez Build image with the specified Tez version +-repo Docker repository +EOF +} + +while [ $# -gt 0 ]; do + case "$1" in + -h) + usage + exit 0 + ;; + -hadoop) + shift + HADOOP_VERSION=$1 + shift + ;; + -tez) + shift + TEZ_VERSION=$1 + shift + ;; + -repo) + shift + REPO=$1 + shift + ;; + *) + shift + ;; + esac +done + +SCRIPT_DIR=$( + cd "$(dirname "$0")" + pwd +) + +DIST_DIR=${DIST_DIR:-"$SCRIPT_DIR/../.."} +PROJECT_ROOT=${PROJECT_ROOT:-"$SCRIPT_DIR/../../.."} + +repo=${REPO:-apache} +WORK_DIR="$(mktemp -d)" +CACHE_DIR="$SCRIPT_DIR/cache" +mkdir -p "$CACHE_DIR" + +# Defaults Hadoop and Tez versions from pom.xml if not provided +HADOOP_VERSION=${HADOOP_VERSION:-$(mvn -f "$PROJECT_ROOT/pom.xml" -q help:evaluate -Dexpression=hadoop.version -DforceStdout)} +TEZ_VERSION=${TEZ_VERSION:-$(mvn -f "$PROJECT_ROOT/pom.xml" -q help:evaluate -Dexpression=project.version -DforceStdout)} + +###################### +# HADOOP FETCH LOGIC # +###################### +HADOOP_FILE_NAME="hadoop-$HADOOP_VERSION.tar.gz" +HADOOP_URL=${HADOOP_URL:-"https://archive.apache.org/dist/hadoop/core/hadoop-$HADOOP_VERSION/$HADOOP_FILE_NAME"} +if [ ! -f "$CACHE_DIR/$HADOOP_FILE_NAME" ]; then + echo "Downloading Hadoop from $HADOOP_URL..." + if ! 
curl --fail -L "$HADOOP_URL" -o "$CACHE_DIR/$HADOOP_FILE_NAME.tmp"; then + echo "Fail to download Hadoop, exiting...." + exit 1 + fi + mv "$CACHE_DIR/$HADOOP_FILE_NAME.tmp" "$CACHE_DIR/$HADOOP_FILE_NAME" +fi + +##################################### +# Pick tez tarball from local build # +##################################### +TEZ_FILE_NAME="tez-$TEZ_VERSION.tar.gz" +LOCAL_DIST_PATH="$DIST_DIR/target/$TEZ_FILE_NAME" + +if [ -f "$LOCAL_DIST_PATH" ]; then + echo "--> Found local Tez build artifact at: $LOCAL_DIST_PATH" + cp "$LOCAL_DIST_PATH" "$WORK_DIR/" +else + echo "--> Error: Local Tez artifact not found at $LOCAL_DIST_PATH" + echo "--> Please build the project first (e.g., mvn clean install -DskipTests)." + exit 1 +fi + +# ------------------------------------------------------------------------- +# BUILD CONTEXT PREPARATION +# ------------------------------------------------------------------------- +cp "$CACHE_DIR/$HADOOP_FILE_NAME" "$WORK_DIR/" +cp -R "$SCRIPT_DIR/conf" "$WORK_DIR/" 2>/dev/null || mkdir -p "$WORK_DIR/conf" +cp "$SCRIPT_DIR/entrypoint.sh" "$WORK_DIR/" +cp "$SCRIPT_DIR/Dockerfile" "$WORK_DIR/" + +echo "Building Docker image..." +docker build \ + "$WORK_DIR" \ + -f "$WORK_DIR/Dockerfile" \ + -t "$repo/tez-am:$TEZ_VERSION" \ + --build-arg "BUILD_ENV=unarchive" \ + --build-arg "HADOOP_VERSION=$HADOOP_VERSION" \ + --build-arg "TEZ_VERSION=$TEZ_VERSION" + +rm -r "${WORK_DIR}" +echo "Docker image $repo/tez-am:$TEZ_VERSION built successfully." diff --git a/tez-dist/src/docker/conf/log4j2.properties b/tez-dist/src/docker/conf/log4j2.properties new file mode 100644 index 0000000000..ddccb1b184 --- /dev/null +++ b/tez-dist/src/docker/conf/log4j2.properties @@ -0,0 +1,25 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +appender.console.type = Console +appender.console.name = console +appender.console.target = SYSTEM_ERR +appender.console.layout.type = PatternLayout +appender.console.layout.pattern = %d{ISO8601} %5p [%t] %c{2}: %m%n + +rootLogger.level = INFO +rootLogger.appenderRef.console.ref = console diff --git a/tez-dist/src/docker/conf/tez-site.xml b/tez-dist/src/docker/conf/tez-site.xml new file mode 100644 index 0000000000..a38a9ae6d0 --- /dev/null +++ b/tez-dist/src/docker/conf/tez-site.xml @@ -0,0 +1,61 @@ + + + + + + + tez.am.client.am.port-range + 10001-10003 + + + + tez.am.resource.memory.mb + 1024 + + + + tez.framework.mode + STANDALONE_ZOOKEEPER + + + + tez.am.tez-ui.webservice.enable + false + + + + tez.am.zookeeper.quorum + host.docker.internal:2181 + + + + tez.am.log.level + DEBUG + + + + tez.am.mode.session + true + + + + tez.local.mode + true + + + diff --git a/tez-dist/src/docker/entrypoint.sh b/tez-dist/src/docker/entrypoint.sh new file mode 100644 index 0000000000..543c580ff2 --- /dev/null +++ b/tez-dist/src/docker/entrypoint.sh @@ -0,0 +1,153 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +set -xeou pipefail + +################################################ +# 1. Mocking DAGAppMaster#main() env variables # +################################################ + +: "${CONTAINER_ID:="container_1700000000000_0001_01_000001"}" +: "${USER:="tez"}" +: "${HADOOP_USER_NAME:="tez"}" +: "${NM_HOST:="localhost"}" +: "${NM_PORT:="12345"}" +: "${NM_HTTP_PORT:="8042"}" +: "${LOCAL_DIRS:="/tmp"}" +: "${LOG_DIRS:="/opt/tez/logs"}" +: "${APP_SUBMIT_TIME_ENV:=$(($(date +%s) * 1000))}" +: "${TEZ_AM_EXTERNAL_ID:="tez-session-$(hostname)"}" + +export CONTAINER_ID USER HADOOP_USER_NAME NM_HOST NM_PORT NM_HTTP_PORT \ + LOCAL_DIRS LOG_DIRS APP_SUBMIT_TIME_ENV TEZ_AM_EXTERNAL_ID + +if [[ ! -f "tez-conf.pb" ]]; then + touch "tez-conf.pb" + echo "--> Created dummy tez-conf.pb" +fi + +if [[ ! -f "tez-dag.pb" ]]; then + touch "tez-dag.pb" + echo "--> Created dummy tez-dag.pb" +fi + +mkdir -p "$LOG_DIRS" + +########################## +# CONFIGURATION HANDLING # +########################## + +# Symlink hadoop conf in tez conf dir +if [[ -d "$HADOOP_HOME/etc/hadoop" ]]; then + echo "--> Linking missing Hadoop configs to $TEZ_CONF_DIR..." + for f in "$HADOOP_HOME/etc/hadoop"/*; do + basename=$(basename "$f") + # this check helps in case user wants to provide its custom hfds-site.xml + # or any other configuration file + if [[ ! 
-e "$TEZ_CONF_DIR/$basename" ]]; then + ln -s "$f" "$TEZ_CONF_DIR/$basename" + fi + done +fi + +########################### +# Custom Config directory # +########################### +if [[ -n "${TEZ_CUSTOM_CONF_DIR:-}" ]] && [[ -d "$TEZ_CUSTOM_CONF_DIR" ]]; then + echo "--> Using custom configuration directory: $TEZ_CUSTOM_CONF_DIR" + find "${TEZ_CUSTOM_CONF_DIR}" -type f -exec \ + ln -sf {} "${TEZ_CONF_DIR}"/ \; + + # Remove template keyword if it exist + if [[ -f "$TEZ_CONF_DIR/tez-site.xml.template" ]]; then + envsubst < "$TEZ_CONF_DIR/tez-site.xml.template" > "$TEZ_CONF_DIR/tez-site.xml" + fi +fi + +############# +# CLASSPATH # +############# + +export HADOOP_USER_CLASSPATH_FIRST=true +# Order is: conf -> plugins -> tez jars -> hadoop jars +CLASSPATH="${TEZ_CONF_DIR}" + +# Custom Plugins +# This allows mounting a volume at /opt/tez/plugins containing aux jars +PLUGIN_DIR="/opt/tez/plugins" +if [[ -d "$PLUGIN_DIR" ]]; then + count=$(find "$PLUGIN_DIR" -maxdepth 1 -name "*.jar" 2>/dev/null | wc -l) + if [ "$count" != "0" ]; then + echo "--> Found $count plugin jars. Prepending to classpath." + CLASSPATH="${CLASSPATH}:${PLUGIN_DIR}/*" + fi +fi + +# Tez Jars +CLASSPATH="${CLASSPATH}:${TEZ_HOME}/*:${TEZ_HOME}/lib/*" + +# Hadoop Jars +CLASSPATH="${CLASSPATH}:${HADOOP_HOME}/share/hadoop/common/*" +CLASSPATH="${CLASSPATH}:${HADOOP_HOME}/share/hadoop/common/lib/*" +CLASSPATH="${CLASSPATH}:${HADOOP_HOME}/share/hadoop/hdfs/*" +CLASSPATH="${CLASSPATH}:${HADOOP_HOME}/share/hadoop/hdfs/lib/*" +CLASSPATH="${CLASSPATH}:${HADOOP_HOME}/share/hadoop/yarn/*" +CLASSPATH="${CLASSPATH}:${HADOOP_HOME}/share/hadoop/yarn/lib/*" +CLASSPATH="${CLASSPATH}:${HADOOP_HOME}/share/hadoop/mapreduce/*" +CLASSPATH="${CLASSPATH}:${HADOOP_HOME}/share/hadoop/mapreduce/lib/*" + +############# +# Execution # +############# +TEZ_DAG_JAR=$(find "$TEZ_HOME" -maxdepth 1 -name "tez-dag-*.jar" ! 
-name "*-tests.jar" | head -n 1) + +if [ -z "$TEZ_DAG_JAR" ]; then + echo "Error: Could not find tez-dag-*.jar in $TEZ_HOME" + exit 1 +fi + +echo "--> Starting DAGAppMaster..." +echo "--> HADOOP_CONF_DIR: $HADOOP_CONF_DIR" + +: "${TEZ_AM_HEAP_OPTS:="-Xmx2048m"}" + +# Check for Log4j2 Configuration +LOG4J2_FILE="$TEZ_CONF_DIR/log4j2.properties" +if [[ -f "$LOG4J2_FILE" ]]; then + echo "--> [TEZ-AM] Found Log4j2 configuration: $LOG4J2_FILE" + JAVA_OPTS="${JAVA_OPTS:+$JAVA_OPTS }-Dlog4j.configurationFile=file:$LOG4J2_FILE" +fi + +JAVA_ADD_OPENS=( + "--add-opens=java.base/java.lang=ALL-UNNAMED" + "--add-opens=java.base/java.util=ALL-UNNAMED" + "--add-opens=java.base/java.io=ALL-UNNAMED" +) + +read -r -a JAVA_OPTS_ARR <<< "${JAVA_OPTS:-}" +read -r -a HEAP_OPTS_ARR <<< "${TEZ_AM_HEAP_OPTS}" + +exec java "${HEAP_OPTS_ARR[@]}" "${JAVA_OPTS_ARR[@]}" "${JAVA_ADD_OPENS[@]}" \ + -Duser.name="$HADOOP_USER_NAME" \ + -Djava.library.path="$HADOOP_HOME/lib/native" \ + -Dhadoop.home.dir="$HADOOP_HOME" \ + -Dhadoop.log.dir="$LOG_DIRS" \ + -Dtez.conf.dir="$TEZ_CONF_DIR" \ + -cp "$CLASSPATH" \ + org.apache.tez.dag.app.DAGAppMaster --session \ + "$@" diff --git a/tez-dist/src/docker/tez.env b/tez-dist/src/docker/tez.env new file mode 100644 index 0000000000..ce7d4d278f --- /dev/null +++ b/tez-dist/src/docker/tez.env @@ -0,0 +1,31 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Tez AM Container Environment Configuration + +HADOOP_USER_NAME=tez +USER=tez +CONTAINER_ID=container_1700000000000_0001_01_000001 +NM_HOST=localhost +NM_PORT=12345 +NM_HTTP_PORT=8042 +TEZ_FRAMEWORK_MODE=STANDALONE_ZOOKEEPER +TEZ_AM_ZOOKEEPER_QUORUM=host.docker.internal:2181 +TEZ_AM_LOG_LEVEL=INFO +# TEZ_CUSTOM_CONF_DIR=/opt/tez/custom-conf +# Enable remote debugging on port 5005 +#JAVA_TOOL_OPTIONS='-agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=*:5005' From 5fc4ec45dd134939f6ce6815d4d3e2242c563308 Mon Sep 17 00:00:00 2001 From: Raghav Aggarwal Date: Sat, 21 Feb 2026 22:05:49 +0530 Subject: [PATCH 02/10] move back to Configuration --- .../java/org/apache/tez/dag/app/DAGAppMaster.java | 2 +- tez-dist/src/docker/conf/tez-site.xml | 14 ++------------ tez-dist/src/docker/entrypoint.sh | 10 ---------- tez-dist/src/docker/tez.env | 2 -- 4 files changed, 3 insertions(+), 25 deletions(-) diff --git a/tez-dag/src/main/java/org/apache/tez/dag/app/DAGAppMaster.java b/tez-dag/src/main/java/org/apache/tez/dag/app/DAGAppMaster.java index caab2f2a43..c119d572ab 100644 --- a/tez-dag/src/main/java/org/apache/tez/dag/app/DAGAppMaster.java +++ b/tez-dag/src/main/java/org/apache/tez/dag/app/DAGAppMaster.java @@ -2425,7 +2425,7 @@ public static void main(String[] args) { Objects.requireNonNull(appSubmitTimeStr, ApplicationConstants.APP_SUBMIT_TIME_ENV + " is null"); - Configuration conf = new TezConfiguration(); + Configuration conf = new Configuration(); ServerFrameworkService frameworkService = getFrameworkService(conf); AMExtensions amExtensions = 
frameworkService.getAMExtensions(); diff --git a/tez-dist/src/docker/conf/tez-site.xml b/tez-dist/src/docker/conf/tez-site.xml index a38a9ae6d0..681ecc30b7 100644 --- a/tez-dist/src/docker/conf/tez-site.xml +++ b/tez-dist/src/docker/conf/tez-site.xml @@ -23,16 +23,6 @@ 10001-10003 - - tez.am.resource.memory.mb - 1024 - - - - tez.framework.mode - STANDALONE_ZOOKEEPER - - tez.am.tez-ui.webservice.enable false @@ -49,8 +39,8 @@ - tez.am.mode.session - true + fs.defaultFS + hdfs://host.docker.internal:9000 diff --git a/tez-dist/src/docker/entrypoint.sh b/tez-dist/src/docker/entrypoint.sh index 543c580ff2..06715aba16 100644 --- a/tez-dist/src/docker/entrypoint.sh +++ b/tez-dist/src/docker/entrypoint.sh @@ -36,16 +36,6 @@ set -xeou pipefail export CONTAINER_ID USER HADOOP_USER_NAME NM_HOST NM_PORT NM_HTTP_PORT \ LOCAL_DIRS LOG_DIRS APP_SUBMIT_TIME_ENV TEZ_AM_EXTERNAL_ID -if [[ ! -f "tez-conf.pb" ]]; then - touch "tez-conf.pb" - echo "--> Created dummy tez-conf.pb" -fi - -if [[ ! -f "tez-dag.pb" ]]; then - touch "tez-dag.pb" - echo "--> Created dummy tez-dag.pb" -fi - mkdir -p "$LOG_DIRS" ########################## diff --git a/tez-dist/src/docker/tez.env b/tez-dist/src/docker/tez.env index ce7d4d278f..ed2d208f61 100644 --- a/tez-dist/src/docker/tez.env +++ b/tez-dist/src/docker/tez.env @@ -24,8 +24,6 @@ NM_HOST=localhost NM_PORT=12345 NM_HTTP_PORT=8042 TEZ_FRAMEWORK_MODE=STANDALONE_ZOOKEEPER -TEZ_AM_ZOOKEEPER_QUORUM=host.docker.internal:2181 -TEZ_AM_LOG_LEVEL=INFO # TEZ_CUSTOM_CONF_DIR=/opt/tez/custom-conf # Enable remote debugging on port 5005 #JAVA_TOOL_OPTIONS='-agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=*:5005' From 8510a5ad1a59a06256f9db526ed876d62d0b266d Mon Sep 17 00:00:00 2001 From: Raghav Aggarwal Date: Wed, 25 Feb 2026 21:22:04 +0530 Subject: [PATCH 03/10] Review Comments --- .gitignore | 1 + tez-dist/pom.xml | 9 +-- tez-dist/src/docker/Dockerfile | 10 ++-- tez-dist/src/docker/README.md | 60 ++++++++++++++----- 
tez-dist/src/docker/build-docker.sh | 10 ++-- tez-dist/src/docker/conf/tez-site.xml | 2 + .../{entrypoint.sh => tez-am-entrypoint.sh} | 2 +- tez-dist/src/docker/tez.env | 11 ++-- 8 files changed, 69 insertions(+), 36 deletions(-) rename tez-dist/src/docker/{entrypoint.sh => tez-am-entrypoint.sh} (99%) diff --git a/.gitignore b/.gitignore index 85d660672c..e0df115492 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,4 @@ .settings pom.xml.versionsBackup target +tez-dist/src/docker/cache/ diff --git a/tez-dist/pom.xml b/tez-dist/pom.xml index 31dae3a28e..5271970279 100644 --- a/tez-dist/pom.xml +++ b/tez-dist/pom.xml @@ -136,12 +136,9 @@ /bin/bash ${project.basedir}/src/docker/build-docker.sh - -hadoop - ${hadoop.version} - -tez - ${project.version} - -repo - apache + -hadoop ${hadoop.version} + -tez ${project.version} + -repo apache diff --git a/tez-dist/src/docker/Dockerfile b/tez-dist/src/docker/Dockerfile index 680da464ff..0d40c33253 100644 --- a/tez-dist/src/docker/Dockerfile +++ b/tez-dist/src/docker/Dockerfile @@ -44,7 +44,7 @@ RUN mkdir -p /opt/hadoop \ -C /opt/tez \ && rm -rf /opt/hadoop-$HADOOP_VERSION.tar.gz /opt/tez-$TEZ_VERSION.tar.gz -FROM eclipse-temurin:21.0.3_9-jre-ubi9-minimal AS run +FROM eclipse-temurin:21-jdk-ubi9-minimal AS run ARG UID=1000 ARG HADOOP_VERSION @@ -54,7 +54,7 @@ ARG TEZ_VERSION # hadolint ignore=DL3041 RUN set -ex; \ microdnf update -y; \ - microdnf -y install procps gettext findutils; \ + microdnf -y install procps gettext findutils hostname; \ microdnf clean all; \ useradd --no-create-home -s /sbin/nologin -c "" --uid $UID tez @@ -74,13 +74,13 @@ COPY --from=env --chown=tez /opt/tez $TEZ_HOME RUN mkdir -p $TEZ_CONF_DIR && chown tez:tez $TEZ_CONF_DIR -COPY --chown=tez entrypoint.sh / +COPY --chown=tez tez-am-entrypoint.sh / COPY --chown=tez conf $TEZ_CONF_DIR # Create Extension Point Directory RUN mkdir -p /opt/tez/plugins && chown tez:tez /opt/tez/plugins && chmod 755 /opt/tez/plugins -RUN chmod +x /entrypoint.sh +RUN chmod 
+x /tez-am-entrypoint.sh USER tez WORKDIR $TEZ_HOME @@ -88,4 +88,4 @@ WORKDIR $TEZ_HOME # Expose AM ports via -p flag in docker command # EXPOSE 10001 10002 10003 8042 -ENTRYPOINT ["/entrypoint.sh"] +ENTRYPOINT ["/tez-am-entrypoint.sh"] diff --git a/tez-dist/src/docker/README.md b/tez-dist/src/docker/README.md index b055d8b629..0c15f613b9 100644 --- a/tez-dist/src/docker/README.md +++ b/tez-dist/src/docker/README.md @@ -21,7 +21,7 @@ 1. Building the docker image: ```bash - mvn clean install -DskipTests -Pdocker,tools + mvn clean install -DskipTests -Pdocker ``` 2. Install zookeeper in mac by: @@ -34,32 +34,62 @@ 3. Running the Tez AM container: ```bash - docker run \ - -p 10001:10001 -p 8042:8042 \ + export TEZ_VERSION=1.0.0-SNAPSHOT + + docker run --rm \ + -p 10001:10001 \ + --env-file tez-dist/src/docker/tez.env \ --name tez-am \ - apache/tez-am:1.0.0-SNAPSHOT + --hostname localhost \ + apache/tez-am:$TEZ_VERSION ``` + * `TEZ_VERSION` corresponds to the Maven `${project.version}`. + Set this environment variable in your shell before running the commands. + * Expose ports using the `-p` flag based on the + `tez.am.client.am.port-range` property in `tez-site.xml`. + * The `--hostname` flag configures the container's hostname, allowing + services on the host (e.g., macOS) to connect to it. + * Ensure the `--env-file` flag is included, or at a minimum, pass + `-e TEZ_FRAMEWORK_MODE=STANDALONE_ZOOKEEPER` to the `docker run` command. + 4. Debugging the Tez AM container: -Uncomment the JAVA_TOOL_OPTIONS in tez.env and expose 5005 port using -p flag +Uncomment the `JAVA_TOOL_OPTIONS` in `tez.env` and expose 5005 port using `-p` flag ```bash docker run --rm \ - -p 10001:10001 -p 8042:8042 -p 5005:5005 \ - -e TEZ_FRAMEWORK_MODE="STANDALONE_ZOOKEEPER" \ - --env-file tez.env \ + -p 10001:10001 -p 5005:5005 \ + --env-file tez-dist/src/docker/tez.env \ --name tez-am \ - apache/tez-am:1.0.0-SNAPSHOT + --hostname localhost \ + apache/tez-am:$TEZ_VERSION ``` 5. 
To override the tez-site.xml in docker image use: + * Set the `TEZ_CUSTOM_CONF_DIR` environment variable in `tez.env` + or via the `docker run` command (e.g., `/opt/tez/custom-conf`). + + ```bash + export TEZ_SITE_PATH=$(pwd)/tez-dist/src/docker/conf/tez-site.xml + + docker run --rm \ + -p 10001:10001 \ + --env-file tez-dist/src/docker/tez.env \ + -v "$TEZ_SITE_PATH:/opt/tez/custom-conf/tez-site.xml" \ + --name tez-am \ + --hostname localhost \ + apache/tez-am:$TEZ_VERSION + ``` -```bash +6. To add plugin jars in docker image use: + * The plugin directory path inside the Docker container is fixed at `/opt/tez/plugins`. + + ```bash docker run --rm \ - -p 10001:10001 -p 8042:8042 -p 5005:5005 \ - -e TEZ_FRAMEWORK_MODE="STANDALONE_ZOOKEEPER" \ - --env-file tez.env \ - -v "$(pwd)/conf/tez-site.xml:/opt/tez/custom-conf/tez-site.xml" \ + -p 10001:10001 \ + --env-file tez-dist/src/docker/tez.env \ + -v "/path/to/your/local/plugins:/opt/tez/plugins" \ --name tez-am \ - apache/tez-am:1.0.0-SNAPSHOT + --hostname localhost \ + apache/tez-am:$TEZ_VERSION ``` diff --git a/tez-dist/src/docker/build-docker.sh b/tez-dist/src/docker/build-docker.sh index fabe94ed77..3642aa7834 100755 --- a/tez-dist/src/docker/build-docker.sh +++ b/tez-dist/src/docker/build-docker.sh @@ -69,7 +69,7 @@ SCRIPT_DIR=$( DIST_DIR=${DIST_DIR:-"$SCRIPT_DIR/../.."} PROJECT_ROOT=${PROJECT_ROOT:-"$SCRIPT_DIR/../../.."} -repo=${REPO:-apache} +REPO=${REPO:-apache} WORK_DIR="$(mktemp -d)" CACHE_DIR="$SCRIPT_DIR/cache" mkdir -p "$CACHE_DIR" @@ -82,7 +82,7 @@ TEZ_VERSION=${TEZ_VERSION:-$(mvn -f "$PROJECT_ROOT/pom.xml" -q help:evaluate -De # HADOOP FETCH LOGIC # ###################### HADOOP_FILE_NAME="hadoop-$HADOOP_VERSION.tar.gz" -HADOOP_URL=${HADOOP_URL:-"https://archive.apache.org/dist/hadoop/core/hadoop-$HADOOP_VERSION/$HADOOP_FILE_NAME"} +HADOOP_URL=${HADOOP_URL:-"https://dlcdn.apache.org/hadoop/common/hadoop-$HADOOP_VERSION/$HADOOP_FILE_NAME"} if [ ! 
-f "$CACHE_DIR/$HADOOP_FILE_NAME" ]; then echo "Downloading Hadoop from $HADOOP_URL..." if ! curl --fail -L "$HADOOP_URL" -o "$CACHE_DIR/$HADOOP_FILE_NAME.tmp"; then @@ -112,17 +112,17 @@ fi # ------------------------------------------------------------------------- cp "$CACHE_DIR/$HADOOP_FILE_NAME" "$WORK_DIR/" cp -R "$SCRIPT_DIR/conf" "$WORK_DIR/" 2>/dev/null || mkdir -p "$WORK_DIR/conf" -cp "$SCRIPT_DIR/entrypoint.sh" "$WORK_DIR/" +cp "$SCRIPT_DIR/tez-am-entrypoint.sh" "$WORK_DIR/" cp "$SCRIPT_DIR/Dockerfile" "$WORK_DIR/" echo "Building Docker image..." docker build \ "$WORK_DIR" \ -f "$WORK_DIR/Dockerfile" \ - -t "$repo/tez-am:$TEZ_VERSION" \ + -t "$REPO/tez-am:$TEZ_VERSION" \ --build-arg "BUILD_ENV=unarchive" \ --build-arg "HADOOP_VERSION=$HADOOP_VERSION" \ --build-arg "TEZ_VERSION=$TEZ_VERSION" rm -r "${WORK_DIR}" -echo "Docker image $repo/tez-am:$TEZ_VERSION built successfully." +echo "Docker image $REPO/tez-am:$TEZ_VERSION built successfully." diff --git a/tez-dist/src/docker/conf/tez-site.xml b/tez-dist/src/docker/conf/tez-site.xml index 681ecc30b7..b1b2b55caa 100644 --- a/tez-dist/src/docker/conf/tez-site.xml +++ b/tez-dist/src/docker/conf/tez-site.xml @@ -38,10 +38,12 @@ DEBUG + tez.local.mode diff --git a/tez-dist/src/docker/entrypoint.sh b/tez-dist/src/docker/tez-am-entrypoint.sh similarity index 99% rename from tez-dist/src/docker/entrypoint.sh rename to tez-dist/src/docker/tez-am-entrypoint.sh index 06715aba16..e4d96394ad 100644 --- a/tez-dist/src/docker/entrypoint.sh +++ b/tez-dist/src/docker/tez-am-entrypoint.sh @@ -63,7 +63,7 @@ if [[ -n "${TEZ_CUSTOM_CONF_DIR:-}" ]] && [[ -d "$TEZ_CUSTOM_CONF_DIR" ]]; then find "${TEZ_CUSTOM_CONF_DIR}" -type f -exec \ ln -sf {} "${TEZ_CONF_DIR}"/ \; - # Remove template keyword if it exist + # Remove template keyword if it exists if [[ -f "$TEZ_CONF_DIR/tez-site.xml.template" ]]; then envsubst < "$TEZ_CONF_DIR/tez-site.xml.template" > "$TEZ_CONF_DIR/tez-site.xml" fi diff --git a/tez-dist/src/docker/tez.env 
b/tez-dist/src/docker/tez.env index ed2d208f61..832bb986da 100644 --- a/tez-dist/src/docker/tez.env +++ b/tez-dist/src/docker/tez.env @@ -17,13 +17,16 @@ # Tez AM Container Environment Configuration -HADOOP_USER_NAME=tez -USER=tez CONTAINER_ID=container_1700000000000_0001_01_000001 +USER=tez +HADOOP_USER_NAME=tez NM_HOST=localhost NM_PORT=12345 NM_HTTP_PORT=8042 +LOG_DIRS=/opt/tez/logs TEZ_FRAMEWORK_MODE=STANDALONE_ZOOKEEPER -# TEZ_CUSTOM_CONF_DIR=/opt/tez/custom-conf +TEZ_CUSTOM_CONF_DIR=/opt/tez/custom-conf +# TEZ_AM_HEAP_OPTS configures the maximum heap size (Xmx) for the Tez AM. +TEZ_AM_HEAP_OPTS=-Xmx2048m # Enable remote debugging on port 5005 -#JAVA_TOOL_OPTIONS='-agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=*:5005' +# JAVA_TOOL_OPTIONS='-agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=*:5005' From 9b311c45c182e36ae1d35e7600e86795d70bd3ef Mon Sep 17 00:00:00 2001 From: Raghav Aggarwal Date: Sat, 28 Feb 2026 00:36:27 +0530 Subject: [PATCH 04/10] Get rid of hadoop dependency from docker image --- tez-dist/pom.xml | 1 - tez-dist/src/docker/Dockerfile | 30 ++++----------------- tez-dist/src/docker/build-docker.sh | 28 ++----------------- tez-dist/src/docker/tez-am-entrypoint.sh | 34 +----------------------- 4 files changed, 8 insertions(+), 85 deletions(-) diff --git a/tez-dist/pom.xml b/tez-dist/pom.xml index 5271970279..fff9980c53 100644 --- a/tez-dist/pom.xml +++ b/tez-dist/pom.xml @@ -136,7 +136,6 @@ /bin/bash ${project.basedir}/src/docker/build-docker.sh - -hadoop ${hadoop.version} -tez ${project.version} -repo apache diff --git a/tez-dist/src/docker/Dockerfile b/tez-dist/src/docker/Dockerfile index 0d40c33253..883652246b 100644 --- a/tez-dist/src/docker/Dockerfile +++ b/tez-dist/src/docker/Dockerfile @@ -20,34 +20,21 @@ ARG BUILD_ENV=unarchive # hadolint ignore=DL3006 FROM ubuntu AS unarchive # hadolint ignore=DL3010 -ONBUILD COPY hadoop-*.tar.gz /opt -# hadolint ignore=DL3010 ONBUILD COPY tez-*.tar.gz /opt # hadolint 
ignore=DL3006 FROM ${BUILD_ENV} AS env -ARG HADOOP_VERSION ARG TEZ_VERSION -RUN mkdir -p /opt/hadoop \ - && tar -xzv \ - --exclude="hadoop-$HADOOP_VERSION/share/doc" \ - --exclude="*/jdiff" \ - --exclude="*/sources" \ - --exclude="*tests.jar" \ - --exclude="*/webapps" \ - -f /opt/hadoop-$HADOOP_VERSION.tar.gz \ - -C /opt/hadoop --strip-components 1 \ - && mkdir -p /opt/tez \ +RUN mkdir -p /opt/tez \ && tar -xzv \ -f /opt/tez-$TEZ_VERSION.tar.gz \ -C /opt/tez \ - && rm -rf /opt/hadoop-$HADOOP_VERSION.tar.gz /opt/tez-$TEZ_VERSION.tar.gz + && rm -rf /opt/tez-$TEZ_VERSION.tar.gz FROM eclipse-temurin:21-jdk-ubi9-minimal AS run ARG UID=1000 -ARG HADOOP_VERSION ARG TEZ_VERSION # Install dependencies @@ -59,17 +46,13 @@ RUN set -ex; \ useradd --no-create-home -s /sbin/nologin -c "" --uid $UID tez # Set necessary environment variables -ENV HADOOP_HOME=/opt/hadoop \ - TEZ_HOME=/opt/tez \ - TEZ_CONF_DIR=/opt/tez/conf \ - HADOOP_CONF_DIR=/opt/tez/conf +ENV TEZ_HOME=/opt/tez \ + TEZ_CONF_DIR=/opt/tez/conf ENV TEZ_CLIENT_VERSION=$TEZ_VERSION -ENV PATH=$TEZ_HOME/bin:$HADOOP_HOME/bin:$PATH +ENV PATH=$TEZ_HOME/bin:$PATH -COPY --from=env --chown=tez /opt/hadoop $HADOOP_HOME -# UPDATED: Copy from the normalized directory name created in 'env' stage COPY --from=env --chown=tez /opt/tez $TEZ_HOME RUN mkdir -p $TEZ_CONF_DIR && chown tez:tez $TEZ_CONF_DIR @@ -85,7 +68,4 @@ RUN chmod +x /tez-am-entrypoint.sh USER tez WORKDIR $TEZ_HOME -# Expose AM ports via -p flag in docker command -# EXPOSE 10001 10002 10003 8042 - ENTRYPOINT ["/tez-am-entrypoint.sh"] diff --git a/tez-dist/src/docker/build-docker.sh b/tez-dist/src/docker/build-docker.sh index 3642aa7834..91926cf33f 100755 --- a/tez-dist/src/docker/build-docker.sh +++ b/tez-dist/src/docker/build-docker.sh @@ -19,16 +19,14 @@ set -xeou pipefail -HADOOP_VERSION= TEZ_VERSION= REPO= usage() { cat <&2 -Usage: $0 [-h] [-hadoop ] [-tez ] [-repo ] +Usage: $0 [-h] [-tez ] [-repo ] Build the Apache Tez AM Docker image -help Display help --hadoop 
Build image with the specified Hadoop version -tez Build image with the specified Tez version -repo Docker repository EOF @@ -40,11 +38,6 @@ while [ $# -gt 0 ]; do usage exit 0 ;; - -hadoop) - shift - HADOOP_VERSION=$1 - shift - ;; -tez) shift TEZ_VERSION=$1 @@ -74,24 +67,9 @@ WORK_DIR="$(mktemp -d)" CACHE_DIR="$SCRIPT_DIR/cache" mkdir -p "$CACHE_DIR" -# Defaults Hadoop and Tez versions from pom.xml if not provided -HADOOP_VERSION=${HADOOP_VERSION:-$(mvn -f "$PROJECT_ROOT/pom.xml" -q help:evaluate -Dexpression=hadoop.version -DforceStdout)} +# Defaults Tez versions from pom.xml if not provided TEZ_VERSION=${TEZ_VERSION:-$(mvn -f "$PROJECT_ROOT/pom.xml" -q help:evaluate -Dexpression=project.version -DforceStdout)} -###################### -# HADOOP FETCH LOGIC # -###################### -HADOOP_FILE_NAME="hadoop-$HADOOP_VERSION.tar.gz" -HADOOP_URL=${HADOOP_URL:-"https://dlcdn.apache.org/hadoop/common/hadoop-$HADOOP_VERSION/$HADOOP_FILE_NAME"} -if [ ! -f "$CACHE_DIR/$HADOOP_FILE_NAME" ]; then - echo "Downloading Hadoop from $HADOOP_URL..." - if ! curl --fail -L "$HADOOP_URL" -o "$CACHE_DIR/$HADOOP_FILE_NAME.tmp"; then - echo "Fail to download Hadoop, exiting...." 
- exit 1 - fi - mv "$CACHE_DIR/$HADOOP_FILE_NAME.tmp" "$CACHE_DIR/$HADOOP_FILE_NAME" -fi - ##################################### # Pick tez tarball from local build # ##################################### @@ -110,7 +88,6 @@ fi # ------------------------------------------------------------------------- # BUILD CONTEXT PREPARATION # ------------------------------------------------------------------------- -cp "$CACHE_DIR/$HADOOP_FILE_NAME" "$WORK_DIR/" cp -R "$SCRIPT_DIR/conf" "$WORK_DIR/" 2>/dev/null || mkdir -p "$WORK_DIR/conf" cp "$SCRIPT_DIR/tez-am-entrypoint.sh" "$WORK_DIR/" cp "$SCRIPT_DIR/Dockerfile" "$WORK_DIR/" @@ -121,7 +98,6 @@ docker build \ -f "$WORK_DIR/Dockerfile" \ -t "$REPO/tez-am:$TEZ_VERSION" \ --build-arg "BUILD_ENV=unarchive" \ - --build-arg "HADOOP_VERSION=$HADOOP_VERSION" \ --build-arg "TEZ_VERSION=$TEZ_VERSION" rm -r "${WORK_DIR}" diff --git a/tez-dist/src/docker/tez-am-entrypoint.sh b/tez-dist/src/docker/tez-am-entrypoint.sh index e4d96394ad..be144cdaca 100644 --- a/tez-dist/src/docker/tez-am-entrypoint.sh +++ b/tez-dist/src/docker/tez-am-entrypoint.sh @@ -38,23 +38,6 @@ export CONTAINER_ID USER HADOOP_USER_NAME NM_HOST NM_PORT NM_HTTP_PORT \ mkdir -p "$LOG_DIRS" -########################## -# CONFIGURATION HANDLING # -########################## - -# Symlink hadoop conf in tez conf dir -if [[ -d "$HADOOP_HOME/etc/hadoop" ]]; then - echo "--> Linking missing Hadoop configs to $TEZ_CONF_DIR..." - for f in "$HADOOP_HOME/etc/hadoop"/*; do - basename=$(basename "$f") - # this check helps in case user wants to provide its custom hfds-site.xml - # or any other configuration file - if [[ ! 
-e "$TEZ_CONF_DIR/$basename" ]]; then - ln -s "$f" "$TEZ_CONF_DIR/$basename" - fi - done -fi - ########################### # Custom Config directory # ########################### @@ -73,8 +56,7 @@ fi # CLASSPATH # ############# -export HADOOP_USER_CLASSPATH_FIRST=true -# Order is: conf -> plugins -> tez jars -> hadoop jars +# Order is: conf -> plugins -> tez jars CLASSPATH="${TEZ_CONF_DIR}" # Custom Plugins @@ -91,16 +73,6 @@ fi # Tez Jars CLASSPATH="${CLASSPATH}:${TEZ_HOME}/*:${TEZ_HOME}/lib/*" -# Hadoop Jars -CLASSPATH="${CLASSPATH}:${HADOOP_HOME}/share/hadoop/common/*" -CLASSPATH="${CLASSPATH}:${HADOOP_HOME}/share/hadoop/common/lib/*" -CLASSPATH="${CLASSPATH}:${HADOOP_HOME}/share/hadoop/hdfs/*" -CLASSPATH="${CLASSPATH}:${HADOOP_HOME}/share/hadoop/hdfs/lib/*" -CLASSPATH="${CLASSPATH}:${HADOOP_HOME}/share/hadoop/yarn/*" -CLASSPATH="${CLASSPATH}:${HADOOP_HOME}/share/hadoop/yarn/lib/*" -CLASSPATH="${CLASSPATH}:${HADOOP_HOME}/share/hadoop/mapreduce/*" -CLASSPATH="${CLASSPATH}:${HADOOP_HOME}/share/hadoop/mapreduce/lib/*" - ############# # Execution # ############# @@ -112,7 +84,6 @@ if [ -z "$TEZ_DAG_JAR" ]; then fi echo "--> Starting DAGAppMaster..." 
-echo "--> HADOOP_CONF_DIR: $HADOOP_CONF_DIR" : "${TEZ_AM_HEAP_OPTS:="-Xmx2048m"}" @@ -134,9 +105,6 @@ read -r -a HEAP_OPTS_ARR <<< "${TEZ_AM_HEAP_OPTS}" exec java "${HEAP_OPTS_ARR[@]}" "${JAVA_OPTS_ARR[@]}" "${JAVA_ADD_OPENS[@]}" \ -Duser.name="$HADOOP_USER_NAME" \ - -Djava.library.path="$HADOOP_HOME/lib/native" \ - -Dhadoop.home.dir="$HADOOP_HOME" \ - -Dhadoop.log.dir="$LOG_DIRS" \ -Dtez.conf.dir="$TEZ_CONF_DIR" \ -cp "$CLASSPATH" \ org.apache.tez.dag.app.DAGAppMaster --session \ From 5d833ec5578392009bdc5200934c713bf6e8d62b Mon Sep 17 00:00:00 2001 From: Raghav Aggarwal Date: Sat, 7 Mar 2026 16:15:59 +0530 Subject: [PATCH 05/10] Update log4j to use TezLog4jConfigurator and reload4j --- .../resources/tez-container-log4j.properties | 9 ++++++- tez-dist/src/docker/conf/log4j2.properties | 25 ------------------- tez-dist/src/docker/tez-am-entrypoint.sh | 17 +++++++------ 3 files changed, 18 insertions(+), 33 deletions(-) delete mode 100644 tez-dist/src/docker/conf/log4j2.properties diff --git a/tez-dag/src/main/resources/tez-container-log4j.properties b/tez-dag/src/main/resources/tez-container-log4j.properties index 7cebec3289..4525d7018f 100644 --- a/tez-dag/src/main/resources/tez-container-log4j.properties +++ b/tez-dag/src/main/resources/tez-container-log4j.properties @@ -21,9 +21,16 @@ log4j.rootLogger=${tez.root.logger} log4j.threshold=ALL # -# ContainerLog Appender +# Console Appender # +log4j.appender.console=org.apache.log4j.ConsoleAppender +log4j.appender.console.target=System.out +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=%d{ISO8601} [%p] [%t] |%c{2}|: %m%n +# +# ContainerLog Appender +# log4j.appender.CLA=org.apache.tez.common.TezContainerLogAppender log4j.appender.CLA.containerLogDir=${yarn.app.container.log.dir} diff --git a/tez-dist/src/docker/conf/log4j2.properties b/tez-dist/src/docker/conf/log4j2.properties deleted file mode 100644 index ddccb1b184..0000000000 --- 
a/tez-dist/src/docker/conf/log4j2.properties +++ /dev/null @@ -1,25 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -appender.console.type = Console -appender.console.name = console -appender.console.target = SYSTEM_ERR -appender.console.layout.type = PatternLayout -appender.console.layout.pattern = %d{ISO8601} %5p [%t] %c{2}: %m%n - -rootLogger.level = INFO -rootLogger.appenderRef.console.ref = console diff --git a/tez-dist/src/docker/tez-am-entrypoint.sh b/tez-dist/src/docker/tez-am-entrypoint.sh index be144cdaca..a01f3042b5 100644 --- a/tez-dist/src/docker/tez-am-entrypoint.sh +++ b/tez-dist/src/docker/tez-am-entrypoint.sh @@ -86,24 +86,27 @@ fi echo "--> Starting DAGAppMaster..." 
: "${TEZ_AM_HEAP_OPTS:="-Xmx2048m"}" - -# Check for Log4j2 Configuration -LOG4J2_FILE="$TEZ_CONF_DIR/log4j2.properties" -if [[ -f "$LOG4J2_FILE" ]]; then - echo "--> [TEZ-AM] Found Log4j2 configuration: $LOG4J2_FILE" - JAVA_OPTS="${JAVA_OPTS:+$JAVA_OPTS }-Dlog4j.configurationFile=file:$LOG4J2_FILE" -fi +# : "${TEZ_AM_GC_OPTS:="-Xlog:gc*=info,class+load=info::time,uptime,level,tags -XX:+UseNUMA"}" JAVA_ADD_OPENS=( "--add-opens=java.base/java.lang=ALL-UNNAMED" "--add-opens=java.base/java.util=ALL-UNNAMED" "--add-opens=java.base/java.io=ALL-UNNAMED" + "-Dnet.bytebuddy.experimental=true" ) read -r -a JAVA_OPTS_ARR <<< "${JAVA_OPTS:-}" read -r -a HEAP_OPTS_ARR <<< "${TEZ_AM_HEAP_OPTS}" +# read -r -a JAVA_GC_OPTS_ARR <<< "${TEZ_AM_GC_OPTS}" +# Add "${JAVA_GC_OPTS_ARR[@]}" in following command to get gc information. exec java "${HEAP_OPTS_ARR[@]}" "${JAVA_OPTS_ARR[@]}" "${JAVA_ADD_OPENS[@]}" \ + -Djava.net.preferIPv4Stack=true \ + -Djava.io.tmpdir="$PWD/tmp" \ + -Dtez.root.logger=INFO,CLA,console \ + -Dlog4j.configuratorClass=org.apache.tez.common.TezLog4jConfigurator \ + -Dlog4j.configuration=tez-container-log4j.properties \ + -Dyarn.app.container.log.dir="$LOG_DIRS" \ -Duser.name="$HADOOP_USER_NAME" \ -Dtez.conf.dir="$TEZ_CONF_DIR" \ -cp "$CLASSPATH" \ From e360466bf2bbc6797caa40383df1081c339cbb35 Mon Sep 17 00:00:00 2001 From: Raghav Aggarwal Date: Sun, 8 Mar 2026 03:14:33 +0530 Subject: [PATCH 06/10] Improvements --- tez-dist/pom.xml | 2 +- .../{Dockerfile => tez-am/Dockerfile.tez_am} | 0 tez-dist/src/docker/{ => tez-am}/README.md | 67 +++++++++++++------ .../build-tez-am-docker.sh} | 8 +-- .../src/docker/{ => tez-am}/conf/tez-site.xml | 61 +++++++++-------- tez-dist/src/docker/tez-am/docker-compose.yml | 65 ++++++++++++++++++ .../docker/{ => tez-am}/tez-am-entrypoint.sh | 11 ++- .../src/docker/{tez.env => tez-am/tez-am.env} | 3 - 8 files changed, 156 insertions(+), 61 deletions(-) rename tez-dist/src/docker/{Dockerfile => tez-am/Dockerfile.tez_am} (100%) rename 
tez-dist/src/docker/{ => tez-am}/README.md (59%) rename tez-dist/src/docker/{build-docker.sh => tez-am/build-tez-am-docker.sh} (93%) rename tez-dist/src/docker/{ => tez-am}/conf/tez-site.xml (57%) create mode 100644 tez-dist/src/docker/tez-am/docker-compose.yml rename tez-dist/src/docker/{ => tez-am}/tez-am-entrypoint.sh (89%) rename tez-dist/src/docker/{tez.env => tez-am/tez-am.env} (96%) diff --git a/tez-dist/pom.xml b/tez-dist/pom.xml index fff9980c53..1e254665d7 100644 --- a/tez-dist/pom.xml +++ b/tez-dist/pom.xml @@ -135,7 +135,7 @@ /bin/bash - ${project.basedir}/src/docker/build-docker.sh + ${project.basedir}/src/docker/tez-am/build-tez-am-docker.sh -tez ${project.version} -repo apache diff --git a/tez-dist/src/docker/Dockerfile b/tez-dist/src/docker/tez-am/Dockerfile.tez_am similarity index 100% rename from tez-dist/src/docker/Dockerfile rename to tez-dist/src/docker/tez-am/Dockerfile.tez_am diff --git a/tez-dist/src/docker/README.md b/tez-dist/src/docker/tez-am/README.md similarity index 59% rename from tez-dist/src/docker/README.md rename to tez-dist/src/docker/tez-am/README.md index 0c15f613b9..d3da10e2ce 100644 --- a/tez-dist/src/docker/README.md +++ b/tez-dist/src/docker/tez-am/README.md @@ -24,21 +24,37 @@ mvn clean install -DskipTests -Pdocker ``` -2. Install zookeeper in mac by: +2. Install zookeeper in mac: + + a. Via brew: set the `tez.am.zookeeper.quorum` value as + `host.docker.internal:2181` in `tez-site.xml` ```bash brew install zookeeper zkServer start ``` -3. Running the Tez AM container: + b. Use Zookeeper docker image (Refer to docker compose yml): + + ```bash + docker pull zookeeper:3.8.4 + + docker run -d \ + --name zookeeper-server \ + -p 2181:2181 \ + -p 8080:8080 \ + -e ZOO_MY_ID=1 \ + zookeeper:3.8.4 + ``` + +3. 
Running the Tez AM container explicitly: ```bash export TEZ_VERSION=1.0.0-SNAPSHOT - + docker run --rm \ -p 10001:10001 \ - --env-file tez-dist/src/docker/tez.env \ + --env-file tez-dist/src/docker/tez-am/tez-am.env \ --name tez-am \ --hostname localhost \ apache/tez-am:$TEZ_VERSION @@ -54,31 +70,32 @@ `-e TEZ_FRAMEWORK_MODE=STANDALONE_ZOOKEEPER` to the `docker run` command. 4. Debugging the Tez AM container: -Uncomment the `JAVA_TOOL_OPTIONS` in `tez.env` and expose 5005 port using `-p` flag +Uncomment the `JAVA_TOOL_OPTIONS` in `tez-am.env` and expose 5005 port +using `-p` flag ```bash docker run --rm \ -p 10001:10001 -p 5005:5005 \ - --env-file tez-dist/src/docker/tez.env \ + --env-file tez-dist/src/docker/tez-am/tez-am.env \ --name tez-am \ --hostname localhost \ apache/tez-am:$TEZ_VERSION ``` 5. To override the tez-site.xml in docker image use: - * Set the `TEZ_CUSTOM_CONF_DIR` environment variable in `tez.env` + * Set the `TEZ_CUSTOM_CONF_DIR` environment variable in `tez-am.env` or via the `docker run` command (e.g., `/opt/tez/custom-conf`). ```bash export TEZ_SITE_PATH=$(pwd)/tez-dist/src/docker/conf/tez-site.xml docker run --rm \ - -p 10001:10001 \ - --env-file tez-dist/src/docker/tez.env \ - -v "$TEZ_SITE_PATH:/opt/tez/custom-conf/tez-site.xml" \ - --name tez-am \ - --hostname localhost \ - apache/tez-am:$TEZ_VERSION + -p 10001:10001 \ + --env-file tez-dist/src/docker/tez-am/tez-am.env \ + -v "$TEZ_SITE_PATH:/opt/tez/custom-conf/tez-site.xml" \ + --name tez-am \ + --hostname localhost \ + apache/tez-am:$TEZ_VERSION ``` 6. 
To add plugin jars in docker image use: @@ -86,10 +103,22 @@ Uncomment the `JAVA_TOOL_OPTIONS` in `tez.env` and expose 5005 port using `-p` f ```bash docker run --rm \ - -p 10001:10001 \ - --env-file tez-dist/src/docker/tez.env \ - -v "/path/to/your/local/plugins:/opt/tez/plugins" \ - --name tez-am \ - --hostname localhost \ - apache/tez-am:$TEZ_VERSION + -p 10001:10001 \ + --env-file tez-dist/src/docker/tez-am/tez-am.env \ + -v "/path/to/your/local/plugins:/opt/tez/plugins" \ + --name tez-am \ + --hostname localhost \ + apache/tez-am:$TEZ_VERSION ``` + +7. Using Docker Compose: + * Refer to the `docker-compose.yml` file in this directory for + an example of how to run both the Tez AM and Zookeeper containers + together using Docker Compose. + + ```bash + docker-compose -f tez-dist/src/docker/tez-am/docker-compose.yml up -d --build + ``` + + * This command will start both the Tez AM, Zookeeper, Minimal + Hadoop containers as defined in the `docker-compose.yml` file. diff --git a/tez-dist/src/docker/build-docker.sh b/tez-dist/src/docker/tez-am/build-tez-am-docker.sh similarity index 93% rename from tez-dist/src/docker/build-docker.sh rename to tez-dist/src/docker/tez-am/build-tez-am-docker.sh index 91926cf33f..4025a28afc 100755 --- a/tez-dist/src/docker/build-docker.sh +++ b/tez-dist/src/docker/tez-am/build-tez-am-docker.sh @@ -59,8 +59,8 @@ SCRIPT_DIR=$( pwd ) -DIST_DIR=${DIST_DIR:-"$SCRIPT_DIR/../.."} -PROJECT_ROOT=${PROJECT_ROOT:-"$SCRIPT_DIR/../../.."} +DIST_DIR=${DIST_DIR:-"$SCRIPT_DIR/../../.."} +PROJECT_ROOT=${PROJECT_ROOT:-"$SCRIPT_DIR/../../../.."} REPO=${REPO:-apache} WORK_DIR="$(mktemp -d)" @@ -90,12 +90,12 @@ fi # ------------------------------------------------------------------------- cp -R "$SCRIPT_DIR/conf" "$WORK_DIR/" 2>/dev/null || mkdir -p "$WORK_DIR/conf" cp "$SCRIPT_DIR/tez-am-entrypoint.sh" "$WORK_DIR/" -cp "$SCRIPT_DIR/Dockerfile" "$WORK_DIR/" +cp "$SCRIPT_DIR/Dockerfile.tez_am" "$WORK_DIR/" echo "Building Docker image..." 
docker build \ "$WORK_DIR" \ - -f "$WORK_DIR/Dockerfile" \ + -f "$WORK_DIR/Dockerfile.tez_am" \ -t "$REPO/tez-am:$TEZ_VERSION" \ --build-arg "BUILD_ENV=unarchive" \ --build-arg "TEZ_VERSION=$TEZ_VERSION" diff --git a/tez-dist/src/docker/conf/tez-site.xml b/tez-dist/src/docker/tez-am/conf/tez-site.xml similarity index 57% rename from tez-dist/src/docker/conf/tez-site.xml rename to tez-dist/src/docker/tez-am/conf/tez-site.xml index b1b2b55caa..8c752fede3 100644 --- a/tez-dist/src/docker/conf/tez-site.xml +++ b/tez-dist/src/docker/tez-am/conf/tez-site.xml @@ -18,36 +18,35 @@ --> - - tez.am.client.am.port-range - 10001-10003 - - - - tez.am.tez-ui.webservice.enable - false - - - - tez.am.zookeeper.quorum - host.docker.internal:2181 - - - - tez.am.log.level - DEBUG - - - - - - tez.local.mode - true - + + tez.am.client.am.port-range + 10001-10003 + + + + tez.am.tez-ui.webservice.enable + false + + + + tez.am.zookeeper.quorum + zookeeper:2181 + + + + tez.am.log.level + DEBUG + + + + tez.local.mode + true + + + + + tez.session.am.dag.submit.timeout.secs + -1 + diff --git a/tez-dist/src/docker/tez-am/docker-compose.yml b/tez-dist/src/docker/tez-am/docker-compose.yml new file mode 100644 index 0000000000..429c6ed526 --- /dev/null +++ b/tez-dist/src/docker/tez-am/docker-compose.yml @@ -0,0 +1,65 @@ +name: tez-cluster + +services: + zookeeper: + image: zookeeper:3.8.4 + container_name: zookeeper + hostname: zookeeper + networks: + - hadoop-network + ports: + - "2181:2181" + environment: + ZOO_MY_ID: 1 + volumes: + - zookeeper_data:/data + - zookeeper_datalog:/datalog + - zookeeper_logs:/logs + + hadoop-cluster: + image: minimal-hadoop:3.4.2 + container_name: hadoop-cluster + hostname: hadoop-cluster + networks: + - hadoop-network + ports: + - "9870:9870" # NameNode Web UI + - "9000:9000" # IPC + - "9864:9864" # DataNode Web UI + - "9866:9866" # DataNode + volumes: + - hadoop_data:/hadoop + + tez-am: + image: apache/tez-am:${TEZ_VERSION:-1.0.0-SNAPSHOT} + container_name: tez-am 
+ hostname: tez-am + networks: + - hadoop-network + depends_on: + - zookeeper + - hadoop-cluster + ports: + - "10001:10001" + # - "5005:5005" # Uncomment for remote debugging + env_file: + - ./tez-am.env + # environment: + # - TEZ_CUSTOM_CONF_DIR=/opt/tez/custom-conf + # volumes: + # - ./custom-tez-site.xml:/opt/tez/custom-conf/tez-site.xml + +networks: + hadoop-network: + name: hadoop-network + driver: bridge + +volumes: + hadoop_data: + name: hadoop_data + zookeeper_data: + name: zookeeper_data + zookeeper_datalog: + name: zookeeper_datalog + zookeeper_logs: + name: zookeeper_logs diff --git a/tez-dist/src/docker/tez-am-entrypoint.sh b/tez-dist/src/docker/tez-am/tez-am-entrypoint.sh similarity index 89% rename from tez-dist/src/docker/tez-am-entrypoint.sh rename to tez-dist/src/docker/tez-am/tez-am-entrypoint.sh index a01f3042b5..90930e2fce 100644 --- a/tez-dist/src/docker/tez-am-entrypoint.sh +++ b/tez-dist/src/docker/tez-am/tez-am-entrypoint.sh @@ -25,9 +25,6 @@ set -xeou pipefail : "${CONTAINER_ID:="container_1700000000000_0001_01_000001"}" : "${USER:="tez"}" : "${HADOOP_USER_NAME:="tez"}" -: "${NM_HOST:="localhost"}" -: "${NM_PORT:="12345"}" -: "${NM_HTTP_PORT:="8042"}" : "${LOCAL_DIRS:="/tmp"}" : "${LOG_DIRS:="/opt/tez/logs"}" : "${APP_SUBMIT_TIME_ENV:=$(($(date +%s) * 1000))}" @@ -92,6 +89,14 @@ JAVA_ADD_OPENS=( "--add-opens=java.base/java.lang=ALL-UNNAMED" "--add-opens=java.base/java.util=ALL-UNNAMED" "--add-opens=java.base/java.io=ALL-UNNAMED" + "--add-opens=java.base/java.net=ALL-UNNAMED" + "--add-opens=java.base/java.nio=ALL-UNNAMED" + "--add-opens=java.base/java.util.concurrent=ALL-UNNAMED" + "--add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED" + "--add-opens=java.base/java.util.regex=ALL-UNNAMED" + "--add-opens=java.base/java.lang.reflect=ALL-UNNAMED" + "--add-opens=java.sql/java.sql=ALL-UNNAMED" + "--add-opens=java.base/java.text=ALL-UNNAMED" "-Dnet.bytebuddy.experimental=true" ) diff --git a/tez-dist/src/docker/tez.env 
b/tez-dist/src/docker/tez-am/tez-am.env similarity index 96% rename from tez-dist/src/docker/tez.env rename to tez-dist/src/docker/tez-am/tez-am.env index 832bb986da..714abb27b6 100644 --- a/tez-dist/src/docker/tez.env +++ b/tez-dist/src/docker/tez-am/tez-am.env @@ -20,9 +20,6 @@ CONTAINER_ID=container_1700000000000_0001_01_000001 USER=tez HADOOP_USER_NAME=tez -NM_HOST=localhost -NM_PORT=12345 -NM_HTTP_PORT=8042 LOG_DIRS=/opt/tez/logs TEZ_FRAMEWORK_MODE=STANDALONE_ZOOKEEPER TEZ_CUSTOM_CONF_DIR=/opt/tez/custom-conf From a6cc9c7d14f59c3af1e98ac0f079099285a36c55 Mon Sep 17 00:00:00 2001 From: Raghav Aggarwal Date: Fri, 13 Mar 2026 14:08:14 +0530 Subject: [PATCH 07/10] review comments --- tez-dist/src/docker/tez-am/README.md | 10 +++++ tez-dist/src/docker/tez-am/conf/tez-site.xml | 10 ++++- tez-dist/src/docker/tez-am/docker-compose.yml | 40 +++++++++++++++---- .../src/docker/tez-am/tez-am-entrypoint.sh | 6 +-- tez-dist/src/docker/tez-am/tez-am.env | 2 - 5 files changed, 52 insertions(+), 16 deletions(-) diff --git a/tez-dist/src/docker/tez-am/README.md b/tez-dist/src/docker/tez-am/README.md index d3da10e2ce..a013033e6b 100644 --- a/tez-dist/src/docker/tez-am/README.md +++ b/tez-dist/src/docker/tez-am/README.md @@ -122,3 +122,13 @@ using `-p` flag * This command will start both the Tez AM, Zookeeper, Minimal Hadoop containers as defined in the `docker-compose.yml` file. + +8. For running a hive query using this docker image: + * Create a directory tez-plugins and add hive-exec jar in it. + * Add the line in docker compose under tez-am service to mount this + directory as volume to `/opt/tez/plugins` in docker container. 
+ + ```yml + volumes: + - ./tez-plugins:/opt/tez/plugins + ``` diff --git a/tez-dist/src/docker/tez-am/conf/tez-site.xml b/tez-dist/src/docker/tez-am/conf/tez-site.xml index 8c752fede3..ff2291657b 100644 --- a/tez-dist/src/docker/tez-am/conf/tez-site.xml +++ b/tez-dist/src/docker/tez-am/conf/tez-site.xml @@ -18,6 +18,7 @@ --> + tez.am.client.am.port-range 10001-10003 @@ -28,6 +29,7 @@ false + tez.am.zookeeper.quorum zookeeper:2181 @@ -35,7 +37,7 @@ tez.am.log.level - DEBUG + INFO @@ -49,4 +51,10 @@ -1 + + + dfs.client.use.datanode.hostname + true + + diff --git a/tez-dist/src/docker/tez-am/docker-compose.yml b/tez-dist/src/docker/tez-am/docker-compose.yml index 429c6ed526..bd54fac77a 100644 --- a/tez-dist/src/docker/tez-am/docker-compose.yml +++ b/tez-dist/src/docker/tez-am/docker-compose.yml @@ -1,3 +1,20 @@ +--- +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ name: tez-cluster services: @@ -23,10 +40,10 @@ services: networks: - hadoop-network ports: - - "9870:9870" # NameNode Web UI - - "9000:9000" # IPC - - "9864:9864" # DataNode Web UI - - "9866:9866" # DataNode + - "9870:9870" # NameNode Web UI + - "9000:9000" # IPC + - "9864:9864" # DataNode Web UI + - "9866:9866" # DataNode volumes: - hadoop_data:/hadoop @@ -44,10 +61,17 @@ services: # - "5005:5005" # Uncomment for remote debugging env_file: - ./tez-am.env - # environment: - # - TEZ_CUSTOM_CONF_DIR=/opt/tez/custom-conf - # volumes: - # - ./custom-tez-site.xml:/opt/tez/custom-conf/tez-site.xml + # Already define TEZ_CUSTOM_CONF_DIR in the env file, + # but adding here for clarity + # environment: + # - TEZ_CUSTOM_CONF_DIR=/opt/tez/custom-conf + # Uncomment the following lines if you want to mount a custom + # tez-site.xml for the Tez AM + # volumes: + # - ./custom-tez-site.xml:/opt/tez/custom-conf/tez-site.xml + # Uncomment the following lines if you want to mount custom + # plugins/JARS for instance hive-exec jar + # - ./tez-plugins:/opt/tez/plugins networks: hadoop-network: diff --git a/tez-dist/src/docker/tez-am/tez-am-entrypoint.sh b/tez-dist/src/docker/tez-am/tez-am-entrypoint.sh index 90930e2fce..a6128419ce 100644 --- a/tez-dist/src/docker/tez-am/tez-am-entrypoint.sh +++ b/tez-dist/src/docker/tez-am/tez-am-entrypoint.sh @@ -22,16 +22,13 @@ set -xeou pipefail # 1. 
Mocking DAGAppMaster#main() env variables # ################################################ -: "${CONTAINER_ID:="container_1700000000000_0001_01_000001"}" : "${USER:="tez"}" -: "${HADOOP_USER_NAME:="tez"}" : "${LOCAL_DIRS:="/tmp"}" : "${LOG_DIRS:="/opt/tez/logs"}" : "${APP_SUBMIT_TIME_ENV:=$(($(date +%s) * 1000))}" : "${TEZ_AM_EXTERNAL_ID:="tez-session-$(hostname)"}" -export CONTAINER_ID USER HADOOP_USER_NAME NM_HOST NM_PORT NM_HTTP_PORT \ - LOCAL_DIRS LOG_DIRS APP_SUBMIT_TIME_ENV TEZ_AM_EXTERNAL_ID +export USER LOCAL_DIRS LOG_DIRS APP_SUBMIT_TIME_ENV TEZ_AM_EXTERNAL_ID mkdir -p "$LOG_DIRS" @@ -112,7 +109,6 @@ exec java "${HEAP_OPTS_ARR[@]}" "${JAVA_OPTS_ARR[@]}" "${JAVA_ADD_OPENS[@]}" \ -Dlog4j.configuratorClass=org.apache.tez.common.TezLog4jConfigurator \ -Dlog4j.configuration=tez-container-log4j.properties \ -Dyarn.app.container.log.dir="$LOG_DIRS" \ - -Duser.name="$HADOOP_USER_NAME" \ -Dtez.conf.dir="$TEZ_CONF_DIR" \ -cp "$CLASSPATH" \ org.apache.tez.dag.app.DAGAppMaster --session \ diff --git a/tez-dist/src/docker/tez-am/tez-am.env b/tez-dist/src/docker/tez-am/tez-am.env index 714abb27b6..93cabeea32 100644 --- a/tez-dist/src/docker/tez-am/tez-am.env +++ b/tez-dist/src/docker/tez-am/tez-am.env @@ -17,9 +17,7 @@ # Tez AM Container Environment Configuration -CONTAINER_ID=container_1700000000000_0001_01_000001 USER=tez -HADOOP_USER_NAME=tez LOG_DIRS=/opt/tez/logs TEZ_FRAMEWORK_MODE=STANDALONE_ZOOKEEPER TEZ_CUSTOM_CONF_DIR=/opt/tez/custom-conf From 80fa0128cd0ce58a69568ea63ec99ed7a5bfbf51 Mon Sep 17 00:00:00 2001 From: Raghav Aggarwal Date: Thu, 19 Mar 2026 16:03:23 +0530 Subject: [PATCH 08/10] Use apache/hadoop official docker image --- tez-dist/src/docker/tez-am/conf/core-site.xml | 13 ++++ tez-dist/src/docker/tez-am/conf/hdfs-site.xml | 58 ++++++++++++++ tez-dist/src/docker/tez-am/docker-compose.yml | 77 ++++++++++++++++--- 3 files changed, 136 insertions(+), 12 deletions(-) create mode 100644 tez-dist/src/docker/tez-am/conf/core-site.xml create mode 100644 
tez-dist/src/docker/tez-am/conf/hdfs-site.xml diff --git a/tez-dist/src/docker/tez-am/conf/core-site.xml b/tez-dist/src/docker/tez-am/conf/core-site.xml new file mode 100644 index 0000000000..3a41c6fd52 --- /dev/null +++ b/tez-dist/src/docker/tez-am/conf/core-site.xml @@ -0,0 +1,13 @@ + + + + fs.defaultFS + hdfs://namenode:9000 + + + + hadoop.tmp.dir + /data/tmp + + + diff --git a/tez-dist/src/docker/tez-am/conf/hdfs-site.xml b/tez-dist/src/docker/tez-am/conf/hdfs-site.xml new file mode 100644 index 0000000000..554fbc0797 --- /dev/null +++ b/tez-dist/src/docker/tez-am/conf/hdfs-site.xml @@ -0,0 +1,58 @@ + + + + dfs.replication + 1 + + + + dfs.block.size + 67108864 + + + + dfs.namenode.name.dir + file:///data/namenode + + + + dfs.datanode.data.dir + file:///data/datanode + + + + dfs.namenode.rpc-bind-host + 0.0.0.0 + + + + dfs.datanode.address + 0.0.0.0:9866 + + + + dfs.datanode.http.address + 0.0.0.0:9864 + + + + dfs.client.use.datanode.hostname + true + + + + dfs.datanode.use.datanode.hostname + true + + + + dfs.permissions.enabled + false + + + + dfs.datanode.hostname + datanode + + + diff --git a/tez-dist/src/docker/tez-am/docker-compose.yml b/tez-dist/src/docker/tez-am/docker-compose.yml index bd54fac77a..4e1629cc41 100644 --- a/tez-dist/src/docker/tez-am/docker-compose.yml +++ b/tez-dist/src/docker/tez-am/docker-compose.yml @@ -33,19 +33,66 @@ services: - zookeeper_datalog:/datalog - zookeeper_logs:/logs - hadoop-cluster: - image: minimal-hadoop:3.4.2 - container_name: hadoop-cluster - hostname: hadoop-cluster + namenode: + image: apache/hadoop:3.4.2-lean + container_name: namenode + hostname: namenode + platform: linux/amd64 networks: - hadoop-network ports: - - "9870:9870" # NameNode Web UI - - "9000:9000" # IPC - - "9864:9864" # DataNode Web UI - - "9866:9866" # DataNode + - "9870:9870" # NameNode Web UI + - "9000:9000" # IPC volumes: - - hadoop_data:/hadoop + - hadoop_data:/data # Default persistence path + - hadoop_logs:/opt/hadoop/logs + - 
./conf/core-site.xml:/opt/hadoop/etc/hadoop/core-site.xml + - ./conf/hdfs-site.xml:/opt/hadoop/etc/hadoop/hdfs-site.xml + healthcheck: + test: + - "CMD-SHELL" + - > + su -s /bin/bash hadoop -c + '/opt/hadoop/bin/hdfs dfsadmin -safemode get | grep -q "Safe mode is OFF"' + interval: 5s + timeout: 5s + retries: 5 + user: root + command: > + /bin/bash -c " + chown -R hadoop:hadoop /data /opt/hadoop/logs; + su -s /bin/bash hadoop -c ' + if [ ! -f /data/namenode/current/VERSION ]; then + echo \"Formatting NameNode...\"; + /opt/hadoop/bin/hdfs namenode -format -force -nonInteractive; + fi; + /opt/hadoop/bin/hdfs namenode + '" + + datanode: + image: apache/hadoop:3.4.2-lean + container_name: datanode + hostname: datanode + platform: linux/amd64 + depends_on: + namenode: + condition: service_healthy + networks: + - hadoop-network + ports: + - "9864:9864" # DataNode Web UI + - "9866:9866" # DataNode + volumes: + - hadoop_data:/data # Default persistence path + - hadoop_logs:/opt/hadoop/logs + - ./conf/core-site.xml:/opt/hadoop/etc/hadoop/core-site.xml + - ./conf/hdfs-site.xml:/opt/hadoop/etc/hadoop/hdfs-site.xml + user: root + command: > + /bin/bash -c " + chown -R hadoop:hadoop /data /opt/hadoop/logs; + su -s /bin/bash hadoop -c '/opt/hadoop/bin/hdfs datanode' + " tez-am: image: apache/tez-am:${TEZ_VERSION:-1.0.0-SNAPSHOT} @@ -53,9 +100,6 @@ services: hostname: tez-am networks: - hadoop-network - depends_on: - - zookeeper - - hadoop-cluster ports: - "10001:10001" # - "5005:5005" # Uncomment for remote debugging @@ -72,6 +116,13 @@ services: # Uncomment the following lines if you want to mount custom # plugins/JARS for instance hive-exec jar # - ./tez-plugins:/opt/tez/plugins + depends_on: + zookeeper: + condition: service_started + namenode: + condition: service_healthy + datanode: + condition: service_started networks: hadoop-network: @@ -81,6 +132,8 @@ networks: volumes: hadoop_data: name: hadoop_data + hadoop_logs: + name: hadoop_logs zookeeper_data: name: zookeeper_data 
zookeeper_datalog: From ae9456bbd15d56ad68e3bc36f2582db0e600998f Mon Sep 17 00:00:00 2001 From: Raghav Aggarwal Date: Thu, 19 Mar 2026 16:04:56 +0530 Subject: [PATCH 09/10] Review Comments --- tez-dist/pom.xml | 2 +- .../{Dockerfile.tez_am => Dockerfile.am} | 6 ++--- tez-dist/src/docker/tez-am/README.md | 22 ++++++++++--------- ...{tez-am-entrypoint.sh => am-entrypoint.sh} | 0 .../src/docker/tez-am/{tez-am.env => am.env} | 0 ...ld-tez-am-docker.sh => build-am-docker.sh} | 8 +++---- tez-dist/src/docker/tez-am/docker-compose.yml | 19 ++++++++-------- 7 files changed, 29 insertions(+), 28 deletions(-) rename tez-dist/src/docker/tez-am/{Dockerfile.tez_am => Dockerfile.am} (94%) rename tez-dist/src/docker/tez-am/{tez-am-entrypoint.sh => am-entrypoint.sh} (100%) rename tez-dist/src/docker/tez-am/{tez-am.env => am.env} (100%) rename tez-dist/src/docker/tez-am/{build-tez-am-docker.sh => build-am-docker.sh} (93%) diff --git a/tez-dist/pom.xml b/tez-dist/pom.xml index 1e254665d7..5940a996ac 100644 --- a/tez-dist/pom.xml +++ b/tez-dist/pom.xml @@ -135,7 +135,7 @@ /bin/bash - ${project.basedir}/src/docker/tez-am/build-tez-am-docker.sh + ${project.basedir}/src/docker/tez-am/build-am-docker.sh -tez ${project.version} -repo apache diff --git a/tez-dist/src/docker/tez-am/Dockerfile.tez_am b/tez-dist/src/docker/tez-am/Dockerfile.am similarity index 94% rename from tez-dist/src/docker/tez-am/Dockerfile.tez_am rename to tez-dist/src/docker/tez-am/Dockerfile.am index 883652246b..01647f336c 100644 --- a/tez-dist/src/docker/tez-am/Dockerfile.tez_am +++ b/tez-dist/src/docker/tez-am/Dockerfile.am @@ -57,15 +57,15 @@ COPY --from=env --chown=tez /opt/tez $TEZ_HOME RUN mkdir -p $TEZ_CONF_DIR && chown tez:tez $TEZ_CONF_DIR -COPY --chown=tez tez-am-entrypoint.sh / +COPY --chown=tez am-entrypoint.sh / COPY --chown=tez conf $TEZ_CONF_DIR # Create Extension Point Directory RUN mkdir -p /opt/tez/plugins && chown tez:tez /opt/tez/plugins && chmod 755 /opt/tez/plugins -RUN chmod +x 
/tez-am-entrypoint.sh +RUN chmod +x /am-entrypoint.sh USER tez WORKDIR $TEZ_HOME -ENTRYPOINT ["/tez-am-entrypoint.sh"] +ENTRYPOINT ["/am-entrypoint.sh"] diff --git a/tez-dist/src/docker/tez-am/README.md b/tez-dist/src/docker/tez-am/README.md index a013033e6b..987f381853 100644 --- a/tez-dist/src/docker/tez-am/README.md +++ b/tez-dist/src/docker/tez-am/README.md @@ -54,7 +54,7 @@ docker run --rm \ -p 10001:10001 \ - --env-file tez-dist/src/docker/tez-am/tez-am.env \ + --env-file tez-dist/src/docker/tez-am/am.env \ --name tez-am \ --hostname localhost \ apache/tez-am:$TEZ_VERSION @@ -70,20 +70,20 @@ `-e TEZ_FRAMEWORK_MODE=STANDALONE_ZOOKEEPER` to the `docker run` command. 4. Debugging the Tez AM container: -Uncomment the `JAVA_TOOL_OPTIONS` in `tez-am.env` and expose 5005 port +Uncomment the `JAVA_TOOL_OPTIONS` in `am.env` and expose 5005 port using `-p` flag ```bash docker run --rm \ -p 10001:10001 -p 5005:5005 \ - --env-file tez-dist/src/docker/tez-am/tez-am.env \ + --env-file tez-dist/src/docker/tez-am/am.env \ --name tez-am \ --hostname localhost \ apache/tez-am:$TEZ_VERSION ``` 5. To override the tez-site.xml in docker image use: - * Set the `TEZ_CUSTOM_CONF_DIR` environment variable in `tez-am.env` + * Set the `TEZ_CUSTOM_CONF_DIR` environment variable in `am.env` or via the `docker run` command (e.g., `/opt/tez/custom-conf`). 
```bash @@ -91,7 +91,7 @@ using `-p` flag docker run --rm \ -p 10001:10001 \ - --env-file tez-dist/src/docker/tez-am/tez-am.env \ + --env-file tez-dist/src/docker/tez-am/am.env \ -v "$TEZ_SITE_PATH:/opt/tez/custom-conf/tez-site.xml" \ --name tez-am \ --hostname localhost \ @@ -104,7 +104,7 @@ using `-p` flag ```bash docker run --rm \ -p 10001:10001 \ - --env-file tez-dist/src/docker/tez-am/tez-am.env \ + --env-file tez-dist/src/docker/tez-am/am.env \ -v "/path/to/your/local/plugins:/opt/tez/plugins" \ --name tez-am \ --hostname localhost \ @@ -123,10 +123,12 @@ using `-p` flag * This command will start both the Tez AM, Zookeeper, Minimal Hadoop containers as defined in the `docker-compose.yml` file. -8. For running a hive query using this docker image: - * Create a directory tez-plugins and add hive-exec jar in it. - * Add the line in docker compose under tez-am service to mount this - directory as volume to `/opt/tez/plugins` in docker container. +8. To mount custom plugins or JARs required by Tez AM (e.g., for split generation + — typically the hive-exec jar, but in general, any UDFs or dependencies + previously managed via YARN localization): + * Create a directory tez-plugins and add all required jars. + * Uncomment the following lines in docker compose under the tez-am service + to mount this directory as a volume to `/opt/tez/plugins` in the docker container. 
```yml volumes: diff --git a/tez-dist/src/docker/tez-am/tez-am-entrypoint.sh b/tez-dist/src/docker/tez-am/am-entrypoint.sh similarity index 100% rename from tez-dist/src/docker/tez-am/tez-am-entrypoint.sh rename to tez-dist/src/docker/tez-am/am-entrypoint.sh diff --git a/tez-dist/src/docker/tez-am/tez-am.env b/tez-dist/src/docker/tez-am/am.env similarity index 100% rename from tez-dist/src/docker/tez-am/tez-am.env rename to tez-dist/src/docker/tez-am/am.env diff --git a/tez-dist/src/docker/tez-am/build-tez-am-docker.sh b/tez-dist/src/docker/tez-am/build-am-docker.sh similarity index 93% rename from tez-dist/src/docker/tez-am/build-tez-am-docker.sh rename to tez-dist/src/docker/tez-am/build-am-docker.sh index 4025a28afc..66bf7fc738 100755 --- a/tez-dist/src/docker/tez-am/build-tez-am-docker.sh +++ b/tez-dist/src/docker/tez-am/build-am-docker.sh @@ -64,8 +64,6 @@ PROJECT_ROOT=${PROJECT_ROOT:-"$SCRIPT_DIR/../../../.."} REPO=${REPO:-apache} WORK_DIR="$(mktemp -d)" -CACHE_DIR="$SCRIPT_DIR/cache" -mkdir -p "$CACHE_DIR" # Defaults Tez versions from pom.xml if not provided TEZ_VERSION=${TEZ_VERSION:-$(mvn -f "$PROJECT_ROOT/pom.xml" -q help:evaluate -Dexpression=project.version -DforceStdout)} @@ -89,13 +87,13 @@ fi # BUILD CONTEXT PREPARATION # ------------------------------------------------------------------------- cp -R "$SCRIPT_DIR/conf" "$WORK_DIR/" 2>/dev/null || mkdir -p "$WORK_DIR/conf" -cp "$SCRIPT_DIR/tez-am-entrypoint.sh" "$WORK_DIR/" -cp "$SCRIPT_DIR/Dockerfile.tez_am" "$WORK_DIR/" +cp "$SCRIPT_DIR/am-entrypoint.sh" "$WORK_DIR/" +cp "$SCRIPT_DIR/Dockerfile.am" "$WORK_DIR/" echo "Building Docker image..." 
docker build \ "$WORK_DIR" \ - -f "$WORK_DIR/Dockerfile.tez_am" \ + -f "$WORK_DIR/Dockerfile.am" \ -t "$REPO/tez-am:$TEZ_VERSION" \ --build-arg "BUILD_ENV=unarchive" \ --build-arg "TEZ_VERSION=$TEZ_VERSION" diff --git a/tez-dist/src/docker/tez-am/docker-compose.yml b/tez-dist/src/docker/tez-am/docker-compose.yml index 4e1629cc41..d71c521523 100644 --- a/tez-dist/src/docker/tez-am/docker-compose.yml +++ b/tez-dist/src/docker/tez-am/docker-compose.yml @@ -41,10 +41,10 @@ services: networks: - hadoop-network ports: - - "9870:9870" # NameNode Web UI - - "9000:9000" # IPC + - "9870:9870" # NameNode Web UI + - "9000:9000" # IPC volumes: - - hadoop_data:/data # Default persistence path + - hadoop_data:/data # Default persistence path - hadoop_logs:/opt/hadoop/logs - ./conf/core-site.xml:/opt/hadoop/etc/hadoop/core-site.xml - ./conf/hdfs-site.xml:/opt/hadoop/etc/hadoop/hdfs-site.xml @@ -80,10 +80,10 @@ services: networks: - hadoop-network ports: - - "9864:9864" # DataNode Web UI - - "9866:9866" # DataNode + - "9864:9864" # DataNode Web UI + - "9866:9866" # DataNode volumes: - - hadoop_data:/data # Default persistence path + - hadoop_data:/data # Default persistence path - hadoop_logs:/opt/hadoop/logs - ./conf/core-site.xml:/opt/hadoop/etc/hadoop/core-site.xml - ./conf/hdfs-site.xml:/opt/hadoop/etc/hadoop/hdfs-site.xml @@ -104,7 +104,7 @@ services: - "10001:10001" # - "5005:5005" # Uncomment for remote debugging env_file: - - ./tez-am.env + - ./am.env # Already define TEZ_CUSTOM_CONF_DIR in the env file, # but adding here for clarity # environment: @@ -113,8 +113,9 @@ services: # tez-site.xml for the Tez AM # volumes: # - ./custom-tez-site.xml:/opt/tez/custom-conf/tez-site.xml - # Uncomment the following lines if you want to mount custom - # plugins/JARS for instance hive-exec jar + # Uncomment the following lines to mount custom plugins or JARs + # required by Tez AM (e.g., UDFs, or dependencies previously managed + # via YARN localization) # - 
./tez-plugins:/opt/tez/plugins depends_on: zookeeper: From 749428edd97f0fd4a3b002945e32f15ad6a3299a Mon Sep 17 00:00:00 2001 From: Raghav Aggarwal Date: Thu, 19 Mar 2026 23:30:23 +0530 Subject: [PATCH 10/10] Add Sample program to showcase working of external tez AM --- .../tez/examples/ExternalAmWordCount.java | 186 ++++++++++++++++++ 1 file changed, 186 insertions(+) create mode 100644 tez-examples/src/main/java/org/apache/tez/examples/ExternalAmWordCount.java diff --git a/tez-examples/src/main/java/org/apache/tez/examples/ExternalAmWordCount.java b/tez-examples/src/main/java/org/apache/tez/examples/ExternalAmWordCount.java new file mode 100644 index 0000000000..151e8a68b2 --- /dev/null +++ b/tez-examples/src/main/java/org/apache/tez/examples/ExternalAmWordCount.java @@ -0,0 +1,186 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at + * + *

http://www.apache.org/licenses/LICENSE-2.0 + * + *

Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + * express or implied. See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tez.examples; + +import java.util.StringTokenizer; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; +import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; +import org.apache.tez.client.TezClient; +import org.apache.tez.dag.api.DAG; +import org.apache.tez.dag.api.Edge; +import org.apache.tez.dag.api.ProcessorDescriptor; +import org.apache.tez.dag.api.TezConfiguration; +import org.apache.tez.dag.api.Vertex; +import org.apache.tez.dag.api.client.DAGStatus.State; +import org.apache.tez.mapreduce.input.MRInput; +import org.apache.tez.mapreduce.output.MROutput; +import org.apache.tez.mapreduce.processor.SimpleMRProcessor; +import org.apache.tez.runtime.api.ProcessorContext; +import org.apache.tez.runtime.library.api.KeyValueWriter; +import org.apache.tez.runtime.library.api.KeyValuesReader; +import org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig; +import org.apache.tez.runtime.library.partitioner.HashPartitioner; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Sample Program inspired for WordCount but to run with External Tez AM with Zookeeper + */ +public class ExternalAmWordCount { + private static final Logger LOG = LoggerFactory.getLogger(ExternalAmWordCount.class); + private static final String ZK_ADDRESS = "zookeeper:2181"; + + public static void main(String[] args) throws Exception { + if (args.length != 2) { + System.err.println( + "Usage: java -cp com.github.raghav.ExternalAmWordCount "); + System.exit(2); + } + + var inputPath = 
args[0]; + var outputPath = args[1]; + + var conf = new Configuration(); + var tezConf = new TezConfiguration(conf); + + tezConf.set(TezConfiguration.TEZ_FRAMEWORK_MODE, "STANDALONE_ZOOKEEPER"); + tezConf.set(TezConfiguration.TEZ_AM_ZOOKEEPER_QUORUM, ZK_ADDRESS); + tezConf.set(TezConfiguration.TEZ_AM_CURATOR_SESSION_TIMEOUT, "30000ms"); + tezConf.setBoolean(TezConfiguration.TEZ_LOCAL_MODE, false); + tezConf.setBoolean(TezConfiguration.TEZ_IGNORE_LIB_URIS, true); + + // Prevent Tez from using the current directory for staging (avoids deleting your custom jar) + tezConf.set(TezConfiguration.TEZ_AM_STAGING_DIR, "/tmp/tez-staging"); + + LOG.info("Starting Tez Client with ZK Address: {}", ZK_ADDRESS); + + var tezClient = TezClient.create("ExternalAmWordCount", tezConf, true); + + try { + tezClient.start(); + LOG.info("Waiting for Tez AM to register"); + tezClient.waitTillReady(); + LOG.info("Tez AM discovered! Submitting DAG..."); + + var app = new ExternalAmWordCount(); + var dag = app.createDAG(tezConf, inputPath, outputPath); + var dagClient = tezClient.submitDAG(dag); + + var dagStatus = dagClient.waitForCompletionWithStatusUpdates(null); + + if (dagStatus.getState() == State.SUCCEEDED) { + LOG.info("DAG Succeeded"); + System.exit(0); + } else { + LOG.error("DAG Failed with state: {}", dagStatus.getState()); + System.exit(1); + } + + } finally { + tezClient.stop(); + } + } + + private DAG createDAG(TezConfiguration tezConf, String inputPath, String outputPath) { + var dataSource = + MRInput.createConfigBuilder(new Configuration(tezConf), TextInputFormat.class, inputPath) + .build(); + + var dataSink = + MROutput.createConfigBuilder(new Configuration(tezConf), TextOutputFormat.class, outputPath) + .build(); + + var tokenizerVertex = + Vertex.create("Tokenizer", ProcessorDescriptor.create(TokenProcessor.class.getName())) + .addDataSource("Input", dataSource); + + var summerVertex = + Vertex.create( + "Summer", ProcessorDescriptor.create(SumProcessor.class.getName()), 
1) // 1 Reducer + .addDataSink("Output", dataSink); + + var edgeConf = + OrderedPartitionedKVEdgeConfig.newBuilder( + Text.class.getName(), IntWritable.class.getName(), HashPartitioner.class.getName()) + .setFromConfiguration(tezConf) + .build(); + + return DAG.create("ZkWordCountDAG") + .addVertex(tokenizerVertex) + .addVertex(summerVertex) + .addEdge(Edge.create(tokenizerVertex, summerVertex, edgeConf.createDefaultEdgeProperty())); + } + + public static class TokenProcessor extends SimpleMRProcessor { + private static final IntWritable one = new IntWritable(1); + private final Text word = new Text(); + + public TokenProcessor(ProcessorContext context) { + super(context); + } + + @Override + public void run() throws Exception { + // Get inputs/outputs + var inputs = getInputs(); + var outputs = getOutputs(); + + var reader = (org.apache.tez.mapreduce.lib.MRReader) inputs.get("Input").getReader(); + var writer = (KeyValueWriter) outputs.get("Summer").getWriter(); + + while (reader.next()) { + var val = reader.getCurrentValue(); + var line = val.toString(); + var tokenizer = new StringTokenizer(line); + + while (tokenizer.hasMoreTokens()) { + word.set(tokenizer.nextToken()); + writer.write(word, one); + } + } + } + } + + public static class SumProcessor extends SimpleMRProcessor { + public SumProcessor(ProcessorContext context) { + super(context); + } + + @Override + public void run() throws Exception { + var inputs = getInputs(); + var outputs = getOutputs(); + + var reader = (KeyValuesReader) inputs.get("Tokenizer").getReader(); + var writer = (KeyValueWriter) outputs.get("Output").getWriter(); + + while (reader.next()) { + var key = reader.getCurrentKey(); + var values = reader.getCurrentValues(); + + int sum = 0; + for (var val : values) { + sum += ((IntWritable) val).get(); + } + writer.write(key, new IntWritable(sum)); + } + } + } +}