diff --git a/.gitignore b/.gitignore index 85d660672c..e0df115492 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,4 @@ .settings pom.xml.versionsBackup target +tez-dist/src/docker/cache/ diff --git a/tez-dag/src/main/resources/tez-container-log4j.properties b/tez-dag/src/main/resources/tez-container-log4j.properties index 7cebec3289..4525d7018f 100644 --- a/tez-dag/src/main/resources/tez-container-log4j.properties +++ b/tez-dag/src/main/resources/tez-container-log4j.properties @@ -21,9 +21,16 @@ log4j.rootLogger=${tez.root.logger} log4j.threshold=ALL # -# ContainerLog Appender +# Console Appender # +log4j.appender.console=org.apache.log4j.ConsoleAppender +log4j.appender.console.target=System.out +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=%d{ISO8601} [%p] [%t] |%c{2}|: %m%n +# +# ContainerLog Appender +# log4j.appender.CLA=org.apache.tez.common.TezContainerLogAppender log4j.appender.CLA.containerLogDir=${yarn.app.container.log.dir} diff --git a/tez-dist/pom.xml b/tez-dist/pom.xml index 9777d0c0b9..5940a996ac 100644 --- a/tez-dist/pom.xml +++ b/tez-dist/pom.xml @@ -118,6 +118,34 @@ + + docker + + + + org.codehaus.mojo + exec-maven-plugin + + + build-docker-image + package + + exec + + + /bin/bash + + ${project.basedir}/src/docker/tez-am/build-am-docker.sh + -tez ${project.version} + -repo apache + + + + + + + + diff --git a/tez-dist/src/docker/tez-am/Dockerfile.am b/tez-dist/src/docker/tez-am/Dockerfile.am new file mode 100644 index 0000000000..01647f336c --- /dev/null +++ b/tez-dist/src/docker/tez-am/Dockerfile.am @@ -0,0 +1,71 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +ARG BUILD_ENV=unarchive + +# hadolint ignore=DL3006 +FROM ubuntu AS unarchive +# hadolint ignore=DL3010 +ONBUILD COPY tez-*.tar.gz /opt + +# hadolint ignore=DL3006 +FROM ${BUILD_ENV} AS env +ARG TEZ_VERSION + +RUN mkdir -p /opt/tez \ + && tar -xzv \ + -f /opt/tez-$TEZ_VERSION.tar.gz \ + -C /opt/tez \ + && rm -rf /opt/tez-$TEZ_VERSION.tar.gz + +FROM eclipse-temurin:21-jdk-ubi9-minimal AS run + +ARG UID=1000 +ARG TEZ_VERSION + +# Install dependencies +# hadolint ignore=DL3041 +RUN set -ex; \ + microdnf update -y; \ + microdnf -y install procps gettext findutils hostname; \ + microdnf clean all; \ + useradd --no-create-home -s /sbin/nologin -c "" --uid $UID tez + +# Set necessary environment variables +ENV TEZ_HOME=/opt/tez \ + TEZ_CONF_DIR=/opt/tez/conf + +ENV TEZ_CLIENT_VERSION=$TEZ_VERSION + +ENV PATH=$TEZ_HOME/bin:$PATH + +COPY --from=env --chown=tez /opt/tez $TEZ_HOME + +RUN mkdir -p $TEZ_CONF_DIR && chown tez:tez $TEZ_CONF_DIR + +COPY --chown=tez am-entrypoint.sh / +COPY --chown=tez conf $TEZ_CONF_DIR + +# Create Extension Point Directory +RUN mkdir -p /opt/tez/plugins && chown tez:tez /opt/tez/plugins && chmod 755 /opt/tez/plugins + +RUN chmod +x /am-entrypoint.sh + +USER tez +WORKDIR $TEZ_HOME + +ENTRYPOINT ["/am-entrypoint.sh"] diff --git a/tez-dist/src/docker/tez-am/README.md b/tez-dist/src/docker/tez-am/README.md new file mode 100644 index 0000000000..987f381853 --- /dev/null +++ b/tez-dist/src/docker/tez-am/README.md @@ -0,0 +1,136 @@ + + +# Tez AM Docker + +1. Building the docker image: + + ```bash + mvn clean install -DskipTests -Pdocker + ``` + +2. Install zookeeper in mac: + + a. Via brew: set the `tez.am.zookeeper.quorum` value as + `host.docker.internal:2181` in `tez-site.xml` + + ```bash + brew install zookeeper + zkServer start + ``` + + b. Use Zookeeper docker image (Refer to docker compose yml): + + ```bash + docker pull zookeeper:3.8.4 + + docker run -d \ + --name zookeeper-server \ + -p 2181:2181 \ + -p 8080:8080 \ + -e ZOO_MY_ID=1 \ + zookeeper:3.8.4 + ``` + +3. Running the Tez AM container explicitly: + + ```bash + export TEZ_VERSION=1.0.0-SNAPSHOT + + docker run --rm \ + -p 10001:10001 \ + --env-file tez-dist/src/docker/tez-am/am.env \ + --name tez-am \ + --hostname localhost \ + apache/tez-am:$TEZ_VERSION + ``` + + * `TEZ_VERSION` corresponds to the Maven `${project.version}`. + Set this environment variable in your shell before running the commands. + * Expose ports using the `-p` flag based on the + `tez.am.client.am.port-range` property in `tez-site.xml`. + * The `--hostname` flag configures the container's hostname, allowing + services on the host (e.g., macOS) to connect to it. + * Ensure the `--env-file` flag is included, or at a minimum, pass + `-e TEZ_FRAMEWORK_MODE=STANDALONE_ZOOKEEPER` to the `docker run` command. + +4. Debugging the Tez AM container: +Uncomment the `JAVA_TOOL_OPTIONS` in `am.env` and expose 5005 port +using `-p` flag + + ```bash + docker run --rm \ + -p 10001:10001 -p 5005:5005 \ + --env-file tez-dist/src/docker/tez-am/am.env \ + --name tez-am \ + --hostname localhost \ + apache/tez-am:$TEZ_VERSION + ``` + +5. To override the tez-site.xml in docker image use: + * Set the `TEZ_CUSTOM_CONF_DIR` environment variable in `am.env` + or via the `docker run` command (e.g., `/opt/tez/custom-conf`). + + ```bash + export TEZ_SITE_PATH=$(pwd)/tez-dist/src/docker/conf/tez-site.xml + + docker run --rm \ + -p 10001:10001 \ + --env-file tez-dist/src/docker/tez-am/am.env \ + -v "$TEZ_SITE_PATH:/opt/tez/custom-conf/tez-site.xml" \ + --name tez-am \ + --hostname localhost \ + apache/tez-am:$TEZ_VERSION + ``` + +6. To add plugin jars in docker image use: + * The plugin directory path inside the Docker container is fixed at `/opt/tez/plugins`. + + ```bash + docker run --rm \ + -p 10001:10001 \ + --env-file tez-dist/src/docker/tez-am/am.env \ + -v "/path/to/your/local/plugins:/opt/tez/plugins" \ + --name tez-am \ + --hostname localhost \ + apache/tez-am:$TEZ_VERSION + ``` + +7. Using Docker Compose: + * Refer to the `docker-compose.yml` file in this directory for + an example of how to run both the Tez AM and Zookeeper containers + together using Docker Compose. + + ```bash + docker-compose -f tez-dist/src/docker/tez-am/docker-compose.yml up -d --build + ``` + + * This command will start both the Tez AM, Zookeeper, Minimal + Hadoop containers as defined in the `docker-compose.yml` file. + +8. To mount custom plugins or JARs required by Tez AM (e.g., for split generation + — typically the hive-exec jar, but in general, any UDFs or dependencies + previously managed via YARN localization: + * Create a directory tez-plugins and add all required jars. + * Uncomment the following lines in docker compose under the tez-am service + to mount this directory as a volume to `/opt/tez/plugins` in the docker container. + + ```yml + volumes: + - ./tez-plugins:/opt/tez/plugins + ``` diff --git a/tez-dist/src/docker/tez-am/am-entrypoint.sh b/tez-dist/src/docker/tez-am/am-entrypoint.sh new file mode 100644 index 0000000000..a6128419ce --- /dev/null +++ b/tez-dist/src/docker/tez-am/am-entrypoint.sh @@ -0,0 +1,115 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +set -xeou pipefail + +################################################ +# 1. Mocking DAGAppMaster#main() env variables # +################################################ + +: "${USER:="tez"}" +: "${LOCAL_DIRS:="/tmp"}" +: "${LOG_DIRS:="/opt/tez/logs"}" +: "${APP_SUBMIT_TIME_ENV:=$(($(date +%s) * 1000))}" +: "${TEZ_AM_EXTERNAL_ID:="tez-session-$(hostname)"}" + +export USER LOCAL_DIRS LOG_DIRS APP_SUBMIT_TIME_ENV TEZ_AM_EXTERNAL_ID + +mkdir -p "$LOG_DIRS" + +########################### +# Custom Config directory # +########################### +if [[ -n "${TEZ_CUSTOM_CONF_DIR:-}" ]] && [[ -d "$TEZ_CUSTOM_CONF_DIR" ]]; then + echo "--> Using custom configuration directory: $TEZ_CUSTOM_CONF_DIR" + find "${TEZ_CUSTOM_CONF_DIR}" -type f -exec \ + ln -sf {} "${TEZ_CONF_DIR}"/ \; + + # Remove template keyword if it exists + if [[ -f "$TEZ_CONF_DIR/tez-site.xml.template" ]]; then + envsubst < "$TEZ_CONF_DIR/tez-site.xml.template" > "$TEZ_CONF_DIR/tez-site.xml" + fi +fi + +############# +# CLASSPATH # +############# + +# Order is: conf -> plugins -> tez jars +CLASSPATH="${TEZ_CONF_DIR}" + +# Custom Plugins +# This allows mounting a volume at /opt/tez/plugins containing aux jars +PLUGIN_DIR="/opt/tez/plugins" +if [[ -d "$PLUGIN_DIR" ]]; then + count=$(find "$PLUGIN_DIR" -maxdepth 1 -name "*.jar" 2>/dev/null | wc -l) + if [ "$count" != "0" ]; then + echo "--> Found $count plugin jars. Prepending to classpath." + CLASSPATH="${CLASSPATH}:${PLUGIN_DIR}/*" + fi +fi + +# Tez Jars +CLASSPATH="${CLASSPATH}:${TEZ_HOME}/*:${TEZ_HOME}/lib/*" + +############# +# Execution # +############# +TEZ_DAG_JAR=$(find "$TEZ_HOME" -maxdepth 1 -name "tez-dag-*.jar" ! -name "*-tests.jar" | head -n 1) + +if [ -z "$TEZ_DAG_JAR" ]; then + echo "Error: Could not find tez-dag-*.jar in $TEZ_HOME" + exit 1 +fi + +echo "--> Starting DAGAppMaster..." + +: "${TEZ_AM_HEAP_OPTS:="-Xmx2048m"}" +# : "${TEZ_AM_GC_OPTS:="-Xlog:gc*=info,class+load=info::time,uptime,level,tags -XX:+UseNUMA"}" + +JAVA_ADD_OPENS=( + "--add-opens=java.base/java.lang=ALL-UNNAMED" + "--add-opens=java.base/java.util=ALL-UNNAMED" + "--add-opens=java.base/java.io=ALL-UNNAMED" + "--add-opens=java.base/java.net=ALL-UNNAMED" + "--add-opens=java.base/java.nio=ALL-UNNAMED" + "--add-opens=java.base/java.util.concurrent=ALL-UNNAMED" + "--add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED" + "--add-opens=java.base/java.util.regex=ALL-UNNAMED" + "--add-opens=java.base/java.lang.reflect=ALL-UNNAMED" + "--add-opens=java.sql/java.sql=ALL-UNNAMED" + "--add-opens=java.base/java.text=ALL-UNNAMED" + "-Dnet.bytebuddy.experimental=true" +) + +read -r -a JAVA_OPTS_ARR <<< "${JAVA_OPTS:-}" +read -r -a HEAP_OPTS_ARR <<< "${TEZ_AM_HEAP_OPTS}" +# read -r -a JAVA_GC_OPTS_ARR <<< "${TEZ_AM_GC_OPTS}" + +# Add "${JAVA_GC_OPTS_ARR[@]}" in following command to get gc information. +exec java "${HEAP_OPTS_ARR[@]}" "${JAVA_OPTS_ARR[@]}" "${JAVA_ADD_OPENS[@]}" \ + -Djava.net.preferIPv4Stack=true \ + -Djava.io.tmpdir="$PWD/tmp" \ + -Dtez.root.logger=INFO,CLA,console \ + -Dlog4j.configuratorClass=org.apache.tez.common.TezLog4jConfigurator \ + -Dlog4j.configuration=tez-container-log4j.properties \ + -Dyarn.app.container.log.dir="$LOG_DIRS" \ + -Dtez.conf.dir="$TEZ_CONF_DIR" \ + -cp "$CLASSPATH" \ + org.apache.tez.dag.app.DAGAppMaster --session \ + "$@" diff --git a/tez-dist/src/docker/tez-am/am.env b/tez-dist/src/docker/tez-am/am.env new file mode 100644 index 0000000000..93cabeea32 --- /dev/null +++ b/tez-dist/src/docker/tez-am/am.env @@ -0,0 +1,27 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Tez AM Container Environment Configuration + +USER=tez +LOG_DIRS=/opt/tez/logs +TEZ_FRAMEWORK_MODE=STANDALONE_ZOOKEEPER +TEZ_CUSTOM_CONF_DIR=/opt/tez/custom-conf +# TEZ_AM_HEAP_OPTS configures the maximum heap size (Xmx) for the Tez AM. +TEZ_AM_HEAP_OPTS=-Xmx2048m +# Enable remote debugging on port 5005 +# JAVA_TOOL_OPTIONS='-agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=*:5005' diff --git a/tez-dist/src/docker/tez-am/build-am-docker.sh b/tez-dist/src/docker/tez-am/build-am-docker.sh new file mode 100755 index 0000000000..66bf7fc738 --- /dev/null +++ b/tez-dist/src/docker/tez-am/build-am-docker.sh @@ -0,0 +1,102 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +set -xeou pipefail + +TEZ_VERSION= +REPO= + +usage() { + cat <&2 +Usage: $0 [-h] [-tez ] [-repo ] +Build the Apache Tez AM Docker image +-help Display help +-tez Build image with the specified Tez version +-repo Docker repository +EOF +} + +while [ $# -gt 0 ]; do + case "$1" in + -h) + usage + exit 0 + ;; + -tez) + shift + TEZ_VERSION=$1 + shift + ;; + -repo) + shift + REPO=$1 + shift + ;; + *) + shift + ;; + esac +done + +SCRIPT_DIR=$( + cd "$(dirname "$0")" + pwd +) + +DIST_DIR=${DIST_DIR:-"$SCRIPT_DIR/../../.."} +PROJECT_ROOT=${PROJECT_ROOT:-"$SCRIPT_DIR/../../../.."} + +REPO=${REPO:-apache} +WORK_DIR="$(mktemp -d)" + +# Defaults Tez versions from pom.xml if not provided +TEZ_VERSION=${TEZ_VERSION:-$(mvn -f "$PROJECT_ROOT/pom.xml" -q help:evaluate -Dexpression=project.version -DforceStdout)} + +##################################### +# Pick tez tarball from local build # +##################################### +TEZ_FILE_NAME="tez-$TEZ_VERSION.tar.gz" +LOCAL_DIST_PATH="$DIST_DIR/target/$TEZ_FILE_NAME" + +if [ -f "$LOCAL_DIST_PATH" ]; then + echo "--> Found local Tez build artifact at: $LOCAL_DIST_PATH" + cp "$LOCAL_DIST_PATH" "$WORK_DIR/" +else + echo "--> Error: Local Tez artifact not found at $LOCAL_DIST_PATH" + echo "--> Please build the project first (e.g., mvn clean install -DskipTests)." + exit 1 +fi + +# ------------------------------------------------------------------------- +# BUILD CONTEXT PREPARATION +# ------------------------------------------------------------------------- +cp -R "$SCRIPT_DIR/conf" "$WORK_DIR/" 2>/dev/null || mkdir -p "$WORK_DIR/conf" +cp "$SCRIPT_DIR/am-entrypoint.sh" "$WORK_DIR/" +cp "$SCRIPT_DIR/Dockerfile.am" "$WORK_DIR/" + +echo "Building Docker image..." +docker build \ + "$WORK_DIR" \ + -f "$WORK_DIR/Dockerfile.am" \ + -t "$REPO/tez-am:$TEZ_VERSION" \ + --build-arg "BUILD_ENV=unarchive" \ + --build-arg "TEZ_VERSION=$TEZ_VERSION" + +rm -r "${WORK_DIR}" +echo "Docker image $REPO/tez-am:$TEZ_VERSION built successfully." diff --git a/tez-dist/src/docker/tez-am/conf/core-site.xml b/tez-dist/src/docker/tez-am/conf/core-site.xml new file mode 100644 index 0000000000..3a41c6fd52 --- /dev/null +++ b/tez-dist/src/docker/tez-am/conf/core-site.xml @@ -0,0 +1,13 @@ + + + + fs.defaultFS + hdfs://namenode:9000 + + + + hadoop.tmp.dir + /data/tmp + + + diff --git a/tez-dist/src/docker/tez-am/conf/hdfs-site.xml b/tez-dist/src/docker/tez-am/conf/hdfs-site.xml new file mode 100644 index 0000000000..554fbc0797 --- /dev/null +++ b/tez-dist/src/docker/tez-am/conf/hdfs-site.xml @@ -0,0 +1,58 @@ + + + + dfs.replication + 1 + + + + dfs.block.size + 67108864 + + + + dfs.namenode.name.dir + file:///data/namenode + + + + dfs.datanode.data.dir + file:///data/datanode + + + + dfs.namenode.rpc-bind-host + 0.0.0.0 + + + + dfs.datanode.address + 0.0.0.0:9866 + + + + dfs.datanode.http.address + 0.0.0.0:9864 + + + + dfs.client.use.datanode.hostname + true + + + + dfs.datanode.use.datanode.hostname + true + + + + dfs.permissions.enabled + false + + + + dfs.datanode.hostname + datanode + + + diff --git a/tez-dist/src/docker/tez-am/conf/tez-site.xml b/tez-dist/src/docker/tez-am/conf/tez-site.xml new file mode 100644 index 0000000000..ff2291657b --- /dev/null +++ b/tez-dist/src/docker/tez-am/conf/tez-site.xml @@ -0,0 +1,60 @@ + + + + + + + + tez.am.client.am.port-range + 10001-10003 + + + + tez.am.tez-ui.webservice.enable + false + + + + + tez.am.zookeeper.quorum + zookeeper:2181 + + + + tez.am.log.level + INFO + + + + tez.local.mode + true + + + + + tez.session.am.dag.submit.timeout.secs + -1 + + + + + dfs.client.use.datanode.hostname + true + + + diff --git a/tez-dist/src/docker/tez-am/docker-compose.yml b/tez-dist/src/docker/tez-am/docker-compose.yml new file mode 100644 index 0000000000..d71c521523 --- /dev/null +++ b/tez-dist/src/docker/tez-am/docker-compose.yml @@ -0,0 +1,143 @@ +--- +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: tez-cluster + +services: + zookeeper: + image: zookeeper:3.8.4 + container_name: zookeeper + hostname: zookeeper + networks: + - hadoop-network + ports: + - "2181:2181" + environment: + ZOO_MY_ID: 1 + volumes: + - zookeeper_data:/data + - zookeeper_datalog:/datalog + - zookeeper_logs:/logs + + namenode: + image: apache/hadoop:3.4.2-lean + container_name: namenode + hostname: namenode + platform: linux/amd64 + networks: + - hadoop-network + ports: + - "9870:9870" # NameNode Web UI + - "9000:9000" # IPC + volumes: + - hadoop_data:/data # Default persistence path + - hadoop_logs:/opt/hadoop/logs + - ./conf/core-site.xml:/opt/hadoop/etc/hadoop/core-site.xml + - ./conf/hdfs-site.xml:/opt/hadoop/etc/hadoop/hdfs-site.xml + healthcheck: + test: + - "CMD-SHELL" + - > + su -s /bin/bash hadoop -c + '/opt/hadoop/bin/hdfs dfsadmin -safemode get | grep -q "Safe mode is OFF"' + interval: 5s + timeout: 5s + retries: 5 + user: root + command: > + /bin/bash -c " + chown -R hadoop:hadoop /data /opt/hadoop/logs; + su -s /bin/bash hadoop -c ' + if [ ! -f /data/namenode/current/VERSION ]; then + echo \"Formatting NameNode...\"; + /opt/hadoop/bin/hdfs namenode -format -force -nonInteractive; + fi; + /opt/hadoop/bin/hdfs namenode + '" + + datanode: + image: apache/hadoop:3.4.2-lean + container_name: datanode + hostname: datanode + platform: linux/amd64 + depends_on: + namenode: + condition: service_healthy + networks: + - hadoop-network + ports: + - "9864:9864" # DataNode Web UI + - "9866:9866" # DataNode + volumes: + - hadoop_data:/data # Default persistence path + - hadoop_logs:/opt/hadoop/logs + - ./conf/core-site.xml:/opt/hadoop/etc/hadoop/core-site.xml + - ./conf/hdfs-site.xml:/opt/hadoop/etc/hadoop/hdfs-site.xml + user: root + command: > + /bin/bash -c " + chown -R hadoop:hadoop /data /opt/hadoop/logs; + su -s /bin/bash hadoop -c '/opt/hadoop/bin/hdfs datanode' + " + + tez-am: + image: apache/tez-am:${TEZ_VERSION:-1.0.0-SNAPSHOT} + container_name: tez-am + hostname: tez-am + networks: + - hadoop-network + ports: + - "10001:10001" + # - "5005:5005" # Uncomment for remote debugging + env_file: + - ./am.env + # Already define TEZ_CUSTOM_CONF_DIR in the env file, + # but adding here for clarity + # environment: + # - TEZ_CUSTOM_CONF_DIR=/opt/tez/custom-conf + # Uncomment the following lines if you want to mount a custom + # tez-site.xml for the Tez AM + # volumes: + # - ./custom-tez-site.xml:/opt/tez/custom-conf/tez-site.xml + # Uncomment the following lines to mount custom plugins or JARs + # required by Tez AM (e.g., UDFs, or dependencies previously managed + # via YARN localization) + # - ./tez-plugins:/opt/tez/plugins + depends_on: + zookeeper: + condition: service_started + namenode: + condition: service_healthy + datanode: + condition: service_started + +networks: + hadoop-network: + name: hadoop-network + driver: bridge + +volumes: + hadoop_data: + name: hadoop_data + hadoop_logs: + name: hadoop_logs + zookeeper_data: + name: zookeeper_data + zookeeper_datalog: + name: zookeeper_datalog + zookeeper_logs: + name: zookeeper_logs diff --git a/tez-examples/src/main/java/org/apache/tez/examples/ExternalAmWordCount.java b/tez-examples/src/main/java/org/apache/tez/examples/ExternalAmWordCount.java new file mode 100644 index 0000000000..151e8a68b2 --- /dev/null +++ b/tez-examples/src/main/java/org/apache/tez/examples/ExternalAmWordCount.java @@ -0,0 +1,186 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more contributor license + * agreements. See the NOTICE file distributed with this work for additional information regarding + * copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the License. You may obtain a + * copy of the License at + * + *

http://www.apache.org/licenses/LICENSE-2.0 + * + *

Unless required by applicable law or agreed to in writing, software distributed under the + * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + * express or implied. See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tez.examples; + +import java.util.StringTokenizer; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; +import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; +import org.apache.tez.client.TezClient; +import org.apache.tez.dag.api.DAG; +import org.apache.tez.dag.api.Edge; +import org.apache.tez.dag.api.ProcessorDescriptor; +import org.apache.tez.dag.api.TezConfiguration; +import org.apache.tez.dag.api.Vertex; +import org.apache.tez.dag.api.client.DAGStatus.State; +import org.apache.tez.mapreduce.input.MRInput; +import org.apache.tez.mapreduce.output.MROutput; +import org.apache.tez.mapreduce.processor.SimpleMRProcessor; +import org.apache.tez.runtime.api.ProcessorContext; +import org.apache.tez.runtime.library.api.KeyValueWriter; +import org.apache.tez.runtime.library.api.KeyValuesReader; +import org.apache.tez.runtime.library.conf.OrderedPartitionedKVEdgeConfig; +import org.apache.tez.runtime.library.partitioner.HashPartitioner; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Sample Program inspired for WordCount but to run with External Tez AM with Zookeeper + */ +public class ExternalAmWordCount { + private static final Logger LOG = LoggerFactory.getLogger(ExternalAmWordCount.class); + private static final String ZK_ADDRESS = "zookeeper:2181"; + + public static void main(String[] args) throws Exception { + if (args.length != 2) { + System.err.println( + "Usage: java -cp com.github.raghav.ExternalAmWordCount "); + System.exit(2); + } + + var inputPath = args[0]; + var outputPath = args[1]; + + var conf = new Configuration(); + var tezConf = new TezConfiguration(conf); + + tezConf.set(TezConfiguration.TEZ_FRAMEWORK_MODE, "STANDALONE_ZOOKEEPER"); + tezConf.set(TezConfiguration.TEZ_AM_ZOOKEEPER_QUORUM, ZK_ADDRESS); + tezConf.set(TezConfiguration.TEZ_AM_CURATOR_SESSION_TIMEOUT, "30000ms"); + tezConf.setBoolean(TezConfiguration.TEZ_LOCAL_MODE, false); + tezConf.setBoolean(TezConfiguration.TEZ_IGNORE_LIB_URIS, true); + + // Prevent Tez from using the current directory for staging (avoids deleting your custom jar) + tezConf.set(TezConfiguration.TEZ_AM_STAGING_DIR, "/tmp/tez-staging"); + + LOG.info("Starting Tez Client with ZK Address: {}", ZK_ADDRESS); + + var tezClient = TezClient.create("ExternalAmWordCount", tezConf, true); + + try { + tezClient.start(); + LOG.info("Waiting for Tez AM to register"); + tezClient.waitTillReady(); + LOG.info("Tez AM discovered! Submitting DAG..."); + + var app = new ExternalAmWordCount(); + var dag = app.createDAG(tezConf, inputPath, outputPath); + var dagClient = tezClient.submitDAG(dag); + + var dagStatus = dagClient.waitForCompletionWithStatusUpdates(null); + + if (dagStatus.getState() == State.SUCCEEDED) { + LOG.info("DAG Succeeded"); + System.exit(0); + } else { + LOG.error("DAG Failed with state: {}", dagStatus.getState()); + System.exit(1); + } + + } finally { + tezClient.stop(); + } + } + + private DAG createDAG(TezConfiguration tezConf, String inputPath, String outputPath) { + var dataSource = + MRInput.createConfigBuilder(new Configuration(tezConf), TextInputFormat.class, inputPath) + .build(); + + var dataSink = + MROutput.createConfigBuilder(new Configuration(tezConf), TextOutputFormat.class, outputPath) + .build(); + + var tokenizerVertex = + Vertex.create("Tokenizer", ProcessorDescriptor.create(TokenProcessor.class.getName())) + .addDataSource("Input", dataSource); + + var summerVertex = + Vertex.create( + "Summer", ProcessorDescriptor.create(SumProcessor.class.getName()), 1) // 1 Reducer + .addDataSink("Output", dataSink); + + var edgeConf = + OrderedPartitionedKVEdgeConfig.newBuilder( + Text.class.getName(), IntWritable.class.getName(), HashPartitioner.class.getName()) + .setFromConfiguration(tezConf) + .build(); + + return DAG.create("ZkWordCountDAG") + .addVertex(tokenizerVertex) + .addVertex(summerVertex) + .addEdge(Edge.create(tokenizerVertex, summerVertex, edgeConf.createDefaultEdgeProperty())); + } + + public static class TokenProcessor extends SimpleMRProcessor { + private static final IntWritable one = new IntWritable(1); + private final Text word = new Text(); + + public TokenProcessor(ProcessorContext context) { + super(context); + } + + @Override + public void run() throws Exception { + // Get inputs/outputs + var inputs = getInputs(); + var outputs = getOutputs(); + + var reader = (org.apache.tez.mapreduce.lib.MRReader) inputs.get("Input").getReader(); + var writer = (KeyValueWriter) outputs.get("Summer").getWriter(); + + while (reader.next()) { + var val = reader.getCurrentValue(); + var line = val.toString(); + var tokenizer = new StringTokenizer(line); + + while (tokenizer.hasMoreTokens()) { + word.set(tokenizer.nextToken()); + writer.write(word, one); + } + } + } + } + + public static class SumProcessor extends SimpleMRProcessor { + public SumProcessor(ProcessorContext context) { + super(context); + } + + @Override + public void run() throws Exception { + var inputs = getInputs(); + var outputs = getOutputs(); + + var reader = (KeyValuesReader) inputs.get("Tokenizer").getReader(); + var writer = (KeyValueWriter) outputs.get("Output").getWriter(); + + while (reader.next()) { + var key = reader.getCurrentKey(); + var values = reader.getCurrentValues(); + + int sum = 0; + for (var val : values) { + sum += ((IntWritable) val).get(); + } + writer.write(key, new IntWritable(sum)); + } + } + } +}