
Compiling Hadoop ARM Bindings

Compiling the armv7/aarch32 binaries for Hadoop is not as easy as it seems. As a first step, clone the latest Hadoop version from the Git repository onto the Pi device as the Hadoop user. Note that this setup works on trunk as of March 2020. If you are from the future, maybe some further patches have changed the behaviour again. Furthermore, if you compile the bindings for armv8/aarch64, note that not all of the following changes are necessary.
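For reference, the initial setup might look like this (a sketch; we assume the `hadoop` user already exists and use the GitHub mirror of the Apache repository):

# on the Pi, as the pi user: become the hadoop user (this is the plain `sudo su` mentioned below)
sudo su hadoop
# then, as hadoop, fetch the sources
cd ~
git clone https://github.com/apache/hadoop.git
cd hadoop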

If they haven't merged our patch yet, you first need to change the Docker setup.

To do that, first rename `dev-support/docker/Dockerfile_aarch64` to `dev-support/docker/Dockerfile_arm`.
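From the repository root, a plain `mv` does the job:

mv dev-support/docker/Dockerfile_aarch64 dev-support/docker/Dockerfile_arm

Then, in `start-build-env.sh` around line 23, replace the check for aarch64 so that 32-bit ARM machines also pick the renamed Dockerfile, along these lines: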

-if [ "$CPU_ARCH" = "aarch64" ]; then
-  DOCKER_FILE="${DOCKER_DIR}/Dockerfile_aarch64"
+
+1
+  DOCKER_FILE="${DOCKER_DIR}/Dockerfile_arm"

Also, because we run the build as the `hadoop` user that we `sudo su`'d into from `pi`, `$SUDO_USER` still contains `pi` inside that shell. Thus, we need to change how the script determines the user name:

-USER_NAME=${SUDO_USER:=$USER}
+USER_NAME=${USER}
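You can see why with a quick check from inside the `hadoop` shell (output from our setup):

echo "SUDO_USER=$SUDO_USER USER=$USER"
# SUDO_USER=pi USER=hadoop

With the original fallback `${SUDO_USER:=$USER}`, the script would therefore set up the Docker environment for the wrong user.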

Now, we will fix the Dockerfile itself. Open the renamed `dev-support/docker/Dockerfile_arm` in an editor of your choice and add `libfontconfig1` and `libfreetype6` to the list of apt packages being installed:

RUN apt-get -q update \
    && apt-get -q install -y --no-install-recommends \
        apt-utils \
        build-essential \
        bzip2 \
        clang \
        curl \
        doxygen \
        fuse \
        g++ \
        gcc \
        git \
        gnupg-agent \
        libbz2-dev \
        libcurl4-openssl-dev \
        libfuse-dev \
        libfontconfig1 \
        libfreetype6 \
        libprotobuf-dev \
        libprotoc-dev \
        libsasl2-dev \
        ...

Now, fix the installation of `cmake` to extract the correct file, depending on your platform:

-tar xzf cmake-3.1.0-Linux-aarch64.tar.gz --strip-components 1 -C /opt/cmake \
+tar xzf "cmake-3.1.0-Linux-$(uname -m).tar.gz" --strip-components 1 -C /opt/cmake \
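This works because `uname -m` prints the machine architecture:

uname -m
# armv7l on a 32-bit Pi, aarch64 on a 64-bit one

so the substitution produces the matching tarball name on both platforms.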
For `phantomjs`, we will have to add a check that distinguishes `aarch64` from 32-bit ARM. Replace the section with this one (note the added `chmod +x`; the raw binary downloaded by `curl` is not executable otherwise):

RUN if [ "$(echo "$MACHTYPE" | cut -d- -f1)" = "aarch64" ]; \
  then mkdir -p /opt/phantomjs \
  && curl -L -s -S \
    github.com/liusheng/phantomjs/releases/download/2.1.1/
    phantomjs-2.1.1-linux-aarch64.tar.bz2 \
   -o /opt/phantomjs/phantomjs-2.1.1-linux-aarch64.tar.bz2 \
  && tar xvjf /opt/phantomjs/phantomjs-2.1.1-linux-aarch64.tar.bz2 
  --strip-components 1 -C /opt/phantomjs \
  && cp /opt/phantomjs/bin/phantomjs /usr/bin/ \
  && rm -rf /opt/phantomjs; else \
  mkdir -p /opt/phantomjs \
  && curl -L -s -S github.com/fg2it/phantomjs-on-raspberry/
  releases/download/v2.1.1-wheezy-jessie-armv6/phantomjs -o /opt/phantomjs/phantomjs \
  && cp /opt/phantomjs/phantomjs /usr/bin/ \
  && rm -rf /opt/phantomjs; fi

Same goes for `Hugo`. Change the section this way:

# Hugo static website generator (for new hadoop site docs)
RUN if [ "$(echo "$MACHTYPE" | cut -d- -f1)" = "aarch64" ]; \
    then curl -L -o hugo.deb \
      https://github.com/gohugoio/hugo/releases/download/v0.58.3/hugo_0.58.3_Linux-ARM64.deb; \
    else curl -L -o hugo.deb \
      https://github.com/gohugoio/hugo/releases/download/v0.58.3/hugo_0.58.3_Linux-ARM.deb; fi \
    && dpkg --install hugo.deb \
    && rm hugo.deb

We also have to move the check for the float ABI of the JVM from `HadoopCommon.cmake` to `HadoopJNI.cmake` (see GitHub PR 224: github.com/apache/hadoop/pull/224/). Remove the following from `hadoop-common-project/hadoop-common/HadoopCommon.cmake` (around line 158):

-    # Determine float ABI of JVM on ARM.
-    if(CMAKE_SYSTEM_PROCESSOR MATCHES "^arm")
-        find_program(READELF readelf)
-        if(READELF MATCHES "NOTFOUND")
-            message(WARNING "readelf not found; JVM float ABI detection disabled")
-        else(READELF MATCHES "NOTFOUND")
-            execute_process(
-                COMMAND ${READELF} -A ${JAVA_JVM_LIBRARY}
-                OUTPUT_VARIABLE JVM_ELF_ARCH
-                ERROR_QUIET)
-            if(NOT JVM_ELF_ARCH MATCHES "Tag_ABI_VFP_args: VFP registers")
-                # Test compilation with -mfloat-abi=softfp using an arbitrary libc function
-                # (typically fails with "fatal error: bits/predefs.h: No such file or directory"
-                # if soft-float dev libraries are not installed)
-                message("Soft-float JVM detected")
-                include(CMakePushCheckState)
-                cmake_push_check_state()
-                set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -mfloat-abi=softfp")
-                include(CheckSymbolExists)
-                check_symbol_exists(exit stdlib.h SOFTFP_AVAILABLE)
-                if(NOT SOFTFP_AVAILABLE)
-                    message(FATAL_ERROR "Soft-float dev libraries required (e.g. 'apt-get install libc6-dev-armel' on Debian/Ubuntu)")
-                endif()
-                cmake_pop_check_state()
-                hadoop_add_compiler_flags("-mfloat-abi=softfp")
-            endif()
-        endif()
-    endif()

Add the following to `hadoop-common-project/hadoop-common/HadoopJNI.cmake` (around line 88):

 # Use the standard FindJNI module to locate the JNI components.
 find_package(JNI REQUIRED)
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "^arm" AND CMAKE_SYSTEM_NAME STREQUAL "Linux")
+    find_program(READELF readelf)
+    if (READELF MATCHES "NOTFOUND")
+        message(WARNING "readelf not found; JVM float ABI detection disabled")
+    else (READELF MATCHES "NOTFOUND")
+        message(STATUS "Checking float ABI of ${JAVA_JVM_LIBRARY}")
+        execute_process(
+            COMMAND ${READELF} -A ${JAVA_JVM_LIBRARY}
+            OUTPUT_VARIABLE JVM_ELF_ARCH
+            ERROR_QUIET)
+        if (JVM_ELF_ARCH MATCHES "Tag_ABI_VFP_args: VFP registers")
+            message(STATUS "Hard-float JVM detected")
+        else ()
+            message(STATUS "Soft-float JVM detected")
+
+            # Test compilation with -mfloat-abi=softfp using an arbitrary libc function
+            # (typically fails with "fatal error: bits/predefs.h: No such file or directory"
+            # if soft-float dev libraries are not installed)
+            include(CMakePushCheckState)
+            cmake_push_check_state()
+            set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -mfloat-abi=softfp")
+            include(CheckSymbolExists)
+            check_symbol_exists(exit stdlib.h SOFTFP_AVAILABLE)
+            if (NOT SOFTFP_AVAILABLE)
+                message(FATAL_ERROR "Soft-float dev libraries required (e.g. 'apt-get install libc6-dev-armel' on Debian/Ubuntu)")
+            endif (NOT SOFTFP_AVAILABLE)
+            cmake_pop_check_state()
+
+            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfloat-abi=softfp")
+        endif ()
+    endif (READELF MATCHES "NOTFOUND")
+endif (CMAKE_SYSTEM_PROCESSOR MATCHES "^arm" AND CMAKE_SYSTEM_NAME STREQUAL "Linux")

 #
 # Otherwise, use the standard FindJNI module to locate the JNI components.

Now, we have to edit `hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-nativetask/src/main/native/src/lib/primitives.h`, because it uses `bswap` unportably (see issues.apache.org/jira/browse/HADOOP-11505). We will apply patch 001 from that Jira issue: download it and apply it with `patch -p1 < HADOOP-11505.001.patch`. For the record, these are the changes that are going to be applied:

 #define unlikely(x)     (x)
 #endif

+#if (defined(__X64) || defined(__x86_64__) || defined(_M_X64))
+#define IS_X86_64 1
+#endif
+
 //#define SIMPLE_MEMCPY

 #if !defined(SIMPLE_MEMCPY)
@@ -99,31 +103,33 @@ inline void simple_memcpy(void * dest, const void * src, size_t len) {
 inline uint32_t bswap(uint32_t val) {
 #ifdef __aarch64__
   __asm__("rev %w[dst], %w[src]" : [dst]"=r"(val) : [src]"r"(val));
-#else
+  return val;
+#elif IS_X86_64
   __asm__("bswap %0" : "=r" (val) : "0" (val));
-#endif
   return val;
+#else
+  uint32_t b0 = (val >> 24) & 0xff;
+  uint32_t b1 = (val >> 16) & 0xff;
+  uint32_t b2 = (val >>  8) & 0xff;
+  uint32_t b3 = (val >>  0) & 0xff;
+  return (b3 << 24) | (b2 << 16) | (b1 << 8) | b0;
+#endif
 }

 inline uint64_t bswap64(uint64_t val) {
 #ifdef __aarch64__
   __asm__("rev %[dst], %[src]" : [dst]"=r"(val) : [src]"r"(val));
-#else
-#ifdef __X64
+  return val;
+#elif IS_X86_64
   __asm__("bswapq %0" : "=r" (val) : "0" (val));
+  return val;
 #else
-
   uint64_t lower = val & 0xffffffffU;
   uint32_t higher = (val >> 32) & 0xffffffffU;
-
   lower = bswap(lower);
   higher = bswap(higher);
-
   return (lower << 32) + higher;
-
 #endif
-#endif
-  return val;
 }
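To double-check that the patch applied cleanly, a quick grep for the newly introduced macro helps:

grep -n IS_X86_64 \
    hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-nativetask/src/main/native/src/lib/primitives.h

If this prints the `#define IS_X86_64 1` block and the two `#elif IS_X86_64` lines, you are good.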

Now, some components download prebuilt dependencies that are not available for ARM (see the corresponding upstream issue). This is why we are just going to disable them in the Maven setup. First, edit the `hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/pom.xml` file and remove the following line: `<module>hadoop-yarn-applications-catalog</module>`. Then, edit `hadoop-yarn-project/hadoop-yarn/pom.xml` and remove `<module>hadoop-yarn-csi</module>`.
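Both edits can also be done from the shell with `sed`, deleting the module lines in place:

# drop the application catalog module
sed -i '/<module>hadoop-yarn-applications-catalog<\/module>/d' \
    hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/pom.xml
# drop the CSI module
sed -i '/<module>hadoop-yarn-csi<\/module>/d' \
    hadoop-yarn-project/hadoop-yarn/pom.xml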

Now, finally, you can run `./start-build-env.sh`. After waiting patiently for the Docker setup (this can take a while!), we are almost done. As soon as the bash prompt is presented to you, we have to register `protobuf` in the local Maven repository:

mvn install:install-file -DgroupId=com.google.protobuf -DartifactId=protoc -Dversion=3.7.1 -Dclassifier=linux-arm_32 -Dpackaging=exe -Dfile=/opt/protobuf/bin/protoc

And now, for the hacky part: run the same command again, but replace `-Dversion=3.7.1` with `-Dversion=3.6.1`. Some dependency requires Protobuf 3.6.1, whose installation fails on ARM, and 3.7.1 seems to be compatible. Now we can finally build the natives:

mvn -e -X package -Pdist,native -DskipTests -Dtar

After (hopefully) compiling successfully, you can exit the dev environment. You can find the native libs in the `hadoop-dist/target/hadoop-3.3.0-SNAPSHOT/lib/native` directory (the version string may differ for you). Replace your local native files (for me, they were in `~/hadoop/lib/native`) with these.
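Assuming the same paths as above, that boils down to:

cp hadoop-dist/target/hadoop-3.3.0-SNAPSHOT/lib/native/* ~/hadoop/lib/native/

Then run `hadoop checknative -a` to check whether the new natives are picked up. For me, it shows output like this: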

[hadoop@node01:~/hadoop] hadoop checknative -a
2020-03-09 20:48:34,885 INFO bzip2.Bzip2Factory: Successfully loaded & initialized native-bzip2 library system-native
2020-03-09 20:48:34,894 INFO zlib.ZlibFactory: Successfully loaded & initialized native-zlib library
2020-03-09 20:48:34,920 WARN erasurecode.ErasureCodeNative: ISA-L support is not available in your platform... using builtin-java codec where applicable
Native library checking:
hadoop:  true /home/hadoop/hadoop/lib/native/libhadoop.so
zlib:    true /lib/arm-linux-gnueabihf/libz.so.1
zstd  :  true /usr/lib/arm-linux-gnueabihf/libzstd.so.1
snappy:  true /usr/lib/arm-linux-gnueabihf/libsnappy.so.1
lz4:     true revision:10301
bzip2:   true /lib/arm-linux-gnueabihf/libbz2.so.1
openssl: false EVP_CIPHER_CTX_cleanup
ISA-L:   false libhadoop was built without ISA-L support
2020-03-09 20:48:34,998 INFO util.ExitUtil: Exiting with status 1: ExitException

Congratulations! You have compiled the Hadoop native libs on your system.