{ lib
, stdenv
, fetchurl
, makeWrapper
, autoPatchelfHook
, jdk8_headless
, jdk11_headless
, bash
, coreutils
, which
, bzip2
, cyrus_sasl
, protobuf
, snappy
, zlib
, zstd
, openssl
, glibc
, nixosTests
, sparkSupport ? true
, spark
, libtirpc
, callPackage
}:
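
# Prebuilt Apache Hadoop distributions (2.x and 3.x release lines): the
# upstream binary tarballs are repackaged, and the bundled native
# libraries are patched to load their dependencies from the Nix store.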

with lib;

assert elem stdenv.system [ "x86_64-linux" "x86_64-darwin" "aarch64-linux" "aarch64-darwin" ];

let
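  # Shared builder for every Hadoop release below.
  #   pname         - package name ("hadoop")
  #   platformAttrs - per-system version, tarball hash, srcHash and optional meta
  #   jdk           - headless JDK used at build time and baked into the wrappers
  #   tests         - NixOS tests exposed through passthru.tests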
  common = { pname, platformAttrs, jdk, tests }:
    stdenv.mkDerivation (finalAttrs: {
      inherit pname jdk;
      version = platformAttrs.${stdenv.system}.version or (throw "Unsupported system: ${stdenv.system}");
      src = fetchurl {
        url = "mirror://apache/hadoop/common/hadoop-${finalAttrs.version}/hadoop-${finalAttrs.version}"
              + optionalString stdenv.isAarch64 "-aarch64" + ".tar.gz";
        inherit (platformAttrs.${stdenv.system}) hash;
      };
      doCheck = true;

      # Build the container executor binary from source
      # installPhase does not lazily evaluate containerExecutor for some reason
      containerExecutor = if stdenv.isLinux then (callPackage ./containerExecutor.nix {
        inherit (finalAttrs) version;
        inherit platformAttrs;
      }) else "";

      nativeBuildInputs = [ makeWrapper ]
                          ++ optionals stdenv.isLinux [ autoPatchelfHook ];
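      # Runtime dependencies of the prebuilt native libraries and binaries;
      # on Linux, autoPatchelfHook rewrites the ELF rpaths to find these.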
      buildInputs = optionals stdenv.isLinux [ stdenv.cc.cc.lib openssl protobuf zlib snappy libtirpc ];

      installPhase = ''
        mkdir $out
        mv * $out/
      '' + optionalString stdenv.isLinux ''
        for n in $(find ${finalAttrs.containerExecutor}/bin -type f); do
          ln -sf "$n" $out/bin
        done

        # These libraries are loaded at runtime by the JVM (not via
        # DT_NEEDED), so autoPatchelfHook cannot pick them up; symlink
        # them into lib/native where Hadoop looks for native code.
        ln -s ${getLib cyrus_sasl}/lib/libsasl2.so $out/lib/native/libsasl2.so.2
        ln -s ${getLib openssl}/lib/libcrypto.so $out/lib/native/
        ln -s ${getLib zlib}/lib/libz.so.1 $out/lib/native/
        ln -s ${getLib zstd}/lib/libzstd.so.1 $out/lib/native/
        ln -s ${getLib bzip2}/lib/libbz2.so.1 $out/lib/native/
        ln -s ${getLib snappy}/lib/libsnappy.so.1 $out/lib/native/

        # libjvm.so lives at different paths in JDK 8 and JDK 11;
        # libnativetask.so in hadoop 3 and libhdfs.so in hadoop 2 depend on it
        find $out/lib/native/ -name 'libnativetask.so*' -o -name 'libhdfs.so*' | \
          xargs -n1 patchelf --add-rpath $(dirname $(find ${finalAttrs.jdk.home} -name libjvm.so | head -n1))

        # NixOS/nixpkgs#193370
        # This workaround is needed to use protobuf 3.19
        # hadoop 3.3+ depends on protobuf 3.18, hadoop 3.2 depends on protobuf 3.8
        find $out/lib/native -name 'libhdfspp.so*' | \
          xargs -r -n1 patchelf --replace-needed libprotobuf.so.${
            if (versionAtLeast finalAttrs.version "3.3") then "18"
            else "8"
          } libprotobuf.so

        # The bundled binaries still reference OpenSSL 1.1's soname;
        # rewrite it to the unversioned libcrypto.so symlinked into
        # lib/native above.
        patchelf --replace-needed libcrypto.so.1.1 libcrypto.so \
          $out/lib/native/{libhdfs{pp,}.so*,examples/{pipes-sort,wordcount-nopipe,wordcount-part,wordcount-simple}}

      '' + ''
        for n in $(find $out/bin -type f ! -name "*.*"); do
          wrapProgram "$n"\
            --set-default JAVA_HOME ${finalAttrs.jdk.home}\
            --set-default HADOOP_HOME $out/\
            --run "test -d /etc/hadoop-conf && export HADOOP_CONF_DIR=\''${HADOOP_CONF_DIR-'/etc/hadoop-conf/'}"\
            --set-default HADOOP_CONF_DIR $out/etc/hadoop/\
            --prefix PATH : "${makeBinPath [ bash coreutils which ]}"\
            --prefix JAVA_LIBRARY_PATH : "${makeLibraryPath finalAttrs.buildInputs}"
        done
      '' + (optionalString sparkSupport ''
        # Add the spark shuffle service jar to YARN
        cp ${spark.src}/yarn/spark-${spark.version}-yarn-shuffle.jar $out/share/hadoop/yarn/
      '');

      passthru = { inherit tests; };

      meta = recursiveUpdate {
        homepage = "https://hadoop.apache.org/";
        description = "Framework for distributed processing of large data sets across clusters of computers";
        license = licenses.asl20;
        sourceProvenance = with sourceTypes; [ binaryBytecode ];

        longDescription = ''
          The Apache Hadoop software library is a framework that allows for
          the distributed processing of large data sets across clusters of
          computers using a simple programming model. It is designed to
          scale up from single servers to thousands of machines, each
          offering local computation and storage. Rather than rely on
          hardware to deliver high-availability, the library itself is
          designed to detect and handle failures at the application layer,
          thus delivering a highly-available service on top of a cluster of
          computers, each of which may be prone to failures.
        '';
        maintainers = with maintainers; [ illustris ];
        platforms = attrNames platformAttrs;
      } (attrByPath [ stdenv.system "meta" ] {} platformAttrs);
    });
in
{
  # Different versions of Hadoop support different Java runtime versions
  # https://cwiki.apache.org/confluence/display/HADOOP/Hadoop+Java+Versions
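  # A hypothetical override sketch (not part of this file): since the
  # derivation is built from a finalAttrs fixpoint, the JDK referenced by
  # the wrappers can be swapped after the fact, e.g.
  #   hadoop_3_3.overrideAttrs (_: { jdk = jdk8_headless; })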
  hadoop_3_3 = common rec {
    pname = "hadoop";
    platformAttrs = rec {
      x86_64-linux = {
        version = "3.3.6";
        hash = "sha256-9RlQWcDUECrap//xf3sqhd+Qa8tuGZSHFjGfmXhkGgQ=";
        srcHash = "sha256-4OEsVhBNV9CJ+PN4FgCduUCVA9/el5yezSCZ6ko3+bU=";
      };
      x86_64-darwin = x86_64-linux;
      aarch64-linux = x86_64-linux // {
        hash = "sha256-5Lv2uA72BJEva5v2yncyPe5gKNCNOPNsoHffVt6KXQ0=";
      };
      aarch64-darwin = aarch64-linux;
    };
    jdk = jdk11_headless;
    # TODO: Package and add Intel Storage Acceleration Library
    tests = nixosTests.hadoop;
  };
  hadoop_3_2 = common {
    pname = "hadoop";
    platformAttrs.x86_64-linux = {
      version = "3.2.4";
      hash = "sha256-qt2gpMr+NHuiVR+/zFRzRyRKG725/ZNBIM69z9J9wNw=";
      srcHash = "sha256-F9nGD3mZZ1eJf3Ec3AJGE9YBcL/HiagskcdKQhCn/sw=";
    };
    jdk = jdk8_headless;
    tests = nixosTests.hadoop_3_2;
  };
  hadoop2 = common rec {
    pname = "hadoop";
    platformAttrs.x86_64-linux = {
      version = "2.10.2";
      hash = "sha256-xhA4zxqIRGNhIeBnJO9dLKf/gx/Bq+uIyyZwsIafEyo=";
      srcHash = "sha256-ucxCyXiJo8aL6aNMhZgKEbn8sGKOoMPVREbMGSfSdAI=";
    };
    jdk = jdk8_headless;
    tests = nixosTests.hadoop2;
  };
}