author    Markus Kowalewski <markus.kowalewski@gmail.com>  2018-09-15 13:09:36 +0200
committer Markus Kowalewski <markus.kowalewski@gmail.com>  2018-10-30 19:50:52 +0100
commit    79c9dbfb40d5123f18ed64485265d8bc19209091 (patch)
tree      6d433e30f896ecc3dff1cd3af62559062dfa8ebd /nixos
parent    111d4eb090cdd71b227701842dc67f3bc6dc5f44 (diff)
nixos/slurm: add slurmdbd to module
* New options "services.slurm.dbdserver.[enable,dbdHost,extraConfig]"
* Add slurmdbd to test slurm.nix
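
As a usage illustration only (not part of this commit), enabling the new
daemon on a database host could look roughly like this; dbdHost falls back
to networking.hostName, so it usually does not need to be set explicitly:

    services.slurm.dbdserver = {
      enable = true;
      # dbdHost defaults to config.networking.hostName
    };
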
Diffstat (limited to 'nixos'):
 -rw-r--r--  nixos/modules/services/computing/slurm/slurm.nix | 55
 -rw-r--r--  nixos/tests/slurm.nix                            | 51
 2 files changed, 99 insertions(+), 7 deletions(-)
diff --git a/nixos/modules/services/computing/slurm/slurm.nix b/nixos/modules/services/computing/slurm/slurm.nix
index 9dbb6a4d0d93..77b9c40577a3 100644
--- a/nixos/modules/services/computing/slurm/slurm.nix
+++ b/nixos/modules/services/computing/slurm/slurm.nix
@@ -29,12 +29,19 @@ let
       ${cfg.extraPlugstackConfig}
     '';
 
-
   cgroupConfig = pkgs.writeTextDir "cgroup.conf"
    ''
      ${cfg.extraCgroupConfig}
    '';
 
+  slurmdbdConf = pkgs.writeTextDir "slurmdbd.conf"
+   ''
+     DbdHost=${cfg.dbdserver.dbdHost}
+     SlurmUser=${cfg.user}
+     StorageType=accounting_storage/mysql
+     ${cfg.dbdserver.extraConfig}
+   '';
+
   # slurm expects some additional config files to be
   # in the same directory as slurm.conf
   etcSlurm = pkgs.symlinkJoin {
@@ -65,6 +72,27 @@ in
         };
       };
 
+      dbdserver = {
+        enable = mkEnableOption "SlurmDBD service";
+
+        dbdHost = mkOption {
+          type = types.str;
+          default = config.networking.hostName;
+          description = ''
+            Hostname of the machine where <literal>slurmdbd</literal>
+            is running (i.e. name returned by <literal>hostname -s</literal>).
+          '';
+        };
+
+        extraConfig = mkOption {
+          type = types.lines;
+          default = "";
+          description = ''
+            Extra configuration for <literal>slurmdbd.conf</literal>
+          '';
+        };
+      };
+
       client = {
         enable = mkEnableOption "slurm client daemon";
       };
@@ -208,6 +236,8 @@ in
           used when <literal>procTrackType=proctrack/cgroup</literal>.
         '';
       };
+
+
     };
 
   };
@@ -244,7 +274,10 @@ in
         '';
       };
 
-  in mkIf (cfg.enableStools || cfg.client.enable || cfg.server.enable) {
+  in mkIf ( cfg.enableStools ||
+            cfg.client.enable ||
+            cfg.server.enable ||
+            cfg.dbdserver.enable ) {
 
     environment.systemPackages = [ wrappedSlurm ];
 
@@ -301,6 +334,24 @@ in
       '';
     };
 
+    systemd.services.slurmdbd = mkIf (cfg.dbdserver.enable) {
+      path = with pkgs; [ wrappedSlurm munge coreutils ];
+
+      wantedBy = [ "multi-user.target" ];
+      after = [ "network.target" "munged.service" "mysql.service" ];
+      requires = [ "munged.service" "mysql.service" ];
+
+      # slurm strips the last component off the path
+      environment.SLURM_CONF = "${slurmdbdConf}/slurm.conf";
+
+      serviceConfig = {
+        Type = "forking";
+        ExecStart = "${cfg.package}/bin/slurmdbd";
+        PIDFile = "/run/slurmdbd.pid";
+        ExecReload = "${pkgs.coreutils}/bin/kill -HUP $MAINPID";
+      };
+    };
+
   };
 
 }
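
The generated slurmdbd.conf above only sets DbdHost, SlurmUser, and
StorageType=accounting_storage/mysql; database credentials and similar
settings are expected to come in through dbdserver.extraConfig. A minimal
sketch, with purely illustrative values that are not taken from this commit:

    services.slurm.dbdserver.extraConfig = ''
      StorageHost=localhost
      StorageUser=slurm
      StoragePass=changeme
      StorageLoc=slurm_acct_db
    '';
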
diff --git a/nixos/tests/slurm.nix b/nixos/tests/slurm.nix
index 60f44c3c8459..54ea1ee7894e 100644
--- a/nixos/tests/slurm.nix
+++ b/nixos/tests/slurm.nix
@@ -1,5 +1,7 @@
 import ./make-test.nix ({ ... }:
-let mungekey = "mungeverryweakkeybuteasytointegratoinatest";
+let
+    mungekey = "mungeverryweakkeybuteasytointegratoinatest";
+
     slurmconfig = {
       controlMachine = "control";
       nodeName = ''
@@ -7,6 +9,10 @@ let mungekey = "mungeverryweakkeybuteasytointegratoinatest";
         NodeName=node[1-3] CPUs=1 State=UNKNOWN
       '';
       partitionName = "debug Nodes=node[1-3] Default=YES MaxTime=INFINITE State=UP";
+      extraConfig = ''
+        AccountingStorageHost=dbd
+        AccountingStorageType=accounting_storage/slurmdbd
+      '';
     };
 in {
   name = "slurm";
@@ -16,7 +22,7 @@ in {
     computeNode =
       { ...}:
       {
-        # TODO slrumd port and slurmctld port should be configurations and
+        # TODO slurmd port and slurmctld port should be configurations and
         # automatically allowed by the  firewall.
         networking.firewall.enable = false;
         services.slurm = {
@@ -43,6 +49,24 @@ in {
         } // slurmconfig;
       };
 
+    dbd =
+      { pkgs, ... } :
+      {
+        networking.firewall.enable = false;
+        services.slurm.dbdserver = {
+          enable = true;
+        };
+        services.mysql = {
+          enable = true;
+          package = pkgs.mysql;
+          ensureDatabases = [ "slurm_acct_db" ];
+          ensureUsers = [{
+            ensurePermissions = { "slurm_acct_db.*" = "ALL PRIVILEGES"; };
+            name = "slurm";
+          }];
+        };
+      };
+
     node1 = computeNode;
     node2 = computeNode;
     node3 = computeNode;
@@ -54,7 +78,7 @@ in {
   startAll;
 
   # Set up authentification across the cluster
-  foreach my $node (($submit,$control,$node1,$node2,$node3))
+  foreach my $node (($submit,$control,$dbd,$node1,$node2,$node3))
   {
     $node->waitForUnit("default.target");
 
@@ -63,10 +87,22 @@ in {
     $node->succeed("chmod 0400 /etc/munge/munge.key");
     $node->succeed("chown munge:munge /etc/munge/munge.key");
     $node->succeed("systemctl restart munged");
-  }
+
+    $node->waitForUnit("munged");
+  };
 
   # Restart the services since they have probably failed due to the munge init
   # failure
+  subtest "can_start_slurmdbd", sub {
+    $dbd->succeed("systemctl restart slurmdbd");
+    $dbd->waitForUnit("slurmdbd.service");
+  };
+
+  # there needs to be an entry for the current
+  # cluster in the database before slurmctld is restarted
+  subtest "add_account", sub {
+    $control->succeed("sacctmgr -i add cluster default");
+  };
 
   subtest "can_start_slurmctld", sub {
     $control->succeed("systemctl restart slurmctld");
@@ -81,12 +117,17 @@ in {
     }
   };
 
-  # Test that the cluster work and can distribute jobs;
+  # Test that the cluster works and can distribute jobs;
 
   subtest "run_distributed_command", sub {
     # Run `hostname` on 3 nodes of the partition (so on all the 3 nodes).
     # The output must contain the 3 different names
     $submit->succeed("srun -N 3 hostname | sort | uniq | wc -l | xargs test 3 -eq");
   };
+
+  subtest "check_slurm_dbd", sub {
+    # find the srun job from above in the database
+    $submit->succeed("sacct | grep hostname");
+  };
   '';
 })