diff options
author | xeji <36407913+xeji@users.noreply.github.com> | 2018-10-31 00:16:11 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2018-10-31 00:16:11 +0100 |
commit | 6efd811062849402f1f44952fd5d7cd283d72baf (patch) | |
tree | c207bdf63000dc1385399a32efbc220137af58c0 /nixos | |
parent | 933874ef08e4a110e29cb50752b5e7bfed6925f5 (diff) | |
parent | b388beeca3a3a2dd79ba2f2737deb294d4d4748b (diff) | |
download | nixlib-6efd811062849402f1f44952fd5d7cd283d72baf.tar nixlib-6efd811062849402f1f44952fd5d7cd283d72baf.tar.gz nixlib-6efd811062849402f1f44952fd5d7cd283d72baf.tar.bz2 nixlib-6efd811062849402f1f44952fd5d7cd283d72baf.tar.lz nixlib-6efd811062849402f1f44952fd5d7cd283d72baf.tar.xz nixlib-6efd811062849402f1f44952fd5d7cd283d72baf.tar.zst nixlib-6efd811062849402f1f44952fd5d7cd283d72baf.zip |
Merge pull request #49348 from markuskowa/mod-slurm-upgrade
nixos/slurm: add slurmdbd, run daemons as user
Diffstat (limited to 'nixos')
-rw-r--r-- | nixos/doc/manual/release-notes/rl-1903.xml | 20 | ||||
-rw-r--r-- | nixos/modules/misc/ids.nix | 2 | ||||
-rw-r--r-- | nixos/modules/services/computing/slurm/slurm.nix | 113 | ||||
-rw-r--r-- | nixos/tests/slurm.nix | 60 |
4 files changed, 174 insertions, 21 deletions
diff --git a/nixos/doc/manual/release-notes/rl-1903.xml b/nixos/doc/manual/release-notes/rl-1903.xml index 9c07184613e4..1d9b6ecc0e2e 100644 --- a/nixos/doc/manual/release-notes/rl-1903.xml +++ b/nixos/doc/manual/release-notes/rl-1903.xml @@ -152,6 +152,26 @@ has been renamed to <varname>postgresql_9_6</varname>. </para> </listitem> + <listitem> + <para> + Slurm introduces the new option + <literal>services.slurm.stateSaveLocation</literal>, + which is now set to <literal>/var/spool/slurm</literal> by default + (instead of <literal>/var/spool</literal>). + Make sure to move all files to the new directory or to set the option accordingly. + </para> + <para> + The slurmctld now runs as user <literal>slurm</literal> instead of <literal>root</literal>. + If you want to keep slurmctld running as <literal>root</literal>, set + <literal>services.slurm.user = root</literal>. + </para> + <para> + The options <literal>services.slurm.nodeName</literal> and + <literal>services.slurm.partitionName</literal> are now sets of + strings to correctly reflect that fact that each of these + options can occour more than once in the configuration. + </para> + </listitem> </itemizedlist> </section> diff --git a/nixos/modules/misc/ids.nix b/nixos/modules/misc/ids.nix index 5c30e512a1b3..a32e4fe3f7c8 100644 --- a/nixos/modules/misc/ids.nix +++ b/nixos/modules/misc/ids.nix @@ -331,6 +331,7 @@ zeronet = 304; lirc = 305; lidarr = 306; + slurm = 307; # When adding a uid, make sure it doesn't match an existing gid. And don't use uids above 399! @@ -622,6 +623,7 @@ zeronet = 304; lirc = 305; lidarr = 306; + slurm = 307; # When adding a gid, make sure it doesn't match an existing # uid. Users and groups with the same name should have equal diff --git a/nixos/modules/services/computing/slurm/slurm.nix b/nixos/modules/services/computing/slurm/slurm.nix index 09174ed39f5e..cd481212db2d 100644 --- a/nixos/modules/services/computing/slurm/slurm.nix +++ b/nixos/modules/services/computing/slurm/slurm.nix @@ -6,13 +6,18 @@ let cfg = config.services.slurm; # configuration file can be generated by http://slurm.schedmd.com/configurator.html + + defaultUser = "slurm"; + configFile = pkgs.writeTextDir "slurm.conf" '' ClusterName=${cfg.clusterName} + StateSaveLocation=${cfg.stateSaveLocation} + SlurmUser=${cfg.user} ${optionalString (cfg.controlMachine != null) ''controlMachine=${cfg.controlMachine}''} ${optionalString (cfg.controlAddr != null) ''controlAddr=${cfg.controlAddr}''} - ${optionalString (cfg.nodeName != null) ''nodeName=${cfg.nodeName}''} - ${optionalString (cfg.partitionName != null) ''partitionName=${cfg.partitionName}''} + ${toString (map (x: "NodeName=${x}\n") cfg.nodeName)} + ${toString (map (x: "PartitionName=${x}\n") cfg.partitionName)} PlugStackConfig=${plugStackConfig} ProctrackType=${cfg.procTrackType} ${cfg.extraConfig} @@ -24,12 +29,19 @@ let ${cfg.extraPlugstackConfig} ''; - cgroupConfig = pkgs.writeTextDir "cgroup.conf" '' ${cfg.extraCgroupConfig} ''; + slurmdbdConf = pkgs.writeTextDir "slurmdbd.conf" + '' + DbdHost=${cfg.dbdserver.dbdHost} + SlurmUser=${cfg.user} + StorageType=accounting_storage/mysql + ${cfg.dbdserver.extraConfig} + ''; + # slurm expects some additional config files to be # in the same directory as slurm.conf etcSlurm = pkgs.symlinkJoin { @@ -43,6 +55,8 @@ in ###### interface + meta.maintainers = [ maintainers.markuskowa ]; + options = { services.slurm = { @@ -60,6 +74,27 @@ in }; }; + dbdserver = { + enable = mkEnableOption "SlurmDBD service"; + + dbdHost = mkOption { + type = types.str; + default = config.networking.hostName; + description = '' + Hostname of the machine where <literal>slurmdbd</literal> + is running (i.e. name returned by <literal>hostname -s</literal>). + ''; + }; + + extraConfig = mkOption { + type = types.lines; + default = ""; + description = '' + Extra configuration for <literal>slurmdbd.conf</literal> + ''; + }; + }; + client = { enable = mkEnableOption "slurm client daemon"; }; @@ -116,9 +151,9 @@ in }; nodeName = mkOption { - type = types.nullOr types.str; - default = null; - example = "linux[1-32] CPUs=1 State=UNKNOWN"; + type = types.listOf types.str; + default = []; + example = literalExample ''[ "linux[1-32] CPUs=1 State=UNKNOWN" ];''; description = '' Name that SLURM uses to refer to a node (or base partition for BlueGene systems). Typically this would be the string that "/bin/hostname -s" @@ -127,9 +162,9 @@ in }; partitionName = mkOption { - type = types.nullOr types.str; - default = null; - example = "debug Nodes=linux[1-32] Default=YES MaxTime=INFINITE State=UP"; + type = types.listOf types.str; + default = []; + example = literalExample ''[ "debug Nodes=linux[1-32] Default=YES MaxTime=INFINITE State=UP" ];''; description = '' Name by which the partition may be referenced. Note that now you have to write the partition's parameters after the name. @@ -150,7 +185,7 @@ in }; procTrackType = mkOption { - type = types.string; + type = types.str; default = "proctrack/linuxproc"; description = '' Plugin to be used for process tracking on a job step basis. @@ -159,6 +194,25 @@ in ''; }; + stateSaveLocation = mkOption { + type = types.str; + default = "/var/spool/slurmctld"; + description = '' + Directory into which the Slurm controller, slurmctld, saves its state. + ''; + }; + + user = mkOption { + type = types.str; + default = defaultUser; + description = '' + Set this option when you want to run the slurmctld daemon + as something else than the default slurm user "slurm". + Note that the UID of this user needs to be the same + on all nodes. + ''; + }; + extraConfig = mkOption { default = ""; type = types.lines; @@ -184,6 +238,8 @@ in used when <literal>procTrackType=proctrack/cgroup</literal>. ''; }; + + }; }; @@ -220,12 +276,24 @@ in ''; }; - in mkIf (cfg.enableStools || cfg.client.enable || cfg.server.enable) { + in mkIf ( cfg.enableStools || + cfg.client.enable || + cfg.server.enable || + cfg.dbdserver.enable ) { environment.systemPackages = [ wrappedSlurm ]; services.munge.enable = mkDefault true; + # use a static uid as default to ensure it is the same on all nodes + users.users.slurm = mkIf (cfg.user == defaultUser) { + name = defaultUser; + group = "slurm"; + uid = config.ids.uids.slurm; + }; + + users.groups.slurm.gid = config.ids.uids.slurm; + systemd.services.slurmd = mkIf (cfg.client.enable) { path = with pkgs; [ wrappedSlurm coreutils ] ++ lib.optional cfg.enableSrunX11 slurm-spank-x11; @@ -261,6 +329,29 @@ in PIDFile = "/run/slurmctld.pid"; ExecReload = "${pkgs.coreutils}/bin/kill -HUP $MAINPID"; }; + + preStart = '' + mkdir -p ${cfg.stateSaveLocation} + chown -R ${cfg.user}:slurm ${cfg.stateSaveLocation} + ''; + }; + + systemd.services.slurmdbd = mkIf (cfg.dbdserver.enable) { + path = with pkgs; [ wrappedSlurm munge coreutils ]; + + wantedBy = [ "multi-user.target" ]; + after = [ "network.target" "munged.service" "mysql.service" ]; + requires = [ "munged.service" "mysql.service" ]; + + # slurm strips the last component off the path + environment.SLURM_CONF = "${slurmdbdConf}/slurm.conf"; + + serviceConfig = { + Type = "forking"; + ExecStart = "${cfg.package}/bin/slurmdbd"; + PIDFile = "/run/slurmdbd.pid"; + ExecReload = "${pkgs.coreutils}/bin/kill -HUP $MAINPID"; + }; }; }; diff --git a/nixos/tests/slurm.nix b/nixos/tests/slurm.nix index 60f44c3c8459..7f9c266cbff6 100644 --- a/nixos/tests/slurm.nix +++ b/nixos/tests/slurm.nix @@ -1,22 +1,27 @@ -import ./make-test.nix ({ ... }: -let mungekey = "mungeverryweakkeybuteasytointegratoinatest"; +import ./make-test.nix ({ lib, ... }: +let + mungekey = "mungeverryweakkeybuteasytointegratoinatest"; + slurmconfig = { controlMachine = "control"; - nodeName = '' - control - NodeName=node[1-3] CPUs=1 State=UNKNOWN + nodeName = [ "node[1-3] CPUs=1 State=UNKNOWN" ]; + partitionName = [ "debug Nodes=node[1-3] Default=YES MaxTime=INFINITE State=UP" ]; + extraConfig = '' + AccountingStorageHost=dbd + AccountingStorageType=accounting_storage/slurmdbd ''; - partitionName = "debug Nodes=node[1-3] Default=YES MaxTime=INFINITE State=UP"; }; in { name = "slurm"; + meta.maintainers = [ lib.maintainers.markuskowa ]; + nodes = let computeNode = { ...}: { - # TODO slrumd port and slurmctld port should be configurations and + # TODO slurmd port and slurmctld port should be configurations and # automatically allowed by the firewall. networking.firewall.enable = false; services.slurm = { @@ -43,6 +48,24 @@ in { } // slurmconfig; }; + dbd = + { pkgs, ... } : + { + networking.firewall.enable = false; + services.slurm.dbdserver = { + enable = true; + }; + services.mysql = { + enable = true; + package = pkgs.mysql; + ensureDatabases = [ "slurm_acct_db" ]; + ensureUsers = [{ + ensurePermissions = { "slurm_acct_db.*" = "ALL PRIVILEGES"; }; + name = "slurm"; + }]; + }; + }; + node1 = computeNode; node2 = computeNode; node3 = computeNode; @@ -54,7 +77,7 @@ in { startAll; # Set up authentification across the cluster - foreach my $node (($submit,$control,$node1,$node2,$node3)) + foreach my $node (($submit,$control,$dbd,$node1,$node2,$node3)) { $node->waitForUnit("default.target"); @@ -63,10 +86,22 @@ in { $node->succeed("chmod 0400 /etc/munge/munge.key"); $node->succeed("chown munge:munge /etc/munge/munge.key"); $node->succeed("systemctl restart munged"); - } + + $node->waitForUnit("munged"); + }; # Restart the services since they have probably failed due to the munge init # failure + subtest "can_start_slurmdbd", sub { + $dbd->succeed("systemctl restart slurmdbd"); + $dbd->waitForUnit("slurmdbd.service"); + }; + + # there needs to be an entry for the current + # cluster in the database before slurmctld is restarted + subtest "add_account", sub { + $control->succeed("sacctmgr -i add cluster default"); + }; subtest "can_start_slurmctld", sub { $control->succeed("systemctl restart slurmctld"); @@ -81,12 +116,17 @@ in { } }; - # Test that the cluster work and can distribute jobs; + # Test that the cluster works and can distribute jobs; subtest "run_distributed_command", sub { # Run `hostname` on 3 nodes of the partition (so on all the 3 nodes). # The output must contain the 3 different names $submit->succeed("srun -N 3 hostname | sort | uniq | wc -l | xargs test 3 -eq"); }; + + subtest "check_slurm_dbd", sub { + # find the srun job from above in the database + $submit->succeed("sacct | grep hostname"); + }; ''; }) |