Diffstat (limited to 'nixpkgs/nixos/tests/slurm.nix')
-rw-r--r-- | nixpkgs/nixos/tests/slurm.nix | 142
1 files changed, 142 insertions, 0 deletions
diff --git a/nixpkgs/nixos/tests/slurm.nix b/nixpkgs/nixos/tests/slurm.nix
new file mode 100644
index 000000000000..4c2cd3c3d264
--- /dev/null
+++ b/nixpkgs/nixos/tests/slurm.nix
@@ -0,0 +1,142 @@
+import ./make-test.nix ({ lib, ... }:
+let
+    mungekey = "mungeverryweakkeybuteasytointegratoinatest";
+
+    slurmconfig = {
+      controlMachine = "control";
+      nodeName = [ "node[1-3] CPUs=1 State=UNKNOWN" ];
+      partitionName = [ "debug Nodes=node[1-3] Default=YES MaxTime=INFINITE State=UP" ];
+      extraConfig = ''
+        AccountingStorageHost=dbd
+        AccountingStorageType=accounting_storage/slurmdbd
+      '';
+    };
+in {
+  name = "slurm";
+
+  meta.maintainers = [ lib.maintainers.markuskowa ];
+
+  nodes =
+    let
+    computeNode =
+      { ...}:
+      {
+        # TODO slurmd port and slurmctld port should be configurable and
+        # automatically allowed by the firewall.
+        networking.firewall.enable = false;
+        services.slurm = {
+          client.enable = true;
+        } // slurmconfig;
+      };
+    in {
+
+    control =
+      { ...}:
+      {
+        networking.firewall.enable = false;
+        services.slurm = {
+          server.enable = true;
+        } // slurmconfig;
+      };
+
+    submit =
+      { ...}:
+      {
+        networking.firewall.enable = false;
+        services.slurm = {
+          enableStools = true;
+        } // slurmconfig;
+      };
+
+    dbd =
+      { pkgs, ... } :
+      {
+        networking.firewall.enable = false;
+        services.slurm.dbdserver = {
+          enable = true;
+        };
+        services.mysql = {
+          enable = true;
+          package = pkgs.mysql;
+          ensureDatabases = [ "slurm_acct_db" ];
+          ensureUsers = [{
+            ensurePermissions = { "slurm_acct_db.*" = "ALL PRIVILEGES"; };
+            name = "slurm";
+          }];
+          extraOptions = ''
+            # recommendations from: https://slurm.schedmd.com/accounting.html#mysql-configuration
+            innodb_buffer_pool_size=1024M
+            innodb_log_file_size=64M
+            innodb_lock_wait_timeout=900
+          '';
+        };
+      };
+
+    node1 = computeNode;
+    node2 = computeNode;
+    node3 = computeNode;
+  };
+
+
+  testScript =
+  ''
+  startAll;
+
+  # Set up authentication across the cluster
+  foreach my $node (($submit,$control,$dbd,$node1,$node2,$node3))
+  {
+    $node->waitForUnit("default.target");
+
+    $node->succeed("mkdir /etc/munge");
+    $node->succeed("echo '${mungekey}' > /etc/munge/munge.key");
+    $node->succeed("chmod 0400 /etc/munge/munge.key");
+    $node->succeed("chown munge:munge /etc/munge/munge.key");
+    $node->succeed("systemctl restart munged");
+
+    $node->waitForUnit("munged");
+  };
+
+  # Restart the services since they have probably failed due to the munge init
+  # failure
+  subtest "can_start_slurmdbd", sub {
+    $dbd->succeed("systemctl restart slurmdbd");
+    $dbd->waitForUnit("slurmdbd.service");
+    $dbd->waitForOpenPort(6819);
+  };
+
+  # there needs to be an entry for the current
+  # cluster in the database before slurmctld is restarted
+  subtest "add_account", sub {
+    $control->succeed("sacctmgr -i add cluster default");
+    # check for cluster entry
+    $control->succeed("sacctmgr list cluster | awk '{ print \$1 }' | grep default");
+  };
+
+  subtest "can_start_slurmctld", sub {
+    $control->succeed("systemctl restart slurmctld");
+    $control->waitForUnit("slurmctld.service");
+  };
+
+  subtest "can_start_slurmd", sub {
+    foreach my $node (($node1,$node2,$node3))
+    {
+      $node->succeed("systemctl restart slurmd.service");
+      $node->waitForUnit("slurmd");
+    }
+  };
+
+  # Test that the cluster works and can distribute jobs
+
+  subtest "run_distributed_command", sub {
+    # Run `hostname` on 3 nodes of the partition (so on all the 3 nodes).
+    # The output must contain the 3 different names
+    $submit->succeed("srun -N 3 hostname | sort | uniq | wc -l | xargs test 3 -eq");
+  };
+
+  subtest "check_slurm_dbd", sub {
+    # find the srun job from above in the database
+    sleep 5;
+    $control->succeed("sacct | grep hostname");
+  };
+  '';
+})
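Tests written against make-test.nix evaluate to a buildable derivation, so a rough usage sketch (assuming a nixpkgs checkout, with the path given relative to the repository root) is to build the test file directly:

    $ nix-build nixos/tests/slurm.nix

This spins up the machines defined above (control, submit, dbd and node1-3) as VMs, runs the Perl test script, and fails the build if any subtest fails.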