Diffstat (limited to 'nixpkgs/nixos/tests/slurm.nix')
-rw-r--r-- | nixpkgs/nixos/tests/slurm.nix | 168 |
1 file changed, 168 insertions, 0 deletions
diff --git a/nixpkgs/nixos/tests/slurm.nix b/nixpkgs/nixos/tests/slurm.nix
new file mode 100644
index 000000000000..ad516b6e8d2b
--- /dev/null
+++ b/nixpkgs/nixos/tests/slurm.nix
@@ -0,0 +1,168 @@
+import ./make-test-python.nix ({ lib, pkgs, ... }:
+let
+  slurmconfig = {
+    services.slurm = {
+      controlMachine = "control";
+      nodeName = [ "node[1-3] CPUs=1 State=UNKNOWN" ];
+      partitionName = [ "debug Nodes=node[1-3] Default=YES MaxTime=INFINITE State=UP" ];
+      extraConfig = ''
+        AccountingStorageHost=dbd
+        AccountingStorageType=accounting_storage/slurmdbd
+      '';
+    };
+    environment.systemPackages = [ mpitest ];
+    networking.firewall.enable = false;
+    systemd.tmpfiles.rules = [
+      "f /etc/munge/munge.key 0400 munge munge - mungeverryweakkeybuteasytointegratoinatest"
+    ];
+  };
+
+  mpitest = let
+    mpitestC = pkgs.writeText "mpitest.c" ''
+      #include <stdio.h>
+      #include <stdlib.h>
+      #include <mpi.h>
+
+      int
+      main (int argc, char *argv[])
+      {
+        int rank, size, length;
+        char name[512];
+
+        MPI_Init (&argc, &argv);
+        MPI_Comm_rank (MPI_COMM_WORLD, &rank);
+        MPI_Comm_size (MPI_COMM_WORLD, &size);
+        MPI_Get_processor_name (name, &length);
+
+        if ( rank == 0 ) printf("size=%d\n", size);
+
+        printf ("%s: hello world from process %d of %d\n", name, rank, size);
+
+        MPI_Finalize ();
+
+        return EXIT_SUCCESS;
+      }
+    '';
+  in pkgs.runCommand "mpitest" {} ''
+    mkdir -p $out/bin
+    ${lib.getDev pkgs.mpi}/bin/mpicc ${mpitestC} -o $out/bin/mpitest
+  '';
+in {
+  name = "slurm";
+
+  meta.maintainers = [ lib.maintainers.markuskowa ];
+
+  nodes =
+    let
+      computeNode =
+        { ... }:
+        {
+          imports = [ slurmconfig ];
+          # TODO slurmd port and slurmctld port should be configurations and
+          # automatically allowed by the firewall.
+          services.slurm = {
+            client.enable = true;
+          };
+        };
+    in {
+
+      control =
+        { ... }:
+        {
+          imports = [ slurmconfig ];
+          services.slurm = {
+            server.enable = true;
+          };
+        };
+
+      submit =
+        { ... }:
+        {
+          imports = [ slurmconfig ];
+          services.slurm = {
+            enableStools = true;
+          };
+        };
+
+      dbd =
+        { pkgs, ... }:
+        let
+          passFile = pkgs.writeText "dbdpassword" "password123";
+        in {
+          networking.firewall.enable = false;
+          systemd.tmpfiles.rules = [
+            "f /etc/munge/munge.key 0400 munge munge - mungeverryweakkeybuteasytointegratoinatest"
+          ];
+          services.slurm.dbdserver = {
+            enable = true;
+            storagePassFile = "${passFile}";
+          };
+          services.mysql = {
+            enable = true;
+            package = pkgs.mariadb;
+            initialScript = pkgs.writeText "mysql-init.sql" ''
+              CREATE USER 'slurm'@'localhost' IDENTIFIED BY 'password123';
+              GRANT ALL PRIVILEGES ON slurm_acct_db.* TO 'slurm'@'localhost';
+            '';
+            ensureDatabases = [ "slurm_acct_db" ];
+            ensureUsers = [{
+              ensurePermissions = { "slurm_acct_db.*" = "ALL PRIVILEGES"; };
+              name = "slurm";
+            }];
+            settings.mysqld = {
+              # recommendations from: https://slurm.schedmd.com/accounting.html#mysql-configuration
+              innodb_buffer_pool_size = "1024M";
+              innodb_log_file_size = "64M";
+              innodb_lock_wait_timeout = 900;
+            };
+          };
+        };
+
+      node1 = computeNode;
+      node2 = computeNode;
+      node3 = computeNode;
+    };
+
+  testScript =
+    ''
+    start_all()
+
+    # Make sure slurmdbd is up after the database initialization
+    with subtest("can_start_slurmdbd"):
+        dbd.succeed("systemctl restart slurmdbd")
+        dbd.wait_for_unit("slurmdbd.service")
+        dbd.wait_for_open_port(6819)
+
+    # There needs to be an entry for the current
+    # cluster in the database before slurmctld is restarted.
+    with subtest("add_account"):
+        control.succeed("sacctmgr -i add cluster default")
+        # check for the cluster entry
+        control.succeed("sacctmgr list cluster | awk '{ print $1 }' | grep default")
+
+    with subtest("can_start_slurmctld"):
+        control.succeed("systemctl restart slurmctld")
+        control.wait_for_unit("slurmctld.service")
+
+    with subtest("can_start_slurmd"):
+        for node in [node1, node2, node3]:
+            node.succeed("systemctl restart slurmd.service")
+            node.wait_for_unit("slurmd")
+
+    # Test that the cluster works and can distribute jobs.
+
+    with subtest("run_distributed_command"):
+        # Run `hostname` on 3 nodes of the partition (i.e. on all 3 nodes).
+        # The output must contain 3 different names.
+        submit.succeed("srun -N 3 hostname | sort | uniq | wc -l | xargs test 3 -eq")
+
+    with subtest("check_slurm_dbd"):
+        # find the srun job from above in the accounting database
+        control.succeed("sleep 5")
+        control.succeed("sacct | grep hostname")
+
+    with subtest("run_PMIx_mpitest"):
+        submit.succeed("srun -N 3 --mpi=pmix mpitest | grep size=3")
+    '';
+})
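
The compute-node module above carries a TODO about the slurmd and slurmctld ports: the test simply disables the firewall on every node. As a point of reference only, a minimal sketch of opening those ports explicitly follows. The port numbers are SLURM's documented defaults (slurmctld 6817, slurmd 6818, slurmdbd 6819, the last one matching the wait_for_open_port(6819) check in the test script); they are not values taken from this commit.

    # Hedged sketch, not part of the commit: keep the firewall enabled and
    # open SLURM's default ports instead of `networking.firewall.enable = false`.
    {
      networking.firewall = {
        enable = true;
        allowedTCPPorts = [
          6817  # slurmctld (SlurmctldPort default)
          6818  # slurmd (SlurmdPort default)
          6819  # slurmdbd (DbdPort default)
        ];
      };
    }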
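
For readers wondering how a file like this is normally exercised: NixOS VM tests under nixos/tests/ are usually registered in nixos/tests/all-tests.nix and then built as an attribute of nixosTests. The snippet below is a hypothetical sketch of that registration, not something shown in this diff; the handleTest helper and the nixosTests.slurm attribute path are assumptions about the surrounding tree.

    # Hypothetical entry in nixos/tests/all-tests.nix (assumption, not part
    # of this commit): expose the file as the `nixosTests.slurm` attribute.
    slurm = handleTest ./slurm.nix {};

With such an entry in place, the six-VM test (control, dbd, submit, node1-3) would typically be built and run headlessly with `nix-build -A nixosTests.slurm` from the repository's nixpkgs root.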