summary refs log tree commit diff
path: root/nixos/modules/services/monitoring
diff options
context:
space:
mode:
Diffstat (limited to 'nixos/modules/services/monitoring')
-rw-r--r--nixos/modules/services/monitoring/cadvisor.nix1
-rw-r--r--nixos/modules/services/monitoring/prometheus/alertmanager.nix116
-rw-r--r--nixos/modules/services/monitoring/prometheus/default.nix445
-rw-r--r--nixos/modules/services/monitoring/prometheus/node-exporter.nix71
-rw-r--r--nixos/modules/services/monitoring/telegraf.nix71
-rw-r--r--nixos/modules/services/monitoring/ups.nix7
6 files changed, 708 insertions, 3 deletions
diff --git a/nixos/modules/services/monitoring/cadvisor.nix b/nixos/modules/services/monitoring/cadvisor.nix
index a67df158be47..8ae8b12056ce 100644
--- a/nixos/modules/services/monitoring/cadvisor.nix
+++ b/nixos/modules/services/monitoring/cadvisor.nix
@@ -90,6 +90,7 @@ in {
             ${optionalString cfg.storageDriverSecure "-storage_driver_secure"}
           ''}
         '';
+        TimeoutStartSec=300;
       };
     };
 
diff --git a/nixos/modules/services/monitoring/prometheus/alertmanager.nix b/nixos/modules/services/monitoring/prometheus/alertmanager.nix
new file mode 100644
index 000000000000..a9c0ce4ed6cb
--- /dev/null
+++ b/nixos/modules/services/monitoring/prometheus/alertmanager.nix
@@ -0,0 +1,116 @@
+{ config, pkgs, lib, ... }:
+
+with lib;
+
+let
+  cfg = config.services.prometheus.alertmanager;
+  mkConfigFile = pkgs.writeText "alertmanager.yml" (builtins.toJSON cfg.configuration);
+in {
+  options = {
+    services.prometheus.alertmanager = {
+      enable = mkEnableOption "Prometheus Alertmanager";
+
+      user = mkOption {
+        type = types.str;
+        default = "nobody";
+        description = ''
+          User name under which Alertmanager shall be run.
+        '';
+      };
+
+      group = mkOption {
+        type = types.str;
+        default = "nogroup";
+        description = ''
+          Group under which Alertmanager shall be run.
+        '';
+      };
+
+      configuration = mkOption {
+        type = types.attrs;
+        default = {};
+        description = ''
+          Alertmanager configuration as nix attribute set.
+        '';
+      };
+
+      logFormat = mkOption {
+        type = types.nullOr types.str;
+        default = null;
+        description = ''
+          If set use a syslog logger or JSON logging.
+        '';
+      };
+
+      logLevel = mkOption {
+        type = types.enum ["debug" "info" "warn" "error" "fatal"];
+        default = "warn";
+        description = ''
+          Only log messages with the given severity or above.
+        '';
+      };
+
+      webExternalUrl = mkOption {
+        type = types.nullOr types.str;
+        default = null;
+        description = ''
+          The URL under which Alertmanager is externally reachable (for example, if Alertmanager is served via a reverse proxy).
+          Used for generating relative and absolute links back to Alertmanager itself.
+          If the URL has a path portion, it will be used to prefix all HTTP endoints served by Alertmanager.
+          If omitted, relevant URL components will be derived automatically.
+        '';
+      };
+
+      listenAddress = mkOption {
+        type = types.nullOr types.str;
+        default = null;
+        description = ''
+          Address to listen on for the web interface and API.
+        '';
+      };
+
+      port = mkOption {
+        type = types.int;
+        default = 9093;
+        description = ''
+          Port to listen on for the web interface and API.
+        '';
+      };
+
+      openFirewall = mkOption {
+        type = types.bool;
+        default = false;
+        description = ''
+          Open port in firewall for incoming connections.
+        '';
+      };
+    };
+  };
+
+
+  config = mkIf cfg.enable {
+    networking.firewall.allowedTCPPorts = optional cfg.openFirewall cfg.port;
+
+    systemd.services.alertmanager = {
+      wantedBy = [ "multi-user.target" ];
+      after    = [ "network.target" ];
+      script = ''
+        ${pkgs.prometheus-alertmanager.bin}/bin/alertmanager \
+        -config.file ${mkConfigFile} \
+        -web.listen-address ${cfg.listenAddress}:${toString cfg.port} \
+        -log.level ${cfg.logLevel} \
+        ${optionalString (cfg.webExternalUrl != null) ''-web.external-url ${cfg.webExternalUrl} \''}
+        ${optionalString (cfg.logFormat != null) "-log.format ${cfg.logFormat}"}
+      '';
+
+      serviceConfig = {
+        User = cfg.user;
+        Group = cfg.group;
+        Restart  = "always";
+        PrivateTmp = true;
+        WorkingDirectory = "/tmp";
+        ExecReload = "${pkgs.coreutils}/bin/kill -HUP $MAINPID";
+      };
+    };
+  };
+}
diff --git a/nixos/modules/services/monitoring/prometheus/default.nix b/nixos/modules/services/monitoring/prometheus/default.nix
new file mode 100644
index 000000000000..e6817ee227ab
--- /dev/null
+++ b/nixos/modules/services/monitoring/prometheus/default.nix
@@ -0,0 +1,445 @@
+{ config, pkgs, lib, ... }:
+
+with lib;
+
+let
+  cfg = config.services.prometheus;
+  promUser = "prometheus";
+  promGroup = "prometheus";
+
+  # Get a submodule without any embedded metadata:
+  _filter = x: filterAttrs (k: v: k != "_module") x;
+
+  # Pretty-print JSON to a file
+  writePrettyJSON = name: x:
+    pkgs.runCommand name { } ''
+      echo '${builtins.toJSON x}' | ${pkgs.jq}/bin/jq . > $out
+    '';
+
+  # This becomes the main config file
+  promConfig = {
+    global = cfg.globalConfig;
+    rule_files = cfg.ruleFiles ++ [
+      (pkgs.writeText "prometheus.rules" (concatStringsSep "\n" cfg.rules))
+    ];
+    scrape_configs = cfg.scrapeConfigs;
+  };
+
+  cmdlineArgs = cfg.extraFlags ++ [
+    "-storage.local.path=${cfg.dataDir}/metrics"
+    "-config.file=${writePrettyJSON "prometheus.yml" promConfig}"
+    "-web.listen-address=${cfg.listenAddress}"
+    "-alertmanager.notification-queue-capacity=${toString cfg.alertmanagerNotificationQueueCapacity}"
+    "-alertmanager.timeout=${toString cfg.alertmanagerTimeout}s"
+    (optionalString (cfg.alertmanagerURL != []) "-alertmanager.url=${concatStringsSep "," cfg.alertmanagerURL}")
+  ];
+
+  promTypes.globalConfig = types.submodule {
+    options = {
+      scrape_interval = mkOption {
+        type = types.str;
+        default = "1m";
+        description = ''
+          How frequently to scrape targets by default.
+        '';
+      };
+
+      scrape_timeout = mkOption {
+        type = types.str;
+        default = "10s";
+        description = ''
+          How long until a scrape request times out.
+        '';
+      };
+
+      evaluation_interval = mkOption {
+        type = types.str;
+        default = "1m";
+        description = ''
+          How frequently to evaluate rules by default.
+        '';
+      };
+
+      labels = mkOption {
+        type = types.attrsOf types.str;
+        default = {};
+        description = ''
+          The labels to add to any timeseries that this Prometheus instance
+          scrapes.
+        '';
+      };
+    };
+  };
+
+  promTypes.scrape_config = types.submodule {
+    options = {
+      job_name = mkOption {
+        type = types.str;
+        description = ''
+          The job name assigned to scraped metrics by default.
+        '';
+      };
+      scrape_interval = mkOption {
+        type = types.nullOr types.str;
+        default = null;
+        description = ''
+          How frequently to scrape targets from this job. Defaults to the
+          globally configured default.
+        '';
+      };
+      scrape_timeout = mkOption {
+        type = types.nullOr types.str;
+        default = null;
+        description = ''
+          Per-target timeout when scraping this job. Defaults to the
+          globally configured default.
+        '';
+      };
+      metrics_path = mkOption {
+        type = types.str;
+        default = "/metrics";
+        description = ''
+          The HTTP resource path on which to fetch metrics from targets.
+        '';
+      };
+      scheme = mkOption {
+        type = types.enum ["http" "https"];
+        default = "http";
+        description = ''
+          The URL scheme with which to fetch metrics from targets.
+        '';
+      };
+      basic_auth = mkOption {
+        type = types.nullOr (types.submodule {
+          options = {
+            username = mkOption {
+              type = types.str;
+              description = ''
+                HTTP username
+              '';
+            };
+            password = mkOption {
+              type = types.str;
+              description = ''
+                HTTP password
+              '';
+            };
+          };
+        });
+        default = null;
+        description = ''
+          Optional http login credentials for metrics scraping.
+        '';
+      };
+      dns_sd_configs = mkOption {
+        type = types.listOf promTypes.dns_sd_config;
+        default = [];
+        apply = x: map _filter x;
+        description = ''
+          List of DNS service discovery configurations.
+        '';
+      };
+      consul_sd_configs = mkOption {
+        type = types.listOf promTypes.consul_sd_config;
+        default = [];
+        apply = x: map _filter x;
+        description = ''
+          List of Consul service discovery configurations.
+        '';
+      };
+      file_sd_configs = mkOption {
+        type = types.listOf promTypes.file_sd_config;
+        default = [];
+        apply = x: map _filter x;
+        description = ''
+          List of file service discovery configurations.
+        '';
+      };
+      static_configs = mkOption {
+        type = types.listOf promTypes.static_config;
+        default = [];
+        apply = x: map _filter x;
+        description = ''
+          List of labeled target groups for this job.
+        '';
+      };
+      relabel_configs = mkOption {
+        type = types.listOf promTypes.relabel_config;
+        default = [];
+        apply = x: map _filter x;
+        description = ''
+          List of relabel configurations.
+        '';
+      };
+    };
+  };
+
+  promTypes.static_config = types.submodule {
+    options = {
+      targets = mkOption {
+        type = types.listOf types.str;
+        description = ''
+          The targets specified by the target group.
+        '';
+      };
+      labels = mkOption {
+        type = types.attrsOf types.str;
+        description = ''
+          Labels assigned to all metrics scraped from the targets.
+        '';
+      };
+    };
+  };
+
+  promTypes.dns_sd_config = types.submodule {
+    options = {
+      names = mkOption {
+        type = types.listOf types.str;
+        description = ''
+          A list of DNS SRV record names to be queried.
+        '';
+      };
+      refresh_interval = mkOption {
+        type = types.str;
+        default = "30s";
+        description = ''
+          The time after which the provided names are refreshed.
+        '';
+      };
+    };
+  };
+
+  promTypes.consul_sd_config = types.submodule {
+    options = {
+      server = mkOption {
+        type = types.str;
+        description = "Consul server to query.";
+      };
+      token = mkOption {
+        type = types.nullOr types.str;
+        description = "Consul token";
+      };
+      datacenter = mkOption {
+        type = types.nullOr types.str;
+        description = "Consul datacenter";
+      };
+      scheme = mkOption {
+        type = types.nullOr types.str;
+        description = "Consul scheme";
+      };
+      username = mkOption {
+        type = types.nullOr types.str;
+        description = "Consul username";
+      };
+      password = mkOption {
+        type = types.nullOr types.str;
+        description = "Consul password";
+      };
+
+      services = mkOption {
+        type = types.listOf types.str;
+        description = ''
+          A list of services for which targets are retrieved.
+        '';
+      };
+      tag_separator = mkOption {
+        type = types.str;
+        default = ",";
+        description = ''
+          The string by which Consul tags are joined into the tag label.
+        '';
+      };
+    };
+  };
+
+  promTypes.file_sd_config = types.submodule {
+    options = {
+      files = mkOption {
+        type = types.listOf types.str;
+        description = ''
+          Patterns for files from which target groups are extracted. Refer
+          to the Prometheus documentation for permitted filename patterns
+          and formats.
+
+        '';
+      };
+      refresh_interval = mkOption {
+        type = types.str;
+        default = "30s";
+        description = ''
+          Refresh interval to re-read the files.
+        '';
+      };
+    };
+  };
+
+  promTypes.relabel_config = types.submodule {
+    options = {
+      source_labels = mkOption {
+        type = types.listOf types.str;
+        description = ''
+          The source labels select values from existing labels. Their content
+          is concatenated using the configured separator and matched against
+          the configured regular expression.
+        '';
+      };
+      separator = mkOption {
+        type = types.str;
+        default = ";";
+        description = ''
+          Separator placed between concatenated source label values.
+        '';
+      };
+      target_label = mkOption {
+        type = types.nullOr types.str;
+        default = null;
+        description = ''
+          Label to which the resulting value is written in a replace action.
+          It is mandatory for replace actions.
+        '';
+      };
+      regex = mkOption {
+        type = types.str;
+        description = ''
+          Regular expression against which the extracted value is matched.
+        '';
+      };
+      replacement = mkOption {
+        type = types.str;
+        default = "";
+        description = ''
+          Replacement value against which a regex replace is performed if the
+          regular expression matches.
+        '';
+      };
+      action = mkOption {
+        type = types.enum ["replace" "keep" "drop"];
+        description = ''
+          Action to perform based on regex matching.
+        '';
+      };
+    };
+  };
+
+in {
+  options = {
+    services.prometheus = {
+
+      enable = mkOption {
+        type = types.bool;
+        default = false;
+        description = ''
+          Enable the Prometheus monitoring daemon.
+        '';
+      };
+
+      listenAddress = mkOption {
+        type = types.str;
+        default = "0.0.0.0:9090";
+        description = ''
+          Address to listen on for the web interface, API, and telemetry.
+        '';
+      };
+
+      dataDir = mkOption {
+        type = types.path;
+        default = "/var/lib/prometheus";
+        description = ''
+          Directory to store Prometheus metrics data.
+        '';
+      };
+
+      extraFlags = mkOption {
+        type = types.listOf types.str;
+        default = [];
+        description = ''
+          Extra commandline options when launching Prometheus.
+        '';
+      };
+
+      globalConfig = mkOption {
+        type = promTypes.globalConfig;
+        default = {};
+        apply = _filter;
+        description = ''
+          Parameters that are valid in all  configuration contexts. They
+          also serve as defaults for other configuration sections
+        '';
+      };
+
+      rules = mkOption {
+        type = types.listOf types.str;
+        default = [];
+        description = ''
+          Alerting and/or Recording rules to evaluate at runtime.
+        '';
+      };
+
+      ruleFiles = mkOption {
+        type = types.listOf types.path;
+        default = [];
+        description = ''
+          Any additional rules files to include in this configuration.
+        '';
+      };
+
+      scrapeConfigs = mkOption {
+        type = types.listOf promTypes.scrape_config;
+        default = [];
+        apply = x: map _filter x;
+        description = ''
+          A list of scrape configurations.
+        '';
+      };
+
+      alertmanagerURL = mkOption {
+        type = types.listOf types.str;
+        default = [];
+        description = ''
+          List of Alertmanager URLs to send notifications to.
+        '';
+      };
+
+      alertmanagerNotificationQueueCapacity = mkOption {
+        type = types.int;
+        default = 10000;
+        description = ''
+          The capacity of the queue for pending alert manager notifications.
+        '';
+      };
+
+      alertmanagerTimeout = mkOption {
+        type = types.int;
+        default = 10;
+        description = ''
+          Alert manager HTTP API timeout (in seconds).
+        '';
+      };
+    };
+  };
+
+  config = mkIf cfg.enable {
+    users.extraGroups.${promGroup}.gid = config.ids.gids.prometheus;
+    users.extraUsers.${promUser} = {
+      description = "Prometheus daemon user";
+      uid = config.ids.uids.prometheus;
+      group = promGroup;
+      home = cfg.dataDir;
+      createHome = true;
+    };
+    systemd.services.prometheus = {
+      wantedBy = [ "multi-user.target" ];
+      after    = [ "network.target" ];
+      script = ''
+        #!/bin/sh
+        exec ${pkgs.prometheus}/bin/prometheus \
+          ${concatStringsSep " \\\n  " cmdlineArgs}
+      '';
+      serviceConfig = {
+        User = promUser;
+        Restart  = "always";
+        WorkingDirectory = cfg.dataDir;
+      };
+    };
+  };
+}
diff --git a/nixos/modules/services/monitoring/prometheus/node-exporter.nix b/nixos/modules/services/monitoring/prometheus/node-exporter.nix
new file mode 100644
index 000000000000..52dc14effc45
--- /dev/null
+++ b/nixos/modules/services/monitoring/prometheus/node-exporter.nix
@@ -0,0 +1,71 @@
+{ config, pkgs, lib, ... }:
+
+with lib;
+
+let
+  cfg = config.services.prometheus.nodeExporter;
+  cmdlineArgs = cfg.extraFlags ++ [
+    "-web.listen-address=${cfg.listenAddress}"
+  ];
+in {
+  options = {
+    services.prometheus.nodeExporter = {
+      enable = mkEnableOption "prometheus node exporter";
+
+      port = mkOption {
+        type = types.int;
+        default = 9100;
+        description = ''
+          Port to listen on.
+        '';
+      };
+
+      listenAddress = mkOption {
+        type = types.string;
+        default = "0.0.0.0";
+        description = ''
+          Address to listen on.
+        '';
+      };
+
+      enabledCollectors = mkOption {
+        type = types.listOf types.string;
+        default = [];
+        example = ''[ "systemd" ]'';
+        description = ''
+          Collectors to enable, additionally to the defaults.
+        '';
+      };
+
+      extraFlags = mkOption {
+        type = types.listOf types.str;
+        default = [];
+        description = ''
+          Extra commandline options when launching the node exporter.
+        '';
+      };
+    };
+  };
+
+  config = mkIf cfg.enable {
+    systemd.services.prometheus-node-exporter = {
+      description = "Prometheus exporter for machine metrics";
+      unitConfig.Documentation = "https://github.com/prometheus/node_exporter";
+      wantedBy = [ "multi-user.target" ];
+      script = ''
+        exec ${pkgs.prometheus-node-exporter}/bin/node_exporter \
+          ${optionalString (cfg.enabledCollectors != [])
+            ''-collectors.enabled ${concatStringsSep "," cfg.enabledCollectors}''} \
+          -web.listen-address ${cfg.listenAddress}:${toString cfg.port} \
+          ${concatStringsSep " \\\n  " cfg.extraFlags}
+      '';
+      serviceConfig = {
+        User = "nobody";
+        Restart  = "always";
+        PrivateTmp = true;
+        WorkingDirectory = /tmp;
+        ExecReload = "${pkgs.coreutils}/bin/kill -HUP $MAINPID";
+      };
+    };
+  };
+}
diff --git a/nixos/modules/services/monitoring/telegraf.nix b/nixos/modules/services/monitoring/telegraf.nix
new file mode 100644
index 000000000000..49dc9d8143e6
--- /dev/null
+++ b/nixos/modules/services/monitoring/telegraf.nix
@@ -0,0 +1,71 @@
+{ config, lib, pkgs, ... }:
+
+with lib;
+
+let
+  cfg = config.services.telegraf;
+
+  configFile = pkgs.runCommand "config.toml" {
+    buildInputs = [ pkgs.remarshal ];
+  } ''
+    remarshal -if json -of toml \
+      < ${pkgs.writeText "config.json" (builtins.toJSON cfg.extraConfig)} \
+      > $out
+  '';
+in {
+  ###### interface
+  options = {
+    services.telegraf = {
+      enable = mkEnableOption "telegraf server";
+
+      package = mkOption {
+        default = pkgs.telegraf;
+        defaultText = "pkgs.telegraf";
+        description = "Which telegraf derivation to use";
+        type = types.package;
+      };
+
+      extraConfig = mkOption {
+        default = {};
+        description = "Extra configuration options for telegraf";
+        type = types.attrs;
+        example = {
+          outputs = {
+            influxdb = {
+              urls = ["http://localhost:8086"];
+              database = "telegraf";
+            };
+          };
+          inputs = {
+            statsd = {
+              service_address = ":8125";
+              delete_timings = true;
+            };
+          };
+        };
+      };
+    };
+  };
+
+
+  ###### implementation
+  config = mkIf config.services.telegraf.enable {
+    systemd.services.telegraf = {
+      description = "Telegraf Agent";
+      wantedBy = [ "multi-user.target" ];
+      after = [ "network-online.target" ];
+      serviceConfig = {
+        ExecStart=''${cfg.package}/bin/telegraf -config "${configFile}"'';
+        ExecReload="${pkgs.coreutils}/bin/kill -HUP $MAINPID";
+        User = "telegraf";
+        Restart = "on-failure";
+      };
+    };
+
+    users.extraUsers = [{
+      name = "telegraf";
+      uid = config.ids.uids.telegraf;
+      description = "telegraf daemon user";
+    }];
+  };
+}
diff --git a/nixos/modules/services/monitoring/ups.nix b/nixos/modules/services/monitoring/ups.nix
index 5f80d547dbcb..febf0c95f5bd 100644
--- a/nixos/modules/services/monitoring/ups.nix
+++ b/nixos/modules/services/monitoring/ups.nix
@@ -182,7 +182,8 @@ in
 
     systemd.services.upsmon = {
       description = "Uninterruptible Power Supplies (Monitor)";
-      wantedBy = [ "ip-up.target" ];
+      after = [ "network.target" ];
+      wantedBy = [ "multi-user.target" ];
       serviceConfig.Type = "forking";
       script = "${pkgs.nut}/sbin/upsmon";
       environment.NUT_CONFPATH = "/etc/nut/";
@@ -191,8 +192,8 @@ in
 
     systemd.services.upsd = {
       description = "Uninterruptible Power Supplies (Daemon)";
+      after = [ "network.target" "upsmon.service" ];
       wantedBy = [ "multi-user.target" ];
-      after = [ "network-interfaces.target" "upsmon.service" ];
       serviceConfig.Type = "forking";
       # TODO: replace 'root' by another username.
       script = "${pkgs.nut}/sbin/upsd -u root";
@@ -202,8 +203,8 @@ in
 
     systemd.services.upsdrv = {
       description = "Uninterruptible Power Supplies (Register all UPS)";
-      wantedBy = [ "multi-user.target" ];
       after = [ "upsd.service" ];
+      wantedBy = [ "multi-user.target" ];
       # TODO: replace 'root' by another username.
       script = ''${pkgs.nut}/bin/upsdrvctl -u root start'';
       serviceConfig = {