diff options
author | Bas van Dijk <v.dijk.bas@gmail.com> | 2019-04-29 10:30:25 +0200 |
---|---|---|
committer | Bas van Dijk <v.dijk.bas@gmail.com> | 2019-07-28 13:28:27 +0200 |
commit | ebc65a5f2111203fad0d68c52d907a789a55fb31 (patch) | |
tree | bb9cfc97a707f6533efaf43690d99ae598357318 | |
parent | 4a6e47126c4144056bc94bc8867e3ec095958c90 (diff) | |
download | nixlib-ebc65a5f2111203fad0d68c52d907a789a55fb31.tar nixlib-ebc65a5f2111203fad0d68c52d907a789a55fb31.tar.gz nixlib-ebc65a5f2111203fad0d68c52d907a789a55fb31.tar.bz2 nixlib-ebc65a5f2111203fad0d68c52d907a789a55fb31.tar.lz nixlib-ebc65a5f2111203fad0d68c52d907a789a55fb31.tar.xz nixlib-ebc65a5f2111203fad0d68c52d907a789a55fb31.tar.zst nixlib-ebc65a5f2111203fad0d68c52d907a789a55fb31.zip |
nixos/thanos: add module for the thanos service
-rw-r--r-- | nixos/modules/module-list.nix | 1 | ||||
-rw-r--r-- | nixos/modules/services/monitoring/thanos.nix | 756 | ||||
-rw-r--r-- | nixos/tests/prometheus-2.nix | 207 |
3 files changed, 946 insertions, 18 deletions
diff --git a/nixos/modules/module-list.nix b/nixos/modules/module-list.nix index 8c6bc47df251..3f23151a4e30 100644 --- a/nixos/modules/module-list.nix +++ b/nixos/modules/module-list.nix @@ -519,6 +519,7 @@ ./services/monitoring/systemhealth.nix ./services/monitoring/teamviewer.nix ./services/monitoring/telegraf.nix + ./services/monitoring/thanos.nix ./services/monitoring/ups.nix ./services/monitoring/uptime.nix ./services/monitoring/vnstat.nix diff --git a/nixos/modules/services/monitoring/thanos.nix b/nixos/modules/services/monitoring/thanos.nix new file mode 100644 index 000000000000..a34a1ecbfea7 --- /dev/null +++ b/nixos/modules/services/monitoring/thanos.nix @@ -0,0 +1,756 @@ +{ config, lib, pkgs, ... }: + +with lib; + +let + cfg = config.services.thanos; + + nullOpt = type : description : mkOption { + type = types.nullOr type; + default = null; + inherit description; + }; + + optionToArgs = opt : v : optional (v != null) ''--${opt}="${toString v}"''; + flagToArgs = opt : v : optional v ''--${opt}''; + listToArgs = opt : vs : map (v: ''--${opt}="${v}"'') vs; + attrsToArgs = opt : kvs : mapAttrsToList (k: v: ''--${opt}=${k}=\"${v}\"'') kvs; + + mkParamDef = type : default : description : mkParam type (description + '' + + Defaults to <literal>${toString default}</literal> in Thanos + when set to <literal>null</literal>. 
+ ''); + + mkParam = type : description : { + toArgs = optionToArgs; + option = nullOpt type description; + }; + + mkFlagParam = description : { + toArgs = flagToArgs; + option = mkOption { + type = types.bool; + default = false; + inherit description; + }; + }; + + mkListParam = opt : description : { + toArgs = _opt : listToArgs opt; + option = mkOption { + type = types.listOf types.str; + default = []; + inherit description; + }; + }; + + mkAttrsParam = opt : description : { + toArgs = _opt : attrsToArgs opt; + option = mkOption { + type = types.attrsOf types.str; + default = {}; + inherit description; + }; + }; + + mkStateDirParam = opt : default : description : { + toArgs = _opt : stateDir : optionToArgs opt "/var/lib/${stateDir}"; + option = mkOption { + type = types.str; + inherit default; + inherit description; + }; + }; + + toYAML = name : attrs : pkgs.runCommandNoCC name { + preferLocalBuild = true; + json = builtins.toFile "${name}.json" (builtins.toJSON attrs); + nativeBuildInputs = [ pkgs.remarshal ]; + } ''json2yaml -i $json -o $out''; + + thanos = cmd : "${cfg.package}/bin/thanos ${cmd}" + + (let args = concatLists (collect isList + (flip mapParamsRecursive params."${cmd}" (path : param : + let opt = concatStringsSep "." path; + v = getAttrFromPath path cfg."${cmd}"; + in param.toArgs opt v))); + in optionalString (length args != 0) (" \\\n " + + concatStringsSep " \\\n " args)); + + mapParamsRecursive = + let noParam = attr : !(attr ? "toArgs" && attr ? "option"); + in mapAttrsRecursiveCond noParam; + + paramsToOptions = mapParamsRecursive (_path : param : param.option); + + params = { + + log = { + + log.level = mkParamDef (types.enum ["debug" "info" "warn" "error" "fatal"]) "info" '' + Log filtering level. + ''; + + log.format = mkParam types.str '' + Log format to use. + ''; + }; + + gcloudtrace = { + + gcloudtrace.project = mkParam types.str '' + GCP project to send Google Cloud Trace tracings to. 
+ + If <literal>null</literal>, tracing will be disabled. + ''; + + gcloudtrace.sample-factor = mkParamDef types.int 1 '' + How often we send traces <literal>1/<sample-factor></literal>. + + If <literal>0</literal> no trace will be sent periodically, unless + forced by baggage item. + ''; + }; + + common = params.log // params.gcloudtrace // { + + http-address = mkParamDef types.str "0.0.0.0:10902" '' + Listen <literal>host:port</literal> for HTTP endpoints. + ''; + + grpc-address = mkParamDef types.str "0.0.0.0:10901" '' + Listen <literal>ip:port</literal> address for gRPC endpoints (StoreAPI). + + Make sure this address is routable from other components if you use gossip, + <option>grpc-advertise-address</option> is empty and you require cross-node connection. + ''; + + grpc-server-tls-cert = mkParam types.str '' + TLS Certificate for gRPC server, leave blank to disable TLS + ''; + + grpc-server-tls-key = mkParam types.str '' + TLS Key for the gRPC server, leave blank to disable TLS + ''; + + grpc-server-tls-client-ca = mkParam types.str '' + TLS CA to verify clients against. + + If no client CA is specified, there is no client verification on server side. + (tls.NoClientCert) + ''; + }; + + objstore = cfg : { + + objstore.config-file = { + toArgs = _opt : path : optionToArgs "objstore.config-file" path; + option = mkOption { + type = with types; nullOr str; + default = if cfg.objstore.config == null then null + else toString (toYAML "objstore.yaml" cfg.objstore.config); + defaultText = '' + if config.services.thanos.<cmd>.objstore.config == null then null + else toString (toYAML "objstore.yaml" config.services.thanos.<cmd>.objstore.config); + ''; + description = '' + Path to YAML file that contains object store configuration. + ''; + }; + }; + + objstore.config = + { + toArgs = _opt : _attrs : []; + option = nullOpt types.attrs '' + Object store configuration. 
+ + When not <literal>null</literal> the attribute set gets converted to + a YAML file and stored in the Nix store. The option + <option>objstore.config-file</option> will default to its path. + + If <option>objstore.config-file</option> is set this option has no effect. + ''; + }; + }; + + sidecar = params.common // params.objstore cfg.sidecar // { + + prometheus.url = mkParamDef types.str "http://localhost:9090" '' + URL at which to reach Prometheus's API. + + For better performance use local network. + ''; + + tsdb.path = { + toArgs = optionToArgs; + option = mkOption { + type = types.str; + default = "/var/lib/${config.services.prometheus2.stateDir}/data"; + defaultText = "/var/lib/\${config.services.prometheus2.stateDir}/data"; + description = '' + Data directory of TSDB. + ''; + }; + }; + + reloader.config-file = mkParam types.str '' + Config file watched by the reloader. + ''; + + reloader.config-envsubst-file = mkParam types.str '' + Output file for environment variable substituted config file. + ''; + + reloader.rule-dirs = mkListParam "reloader.rule-dir" '' + Rule directories for the reloader to refresh. + ''; + + }; + + store = params.common // params.objstore cfg.store // { + + stateDir = mkStateDirParam "data-dir" "thanos-store" '' + Data directory relative to <literal>/var/lib</literal> + in which to cache remote blocks. + ''; + + index-cache-size = mkParamDef types.str "250MB" '' + Maximum size of items held in the index cache. + ''; + + chunk-pool-size = mkParamDef types.str "2GB" '' + Maximum size of concurrently allocatable bytes for chunks. + ''; + + store.grpc.series-sample-limit = mkParamDef types.int 0 '' + Maximum amount of samples returned via a single Series call. + + <literal>0</literal> means no limit. + + NOTE: for efficiency we take 120 as the number of samples in chunk (it + cannot be bigger than that), so the actual number of samples might be + lower, even though the maximum could be hit. 
+ ''; + + store.grpc.series-max-concurrency = mkParamDef types.int 20 '' + Maximum number of concurrent Series calls. + ''; + + sync-block-duration = mkParamDef types.str "3m" '' + Repeat interval for syncing the blocks between local and remote view. + ''; + + block-sync-concurrency = mkParamDef types.int 20 '' + Number of goroutines to use when syncing blocks from object storage. + ''; + }; + + query = params.common // { + + http-advertise-address = mkParam types.str '' + Explicit (external) <literal>host:port</literal> address to advertise + for HTTP QueryAPI in gossip cluster. + + If <literal>null</literal>, the option <option>http-address</option> + will be used. + ''; + + grpc-client-tls-secure = mkFlagParam '' + Use TLS when talking to the gRPC server + ''; + + grpc-client-tls-cert = mkParam types.str '' + TLS Certificates to use to identify this client to the server + ''; + + grpc-client-tls-key = mkParam types.str '' + TLS Key for the client's certificate + ''; + + grpc-client-tls-ca = mkParam types.str '' + TLS CA Certificates to use to verify gRPC servers + ''; + + grpc-client-server-name = mkParam types.str '' + Server name to verify the hostname on the returned gRPC certificates. + See <link xlink:href="https://tools.ietf.org/html/rfc4366#section-3.1"/> + ''; + + web.route-prefix = mkParam types.str '' + Prefix for API and UI endpoints. + + This allows thanos UI to be served on a sub-path. This option is + analogous to <option>web.route-prefix</option> of Prometheus. + ''; + + web.external-prefix = mkParam types.str '' + Static prefix for all HTML links and redirect URLs in the UI query web + interface. + + Actual endpoints are still served on / or the + <option>web.route-prefix</option>. This allows thanos UI to be served + behind a reverse proxy that strips a URL sub-path. + ''; + + web.prefix-header = mkParam types.str '' + Name of HTTP request header used for dynamic prefixing of UI links and + redirects. 
+ + This option is ignored if the option + <literal>web.external-prefix</literal> is set. + + Security risk: enable this option only if a reverse proxy in front of + thanos is resetting the header. + + The setting <literal>web.prefix-header="X-Forwarded-Prefix"</literal> + can be useful, for example, if Thanos UI is served via Traefik reverse + proxy with <literal>PathPrefixStrip</literal> option enabled, which + sends the stripped prefix value in <literal>X-Forwarded-Prefix</literal> + header. This allows thanos UI to be served on a sub-path. + ''; + + query.timeout = mkParamDef types.str "2m" '' + Maximum time to process query by query node. + ''; + + query.max-concurrent = mkParamDef types.int 20 '' + Maximum number of queries processed concurrently by query node. + ''; + + query.replica-label = mkParam types.str '' + Label to treat as a replica indicator along which data is + deduplicated. + + Still you will be able to query without deduplication using + <literal>dedup=false</literal> parameter. + ''; + + selector-labels = mkAttrsParam "selector-label" '' + Query selector labels that will be exposed in info endpoint. + ''; + + store.addresses = mkListParam "store" '' + Addresses of statically configured store API servers. + + The scheme may be prefixed with <literal>dns+</literal> or + <literal>dnssrv+</literal> to detect store API servers through + respective DNS lookups. + ''; + + store.sd-files = mkListParam "store.sd-files" '' + Path to files that contain addresses of store API servers. The path + can be a glob pattern. + ''; + + store.sd-interval = mkParamDef types.str "5m" '' + Refresh interval to re-read file SD files. It is used as a resync fallback. + ''; + + store.sd-dns-interval = mkParamDef types.str "30s" '' + Interval between DNS resolutions. + ''; + + store.unhealthy-timeout = mkParamDef types.str "5m" '' + Timeout before an unhealthy store is cleaned from the store UI page. 
+ ''; + + query.auto-downsampling = mkFlagParam '' + Enable automatic adjustment (step / 5) to what source of data should + be used in store gateways if no + <literal>max_source_resolution</literal> param is specified. + ''; + + query.partial-response = mkFlagParam '' + Enable partial response for queries if no + <literal>partial_response</literal> param is specified. + ''; + + query.default-evaluation-interval = mkParamDef types.str "1m" '' + Set default evaluation interval for sub queries. + ''; + + store.response-timeout = mkParamDef types.str "0ms" '' + If a Store doesn't send any data in this specified duration then a + Store will be ignored and partial data will be returned if it's + enabled. <literal>0</literal> disables timeout. + ''; + }; + + rule = params.common // params.objstore cfg.rule // { + + labels = mkAttrsParam "label" '' + Labels to be applied to all generated metrics. + + Similar to external labels for Prometheus, + used to identify ruler and its blocks as unique source. + ''; + + stateDir = mkStateDirParam "data-dir" "thanos-rule" '' + Data directory relative to <literal>/var/lib</literal>. + ''; + + rule-files = mkListParam "rule-file" '' + Rule files that should be used by rule manager. Can be in glob format. + ''; + + eval-interval = mkParamDef types.str "30s" '' + The default evaluation interval to use. + ''; + + tsdb.block-duration = mkParamDef types.str "2h" '' + Block duration for TSDB block. + ''; + + tsdb.retention = mkParamDef types.str "48h" '' + Block retention time on local disk. + ''; + + alertmanagers.urls = mkListParam "alertmanagers.url" '' + Alertmanager replica URLs to push firing alerts. + + Ruler claims success if push to at least one alertmanager from + discovered succeeds. The scheme may be prefixed with + <literal>dns+</literal> or <literal>dnssrv+</literal> to detect + Alertmanager IPs through respective DNS lookups. The port defaults to + <literal>9093</literal> or the SRV record's value. 
The URL path is + used as a prefix for the regular Alertmanager API path. + ''; + + alertmanagers.send-timeout = mkParamDef types.str "10s" '' + Timeout for sending alerts to alertmanager. + ''; + + alert.query-url = mkParam types.str '' + The external Thanos Query URL that would be set in all alerts 'Source' field. + ''; + + alert.label-drop = mkListParam "alert.label-drop" '' + Labels by name to drop before sending to alertmanager. + + This allows alert to be deduplicated on replica label. + + Similar to Prometheus alert relabelling + ''; + + web.route-prefix = mkParam types.str '' + Prefix for API and UI endpoints. + + This allows thanos UI to be served on a sub-path. + + This option is analogous to <literal>--web.route-prefix</literal> of Prometheus. + ''; + + web.external-prefix = mkParam types.str '' + Static prefix for all HTML links and redirect URLs in the UI query web + interface. + + Actual endpoints are still served on / or the + <option>web.route-prefix</option>. This allows thanos UI to be served + behind a reverse proxy that strips a URL sub-path. + ''; + + web.prefix-header = mkParam types.str '' + Name of HTTP request header used for dynamic prefixing of UI links and + redirects. + + This option is ignored if the option + <option>web.external-prefix</option> is set. + + Security risk: enable this option only if a reverse proxy in front of + thanos is resetting the header. + + The header <literal>X-Forwarded-Prefix</literal> can be useful, for + example, if Thanos UI is served via Traefik reverse proxy with + <literal>PathPrefixStrip</literal> option enabled, which sends the + stripped prefix value in <literal>X-Forwarded-Prefix</literal> + header. This allows thanos UI to be served on a sub-path. + ''; + + query.addresses = mkListParam "query" '' + Addresses of statically configured query API servers. + + The scheme may be prefixed with <literal>dns+</literal> or + <literal>dnssrv+</literal> to detect query API servers through + respective DNS lookups. 
+ ''; + + query.sd-files = mkListParam "query.sd-files" '' + Path to files that contain addresses of query peers. + The path can be a glob pattern. + ''; + + query.sd-interval = mkParamDef types.str "5m" '' + Refresh interval to re-read file SD files. (used as a fallback) + ''; + + query.sd-dns-interval = mkParamDef types.str "30s" '' + Interval between DNS resolutions. + ''; + }; + + compact = params.log // params.gcloudtrace // params.objstore cfg.compact // { + + http-address = mkParamDef types.str "0.0.0.0:10902" '' + Listen <literal>host:port</literal> for HTTP endpoints. + ''; + + stateDir = mkStateDirParam "data-dir" "thanos-compact" '' + Data directory relative to <literal>/var/lib</literal> + in which to cache blocks and process compactions. + ''; + + consistency-delay = mkParamDef types.str "30m" '' + Minimum age of fresh (non-compacted) blocks before they are being + processed. Malformed blocks older than the maximum of consistency-delay + and 30m0s will be removed. + ''; + + retention.resolution-raw = mkParamDef types.str "0d" '' + How long to retain raw samples in bucket. + + <literal>0d</literal> - disables this retention + ''; + + retention.resolution-5m = mkParamDef types.str "0d" '' + How long to retain samples of resolution 1 (5 minutes) in bucket. + + <literal>0d</literal> - disables this retention + ''; + + retention.resolution-1h = mkParamDef types.str "0d" '' + How long to retain samples of resolution 2 (1 hour) in bucket. + + <literal>0d</literal> - disables this retention + ''; + + startAt = { + toArgs = _opt : startAt : flagToArgs "wait" (startAt == null); + option = nullOpt types.str '' + When this option is set to a <literal>systemd.time</literal> + specification the Thanos compactor will run at the specified period. + + When this option is <literal>null</literal> the Thanos compactor service + will run continuously. So it will not exit after all compactions have + been processed but wait for new work. 
+ ''; + }; + + block-sync-concurrency = mkParamDef types.int 20 '' + Number of goroutines to use when syncing block metadata from object storage. + ''; + + compact.concurrency = mkParamDef types.int 1 '' + Number of goroutines to use when compacting groups. + ''; + }; + + downsample = params.log // params.gcloudtrace // params.objstore cfg.downsample // { + + stateDir = mkStateDirParam "data-dir" "thanos-downsample" '' + Data directory relative to <literal>/var/lib</literal> + in which to cache blocks and process downsamplings. + ''; + + }; + + receive = params.common // params.objstore cfg.receive // { + + remote-write.address = mkParamDef types.str "0.0.0.0:19291" '' + Address to listen on for remote write requests. + ''; + + stateDir = mkStateDirParam "tsdb.path" "thanos-receive" '' + Data directory relative to <literal>/var/lib</literal> of TSDB. + ''; + + labels = mkAttrsParam "labels" '' + External labels to announce. + + This flag will be removed in the future when handling multiple tsdb + instances is added. + ''; + + tsdb.retention = mkParamDef types.str "15d" '' + How long to retain raw samples on local storage. + + <literal>0d</literal> - disables this retention + ''; + }; + + }; + + assertRelativeStateDir = cmd : { + assertions = [ + { + assertion = !hasPrefix "/" cfg."${cmd}".stateDir; + message = + "The option services.thanos.${cmd}.stateDir should not be an absolute directory." + + " It should be a directory relative to /var/lib."; + } + ]; + }; + +in { + + options.services.thanos = { + + package = mkOption { + type = types.package; + default = pkgs.thanos; + defaultText = "pkgs.thanos"; + description = '' + The thanos package that should be used. 
+ ''; + }; + + sidecar = paramsToOptions params.sidecar // { + enable = mkEnableOption + "the Thanos sidecar for Prometheus server"; + }; + + store = paramsToOptions params.store // { + enable = mkEnableOption + "the Thanos store node giving access to blocks in a bucket provider."; + }; + + query = paramsToOptions params.query // { + enable = mkEnableOption + ("the Thanos query node exposing PromQL enabled Query API " + + "with data retrieved from multiple store nodes"); + }; + + rule = paramsToOptions params.rule // { + enable = mkEnableOption + ("the Thanos ruler service which evaluates Prometheus rules against" + + " given Query nodes, exposing Store API and storing old blocks in bucket"); + }; + + compact = paramsToOptions params.compact // { + enable = mkEnableOption + "the Thanos compactor which continuously compacts blocks in an object store bucket"; + }; + + downsample = paramsToOptions params.downsample // { + enable = mkEnableOption + "the Thanos downsampler which continuously downsamples blocks in an object store bucket"; + }; + + receive = paramsToOptions params.receive // { + enable = mkEnableOption + ("the Thanos receiver which accept Prometheus remote write API requests " + + "and write to local tsdb (EXPERIMENTAL, this may change drastically without notice)"); + }; + }; + + config = mkMerge [ + + (mkIf cfg.sidecar.enable { + systemd.services.thanos-sidecar = { + wantedBy = [ "multi-user.target" ]; + after = [ "network.target" "prometheus2.service" ]; + serviceConfig = { + User = "prometheus"; + Restart = "always"; + ExecStart = thanos "sidecar"; + }; + }; + }) + + (mkIf cfg.store.enable (mkMerge [ + (assertRelativeStateDir "store") + { + systemd.services.thanos-store = { + wantedBy = [ "multi-user.target" ]; + after = [ "network.target" ]; + serviceConfig = { + DynamicUser = true; + StateDirectory = cfg.store.stateDir; + Restart = "always"; + ExecStart = thanos "store"; + }; + }; + } + ])) + + (mkIf cfg.query.enable { + systemd.services.thanos-query 
= { + wantedBy = [ "multi-user.target" ]; + after = [ "network.target" ]; + serviceConfig = { + DynamicUser = true; + Restart = "always"; + ExecStart = thanos "query"; + }; + }; + }) + + (mkIf cfg.rule.enable (mkMerge [ + (assertRelativeStateDir "rule") + { + systemd.services.thanos-rule = { + wantedBy = [ "multi-user.target" ]; + after = [ "network.target" ]; + serviceConfig = { + DynamicUser = true; + StateDirectory = cfg.rule.stateDir; + Restart = "always"; + ExecStart = thanos "rule"; + }; + }; + } + ])) + + (mkIf cfg.compact.enable (mkMerge [ + (assertRelativeStateDir "compact") + { + systemd.services.thanos-compact = + let wait = cfg.compact.startAt == null; in { + wantedBy = [ "multi-user.target" ]; + after = [ "network.target" ]; + serviceConfig = { + Type = if wait then "simple" else "oneshot"; + Restart = if wait then "always" else "no"; + DynamicUser = true; + StateDirectory = cfg.compact.stateDir; + ExecStart = thanos "compact"; + }; + } // optionalAttrs (!wait) { inherit (cfg.compact) startAt; }; + } + ])) + + (mkIf cfg.downsample.enable (mkMerge [ + (assertRelativeStateDir "downsample") + { + systemd.services.thanos-downsample = { + wantedBy = [ "multi-user.target" ]; + after = [ "network.target" ]; + serviceConfig = { + DynamicUser = true; + StateDirectory = cfg.downsample.stateDir; + Restart = "always"; + ExecStart = thanos "downsample"; + }; + }; + } + ])) + + (mkIf cfg.receive.enable (mkMerge [ + (assertRelativeStateDir "receive") + { + systemd.services.thanos-receive = { + wantedBy = [ "multi-user.target" ]; + after = [ "network.target" ]; + serviceConfig = { + DynamicUser = true; + StateDirectory = cfg.receive.stateDir; + Restart = "always"; + ExecStart = thanos "receive"; + }; + }; + } + ])) + + ]; +} diff --git a/nixos/tests/prometheus-2.nix b/nixos/tests/prometheus-2.nix index d7035d49ad4d..3e2c675b7c62 100644 --- a/nixos/tests/prometheus-2.nix +++ b/nixos/tests/prometheus-2.nix @@ -1,9 +1,44 @@ -import ./make-test.nix { +let + grpcPort = 
19090; + queryPort = 9090; + minioPort = 9000; + pushgwPort = 9091; + + s3 = { + accessKey = "BKIKJAA5BMMU2RHO6IBB"; + secretKey = "V7f1CwQqAcwo80UEIJEjc5gVQUSSx5ohQ9GSrr12"; + }; + + objstore.config = { + type = "S3"; + config = { + bucket = "thanos-bucket"; + endpoint = "s3:${toString minioPort}"; + region = "us-east-1"; + access_key = s3.accessKey; + secret_key = s3.secretKey; + insecure = true; + signature_version2 = false; + encrypt_sse = false; + put_user_metadata = {}; + http_config = { + idle_conn_timeout = "0s"; + insecure_skip_verify = false; + }; + trace = { + enable = false; + }; + }; + }; + +in import ./make-test.nix { name = "prometheus-2"; nodes = { - one = { pkgs, ... }: { + prometheus = { pkgs, ... }: { + virtualisation.diskSize = 2 * 1024; environment.systemPackages = [ pkgs.jq ]; + networking.firewall.allowedTCPPorts = [ grpcPort ]; services.prometheus2 = { enable = true; scrapeConfigs = [ @@ -11,7 +46,7 @@ import ./make-test.nix { job_name = "prometheus"; static_configs = [ { - targets = [ "127.0.0.1:9090" ]; + targets = [ "127.0.0.1:${toString queryPort}" ]; labels = { instance = "localhost"; }; } ]; @@ -21,7 +56,7 @@ import ./make-test.nix { scrape_interval = "1s"; static_configs = [ { - targets = [ "127.0.0.1:9091" ]; + targets = [ "127.0.0.1:${toString pushgwPort}" ]; } ]; } @@ -35,33 +70,169 @@ import ./make-test.nix { expr: count(up{job="prometheus"}) '' ]; + globalConfig = { + external_labels = { + some_label = "required by thanos"; + }; + }; + extraFlags = [ + # Required by thanos + "--storage.tsdb.min-block-duration=5s" + "--storage.tsdb.max-block-duration=5s" + ]; }; services.prometheus.pushgateway = { enable = true; + web.listen-address = ":${toString pushgwPort}"; persistMetrics = true; persistence.interval = "1s"; stateDir = "prometheus-pushgateway"; }; + services.thanos = { + sidecar = { + enable = true; + grpc-address = "0.0.0.0:${toString grpcPort}"; + inherit objstore; + }; + + # TODO: Add some tests for these services: + #rule 
= { + # enable = true; + # http-address = "0.0.0.0:19194"; + # grpc-address = "0.0.0.0:19193"; + # query.addresses = [ + # "localhost:19191" + # ]; + # labels = { + # just = "some"; + # nice = "labels"; + # }; + #}; + # + #receive = { + # http-address = "0.0.0.0:19195"; + # enable = true; + # labels = { + # just = "some"; + # nice = "labels"; + # }; + #}; + }; + }; + + query = { pkgs, ... }: { + environment.systemPackages = [ pkgs.jq ]; + services.thanos.query = { + enable = true; + http-address = "0.0.0.0:${toString queryPort}"; + store.addresses = [ + "prometheus:${toString grpcPort}" + ]; + }; + }; + + store = { pkgs, ... }: { + environment.systemPackages = with pkgs; [ jq thanos ]; + services.thanos.store = { + enable = true; + http-address = "0.0.0.0:10902"; + grpc-address = "0.0.0.0:${toString grpcPort}"; + inherit objstore; + sync-block-duration = "1s"; + }; + services.thanos.compact = { + enable = true; + http-address = "0.0.0.0:10903"; + inherit objstore; + consistency-delay = "5s"; + }; + services.thanos.query = { + enable = true; + http-address = "0.0.0.0:${toString queryPort}"; + store.addresses = [ + "localhost:${toString grpcPort}" + ]; + }; + }; + + s3 = { pkgs, ... } : { + # Minio requires at least 1GiB of free disk space to run. + virtualisation.diskSize = 2 * 1024; + networking.firewall.allowedTCPPorts = [ minioPort ]; + + services.minio = { + enable = true; + inherit (s3) accessKey secretKey; + }; + + environment.systemPackages = [ pkgs.minio-client ]; }; }; - testScript = '' - startAll; - $one->waitForUnit("prometheus2.service"); - $one->waitForOpenPort(9090); - $one->succeed("curl -s http://127.0.0.1:9090/metrics"); + testScript = { nodes, ... } : '' + # Before starting the other machines we first make sure that our S3 service is online + # and has a bucket added for thanos: + $s3->start; + $s3->waitForUnit("minio.service"); + $s3->waitForOpenPort(${toString minioPort}); + $s3->succeed( + "mc config host add minio " . 
+ "http://localhost:${toString minioPort} ${s3.accessKey} ${s3.secretKey} S3v4"); + $s3->succeed("mc mb minio/thanos-bucket"); + + # Now that s3 has started we can start the other machines: + $prometheus->start; + $query->start; + $store->start; - # Let's test if pushing a metric to the pushgateway succeeds - # and whether that metric gets ingested by prometheus. - $one->waitForUnit("pushgateway.service"); - $one->succeed( + # Check if prometheus responds to requests: + $prometheus->waitForUnit("prometheus2.service"); + $prometheus->waitForOpenPort(${toString queryPort}); + $prometheus->succeed("curl -s http://127.0.0.1:${toString queryPort}/metrics"); + + # Let's test if pushing a metric to the pushgateway succeeds: + $prometheus->waitForUnit("pushgateway.service"); + $prometheus->succeed( "echo 'some_metric 3.14' | " . - "curl --data-binary \@- http://127.0.0.1:9091/metrics/job/some_job"); - $one->waitUntilSucceeds( - "curl -sf 'http://127.0.0.1:9090/api/v1/query?query=some_metric' " . - "| jq '.data.result[0].value[1]' | grep '\"3.14\"'"); + "curl --data-binary \@- http://127.0.0.1:${toString pushgwPort}/metrics/job/some_job"); + + # Now check whether that metric gets ingested by prometheus. + # Since we'll check for the metric several times on different machines + # we abstract the test using the following function: + + # Function to check if the metric "some_metric" has been received and returns the correct value. + local *Machine::waitForMetric = sub { + my ($self) = @_; + $self->waitUntilSucceeds( + "curl -sf 'http://127.0.0.1:${toString queryPort}/api/v1/query?query=some_metric' " . + "| jq '.data.result[0].value[1]' | grep '\"3.14\"'"); + }; + + $prometheus->waitForMetric; # Let's test if the pushgateway persists metrics to the configured location. 
- $one->waitUntilSucceeds("test -e /var/lib/prometheus-pushgateway/metrics"); + $prometheus->waitUntilSucceeds("test -e /var/lib/prometheus-pushgateway/metrics"); + + # Test thanos + $prometheus->waitForUnit("thanos-sidecar.service"); + + # Test if the Thanos query service can correctly retrieve the metric that was sent above. + $query->waitForUnit("thanos-query.service"); + $query->waitForMetric; + + # Test if the Thanos sidecar has correctly uploaded its TSDB to S3, if the + # Thanos storage service has correctly downloaded it from S3 and if the Thanos + # query service running on $store can correctly retrieve the metric: + $store->waitForUnit("thanos-store.service"); + $store->waitForMetric; + + $store->waitForUnit("thanos-compact.service"); + + # Test if the Thanos bucket command is able to retrieve blocks from the S3 bucket + # and check if the blocks have the correct labels: + $store->succeed( + "thanos bucket ls" . + " --objstore.config-file=${nodes.store.config.services.thanos.store.objstore.config-file}" . + " --output=json | jq .thanos.labels.some_label | grep 'required by thanos'"); ''; } |