about summary refs log tree commit diff
diff options
context:
space:
mode:
authorAtemu <atemu.main@gmail.com>2024-02-01 16:59:28 +0100
committerGitHub <noreply@github.com>2024-02-01 16:59:28 +0100
commitf38bca1ca559974b36863d0ff4bb710b7bc12890 (patch)
tree8950996221b92d6b810467d23b8d44432cb21210
parent2f61aff9ae76490380452d78a722ace1e71c5976 (diff)
parent6300f478e9d9a9a0f0d66004e50e67a107f9cea3 (diff)
downloadnixlib-f38bca1ca559974b36863d0ff4bb710b7bc12890.tar
nixlib-f38bca1ca559974b36863d0ff4bb710b7bc12890.tar.gz
nixlib-f38bca1ca559974b36863d0ff4bb710b7bc12890.tar.bz2
nixlib-f38bca1ca559974b36863d0ff4bb710b7bc12890.tar.lz
nixlib-f38bca1ca559974b36863d0ff4bb710b7bc12890.tar.xz
nixlib-f38bca1ca559974b36863d0ff4bb710b7bc12890.tar.zst
nixlib-f38bca1ca559974b36863d0ff4bb710b7bc12890.zip
Merge pull request #279677 from leona-ya/paperless-nltk
nixos/paperless: use nltk_data package as NLTK data source 
-rw-r--r--nixos/doc/manual/release-notes/rl-2405.section.md2
-rw-r--r--nixos/modules/services/misc/paperless.nix24
-rw-r--r--pkgs/applications/office/paperless-ngx/default.nix2
-rw-r--r--pkgs/tools/text/nltk_data/default.nix5
4 files changed, 14 insertions, 19 deletions
diff --git a/nixos/doc/manual/release-notes/rl-2405.section.md b/nixos/doc/manual/release-notes/rl-2405.section.md
index 1566417bd41c..e26702f7a61d 100644
--- a/nixos/doc/manual/release-notes/rl-2405.section.md
+++ b/nixos/doc/manual/release-notes/rl-2405.section.md
@@ -298,6 +298,8 @@ The pre-existing [services.ankisyncd](#opt-services.ankisyncd.enable) has been m
   - Custom themes and other assets that were previously stored in `custom/public/*` now belong in `custom/public/assets/*`
   - New instances of Gitea using MySQL now ignore the `[database].CHARSET` config option and always use the `utf8mb4` charset, existing instances should migrate via the `gitea doctor convert` CLI command.
 
+- The `services.paperless` module no longer uses the previously downloaded NLTK data stored in `/var/cache/paperless/nltk`. This directory can be removed.
+
 - The `hardware.pulseaudio` module now sets permission of pulse user home directory to 755 when running in "systemWide" mode. It fixes [issue 114399](https://github.com/NixOS/nixpkgs/issues/114399).
 
 - The `btrbk` module now automatically selects and provides required compression
diff --git a/nixos/modules/services/misc/paperless.nix b/nixos/modules/services/misc/paperless.nix
index 9780a4d72257..1256d8315c8b 100644
--- a/nixos/modules/services/misc/paperless.nix
+++ b/nixos/modules/services/misc/paperless.nix
@@ -6,7 +6,6 @@ let
   pkg = cfg.package;
 
   defaultUser = "paperless";
-  nltkDir = "/var/cache/paperless/nltk";
   defaultFont = "${pkgs.liberation_ttf}/share/fonts/truetype/LiberationSerif-Regular.ttf";
 
   # Don't start a redis instance if the user sets a custom redis connection
@@ -17,13 +16,17 @@ let
     PAPERLESS_DATA_DIR = cfg.dataDir;
     PAPERLESS_MEDIA_ROOT = cfg.mediaDir;
     PAPERLESS_CONSUMPTION_DIR = cfg.consumptionDir;
-    PAPERLESS_NLTK_DIR = nltkDir;
     PAPERLESS_THUMBNAIL_FONT_NAME = defaultFont;
     GUNICORN_CMD_ARGS = "--bind=${cfg.address}:${toString cfg.port}";
   } // optionalAttrs (config.time.timeZone != null) {
     PAPERLESS_TIME_ZONE = config.time.timeZone;
   } // optionalAttrs enableRedis {
     PAPERLESS_REDIS = "unix://${redisServer.unixSocket}";
+  } // optionalAttrs (cfg.settings.PAPERLESS_ENABLE_NLTK or true) {
+    PAPERLESS_NLTK_DIR = pkgs.symlinkJoin {
+      name = "paperless_ngx_nltk_data";
+      paths = pkg.nltkData;
+    };
   } // (lib.mapAttrs (_: s:
     if (lib.isAttrs s || lib.isList s) then builtins.toJSON s
     else if lib.isBool s then lib.boolToString s
@@ -292,23 +295,6 @@ in
       };
     };
 
-    # Download NLTK corpus data
-    systemd.services.paperless-download-nltk-data = {
-      wantedBy = [ "paperless-scheduler.service" ];
-      before = [ "paperless-scheduler.service" ];
-      after = [ "network-online.target" ];
-      wants = [ "network-online.target" ];
-      serviceConfig = defaultServiceConfig // {
-        User = cfg.user;
-        Type = "oneshot";
-        # Enable internet access
-        PrivateNetwork = false;
-        ExecStart = let pythonWithNltk = pkg.python.withPackages (ps: [ ps.nltk ]); in ''
-          ${pythonWithNltk}/bin/python -m nltk.downloader -d '${nltkDir}' punkt snowball_data stopwords
-        '';
-      };
-    };
-
     systemd.services.paperless-consumer = {
       description = "Paperless document consumer";
       # Bind to `paperless-scheduler` so that the consumer never runs
diff --git a/pkgs/applications/office/paperless-ngx/default.nix b/pkgs/applications/office/paperless-ngx/default.nix
index afe241222068..94b4ac77b68a 100644
--- a/pkgs/applications/office/paperless-ngx/default.nix
+++ b/pkgs/applications/office/paperless-ngx/default.nix
@@ -18,6 +18,7 @@
 , xcbuild
 , pango
 , pkg-config
+, nltk-data
 }:
 
 let
@@ -293,6 +294,7 @@ python.pkgs.buildPythonApplication rec {
 
   passthru = {
     inherit python path frontend;
+    nltkData = with nltk-data; [ punkt snowball_data stopwords ];
     tests = { inherit (nixosTests) paperless; };
   };
 
diff --git a/pkgs/tools/text/nltk_data/default.nix b/pkgs/tools/text/nltk_data/default.nix
index 118598083d5d..6445dd8c06e2 100644
--- a/pkgs/tools/text/nltk_data/default.nix
+++ b/pkgs/tools/text/nltk_data/default.nix
@@ -48,6 +48,11 @@ lib.makeScope newScope (self: {
     location = "taggers";
     hash = "sha256-ilTs4HWPUoHxQb4kWEy3wJ6QsE/98+EQya44gtV2inw=";
   });
+  snowball_data = makeNltkDataPackage ({
+    pname = "snowball_data";
+    location = "stemmers";
+    hash = "sha256-Y6LERPtaRbCtWmJCvMAd2xH02xdrevZBFNYvP9N4+3s=";
+  });
   stopwords = makeNltkDataPackage ({
     pname = "stopwords";
     location = "corpora";