about summary refs log tree commit diff
path: root/nixpkgs/pkgs/development/python-modules/tokenizers/default.nix
diff options
context:
space:
mode:
Diffstat (limited to 'nixpkgs/pkgs/development/python-modules/tokenizers/default.nix')
-rw-r--r--nixpkgs/pkgs/development/python-modules/tokenizers/default.nix129
1 files changed, 129 insertions, 0 deletions
diff --git a/nixpkgs/pkgs/development/python-modules/tokenizers/default.nix b/nixpkgs/pkgs/development/python-modules/tokenizers/default.nix
new file mode 100644
index 000000000000..d650f350bd28
--- /dev/null
+++ b/nixpkgs/pkgs/development/python-modules/tokenizers/default.nix
@@ -0,0 +1,129 @@
+{ stdenv
+, rustPlatform
+, fetchFromGitHub
+, fetchurl
+, maturin
+, pipInstallHook
+, pytest
+, python
+, requests
+}:
+
+let
+  robertaVocab = fetchurl {
+    url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json";
+    sha256 = "0m86wpkfb2gdh9x9i9ng2fvwk1rva4p0s98xw996nrjxs7166zwy";
+  };
+  robertaMerges = fetchurl {
+    url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt";
+    sha256 = "1idd4rvkpqqbks51i2vjbd928inw7slij9l4r063w3y5fd3ndq8w";
+  };
+  bertVocab = fetchurl {
+    url = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt";
+    sha256 = "18rq42cmqa8zanydsbzrb34xwy4l6cz1y900r4kls57cbhvyvv07";
+  };
+  openaiVocab = fetchurl {
+    url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json";
+    sha256 = "0y40gc9bixj5rxv674br1rxmxkd3ly29p80x1596h8yywwcrpx7x";
+  };
+  openaiMerges = fetchurl {
+    url = "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt";
+    sha256 = "09a754pm4djjglv3x5pkgwd6f79i2rq8ydg0f7c3q1wmwqdbba8f";
+  };
+in rustPlatform.buildRustPackage rec {
+  pname = "tokenizers";
+  version = "0.8.1";
+
+  src = fetchFromGitHub {
+    owner = "huggingface";
+    repo = pname;
+    rev = "python-v${version}";
+    sha256 = "0sxdwx05hr87j2z32rk4rgwn6a26w9r7m5fgj6ah1sgagiiyxbjw";
+  };
+
+  # Update parking_lot to be compatible with recent Rust versions, that
+  # replace asm! by llvm_asm!:
+  #
+  # https://github.com/Amanieu/parking_lot/pull/223
+  #
+  # Remove once upstream updates this dependency.
+  cargoPatches = [ ./update-parking-lot.diff ];
+
+  cargoSha256 = "0cdkxmj8z2wdspn6r62lqlpvd0sj1z0cmb1zpqaajxvr0b2kjlj8";
+
+  sourceRoot = "source/bindings/python";
+
+  nativeBuildInputs = [
+    maturin
+    pipInstallHook
+  ];
+
+  propagatedBuildInputs = [
+    python
+  ];
+
+  # tokenizers uses pyo3, which requires Rust nightly.
+  RUSTC_BOOTSTRAP = 1;
+
+  doCheck = false;
+  doInstallCheck = true;
+
+  postUnpack = ''
+    # Add data files for tests, otherwise tests attempt network access.
+    mkdir $sourceRoot/tests/data
+    ( cd $sourceRoot/tests/data
+      ln -s ${robertaVocab} roberta-base-vocab.json
+      ln -s ${robertaMerges} roberta-base-merges.txt
+      ln -s ${bertVocab} bert-base-uncased-vocab.txt
+      ln -s ${openaiVocab} openai-gpt-vocab.json
+      ln -s ${openaiMerges} openai-gpt-merges.txt )
+  '';
+
+  postPatch = ''
+    # pyo3's build check verifies that Rust is a nightly
+    # version. Disable this check.
+    substituteInPlace $NIX_BUILD_TOP/$cargoDepsCopy/pyo3/build.rs \
+      --replace "check_rustc_version()?;" ""
+
+    # Patching the vendored dependency invalidates the file
+    # checksums, so remove them. This should be safe, since
+    # this is just a copy of the vendored dependencies and
+    # the integrity of the vendored dependencies is validated
+    # by cargoSha256.
+    sed -r -i 's|"files":\{[^}]+\}|"files":{}|' \
+      $NIX_BUILD_TOP/$cargoDepsCopy/pyo3/.cargo-checksum.json
+
+    # Maturin uses the crate name as the wheel name.
+    substituteInPlace Cargo.toml \
+      --replace "tokenizers-python" "tokenizers"
+  '';
+
+  buildPhase = ''
+    maturin build --release --manylinux off
+  '';
+
+  installPhase = ''
+    # Put the wheels where the pip install hook can find them.
+    install -Dm644 -t dist target/wheels/*.whl
+    pipInstallPhase
+  '';
+
+  installCheckInputs = [
+    pytest
+    requests
+  ];
+
+  installCheckPhase = ''
+    # Append paths, or the binding's tokenizer module will be
+    # used, since the test directories have __init__.py
+    pytest --import-mode=append
+  '';
+
+  meta = with stdenv.lib; {
+    homepage = "https://github.com/huggingface/tokenizers";
+    description = "Fast State-of-the-Art Tokenizers optimized for Research and Production";
+    license = licenses.asl20;
+    platforms = platforms.unix;
+    maintainers = with maintainers; [ danieldk ];
+  };
+}