1 files changed, 111 insertions, 0 deletions
diff --git a/nixpkgs/pkgs/development/python-modules/paddleocr/default.nix b/nixpkgs/pkgs/development/python-modules/paddleocr/default.nix
new file mode 100644
index 000000000000..9bd81077abfd
--- /dev/null
+++ b/nixpkgs/pkgs/development/python-modules/paddleocr/default.nix
@@ -0,0 +1,111 @@
+{ lib
+, buildPythonPackage
+, pythonRelaxDepsHook
+, fetchFromGitHub
+, attrdict
+, beautifulsoup4
+, cython
+, fire
+, fonttools
+, lmdb
+, lxml
+, numpy
+, opencv4
+, openpyxl
+, pdf2docx
+, pillow
+, premailer
+, pyclipper
+, pymupdf
+, python-docx
+, rapidfuzz
+, scikit-image
+, shapely
+, tqdm
+, paddlepaddle
+, lanms-neo
+, polygon3
+}:
+
+let
+  version = "2.7.0.1";
+in
+buildPythonPackage {
+  pname = "paddleocr";
+  inherit version;
+  format = "setuptools";
+
+  src = fetchFromGitHub {
+    owner = "PaddlePaddle";
+    repo = "PaddleOCR";
+    rev = "254786752a2659e184822b4b2de5637a05236590";
+    hash = "sha256-M/Fpk9swX9Gds7o5poM9Iv6LOhKoZNbe0Wv9JNMPOU0=";
+  };
+
+  patches = [
+    # The `ppocr.data.imaug` module re-exports the `IaaAugment` and `CopyPaste`
+    # classes. These classes depend on the `imgaug` package, which is
+    # unmaintained and has been removed from nixpkgs.
+    #
+    # The image OCR feature of PaddleOCR doesn't use these classes though, so
+    # it still works after stripping the `IaaAugment` and `CopyPaste`
+    # exports. Doing so probably breaks some of the OCR model creation tooling
+    # that PaddleOCR provides, however.
+    ./remove-import-imaug.patch
+  ];
+
+  nativeBuildInputs = [ pythonRelaxDepsHook ];
+  # Relax all version bounds: trying to relax only pymupdf makes the whole build fail.
+  pythonRelaxDeps = true;
+  pythonRemoveDeps = [
+    "imgaug"
+    "visualdl"
+    "opencv-python"
+    "opencv-contrib-python"
+  ];
+
+  propagatedBuildInputs = [
+    attrdict
+    beautifulsoup4
+    cython
+    fire
+    fonttools
+    lmdb
+    lxml
+    numpy
+    opencv4
+    openpyxl
+    pdf2docx
+    pillow
+    premailer
+    pyclipper
+    pymupdf
+    python-docx
+    rapidfuzz
+    scikit-image
+    shapely
+    tqdm
+    paddlepaddle
+    lanms-neo
+    polygon3
+  ];
+
+  # TODO: The tests depend, among possibly other things, on `cudatoolkit`,
+  # but `cudatoolkit` fails to install, so the check phase is disabled below.
+  # preCheck = "export HOME=$TMPDIR";
+  # nativeCheckInputs = with pkgs; [ which cudatoolkit ];
+  doCheck = false;
+
+  meta = with lib; {
+    homepage = "https://github.com/PaddlePaddle/PaddleOCR";
+    license = licenses.asl20;
+    description = "Multilingual OCR toolkits based on PaddlePaddle";
+    longDescription = ''
+      PaddleOCR aims to create multilingual, awesome, leading, and practical OCR
+      tools that help users train better models and apply them into practice.
+    '';
+    changelog = "https://github.com/PaddlePaddle/PaddleOCR/releases/tag/v${version}";
+    maintainers = with maintainers; [ happysalada ];
+    platforms = [ "x86_64-linux" "x86_64-darwin" "aarch64-darwin" ];
+  };
+}