about summary refs log tree commit diff
path: root/nixpkgs/pkgs/development/python-modules/trafilatura/default.nix
diff options
context:
space:
mode:
Diffstat (limited to 'nixpkgs/pkgs/development/python-modules/trafilatura/default.nix')
-rw-r--r--nixpkgs/pkgs/development/python-modules/trafilatura/default.nix67
1 files changed, 67 insertions, 0 deletions
diff --git a/nixpkgs/pkgs/development/python-modules/trafilatura/default.nix b/nixpkgs/pkgs/development/python-modules/trafilatura/default.nix
new file mode 100644
index 000000000000..5ebb2280f31b
--- /dev/null
+++ b/nixpkgs/pkgs/development/python-modules/trafilatura/default.nix
@@ -0,0 +1,67 @@
+{ lib
+, buildPythonPackage
+, fetchPypi
+, pytestCheckHook
+, pythonOlder
+, certifi
+, charset-normalizer
+, courlan
+, htmldate
+, justext
+, lxml
+, urllib3
+}:
+
+buildPythonPackage rec {
+  pname = "trafilatura";
+  version = "1.7.0";
+  format = "setuptools";
+
+  disabled = pythonOlder "3.6";
+
+  src = fetchPypi {
+    inherit pname version;
+    hash = "sha256-oWbmfwBaahLvGU9Ix8n6ThsONnVv3Stk4CRzw1aWLwQ=";
+  };
+
+  propagatedBuildInputs = [
+    certifi
+    charset-normalizer
+    courlan
+    htmldate
+    justext
+    lxml
+    urllib3
+  ];
+
+  nativeCheckInputs = [ pytestCheckHook ];
+
+  # disable tests that require an internet connection
+  disabledTests = [
+    "test_download"
+    "test_fetch"
+    "test_redirection"
+    "test_meta_redirections"
+    "test_crawl_page"
+    "test_whole"
+    "test_probing"
+    "test_cli_pipeline"
+  ];
+
+  # patch out gui cli because it is not supported in this packaging
+  # nixify path to the trafilatura binary in the test suite
+  postPatch = ''
+    substituteInPlace setup.py --replace '"trafilatura_gui=trafilatura.gui:main",' ""
+    substituteInPlace tests/cli_tests.py --replace "trafilatura_bin = 'trafilatura'" "trafilatura_bin = '$out/bin/trafilatura'"
+  '';
+
+  pythonImportsCheck = [ "trafilatura" ];
+
+  meta = with lib; {
+    description = "Python package and command-line tool designed to gather text on the Web";
+    homepage = "https://trafilatura.readthedocs.io";
+    changelog = "https://github.com/adbar/trafilatura/blob/v${version}/HISTORY.md";
+    license = licenses.gpl3Plus;
+    maintainers = with maintainers; [ jokatzke ];
+  };
+}