22 files changed, 528 insertions, 0 deletions
diff --git a/nixpkgs/pkgs/development/libraries/languagemachines/frog.nix b/nixpkgs/pkgs/development/libraries/languagemachines/frog.nix
new file mode 100644
index 000000000000..50167f28a9db
--- /dev/null
+++ b/nixpkgs/pkgs/development/libraries/languagemachines/frog.nix
@@ -0,0 +1,54 @@
+# Frog, built from the GitHub release pinned in release-info/ (see ./update).
+{ lib, stdenv, fetchurl
+, automake, autoconf, bzip2, libtar, libtool, pkg-config, autoconf-archive
+, libxml2, icu, languageMachines
+}:
+
+let
+  release = lib.importJSON ./release-info/LanguageMachines-frog.json;
+in
+
+stdenv.mkDerivation {
+  pname = "frog";
+  version = release.version;
+  # release.version already carries the "v" prefix; same name ./update prefetches.
+  src = fetchurl { inherit (release) url sha256;
+                   name = "frog-${release.version}.tar.gz"; };
+  nativeBuildInputs = [ pkg-config automake autoconf ];
+  buildInputs = [ bzip2 libtar libtool autoconf-archive
+                  libxml2 icu
+                  languageMachines.ticcutils
+                  languageMachines.timbl
+                  languageMachines.mbt
+                  languageMachines.libfolia
+                  languageMachines.ucto
+                  languageMachines.frogdata
+                ];
+
+  preConfigure = ''
+    sh bootstrap.sh
+  '';
+  postInstall = ''
+    # frog expects the data files installed in the same prefix
+    mkdir -p $out/share/frog/;
+    for f in ${languageMachines.frogdata}/share/frog/*; do
+      ln -s $f $out/share/frog/;
+    done;
+
+    make check
+  '';
+
+  meta = with lib; {
+    description = "A Tagger-Lemmatizer-Morphological-Analyzer-Dependency-Parser for Dutch";
+    homepage    = "https://languagemachines.github.io/frog";
+    license     = licenses.gpl3;
+    platforms   = platforms.all;
+    maintainers = with maintainers; [ roberth ];
+
+    longDescription = ''
+      Frog is an integration of memory-based natural language processing (NLP) modules developed for Dutch. All NLP modules are based on Timbl, the Tilburg memory-based learning software package. Most modules were created in the 1990s at the ILK Research Group (Tilburg University, the Netherlands) and the CLiPS Research Centre (University of Antwerp, Belgium). Over the years they have been integrated into a single text processing tool, which is currently maintained and developed by the Language Machines Research Group and the Centre for Language and Speech Technology at Radboud University Nijmegen. A dependency parser, a base phrase chunker, and a named-entity recognizer module were added more recently. Where possible, Frog makes use of multi-processor support to run subtasks in parallel.
+
+      Various (re)programming rounds have been made possible through funding by NWO, the Netherlands Organisation for Scientific Research, particularly under the CGN project, the IMIX programme, the Implicit Linguistics project, the CLARIN-NL programme and the CLARIAH programme.
+    '';
+  };
+}
diff --git a/nixpkgs/pkgs/development/libraries/languagemachines/frogdata.nix b/nixpkgs/pkgs/development/libraries/languagemachines/frogdata.nix
new file mode 100644
index 000000000000..5b1b07e79277
--- /dev/null
+++ b/nixpkgs/pkgs/development/libraries/languagemachines/frogdata.nix
@@ -0,0 +1,30 @@
+# Data files for Frog, built from the release pinned in release-info/.
+{ lib, stdenv, fetchurl, pkg-config
+, automake, autoconf, autoconf-archive, libtool }:
+
+let
+  pin = lib.importJSON ./release-info/LanguageMachines-frogdata.json;
+in stdenv.mkDerivation {
+  pname = "frogdata";
+  inherit (pin) version;
+  src = fetchurl {
+    inherit (pin) url sha256;
+    name = "frogdata-${pin.version}.tar.gz";
+  };
+
+  nativeBuildInputs = [ pkg-config automake autoconf ];
+  buildInputs = [ libtool autoconf-archive ];
+
+  # Run upstream's bootstrap.sh right before the configure phase.
+  preConfigure = ''
+    sh bootstrap.sh
+  '';
+
+  meta = {
+    description = "Data for Frog, a Tagger-Lemmatizer-Morphological-Analyzer-Dependency-Parser for Dutch";
+    homepage = "https://languagemachines.github.io/frog";
+    license = lib.licenses.gpl3;
+    platforms = lib.platforms.all;
+    maintainers = [ lib.maintainers.roberth ];
+  };
+}
diff --git a/nixpkgs/pkgs/development/libraries/languagemachines/libfolia.nix b/nixpkgs/pkgs/development/libraries/languagemachines/libfolia.nix
new file mode 100644
index 000000000000..ec7129396540
--- /dev/null
+++ b/nixpkgs/pkgs/development/libraries/languagemachines/libfolia.nix
@@ -0,0 +1,35 @@
+# libfolia, built from the release pinned in release-info/.
+{ lib, stdenv, fetchurl
+, automake, autoconf, libtool, pkg-config, autoconf-archive
+, libxml2, icu, bzip2, libtar, languageMachines }:
+
+let
+  release = lib.importJSON ./release-info/LanguageMachines-libfolia.json;
+in
+
+stdenv.mkDerivation {
+  pname = "libfolia";
+  version = release.version;
+  src = fetchurl { inherit (release) url sha256;
+                   name = "libfolia-${release.version}.tar.gz"; };
+  nativeBuildInputs = [ pkg-config automake autoconf ];
+  buildInputs = [ bzip2 libtool autoconf-archive libtar libxml2 icu languageMachines.ticcutils ];
+  preConfigure = "sh bootstrap.sh";
+
+  # compat with icu61+ https://github.com/unicode-org/icu/blob/release-64-2/icu4c/readme.html#L554
+  CXXFLAGS = [ "-DU_USING_ICU_NAMESPACE=1" ];
+
+  meta = with lib; {
+    description = "C++ API for FoLiA documents, an XML-based linguistic annotation format";
+    mainProgram = "folialint";
+    homepage    = "https://proycon.github.io/folia/";
+    license     = licenses.gpl3;
+    platforms   = platforms.all;
+    maintainers = with maintainers; [ roberth ];
+
+    longDescription = ''
+      A high-level C++ API to read, manipulate, and create FoLiA documents. FoLiA is an XML-based annotation format, suitable for the representation of linguistically annotated language resources. FoLiA’s intended use is as a format for storing and/or exchanging language resources, including corpora.
+    '';
+  };
+
+}
diff --git a/nixpkgs/pkgs/development/libraries/languagemachines/mbt-add-libxml2-dep.patch b/nixpkgs/pkgs/development/libraries/languagemachines/mbt-add-libxml2-dep.patch
new file mode 100644
index 000000000000..9037f1093bb9
--- /dev/null
+++ b/nixpkgs/pkgs/development/libraries/languagemachines/mbt-add-libxml2-dep.patch
@@ -0,0 +1,13 @@
+--- a/configure.ac	2017-06-12 06:48:15.000000000 +0200
++++ b/configure.ac	2017-06-12 06:50:06.000000000 +0200
+@@ -76,6 +76,10 @@
+ CXXFLAGS="$CXXFLAGS $ticcutils_CFLAGS"
+ LIBS="$LIBS $ticcutils_LIBS"
+ 
++PKG_CHECK_MODULES([libxml2], [libxml-2.0 >= 2.6.16] )
++CXXFLAGS="$CXXFLAGS $libxml2_CFLAGS"
++LIBS="$LIBS $libxml2_LIBS"
++
+ AC_CONFIG_FILES([
+   Makefile
+   mbt.pc
diff --git a/nixpkgs/pkgs/development/libraries/languagemachines/mbt.nix b/nixpkgs/pkgs/development/libraries/languagemachines/mbt.nix
new file mode 100644
index 000000000000..9556c1d56701
--- /dev/null
+++ b/nixpkgs/pkgs/development/libraries/languagemachines/mbt.nix
@@ -0,0 +1,41 @@
+# MBT, built from the release pinned in release-info/.
+{ lib, stdenv, fetchurl, pkg-config
+, automake, autoconf, autoconf-archive, libtool
+, bzip2, libtar, libxml2
+, languageMachines
+}:
+
+let
+  pin = lib.importJSON ./release-info/LanguageMachines-mbt.json;
+  inherit (languageMachines) ticcutils timbl;
+in stdenv.mkDerivation {
+  pname = "mbt";
+  inherit (pin) version;
+  src = fetchurl {
+    inherit (pin) url sha256;
+    name = "mbt-${pin.version}.tar.gz";
+  };
+  # Adds a pkg-config check for libxml2 to configure.ac.
+  patches = [ ./mbt-add-libxml2-dep.patch ];
+  nativeBuildInputs = [ pkg-config automake autoconf ];
+  buildInputs = [
+    bzip2 libtar libtool autoconf-archive libxml2 ticcutils timbl
+  ];
+
+  preConfigure = ''
+    sh bootstrap.sh
+  '';
+
+  meta = {
+    description = "Memory Based Tagger";
+    homepage = "https://languagemachines.github.io/mbt/";
+    license = lib.licenses.gpl3;
+    platforms = lib.platforms.all;
+    maintainers = [ lib.maintainers.roberth ];
+    longDescription = ''
+      MBT is a memory-based tagger-generator and tagger in one. The tagger-generator part can generate a sequence tagger on the basis of a training set of tagged sequences; the tagger part can tag new sequences. MBT can, for instance, be used to generate part-of-speech taggers or chunkers for natural language processing. It has also been used for named-entity recognition, information extraction in domain-specific texts, and disfluency chunking in transcribed speech.
+
+      Mbt is used by Frog for Dutch tagging.
+    '';
+  };
+}
diff --git a/nixpkgs/pkgs/development/libraries/languagemachines/packages.nix b/nixpkgs/pkgs/development/libraries/languagemachines/packages.nix
new file mode 100644
index 000000000000..8a490bbc9867
--- /dev/null
+++ b/nixpkgs/pkgs/development/libraries/languagemachines/packages.nix
@@ -0,0 +1,18 @@
+# Package set for the LanguageMachines tools; the ICU users all get icu60.
+{ pkgs }:
+let
+  build = path: pkgs.callPackage path { };
+  buildWithIcu = path: pkgs.callPackage path { icu = pkgs.icu60; };
+in {
+  frog = buildWithIcu ./frog.nix;
+  frogdata = build ./frogdata.nix;
+  libfolia = buildWithIcu ./libfolia.nix;
+  mbt = build ./mbt.nix;
+  ticcutils = build ./ticcutils.nix;
+  timbl = build ./timbl.nix;
+  timblserver = build ./timblserver.nix;
+  ucto = buildWithIcu ./ucto.nix;
+  uctodata = build ./uctodata.nix;
+
+  test = build ./test.nix;
+}
diff --git a/nixpkgs/pkgs/development/libraries/languagemachines/release-info/LanguageMachines-frog.json b/nixpkgs/pkgs/development/libraries/languagemachines/release-info/LanguageMachines-frog.json
new file mode 100644
index 000000000000..55c2ec20a312
--- /dev/null
+++ b/nixpkgs/pkgs/development/libraries/languagemachines/release-info/LanguageMachines-frog.json
@@ -0,0 +1,5 @@
+{
+  "version": "v0.13.7",
+  "url": "https://api.github.com/repos/LanguageMachines/frog/tarball/v0.13.7",
+  "sha256": "0swyfi3g862n888qj8v8kd18745hasy0vnc70i9qlv0ji0321bnf"
+}
diff --git a/nixpkgs/pkgs/development/libraries/languagemachines/release-info/LanguageMachines-frogdata.json b/nixpkgs/pkgs/development/libraries/languagemachines/release-info/LanguageMachines-frogdata.json
new file mode 100644
index 000000000000..6ea449387172
--- /dev/null
+++ b/nixpkgs/pkgs/development/libraries/languagemachines/release-info/LanguageMachines-frogdata.json
@@ -0,0 +1,5 @@
+{
+  "version": "0.13",
+  "url": "https://api.github.com/repos/LanguageMachines/frogdata/tarball/v0.13",
+  "sha256": "13mhv8qacl0n20ddl1ay49xi6h2m0a149ya3rrsmaah3x4adb4sg"
+}
diff --git a/nixpkgs/pkgs/development/libraries/languagemachines/release-info/LanguageMachines-libfolia.json b/nixpkgs/pkgs/development/libraries/languagemachines/release-info/LanguageMachines-libfolia.json
new file mode 100644
index 000000000000..792d958213fb
--- /dev/null
+++ b/nixpkgs/pkgs/development/libraries/languagemachines/release-info/LanguageMachines-libfolia.json
@@ -0,0 +1,5 @@
+{
+  "version": "v1.7",
+  "url": "https://api.github.com/repos/LanguageMachines/libfolia/tarball/v1.7",
+  "sha256": "0hpxdry7n2887klryc587xv46p6z6jp6hz9x7k2pk5v7jb0z4s65"
+}
diff --git a/nixpkgs/pkgs/development/libraries/languagemachines/release-info/LanguageMachines-mbt.json b/nixpkgs/pkgs/development/libraries/languagemachines/release-info/LanguageMachines-mbt.json
new file mode 100644
index 000000000000..f1bbff47a28e
--- /dev/null
+++ b/nixpkgs/pkgs/development/libraries/languagemachines/release-info/LanguageMachines-mbt.json
@@ -0,0 +1,5 @@
+{
+  "version": "v3.2.16",
+  "url": "https://api.github.com/repos/LanguageMachines/mbt/tarball/v3.2.16",
+  "sha256": "0f9f5l84m0lmmv4km9myn3yhy67jbmk3qn2fi40dy025gx4l0x3x"
+}
diff --git a/nixpkgs/pkgs/development/libraries/languagemachines/release-info/LanguageMachines-ticcutils.json b/nixpkgs/pkgs/development/libraries/languagemachines/release-info/LanguageMachines-ticcutils.json
new file mode 100644
index 000000000000..11069c6b02c7
--- /dev/null
+++ b/nixpkgs/pkgs/development/libraries/languagemachines/release-info/LanguageMachines-ticcutils.json
@@ -0,0 +1,5 @@
+{
+  "version": "v0.15",
+  "url": "https://api.github.com/repos/LanguageMachines/ticcutils/tarball/v0.15",
+  "sha256": "0lssb1klx2flmr6fy78j37i5lbq3gfhzjx24j6n72ndm2rvprvcn"
+}
diff --git a/nixpkgs/pkgs/development/libraries/languagemachines/release-info/LanguageMachines-timbl.json b/nixpkgs/pkgs/development/libraries/languagemachines/release-info/LanguageMachines-timbl.json
new file mode 100644
index 000000000000..d35f2c8333aa
--- /dev/null
+++ b/nixpkgs/pkgs/development/libraries/languagemachines/release-info/LanguageMachines-timbl.json
@@ -0,0 +1,5 @@
+{
+  "version": "v6.4.9",
+  "url": "https://api.github.com/repos/LanguageMachines/timbl/tarball/v6.4.9",
+  "sha256": "1279npc3xlq05hnkylpbkgg941gjhvl6sd5fw4vgwcx2rwmmlaay"
+}
diff --git a/nixpkgs/pkgs/development/libraries/languagemachines/release-info/LanguageMachines-timblserver.json b/nixpkgs/pkgs/development/libraries/languagemachines/release-info/LanguageMachines-timblserver.json
new file mode 100644
index 000000000000..d588da3f8b6e
--- /dev/null
+++ b/nixpkgs/pkgs/development/libraries/languagemachines/release-info/LanguageMachines-timblserver.json
@@ -0,0 +1,5 @@
+{
+  "version": "v1.11",
+  "url": "https://api.github.com/repos/LanguageMachines/timblserver/tarball/v1.11",
+  "sha256": "02k8c704wr5miy82w6zj0imm7sdfnxf3db34qiaa8l3myhn17qlw"
+}
diff --git a/nixpkgs/pkgs/development/libraries/languagemachines/release-info/LanguageMachines-ucto.json b/nixpkgs/pkgs/development/libraries/languagemachines/release-info/LanguageMachines-ucto.json
new file mode 100644
index 000000000000..9b05cf3e1393
--- /dev/null
+++ b/nixpkgs/pkgs/development/libraries/languagemachines/release-info/LanguageMachines-ucto.json
@@ -0,0 +1,5 @@
+{
+  "version": "v0.9.6",
+  "url": "https://api.github.com/repos/LanguageMachines/ucto/tarball/v0.9.6",
+  "sha256": "0fxq4j32g7kp6789xz23651c4v2j7zlz87cshfv9g1xjs7jxns3f"
+}
diff --git a/nixpkgs/pkgs/development/libraries/languagemachines/release-info/LanguageMachines-uctodata.json b/nixpkgs/pkgs/development/libraries/languagemachines/release-info/LanguageMachines-uctodata.json
new file mode 100644
index 000000000000..08069bb333bc
--- /dev/null
+++ b/nixpkgs/pkgs/development/libraries/languagemachines/release-info/LanguageMachines-uctodata.json
@@ -0,0 +1,5 @@
+{
+  "version": "v0.4",
+  "url": "https://api.github.com/repos/LanguageMachines/uctodata/tarball/v0.4",
+  "sha256": "02c78qmwi9ijpk5wila3p62fmfdy1rpmlvvzbxs3wg0rdb0nwvd2"
+}
diff --git a/nixpkgs/pkgs/development/libraries/languagemachines/test.nix b/nixpkgs/pkgs/development/libraries/languagemachines/test.nix
new file mode 100644
index 000000000000..26fff825a151
--- /dev/null
+++ b/nixpkgs/pkgs/development/libraries/languagemachines/test.nix
@@ -0,0 +1,25 @@
+{ runCommand
+, languageMachines
+}:
+# Smoke test: pipes one Dutch sentence into frog, saves stdout to $out, then checks the line count and a few expected columns.
+runCommand "frog-test" {} ''
+  ${languageMachines.frog}/bin/frog >$out <<EOF
+  Dit is een test
+
+  EOF
+  echo "Frog output:"
+  cat $out
+
+  expected () {
+    echo "Test expectation failed: $@"
+    exit 1
+  }
+
+  lines="$(wc -l $out | awk '{print $1}')"
+  test 5 = $lines || expected "Five lines of output"
+  grep "is" $out | grep "zijn" >/dev/null || expected "Stemming works"
+  grep "een" $out | grep "onbep" >/dev/null || expected "Tagging works"
+
+  deps="$(echo $(awk 'BEGIN { FS = "\t*" } ; {print $1 " -> " $9 "; "}' <$out))"
+  test "1 -> 2; 2 -> 0; 3 -> 4; 4 -> 2; -> ;" = "$deps" || expected "Dependency parsing works"
+''
diff --git a/nixpkgs/pkgs/development/libraries/languagemachines/ticcutils.nix b/nixpkgs/pkgs/development/libraries/languagemachines/ticcutils.nix
new file mode 100644
index 000000000000..0b5fef292fcf
--- /dev/null
+++ b/nixpkgs/pkgs/development/libraries/languagemachines/ticcutils.nix
@@ -0,0 +1,30 @@
+# ticcutils, built from the release pinned in release-info/.
+{ lib, stdenv, fetchurl
+, automake, autoconf, libtool, pkg-config, autoconf-archive
+, libxml2, zlib, bzip2, libtar }:
+
+let
+  release = lib.importJSON ./release-info/LanguageMachines-ticcutils.json;
+in
+
+stdenv.mkDerivation {
+  pname = "ticcutils";
+  version = release.version;
+  src = fetchurl { inherit (release) url sha256;
+                   name = "ticcutils-${release.version}.tar.gz"; };
+  nativeBuildInputs = [ pkg-config automake autoconf ];
+  buildInputs = [ libtool autoconf-archive libxml2
+                  # optional:
+                  zlib bzip2 libtar
+                  # broken but optional: boost
+                ];
+  preConfigure = "sh bootstrap.sh";
+
+  meta = with lib; {
+    description = "Useful functions for general use in the TiCC software stack and beyond";
+    homepage    = "https://github.com/LanguageMachines/ticcutils";
+    license     = licenses.gpl3;
+    platforms   = platforms.all;
+    maintainers = with maintainers; [ roberth ];
+  };
+}
diff --git a/nixpkgs/pkgs/development/libraries/languagemachines/timbl.nix b/nixpkgs/pkgs/development/libraries/languagemachines/timbl.nix
new file mode 100644
index 000000000000..e1884e56132d
--- /dev/null
+++ b/nixpkgs/pkgs/development/libraries/languagemachines/timbl.nix
@@ -0,0 +1,38 @@
+# TiMBL, built from the release pinned in release-info/.
+{ lib, stdenv, fetchurl, pkg-config
+, automake, autoconf, autoconf-archive, libtool
+, bzip2, libtar, libxml2
+, languageMachines
+}:
+
+let
+  pin = lib.importJSON ./release-info/LanguageMachines-timbl.json;
+in stdenv.mkDerivation {
+  pname = "timbl";
+  inherit (pin) version;
+  src = fetchurl {
+    inherit (pin) url sha256;
+    name = "timbl-${pin.version}.tar.gz";
+  };
+  nativeBuildInputs = [ pkg-config automake autoconf ];
+  buildInputs = [
+    bzip2 libtar libtool autoconf-archive libxml2
+    languageMachines.ticcutils
+  ];
+  # Run upstream's bootstrap.sh right before the configure phase.
+  preConfigure = "sh bootstrap.sh";
+
+  meta = {
+    description = "TiMBL implements several memory-based learning algorithms";
+    mainProgram = "timbl";
+    homepage = "https://github.com/LanguageMachines/timbl/";
+    license = lib.licenses.gpl3;
+    platforms = lib.platforms.all;
+    maintainers = [ lib.maintainers.roberth ];
+    longDescription = ''
+      TiMBL is an open source software package implementing several memory-based learning algorithms, among which IB1-IG, an implementation of k-nearest neighbor classification with feature weighting suitable for symbolic feature spaces, and IGTree, a decision-tree approximation of IB1-IG. All implemented algorithms have in common that they store some representation of the training set explicitly in memory. During testing, new cases are classified by extrapolation from the most similar stored cases.
+
+      For over fifteen years TiMBL has been mostly used in natural language processing as a machine learning classifier component, but its use extends to virtually any supervised machine learning domain. Due to its particular decision-tree-based implementation, TiMBL is in many cases far more efficient in classification than a standard k-nearest neighbor algorithm would be.
+    '';
+  };
+}
diff --git a/nixpkgs/pkgs/development/libraries/languagemachines/timblserver.nix b/nixpkgs/pkgs/development/libraries/languagemachines/timblserver.nix
new file mode 100644
index 000000000000..ea40d017d471
--- /dev/null
+++ b/nixpkgs/pkgs/development/libraries/languagemachines/timblserver.nix
@@ -0,0 +1,38 @@
+# TiMBL server, built from the release pinned in release-info/.
+{ lib, stdenv, fetchurl, pkg-config
+, automake, autoconf, autoconf-archive, libtool
+, bzip2, libtar, libxml2
+, languageMachines
+}:
+
+let
+  pin = lib.importJSON ./release-info/LanguageMachines-timblserver.json;
+in stdenv.mkDerivation {
+  pname = "timblserver";
+  inherit (pin) version;
+  src = fetchurl {
+    inherit (pin) url sha256;
+    name = "timblserver-${pin.version}.tar.gz";
+  };
+
+  nativeBuildInputs = [ pkg-config automake autoconf ];
+  buildInputs = [
+    bzip2 libtar libtool autoconf-archive libxml2
+    languageMachines.ticcutils languageMachines.timbl
+  ];
+  # Run upstream's bootstrap.sh right before the configure phase.
+  preConfigure = "sh bootstrap.sh";
+
+  meta = {
+    description = "This server for TiMBL implements several memory-based learning algorithms";
+    homepage = "https://github.com/LanguageMachines/timblserver/";
+    license = lib.licenses.gpl3;
+    platforms = lib.platforms.all;
+    maintainers = [ lib.maintainers.roberth ];
+    longDescription = ''
+      This implements a server for TiMBL. TiMBL is an open source software package implementing several memory-based learning algorithms, among which IB1-IG, an implementation of k-nearest neighbor classification with feature weighting suitable for symbolic feature spaces, and IGTree, a decision-tree approximation of IB1-IG. All implemented algorithms have in common that they store some representation of the training set explicitly in memory. During testing, new cases are classified by extrapolation from the most similar stored cases.
+
+      For over fifteen years TiMBL has been mostly used in natural language processing as a machine learning classifier component, but its use extends to virtually any supervised machine learning domain. Due to its particular decision-tree-based implementation, TiMBL is in many cases far more efficient in classification than a standard k-nearest neighbor algorithm would be.
+    '';
+  };
+}
diff --git a/nixpkgs/pkgs/development/libraries/languagemachines/ucto.nix b/nixpkgs/pkgs/development/libraries/languagemachines/ucto.nix
new file mode 100644
index 000000000000..6a3e31bcd553
--- /dev/null
+++ b/nixpkgs/pkgs/development/libraries/languagemachines/ucto.nix
@@ -0,0 +1,50 @@
+# Ucto tokenizer, built from the release pinned in release-info/.
+{ lib, stdenv, fetchurl, pkg-config
+, automake, autoconf, autoconf-archive, libtool
+, bzip2, icu, libtar, libxml2
+, languageMachines
+}:
+
+let
+  pin = lib.importJSON ./release-info/LanguageMachines-ucto.json;
+  inherit (languageMachines) ticcutils libfolia uctodata;
+in stdenv.mkDerivation {
+  pname = "ucto";
+  inherit (pin) version;
+  src = fetchurl {
+    inherit (pin) url sha256;
+    name = "ucto-${pin.version}.tar.gz";
+  };
+
+  nativeBuildInputs = [ pkg-config automake autoconf ];
+  buildInputs = [
+    bzip2 libtool autoconf-archive icu libtar libxml2
+    ticcutils libfolia uctodata
+    # TODO textcat from libreoffice? Pulls in X11 dependencies?
+  ];
+  preConfigure = "sh bootstrap.sh;";
+
+  # Symlink every uctodata file into this package's own share/ucto.
+  postInstall = ''
+    # ucto expects the data files installed in the same prefix
+    mkdir -p $out/share/ucto/;
+    for f in ${uctodata}/share/ucto/*; do
+      echo "Linking $f"
+      ln -s $f $out/share/ucto/;
+    done;
+  '';
+
+  meta = {
+    description = "A rule-based tokenizer for natural language";
+    mainProgram = "ucto";
+    homepage = "https://languagemachines.github.io/ucto/";
+    license = lib.licenses.gpl3;
+    platforms = lib.platforms.all;
+    maintainers = [ lib.maintainers.roberth ];
+    longDescription = ''
+      Ucto tokenizes text files: it separates words from punctuation, and splits sentences. It offers several other basic preprocessing steps such as changing case that you can all use to make your text suited for further processing such as indexing, part-of-speech tagging, or machine translation.
+
+      Ucto comes with tokenisation rules for several languages and can be easily extended to suit other languages. It has been incorporated for tokenizing Dutch text in Frog, a Dutch morpho-syntactic processor.
+    '';
+  };
+}
diff --git a/nixpkgs/pkgs/development/libraries/languagemachines/uctodata.nix b/nixpkgs/pkgs/development/libraries/languagemachines/uctodata.nix
new file mode 100644
index 000000000000..a274b6193edd
--- /dev/null
+++ b/nixpkgs/pkgs/development/libraries/languagemachines/uctodata.nix
@@ -0,0 +1,32 @@
+# Data files for ucto, built from the release pinned in release-info/.
+{ lib, stdenv, fetchurl
+, automake, autoconf, libtool, pkg-config, autoconf-archive
+}:
+
+let
+  release = lib.importJSON ./release-info/LanguageMachines-uctodata.json;
+in
+
+stdenv.mkDerivation {
+  pname = "uctodata";
+  version = release.version;
+  src = fetchurl { inherit (release) url sha256;
+                   name = "uctodata-${release.version}.tar.gz"; };
+  nativeBuildInputs = [ pkg-config automake autoconf ];
+  buildInputs = [ libtool autoconf-archive ];
+  preConfigure = "sh bootstrap.sh";
+
+  meta = with lib; {
+    description = "Data for ucto, a rule-based tokenizer for natural language";
+    homepage    = "https://languagemachines.github.io/ucto/";
+    license     = licenses.gpl3;
+    platforms   = platforms.all;
+    maintainers = with maintainers; [ roberth ];
+
+    longDescription = ''
+      Ucto tokenizes text files: it separates words from punctuation, and splits sentences. It offers several other basic preprocessing steps such as changing case that you can all use to make your text suited for further processing such as indexing, part-of-speech tagging, or machine translation.
+
+      Ucto comes with tokenisation rules for several languages and can be easily extended to suit other languages. It has been incorporated for tokenizing Dutch text in Frog, a Dutch morpho-syntactic processor.
+    '';
+  };
+}
diff --git a/nixpkgs/pkgs/development/libraries/languagemachines/update b/nixpkgs/pkgs/development/libraries/languagemachines/update
new file mode 100755
index 000000000000..3189637df746
--- /dev/null
+++ b/nixpkgs/pkgs/development/libraries/languagemachines/update
@@ -0,0 +1,79 @@
+#!/usr/bin/env nix-shell
+#!nix-shell --packages curl
+#!nix-shell --packages jq
+#!nix-shell --packages parallel
+#!nix-shell -i bash
+
+# Exit immediately if a command exits with a non-zero status.
+# Make a pipeline fail when any command in it fails, not just the last one.
+# Treat undefined variable references as errors
+set -e -o pipefail -u
+
+# Check if working directory is (probably) right
+test "./update" = "$0" || {
+    echo "The working directory ought to be the same as the update script location. Please invoke as ./update" 1>&2
+    exit 1
+}
+
+# Create temporary directory with automatic cleanup.
+# Assign first, then mark readonly, so a failing mktemp still aborts via set -e.
+MY_TMP="$(mktemp -d)"
+readonly MY_TMP
+cleanup () { rm -rf "$MY_TMP"; }
+trap cleanup EXIT
+
+# stdout: file containing release info and a convenient placeholder
+# for the sha256 attribute
+getRelease () {
+    local owner="$1" repo="$2"
+    local out="$MY_TMP/$owner--$repo-release"
+    # Callers run this inside $(...), where bash drops errexit: fail explicitly.
+    curl -fSs https://api.github.com/repos/"$owner"/"$repo"/releases/latest \
+        | jq '{ version: .name, url: .tarball_url, sha256: "__SHA256__" }' \
+             > "$out" || return
+    echo "$out"
+}
+
+# 'getters' for the release info file
+
+# stdout: the tarball url stored in the given release info file, without JSON quotes
+releaseUrl () {
+    local infoFile="$1"
+    jq --raw-output '.url' < "$infoFile"
+}
+
+# stdout: the version stored in the given release info file, without JSON quotes
+releaseVersion () {
+    local infoFile="$1"
+    jq --raw-output '.version' < "$infoFile"
+}
+
+# Prefetch the release tarball under the name "<name>-<version>.tar.gz".
+# stdout: base32 sha256 to be used in fetchurl
+getReleaseHash () {
+    local infoFile="$1" pkgName="$2"
+    nix-prefetch-url "$(releaseUrl "$infoFile")" \
+        --name "$pkgName-$(releaseVersion "$infoFile").tar.gz"
+}
+
+# Write a release info file to release-info/$owner-$repo.json
+updateRelease () {
+    local owner="$1" repo="$2"
+    # Declared separately: `local v="$(cmd)"` would hide cmd's exit status from set -e
+    local r hash
+    r="$(getRelease "$owner" "$repo")"
+    hash="$(getReleaseHash "$r" "$repo")"
+    sed -e "s/__SHA256__/$hash/" \
+      <"$r" \
+      >"release-info/$owner-$repo.json"
+}
+
+# Refresh the release info file of each LanguageMachines package listed here.
+for repo in \
+    frogdata frog libfolia \
+    mbt ticcutils \
+    timbl timblserver \
+    ucto uctodata
+do
+    updateRelease LanguageMachines "$repo"
+done