Diffstat (limited to 'nixpkgs-overlays/nixpkgs-mozilla/lib/parseTOML.nix')
 nixpkgs-overlays/nixpkgs-mozilla/lib/parseTOML.nix | 210 +++++++++++++++++++
 1 file changed, 210 insertions(+), 0 deletions(-)
diff --git a/nixpkgs-overlays/nixpkgs-mozilla/lib/parseTOML.nix b/nixpkgs-overlays/nixpkgs-mozilla/lib/parseTOML.nix
new file mode 100644
index 000000000000..07a93696340c
--- /dev/null
+++ b/nixpkgs-overlays/nixpkgs-mozilla/lib/parseTOML.nix
@@ -0,0 +1,210 @@
+with builtins;
+
+# Tokenizer.
+let
+  layout_pat = "[ \n]+";
+  layout_pat_opt = "[ \n]*";
+  token_pat = ''=|[[][[][a-zA-Z0-9_."*-]+[]][]]|[[][a-zA-Z0-9_."*-]+[]]|[a-zA-Z0-9_-]+|"[^"]*"''; #"
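+  # For example (illustrative), tokenizing the TOML fragment
+  #   [pkg]
+  #   name = "rustc"
+  # yields the token list [ "[pkg]" "name" "=" ''"rustc"'' ].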
+
+  tokenizer_1_11 = str:
+    let
+      tokenizer_rec = len: prevTokens: patterns: str:
+        let
+          pattern = head patterns;
+          layoutAndTokens = match pattern str;
+          matchLength = stringLength (head layoutAndTokens);
+          tokens = prevTokens ++ tail layoutAndTokens;
+        in
+          if layoutAndTokens == null then
+            # If no smaller pattern is left to try, return the tokens
+            # collected so far.
+            if tail patterns == [] then prevTokens
+            # Otherwise, retry with the next pattern, which captures half as
+            # many tokens.
+            else tokenizer_rec len prevTokens (tail patterns) str
+          else tokenizer_rec len tokens patterns (substring matchLength len str);
+
+      avgTokenSize = 100;
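+      # ceilLog2 returns the smallest n such that 2^(n-1) >= v, e.g.
+      # ceilLog2 8 == 4.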
+      ceilLog2 = v:
+        let inner = n: i: if i < v then inner (n + 1) (i * 2) else n; in
+        inner 1 1;
+
+      # The builtins.match function matches the entire string and generates a
+      # list of all captured elements. This is the most efficient way to build
+      # a tokenizer, provided we can write a pattern which captures every
+      # token of the file. Unfortunately, C++ std::regex does not support
+      # captures inside repeated patterns. As a work-around, we generate
+      # patterns which match tokens in powers of 2, so that we avoid iterating
+      # too many times over the content.
+      generatePatterns = str:
+        let
+          depth = ceilLog2 (stringLength str / avgTokenSize);
+          inner = depth:
+            if depth == 0 then [ "(${token_pat})" ]
+            else
+              let next = inner (depth - 1); in
+              [ "${head next}${layout_pat}${head next}" ] ++ next;
+        in
+          map (pat: "(${layout_pat_opt}${pat}).*" ) (inner depth);
+
+    in
+      tokenizer_rec (stringLength str) [] (generatePatterns str) str;
+
+  tokenizer_1_12 = str:
+    let
+      # Nix 1.12 has the builtins.split function, which allows tokenizing the
+      # file quickly by iterating with a simple regexp.
+      layoutTokenList = split "(${token_pat})" str;
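+      # For example (illustrative), split "(${token_pat})" ''name = "foo"''
+      # yields [ "" [ "name" ] " " [ "=" ] " " [ ''"foo"'' ] "" ]: plain
+      # strings are the layout between tokens, and each single-element list
+      # wraps one captured token.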
+      isLayout = s: match layout_pat_opt s != null;
+      filterLayout = list:
+        filter (s:
+          if isString s then
+            if isLayout s then false
+            else throw "Error: Unexpected token: '${s}'"
+          else true) list;
+      removeTokenWrapper = list:
+        map (x: assert tail x == []; head x) list;
+    in
+      removeTokenWrapper (filterLayout layoutTokenList);
+
+  tokenizer =
+    if builtins ? split
+    then tokenizer_1_12
+    else tokenizer_1_11;
+in
+
+# Parse entry headers
+let
+  unescapeString = str:
+    # Let's ignore any escape character for the moment.
+    assert match ''"[^"]*"'' str != null; #"
+    substring 1 (stringLength str - 2) str;
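+  # e.g. unescapeString ''"hello"'' == "hello"; escape sequences inside the
+  # string are left untouched.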
+
+  # Match the content of TOML format section names.
+  ident_pat = ''[a-zA-Z0-9_-]+|"[^"]*"''; #"
+
+  removeBraces = token: wrapLen:
+    substring wrapLen (stringLength token - 2 * wrapLen) token;
+
+  # Note: this implementation is limited to paths of at most 11 identifiers.
+  matchPathFun_1_11 = token:
+    let
+      # e.g. match header_pat "a.b.c" matches as
+      # [ "a" ".b.c" "b" ".c" "c" ... ]; the dotted-remainder captures are
+      # removed by filterDot below.
+      header_pat =
+        foldl' (pat: n: "(${ident_pat})([.]${pat})?")
+           "(${ident_pat})" (genList (n: 0) 10);
+      matchPath = match header_pat token;
+      filterDot = filter (s: substring 0 1 s != ".") matchPath;
+    in
+      filterDot;
+
+  matchPathFun_1_12 = token:
+    map (e: head e)
+      (filter (s: isList s)
+        (split "(${ident_pat})" token));
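+  # For example (illustrative), matchPathFun_1_12 ''a."b.c"'' returns
+  # [ "a" ''"b.c"'' ]: quoted identifiers are kept whole and the separating
+  # dots are dropped.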
+
+  matchPathFun =
+    if builtins ? split
+    then matchPathFun_1_12
+    else matchPathFun_1_11;
+
+  headerToPath = token: wrapLen:
+    let
+      token' = removeBraces token wrapLen;
+      matchPath = matchPathFun token';
+      path =
+        map (s:
+          if substring 0 1 s != ''"'' then s #"
+          else unescapeString s
+        ) matchPath;
+    in
+      assert matchPath != null;
+      # assert trace "Path: ${token'}; match as ${toString path}" true;
+      path;
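+
+  # For example (illustrative), headerToPath
+  # "[pkg.rust.target.x86_64-unknown-linux-gnu]" 1 returns
+  # [ "pkg" "rust" "target" "x86_64-unknown-linux-gnu" ].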
+in
+
+# Reconstruct the equivalent attribute set.
+let
+  tokenToValue = token:
+    if token == "true" then true
+    else if token == "false" then false
+    else unescapeString token;
+
+  parserInitState = {
+    idx = 0;
+    path = [];
+    isList = false;
+    output = [];
+    elem = {};
+  };
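+
+  # The parser is a small state machine folded over the token list: idx 0
+  # expects a section header or a key, idx 1 expects "=", and idx 2 expects a
+  # value; `elem` accumulates the key/value pairs of the current section.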
+
+  # Imported from nixpkgs library.
+  setAttrByPath = attrPath: value:
+    if attrPath == [] then value
+    else listToAttrs
+      [ { name = head attrPath; value = setAttrByPath (tail attrPath) value; } ];
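+  # e.g. setAttrByPath [ "a" "b" ] 1 == { a = { b = 1; }; }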
+
+  closeSection = state:
+    state // {
+      output = state.output ++ [ (setAttrByPath state.path (
+        if state.isList then [ state.elem ]
+        else state.elem
+      )) ];
+    };
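+
+  # closeSection flushes the section being built into `output`, wrapping its
+  # element in a list when the header used the [[...]] form so that repeated
+  # [[...]] sections can later be concatenated by zipAttrs.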
+
+  readToken = state: token:
+    # assert trace "Read '${token}'" true;
+    if state.idx == 0 then
+      if substring 0 2 token == "[[" then
+        (closeSection state) // {
+          path = headerToPath token 2;
+          isList = true;
+          elem = {};
+        }
+      else if substring 0 1 token == "[" then
+        (closeSection state) // {
+          path = headerToPath token 1;
+          isList = false;
+          elem = {};
+        }
+      else
+        assert match "[a-zA-Z0-9_-]+" token != null;
+        state // { idx = 1; name = token; }
+    else if state.idx == 1 then
+      assert token == "=";
+      state // { idx = 2; }
+    else
+      assert state.idx == 2;
+      state // {
+        idx = 0;
+        elem = state.elem // {
+          "${state.name}" = tokenToValue token;
+        };
+      };
+
+  # Aggregate each section into an individual attribute set.
+  parser = str:
+    closeSection (foldl' readToken parserInitState (tokenizer str));
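+
+  # Note: the first header encountered flushes whatever was accumulated
+  # before it, so top-level key/value pairs (or an empty set) become the
+  # first entry of `output`.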
+
+  fromTOML = toml:
+    let
+      sections = (parser toml).output;
+      # Inlined from nixpkgs library functions.
+      zipAttrs = sets:
+        listToAttrs (map (n: {
+          name = n;
+          value =
+            let v = catAttrs n sets; in
+            # assert trace "Visiting ${n}" true;
+            if tail v == [] then head v
+            else if isList (head v) then concatLists v
+            else if isAttrs (head v) then zipAttrs v
+            else throw "cannot merge sections";
+        }) (concatLists (map attrNames sets)));
+    in
+      zipAttrs sections;
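+
+  # For example (illustrative):
+  #   fromTOML ''
+  #     [pkg.rust]
+  #     version = "1.0"
+  #     [[pkg.rust.target]]
+  #     name = "x86_64"
+  #   ''
+  #   == { pkg.rust = { version = "1.0"; target = [ { name = "x86_64"; } ]; }; }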
+in
+
+{
+  testing = fromTOML (builtins.readFile ./channel-rust-nightly.toml);
+  testing_url = fromTOML (builtins.readFile (builtins.fetchurl
+    https://static.rust-lang.org/dist/channel-rust-nightly.toml));
+  inherit fromTOML;
+}
+