diff options
author | Alyssa Ross <hi@alyssa.is> | 2019-01-07 02:18:36 +0000 |
---|---|---|
committer | Alyssa Ross <hi@alyssa.is> | 2019-01-07 02:18:47 +0000 |
commit | 36f56d99fa0a0765c9f1de4a5f17a9b05830c3f2 (patch) | |
tree | b3faaf573407b32aa645237a4d16b82778a39a92 /nixpkgs/pkgs/servers/dict | |
parent | 4e31070265257dc67d120c27e0f75c2344fdfa9a (diff) | |
parent | abf060725d7614bd3b9f96764262dfbc2f9c2199 (diff) | |
download | nixlib-36f56d99fa0a0765c9f1de4a5f17a9b05830c3f2.tar nixlib-36f56d99fa0a0765c9f1de4a5f17a9b05830c3f2.tar.gz nixlib-36f56d99fa0a0765c9f1de4a5f17a9b05830c3f2.tar.bz2 nixlib-36f56d99fa0a0765c9f1de4a5f17a9b05830c3f2.tar.lz nixlib-36f56d99fa0a0765c9f1de4a5f17a9b05830c3f2.tar.xz nixlib-36f56d99fa0a0765c9f1de4a5f17a9b05830c3f2.tar.zst nixlib-36f56d99fa0a0765c9f1de4a5f17a9b05830c3f2.zip |
Add 'nixpkgs/' from commit 'abf060725d7614bd3b9f96764262dfbc2f9c2199'
git-subtree-dir: nixpkgs git-subtree-mainline: 4e31070265257dc67d120c27e0f75c2344fdfa9a git-subtree-split: abf060725d7614bd3b9f96764262dfbc2f9c2199
Diffstat (limited to 'nixpkgs/pkgs/servers/dict')
-rw-r--r-- | nixpkgs/pkgs/servers/dict/buildfix.diff | 11 | ||||
-rw-r--r-- | nixpkgs/pkgs/servers/dict/default.nix | 35 | ||||
-rw-r--r-- | nixpkgs/pkgs/servers/dict/dictd-db-collector.nix | 80 | ||||
-rw-r--r-- | nixpkgs/pkgs/servers/dict/dictd-db.nix | 95 | ||||
-rw-r--r-- | nixpkgs/pkgs/servers/dict/dictd-wiktionary.nix | 31 | ||||
-rw-r--r-- | nixpkgs/pkgs/servers/dict/dictd-wordnet.nix | 36 | ||||
-rw-r--r-- | nixpkgs/pkgs/servers/dict/libmaa.nix | 20 | ||||
-rw-r--r-- | nixpkgs/pkgs/servers/dict/wiktionary2dict.py | 778 | ||||
-rw-r--r-- | nixpkgs/pkgs/servers/dict/wordnet_structures.py | 319 |
9 files changed, 1405 insertions, 0 deletions
diff --git a/nixpkgs/pkgs/servers/dict/buildfix.diff b/nixpkgs/pkgs/servers/dict/buildfix.diff new file mode 100644 index 000000000000..e30fcb2b0380 --- /dev/null +++ b/nixpkgs/pkgs/servers/dict/buildfix.diff @@ -0,0 +1,11 @@ +--- Makefile.in~ 2011-03-06 18:52:54.000000000 +0100 ++++ Makefile.in 2014-01-29 19:04:51.384844897 +0100 +@@ -123,7 +123,7 @@ + + %: %.o + $(LIBTOOL) --tag=CC --mode=link $(CC) -o $@ -static \ +- $^ $(OBJS) $(LDFLAGS) -lz ${LIBS} ++ $(^:.o=.lo) $(OBJS) $(LDFLAGS) -lz ${LIBS} + + include $(srcdir)/deps + diff --git a/nixpkgs/pkgs/servers/dict/default.nix b/nixpkgs/pkgs/servers/dict/default.nix new file mode 100644 index 000000000000..bf9fd77df7c9 --- /dev/null +++ b/nixpkgs/pkgs/servers/dict/default.nix @@ -0,0 +1,35 @@ +{ stdenv, fetchurl, which, bison, flex, libmaa, zlib, libtool }: + +stdenv.mkDerivation rec { + name = "dictd-${version}"; + version = "1.12.1"; + + src = fetchurl { + url = "mirror://sourceforge/dict/dictd-${version}.tar.gz"; + sha256 = "0min6v60b6z5mrymyjfwzx8nv6rdm8pd8phlwl6v2jl5vkngcdx2"; + }; + + buildInputs = [ libmaa zlib ]; + + nativeBuildInputs = [ bison flex libtool which ]; + + # Makefile(.in) contains "clientparse.c clientparse.h: clientparse.y" which + # causes bison to run twice, and break the build when this happens in + # parallel. Test with "make -j clientparse.c clientparse.h". The error + # message may be "mv: cannot move 'y.tab.c' to 'clientparse.c'". 
+ enableParallelBuilding = false; + + patchPhase = "patch -p0 < ${./buildfix.diff}"; + configureFlags = [ + "--enable-dictorg" + "--datadir=/run/current-systems/sw/share/dictd" + ]; + + meta = with stdenv.lib; { + description = "Dict protocol server and client"; + homepage = http://www.dict.org; + license = licenses.gpl2; + maintainers = with maintainers; [ ]; + platforms = platforms.linux; + }; +} diff --git a/nixpkgs/pkgs/servers/dict/dictd-db-collector.nix b/nixpkgs/pkgs/servers/dict/dictd-db-collector.nix new file mode 100644 index 000000000000..f635c98602d9 --- /dev/null +++ b/nixpkgs/pkgs/servers/dict/dictd-db-collector.nix @@ -0,0 +1,80 @@ +{stdenv, lib, dict}: +({dictlist, allowList ? ["127.0.0.1"], denyList ? []}: +/* + dictlist is a list of form + [ { filename = /path/to/files/basename; + name = "name"; } ] + basename.dict.dz and basename.index should be + dict files. Or look below for other options. + allowList is a list of IP/domain *-wildcarded strings + denyList is the same.. 
+*/ + +let + link_arguments = map + (x: '' "${x.filename}" '') + dictlist; + databases = lib.concatStrings (map (x : + "${x.name} ${x.filename}\n") dictlist); + allow = lib.concatStrings (map (x: "allow ${x}\n") allowList); + deny = lib.concatStrings (map (x: "deny ${x}\n") denyList); + accessSection = " + access { + ${allow} + ${deny} + } + "; + installPhase = '' + mkdir -p $out/share/dictd + cd $out/share/dictd + echo "${databases}" >databases.names + echo "${accessSection}" > dictd.conf + for j in ${toString link_arguments}; do + name="$(egrep ' '"$j"\$ databases.names)" + name=''${name% $j} + if test -d "$j"; then + if test -d "$j"/share/dictd ; then + echo "Got store path $j" + j="$j"/share/dictd + fi + echo "Directory reference: $j" + i=$(ls "$j""/"*.index) + i="''${i%.index}"; + else + i="$j"; + fi + echo "Basename is $i" + locale=$(cat "$(dirname "$i")"/locale) + base="$(basename "$i")" + echo "Locale is $locale" + export LC_ALL=$locale + export LANG=$locale + if test -e "$i".dict.dz; then + ln -s "$i".dict.dz + else + cp "$i".dict . + dictzip "$base".dict + fi + ln -s "$i".index . 
+ dictfmt_index2word --locale $locale < "$base".index > "$base".word || true + dictfmt_index2suffix --locale $locale < "$base".index > "$base".suffix || true + + echo "database $name {" >> dictd.conf + echo " data $out/share/dictd/$base.dict.dz" >> dictd.conf + echo " index $out/share/dictd/$base.index" >> dictd.conf + echo " index_word $out/share/dictd/$base.word" >> dictd.conf + echo " index_suffix $out/share/dictd/$base.suffix" >> dictd.conf + echo "}" >> dictd.conf + done + ''; + +in + +stdenv.mkDerivation { + name = "dictd-dbs"; + + phases = ["installPhase"]; + buildInputs = [dict]; + + inherit installPhase; +}) diff --git a/nixpkgs/pkgs/servers/dict/dictd-db.nix b/nixpkgs/pkgs/servers/dict/dictd-db.nix new file mode 100644 index 000000000000..065218a5b95d --- /dev/null +++ b/nixpkgs/pkgs/servers/dict/dictd-db.nix @@ -0,0 +1,95 @@ +{ stdenv, fetchurl, callPackage }: + +let + # Probably a bug in some FreeDict release files, but easier to trivially + # work around than report. Not that it can cause any other problems.. 
+ makeDictdDBFreedict = src: name: locale: + makeDictdDB src name "{.,bin}" locale; + + makeDictdDB = src: _name: _subdir: _locale: + stdenv.mkDerivation rec { + name = "dictd-db-${_name}"; + inherit src; + locale = _locale; + dbName = _name; + buildPhase = ":"; + unpackPhase = '' + tar xf ${src} + ''; + installPhase = '' + mkdir -p $out/share/dictd + cp $(ls ./${_subdir}/*.{dict*,index} || true) $out/share/dictd + echo "${_locale}" >$out/share/dictd/locale + ''; + + meta = { + description = "dictd-db dictionary for dictd"; + platforms = stdenv.lib.platforms.linux; + }; + }; +in rec { + deu2eng = makeDictdDBFreedict (fetchurl { + url = mirror://sourceforge/freedict/deu-eng.tar.gz; + sha256 = "0dqrhv04g4f5s84nbgisgcfwk5x0rpincif0yfhfh4sc1bsvzsrb"; + }) "deu-eng" "de_DE"; + eng2deu = makeDictdDBFreedict (fetchurl { + url = mirror://sourceforge/freedict/eng-deu.tar.gz; + sha256 = "01x12p72sa3071iff3jhzga8588440f07zr56r3x98bspvdlz73r"; + }) "eng-deu" "en_EN"; + nld2eng = makeDictdDBFreedict (fetchurl { + url = mirror://sourceforge/freedict/nld-eng.tar.gz; + sha256 = "1vhw81pphb64fzsjvpzsnnyr34ka2fxizfwilnxyjcmpn9360h07"; + }) "nld-eng" "nl_NL"; + eng2nld = makeDictdDBFreedict (fetchurl { + url = mirror://sourceforge/freedict/eng-nld.tar.gz; + sha256 = "0rcg28ldykv0w2mpxc6g4rqmfs33q7pbvf68ssy1q9gpf6mz7vcl"; + }) "eng-nld" "en_UK"; + eng2rus = makeDictdDBFreedict (fetchurl { + url = mirror://sourceforge/freedict/eng-rus.tar.gz; + sha256 = "15409ivhww1wsfjr05083pv6mg10bak8v5pg1wkiqybk7ck61rry"; + }) "eng-rus" "en_UK"; + fra2eng = makeDictdDBFreedict (fetchurl { + url = mirror://sourceforge/freedict/fra-eng.tar.gz; + sha256 = "0sdd88s2zs5whiwdf3hd0s4pzzv75sdsccsrm1wxc87l3hjm85z3"; + }) "fra-eng" "fr_FR"; + eng2fra = makeDictdDBFreedict (fetchurl { + url = mirror://sourceforge/freedict/eng-fra.tar.gz; + sha256 = "0fi6rrnbqnhc6lq8d0nmn30zdqkibrah0mxfg27hsn9z7alwbj3m"; + }) "eng-fra" "en_UK"; + mueller_eng2rus_pkg = makeDictdDB (fetchurl { + url = 
mirror://sourceforge/mueller-dict/mueller-dict-3.1.tar.gz; + sha256 = "04r5xxznvmcb8hkxqbjgfh2gxvbdd87jnhqn5gmgvxxw53zpwfmq"; + }) "mueller-eng-rus" "mueller-dict-*/dict" "en_UK"; + mueller_enru_abbr = { + outPath = "${mueller_eng2rus_pkg}/share/dictd/mueller-abbrev"; + name = "mueller-abbr"; + dbName = "mueller-abbr"; + locale = "en_UK"; + }; + mueller_enru_base = { + outPath = "${mueller_eng2rus_pkg}/share/dictd/mueller-base"; + name = "mueller-base"; + dbName = "mueller-base"; + locale = "en_UK"; + }; + mueller_enru_dict = { + outPath = "${mueller_eng2rus_pkg}/share/dictd/mueller-dict"; + name = "mueller-dict"; + dbName = "mueller-dict"; + locale = "en_UK"; + }; + mueller_enru_geo = { + outPath = "${mueller_eng2rus_pkg}/share/dictd/mueller-geo"; + name = "mueller-geo"; + dbName = "mueller-geo"; + locale = "en_UK"; + }; + mueller_enru_names = { + outPath = "${mueller_eng2rus_pkg}/share/dictd/mueller-names"; + name = "mueller-names"; + dbName = "mueller-names"; + locale = "en_UK"; + }; + wordnet = callPackage ./dictd-wordnet.nix {}; + wiktionary = callPackage ./dictd-wiktionary.nix {}; +} diff --git a/nixpkgs/pkgs/servers/dict/dictd-wiktionary.nix b/nixpkgs/pkgs/servers/dict/dictd-wiktionary.nix new file mode 100644 index 000000000000..13e4757fe89e --- /dev/null +++ b/nixpkgs/pkgs/servers/dict/dictd-wiktionary.nix @@ -0,0 +1,31 @@ +{stdenv, fetchurl, python, dict, glibcLocales, writeScript}: + +stdenv.mkDerivation rec { + version = "20161001"; + name = "dict-db-wiktionary-${version}"; + data = fetchurl { + url = "http://dumps.wikimedia.org/enwiktionary/${version}/enwiktionary-${version}-pages-articles.xml.bz2"; + sha256 = "0g3k7kxp2nzg0v56i4cz253af3aqvhn1lwkys2fnam51cn3yqm7m"; + }; + + convert = ./wiktionary2dict.py; + buildInputs = [python dict glibcLocales]; + + builder = writeScript "wiktionary-builder.sh" '' + source $stdenv/setup + + mkdir -p $out/share/dictd/ + cd $out/share/dictd + + python -O ${convert} ${data} + dictzip wiktionary-en.dict + echo 
en_US.UTF-8 > locale + ''; + + meta = { + description = "DICT version of English Wiktionary"; + homepage = http://en.wiktionary.org/; + maintainers = [ ]; + platforms = stdenv.lib.platforms.all; + }; +} diff --git a/nixpkgs/pkgs/servers/dict/dictd-wordnet.nix b/nixpkgs/pkgs/servers/dict/dictd-wordnet.nix new file mode 100644 index 000000000000..8a1bb6313ad2 --- /dev/null +++ b/nixpkgs/pkgs/servers/dict/dictd-wordnet.nix @@ -0,0 +1,36 @@ +{stdenv, python, wordnet, writeScript}: + +stdenv.mkDerivation rec { + version = "542"; + name = "dict-db-wordnet-${version}"; + + buildInputs = [python wordnet]; + convert = ./wordnet_structures.py; + + builder = writeScript "builder.sh" '' + . ${stdenv}/setup + mkdir -p $out/share/dictd/ + cd $out/share/dictd + + for i in ${wordnet}/dict/data.*; do + DATA="$DATA `echo $i | sed -e s,data,index,` $i"; + done + + python ${convert} $DATA + echo en_US.UTF-8 > locale + ''; + + meta = { + description = "dictd-compatible version of WordNet"; + + longDescription = + '' WordNet® is a large lexical database of English. This package makes + the wordnet data available to dictd and by extension for lookup with + the dict command. 
''; + + homepage = https://wordnet.princeton.edu/; + + maintainers = [ ]; + platforms = stdenv.lib.platforms.all; + }; +} diff --git a/nixpkgs/pkgs/servers/dict/libmaa.nix b/nixpkgs/pkgs/servers/dict/libmaa.nix new file mode 100644 index 000000000000..3c04a678def5 --- /dev/null +++ b/nixpkgs/pkgs/servers/dict/libmaa.nix @@ -0,0 +1,20 @@ +{ stdenv, fetchurl, libtool }: + +stdenv.mkDerivation rec { + version = "1.3.2"; + name = "libmaa-${version}"; + + src = fetchurl { + url = "mirror://sourceforge/dict/libmaa-${version}.tar.gz"; + sha256 = "1idi4c30pi79g5qfl7rr9s17krbjbg93bi8f2qrbsdlh78ga19ar"; + }; + + buildInputs = [ libtool ]; + # configureFlags = [ "--datadir=/var/run/current-system/share/dictd" ]; + + meta = with stdenv.lib; { + description = "Dict protocol server and client"; + maintainers = [ ]; + platforms = platforms.linux; + }; +} diff --git a/nixpkgs/pkgs/servers/dict/wiktionary2dict.py b/nixpkgs/pkgs/servers/dict/wiktionary2dict.py new file mode 100644 index 000000000000..8a0210e7254f --- /dev/null +++ b/nixpkgs/pkgs/servers/dict/wiktionary2dict.py @@ -0,0 +1,778 @@ +# Adapted to produce DICT-compatible files by Petr Rockai in 2012 +# Based on code from wiktiondict by Greg Hewgill +import re +import sys +import codecs +import os +import textwrap +import time +import xml.sax + +class Text: + def __init__(self, s): + self.s = s + def process(self): + return s + +class TemplateCall: + def __init__(self): + pass + def process(self): + pass + +class Template: + def __init__(self): + self.parts = [] + def append(self, part): + self.parts.append(part) + def process(self): + return ''.join(x.process() for x in self.parts) + +class Whitespace: + def __init__(self, s): + self.s = s + +class OpenDouble: pass +class OpenTriple: pass +class CloseDouble: pass +class CloseTriple: pass + +class Equals: + def __str__(self): + return "=" + +class Delimiter: + def __init__(self, c): + self.c = c + def __str__(self): + return self.c + +def Tokenise(s): + s = unicode(s) + 
stack = [] + last = 0 + i = 0 + while i < len(s): + if s[i] == '{' and i+1 < len(s) and s[i+1] == '{': + if i > last: + yield s[last:i] + if i+2 < len(s) and s[i+2] == '{': + yield OpenTriple() + stack.append(3) + i += 3 + else: + yield OpenDouble() + stack.append(2) + i += 2 + last = i + elif s[i] == '}' and i+1 < len(s) and s[i+1] == '}': + if i > last: + yield s[last:i] + if len(stack) == 0: + yield "}}" + i += 2 + elif stack[-1] == 2: + yield CloseDouble() + i += 2 + stack.pop() + elif i+2 < len(s) and s[i+2] == '}': + yield CloseTriple() + i += 3 + stack.pop() + else: + raise SyntaxError() + last = i + elif s[i] == ':' or s[i] == '|': + if i > last: + yield s[last:i] + yield Delimiter(s[i]) + i += 1 + last = i + elif s[i] == '=': + if i > last: + yield s[last:i] + yield Equals() + i += 1 + last = i + #elif s[i] == ' ' or s[i] == '\t' or s[i] == '\n': + # if i > last: + # yield s[last:i] + # last = i + # m = re.match(r"\s+", s[i:]) + # assert m + # yield Whitespace(m.group(0)) + # i += len(m.group(0)) + # last = i + else: + i += 1 + if i > last: + yield s[last:i] + +def processSub(templates, tokens, args): + t = tokens.next() + if not isinstance(t, unicode): + raise SyntaxError + name = t + t = tokens.next() + default = None + if isinstance(t, Delimiter) and t.c == '|': + default = "" + while True: + t = tokens.next() + if isinstance(t, unicode): + default += t + elif isinstance(t, OpenDouble): + default += processTemplateCall(templates, tokens, args) + elif isinstance(t, OpenTriple): + default += processSub(templates, tokens, args) + elif isinstance(t, CloseTriple): + break + else: + print "Unexpected:", t + raise SyntaxError() + if name in args: + return args[name] + if default is not None: + return default + if name == "lang": + return "en" + return "{{{%s}}}" % name + +def processTemplateCall(templates, tokens, args): + template = tokens.next().strip().lower() + args = {} + a = 1 + t = tokens.next() + while True: + if isinstance(t, Delimiter): + name = 
unicode(a) + arg = "" + while True: + t = tokens.next() + if isinstance(t, unicode): + arg += t + elif isinstance(t, OpenDouble): + arg += processTemplateCall(templates, tokens, args) + elif isinstance(t, OpenTriple): + arg += processSub(templates, tokens, args) + elif isinstance(t, Delimiter) and t.c != '|': + arg += str(t) + else: + break + if isinstance(t, Equals): + name = arg.strip() + arg = "" + while True: + t = tokens.next() + if isinstance(t, (unicode, Equals)): + arg += unicode(t) + elif isinstance(t, OpenDouble): + arg += processTemplateCall(templates, tokens, args) + elif isinstance(t, OpenTriple): + arg += processSub(templates, tokens, args) + elif isinstance(t, Delimiter) and t.c != '|': + arg += str(t) + else: + break + arg = arg.strip() + else: + a += 1 + args[name] = arg + elif isinstance(t, CloseDouble): + break + else: + print "Unexpected:", t + raise SyntaxError + #print template, args + if template[0] == '#': + if template == "#if": + if args['1'].strip(): + return args['2'] + elif '3' in args: + return args['3'] + else: + return "" + elif template == "#ifeq": + if args['1'].strip() == args['2'].strip(): + return args['3'] + elif '4' in args: + return args['4'] + else: + return "" + elif template == "#ifexist": + return "" + elif template == "#switch": + sw = args['1'].strip() + if sw in args: + return args[sw] + else: + return "" + else: + print "Unknown ParserFunction:", template + sys.exit(1) + if template not in templates: + return "{{%s}}" % template + return process(templates, templates[template], args) + +def process(templates, s, args = {}): + s = re.compile(r"<!--.*?-->", re.DOTALL).sub("", s) + s = re.compile(r"<noinclude>.*?</noinclude>", re.DOTALL).sub("", s) + assert "<onlyinclude>" not in s + #s = re.sub(r"(.*?)<onlyinclude>(.*?)</onlyinclude>(.*)", r"\1", s) + s = re.compile(r"<includeonly>(.*?)</includeonly>", re.DOTALL).sub(r"\1", s) + r = "" + #print list(Tokenise(s)) + tokens = Tokenise(s) + try: + while True: + t = 
tokens.next() + if isinstance(t, OpenDouble): + r += processTemplateCall(templates, tokens, args) + elif isinstance(t, OpenTriple): + r += processSub(templates, tokens, args) + else: + r += unicode(t) + except StopIteration: + pass + return r + +def test(): + templates = { + 'lb': "{{", + 'name-example': "I am a template example, my first name is '''{{{firstName}}}''' and my last name is '''{{{lastName}}}'''. You can reference my page at [[{{{lastName}}}, {{{firstName}}}]].", + 't': "start-{{{1|pqr}}}-end", + 't0': "start-{{{1}}}-end", + 't1': "start{{{1}}}end<noinclude>moo</noinclude>", + 't2a1': "{{t2demo|a|{{{1}}}}}", + 't2a2': "{{t2demo|a|2={{{1}}}}}", + 't2demo': "start-{{{1}}}-middle-{{{2}}}-end", + 't5': "{{t2demo|{{{a}}}=b}}", + 't6': "t2demo|a", + } + def t(text, expected): + print "text:", text + s = process(templates, text) + if s != expected: + print "got:", s + print "expected:", expected + sys.exit(1) + t("{{Name-example}}", "I am a template example, my first name is '''{{{firstName}}}''' and my last name is '''{{{lastName}}}'''. You can reference my page at [[{{{lastName}}}, {{{firstName}}}]].") + t("{{Name-example | firstName=John | lastName=Smith }}", "I am a template example, my first name is '''John''' and my last name is '''Smith'''. 
You can reference my page at [[Smith, John]].") + t("{{t0|a}}", "start-a-end") + t("{{t0| }}", "start- -end") + t("{{t0|}}", "start--end") + t("{{t0}}", "start-{{{1}}}-end") + t("{{t0| }}", "start- -end") + t("{{t0|\n}}", "start-\n-end") + t("{{t0|1= }}", "start--end") + t("{{t0|1=\n}}", "start--end") + t("{{T}}", "start-pqr-end") + t("{{T|}}", "start--end") + t("{{T|abc}}", "start-abc-end") + t("{{T|abc|def}}", "start-abc-end") + t("{{T|1=abc|1=def}}", "start-def-end") + t("{{T|abc|1=def}}", "start-def-end") + t("{{T|1=abc|def}}", "start-def-end") + t("{{T|{{T}}}}", "start-start-pqr-end-end") + t("{{T|{{T|{{T}}}}}}", "start-start-start-pqr-end-end-end") + t("{{T|{{T|{{T|{{T}}}}}}}}", "start-start-start-start-pqr-end-end-end-end") + t("{{T|a{{t|b}}}}", "start-astart-b-end-end") + t("{{T|{{T|a=b}}}}", "start-start-pqr-end-end") + t("{{T|a=b}}", "start-pqr-end") + t("{{T|1=a=b}}", "start-a=b-end") + #t("{{t1|{{lb}}tc}}}}", "start{{tcend}}") + #t("{{t2a1|1=x=y}}", "start-a-middle-{{{2}}}-end") + #t("{{t2a2|1=x=y}}", "start-a-middle-x=y-end") + #t("{{t5|a=2=d}}", "start-{{{1}}}-middle-d=b-end") + #t("{{ {{t6}} }}", "{{ t2demo|a }}") + t("{{t|[[a|b]]}}", "start-b-end") + t("{{t|[[a|b]] }}", "start-b -end") + +Parts = { + # Standard POS headers + 'noun': "n.", + 'Noun': "n.", + 'Noun 1': "n.", + 'Noun 2': "n.", + 'Verb': "v.", + 'Adjective': "adj.", + 'Adverb': "adv.", + 'Pronoun': "pron.", + 'Conjunction': "conj.", + 'Interjection': "interj.", + 'Preposition': "prep.", + 'Proper noun': "n.p.", + 'Proper Noun': "n.p.", + 'Article': "art.", + + # Standard non-POS level 3 headers + '{{acronym}}': "acr.", + 'Acronym': "acr.", + '{{abbreviation}}': "abbr.", + '[[Abbreviation]]': "abbr.", + 'Abbreviation': "abbr.", + '[[initialism]]': "init.", + '{{initialism}}': "init.", + 'Initialism': "init.", + 'Contraction': "cont.", + 'Prefix': "prefix", + 'Suffix': "suffix", + 'Symbol': "sym.", + 'Letter': "letter", + 'Idiom': "idiom", + 'Idioms': "idiom", + 'Phrase': "phrase", + + # 
Debated POS level 3 headers + 'Number': "num.", + 'Numeral': "num.", + 'Cardinal number': "num.", + 'Ordinal number': "num.", + 'Cardinal numeral': "num.", + 'Ordinal numeral': "num.", + + # Other headers in use + 'Personal pronoun': "pers.pron.", + 'Adjective/Adverb': "adj./adv.", + 'Proper adjective': "prop.adj.", + 'Determiner': "det.", + 'Demonstrative determiner': "dem.det.", + 'Clitic': "clitic", + 'Infix': "infix", + 'Counter': "counter", + 'Kanji': None, + 'Kanji reading': None, + 'Hiragana letter': None, + 'Katakana letter': None, + 'Pinyin': None, + 'Han character': None, + 'Hanzi': None, + 'Hanja': None, + 'Proverb': "prov.", + 'Expression': None, + 'Adjectival noun': None, + 'Quasi-adjective': None, + 'Particle': "part.", + 'Infinitive particle': "part.", + 'Possessive adjective': "poss.adj.", + 'Verbal prefix': "v.p.", + 'Postposition': "post.", + 'Prepositional article': "prep.art.", + 'Phrasal verb': "phr.v.", + 'Participle': "participle", + 'Interrogative auxiliary verb': "int.aux.v.", + 'Pronominal adverb': "pron.adv.", + 'Adnominal': "adn.", + 'Abstract pronoun': "abs.pron.", + 'Conjunction particle': None, + 'Root': "root", + + # Non-standard, deprecated headers + 'Noun form': "n.", + 'Verb form': "v.", + 'Adjective form': "adj.form.", + 'Nominal phrase': "nom.phr.", + 'Noun phrase': "n. phrase", + 'Verb phrase': "v. 
phrase", + 'Transitive verb': "v.t.", + 'Intransitive verb': "v.i.", + 'Reflexive verb': "v.r.", + 'Cmavo': None, + 'Romaji': "rom.", + 'Hiragana': None, + 'Furigana': None, + 'Compounds': None, + + # Other headers seen + 'Alternative forms': None, + 'Alternative spellings': None, + 'Anagrams': None, + 'Antonym': None, + 'Antonyms': None, + 'Conjugation': None, + 'Declension': None, + 'Declension and pronunciations': None, + 'Definite Article': "def.art.", + 'Definite article': "def.art.", + 'Demonstrative pronoun': "dem.pron.", + 'Derivation': None, + 'Derived expression': None, + 'Derived expressions': None, + 'Derived forms': None, + 'Derived phrases': None, + 'Derived terms': None, + 'Derived, Related terms': None, + 'Descendants': None, + #'Etymology': None, + #'Etymology 1': None, + #'Etymology 2': None, + #'Etymology 3': None, + #'Etymology 4': None, + #'Etymology 5': None, + 'Examples': None, + 'External links': None, + '[[Gismu]]': None, + 'Gismu': None, + 'Homonyms': None, + 'Homophones': None, + 'Hyphenation': None, + 'Indefinite article': "art.", + 'Indefinite pronoun': "ind.pron.", + 'Indefinite Pronoun': "ind.pron.", + 'Indetermined pronoun': "ind.pron.", + 'Interrogative conjunction': "int.conj.", + 'Interrogative determiner': "int.det.", + 'Interrogative particle': "int.part.", + 'Interrogative pronoun': "int.pron.", + 'Legal expression': "legal", + 'Mass noun': "n.", + 'Miscellaneous': None, + 'Mutations': None, + 'Noun and verb': "n/v.", + 'Other language': None, + 'Pinyin syllable': None, + 'Possessive determiner': "poss.det.", + 'Possessive pronoun': "poss.pron.", + 'Prepositional phrase': "prep.phr.", + 'Prepositional Pronoun': "prep.pron.", + 'Pronunciation': None, + 'Pronunciation 1': None, + 'Pronunciation 2': None, + 'Quotations': None, + 'References': None, + 'Reflexive pronoun': "refl.pron.", + 'Related expressions': None, + 'Related terms': None, + 'Related words': None, + 'Relative pronoun': "rel.pron.", + 'Saying': "saying", + 'See 
also': None, + 'Shorthand': None, + '[http://en.wikipedia.org/wiki/Shorthand Shorthand]': None, + 'Sister projects': None, + 'Spelling note': None, + 'Synonyms': None, + 'Translation': None, + 'Translations': None, + 'Translations to be checked': None, + 'Transliteration': None, + 'Trivia': None, + 'Usage': None, + 'Usage in English': None, + 'Usage notes': None, + 'Verbal noun': "v.n.", +} +PartsUsed = {} +for p in Parts.keys(): + PartsUsed[p] = 0 + +def encode(s): + r = e(s) + assert r[1] == len(s) + return r[0] + +def dowikilink(m): + a = m.group(1).split("|") + if len(a) > 1: + link = a[1] + else: + link = a[0] + if ':' in link: + link = "" + return link + +seentemplates = {} +def dotemplate(m): + aa = m.group(1).split("|") + args = {} + n = 0 + for a in aa: + am = re.match(r"(.*?)(=(.*))?", a) + if am: + args[am.group(1)] = am.group(3) + else: + n += 1 + args[n] = am.group(1) + + #if aa[0] in seentemplates: + # seentemplates[aa[0]] += 1 + #else: + # seentemplates[aa[0]] = 1 + # print len(seentemplates), aa[0] + #print aa[0] + + #if aa[0] not in Templates: + # return "(unknown template %s)" % aa[0] + #body = Templates[aa[0]] + #body = re.sub(r"<noinclude>.*?</noinclude>", "", body) + #assert "<onlyinclude>" not in body + ##body = re.sub(r"(.*?)<onlyinclude>(.*?)</onlyinclude>(.*)", r"\1", body) + #body = re.sub(r"<includeonly>(.*?)</includeonly>", r"\1", body) + #def dotemplatearg(m): + # ta = m.group(1).split("|") + # if ta[0] in args: + # return args[ta[0]] + # elif len(ta) > 1: + # return ta[1] + # else: + # return "{{{%s}}}" % ta[0] + #body = re.sub(r"{{{(.*?)}}}", dotemplatearg, body) + #return dewiki(body) + +def doparserfunction(m): + a = m.group(2).split("|") + if m.group(1) == "ifeq": + if a[0] == a[1]: + return a[2] + elif len(a) >= 4: + return a[3] + return "" + +def dewiki(body, indent = 0): + # process in this order: + # {{{ }}} + # <> <> + # [[ ]] + # {{ }} + # ''' ''' + # '' '' + #body = wikimediatemplate.process(Templates, body) + body = 
re.sub(r"\[\[(.*?)\]\]", dowikilink, body) + #body = re.sub(r"{{(.*?)}}", dotemplate, body) + #body = re.sub(r"{{#(.*?):(.*?)}}", doparserfunction, body) + body = re.sub(r"'''(.*?)'''", r"\1", body) + body = re.sub(r"''(.*?)''", r"\1", body) + lines = body.split("\n") + n = 0 + i = 0 + while i < len(lines): + if len(lines[i]) > 0 and lines[i][0] == "#": + if len(lines[i]) > 1 and lines[i][1] == '*': + wlines = textwrap.wrap(lines[i][2:].strip(), + initial_indent = " * ", + subsequent_indent = " ") + elif len(lines[i]) > 1 and lines[i][1] == ':': + wlines = textwrap.wrap(lines[i][2:].strip(), + initial_indent = " ", + subsequent_indent = " ") + else: + n += 1 + wlines = textwrap.wrap(str(n) + ". " + lines[i][1:].strip(), + subsequent_indent = " ") + elif len(lines[i]) > 0 and lines[i][0] == "*": + n = 0 + wlines = textwrap.wrap(lines[i][1:].strip(), + initial_indent = "* ", + subsequent_indent = " ") + else: + n = 0 + wlines = textwrap.wrap(lines[i].strip()) + if len(wlines) == 0: + wlines = [''] + lines[i:i+1] = wlines + i += len(wlines) + return ''.join(" "*(indent-1)+x+"\n" for x in lines) + +class WikiSection: + def __init__(self, heading, body): + self.heading = heading + self.body = body + #self.lines = re.split("\n+", body.strip()) + #if len(self.lines) == 1 and len(self.lines[0]) == 0: + # self.lines = [] + self.children = [] + def __str__(self): + return "<%s:%i:%s>" % (self.heading, len(self.body or ""), ','.join([str(x) for x in self.children])) + def add(self, section): + self.children.append(section) + +def parse(word, text): + headings = list(re.finditer("^(=+)\s*(.*?)\s*=+\n", text, re.MULTILINE)) + #print [x.group(1) for x in headings] + doc = WikiSection(word, "") + stack = [doc] + for i, m in enumerate(headings): + depth = len(m.group(1)) + if depth < len(stack): + stack = stack[:depth] + else: + while depth > len(stack): + s = WikiSection(None, "") + stack[-1].add(s) + stack.append(s) + if i+1 < len(headings): + s = WikiSection(m.group(2), 
text[m.end(0):headings[i+1].start(0)].strip()) + else: + s = WikiSection(m.group(2), text[m.end(0):].strip()) + assert len(stack) == depth + stack[-1].add(s) + stack.append(s) + #while doc.heading is None and len(doc.lines) == 0 and len(doc.children) == 1: + # doc = doc.children[0] + return doc + +def formatFull(word, doc): + def f(depth, section): + if section.heading: + r = " "*(depth-1) + section.heading + "\n\n" + else: + r = "" + if section.body: + r += dewiki(section.body, depth+1)+"\n" + #r += "".join(" "*depth + x + "\n" for x in dewiki(section.body)) + #if len(section.lines) > 0: + # r += "\n" + for c in section.children: + r += f(depth+1, c) + return r + s = f(0, doc) + s += "Ref: http://en.wiktionary.org/wiki/%s\n" % word + return s + +def formatNormal(word, doc): + def f(depth, posdepth, section): + r = "" + if depth == posdepth: + if not section.heading or section.heading.startswith("Etymology"): + posdepth += 1 + elif section.heading in Parts: + #p = Parts[section.heading] + #if p: + # r += " "*(depth-1) + word + " (" + p + ")\n\n" + r += " "*(depth-1) + section.heading + "\n\n" + else: + print >>errors, "Unknown part: (%s) %s" % (word, section.heading) + return "" + elif depth > posdepth: + return "" + elif section.heading: + r += " "*(depth-1) + section.heading + "\n\n" + if section.body: + r += dewiki(section.body, depth+1)+"\n" + #r += "".join(" "*depth + x + "\n" for x in dewiki(section.lines)) + #if len(section.lines) > 0: + # r += "\n" + for c in section.children: + r += f(depth+1, posdepth, c) + return r + s = f(0, 3, doc) + s += "Ref: http://en.wiktionary.org/wiki/%s\n" % word + return s + +def formatBrief(word, doc): + def f(depth, posdepth, section): + if depth == posdepth: + h = section.heading + if not section.heading or section.heading.startswith("Etymology"): + posdepth += 1 + elif section.heading in Parts: + #h = Parts[section.heading] + #if h: + # h = "%s (%s)" % (word, h) + pass + stack.append([h, False]) + elif depth > 0: + 
stack.append([section.heading, False]) + else: + stack.append(["%h " + section.heading, False]) + r = "" + #if section.heading: + # r += " "*(depth-1) + section.heading + "\n" + body = ''.join(x+"\n" for x in section.body.split("\n") if len(x) > 0 and x[0] == '#') + if len(body) > 0: + for i in range(len(stack)): + if not stack[i][1]: + if stack[i][0]: + r += " "*(i-1) + stack[i][0] + "\n" + stack[i][1] = True + r += dewiki(body, depth+1) + for c in section.children: + r += f(depth+1, posdepth, c) + stack.pop() + return r + stack = [] + s = f(0, 3, doc) + s += "Ref: http://en.wiktionary.org/wiki/%s\n" % word + return s + +class WikiHandler(xml.sax.ContentHandler): + def __init__(self): + self.element = None + self.page = None + self.text = "" + self.long = {} + def startElement(self, name, attrs): + #print "start", name, attrs + self.element = name + def endElement(self, name): + #print "end", name + if self.element == "text": + if self.page: + if self.page in self.long: + print self.page, len(self.text) + print + self.doPage(self.page, self.text) + self.page = None + self.text = "" + self.element = None + def characters(self, content): + #print "characters", content + if self.element == "title": + if self.checkPage(content): + self.page = content + elif self.element == "text": + if self.page: + self.text += content + if len(self.text) > 100000 and self.page not in self.long: + self.long[self.page] = 1 + def checkPage(self, page): + return False + def doPage(self, page, text): + pass + +class TemplateHandler(WikiHandler): + def checkPage(self, page): + return page.startswith("Template:") + def doPage(self, page, text): + Templates[page[page.find(':')+1:].lower()] = text + +class WordHandler(WikiHandler): + def checkPage(self, page): + return ':' not in page + def doPage(self, page, text): + m = re.match(r"#redirect\s*\[\[(.*?)\]\]", text, re.IGNORECASE) + if m: + out.write(" See <%s>" % page) + return + doc = parse(page, text) + out.write(formatBrief(page, doc)) + 
#print formatBrief(page, doc) + +fn = sys.argv[1] +info = """ This file was converted from the original database on: + %s + + The original data is available from: + http://en.wiktionary.org + The version from which this file was generated was: + %s + + Wiktionary is available under the GNU Free Documentation License. +""" % (time.ctime(), os.path.basename(fn)) + +errors = codecs.open("mkdict.err", "w", "utf_8") +e = codecs.getencoder("utf_8") + +Templates = {} +f = os.popen("bunzip2 -c %s" % fn, "r") +xml.sax.parse(f, TemplateHandler()) +f.close() + +f = os.popen("bunzip2 -c %s" % fn, "r") +out = codecs.getwriter("utf_8")( + os.popen("dictfmt -p wiktionary-en --locale en_US.UTF-8 --columns 0 -u http://en.wiktionary.org", "w")) + +out.write(("%%h English Wiktionary\n%s" % info).encode('utf-8')) +xml.sax.parse(f, WordHandler()) +f.close() +out.close() diff --git a/nixpkgs/pkgs/servers/dict/wordnet_structures.py b/nixpkgs/pkgs/servers/dict/wordnet_structures.py new file mode 100644 index 000000000000..6a88427c5da1 --- /dev/null +++ b/nixpkgs/pkgs/servers/dict/wordnet_structures.py @@ -0,0 +1,319 @@ +#!/usr/bin/env python +#Copyright 2007 Sebastian Hagen +# This file is part of wordnet_tools. + +# wordnet_tools is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 +# as published by the Free Software Foundation + +# wordnet_tools is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with wordnet_tools; if not, write to the Free Software +# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +# This program requires python >= 2.4. 

# This program converts wordnet index/data file pairs into dict index/data
# files usable by dictd.
# This is basically a reimplementation of the wnfilter program by Rik Faith,
# which unfortunately doesn't work correctly for wordnet files in the newer
# formats. This version of wordnet_structures whould parse wordnet 2.1 files
# correctly, and create output very similar to what wnfilter would have
# written.

import datetime
from textwrap import TextWrapper

# WordNet syntactic-category codes used throughout this module.
CAT_ADJECTIVE = 0
CAT_ADVERB = 1
CAT_NOUN = 2
CAT_VERB = 3

# Maps the one-letter pos/ss_type field of the wordnet files to a category;
# 's' (adjective satellite) is folded into plain adjectives.
category_map = {
    'n': CAT_NOUN,
    'v': CAT_VERB,
    'a': CAT_ADJECTIVE,
    's': CAT_ADJECTIVE,
    'r': CAT_ADVERB
}


class WordIndex:
    """One entry of a wordnet index.<pos> file: a lemma plus its synsets."""
    def __init__(self, lemma, category, ptrs, synsets, tagsense_count):
        self.lemma = lemma
        self.category = category
        self.ptrs = ptrs
        self.synsets = synsets
        self.tagsense_count = tagsense_count

    @classmethod
    def build_from_line(cls, line_data, synset_map):
        """Parse one index line; synset offsets are resolved via synset_map.

        Index line layout (wndb(5)): lemma pos synset_cnt p_cnt
        [ptr_symbol...] sense_cnt tagsense_cnt synset_offset...
        """
        line_split = line_data.split()
        lemma = line_split[0]
        category = category_map[line_split[1]]
        synset_count = int(line_split[2],10)
        ptr_count = int(line_split[3],10)
        # NOTE(review): range(3, 3+ptr_count) looks off by one against the
        # documented layout -- it includes the p_cnt field and drops the last
        # ptr_symbol; range(4, 4+ptr_count) would match.  ptrs is never used
        # when generating dict output, so this is harmless here.  TODO confirm.
        ptrs = [line_split[i] for i in range(3, 3+ptr_count)]
        tagsense_count = int(line_split[5 + ptr_count],10)
        synsets = [synset_map[int(line_split[i],10)] for i in range(6 + ptr_count, 6 + ptr_count + synset_count)]
        return cls(lemma, category, ptrs, synsets, tagsense_count)

    @classmethod
    def build_from_file(cls, f, synset_map, rv_base=None):
        """Parse a whole index file into {lowercased lemma: [WordIndex, ...]}.

        Pass rv_base to accumulate several files into one mapping.
        """
        if (rv_base is None):
            rv = {}
        else:
            rv = rv_base

        for line in f:
            # Lines starting with spaces are the license header, not data.
            if (line.startswith(' ')):
                continue
            wi = cls.build_from_line(line, synset_map)
            word = wi.lemma.lower()
            if not (word in rv):
                rv[word] = []
            rv[word].append(wi)
        return rv

    def __repr__(self):
        return '%s%s' % (self.__class__.__name__, (self.lemma, self.category, self.ptrs, self.synsets, self.tagsense_count))


class WordIndexDictFormatter(WordIndex):
    """WordIndex subclass that renders its synsets as a dictd entry body."""
    # Reverse of category_map, to human-readable part-of-speech tags.
    category_map_rev = {
        CAT_NOUN: 'n',
        CAT_VERB: 'v',
        CAT_ADJECTIVE:
            'adj',
        CAT_ADVERB: 'adv'
    }
    linesep = '\n'
    # Maximum width of wrapped output lines.
    LINE_WIDTH_MAX = 68
    # NOTE(review): the exact run lengths of spaces in the four prefix strings
    # were lost in the paste; widths below are reconstructed to align with the
    # formatted prefixes ('%5s 1: ' is 9 chars, '%5d: ' is 7) -- verify.
    prefix_fmtf_line_first = '%5s 1: '
    prefix_fmtn_line_first = '         '
    prefix_fmtf_line_nonfirst = '%5d: '
    prefix_fmtn_line_nonfirst = '       '

    def dict_str(self):
        """Format this word's synsets as a numbered, line-wrapped entry body."""
        tw = TextWrapper(width=self.LINE_WIDTH_MAX,
            initial_indent=(self.prefix_fmtf_line_first % self.category_map_rev[self.category]),
            subsequent_indent=self.prefix_fmtn_line_first)

        lines = (tw.wrap(self.synsets[0].dict_str()))
        i = 2
        # Remaining senses get a plain running number as prefix.
        for synset in self.synsets[1:]:
            tw = TextWrapper(width=self.LINE_WIDTH_MAX,
                initial_indent=(self.prefix_fmtf_line_nonfirst % i),
                subsequent_indent=self.prefix_fmtn_line_nonfirst)
            lines.extend(tw.wrap(synset.dict_str()))
            i += 1
        return self.linesep.join(lines)


class Synset:
    """One entry of a wordnet data.<pos> file: words, pointers, gloss, frames."""
    def __init__(self, offset, ss_type, words, ptrs, gloss, frames=()):
        self.offset = offset
        self.type = ss_type
        self.words = words
        self.ptrs = ptrs
        self.gloss = gloss
        self.frames = frames
        self.comments = []

    @classmethod
    def build_from_line(cls, line_data):
        """Parse one data-file line (wndb(5) layout: w_cnt is hex, counts decimal)."""
        line_split = line_data.split()
        synset_offset = int(line_split[0],10)
        ss_type = category_map[line_split[2]]
        word_count = int(line_split[3],16)
        # Words alternate with their lex_id fields, hence the stride of 2.
        words = [line_split[i] for i in range(4, 4 + word_count*2,2)]
        ptr_count = int(line_split[4 + word_count*2],10)
        # Each pointer is a 4-tuple: symbol, offset, pos, source/target.
        ptrs = [(line_split[i], line_split[i+1], line_split[i+2], line_split[i+3]) for i in range(5 + word_count*2,4 + word_count*2 + ptr_count*4,4)]

        tok = line_split[5 + word_count*2 + ptr_count*4]
        base = 6 + word_count*2 + ptr_count*4
        if (tok != '|'):
            # Verb frames precede the gloss: f_cnt, then (+ f_num w_num) triples.
            frame_count = int(tok, 10)
            frames = [(int(line_split[i+1],10), int(line_split[i+2],16)) for i in range(base, base + frame_count*3, 3)]
            base += frame_count*3 + 1
        else:
            frames = []

        # Re-split with a field limit so the gloss (after '|') stays one piece.
        line_split2 = line_data.split(None, base)
        if (len(line_split2) < base):
            gloss = None
        else:
            gloss = line_split2[-1]

        return cls(synset_offset, ss_type, words, ptrs, gloss, frames)

    @classmethod
    def build_from_file(cls, f):
        # Returns ({offset: Synset}, [license comment lines]).
        rv \
            = {}
        comments = []

        for line in f:
            if (line.startswith(' ')):
                # Header region: collect numbered license lines as comments.
                line_s = line.lstrip().rstrip('\n')
                line_elements = line_s.split(None,1)
                try:
                    int(line_elements[0])
                except ValueError:
                    continue
                if (len(line_elements) == 1):
                    line_elements.append('')
                comments.append(line_elements[1])
                continue
            synset = cls.build_from_line(line.rstrip())
            rv[synset.offset] = synset

        return (rv, comments)

    def dict_str(self):
        """Return the gloss, plus a '[syn: {...}]' list when >1 word in synset."""
        rv = self.gloss
        if (len(self.words) > 1):
            rv += ' [syn: %s]' % (', '.join([('{%s}' % word) for word in self.words]))
        return rv

    def __repr__(self):
        return '%s%s' % (self.__class__.__name__, (self.offset, self.type, self.words, self.ptrs, self.gloss, self.frames))


class WordnetDict:
    """Accumulates parsed wordnet data and writes a dictd index/data pair."""
    # NOTE(review): interior whitespace of this template was lost in the
    # paste; spacing is reconstructed -- verify against upstream.
    db_info_fmt = '''This file was converted from the original database on:
          %(conversion_datetime)s

The original data is available from:
          %(wn_url)s

The original data was distributed with the notice shown below. No
additional restrictions are claimed. Please redistribute this changed
version under the same conditions and restriction that apply to the
original version.\n\n
%(wn_license)s'''

    datetime_fmt = '%Y-%m-%dT%H:%M:%S'
    # dictd's base64 digit alphabet for index offsets/lengths.
    base64_map = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/'

    def __init__(self, wn_url, desc_short, desc_long):
        self.word_data = {}      # {lowercased lemma: [WordIndexDictFormatter]}
        self.wn_url = wn_url
        self.desc_short = desc_short
        self.desc_long = desc_long
        self.wn_license = None   # set from data-file header on first parse

    def wn_dict_add(self, file_index, file_data):
        """Parse one wordnet index/data file pair into self.word_data."""
        file_data.seek(0)
        file_index.seek(0)
        (synsets, license_lines) = Synset.build_from_file(file_data)
        WordIndexDictFormatter.build_from_file(file_index, synsets, self.word_data)
        if (license_lines):
            self.wn_license = '\n'.join(license_lines) + '\n'

    @classmethod
    def base64_encode(cls, i):
        """Encode a non-negative integer into a dictd compatible base64 string"""
        # NOTE(review): Python 2 only -- the '/' below must be floor division.
        if (i < 0):
            raise ValueError('Value %r for i is negative' % (i,))
        # Find the smallest digit count e such that 64**e - 1 >= i.
        r = 63
        e = 1
        while (r < i):
            e += 1
            r = \
                64**e - 1

        # Emit digits from the most significant downwards.
        rv = ''
        while (e > 0):
            e -= 1
            d = (i / 64**e)
            rv += cls.base64_map[d]
            i = i % (64**e)
        return rv

    @classmethod
    def dict_entry_write(cls, file_index, file_data, key, entry, linesep='\n'):
        """Write a single dict entry for <key> to index and data files"""
        # Index line: headword TAB base64(offset) TAB base64(length).
        entry_start = file_data.tell()
        file_data.write(entry)
        entry_len = len(entry)
        file_index.write('%s\t%s\t%s%s' % (key, cls.base64_encode(entry_start),
            cls.base64_encode(entry_len), linesep))

    def dict_generate(self, file_index, file_data):
        """Write the complete dict database from the accumulated word_data."""
        file_index.seek(0)
        file_data.seek(0)
        # The dictd file format is fairly iffy on the subject of special
        # headwords: either dictd is buggy, or the manpage doesn't tell the whole
        # story about the format.
        # The upshot is that order of these entries in the index *matters*.
        # Putting them at the beginning and in alphabetic order is afaict ok.
        # Some other orders completely and quietly break the ability to look
        # those headwords up.
        # -- problem encountered with 1.10.2, at 2007-08-05.
        file_data.write('\n')
        # These locals feed db_info_fmt via vars() below.
        wn_url = self.wn_url
        conversion_datetime = datetime.datetime.now().strftime(self.datetime_fmt)
        wn_license = self.wn_license
        self.dict_entry_write(file_index, file_data, '00-database-info', '00-database-info\n%s\n' % (self.db_info_fmt % vars()))
        self.dict_entry_write(file_index, file_data, '00-database-long', '00-database-long\n%s\n' % self.desc_long)
        self.dict_entry_write(file_index, file_data, '00-database-short', '00-database-short\n%s\n' % self.desc_short)
        self.dict_entry_write(file_index, file_data, '00-database-url', '00-database-url\n%s\n' % self.wn_url)


        # NOTE(review): Python 2 only -- keys() must return a sortable list.
        words = self.word_data.keys()
        words.sort()
        for word in words:
            for wi in self.word_data[word]:
                word_cs = word
                # Use case-sensitivity information of first entry of first synset that
                # matches this word case-insensitively
                # (for/else + break pattern: stop at the first match anywhere).
                for synset in wi.synsets:
                    for ss_word in synset.words:
                        if (ss_word.lower() == word_cs.lower()):
                            word_cs = ss_word
                            break
                    else:
                        continue
                    break
                else:
                    continue
                break

            outstr = ''
            for wi in self.word_data[word]:
                outstr += wi.dict_str() + '\n'

            # Entry body: case-corrected headword, then all formatted senses.
            outstr = '%s%s%s' % (word_cs, wi.linesep, outstr)
            self.dict_entry_write(file_index, file_data, word_cs, outstr, wi.linesep)

        file_index.truncate()
        file_data.truncate()


if (__name__ == '__main__'):
    import optparse
    op = optparse.OptionParser(usage='usage: %prog [options] (<wn_index_file> <wn_data_file>)+')
    op.add_option('-i', '--outindex', dest='oi', default='wn.index', help='filename of index file to write to')
    op.add_option('-d', '--outdata', dest='od', default='wn.dict', help='filename of data file to write to')
    op.add_option('--wn_url', dest='wn_url', default='ftp://ftp.cogsci.princeton.edu/pub/wordnet/2.0', help='URL for wordnet sources')
    # NOTE(review): leading-space runs in the two description defaults were
    # lost in the paste; reconstructed -- verify against upstream.
    op.add_option('--db_desc_short', dest='desc_short', default='     WordNet (r) 2.1 (2005)', help='short dict DB description')
    op.add_option('--db_desc_long', dest='desc_long', default='    WordNet (r): A Lexical'
        ' Database for English from the\n     Cognitive Science Laboratory at Princeton University', help='long dict DB description')

    (options, args) = op.parse_args()

    wnd = WordnetDict(wn_url=options.wn_url, desc_short=options.desc_short, desc_long=options.desc_long)

    # Positional arguments come in (index_file, data_file) pairs.
    for i in range(0,len(args),2):
        print 'Opening index file %r...' % args[i]
        file_index = file(args[i])
        print 'Opening data file %r...' % args[i+1]
        file_data = file(args[i+1])
        print 'Parsing index file and data file...'
        wnd.wn_dict_add(file_index, file_data)

    print 'All input files parsed. Writing output to index file %r and data file %r.' % (options.oi, options.od)

    wnd.dict_generate(file(options.oi, 'w'),file(options.od, 'w'))
    print 'All done.'