diff --git a/pkgs/tools/text/nltk-data/default.nix b/pkgs/tools/text/nltk-data/default.nix
index 1ee377e4757b..e4aa7f06c355 100644
--- a/pkgs/tools/text/nltk-data/default.nix
+++ b/pkgs/tools/text/nltk-data/default.nix
@@ -54,41 +54,156 @@ let
         '';
       }
     );
+
+  # Curried constructor shared by every category: fixing `location` and `hash`
+  # yields a function that only needs the `pname` of the data package.
+  makeCategory =
+    location: hash: pname:
+    makeNltkDataPackage { inherit pname location hash; };
+
+  makeChunker = makeCategory "chunkers" "sha256-kemjqaCM9hlKAdMw8oVJnp62EAC9rMQ50dKg7wlAwEc=";
+  makeCorpus = makeCategory "corpora" "sha256-8lMjW5YI8h6dHJ/83HVY2OYGDyKPpgkUAKPISiAKqqk=";
+  makeGrammar = makeCategory "grammars" "sha256-pyLEcX3Azv8j1kCGvVYonuiNgVJxtWt7veU0S/yNbIM=";
+  makeHelp = makeCategory "help" "sha256-97mYLNES5WujLF5gD8Ul4cJ6LqSzz+jDzclUsdBeHNE=";
+  makeMisc = makeCategory "misc" "sha256-XtizfEsc8TYWqvvC/eSFdha2ClC5/ZiJM8nue0vXLb4=";
+  makeModel = makeCategory "models" "sha256-iq3weEgCci6rgLW2j28F2eRLprJtInGXKe/awJPSVG4=";
+  makeStemmer = makeCategory "stemmers" "sha256-mNefwOPVJGz9kXV3LV4DuV7FJpNir/Nwg4ujd0CogEk=";
+  makeTagger = makeCategory "taggers" "sha256-tl3Cn2okhBkUtTXvAmFRx72Brez6iTGRdmFTwFmpk3M=";
+  makeTokenizer = makeCategory "tokenizers" "sha256-OzMkruoYbFKqzuimOXIpE5lhHz8tmSqOFoLT+fjdTVg=";
 in
 lib.makeScope newScope (self: {
-  punkt = makeNltkDataPackage {
-    pname = "punkt";
-    location = "tokenizers";
-    hash = "sha256-OzMkruoYbFKqzuimOXIpE5lhHz8tmSqOFoLT+fjdTVg=";
-  };
-  punkt-tab = makeNltkDataPackage {
-    pname = "punkt_tab";
-    location = "tokenizers";
-    hash = "sha256-OzMkruoYbFKqzuimOXIpE5lhHz8tmSqOFoLT+fjdTVg=";
-  };
-  averaged-perceptron-tagger = makeNltkDataPackage {
-    pname = "averaged_perceptron_tagger";
-    location = "taggers";
-    hash = "sha256-tl3Cn2okhBkUtTXvAmFRx72Brez6iTGRdmFTwFmpk3M=";
-  };
-  averaged-perceptron-tagger-eng = makeNltkDataPackage {
-    pname = "averaged_perceptron_tagger_eng";
-    location = "taggers";
-    hash = "sha256-tl3Cn2okhBkUtTXvAmFRx72Brez6iTGRdmFTwFmpk3M=";
-  };
-  snowball-data = makeNltkDataPackage {
-    pname = "snowball_data";
-    location = "stemmers";
-    hash = "sha256-mNefwOPVJGz9kXV3LV4DuV7FJpNir/Nwg4ujd0CogEk=";
-  };
-  stopwords = makeNltkDataPackage {
-    pname = "stopwords";
-    location = "corpora";
-    hash = "sha256-8lMjW5YI8h6dHJ/83HVY2OYGDyKPpgkUAKPISiAKqqk=";
-  };
-  wordnet = makeNltkDataPackage {
-    pname = "wordnet";
-    location = "corpora";
-    hash = "sha256-8lMjW5YI8h6dHJ/83HVY2OYGDyKPpgkUAKPISiAKqqk=";
-  };
+  ## Chunkers (location: "chunkers")
+  maxent-ne-chunker = makeChunker "maxent_ne_chunker";
+  maxent-ne-chunker-tab = makeChunker "maxent_ne_chunker_tab";
+
+  ## Corpora (location: "corpora")
+  abc = makeCorpus "abc";
+  alpino = makeCorpus "alpino";
+  bcp47 = makeCorpus "bcp47";
+  biocreative-ppi = makeCorpus "biocreative_ppi";
+  brown = makeCorpus "brown";
+  brown-tei = makeCorpus "brown_tei";
+  cess-cat = makeCorpus "cess_cat";
+  cess-esp = makeCorpus "cess_esp";
+  chat80 = makeCorpus "chat80";
+  city-database = makeCorpus "city_database";
+  cmudict = makeCorpus "cmudict";
+  comparative-sentences = makeCorpus "comparative_sentences";
+  comtrans = makeCorpus "comtrans";
+  conll2000 = makeCorpus "conll2000";
+  conll2002 = makeCorpus "conll2002";
+  conll2007 = makeCorpus "conll2007";
+  crubadan = makeCorpus "crubadan";
+  dependency-treebank = makeCorpus "dependency_treebank";
+  dolch = makeCorpus "dolch";
+  europarl-raw = makeCorpus "europarl_raw";
+  extended-omw = makeCorpus "extended_omw";
+  floresta = makeCorpus "floresta";
+  framenet-v15 = makeCorpus "framenet_v15";
+  framenet-v17 = makeCorpus "framenet_v17";
+  gazetteers = makeCorpus "gazetteers";
+  genesis = makeCorpus "genesis";
+  gutenberg = makeCorpus "gutenberg";
+  ieer = makeCorpus "ieer";
+  inaugural = makeCorpus "inaugural";
+  indian = makeCorpus "indian";
+  jeita = makeCorpus "jeita";
+  kimmo = makeCorpus "kimmo";
+  knbc = makeCorpus "knbc";
+  lin-thesaurus = makeCorpus "lin_thesaurus";
+  mac-morpho = makeCorpus "mac_morpho";
+  machado = makeCorpus "machado";
+  masc-tagged = makeCorpus "masc_tagged";
+  movie-reviews = makeCorpus "movie_reviews";
+  mte-teip5 = makeCorpus "mte_teip5";
+  names = makeCorpus "names";
+  nombank-1-0 = makeCorpus "nombank.1.0";
+  nonbreaking-prefixes = makeCorpus "nonbreaking_prefixes";
+  nps-chat = makeCorpus "nps_chat";
+  omw = makeCorpus "omw";
+  omw-1-4 = makeCorpus "omw-1.4";
+  opinion-lexicon = makeCorpus "opinion_lexicon";
+  panlex-swadesh = makeCorpus "panlex_swadesh";
+  paradigms = makeCorpus "paradigms";
+  pe08 = makeCorpus "pe08";
+  pil = makeCorpus "pil";
+  pl196x = makeCorpus "pl196x";
+  ppattach = makeCorpus "ppattach";
+  problem-reports = makeCorpus "problem_reports";
+  product-reviews-1 = makeCorpus "product_reviews_1";
+  product-reviews-2 = makeCorpus "product_reviews_2";
+  propbank = makeCorpus "propbank";
+  pros-cons = makeCorpus "pros_cons";
+  ptb = makeCorpus "ptb";
+  qc = makeCorpus "qc";
+  reuters = makeCorpus "reuters";
+  rte = makeCorpus "rte";
+  semcor = makeCorpus "semcor";
+  senseval = makeCorpus "senseval";
+  sentence-polarity = makeCorpus "sentence_polarity";
+  sentiwordnet = makeCorpus "sentiwordnet";
+  shakespeare = makeCorpus "shakespeare";
+  sinica-treebank = makeCorpus "sinica_treebank";
+  smultron = makeCorpus "smultron";
+  state-union = makeCorpus "state_union";
+  stopwords = makeCorpus "stopwords";
+  subjectivity = makeCorpus "subjectivity";
+  swadesh = makeCorpus "swadesh";
+  switchboard = makeCorpus "switchboard";
+  timit = makeCorpus "timit";
+  toolbox = makeCorpus "toolbox";
+  treebank = makeCorpus "treebank";
+  twitter-samples = makeCorpus "twitter_samples";
+  udhr = makeCorpus "udhr";
+  udhr2 = makeCorpus "udhr2";
+  unicode-samples = makeCorpus "unicode_samples";
+  universal-treebanks-v20 = makeCorpus "universal_treebanks_v20";
+  verbnet = makeCorpus "verbnet";
+  verbnet3 = makeCorpus "verbnet3";
+  webtext = makeCorpus "webtext";
+  wordnet = makeCorpus "wordnet";
+  wordnet-ic = makeCorpus "wordnet_ic";
+  wordnet2021 = makeCorpus "wordnet2021";
+  wordnet2022 = makeCorpus "wordnet2022";
+  wordnet31 = makeCorpus "wordnet31";
+  words = makeCorpus "words";
+  ycoe = makeCorpus "ycoe";
+
+  ## Grammars (location: "grammars")
+  basque-grammars = makeGrammar "basque_grammars";
+  book-grammars = makeGrammar "book_grammars";
+  large-grammars = makeGrammar "large_grammars";
+  sample-grammars = makeGrammar "sample_grammars";
+  spanish-grammars = makeGrammar "spanish_grammars";
+
+  ## Help (location: "help")
+  tagsets-json = makeHelp "tagsets_json";
+
+  ## Misc (location: "misc")
+  mwa-ppdb = makeMisc "mwa_ppdb";
+  perluniprops = makeMisc "perluniprops";
+
+  ## Models (location: "models")
+  bllip-wsj-no-aux = makeModel "bllip_wsj_no_aux";
+  moses-sample = makeModel "moses_sample";
+  wmt15-eval = makeModel "wmt15_eval";
+  word2vec-sample = makeModel "word2vec_sample";
+
+  ## Stemmers (location: "stemmers")
+  porter-test = makeStemmer "porter_test";
+  rslp = makeStemmer "rslp";
+  snowball-data = makeStemmer "snowball_data";
+
+  ## Taggers (location: "taggers")
+  averaged-perceptron-tagger = makeTagger "averaged_perceptron_tagger";
+  averaged-perceptron-tagger-eng = makeTagger "averaged_perceptron_tagger_eng";
+  averaged-perceptron-tagger-ru = makeTagger "averaged_perceptron_tagger_ru";
+  averaged-perceptron-tagger-rus = makeTagger "averaged_perceptron_tagger_rus";
+  maxent-treebank-pos-tagger = makeTagger "maxent_treebank_pos_tagger";
+  maxent-treebank-pos-tagger-tab = makeTagger "maxent_treebank_pos_tagger_tab";
+  universal-tagset = makeTagger "universal_tagset";
+
+  ## Tokenizers (location: "tokenizers")
+  punkt = makeTokenizer "punkt";
+  punkt-tab = makeTokenizer "punkt_tab";
 })