python-3.x Can't find table lexeme_norm for language 'en' in spacy-lookups-data

aurhwmvo posted on 2022-12-24 in Python

I want to train new NER entities with the following code:

import random

import spacy
from spacy.training import Example
from spacy.util import minibatch, compounding


def train_spacy_model(data, model='en_core_web_trf', n_iter=50):
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)

    TRAIN_DATA = data
    ner = nlp.get_pipe("ner")

    # Convert the (text, annotations) pairs into Example objects
    examples = []
    for text, annotations in TRAIN_DATA:
        examples.append(Example.from_dict(nlp.make_doc(text), annotations))
    nlp.initialize(lambda: examples)

    pipe_exceptions = ["ner"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(examples)
            losses = {}
            batches = minibatch(examples, size=compounding(4.0, 64.0, 1.2))
            for batch in batches:
                # In spaCy v3, nlp.update takes a batch of Example objects directly
                nlp.update(
                    batch,
                    drop=0.20,
                    losses=losses
                )
            print("Losses", losses)

    return nlp

nlp = train_spacy_model(data=dataset, n_iter=30)

I keep getting this error:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[296], line 40
     36             print("Losses", losses)
     38     return nlp
---> 40 nlp = train_spacy_model(data=no_verlaps_dataset, n_iter=30)
     42 # save model to output directory
     43 output_dir = '_data/models/actor_ner'

Cell In[296], line 16, in train_spacy_model(data, model, n_iter)
     14 for text, annotations in TRAIN_DATA:
     15     examples.append(Example.from_dict(nlp.make_doc(text), annotations))
---> 16 nlp.initialize(lambda: examples)
     17     # for ent in annotations.get('entities'):
     18     #     ner.add_label(ent[2])
     20 pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]

File ~/miniconda3/envs/tvman_ENV/lib/python3.9/site-packages/spacy/language.py:1290, in Language.initialize(self, get_examples, sgd)
   1288 config = self.config.interpolate()
   1289 # These are the settings provided in the [initialize] block in the config
-> 1290 I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
   1291 before_init = I["before_init"]
   1292 if before_init is not None:

File ~/miniconda3/envs/tvman_ENV/lib/python3.9/site-packages/thinc/config.py:746, in registry.resolve(cls, config, schema, overrides, validate)
    737 @classmethod
    738 def resolve(
    739     cls,
   (...)
    744     validate: bool = True,
    745 ) -> Dict[str, Any]:
--> 746     resolved, _ = cls._make(
    747         config, schema=schema, overrides=overrides, validate=validate, resolve=True
    748     )
    749     return resolved

File ~/miniconda3/envs/tvman_ENV/lib/python3.9/site-packages/thinc/config.py:795, in registry._make(cls, config, schema, overrides, resolve, validate)
    793 if not is_interpolated:
    794     config = Config(orig_config).interpolate()
--> 795 filled, _, resolved = cls._fill(
    796     config, schema, validate=validate, overrides=overrides, resolve=resolve
    797 )
    798 filled = Config(filled, section_order=section_order)
    799 # Check that overrides didn't include invalid properties not in config

File ~/miniconda3/envs/tvman_ENV/lib/python3.9/site-packages/thinc/config.py:867, in registry._fill(cls, config, schema, validate, resolve, parent, overrides)
    864     getter = cls.get(reg_name, func_name)
    865     # We don't want to try/except this and raise our own error
    866     # here, because we want the traceback if the function fails.
--> 867     getter_result = getter(*args, **kwargs)
    868 else:
    869     # We're not resolving and calling the function, so replace
    870     # the getter_result with a Promise class
    871     getter_result = Promise(
    872         registry=reg_name, name=func_name, args=args, kwargs=kwargs
    873     )

File ~/miniconda3/envs/tvman_ENV/lib/python3.9/site-packages/spacy/language.py:108, in load_lookups_data(lang, tables)
    105 @registry.misc("spacy.LookupsDataLoader.v1")
    106 def load_lookups_data(lang, tables):
    107     util.logger.debug(f"Loading lookups from spacy-lookups-data: {tables}")
--> 108     lookups = load_lookups(lang=lang, tables=tables)
    109     return lookups

File ~/miniconda3/envs/tvman_ENV/lib/python3.9/site-packages/spacy/lookups.py:30, in load_lookups(lang, tables, strict)
     28 if lang not in registry.lookups:
     29     if strict and len(tables) > 0:
---> 30         raise ValueError(Errors.E955.format(table=", ".join(tables), lang=lang))
     31     return lookups
     32 data = registry.lookups.get(lang)

ValueError: [E955] Can't find table(s) lexeme_norm for language 'en' in spacy-lookups-data. Make sure you have the package installed or provide your own lookup tables if no default lookups are available for your language.

I have already installed the package:

pip install spacy-lookups-data
Collecting spacy-lookups-data
  Downloading spacy_lookups_data-1.0.3-py2.py3-none-any.whl (98.5 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 98.5/98.5 MB 25.9 MB/s eta 0:00:00

But the error is still there.
How can I fix this error so that I can start updating the model to detect the new entities for the new task?
When I restarted the kernel of the Jupyter notebook running this code, the problem was fixed.

xfb7svmp1#

To answer the narrow question: you probably need to restart the runtime so that the tables from spacy-lookups-data get registered.
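As a quick sanity check after the restart (just a minimal sketch, reusing the load_lookups helper that already appears in the traceback), you can try loading the table directly:

from spacy.lookups import load_lookups

# Should return the lookups instead of raising E955 once
# spacy-lookups-data is installed in the running environment.
lookups = load_lookups(lang="en", tables=["lexeme_norm"])
print(lookups.tables)  # expected to list 'lexeme_norm'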
To answer the question you didn't ask: the script above looks as though it has only been partially updated from v2, and I wouldn't recommend using it, especially not with en_core_web_trf. A recommended way to update the ner component in a spaCy v3 pipeline is shown in this demo project:
https://github.com/explosion/projects/tree/v3/pipelines/ner_demo_update
It takes care of a lot of the pipeline/config/training details for you, so that ner can be updated without hurting the performance of the other components in the pipeline.
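For a rough idea of the data-preparation step that workflow relies on, here is a sketch (not taken from the project itself) that assumes the training data is a list of (text, {"entities": [(start, end, label), ...]}) pairs like the dataset in the question, and uses an arbitrary file name train.spacy. It converts the pairs into the binary .spacy format that spacy train reads:

# Sketch: convert (text, annotations) pairs into a .spacy file for `spacy train`.
# Assumes `dataset` has the same shape as TRAIN_DATA in the question.
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
doc_bin = DocBin()
for text, annotations in dataset:
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annotations["entities"]:
        span = doc.char_span(start, end, label=label)
        if span is not None:  # skip spans that don't align to token boundaries
            ents.append(span)
    doc.ents = ents
    doc_bin.add(doc)
doc_bin.to_disk("train.spacy")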

qlfbtfca2#

I had been running this code in a Jupyter Notebook, and the error persisted until I restarted the kernel, so the fix is to restart the notebook kernel.
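In practice this also means the package has to be installed into the same environment the kernel is running in. One way to make sure of that (just an example, using IPython's %pip magic) is to install from inside the notebook and then restart:

# Run in a notebook cell, then restart the kernel so the
# newly installed lookup tables get registered.
%pip install spacy-lookups-data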
