我想用下面的代码训练新的NER实体:
def train_spacy_model(data, model='en_core_web_trf', n_iter=50):
if model is not None:
nlp = spacy.load(model) # load existing spaCy model
print("Loaded model '%s'" % model)
TRAIN_DATA = data
ner = nlp.get_pipe("ner")
examples = []
for text, annotations in TRAIN_DATA:
examples.append(Example.from_dict(nlp.make_doc(text), annotations))
nlp.initialize(lambda: examples)
pipe_exceptions = ["ner"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
with nlp.disable_pipes(*other_pipes): # only train NER
for itn in range(n_iter):
random.shuffle(examples)
losses = {}
batches = minibatch(examples, size=compounding(4.0, 64.0, 1.2))
for batch in batches:
texts, annotations = zip(*batch)
nlp.update(
batch,
drop=0.20,
losses=losses
)
print("Losses", losses)
return nlp
nlp = train_spacy_model(data=dataset, n_iter=30)
我一直收到这个错误:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[296], line 40
36 print("Losses", losses)
38 return nlp
---> 40 nlp = train_spacy_model(data=no_verlaps_dataset, n_iter=30)
42 # save model to output directory
43 output_dir = '_data/models/actor_ner'
Cell In[296], line 16, in train_spacy_model(data, model, n_iter)
14 for text, annotations in TRAIN_DATA:
15 examples.append(Example.from_dict(nlp.make_doc(text), annotations))
---> 16 nlp.initialize(lambda: examples)
17 # for ent in annotations.get('entities'):
18 # ner.add_label(ent[2])
20 pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
File ~/miniconda3/envs/tvman_ENV/lib/python3.9/site-packages/spacy/language.py:1290, in Language.initialize(self, get_examples, sgd)
1288 config = self.config.interpolate()
1289 # These are the settings provided in the [initialize] block in the config
-> 1290 I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
1291 before_init = I["before_init"]
1292 if before_init is not None:
File ~/miniconda3/envs/tvman_ENV/lib/python3.9/site-packages/thinc/config.py:746, in registry.resolve(cls, config, schema, overrides, validate)
737 @classmethod
738 def resolve(
739 cls,
(...)
744 validate: bool = True,
745 ) -> Dict[str, Any]:
--> 746 resolved, _ = cls._make(
747 config, schema=schema, overrides=overrides, validate=validate, resolve=True
748 )
749 return resolved
File ~/miniconda3/envs/tvman_ENV/lib/python3.9/site-packages/thinc/config.py:795, in registry._make(cls, config, schema, overrides, resolve, validate)
793 if not is_interpolated:
794 config = Config(orig_config).interpolate()
--> 795 filled, _, resolved = cls._fill(
796 config, schema, validate=validate, overrides=overrides, resolve=resolve
797 )
798 filled = Config(filled, section_order=section_order)
799 # Check that overrides didn't include invalid properties not in config
File ~/miniconda3/envs/tvman_ENV/lib/python3.9/site-packages/thinc/config.py:867, in registry._fill(cls, config, schema, validate, resolve, parent, overrides)
864 getter = cls.get(reg_name, func_name)
865 # We don't want to try/except this and raise our own error
866 # here, because we want the traceback if the function fails.
--> 867 getter_result = getter(*args, **kwargs)
868 else:
869 # We're not resolving and calling the function, so replace
870 # the getter_result with a Promise class
871 getter_result = Promise(
872 registry=reg_name, name=func_name, args=args, kwargs=kwargs
873 )
File ~/miniconda3/envs/tvman_ENV/lib/python3.9/site-packages/spacy/language.py:108, in load_lookups_data(lang, tables)
105 @registry.misc("spacy.LookupsDataLoader.v1")
106 def load_lookups_data(lang, tables):
107 util.logger.debug(f"Loading lookups from spacy-lookups-data: {tables}")
--> 108 lookups = load_lookups(lang=lang, tables=tables)
109 return lookups
File ~/miniconda3/envs/tvman_ENV/lib/python3.9/site-packages/spacy/lookups.py:30, in load_lookups(lang, tables, strict)
28 if lang not in registry.lookups:
29 if strict and len(tables) > 0:
---> 30 raise ValueError(Errors.E955.format(table=", ".join(tables), lang=lang))
31 return lookups
32 data = registry.lookups.get(lang)
ValueError: [E955] Can't find table(s) lexeme_norm for language 'en' in spacy-lookups-data. Make sure you have the package installed or provide your own lookup tables if no default lookups are available for your language.
我已经安装了软件包:
pip install spacy-lookups-data
Collecting spacy-lookups-data
Downloading spacy_lookups_data-1.0.3-py2.py3-none-any.whl (98.5 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 98.5/98.5 MB 25.9 MB/s eta 0:00:00
但它仍然存在。
我怎样才能修复这个错误,开始更新模型来检测新任务的新实体?
当我在运行这段代码的jupyter notbook中重新启动内核时,这个问题得到了修复。
2条答案
按热度按时间xfb7svmp1#
为了回答这个狭义的问题:您可能需要重新启动运行库,以便注册
spacy-lookups-data
中的表。为了回答你没有问的问题:引用的脚本看起来只是从v2更新了一部分,我不推荐使用它,尤其是不推荐用于
en_core_web_trf
。在spacy v3 pipelines中更新ner
组件的一种推荐方法显示在此演示项目中:https://github.com/explosion/projects/tree/v3/pipelines/ner_demo_update
它为您处理了大量的pipeline/config/training细节,以便在不影响pipeline中其他组件性能的情况下更新
ner
。qlfbtfca2#
我一直在Jupyter Notebook中运行这段代码,直到我重新启动内核,错误仍然存在,所以答案是重新启动笔记本内核。