JohnSnowLabs/spark-nlp-workshop

de-identifier NER model in offline mode: Got Py4JJavaError: An error occurred while calling None.com.johnsnowlabs.nlp.annotators.ner.MedicalNerModel.

egenc opened this issue · 1 comment

egenc commented

Hi, I am trying to use the de-identifier NER model in offline mode: https://nlp.johnsnowlabs.com/2021/03/31/ner_deid_enriched_en.html
Unlike the embedding models I have used, I cannot load this one in offline mode.

import os
import json

# Read the license key from a local JSON file and expose it to the process
# through the SPARK_NLP_LICENSE environment variable.
with open('spark-nlp-jsl-keys.json') as f:
    license_keys = json.load(f)
    
os.environ['SPARK_NLP_LICENSE'] = license_keys['SPARK_NLP_LICENSE']

# Local paths to the two jars, joined into one comma-separated string.
# NOTE(review): the combined string is not used anywhere in the visible
# snippet -- presumably it is handed to the SparkSession (e.g. as the jar
# list) in code not shown here; confirm.
spark_nlp_jar_path = "/mnt/hcscdasap01/eds-share/spark_nlp_models/spark-nlp-assembly-3.0.3.jar"
spark_nlp_internal = "/mnt/hcscdasap01/eds-share/spark_nlp_models/spark-nlp-jsl-3.0.3.jar"
spark_nlp_jar_path = spark_nlp_jar_path+","+spark_nlp_internal

import sparknlp
import sparknlp_jsl
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession

# Annotator that transforms a text column from dataframe into an Annotation ready for NLP

documentAssembler = DocumentAssembler()\
  .setInputCol("extracted_text")\
  .setOutputCol("document")

# Sentence Detector annotator, processes various sentences per line
sentenceDetector = SentenceDetector()\
  .setInputCols(["document"])\
  .setOutputCol("sentence")

# Tokenizer splits words in a relevant format for NLP
tokenizer = Tokenizer()\
  .setInputCols(["sentence"])\
  .setOutputCol("token")

# Word embeddings loaded from a local copy of the GloVe 100d model (load()
# from a path rather than pretrained()), producing the "embeddings" column.
# NOTE(review): the follow-up comment in this thread says the NER model
# expects `embeddings_clinical` instead of GloVe -- confirm against the
# model page.
word_embeddings = WordEmbeddingsModel.load("<path>/cache_pretrained/glove_100d_en_2.4.0_2.4_1579690104032")\
  .setInputCols(["sentence", "token"])\
  .setOutputCol("embeddings")


# De-identification NER model (MedicalNerModel comes from sparknlp_jsl),
# also loaded from a local path. This is the statement that raises the
# Py4JJavaError shown in the traceback below.
deid_ner = MedicalNerModel.load("<path>/cache_pretrained/ner_deid_enriched_en_3.0.0_3.0_1617208426129")\
    .setInputCols(["sentence", "token", "embeddings"])\
    .setOutputCol("ner")


# Converts the "ner" annotations (with sentence and token context) into the
# "ner_chunk" output column.
ner_converter = NerConverter()\
      .setInputCols(["sentence", "token", "ner"])\
      .setOutputCol("ner_chunk")

---------------------------------------------------------------------------
Py4JJavaError                             Traceback (most recent call last)
<ipython-input-14-cb53967261e5> in <module>
----> 1 deid_ner = MedicalNerModel.load("<path>/cache_pretrained/ner_deid_enriched_en_3.0.0_3.0_1617208426129")\
      2     .setInputCols(["sentence", "token", "embeddings"])\
      3     .setOutputCol("ner")
      4 
      5 # deid_ner = MedicalNerModel.pretrained("ner_deid_generic_glove", "en", "clinical/models") \

~/.conda/envs/sparknlp3/lib/python3.7/site-packages/pyspark/ml/util.py in load(cls, path)
    330     def load(cls, path):
    331         """Reads an ML instance from the input path, a shortcut of `read().load(path)`."""
--> 332         return cls.read().load(path)
    333 
    334 

~/.conda/envs/sparknlp3/lib/python3.7/site-packages/sparknlp/internal.py in read(cls)
     64     def read(cls):
     65         """Returns an MLReader instance for this class."""
---> 66         return AnnotatorJavaMLReader(cls())
     67 
     68 

~/.conda/envs/sparknlp3/lib/python3.7/site-packages/sparknlp_jsl/annotator.py in __init__(self, classname, java_model)
   1822         super(MedicalNerModel, self).__init__(
   1823             classname=classname,
-> 1824             java_model=java_model
   1825         )
   1826         self._setDefault(

~/.conda/envs/sparknlp3/lib/python3.7/site-packages/pyspark/__init__.py in wrapper(self, *args, **kwargs)
    112             raise TypeError("Method %s forces keyword arguments." % func.__name__)
    113         self._input_kwargs = kwargs
--> 114         return func(self, **kwargs)
    115     return wrapper
    116 

~/.conda/envs/sparknlp3/lib/python3.7/site-packages/sparknlp/common.py in __init__(self, classname, java_model)
     72         if classname and not java_model:
     73             self.__class__._java_class_name = classname
---> 74             self._java_obj = self._new_java_obj(classname, self.uid)
     75         if java_model is not None:
     76             self._transfer_params_from_java()

~/.conda/envs/sparknlp3/lib/python3.7/site-packages/pyspark/ml/wrapper.py in _new_java_obj(java_class, *args)
     64             java_obj = getattr(java_obj, name)
     65         java_args = [_py2java(sc, arg) for arg in args]
---> 66         return java_obj(*java_args)
     67 
     68     @staticmethod

~/.conda/envs/sparknlp3/lib/python3.7/site-packages/py4j/java_gateway.py in __call__(self, *args)
   1567         answer = self._gateway_client.send_command(command)
   1568         return_value = get_return_value(
-> 1569             answer, self._gateway_client, None, self._fqn)
   1570 
   1571         for temp_arg in temp_args:

~/.conda/envs/sparknlp3/lib/python3.7/site-packages/pyspark/sql/utils.py in deco(*a, **kw)
    109     def deco(*a, **kw):
    110         try:
--> 111             return f(*a, **kw)
    112         except py4j.protocol.Py4JJavaError as e:
    113             converted = convert_exception(e.java_exception)

~/.conda/envs/sparknlp3/lib/python3.7/site-packages/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
    326                 raise Py4JJavaError(
    327                     "An error occurred while calling {0}{1}{2}.\n".
--> 328                     format(target_id, ".", name), value)
    329             else:
    330                 raise Py4JError(

Py4JJavaError: An error occurred while calling None.com.johnsnowlabs.nlp.annotators.ner.MedicalNerModel.
: java.lang.NoClassDefFoundError: Could not initialize class com.johnsnowlabs.license.LicenseValidator$
	at com.johnsnowlabs.license.Licensed.$init$(Licensed.scala:5)
	at com.johnsnowlabs.nlp.annotators.ner.MedicalNerModel.<init>(MedicalNerModel.scala:18)
	at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
	at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
	at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
	at java.lang.reflect.Constructor.newInstance(Constructor.java:422)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:238)
	at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
	at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:745)
egenc commented

The fix is to use the correct embeddings (`embeddings_clinical`), as stated on the model page linked above:

# Replacement for the GloVe `word_embeddings` stage in the original report:
# same input/output columns, but using the `embeddings_clinical` model.
word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\
   .setInputCols(["sentence", "token"])\
   .setOutputCol("embeddings")