databricks/spark-corenlp

Is it possible to specify the "language" model?

Opened this issue · 1 comment

Hi all,

first of all, thank you for having made this wrapper available. Really useful.

Could you let me know whether it is possible to specify the underlying CoreNLP language model (English, French, ...)?

As far as I understand from your code, this won't be easy because you use the Simple CoreNLP API, but it should be possible. Do you have any ideas or plans to extend your code to support this?

Regards,

Grégory

Hi, in my case I created a new function, called for example "ner2", that uses a pipeline configured with the Spanish models :)

/**
 * UDF that annotates the input text with the pipeline returned by
 * `getOrCreateSentimentPipeline()` and yields the named-entity tag of every token.
 *
 * Tokens of all sentences are flattened into one list in document order, so sentence
 * boundaries are not preserved in the result.
 */
def ner2 = udf { text: String =>
  val annotated = getOrCreateSentimentPipeline().process(text)

  // Walk sentences, then the tokens of each sentence, keeping only the NER tag.
  for {
    sent <- annotated.get(classOf[SentencesAnnotation]).asScala.toList
    tok <- sent.get(classOf[TokensAnnotation]).asScala.toList
  } yield tok.get(classOf[NamedEntityTagAnnotation])
}

/**
 * Returns the pipeline held in `sentimentPipeline`, building and storing it on the first call.
 *
 * Despite the name, the pipeline is configured for Spanish tokenization, POS tagging,
 * lemmatization and NER rather than sentiment analysis.
 *
 * NOTE(review): the null check is not synchronized in this method; concurrent first calls
 * could each build a pipeline — confirm whether that matters for the callers.
 */
private def getOrCreateSentimentPipeline(): StanfordCoreNLP = {
  if (sentimentPipeline == null) {
    // Settings are applied in the listed order. The parse.*, depparse.* and regexner.*
    // entries refer to annotators that are not in the "annotators" list below.
    val settings = Seq(
      "annotators" -> "tokenize, ssplit, pos, lemma, ner",
      "tokenize.language" -> "es",
      "tokenize.verbose" -> "true",
      "pos.model" -> "edu/stanford/nlp/models/pos-tagger/spanish/spanish-distsim.tagger",
      "ner.model" -> "edu/stanford/nlp/models/ner/spanish.ancora.distsim.s512.crf.ser.gz",
      "ner.applyNumericClassifiers" -> "false",
      "ner.useSUTime" -> "false",
      "ner.language" -> "spanish",
      "parse.model" -> "edu/stanford/nlp/models/lexparser/spanishPCFG.ser.gz",
      "depparse.model" -> "edu/stanford/nlp/models/parser/nndep/UD_Spanish.gz",
      "depparse.language" -> "spanish",
      "regexner.ignoreCase" -> "true",
      "regexner.verbose" -> "true"
    )

    val config = new Properties()
    settings.foreach { case (key, value) => config.setProperty(key, value) }
    sentimentPipeline = new StanfordCoreNLP(config)
  }
  sentimentPipeline
}