IBMDataScience/DSx-Desktop

1.1.7: Can't load local file into Python notebook; Py4J references the wrong Java library


Running the following code in a Jupyter notebook in DSx Desktop v1.1.7 on macOS Sierra:

from pyspark.sql import SQLContext
# Add asset from file system
df_data_1 = SQLContext(sc).read.csv('../datasets/propertytax.csv', header='true')
df_data_1.show(5)

results in the following error:

Py4JJavaError Traceback (most recent call last)
<ipython-input-2-bb03e7679975> in <module>()
      3 from pyspark.sql import SQLContext
      4 # Add asset from file system
----> 5 df_data_1 = SQLContext(sc).read.csv('../datasets/propertytax.csv', header='true')
      6 df_data_1.show(5)
      7 

/usr/local/spark/python/pyspark/sql/readwriter.py in csv(self, path, schema, sep, encoding, quote, escape, comment, header, inferSchema, ignoreLeadingWhiteSpace, ignoreTrailingWhiteSpace, nullValue, nanValue, positiveInf, negativeInf, dateFormat, timestampFormat, maxColumns, maxCharsPerColumn, maxMalformedLogPerPartition, mode)
    375         if isinstance(path, basestring):
    376             path = [path]
--> 377         return self._df(self._jreader.csv(self._spark._sc._jvm.PythonUtils.toSeq(path)))
    378 
    379     @since(1.5)

/usr/local/spark/python/lib/py4j-0.10.3-src.zip/py4j/java_gateway.py in __call__(self, *args)
   1131         answer = self.gateway_client.send_command(command)
   1132         return_value = get_return_value(
-> 1133             answer, self.gateway_client, self.target_id, self.name)
   1134 
   1135         for temp_arg in temp_args:

/usr/local/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
     61     def deco(*a, **kw):
     62         try:
---> 63             return f(*a, **kw)
     64         except py4j.protocol.Py4JJavaError as e:
     65             s = e.java_exception.toString()

/usr/local/spark/python/lib/py4j-0.10.3-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
    317                 raise Py4JJavaError(
    318                     "An error occurred while calling {0}{1}{2}.\n".
--> 319                     format(target_id, ".", name), value)
    320             else:
    321                 raise Py4JError(

Py4JJavaError: An error occurred while calling o70.csv.
: java.lang.NoSuchMethodError: com.fasterxml.jackson.databind.JavaType.isReferenceType()Z
	at com.fasterxml.jackson.databind.ser.BasicSerializerFactory.findSerializerByLookup(BasicSerializerFactory.java:302)
	at com.fasterxml.jackson.databind.ser.BeanSerializerFactory._createSerializer2(BeanSerializerFactory.java:218)
	at com.fasterxml.jackson.databind.ser.BeanSerializerFactory.createSerializer(BeanSerializerFactory.java:153)
	at com.fasterxml.jackson.databind.SerializerProvider._createUntypedSerializer(SerializerProvider.java:1203)
	at com.fasterxml.jackson.databind.SerializerProvider._createAndCacheUntypedSerializer(SerializerProvider.java:1157)
	at com.fasterxml.jackson.databind.SerializerProvider.findValueSerializer(SerializerProvider.java:481)
	at com.fasterxml.jackson.databind.SerializerProvider.findTypedValueSerializer(SerializerProvider.java:679)
	at com.fasterxml.jackson.databind.ser.DefaultSerializerProvider.serializeValue(DefaultSerializerProvider.java:107)
	at com.fasterxml.jackson.databind.ObjectMapper._configAndWriteValue(ObjectMapper.java:3559)
	at com.fasterxml.jackson.databind.ObjectMapper.writeValueAsString(ObjectMapper.java:2927)
	at org.apache.spark.rdd.RDDOperationScope.toJson(RDDOperationScope.scala:52)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:145)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.SparkContext.withScope(SparkContext.scala:679)
	at org.apache.spark.SparkContext.textFile(SparkContext.scala:797)
	at org.apache.spark.sql.execution.datasources.csv.CSVFileFormat.readText(CSVFileFormat.scala:176)
	at org.apache.spark.sql.execution.datasources.csv.CSVFileFormat.baseRdd(CSVFileFormat.scala:141)
	at org.apache.spark.sql.execution.datasources.csv.CSVFileFormat.inferSchema(CSVFileFormat.scala:58)
	at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$15.apply(DataSource.scala:421)
	at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$15.apply(DataSource.scala:421)
	at scala.Option.orElse(Option.scala:289)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:420)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:149)
	at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:413)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:237)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)
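
The NoSuchMethodError suggests the driver JVM resolved a jackson-databind jar older than 2.6, since JavaType.isReferenceType() first appeared in Jackson 2.6; a stray Jackson jar from another library on the Spark classpath would shadow the one Spark ships with. A minimal sketch to confirm which version the JVM actually loaded, assuming the same live sc handle from the failing notebook (the lookup goes through Py4J's access to the static PackageVersion.VERSION field):

# Sketch: ask the driver JVM which jackson-databind version it actually loaded.
# Assumes `sc` is the live SparkContext in the failing notebook.
jvm = sc._jvm
jackson_version = jvm.com.fasterxml.jackson.databind.cfg.PackageVersion.VERSION
print(jackson_version.toString())  # anything below 2.6 lacks isReferenceType()

If this prints a pre-2.6 version, the fix is to find and remove (or relocate) the conflicting jar; /usr/local/spark/jars is the likely place to look given the paths in the traceback, though that location is an assumption.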

@djccarew In the latest version of DSx Desktop, the following piece of code should load the CSV file into a DataFrame:

from pyspark import SparkContext
from pyspark.sql import SQLContext
sc = SparkContext.getOrCreate()
# Add asset from file system
df_data_1 = SQLContext(sc).read.csv('../datasets/propertytax.csv', header='true')
df_data_1.show(5)
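
Equivalently, on Spark 2.x the SparkSession entry point can stand in for the SQLContext construction; a minimal sketch under the same assumptions (same relative path to the CSV):

from pyspark.sql import SparkSession

# Sketch: Spark 2.x entry point; getOrCreate() reuses a running session if present.
spark = SparkSession.builder.getOrCreate()
df_data_1 = spark.read.csv('../datasets/propertytax.csv', header=True)
df_data_1.show(5)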

Can you check and let us know whether the problem persists?

Closing due to lack of activity. Reopen if needed.