mrchristine/db-migration

Tables are blank/empty after migration

Closed this issue · 5 comments

When migrating tables from one workspace to another, roughly 10% of the tables are missed entirely, and 85-90% of the remainder come through blank. Some show record-count mismatches; the rest all have zero row counts.
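(A per-table row-count listing, produced in both the source and target workspaces and diffed, is one way to quantify this. The sketch below is illustrative only; the `default` database name is just an example and this is not part of db-migration.)

```python
# Illustrative sketch: dump per-table row counts for one database so the
# same listing can be produced in the source and target workspaces and
# compared offline. Not part of db-migration; "default" is an example.
counts = {}
for t in spark.catalog.listTables("default"):
    full_name = f"default.{t.name}"
    try:
        counts[full_name] = spark.table(full_name).count()
    except Exception as e:  # tables that cannot be read at all
        counts[full_name] = f"ERROR: {str(e).splitlines()[0]}"

for name, c in sorted(counts.items()):
    print(name, c)
```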

Can you provide more logs from the console?
There's an error log file under logs/failed_metastore.log. If you provide some of those errors, I can help debug further.
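For example, the per-table failures can be summarised with something like this (a minimal sketch, assuming the log is newline-delimited JSON with `table` and `summary` fields, as in the excerpts further down):

```python
import json

# Sketch: print the table name and one-line error summary for each
# failed entry in logs/failed_metastore.log (one JSON object per line).
with open("logs/failed_metastore.log") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        entry = json.loads(line)
        print(entry.get("table"), "->", entry.get("summary"))
```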

The Databricks Labs project is the officially supported package. GitHub does not support attachments, so could you share the console output failures, or a few lines of the failed log, here for me to look at?
Feel free to open an issue on the labs migrate project to debug further.

Here is the failed_metastore log:

{"resultType": "error", "summary": "java.lang.UnsupportedOperationException: Parquet does not support decimal. See HIVE-6384", "cause": "---------------------------------------------------------------------------\nPy4JJavaError Traceback (most recent call last)\n in \n----> 1 ddl_str = spark.sql("show create table default.campaign_master_table").collect()[0][0]\n\n/databricks/spark/python/pyspark/sql/session.py in sql(self, sqlQuery)\n 707 [Row(f1=1, f2=u'row1'), Row(f1=2, f2=u'row2'), Row(f1=3, f2=u'row3')]\n 708 """\n--> 709 return DataFrame(self._jsparkSession.sql(sqlQuery), self._wrapped)\n 710 \n 711 @SInCE(2.0)\n\n/databricks/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py in call(self, *args)\n 1303 answer = self.gateway_client.send_command(command)\n 1304 return_value = get_return_value(\n-> 1305 answer, self.gateway_client, self.target_id, self.name)\n 1306 \n 1307 for temp_arg in temp_args:\n\n/databricks/spark/python/pyspark/sql/utils.py in deco(*a, **kw)\n 125 def deco(*a, **kw):\n 126 try:\n--> 127 return f(*a, **kw)\n 128 except py4j.protocol.Py4JJavaError as e:\n 129 converted = convert_exception(e.java_exception)\n\n/databricks/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)\n 326 raise Py4JJavaError(\n 327 "An error occurred while calling {0}{1}{2}.\n".\n--> 328 format(target_id, ".", name), value)\n 329 else:\n 330 raise Py4JError(\n\nPy4JJavaError: An error occurred while calling o226.sql.\n: java.lang.UnsupportedOperationException: Parquet does not support decimal. See HIVE-6384\n\tat org.apache.hadoop.hive.ql.io.parquet.serde.ArrayWritableObjectInspector.getObjectInspector(ArrayWritableObjectInspector.java:102)\n\tat org.apache.hadoop.hive.ql.io.parquet.serde.ArrayWritableObjectInspector.(ArrayWritableObjectInspector.java:60)\n\tat org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe.initialize(ParquetHiveSerDe.java:113)\n\tat org.apache.hadoop.hive.metastore.MetaStoreUtils.getDeserializer(MetaStoreUtils.java:339)\n\tat org.apache.hadoop.hive.ql.metadata.Table.getDeserializerFromMetaStore(Table.java:288)\n\tat org.apache.hadoop.hive.ql.metadata.Table.checkValidity(Table.java:194)\n\tat org.apache.hadoop.hive.ql.metadata.Hive.getTable(Hive.java:1017)\n\tat org.apache.spark.sql.hive.client.HiveClientImpl.getRawTableOption(HiveClientImpl.scala:454)\n\tat org.apache.spark.sql.hive.client.HiveClientImpl.$anonfun$tableExists$1(HiveClientImpl.scala:468)\n\tat scala.runtime.java8.JFunction0$mcZ$sp.apply(JFunction0$mcZ$sp.java:23)\n\tat org.apache.spark.sql.hive.client.HiveClientImpl.$anonfun$withHiveState$1(HiveClientImpl.scala:351)\n\tat org.apache.spark.sql.hive.client.HiveClientImpl.$anonfun$retryLocked$1(HiveClientImpl.scala:254)\n\tat org.apache.spark.sql.hive.client.HiveClientImpl.synchronizeOnObject(HiveClientImpl.scala:290)\n\tat org.apache.spark.sql.hive.client.HiveClientImpl.retryLocked(HiveClientImpl.scala:246)\n\tat org.apache.spark.sql.hive.client.HiveClientImpl.withHiveState(HiveClientImpl.scala:333)\n\tat org.apache.spark.sql.hive.client.HiveClientImpl.tableExists(HiveClientImpl.scala:468)\n\tat org.apache.spark.sql.hive.client.PoolingHiveClient.$anonfun$tableExists$1(PoolingHiveClient.scala:282)\n\tat org.apache.spark.sql.hive.client.PoolingHiveClient.$anonfun$tableExists$1$adapted(PoolingHiveClient.scala:281)\n\tat org.apache.spark.sql.hive.client.PoolingHiveClient.withHiveClient(PoolingHiveClient.scala:112)\n\tat 
org.apache.spark.sql.hive.client.PoolingHiveClient.tableExists(PoolingHiveClient.scala:281)\n\tat org.apache.spark.sql.hive.HiveExternalCatalog.$anonfun$tableExists$1(HiveExternalCatalog.scala:943)\n\tat scala.runtime.java8.JFunction0$mcZ$sp.apply(JFunction0$mcZ$sp.java:23)\n\tat org.apache.spark.sql.hive.HiveExternalCatalog.$anonfun$withClient$2(HiveExternalCatalog.scala:151)\n\tat org.apache.spark.sql.hive.HiveExternalCatalog.maybeSynchronized(HiveExternalCatalog.scala:112)\n\tat org.apache.spark.sql.hive.HiveExternalCatalog.$anonfun$withClient$1(HiveExternalCatalog.scala:150)\n\tat com.databricks.backend.daemon.driver.ProgressReporter$.withStatusCode(ProgressReporter.scala:377)\n\tat com.databricks.backend.daemon.driver.ProgressReporter$.withStatusCode(ProgressReporter.scala:363)\n\tat com.databricks.spark.util.SparkDatabricksProgressReporter$.withStatusCode(ProgressReporter.scala:34)\n\tat org.apache.spark.sql.hive.HiveExternalCatalog.withClient(HiveExternalCatalog.scala:149)\n\tat org.apache.spark.sql.hive.HiveExternalCatalog.tableExists(HiveExternalCatalog.scala:943)\n\tat org.apache.spark.sql.catalyst.catalog.ExternalCatalogWithListener.tableExists(ExternalCatalogWithListener.scala:155)\n\tat org.apache.spark.sql.catalyst.catalog.SessionCatalog.tableExists(SessionCatalog.scala:486)\n\tat org.apache.spark.sql.catalyst.catalog.SessionCatalog.requireTableExists(SessionCatalog.scala:218)\n\tat org.apache.spark.sql.catalyst.catalog.SessionCatalog.getTableMetadata(SessionCatalog.scala:499)\n\tat com.databricks.sql.DatabricksSessionCatalog.super$getTableMetadata(DatabricksSessionCatalog.scala:90)\n\tat com.databricks.sql.DatabricksSessionCatalog.$anonfun$getTableMetadata$1(DatabricksSessionCatalog.scala:90)\n\tat com.databricks.sql.DatabricksSessionCatalog.deltaTableMeta(DatabricksSessionCatalog.scala:109)\n\tat com.databricks.sql.DatabricksSessionCatalog.getTableMetadata(DatabricksSessionCatalog.scala:90)\n\tat org.apache.spark.sql.execution.command.ShowCreateTableCommand.run(tables.scala:1175)\n\tat org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70)\n\tat org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:68)\n\tat org.apache.spark.sql.execution.command.ExecutedCommandExec.executeCollect(commands.scala:79)\n\tat org.apache.spark.sql.Dataset.$anonfun$logicalPlan$1(Dataset.scala:234)\n\tat org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3728)\n\tat org.apache.spark.sql.execution.SQLExecution$.$anonfun$withCustomExecutionEnv$5(SQLExecution.scala:116)\n\tat org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:248)\n\tat org.apache.spark.sql.execution.SQLExecution$.$anonfun$withCustomExecutionEnv$1(SQLExecution.scala:101)\n\tat org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:841)\n\tat org.apache.spark.sql.execution.SQLExecution$.withCustomExecutionEnv(SQLExecution.scala:77)\n\tat org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:198)\n\tat org.apache.spark.sql.Dataset.withAction(Dataset.scala:3726)\n\tat org.apache.spark.sql.Dataset.(Dataset.scala:234)\n\tat org.apache.spark.sql.Dataset$.$anonfun$ofRows$2(Dataset.scala:104)\n\tat org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:841)\n\tat org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:101)\n\tat org.apache.spark.sql.SparkSession.$anonfun$sql$1(SparkSession.scala:676)\n\tat 
org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:841)\n\tat org.apache.spark.sql.SparkSession.sql(SparkSession.scala:671)\n\tat sun.reflect.GeneratedMethodAccessor364.invoke(Unknown Source)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:498)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:380)\n\tat py4j.Gateway.invoke(Gateway.java:295)\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat py4j.GatewayConnection.run(GatewayConnection.java:251)\n\tat java.lang.Thread.run(Thread.java:748)", "table": "default.campaign_master_table"}
{"resultType": "error", "summary": "<span class="ansi-red-fg">AnalysisException: Table or view 'clover_pd_end' not found in database 'default';", "cause": "---------------------------------------------------------------------------\nAnalysisException Traceback (most recent call last)\n in \n----> 1 ddl_str = spark.sql("show create table default.clover_pd_end").collect()[0][0]\n\n/databricks/spark/python/pyspark/sql/session.py in sql(self, sqlQuery)\n 707 [Row(f1=1, f2=u'row1'), Row(f1=2, f2=u'row2'), Row(f1=3, f2=u'row3')]\n 708 """\n--> 709 return DataFrame(self._jsparkSession.sql(sqlQuery), self._wrapped)\n 710 \n 711 @SInCE(2.0)\n\n/databricks/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py in call(self, *args)\n 1303 answer = self.gateway_client.send_command(command)\n 1304 return_value = get_return_value(\n-> 1305 answer, self.gateway_client, self.target_id, self.name)\n 1306 \n 1307 for temp_arg in temp_args:\n\n/databricks/spark/python/pyspark/sql/utils.py in deco(*a, **kw)\n 131 # Hide where the exception came from that shows a non-Pythonic\n 132 # JVM exception message.\n--> 133 raise_from(converted)\n 134 else:\n 135 raise\n\n/databricks/spark/python/pyspark/sql/utils.py in raise_from(e)\n\nAnalysisException: Table or view 'clover_pd_end' not found in database 'default';", "table": "default.clover_pd_end"}
{"resultType": "error", "summary": "<span class="ansi-red-fg">AnalysisException: Table or view 'mp_tmp_basket_class' not found in database 'default';", "cause": "---------------------------------------------------------------------------\nAnalysisException Traceback (most recent call last)\n in \n----> 1 ddl_str = spark.sql("show create table default.mp_tmp_basket_class").collect()[0][0]\n\n/databricks/spark/python/pyspark/sql/session.py in sql(self, sqlQuery)\n 707 [Row(f1=1, f2=u'row1'), Row(f1=2, f2=u'row2'), Row(f1=3, f2=u'row3')]\n 708 """\n--> 709 return DataFrame(self._jsparkSession.sql(sqlQuery), self._wrapped)\n 710 \n 711 @SInCE(2.0)\n\n/databricks/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py in call(self, *args)\n 1303 answer = self.gateway_client.send_command(command)\n 1304 return_value = get_return_value(\n-> 1305 answer, self.gateway_client, self.target_id, self.name)\n 1306 \n 1307 for temp_arg in temp_args:\n\n/databricks/spark/python/pyspark/sql/utils.py in deco(*a, **kw)\n 131 # Hide where the exception came from that shows a non-Pythonic\n 132 # JVM exception message.\n--> 133 raise_from(converted)\n 134 else:\n 135 raise\n\n/databricks/spark/python/pyspark/sql/utils.py in raise_from(e)\n\nAnalysisException: Table or view 'mp_tmp_basket_class' not found in database 'default';", "table": "default.mp_tmp_basket_class"}

Closing this as a duplicate of the issue on the labs migrate repo.