"Found duplicate column(s)" error, but there are no duplicates within the individual files
hipp0gryph opened this issue · 2 comments
hipp0gryph commented
Hello! I read the files like this:
main_tag="ЭДПФР"
charset="utf-8"
df = spark.read.format("xml") \
.option("rowTag", main_tag) \
.option("attributePrefix", "") \
.option("mode", "PERMISSIVE") \
.option("charset", charset) \
.option("inferSchema", False) \
.option("ignoreNamespace", False) \
.load(["test_dup2/*.xml"])
df.printSchema()
And I get this error:
---------------------------------------------------------------------------
AnalysisException Traceback (most recent call last)
<ipython-input-21-d50c6e08593d> in <module>
328 .option("inferSchema", False) \
329 .option("ignoreNamespace", False) \
--> 330 .load(["test_dup2/*.xml"])
331 # df.printSchema()
332 # 4, 7
/usr/local/spark/python/pyspark/sql/readwriter.py in load(self, path, format, schema, **options)
206 if type(path) != list:
207 path = [path]
--> 208 return self._df(self._jreader.load(self._spark._sc._jvm.PythonUtils.toSeq(path)))
209 else:
210 return self._df(self._jreader.load())
/usr/local/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
1303 answer = self.gateway_client.send_command(command)
1304 return_value = get_return_value(
-> 1305 answer, self.gateway_client, self.target_id, self.name)
1306
1307 for temp_arg in temp_args:
/usr/local/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
115 # Hide where the exception came from that shows a non-Pythonic
116 # JVM exception message.
--> 117 raise converted from None
118 else:
119 raise
AnalysisException: Found duplicate column(s) in the data schema: `xmlns:xsi`
But if I use this code, which loads each file separately:
main_tag="ЭДПФР"
charset="utf-8"
for file in os.listdir("test_dup"):
df = spark.read.format("xml") \
.option("rowTag", main_tag) \
.option("attributePrefix", "") \
.option("mode", "PERMISSIVE") \
.option("charset", charset) \
.option("inferSchema", False) \
.option("ignoreNamespace", False) \
.load(f"test_dup2/{file}")
df.printSchema()
Everything works fine. Sorry that the tag names are in another language. I have attached test XML files that reproduce the problem:
test_dup2.zip
Thank you in advance!
hipp0gryph commented
The exception is raised when attribute names in the XML differ only in letter case. In my situation they are xmlns:xsi and xmlns:Xsi. How can I resolve it?
srowen commented
XML namespaces are not supported. I think in this case you want to ignore them, then just ignore the parsed cols containing the namespace info. But if you need the namespace info, this won't work.