xitongsys/parquet-go

Reading parquet files written by spark returns unexpected values

mitchelllussier opened this issue · 0 comments

I am attempting to read/write parquet files following one of the examples. When I write data to a parquet file using this library and read it back, it behaves as expected. When I try to read a parquet file that was written by Apache Spark, I get values that are not decoded properly. I have tried all the different conversion types when defining my data structure, but nothing seems to work. Has anyone run into an issue like this before, or does anyone have any insight into what I am doing wrong? The parquet file I'm trying to read is definitely valid, because I have been able to parse it with an online parquet reader. Thanks!

Code:

// readParquet opens the parquet file at path, reads up to 10 rows into
// values of type T, and logs them as a single JSON array.
//
// T supplies the schema: a fresh *T is handed to reader.NewParquetReader,
// so T is expected to be a struct carrying `parquet:"..."` field tags.
//
// Failures are logged and the function returns without a result. The file
// handle and the parquet reader are released on every exit path.
func readParquet[T any](path string) {
	fr, err := local.NewLocalFileReader(path)
	if err != nil {
		log.Println("Can't open file", err)
		return
	}
	// Registered first so it runs last: the parquet reader must be stopped
	// before the underlying file is closed.
	defer func() {
		if cerr := fr.Close(); cerr != nil {
			log.Println("Can't close file", cerr)
		}
	}()

	// The third argument (4) is passed through to the library unchanged;
	// presumably the reader's parallelism level — confirm against the
	// parquet-go documentation.
	pr, err := reader.NewParquetReader(fr, new(T), 4)
	if err != nil {
		log.Println("Can't create column reader", err)
		return
	}
	defer pr.ReadStop()

	res, err := pr.ReadByNumber(10)
	if err != nil {
		log.Println("Can't read by number", err)
		return
	}

	jsonBs, err := json.Marshal(res)
	if err != nil {
		log.Println("Can't marshal json", err)
		return
	}

	log.Println(string(jsonBs))
}

// main declares the row layout expected in the input file and dumps the
// first rows of data.parquet through readParquet.
func main() {
	// Input file, resolved relative to the working directory.
	const inputPath = "data.parquet"

	// datastruct maps the two string columns of the file onto Go fields via
	// parquet struct tags.
	type datastruct struct {
		Character string `parquet:"name=character, type=BYTE_ARRAY, convertedtype=UTF8, encoding=rle"`
		Franchise string `parquet:"name=franchise, type=BYTE_ARRAY, convertedtype=UTF8, encoding=PLAIN_DICTIONARY"`
	}

	readParquet[datastruct](inputPath)
}

Output:

[{"Character":"\u0003\u0001","Franchise":"\u0003\u0001"}]

Expected Output:

[{"Character":"link","Franchise":"zelda"}]