How to read TFRecs from GCS bucket in Colab?
linkun-1998 opened this issue · 3 comments
linkun-1998 commented
Every Time I try to use any publicly available GCS bucket from which I can read Multiple or Single tfrecords, it raises the FileNotFoundError
, whereas when the same path is used in TensorFlow, gives the expected output.
This is the error I am getting:-
FileNotFoundError Traceback (most recent call last)
<ipython-input-20-2a5acbdc128e> in <module>()
11 transform=transforms)
12 loader = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE)
---> 13 data = next(iter(loader))
14 print(data)
6 frames
/usr/local/lib/python3.6/dist-packages/tfrecord/reader.py in tfrecord_iterator(data_path, index_path, shard)
42 file (for a single record).
43 """
---> 44 file = io.open(data_path, "rb")
45
46 length_bytes = bytearray(8)
FileNotFoundError: [Errno 2] No such file or directory: 'gs://flowers-public/tfrecords-jpeg-192x192-2/flowers05-230.tfrec'
This is the colab notebook which I was trying to implement.
Please correct me, if I'm wrong somewhere.
for the reference
vahidk commented
I don't have access to colab. Paste code here.
linkun-1998 commented
I don't have access to colab. Paste code here.
"""# For Multiple TFRecords"""
import os
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import torch
from tfrecord.torch.dataset import MultiTFRecordDataset
from sklearn.model_selection import train_test_split
IMAGE_SIZE = [192, 192]
BATCH_SIZE = 32
EPOCHS = 5
VALIDATION_SPLIT = 0.25
D_PATH = 'gs://flowers-public/tfrecords-jpeg-192x192-2/#.tfrec'
assert IMAGE_SIZE[0] == IMAGE_SIZE[1]
GCS_PATTERN = D_PATH.replace('#', '*')
filenames = tf.io.gfile.glob(GCS_PATTERN)
splits = {}
for file in filenames:
splits[str(file.split('/')[-1][:-6])] = 1/len(filenames)
print(splits)
tfrecord_pattern = D_PATH.replace('#', '{}')
index_pattern = None
def transforms(features):
features["image"] = cv2.resize(cv2.imdecode(features["image"], -1), tuple(IMAGE_SIZE))
features["class"] = np.squeeze(np.eye(num_classes)[np.array([features["class"]]).reshape(-1)])
return features
description = {"image": "byte",
"class": "int"}
dataset = MultiTFRecordDataset(tfrecord_pattern, index_pattern,
splits, description,
transform=transforms)
loader = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE)
data = next(iter(loader))
print(data['image'].shape)
print(data['class'].shape)
"""#For single TFRecord"""
from tfrecord.torch.dataset import TFRecordDataset
tfrecord_path = 'gs://flowers-public/tfrecords-jpeg-192x192-2/flowers05-230.tfrec'
index_path = None
description = {"image": "byte",
"class": "int"}
def transforms(features):
features["image"] = cv2.resize(cv2.imdecode(features["image"], -1), (*IMAGE_SIZE))
features["class"] = np.squeeze(np.eye(num_classes)[np.array([features["class"]]).reshape(-1)])
return features
dataset = TFRecordDataset(tfrecord_path, index_path, description, transform=transforms)
loader = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE)
data = next(iter(loader))
print(data)
vahidk commented
This is expected. We don't support reading from gcs bucket. Just mount the bucket to a local folder and update the path.