Multi-caption strategy from parquet might not work
Closed this issue · 2 comments
AmericanPresidentJimmyCarter commented
The caption is checked for truthiness, but when the caption value is a list or array (a multi-caption row), that check crashes:
if not caption and fallback_caption_column:
if not caption and fallback_caption_column:
ValueError: The truth value of an array with more than one element is ambiguous.
Use a.any() or a.all()
You should change the extraction method to handle list-like captions, for example:
def _extract_captions_to_fast_list(self):
    """
    Pull the captions from the parquet table into a dict with the format {filename: caption}.

    The table is walked once so that captions can afterwards be looked up
    directly by filename.

    Configuration is read from ``self.parquet_config``:

    * ``filename_column``: column holding the identifier; when the column is
      not present in a row, the row index is used instead.
    * ``caption_column``: a single column name, or a list of column names whose
      values are combined into one list of captions.
    * ``fallback_caption_column``: optional column consulted when the primary
      caption is missing or empty.
    * ``identifier_includes_extension``: when falsy (the default), the file
      extension is removed from the identifier.

    Returns:
        dict: Maps each identifier to its caption. The caption is a stripped
        ``str`` for scalar values and a list of stripped, non-empty ``str`` for
        list-like values (list, tuple, numpy array, pandas Series).

    Raises:
        ValueError: If the parquet database is not loaded, or if a row has no
            usable caption in either the caption or the fallback column.
    """
    if self.parquet_database is None:
        raise ValueError("Parquet database is not loaded.")
    filename_column = self.parquet_config.get("filename_column")
    caption_column = self.parquet_config.get("caption_column")
    fallback_caption_column = self.parquet_config.get("fallback_caption_column")
    identifier_includes_extension = self.parquet_config.get(
        "identifier_includes_extension", False
    )
    list_like = (list, tuple, numpy.ndarray, pd.Series)

    def _iter_texts(value):
        # Yield every non-empty, stripped caption string contained in value,
        # descending into nested list-like values.
        if isinstance(value, list_like):
            for item in value:
                yield from _iter_texts(item)
            return
        if value is None:
            return
        if isinstance(value, bytes):
            value = value.decode("utf-8")
        elif not isinstance(value, str):
            # pandas represents missing cells as NaN/NA rather than None;
            # those must not become the literal caption "nan".
            if pd.isna(value):
                return
            value = str(value)
        value = value.strip()
        if value:
            yield value

    def _normalize(value):
        # Reduce a raw cell to a str, a list of str, or None when a scalar is
        # missing/blank. Never evaluates the truth value of an array, which
        # raises for arrays with more than one element.
        texts = list(_iter_texts(value))
        if isinstance(value, list_like):
            return texts
        return texts[0] if texts else None

    captions = {}
    for index, row in self.parquet_database.iterrows():
        if filename_column in row:
            filename = str(row[filename_column])
        else:
            filename = str(index)
        if not identifier_includes_extension:
            filename = os.path.splitext(filename)[0]

        if isinstance(caption_column, list):
            raw_caption = [row[c] for c in caption_column]
        else:
            raw_caption = row.get(caption_column)
        caption = _normalize(raw_caption)

        # At this point caption is only ever None, a str or a list, so plain
        # truthiness is safe and also covers "" and [].
        if not caption and fallback_caption_column:
            caption = _normalize(row.get(fallback_caption_column, None))
        if not caption:
            raise ValueError(
                f"Could not locate caption for image {filename} in sampler_backend {self.id} with filename column {filename_column}, caption column {caption_column}, and a parquet database with {len(self.parquet_database)} entries."
            )
        captions[filename] = caption
    return captions
AmericanPresidentJimmyCarter commented
It crashes later on too at:
# Check for empty strings
if (df[caption_column] == "").sum() > 0 and not fallback_caption_column:
raise ValueError(
f"Parquet file {parquet_path} contains empty strings in the '{caption_column}' column."
)
if (df[filename_column] == "").sum() > 0:
raise ValueError(
f"Parquet file {parquet_path} contains empty strings in the '{filename_column}' column."
)
AmericanPresidentJimmyCarter commented
The same list handling is also needed in prompts.py:
if type(image_caption) == bytes:
image_caption = image_caption.decode("utf-8")
if type(image_caption) == str:
image_caption = image_caption.strip()
+ if type(image_caption) in (list, tuple, numpy.ndarray, pd.Series):
+ image_caption = [str(item).strip() for item in image_caption if item is not None]
if prepend_instance_prompt:
if type(image_caption) == list:
image_caption = [instance_prompt + " " + x for x in image_caption]
else:
image_caption = instance_prompt + " " + image_caption
return image_caption