Generating images smaller than 512x512 produces only noise?
zjysteven opened this issue · 4 comments
Hi,
Thank you for making the code public! I was trying to generate 128x128 images with TreeRing, but I could only get noise. Adjusting the watermark radius doesn't help (see the following two examples, one with r = 10 and the other with r = 0).
In comparison, generating 512x512 images works well (the prompt is the same as in the 128x128 case).
I was wondering if there is something I'm missing. Below is my code snippet, which I slightly adapted from your code without modifying any core functionality.
import argparse
import os
from diffusers import DPMSolverMultistepScheduler
from inverse_stable_diffusion import InversableStableDiffusionPipeline
import torch
from tqdm import tqdm
from tree_ring_utils import (get_watermarking_mask, get_watermarking_pattern,
inject_watermark, set_random_seed)
# Turn autograd off globally; nothing in this script calls backward().
torch.set_grad_enabled(False)

# ROOT is the absolute path of this script's directory with its last
# "/"-separated component removed, i.e. the parent of the script's folder.
_script_dir = os.path.dirname(os.path.abspath(__file__))
ROOT = "/".join(_script_dir.split("/")[:-1])
def main(args):
    """Generate one watermarked image per prompt and save everything to disk.

    The function loads ``InversableStableDiffusionPipeline`` for
    ``args.model_id`` (fp16 weights, DPM-Solver multistep scheduler), builds
    the ground-truth watermark patch, and then, for every prompt, draws
    random initial latents, injects the watermark into them and runs the
    pipeline.

    Files written to the output directory:

    * ``gt_patch.pt`` -- the ground-truth patch, stored via
      ``torch.view_as_real``.
    * ``{i:06d}.png`` -- the first image returned for the i-th prompt.
    * ``watermarking_mask.pt`` -- the watermarking mask (written only when
      at least one prompt was processed).

    Args:
        args: Parsed command-line namespace. This function reads
            ``model_id``, ``from_file``, ``prompt``, ``gen_seed``,
            ``image_length``, ``num_images``, ``guidance_scale`` and
            ``num_inference_steps``. The whole namespace is also handed to
            the ``tree_ring_utils`` helpers, which presumably read the
            ``w_*`` options -- confirm against that module.

    Raises:
        ValueError: If no prompt file is given and ``args.prompt`` is None.
    """
    # load diffusion model
    device = "cuda" if torch.cuda.is_available() else "cpu"
    scheduler = DPMSolverMultistepScheduler.from_pretrained(
        args.model_id, subfolder="scheduler"
    )
    pipe = InversableStableDiffusionPipeline.from_pretrained(
        args.model_id,
        scheduler=scheduler,
        torch_dtype=torch.float16,
        revision="fp16",
    )
    pipe = pipe.to(device)

    # prompts
    if not args.from_file:
        prompt = args.prompt
        # Explicit check instead of ``assert`` so it survives ``python -O``.
        if prompt is None:
            raise ValueError("a prompt is required when --from-file is not given")
        data = [prompt]
        save_dir = f"{ROOT}/imgs/TreeRing/samples"
    else:
        print(f"reading prompts from {args.from_file}")
        with open(args.from_file, "r") as f:
            data = f.read().splitlines()
        save_dir = f"{ROOT}/imgs/TreeRing/samples_{len(data)}/samples"
    # Both branches need the directory before anything is saved into it.
    os.makedirs(save_dir, exist_ok=True)

    # ground-truth patch
    gt_patch = get_watermarking_pattern(pipe, args, device)
    torch.save(
        torch.view_as_real(gt_patch.cpu()),
        f"{save_dir}/gt_patch.pt",
    )

    # Stays None when ``data`` is empty so the final save can be skipped
    # instead of failing on an undefined name.
    watermarking_mask = None

    for i, current_prompt in enumerate(tqdm(data)):
        # One seed per prompt, offset by the user-supplied base seed.
        seed = i + args.gen_seed

        # generation with watermarking
        set_random_seed(seed)
        init_latents_w = pipe.get_random_latents(
            height=args.image_length, width=args.image_length
        )

        # get watermarking mask
        watermarking_mask = get_watermarking_mask(init_latents_w, args, device)

        # inject watermark
        init_latents_w = inject_watermark(
            init_latents_w, watermarking_mask, gt_patch, args
        )

        # https://huggingface.co/CompVis/stable-diffusion-v1-4/discussions/10#641051192a593afb553c969e
        # NOTE(review): the autocast device is hard-coded to "cuda" even when
        # ``device`` falls back to "cpu"; left as-is to keep numerics unchanged.
        with torch.autocast("cuda"):
            outputs_w = pipe(
                current_prompt,
                num_images_per_prompt=args.num_images,
                guidance_scale=args.guidance_scale,
                num_inference_steps=args.num_inference_steps,
                height=args.image_length,
                width=args.image_length,
                latents=init_latents_w,
            )
        # Only the first returned image is kept, even if num_images > 1.
        orig_image_w = outputs_w.images[0]

        # output is PIL image
        orig_image_w.save(os.path.join(save_dir, f"{i:06d}.png"))

    # watermarking mask is needed in decoding
    # Written once, from the last prompt processed.
    if watermarking_mask is not None:
        torch.save(
            watermarking_mask.cpu(),
            f"{save_dir}/watermarking_mask.pt",
        )
if __name__ == "__main__":
    # Command-line entry point: declare the options, parse them, run main().
    parser = argparse.ArgumentParser(description="diffusion watermark")

    # Each entry is (flag, keyword arguments for add_argument). The order
    # here is the order in which the options are registered with the parser.
    option_table = [
        # generation
        ("--image_length", {"default": 512, "type": int}),
        ("--model_id", {"default": "stabilityai/stable-diffusion-2-1-base"}),
        ("--num_images", {"default": 1, "type": int}),
        ("--guidance_scale", {"default": 7.5, "type": float}),
        ("--num_inference_steps", {"default": 50, "type": int}),
        ("--gen_seed", {"default": 0, "type": int}),
        # watermark
        ("--w_seed", {"default": 999999, "type": int}),
        ("--w_channel", {"default": 3, "type": int}),
        ("--w_pattern", {"default": "ring"}),
        ("--w_mask_shape", {"default": "circle"}),
        ("--w_radius", {"default": 10, "type": int}),
        ("--w_measurement", {"default": "l1_complex"}),
        ("--w_injection", {"default": "complex"}),
        ("--w_pattern_const", {"default": 0, "type": float}),
        # prompt
        (
            "--prompt",
            {
                "type": str,
                "nargs": "?",
                "default": "a professional photograph of an astronaut riding a triceratops",
                "help": "the prompt to render",
            },
        ),
        (
            "--from-file",
            {
                "type": str,
                "help": "if specified, load prompts from this file, separated by newlines",
            },
        ),
    ]
    for flag, add_kwargs in option_table:
        parser.add_argument(flag, **add_kwargs)

    args = parser.parse_args()
    main(args)
Would appreciate any help/inputs! Thanks!
Hi, thanks for reaching out. I think the model stabilityai/stable-diffusion-2-1-base
is fine-tuned on high-resolution (>=512x512) images in the final training stage, so the model is not good at generating images smaller than 512x512, even without any watermark injected.
According to this issue, CompVis/stable-diffusion#58, you can check out some other checkpoints like: https://huggingface.co/lambdalabs/sd-image-variations-diffusers.
Let me know if you have further problems with it!
Thank you! I totally missed this point earlier...
@zjysteven Sorry to bother you. If I want to generate a watermarked image, what should I do?