Generating images smaller than 512x512 gives only noises?

Question

Generating images smaller than 512x512 gives only noises?

zjysteven opened this issue a year ago · 4 comments

Hi,

Thank you for making the code public! I was trying to generate 128x128 images with TreeRing but I could only get noise. Adjusting the watermark radius doesn't help (see the following two examples, one with r = 10 and the other with r = 0).

In comparison, generating 512x512 images works well (the prompt is the same as 128x128 case).

I was wondering if there is something I'm missing. Below is my code snippet, which I slightly adapted from your code without modifying any core functionality.

import argparse
import os

from diffusers import DPMSolverMultistepScheduler
from inverse_stable_diffusion import InversableStableDiffusionPipeline
import torch
from tqdm import tqdm
from tree_ring_utils import (get_watermarking_mask, get_watermarking_pattern,
                             inject_watermark, set_random_seed)

# Inference-only script: turn off autograd globally.
torch.set_grad_enabled(False)
# Directory of this file with its last "/"-separated component dropped,
# i.e. the parent of the directory containing this script (assumes "/" path
# separators). Output directories in main() are created under it.
ROOT = "/".join(os.path.dirname(os.path.abspath(__file__)).split("/")[:-1])


def main(args):
    """Generate one watermarked image per prompt and save the artifacts.

    Loads the pipeline named by ``args.model_id``, collects the prompts
    (either the single ``args.prompt`` or the lines of ``args.from_file``),
    injects the watermark pattern into the initial latents of every sample
    and writes the resulting images to the output directory under ``ROOT``.
    The ground-truth patch is saved before generation; the watermarking mask
    is saved after generation.

    Args:
        args: Parsed command-line namespace. This function reads
            ``model_id``, ``from_file``, ``prompt``, ``gen_seed``,
            ``image_length``, ``num_images``, ``guidance_scale`` and
            ``num_inference_steps``; the whole namespace is also forwarded
            to the ``tree_ring_utils`` helpers.

    Raises:
        AssertionError: If no prompt file is given and ``args.prompt`` is
            ``None``.
    """
    # load diffusion model
    device = "cuda" if torch.cuda.is_available() else "cpu"

    scheduler = DPMSolverMultistepScheduler.from_pretrained(
        args.model_id, subfolder="scheduler"
    )
    pipe = InversableStableDiffusionPipeline.from_pretrained(
        args.model_id,
        scheduler=scheduler,
        torch_dtype=torch.float16,
        revision="fp16",
    )
    pipe = pipe.to(device)

    # prompts
    if not args.from_file:
        prompt = args.prompt
        assert prompt is not None
        data = [prompt]
        save_dir = (
            f"{ROOT}/imgs/TreeRing/samples"
        )
    else:
        print(f"reading prompts from {args.from_file}")
        with open(args.from_file, "r") as f:
            data = f.read().splitlines()
        save_dir = (
            f"{ROOT}/imgs/TreeRing/samples_{len(data)}/samples"
        )
    os.makedirs(save_dir, exist_ok=True)

    # ground-truth patch
    gt_patch = get_watermarking_pattern(pipe, args, device)
    torch.save(
        torch.view_as_real(gt_patch.cpu()),
        f"{save_dir}/gt_patch.pt",
    )

    # Stays None when there are no prompts (e.g. an empty prompt file), so
    # the save after the loop can be skipped instead of raising NameError.
    watermarking_mask = None

    for i, current_prompt in enumerate(tqdm(data)):
        # one distinct, reproducible seed per prompt
        seed = i + args.gen_seed

        # generation with watermarking
        set_random_seed(seed)
        init_latents_w = pipe.get_random_latents(
            height=args.image_length, width=args.image_length)

        # get watermarking mask
        watermarking_mask = get_watermarking_mask(init_latents_w, args, device)

        # inject watermark
        init_latents_w = inject_watermark(
            init_latents_w, watermarking_mask, gt_patch, args
        )

        # https://huggingface.co/CompVis/stable-diffusion-v1-4/discussions/10#641051192a593afb553c969e
        with torch.autocast("cuda"):
            outputs_w = pipe(
                current_prompt,
                num_images_per_prompt=args.num_images,
                guidance_scale=args.guidance_scale,
                num_inference_steps=args.num_inference_steps,
                height=args.image_length,
                width=args.image_length,
                latents=init_latents_w,
            )
        # only the first image of each prompt is kept
        orig_image_w = outputs_w.images[0]

        # output is PIL image
        orig_image_w.save(os.path.join(save_dir, f"{i:06d}.png"))

    # watermarking mask is needed in decoding; the mask from the last
    # iteration is the one written out
    if watermarking_mask is not None:
        torch.save(
            watermarking_mask.cpu(),
            f"{save_dir}/watermarking_mask.pt",
        )


if __name__ == "__main__":
    # Script entry point: declare the command-line options, parse, run main().
    parser = argparse.ArgumentParser(description="diffusion watermark")

    # (flag, default, type) rows registered in order; a type of None leaves
    # the value as the raw string argparse received.
    simple_options = [
        # generation
        ("--image_length", 512, int),
        ("--model_id", "stabilityai/stable-diffusion-2-1-base", None),
        ("--num_images", 1, int),
        ("--guidance_scale", 7.5, float),
        ("--num_inference_steps", 50, int),
        ("--gen_seed", 0, int),
        # watermark
        ("--w_seed", 999999, int),
        ("--w_channel", 3, int),
        ("--w_pattern", "ring", None),
        ("--w_mask_shape", "circle", None),
        ("--w_radius", 10, int),
        ("--w_measurement", "l1_complex", None),
        ("--w_injection", "complex", None),
        ("--w_pattern_const", 0, float),
    ]
    for flag, default, value_type in simple_options:
        parser.add_argument(flag, default=default, type=value_type)

    # prompt: either a single prompt string or a file of prompts
    parser.add_argument(
        "--prompt",
        type=str,
        nargs="?",
        default="a professional photograph of an astronaut riding a triceratops",
        help="the prompt to render",
    )
    parser.add_argument(
        "--from-file",
        type=str,
        help="if specified, load prompts from this file, separated by newlines",
    )

    args = parser.parse_args()
    main(args)

Would appreciate any help/inputs! Thanks!

Answer 1 · 2023-09-28T01:48:30.000Z

To add more information, the above two are 256x256 generated images with the same prompt in my first comment (one with r=0 and the other r=10).

I also tried to generate 1024x1024 images but unfortunately my GPU doesn't have enough memory...

Answer 2 · 2023-09-28T11:48:24.000Z

Hi, thanks for reaching out. I think the model stabilityai/stable-diffusion-2-1-base is fine-tuned on high resolution (>=512x512) images in the final training stage, so the model is not good at generating images smaller than 512x512 even without any watermarks injected.

According to this issue, CompVis/stable-diffusion#58, you can check out some other checkpoints like: https://huggingface.co/lambdalabs/sd-image-variations-diffusers.

Let me know if you have further problems with it!

Answer 3 · 2023-09-28T14:10:36.000Z

Thank you! I totally missed this point earlier...

Answer 4 · 2024-02-28T13:50:03.000Z

@zjysteven sorry to bother you, if I want to generate a watermarked image, what should I do?