Image Generation with Diffusion Models

This guide demonstrates how to build powerful image generation services using state-of-the-art diffusion models like FLUX.1. You'll learn to create a complete image generation API with custom parameters, validation, and optimization.

Overview

The Chutes platform makes it easy to deploy advanced image generation models:

  • FLUX.1 [dev]: 12 billion parameter rectified flow transformer
  • Stable Diffusion: Various versions and fine-tuned models
  • Custom Models: Support for any diffusion architecture
  • GPU Optimization: Automatic scaling and memory management

Complete FLUX.1 Implementation

Input Schema Design

First, define comprehensive input validation using Pydantic:

from pydantic import BaseModel, Field
from typing import Optional

class GenerationInput(BaseModel):
    prompt: str
    height: int = Field(default=1024, ge=128, le=2048)
    width: int = Field(default=1024, ge=128, le=2048)
    num_inference_steps: int = Field(default=10, ge=1, le=30)
    guidance_scale: float = Field(default=7.5, ge=1.0, le=20.0)
    seed: Optional[int] = Field(default=None, ge=0, le=2**32 - 1)

# Simplified input for basic usage
class MinifiedGenerationInput(BaseModel):
    prompt: str = "a beautiful mountain landscape"

Custom Image Configuration

Create a pre-built image with the FLUX.1 model:

from chutes.image import Image

# Create a markdown readme from model documentation
readme = """`FLUX.1 [dev]` is a 12 billion parameter rectified flow transformer capable of generating images from text descriptions.

# Key Features
1. Cutting-edge output quality, second only to our state-of-the-art model `FLUX.1 [pro]`.
2. Competitive prompt following, matching the performance of closed source alternatives.
3. Trained using guidance distillation, making `FLUX.1 [dev]` more efficient.
4. Open weights to drive new scientific research, and empower artists to develop innovative workflows.
5. Generated outputs can be used for personal, scientific, and commercial purposes.
"""

# Use pre-built image with FLUX.1 model
image = (
    Image(
        username="myuser",
        name="flux.1-dev",
        tag="0.0.2",
        readme=readme)
    .from_base("parachutes/flux.1-dev:latest")
)

Chute Configuration

Set up the service with appropriate hardware requirements:

from chutes.chute import Chute, NodeSelector

chute = Chute(
    username="myuser",
    name="FLUX.1-dev-generator",
    readme=readme,
    image=image,
    # This model is quite large, so we'll require GPUs with at least 80GB of VRAM to run it.
    node_selector=NodeSelector(
        gpu_count=1,
        min_vram_gb_per_gpu=80,  # 80GB for optimal performance
    ),
    # Limit one request at a time.
    concurrency=1,
)

Model Initialization

Initialize the diffusion pipeline on startup:

@chute.on_startup()
async def initialize_pipeline(self):
    """
    Initialize the pipeline, download model if necessary.

    This code never runs on your machine directly; it runs on the GPU nodes
    powering chutes.
    """
    import torch
    from diffusers import FluxPipeline

    self.torch = torch
    torch.cuda.empty_cache()
    torch.cuda.init()
    torch.cuda.set_device(0)

    self.pipeline = FluxPipeline.from_pretrained(
        "black-forest-labs/FLUX.1-dev",
        torch_dtype=torch.bfloat16,
        local_files_only=True,
        cache_dir="/home/chutes/.cache/huggingface/hub",
    ).to("cuda")

Generation Endpoint

Create the main image generation endpoint:

import uuid
from io import BytesIO
from fastapi import Response

@chute.cord(
    # Expose this function via the subdomain-based chutes.ai HTTP invocation, e.g.
    # this becomes https://{username}-{chute slug}.chutes.ai/generate
    public_api_path="/generate",
    # The function is invoked in the subdomain-based system via POSTs.
    method="POST",
    # Input/minimal input schemas.
    input_schema=GenerationInput,
    minimal_input_schema=MinifiedGenerationInput,
    # Set output content type header to image/jpeg so we can return the raw image.
    output_content_type="image/jpeg",
)
async def generate(self, params: GenerationInput) -> Response:
    """
    Generate an image.
    """
    generator = None
    if params.seed is not None:
        generator = self.torch.Generator(device="cuda").manual_seed(params.seed)
    with self.torch.inference_mode():
        result = self.pipeline(
            prompt=params.prompt,
            height=params.height,
            width=params.width,
            num_inference_steps=params.num_inference_steps,
            guidance_scale=params.guidance_scale,
            max_sequence_length=256,
            generator=generator,
        )
    image = result.images[0]
    buffer = BytesIO()
    image.save(buffer, format="JPEG", quality=85)
    buffer.seek(0)
    return Response(
        content=buffer.getvalue(),
        media_type="image/jpeg",
        headers={"Content-Disposition": f'attachment; filename="{uuid.uuid4()}.jpg"'},
    )

Alternative: Stable Diffusion Setup

For a more customizable approach using Stable Diffusion:

from chutes.image import Image
from chutes.chute import Chute, NodeSelector

# Build custom Stable Diffusion image
image = (
    Image(username="myuser", name="stable-diffusion", tag="2.1")
    .from_base("nvidia/cuda:12.4.1-runtime-ubuntu22.04")
    .with_python("3.11")
    .run_command("apt update && apt install -y python3 python3-pip git")
    .run_command("pip3 install torch>=2.4.0 torchvision --index-url https://download.pytorch.org/whl/cu124")
    .run_command("pip3 install diffusers>=0.29.0 transformers>=4.44.0 accelerate>=0.33.0")
    .run_command("pip3 install fastapi uvicorn pydantic pillow")
    .set_workdir("/app")
)

chute = Chute(
    username="myuser",
    name="stable-diffusion-xl",
    image=image,
    node_selector=NodeSelector(
        gpu_count=1,
        min_vram_gb_per_gpu=24),
    concurrency=2)

@chute.on_startup()
async def load_sd_pipeline(self):
    """Load Stable Diffusion XL pipeline."""
    from diffusers import StableDiffusionXLPipeline
    import torch

    self.pipe = StableDiffusionXLPipeline.from_pretrained(
        "stabilityai/stable-diffusion-xl-base-1.0",
        torch_dtype=torch.float16,
        use_safetensors=True).to("cuda")

    # Enable memory-efficient attention (requires the xformers package)
    self.pipe.enable_xformers_memory_efficient_attention()

@chute.cord(public_api_path="/sdxl", method="POST")
async def generate_sdxl(self, prompt: str, width: int = 1024, height: int = 1024):
    """Generate images with Stable Diffusion XL."""
    images = self.pipe(
        prompt,
        width=width,
        height=height,
        num_inference_steps=20).images

    # Return first image as base64
    buffer = BytesIO()
    images[0].save(buffer, format="PNG")
    import base64
    return {"image": base64.b64encode(buffer.getvalue()).decode()}

Advanced Features

Batch Generation

Generate multiple images in a single request:

import base64
from typing import List

class BatchGenerationInput(BaseModel):
    prompts: List[str] = Field(max_items=4)  # Limit batch size (max_items is Pydantic v1; use max_length on v2)
    width: int = Field(default=1024, ge=512, le=2048)
    height: int = Field(default=1024, ge=512, le=2048)
    num_inference_steps: int = Field(default=20, ge=10, le=50)

@chute.cord(public_api_path="/batch", method="POST")
async def generate_batch(self, params: BatchGenerationInput) -> List[str]:
    """Generate multiple images from prompts."""
    results = []

    for prompt in params.prompts:
        with self.torch.inference_mode():
            result = self.pipeline(
                prompt=prompt,
                width=params.width,
                height=params.height,
                num_inference_steps=params.num_inference_steps)

        # Convert to base64
        buffer = BytesIO()
        result.images[0].save(buffer, format="JPEG", quality=90)
        b64_image = base64.b64encode(buffer.getvalue()).decode()
        results.append(b64_image)

    return results
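
The loop above trades throughput for predictable memory use. Diffusers pipelines also accept a list of prompts, so if VRAM allows you can generate the whole batch in a single forward pass (a sketch under that assumption):

with self.torch.inference_mode():
    result = self.pipeline(
        prompt=params.prompts,  # list of prompts -> one image per prompt
        width=params.width,
        height=params.height,
        num_inference_steps=params.num_inference_steps)
images = result.images  # len(images) == len(params.prompts)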

Image-to-Image Generation

Transform existing images with text prompts:

import base64
from PIL import Image as PILImage

class Img2ImgInput(BaseModel):
    prompt: str
    image_b64: str  # Base64 encoded input image
    strength: float = Field(default=0.75, ge=0.1, le=1.0)
    guidance_scale: float = Field(default=7.5, ge=1.0, le=20.0)

@chute.cord(public_api_path="/img2img", method="POST")
async def image_to_image(self, params: Img2ImgInput) -> Response:
    """Transform images with text prompts."""
    # Decode input image
    image_data = base64.b64decode(params.image_b64)
    init_image = PILImage.open(BytesIO(image_data)).convert("RGB")

    # Generate the transformed image. This assumes an image-to-image capable
    # pipeline (e.g. diffusers' AutoPipelineForImage2Image) was loaded at startup;
    # the text-to-image FluxPipeline above does not accept image/strength arguments.
    with self.torch.inference_mode():
        result = self.pipeline(
            prompt=params.prompt,
            image=init_image,
            strength=params.strength,
            guidance_scale=params.guidance_scale)

    # Return as JPEG
    buffer = BytesIO()
    result.images[0].save(buffer, format="JPEG", quality=85)
    buffer.seek(0)

    return Response(
        content=buffer.getvalue(),
        media_type="image/jpeg")

Inpainting Support

Fill or edit specific regions of images:

class InpaintInput(BaseModel):
    prompt: str
    image_b64: str      # Original image
    mask_b64: str       # Mask (white = inpaint, black = keep)
    strength: float = Field(default=0.75, ge=0.1, le=1.0)

@chute.on_startup()
async def load_inpaint_pipeline(self):
    """Load inpainting-specific pipeline."""
    import torch
    from diffusers import StableDiffusionInpaintPipeline

    self.inpaint_pipe = StableDiffusionInpaintPipeline.from_pretrained(
        "runwayml/stable-diffusion-inpainting",
        torch_dtype=torch.float16).to("cuda")

@chute.cord(public_api_path="/inpaint", method="POST")
async def inpaint(self, params: InpaintInput) -> Response:
    """Inpaint regions of images."""
    # Decode images
    image_data = base64.b64decode(params.image_b64)
    mask_data = base64.b64decode(params.mask_b64)

    image = PILImage.open(BytesIO(image_data)).convert("RGB")
    mask = PILImage.open(BytesIO(mask_data)).convert("L")

    # Generate inpainted result
    result = self.inpaint_pipe(
        prompt=params.prompt,
        image=image,
        mask_image=mask,
        strength=params.strength)

    # Return result
    buffer = BytesIO()
    result.images[0].save(buffer, format="PNG")
    buffer.seek(0)

    return Response(content=buffer.getvalue(), media_type="image/png")
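
Masks are plain grayscale images, so you can build one with PIL if you don't have an editor handy (a sketch; the rectangle coordinates are arbitrary):

from PIL import Image as PILImage, ImageDraw

# Black canvas the size of the source image; paint the region to inpaint white.
mask = PILImage.new("L", (1024, 1024), 0)
ImageDraw.Draw(mask).rectangle([256, 256, 768, 768], fill=255)
mask.save("mask.png")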

Deployment and Usage

Deploy Your Service

# Build and deploy the image generation service
chutes deploy my_image_gen:chute

# Monitor deployment status
chutes chutes get my-image-gen

Using the API

Basic Generation

curl -X POST "https://myuser-my-image-gen.chutes.ai/generate" \
  -H "Content-Type: application/json" \
  -d '{
    "prompt": "a majestic dragon flying over a crystal lake at sunset",
    "width": 1024,
    "height": 1024,
    "num_inference_steps": 20,
    "guidance_scale": 7.5,
    "seed": 42
  }' \
  --output generated_image.jpg

Python Client

import requests
import base64
from PIL import Image
from io import BytesIO

def generate_image(prompt, **kwargs):
    """Generate image using your Chutes service."""
    url = "https://myuser-my-image-gen.chutes.ai/generate"

    payload = {
        "prompt": prompt,
        **kwargs
    }

    response = requests.post(url, json=payload)

    if response.status_code == 200:
        # Save image
        with open("generated.jpg", "wb") as f:
            f.write(response.content)

        # Or display in Jupyter
        image = Image.open(BytesIO(response.content))
        return image
    else:
        print(f"Error: {response.status_code}")
        return None

# Generate an image
image = generate_image(
    "a cyberpunk cityscape with neon lights and flying cars",
    width=1920,
    height=1080,
    num_inference_steps=25,
    seed=123
)

Performance Optimization

Memory Management

# Enable memory-efficient attention (requires the xformers package)
self.pipeline.enable_xformers_memory_efficient_attention()

# Use attention slicing for large images
self.pipeline.enable_attention_slicing()

# Enable CPU offloading for very large models
# (use instead of .to("cuda"); the pipeline then manages device placement itself)
self.pipeline.enable_model_cpu_offload()

Speed Optimizations

# Compile the denoiser for faster inference
# (UNet-based pipelines; FLUX exposes it as self.pipeline.transformer instead)
self.pipeline.unet = torch.compile(self.pipeline.unet, mode="reduce-overhead")

# Use faster schedulers (Stable Diffusion pipelines; FLUX ships a flow-matching scheduler)
from diffusers import DPMSolverMultistepScheduler
self.pipeline.scheduler = DPMSolverMultistepScheduler.from_config(
    self.pipeline.scheduler.config
)

Hardware Scaling

# Scale up for higher throughput
node_selector = NodeSelector(
    gpu_count=2,  # Multi-GPU setup
    min_vram_gb_per_gpu=40)

# Or scale out with multiple instances
chute = Chute(
    # ... configuration
    concurrency=4,  # Handle more concurrent requests
)

Best Practices

1. Prompt Engineering

# Good prompts are specific and detailed
good_prompt = """
a photorealistic portrait of a wise old wizard with a long white beard,
wearing a starry blue robe, holding a glowing crystal staff,
in a mystical forest clearing with soft golden sunlight filtering through trees,
highly detailed, 8k resolution, fantasy art style
"""

# Add negative prompts to avoid unwanted elements
negative_prompt = """
blurry, low quality, deformed, ugly, bad anatomy,
watermark, signature, text, cropped
"""

2. Parameter Tuning

# High quality settings
high_quality_params = {
    "num_inference_steps": 50,
    "guidance_scale": 7.5,
    "width": 1024,
    "height": 1024,
}

# Fast generation settings
fast_params = {
    "num_inference_steps": 15,
    "guidance_scale": 5.0,
    "width": 512,
    "height": 512,
}
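
One way to expose these presets is a simple lookup with per-request overrides (a hypothetical helper, not part of the schemas above):

def resolve_params(quality: str = "fast", **overrides) -> dict:
    """Merge a named preset with per-request overrides (overrides win)."""
    base = high_quality_params if quality == "high" else fast_params
    return {**base, **overrides}

resolve_params("high", num_inference_steps=40)  # preset with one tweak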

3. Error Handling

@chute.cord(public_api_path="/generate", method="POST")
async def generate_with_fallback(self, params: GenerationInput) -> Response:
    """Generate with proper error handling."""
    try:
        # Try high-quality generation first
        result = self.pipeline(
            prompt=params.prompt,
            width=params.width,
            height=params.height,
            num_inference_steps=params.num_inference_steps)

    except torch.cuda.OutOfMemoryError:
        # Fallback to lower resolution
        logger.warning("OOM error, reducing resolution")
        result = self.pipeline(
            prompt=params.prompt,
            width=params.width // 2,
            height=params.height // 2,
            num_inference_steps=max(params.num_inference_steps // 2, 1))

    except Exception as e:
        logger.error(f"Generation failed: {e}")
        raise HTTPException(status_code=500, detail="Generation failed")

    # Return image...
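
On the client side, transient failures (cold starts, scaling events, OOM fallbacks) are worth retrying with backoff (a minimal sketch; the URL is a placeholder for your deployment):

import time
import requests

def generate_with_retry(payload: dict, retries: int = 3) -> bytes:
    """POST to the generate endpoint, retrying on errors with exponential backoff."""
    url = "https://myuser-my-image-gen.chutes.ai/generate"
    for attempt in range(retries):
        response = requests.post(url, json=payload, timeout=120)
        if response.status_code == 200:
            return response.content
        time.sleep(2 ** attempt)  # 1s, 2s, 4s ...
    response.raise_for_status()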

Monitoring and Scaling

Resource Monitoring

# Check GPU utilization
chutes chutes metrics my-image-gen

# View generation logs
chutes chutes logs my-image-gen --tail 100

# Monitor request patterns
chutes chutes status my-image-gen

Auto-scaling Configuration

# Configure auto-scaling based on queue length
chute = Chute(
    # ... other config
    concurrency=2,           # Base concurrency
    max_replicas=5,          # Scale up to 5 instances
    scale_up_threshold=10,   # Scale when queue > 10
    scale_down_delay=300,    # Wait 5 min before scaling down
)

Next Steps

  • Advanced Models: Experiment with ControlNet, LoRA fine-tuning
  • Custom Training: Train models on your own datasets
  • Integration: Build web interfaces and mobile apps
  • Optimization: Implement caching and CDN distribution

For more advanced examples, see: