Z-Image Diffusers Python SDK 开发接入指南:从入门到生产环境

2026 年 5 月 26 日

Z-Image Diffusers Python SDK 开发接入指南:从入门到生产环境

关键词: z-image diffusers python pipeline


目录


简介

HuggingFace Diffusers 库为扩散模型提供了标准化的 Python 接口。本指南介绍如何使用 Diffusers 接入 Z-Image 模型进行本地 Python 开发,涵盖从基础安装到生产环境部署的完整流程。

与 ZI-044(REST API 接入)不同,本文专注于本地 Python 环境开发,适用于需要更细粒度控制、自定义数据处理或模型微调的场景。

安装与配置

基础安装

# 安装核心依赖
pip install diffusers transformers accelerate safetensors

# 安装 PyTorch(根据 CUDA 版本选择)
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

# 安装图像处理库
pip install pillow numpy opencv-python

可选依赖

# xFormers(显存优化)
pip install xformers

# ONNX 运行时(可选推理后端)
pip install onnxruntime-gpu

# 量化推理
pip install optimum bitsandbytes

验证安装

import torch
import diffusers
import transformers

print(f"PyTorch: {torch.__version__}")
print(f"Diffusers: {diffusers.__version__}")
print(f"Transformers: {transformers.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")
print(f"GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'N/A'}")

ZImagePipeline 基础

Diffusers 提供了几种与 Z-Image 配合使用的 Pipeline 类:

Pipeline 类 用途
ZImagePipeline 文本到图像生成
ZImageImg2ImgPipeline 图像到图像转换
ZImageInpaintPipeline 图像修复(Inpainting)
StableDiffusionPipeline 兼容模式(Flux-based)

加载模型

from diffusers import ZImagePipeline

# 从 HuggingFace Hub 加载
pipe = ZImagePipeline.from_pretrained(
    "z-image/omni-base",
    torch_dtype=torch.float16,
    use_safetensors=True
)

# 或从本地路径加载
pipe = ZImagePipeline.from_pretrained(
    "./models/z-image-omni-base",
    torch_dtype=torch.float16
)

# 移动到 GPU
pipe.to("cuda")

模型加载选项

pipe = ZImagePipeline.from_pretrained(
    "z-image/omni-base",
    torch_dtype=torch.float16,          # FP16 精度
    use_safetensors=True,               # 使用 safetensors 格式
    variant="fp16",                     # 变体选择
    device_map="auto",                  # 自动设备分配
    low_cpu_mem_usage=True,             # 减少 CPU 内存使用
    load_in_4bit=True,                  # NF4 量化(需 bitsandbytes)
)

文本到图像代码示例

基本生成

import torch
from diffusers import ZImagePipeline

# 加载管道
pipe = ZImagePipeline.from_pretrained(
    "z-image/omni-base",
    torch_dtype=torch.float16
)
pipe.to("cuda")

# 生成图像
prompt = "a serene lake at sunrise, mountains in the background, photorealistic, 4K quality"

image = pipe(
    prompt=prompt,
    width=1024,
    height=1024,
    num_inference_steps=30,
    guidance_scale=7.5,
    generator=torch.Generator(device="cuda").manual_seed(42)
).images[0]

# 保存图像
image.save("output.png")

使用负面提示词

# 注意:Z-Image 使用双编码器(T5-XXL + CLIP-L)
# 负面提示词通过 negative_prompt 参数传递

negative_prompt = "blurry, low quality, deformed, distorted, bad anatomy, watermark, text, signature"

image = pipe(
    prompt="a professional headshot portrait, studio lighting, sharp focus",
    negative_prompt=negative_prompt,
    width=1024,
    height=1024,
    num_inference_steps=30,
    guidance_scale=7.5,
    generator=torch.Generator(device="cuda").manual_seed(123)
).images[0]

image.save("portrait.png")

指定宽高比

# 不同宽高比
aspect_ratios = {
    "1:1": (1024, 1024),
    "16:9": (1344, 768),
    "9:16": (768, 1344),
    "4:3": (1152, 896),
    "3:2": (1152, 768),
}

for name, (w, h) in aspect_ratios.items():
    image = pipe(
        prompt="a futuristic cityscape at night, neon lights, cyberpunk style",
        width=w,
        height=h,
        num_inference_steps=30,
        guidance_scale=7.5,
    ).images[0]
    image.save(f"cityscape_{name}.png")

图像到图像代码示例

基本图像到图像

from diffusers import ZImageImg2ImgPipeline
from PIL import Image

# 加载 img2img 管道
img2img_pipe = ZImageImg2ImgPipeline.from_pretrained(
    "z-image/omni-base",
    torch_dtype=torch.float16
)
img2img_pipe.to("cuda")

# 加载参考图像
input_image = Image.open("reference.jpg").convert("RGB")
input_image = input_image.resize((1024, 1024))

# 图像到图像转换
result = img2img_pipe(
    prompt="convert to oil painting style, thick brushstrokes, vivid colors",
    image=input_image,
    strength=0.75,          # 重绘强度 (0.0-1.0)
    num_inference_steps=30,
    guidance_scale=6.0,
    generator=torch.Generator(device="cuda").manual_seed(456)
)

result.images[0].save("oil_painting.png")

风格转换

# 不同风格转换的 strength 建议值
style_configs = {
    "轻微风格化": 0.3,
    "中度风格化": 0.5,
    "重度风格化": 0.75,
    "完全风格化": 0.9,
}

for style_name, strength in style_configs.items():
    result = img2img_pipe(
        prompt="anime art style, cel shading, vibrant colors, detailed",
        image=input_image,
        strength=strength,
        num_inference_steps=25,
        guidance_scale=5.5,
    )
    result.images[0].save(f"anime_{style_name}.png")

图像修复(Inpainting)

from diffusers import ZImageInpaintPipeline
import numpy as np

# 加载 inpaint 管道
inpaint_pipe = ZImageInpaintPipeline.from_pretrained(
    "z-image/omni-base",
    torch_dtype=torch.float16
)
inpaint_pipe.to("cuda")

# 准备输入
source_image = Image.open("photo.jpg").convert("RGB").resize((1024, 1024))

# 创建遮罩(白色 = 需要修复的区域)
mask = np.zeros((1024, 1024), dtype=np.uint8)
# 例如:修复图像右侧
mask[:, 512:1024] = 255
mask_image = Image.fromarray(mask)

# 执行修复
result = inpaint_pipe(
    prompt="a lush garden with colorful flowers, natural sunlight",
    image=source_image,
    mask_image=mask_image,
    strength=0.85,
    num_inference_steps=30,
    guidance_scale=8.0,
    generator=torch.Generator(device="cuda").manual_seed(789)
)

result.images[0].save("inpainted.png")

提示词格式化

提示词结构

Z-Image 使用双文本编码器(T5-XXL + CLIP-L),提示词格式对输出质量有显著影响。

推荐格式

# 基础格式:主体 + 环境 + 风格 + 质量修饰
prompt = (
    "a [主体] "           # a young woman
    "in [环境] "          # in a cozy coffee shop
    "[风格修饰] "          # cinematic lighting, depth of field
    "[质量修饰]"          # photorealistic, 4K, highly detailed
)

# Concrete example. Adjacent string literals inside parentheses are joined at
# compile time, so no line-continuation character is required ("/" is the
# division operator and is a syntax error at the end of a line).
prompt = (
    "a young woman in a cozy coffee shop, cinematic lighting, "
    "depth of field, photorealistic, 4K, highly detailed, "
    "shot on 85mm lens, golden hour"
)

不同风格的提示词模板

# Prompt templates per visual style. Bracketed tokens such as [subject] are
# placeholders to be substituted by the caller. Each template is built from
# adjacent string literals inside parentheses (implicit concatenation).

# Photography style
photo_prompt = (
    "a [subject], natural lighting, shot on [camera/lens], "
    "[photo style], 4K, highly detailed, realistic"
)

# Anime style
anime_prompt = (
    "a [subject], anime style, cel shading, vibrant colors, "
    "detailed background, studio quality, key visual"
)

# Oil painting style
oil_prompt = (
    "a [subject], oil painting style, thick brushstrokes, "
    "classical composition, rich colors, museum quality"
)

# 3D render style
render_prompt = (
    "a [subject], 3D render, octane render, ray tracing, "
    "unreal engine 5, volumetric lighting, 8K"
)

负面提示词模板

# 通用负面提示词
default_negative = (
    "blurry, low quality, worst quality, lowres, "
    "deformed, distorted, disfigured, bad anatomy, "
    "watermark, text, signature, username, "
    "extra limbs, extra fingers, fused fingers"
)

# 摄影专用负面提示词
photo_negative = (
    "oversaturated, underexposed, overexposed, "
    "motion blur, lens flare, noise, grain, "
    "jpeg artifacts, compression artifacts"
)

# 动漫专用负面提示词
anime_negative = (
    "3D render, photorealistic, realistic, "
    "bad anatomy, deformed hands, extra fingers, "
    "watermark, text, signature"
)

参数调优

Guidance Scale(引导尺度)

# Guidance Scale 对照表
# 范围: 1.0 - 15.0
# 推荐值因任务而异

guidance_configs = {
    "文本到图像": {"min": 5.0, "max": 10.0, "default": 7.5},
    "图像到图像": {"min": 3.0, "max": 7.0, "default": 5.5},
    "图像修复": {"min": 5.0, "max": 10.0, "default": 8.0},
}

# 测试不同 guidance scale
for cfg in [3.0, 5.0, 7.5, 10.0, 12.0]:
    image = pipe(
        prompt="a majestic eagle flying over mountains",
        width=1024,
        height=1024,
        num_inference_steps=30,
        guidance_scale=cfg,
        generator=torch.Generator(device="cuda").manual_seed(42)
    ).images[0]
    image.save(f"cfg_{cfg:.1f}.png")

推理步数

# 步数与质量/速度的权衡
steps_configs = [
    {"steps": 10, "description": "快速预览"},
    {"steps": 20, "description": "日常使用"},
    {"steps": 30, "description": "高质量"},
    {"steps": 50, "description": "极致质量"},
]

for cfg in steps_configs:
    image = pipe(
        prompt="a detailed mechanical watch interior, macro photography",
        width=1024,
        height=1024,
        num_inference_steps=cfg["steps"],
        guidance_scale=7.5,
        generator=torch.Generator(device="cuda").manual_seed(42)
    ).images[0]
    image.save(f"steps_{cfg['steps']}.png")

种子控制

# 固定种子确保可复现
seed = 42
generator = torch.Generator(device="cuda").manual_seed(seed)

# 种子变异:探索同一提示词的不同输出
base_seed = 42
for offset in range(0, 10):
    current_seed = base_seed + offset
    generator = torch.Generator(device="cuda").manual_seed(current_seed)
    image = pipe(
        prompt="a cat wearing sunglasses, funny pose, studio background",
        width=1024,
        height=1024,
        num_inference_steps=30,
        guidance_scale=7.5,
        generator=generator
    ).images[0]
    image.save(f"cat_var_{current_seed}.png")

Strength 参数(img2img / inpaint)

# Strength 参数说明:
# 0.0 = 完全不改变原图
# 0.5 = 中度修改
# 1.0 = 完全重绘(等同于从噪声开始)

# img2img 推荐范围: 0.3 - 0.75
# inpaint 推荐范围: 0.7 - 0.95

GPU 优化

xFormers

# 启用 xFormers 优化注意力计算
try:
    import xformers
    pipe.enable_xformers_memory_efficient_attention()
    print("xFormers enabled")
except ImportError:
    print("xFormers not available, using default attention")
except Exception as e:
    print(f"xFormers error: {e}")

Tensor Float 32

# 在 Ampere+ GPU (RTX 30xx, A100 等) 上启用 TF32
import torch
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

Torch.compile

# Speed up inference with torch.compile (PyTorch 2.0+).
# NOTE(review): the diffusers ZImagePipeline exposes its denoiser as
# `pipe.transformer` (a diffusion transformer) rather than `pipe.unet`;
# confirm against the installed diffusers version.
pipe.transformer = torch.compile(pipe.transformer)

# Note: the first inference call pays the compilation overhead.
# Subsequent calls are roughly 10-30% faster.

内存优化

# CPU Offload(显存不足时)
pipe.enable_model_cpu_offload()

# 或手动管理显存
pipe.to("cuda")
# ... 执行推理 ...
pipe.to("cpu")
torch.cuda.empty_cache()

# 低 VRAM 模式
pipe.enable_sequential_cpu_offload()

混合优化

# Combine several optimizations on one pipeline
pipe = ZImagePipeline.from_pretrained(
    "z-image/omni-base",
    torch_dtype=torch.float16
)

# xFormers is optional: keep going with the default attention when it cannot
# be enabled, but report the reason. A bare `except:` would also swallow
# KeyboardInterrupt / SystemExit and hide real configuration problems.
try:
    pipe.enable_xformers_memory_efficient_attention()
except Exception as e:
    print(f"xFormers not enabled, using default attention: {e}")

pipe.enable_attention_slicing("max")
pipe.enable_vae_slicing()

pipe.to("cuda")

# TF32 (Ampere+ only)
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

批量处理

基础批量生成

# 方法一:将提示词列表直接传给 prompt 参数,一次调用生成多张图像
prompts = [
    "a forest path at dawn",
    "ocean waves crashing on rocks",
    "snowy mountain peak at sunset",
    "desert landscape with cacti",
]

results = pipe(
    prompt=prompts,
    width=1024,
    height=1024,
    num_inference_steps=30,
    guidance_scale=7.5,
)

for i, img in enumerate(results.images):
    img.save(f"batch_{i}.png")

批量处理脚本

import os
import json
import torch
from diffusers import ZImagePipeline
from pathlib import Path

class BatchImageGenerator:
    """Batch helper that reuses one ``ZImagePipeline`` for many generations.

    The pipeline is loaded once in the constructor; ``generate_batch`` renders
    one image per entry of a JSON config file and ``generate_grid`` renders a
    seed-comparison grid for a single prompt.
    """

    def __init__(self, model_path: str, device: str = "cuda"):
        """Load the pipeline from ``model_path`` in float16 and move it to ``device``."""
        self.pipe = ZImagePipeline.from_pretrained(
            model_path,
            torch_dtype=torch.float16
        )
        self.pipe.to(device)
        self.device = device

    def generate_batch(self, config_file: str, output_dir: str):
        """Generate one image per entry of a JSON config file.

        Args:
            config_file: Path to a UTF-8 JSON file holding a list of objects.
                Each object needs a ``prompt`` key and may set ``width``,
                ``height``, ``steps``, ``guidance_scale`` and ``seed``.
            output_dir: Directory for the results (created if missing). Files
                are named ``img_<index>.png`` using the entry's list index.

        Processing is best-effort per entry: an entry that fails for any
        reason (including a missing ``prompt``) is reported and skipped so
        that the rest of the batch still runs.
        """
        # Explicit encoding: prompts may contain non-ASCII text and the
        # platform default encoding is not UTF-8 everywhere.
        with open(config_file, encoding="utf-8") as f:
            configs = json.load(f)

        os.makedirs(output_dir, exist_ok=True)

        for i, cfg in enumerate(configs):
            output_path = os.path.join(output_dir, f"img_{i:04d}.png")
            try:
                # Reading the entry happens inside the try block so that a
                # malformed entry cannot abort the whole batch.
                generator = torch.Generator(device=self.device).manual_seed(
                    cfg.get("seed", 42)
                )
                result = self.pipe(
                    prompt=cfg["prompt"],
                    width=cfg.get("width", 1024),
                    height=cfg.get("height", 1024),
                    num_inference_steps=cfg.get("steps", 30),
                    guidance_scale=cfg.get("guidance_scale", 7.5),
                    generator=generator
                )
                result.images[0].save(output_path)
                print(f"Generated: {output_path}")
            except Exception as e:
                print(f"Error generating {i}: {e}")

    def generate_grid(self, prompt: str, seeds: list, output_path: str):
        """Render ``prompt`` once per seed and save the results as one grid image.

        Args:
            prompt: Text prompt shared by every tile.
            seeds: Non-empty list of integer seeds, one tile per seed.
            output_path: Destination image file for the assembled grid.

        Raises:
            ValueError: If ``seeds`` is empty.
        """
        import torchvision.utils as vutils
        from torchvision.transforms.functional import to_tensor

        if not seeds:
            raise ValueError("seeds must not be empty")

        tiles = []
        for seed in seeds:
            generator = torch.Generator(device=self.device).manual_seed(seed)
            result = self.pipe(
                prompt=prompt,
                width=512,
                height=512,
                num_inference_steps=20,
                guidance_scale=7.5,
                generator=generator
            )
            # make_grid works on tensors, not PIL images, so convert each
            # tile to a CxHxW float tensor first.
            tiles.append(to_tensor(result.images[0].convert("RGB")))

        # nrow is the number of tiles per row; floor(sqrt(n)) keeps the grid
        # roughly square.
        grid = vutils.make_grid(tiles, nrow=int(len(seeds) ** 0.5))
        # The grid is a tensor (it has no .save method); save_image writes it.
        vutils.save_image(grid, output_path)

配置文件示例 (config.json)

[
  {
    "prompt": "a magical forest with glowing mushrooms, fantasy art",
    "width": 1024,
    "height": 1024,
    "steps": 30,
    "guidance_scale": 7.5,
    "seed": 1001
  },
  {
    "prompt": "a steampunk airship above clouds, cinematic lighting",
    "width": 1344,
    "height": 768,
    "steps": 30,
    "guidance_scale": 7.5,
    "seed": 1002
  },
  {
    "prompt": "underwater coral reef, vibrant colors, macro photography",
    "width": 1024,
    "height": 1024,
    "steps": 25,
    "guidance_scale": 7.0,
    "seed": 1003
  }
]

生产部署模式

FastAPI 服务

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from PIL import Image
import io
import torch
from diffusers import ZImagePipeline

app = FastAPI(title="Z-Image Generation API")

# 全局模型实例
pipe = None

# Request body for POST /generate. Only `prompt` is required; every other
# field has a default and is forwarded to the pipeline call by the handler.
class GenerateRequest(BaseModel):
    prompt: str
    negative_prompt: str = ""  # empty string is sent to the pipeline as None
    width: int = 1024
    height: int = 1024
    steps: int = 30  # passed as num_inference_steps
    guidance_scale: float = 7.5
    seed: int = 42  # seeds the torch.Generator used for the request

# Response body for POST /generate.
class GenerateResponse(BaseModel):
    image_bytes: str  # base64-encoded PNG data
    metadata: dict  # echo of the generation parameters used for the image

@app.on_event("startup")
async def load_model():
    """Load the pipeline into the module-level ``pipe`` when the app starts.

    The model is loaded in float16 and moved to the GPU. xFormers attention is
    enabled on a best-effort basis: failure to enable it is reported and the
    default attention implementation is used instead.
    """
    global pipe
    pipe = ZImagePipeline.from_pretrained(
        "z-image/omni-base",
        torch_dtype=torch.float16
    )
    pipe.to("cuda")
    # Optional optimization. Catch Exception (not a bare `except:`) so that
    # KeyboardInterrupt / SystemExit still propagate, and say why it was skipped.
    try:
        pipe.enable_xformers_memory_efficient_attention()
    except Exception as e:
        print(f"xFormers not enabled, using default attention: {e}")

import threading

# A single pipeline instance is shared by all requests. Inference calls are
# serialized through this lock.
# NOTE(review): assumes one pipeline object must not run overlapping calls —
# confirm against the diffusers documentation for the pipeline in use.
_pipe_lock = threading.Lock()


@app.post("/generate", response_model=GenerateResponse)
def generate(req: GenerateRequest):
    """Generate one image for ``req`` and return it as a base64-encoded PNG.

    Declared as a plain ``def`` on purpose: the pipeline call is long-running
    and blocking, and FastAPI executes non-async endpoints in a worker thread,
    so the event loop stays free to serve other requests.

    Raises:
        HTTPException: 503 if the model has not been loaded yet, 500 if
            generation or encoding fails.
    """
    import base64

    if pipe is None:
        raise HTTPException(status_code=503, detail="Model is not loaded yet")

    try:
        generator = torch.Generator(device="cuda").manual_seed(req.seed)
        with _pipe_lock:
            result = pipe(
                prompt=req.prompt,
                # An empty negative prompt is passed as None.
                negative_prompt=req.negative_prompt or None,
                width=req.width,
                height=req.height,
                num_inference_steps=req.steps,
                guidance_scale=req.guidance_scale,
                generator=generator
            )

        # Encode the first generated image as PNG, then base64 for JSON transport.
        buf = io.BytesIO()
        result.images[0].save(buf, format="PNG")
        img_b64 = base64.b64encode(buf.getvalue()).decode("utf-8")

        return GenerateResponse(
            image_bytes=img_b64,
            metadata={
                "width": req.width,
                "height": req.height,
                "steps": req.steps,
                "guidance_scale": req.guidance_scale,
                "seed": req.seed
            }
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

# 运行: uvicorn main:app --host 0.0.0.0 --port 8000

Flask 服务

from flask import Flask, request, jsonify, Response
from PIL import Image
import io
import torch
from diffusers import ZImagePipeline

app = Flask(__name__)
pipe = None


def initialize():
    """Load the pipeline on first use and return the shared instance.

    ``Flask.before_first_request`` was removed in Flask 2.3, so the model is
    loaded explicitly: eagerly when the script is run directly, and lazily from
    the request handler otherwise.
    """
    global pipe
    if pipe is None:
        loaded = ZImagePipeline.from_pretrained(
            "z-image/omni-base",
            torch_dtype=torch.float16
        )
        loaded.to("cuda")
        # Publish the global only after the pipeline is fully set up.
        pipe = loaded
    return pipe


@app.route("/generate", methods=["POST"])
def generate():
    """Generate an image from the JSON request body and return it as a PNG.

    Accepted keys: ``prompt`` (required), ``width``, ``height``, ``steps``,
    ``guidance_scale`` and ``seed``. Responds with 400 when the body is not a
    JSON object or the prompt is missing/empty.
    """
    # silent=True yields None instead of raising on a missing/invalid body.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "request body must be a JSON object"}), 400

    prompt = data.get("prompt", "")
    if not prompt:
        return jsonify({"error": "prompt is required"}), 400

    width = data.get("width", 1024)
    height = data.get("height", 1024)
    steps = data.get("steps", 30)
    cfg = data.get("guidance_scale", 7.5)
    seed = data.get("seed", 42)

    generator = torch.Generator(device="cuda").manual_seed(seed)
    result = initialize()(
        prompt=prompt,
        width=width,
        height=height,
        num_inference_steps=steps,
        guidance_scale=cfg,
        generator=generator
    )

    buf = io.BytesIO()
    result.images[0].save(buf, format="PNG")

    return Response(
        buf.getvalue(),
        mimetype="image/png",
        headers={"Content-Disposition": "attachment; filename=output.png"}
    )


# Run with: python flask_app.py
if __name__ == "__main__":
    initialize()
    app.run(host="127.0.0.1", port=5000)

错误处理

常见错误及处理

import traceback

def safe_generate(pipe, prompt: str, **kwargs):
    """Run ``pipe`` and degrade gracefully instead of raising.

    On a CUDA out-of-memory error the CUDA cache is cleared and the call is
    retried at half the width and height (both default to 1024 when absent
    from ``kwargs``). Retrying stops once either side would fall below 256
    pixels, so a model that never fits cannot loop forever.

    Args:
        pipe: Callable pipeline, invoked as ``pipe(prompt=prompt, **kwargs)``.
        prompt: Text prompt.
        **kwargs: Extra keyword arguments forwarded to ``pipe``.

    Returns:
        Whatever ``pipe`` returns, or ``None`` if generation failed.
    """
    min_side = 256  # smallest resolution worth retrying at

    while True:
        try:
            return pipe(prompt=prompt, **kwargs)
        except torch.cuda.OutOfMemoryError as e:
            # Must precede RuntimeError: OutOfMemoryError is a subclass of it.
            print(f"显存不足: {e}")
            torch.cuda.empty_cache()
            width = kwargs.get("width", 1024) // 2
            height = kwargs.get("height", 1024) // 2
            if min(width, height) < min_side:
                print(f"已降至最小分辨率 {min_side},放弃重试")
                return None
            # Retry at the reduced resolution.
            kwargs["width"] = width
            kwargs["height"] = height
        except RuntimeError as e:
            if "CUDA" in str(e):
                print(f"CUDA 错误: {e}")
                torch.cuda.empty_cache()
            else:
                print(f"运行时错误: {e}")
            return None
        except Exception as e:
            print(f"未知错误: {e}")
            traceback.print_exc()
            return None

# 使用示例
result = safe_generate(
    pipe,
    prompt="a beautiful landscape",
    width=1024,
    height=1024,
    num_inference_steps=30,
    guidance_scale=7.5,
    generator=torch.Generator(device="cuda").manual_seed(42)
)

模型加载错误处理

def load_model_safe(model_path: str, fallback_path: str = None):
    """Load a ``ZImagePipeline``, trying ``fallback_path`` if the first load fails.

    Args:
        model_path: Hub id or local path passed to ``from_pretrained``.
        fallback_path: Optional second location to try when ``model_path``
            cannot be loaded for any reason.

    Returns:
        The loaded pipeline, or ``None`` if every attempt failed.
    """
    try:
        pipe = ZImagePipeline.from_pretrained(
            model_path,
            torch_dtype=torch.float16,
            use_safetensors=True
        )
    except FileNotFoundError:
        print(f"模型文件不存在: {model_path}")
    except Exception as e:
        # A failed load is not necessarily a FileNotFoundError, so the
        # fallback below is attempted for every kind of failure.
        print(f"模型加载失败: {e}")
    else:
        print(f"模型加载成功: {model_path}")
        return pipe

    if fallback_path:
        print(f"尝试备用路径: {fallback_path}")
        # The recursive call carries no further fallback, so it ends after one retry.
        return load_model_safe(fallback_path)
    return None

实用示例

简单生成脚本

#!/usr/bin/env python3
"""Z-Image 简单图像生成脚本"""

import argparse
import torch
from diffusers import ZImagePipeline

def main():
    """Command-line entry point: parse options, generate one image, save it."""
    cli = argparse.ArgumentParser(description="Z-Image 图像生成")
    cli.add_argument("--prompt", type=str, required=True, help="生成提示词")
    cli.add_argument("--output", type=str, default="output.png", help="输出文件")
    # The remaining options take only a type and a default; declare them from
    # a table, in the same order as they appear in the help output.
    simple_options = (
        ("--width", int, 1024),
        ("--height", int, 1024),
        ("--steps", int, 30),
        ("--cfg", float, 7.5),
        ("--seed", int, 42),
        ("--model", str, "z-image/omni-base"),
        ("--neg", str, ""),
    )
    for flag, kind, default in simple_options:
        cli.add_argument(flag, type=kind, default=default)

    opts = cli.parse_args()

    # Load the model in float16 and place it on the GPU.
    pipeline = ZImagePipeline.from_pretrained(opts.model, torch_dtype=torch.float16)
    pipeline.to("cuda")

    # Seeded generator so that the same --seed gives the same starting state.
    rng = torch.Generator(device="cuda")
    rng.manual_seed(opts.seed)

    call_args = {
        "prompt": opts.prompt,
        # An empty --neg is passed to the pipeline as None.
        "negative_prompt": opts.neg or None,
        "width": opts.width,
        "height": opts.height,
        "num_inference_steps": opts.steps,
        "guidance_scale": opts.cfg,
        "generator": rng,
    }
    output = pipeline(**call_args)

    first_image = output.images[0]
    first_image.save(opts.output)
    print(f"图像已保存: {opts.output}")

if __name__ == "__main__":
    main()

# 使用: python generate.py --prompt "a cat" --output cat.png --seed 123

批量图像生成器

#!/usr/bin/env python3
"""批量图像生成器"""

import json
import os
import torch
from diffusers import ZImagePipeline
from pathlib import Path
import time

def generate_batch(prompts_file, output_dir, model_path="z-image/omni-base"):
    """Generate one image per entry of a JSON prompts file.

    Args:
        prompts_file: Path to a UTF-8 JSON file holding a list of objects.
            Each object needs a ``prompt`` key and may set ``seed``, ``width``,
            ``height``, ``steps`` and ``guidance_scale``. A missing seed
            defaults to ``42 + index``.
        output_dir: Directory for the results (created if missing). Files are
            named ``img_<index>.png``.
        model_path: Hub id or local path of the model to load.

    Raises:
        FileNotFoundError: If ``prompts_file`` does not exist.
    """
    # Read the prompts first so that a missing or invalid file fails fast,
    # before the model load. Explicit encoding because prompts may contain
    # non-ASCII text and the platform default is not UTF-8 everywhere.
    with open(prompts_file, encoding="utf-8") as f:
        prompts = json.load(f)

    os.makedirs(output_dir, exist_ok=True)

    pipe = ZImagePipeline.from_pretrained(
        model_path,
        torch_dtype=torch.float16
    )
    pipe.to("cuda")

    total = len(prompts)
    for i, item in enumerate(prompts):
        # perf_counter is monotonic, unlike wall-clock time.time().
        start = time.perf_counter()
        seed = item.get("seed", 42 + i)

        generator = torch.Generator(device="cuda").manual_seed(seed)
        result = pipe(
            prompt=item["prompt"],
            width=item.get("width", 1024),
            height=item.get("height", 1024),
            num_inference_steps=item.get("steps", 30),
            guidance_scale=item.get("guidance_scale", 7.5),
            generator=generator
        )

        output_path = os.path.join(output_dir, f"img_{i:04d}.png")
        result.images[0].save(output_path)
        elapsed = time.perf_counter() - start
        print(f"[{i+1}/{total}] Saved {output_path} ({elapsed:.1f}s)")

    print(f"完成!共生成 {total} 张图像")

if __name__ == "__main__":
    generate_batch("prompts.json", "output/")

参考资源

Z-Image Team

Z-Image Diffusers Python SDK 开发接入指南:从入门到生产环境 | Blog