Z-Image Diffusers Python SDK 开发接入指南:从入门到生产环境
关键词: z-image diffusers python pipeline
目录
- 简介
- 安装与配置
- ZImagePipeline 基础
- 文本到图像代码示例
- 图像到图像代码示例
- 图像修复代码示例
- 提示词格式化
- 参数调优
- GPU 优化
- 批量处理
- 生产部署模式
- 错误处理
- 实用示例
- 参考资源
简介
HuggingFace Diffusers 库为扩散模型提供了标准化的 Python 接口。本指南介绍如何使用 Diffusers 接入 Z-Image 模型进行本地 Python 开发,涵盖从基础安装到生产环境部署的完整流程。
与 ZI-044(REST API 接入)不同,本文专注于本地 Python 环境开发,适用于需要更细粒度控制、自定义数据处理或模型微调的场景。
安装与配置
基础安装
# 安装核心依赖
pip install diffusers transformers accelerate safetensors
# 安装 PyTorch(根据 CUDA 版本选择)
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
# 安装图像处理库
pip install pillow numpy opencv-python
可选依赖
# xFormers(显存优化)
pip install xformers
# ONNX 运行时(可选推理后端)
pip install onnxruntime-gpu
# 量化推理
pip install optimum bitsandbytes
验证安装
import torch
import diffusers
import transformers

# Query the CUDA state once and reuse it for the GPU line below.
cuda_ready = torch.cuda.is_available()
gpu_name = torch.cuda.get_device_name(0) if cuda_ready else "N/A"

# Library versions first, then what PyTorch reports about CUDA and the GPU.
print(f"PyTorch: {torch.__version__}")
print(f"Diffusers: {diffusers.__version__}")
print(f"Transformers: {transformers.__version__}")
print(f"CUDA available: {cuda_ready}")
print(f"CUDA version: {torch.version.cuda}")
print(f"GPU: {gpu_name}")
ZImagePipeline 基础
Diffusers 提供了几种与 Z-Image 配合使用的 Pipeline 类:
| Pipeline 类 | 用途 |
|---|---|
| ZImagePipeline | 文本到图像生成 |
| ZImageImg2ImgPipeline | 图像到图像转换 |
| ZImageInpaintPipeline | 图像修复(Inpainting) |
| StableDiffusionPipeline | 兼容模式(Flux-based) |
加载模型
import torch  # needed for torch.float16 below
from diffusers import ZImagePipeline

# Load from the HuggingFace Hub
pipe = ZImagePipeline.from_pretrained(
    "z-image/omni-base",
    torch_dtype=torch.float16,
    use_safetensors=True,
)

# Or load from a local path (use one of the two calls, not both)
pipe = ZImagePipeline.from_pretrained(
    "./models/z-image-omni-base",
    torch_dtype=torch.float16,
)

# Move the pipeline to the GPU
pipe.to("cuda")
模型加载选项
import torch
from diffusers import ZImagePipeline
from diffusers.quantizers import PipelineQuantizationConfig

# NF4 quantization (requires bitsandbytes) is requested through a pipeline
# quantization config; a bare `load_in_4bit=True` keyword is not one of the
# pipeline loading options.
# NOTE(review): verify these options against the installed diffusers version.
quantization_config = PipelineQuantizationConfig(
    quant_backend="bitsandbytes_4bit",
    quant_kwargs={
        "load_in_4bit": True,
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_compute_dtype": torch.float16,
    },
    components_to_quantize=["transformer"],
)

pipe = ZImagePipeline.from_pretrained(
    "z-image/omni-base",
    torch_dtype=torch.float16,  # FP16 precision
    use_safetensors=True,  # load weights in safetensors format
    variant="fp16",  # weight variant to pick
    device_map="balanced",  # pipeline-level placement ("auto" is a model-level option)
    low_cpu_mem_usage=True,  # reduce CPU memory use while loading
    quantization_config=quantization_config,  # NF4 quantization
)
文本到图像代码示例
基本生成
import torch
from diffusers import ZImagePipeline

# Build the pipeline in half precision and place it on the GPU.
pipe = ZImagePipeline.from_pretrained("z-image/omni-base", torch_dtype=torch.float16)
pipe.to("cuda")

# Text prompt plus a seeded generator so the run can be repeated.
prompt = "a serene lake at sunrise, mountains in the background, photorealistic, 4K quality"
seeded_generator = torch.Generator(device="cuda").manual_seed(42)

generation = pipe(
    prompt=prompt,
    width=1024,
    height=1024,
    num_inference_steps=30,
    guidance_scale=7.5,
    generator=seeded_generator,
)
image = generation.images[0]

# Write the first returned image to disk.
image.save("output.png")
使用负面提示词
# NOTE(review): the original note states that Z-Image uses dual text encoders
# (T5-XXL + CLIP-L) — confirm against the model card.
# The negative prompt is supplied through the `negative_prompt` argument.
negative_prompt = "blurry, low quality, deformed, distorted, bad anatomy, watermark, text, signature"

portrait_settings = dict(
    prompt="a professional headshot portrait, studio lighting, sharp focus",
    negative_prompt=negative_prompt,
    width=1024,
    height=1024,
    num_inference_steps=30,
    guidance_scale=7.5,
    generator=torch.Generator(device="cuda").manual_seed(123),
)
image = pipe(**portrait_settings).images[0]
image.save("portrait.png")
指定宽高比
# Width/height presets for several aspect ratios
aspect_ratios = {
    "1:1": (1024, 1024),
    "16:9": (1344, 768),
    "9:16": (768, 1344),
    "4:3": (1152, 896),
    "3:2": (1152, 768),
}

for name, (w, h) in aspect_ratios.items():
    image = pipe(
        prompt="a futuristic cityscape at night, neon lights, cyberpunk style",
        width=w,
        height=h,
        num_inference_steps=30,
        guidance_scale=7.5,
    ).images[0]
    # ":" is not a valid character in Windows file names, so the ratio
    # "16:9" is written as "16x9" in the output name.
    image.save(f"cityscape_{name.replace(':', 'x')}.png")
图像到图像代码示例
基本图像到图像
from diffusers import ZImageImg2ImgPipeline
from PIL import Image

# Load the img2img pipeline in half precision and move it to the GPU.
img2img_pipe = ZImageImg2ImgPipeline.from_pretrained("z-image/omni-base", torch_dtype=torch.float16)
img2img_pipe.to("cuda")

# Open the reference picture as RGB and scale it to 1024x1024.
input_image = Image.open("reference.jpg").convert("RGB").resize((1024, 1024))

# Run the image-to-image conversion with a fixed seed.
oil_generator = torch.Generator(device="cuda").manual_seed(456)
result = img2img_pipe(
    prompt="convert to oil painting style, thick brushstrokes, vivid colors",
    image=input_image,
    strength=0.75,  # repaint strength (0.0-1.0)
    num_inference_steps=30,
    guidance_scale=6.0,
    generator=oil_generator,
)
oil_painting = result.images[0]
oil_painting.save("oil_painting.png")
风格转换
# Suggested `strength` value for each degree of stylization
style_configs = {
    "轻微风格化": 0.3,
    "中度风格化": 0.5,
    "重度风格化": 0.75,
    "完全风格化": 0.9,
}

# Everything except `strength` is shared by all runs.
anime_settings = dict(
    prompt="anime art style, cel shading, vibrant colors, detailed",
    image=input_image,
    num_inference_steps=25,
    guidance_scale=5.5,
)

for style_name, strength in style_configs.items():
    result = img2img_pipe(strength=strength, **anime_settings)
    result.images[0].save(f"anime_{style_name}.png")
图像修复(Inpainting)
from diffusers import ZImageInpaintPipeline
import numpy as np

# Load the inpainting pipeline in half precision and move it to the GPU.
inpaint_pipe = ZImageInpaintPipeline.from_pretrained("z-image/omni-base", torch_dtype=torch.float16)
inpaint_pipe.to("cuda")

# Source photo, converted to RGB and resized to 1024x1024.
source_image = Image.open("photo.jpg").convert("RGB").resize((1024, 1024))

# Mask: white (255) marks the region to repaint; here, the right half.
mask = np.zeros((1024, 1024), dtype=np.uint8)
mask[:, 512:] = 255
mask_image = Image.fromarray(mask)

# Run the inpainting pass with a fixed seed.
inpaint_generator = torch.Generator(device="cuda").manual_seed(789)
result = inpaint_pipe(
    prompt="a lush garden with colorful flowers, natural sunlight",
    image=source_image,
    mask_image=mask_image,
    strength=0.85,
    num_inference_steps=30,
    guidance_scale=8.0,
    generator=inpaint_generator,
)
inpainted = result.images[0]
inpainted.save("inpainted.png")
提示词格式化
提示词结构
Z-Image 使用双文本编码器(T5-XXL + CLIP-L),提示词格式对输出质量有显著影响。
推荐格式
# Basic layout: subject + setting + style modifiers + quality modifiers
prompt = (
    "a [主体] "  # e.g. a young woman
    "in [环境] "  # e.g. in a cozy coffee shop
    "[风格修饰] "  # e.g. cinematic lighting, depth of field
    "[质量修饰]"  # e.g. photorealistic, 4K, highly detailed
)

# Concrete example. Adjacent string literals inside parentheses are joined
# into one string (the original used "/" as a line continuation, which is a
# syntax error).
prompt = (
    "a young woman in a cozy coffee shop, cinematic lighting, "
    "depth of field, photorealistic, 4K, highly detailed, "
    "shot on 85mm lens, golden hour"
)
不同风格的提示词模板
# Each template is written as adjacent string literals inside parentheses
# (the original used "/" as a line continuation, which is a syntax error).

# Photography style
photo_prompt = (
    "a [subject], natural lighting, shot on [camera/lens], "
    "[photo style], 4K, highly detailed, realistic"
)

# Anime style
anime_prompt = (
    "a [subject], anime style, cel shading, vibrant colors, "
    "detailed background, studio quality, key visual"
)

# Oil painting style
oil_prompt = (
    "a [subject], oil painting style, thick brushstrokes, "
    "classical composition, rich colors, museum quality"
)

# 3D render style
render_prompt = (
    "a [subject], 3D render, octane render, ray tracing, "
    "unreal engine 5, volumetric lighting, 8K"
)
负面提示词模板
# General-purpose negative prompt, assembled from individual terms.
default_negative = ", ".join(
    [
        "blurry", "low quality", "worst quality", "lowres",
        "deformed", "distorted", "disfigured", "bad anatomy",
        "watermark", "text", "signature", "username",
        "extra limbs", "extra fingers", "fused fingers",
    ]
)

# Negative prompt for photography
photo_negative = ", ".join(
    [
        "oversaturated", "underexposed", "overexposed",
        "motion blur", "lens flare", "noise", "grain",
        "jpeg artifacts", "compression artifacts",
    ]
)

# Negative prompt for anime
anime_negative = ", ".join(
    [
        "3D render", "photorealistic", "realistic",
        "bad anatomy", "deformed hands", "extra fingers",
        "watermark", "text", "signature",
    ]
)
参数调优
Guidance Scale(引导尺度)
# Guidance scale reference table
# Range: 1.0 - 15.0
# The recommended value depends on the task
guidance_configs = {
    "文本到图像": {"min": 5.0, "max": 10.0, "default": 7.5},
    "图像到图像": {"min": 3.0, "max": 7.0, "default": 5.5},
    "图像修复": {"min": 5.0, "max": 10.0, "default": 8.0},
}

# Sweep several guidance scales with the same prompt and seed so that the
# saved images differ only in the scale used.
eagle_prompt = "a majestic eagle flying over mountains"
for cfg in (3.0, 5.0, 7.5, 10.0, 12.0):
    fixed_seed = torch.Generator(device="cuda").manual_seed(42)
    sweep_output = pipe(
        prompt=eagle_prompt,
        width=1024,
        height=1024,
        num_inference_steps=30,
        guidance_scale=cfg,
        generator=fixed_seed,
    )
    image = sweep_output.images[0]
    image.save(f"cfg_{cfg:.1f}.png")
推理步数
# Step counts and their quality/speed trade-off
steps_configs = [
    {"steps": 10, "description": "快速预览"},
    {"steps": 20, "description": "日常使用"},
    {"steps": 30, "description": "高质量"},
    {"steps": 50, "description": "极致质量"},
]

# Same prompt and seed for every run; only the step count changes.
watch_prompt = "a detailed mechanical watch interior, macro photography"
for cfg in steps_configs:
    step_count = cfg["steps"]
    fixed_seed = torch.Generator(device="cuda").manual_seed(42)
    image = pipe(
        prompt=watch_prompt,
        width=1024,
        height=1024,
        num_inference_steps=step_count,
        guidance_scale=7.5,
        generator=fixed_seed,
    ).images[0]
    image.save(f"steps_{step_count}.png")
种子控制
# A fixed seed makes a run reproducible.
seed = 42
generator = torch.Generator(device="cuda").manual_seed(seed)

# Seed variation: ten consecutive seeds explore different outputs for the
# same prompt.
base_seed = 42
cat_prompt = "a cat wearing sunglasses, funny pose, studio background"
for current_seed in range(base_seed, base_seed + 10):
    generator = torch.Generator(device="cuda").manual_seed(current_seed)
    variation = pipe(
        prompt=cat_prompt,
        width=1024,
        height=1024,
        num_inference_steps=30,
        guidance_scale=7.5,
        generator=generator,
    )
    image = variation.images[0]
    image.save(f"cat_var_{current_seed}.png")
Strength 参数(img2img / inpaint)
# Strength 参数说明:
# 0.0 = 完全不改变原图
# 0.5 = 中度修改
# 1.0 = 完全重绘(等同于从噪声开始)
# img2img 推荐范围: 0.3 - 0.75
# inpaint 推荐范围: 0.7 - 0.95
GPU 优化
xFormers
# Switch the pipeline to xFormers attention when the package is usable.
try:
    import xformers  # imported only to check that the package is installed

    pipe.enable_xformers_memory_efficient_attention()
    print("xFormers enabled")
except ImportError:
    # Package missing: report that the default attention stays in use.
    print("xFormers not available, using default attention")
except Exception as err:
    # Any other failure is reported but does not stop the script.
    print(f"xFormers error: {err}")
Tensor Float 32
# 在 Ampere+ GPU (RTX 30xx, A100 等) 上启用 TF32
import torch
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
Torch.compile
# Speed up inference with torch.compile (PyTorch 2.0+).
# ZImagePipeline keeps its denoising network in `pipe.transformer`; there is
# no `pipe.unet` attribute to compile.
pipe.transformer = torch.compile(pipe.transformer)
# Note: the first inference call pays the compilation cost.
# The guide quotes roughly a 10-30% speed-up for subsequent calls.
内存优化
# Option A: model CPU offload (for when VRAM is insufficient)
pipe.enable_model_cpu_offload()
# Option B: manage placement by hand — move to the GPU, run inference,
# move back to the CPU, then release cached CUDA memory
pipe.to("cuda")
# ... run inference ...
pipe.to("cpu")
torch.cuda.empty_cache()
# Option C: low-VRAM mode (sequential CPU offload)
pipe.enable_sequential_cpu_offload()
# NOTE(review): the original presents these as alternatives ("or");
# presumably only one option should be applied to a given pipeline — confirm.
混合优化
# Combine several optimizations
pipe = ZImagePipeline.from_pretrained(
    "z-image/omni-base",
    torch_dtype=torch.float16,
)

# xFormers is optional: keep going with the default attention when it cannot
# be enabled, but report why instead of swallowing the error silently.
try:
    pipe.enable_xformers_memory_efficient_attention()
except Exception as err:
    print(f"xFormers not enabled, using default attention: {err}")

pipe.enable_attention_slicing("max")
pipe.enable_vae_slicing()
pipe.to("cuda")

# TF32 (Ampere+ only)
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
批量处理
基础批量生成
# Option 1: hand the pipeline a list of prompts in a single call
# (no separate batch-size argument is passed here).
prompts = [
    "a forest path at dawn",
    "ocean waves crashing on rocks",
    "snowy mountain peak at sunset",
    "desert landscape with cacti",
]

batch_settings = dict(
    width=1024,
    height=1024,
    num_inference_steps=30,
    guidance_scale=7.5,
)
results = pipe(prompt=prompts, **batch_settings)

# Save every returned image under its position in the result list.
for i, img in enumerate(results.images):
    img.save(f"batch_{i}.png")
批量处理脚本
import os
import json
import torch
from diffusers import ZImagePipeline
from pathlib import Path
class BatchImageGenerator:
    """Generate images in bulk with one shared ZImagePipeline instance."""

    def __init__(self, model_path: str, device: str = "cuda"):
        """Load the pipeline from *model_path* in FP16 and move it to *device*."""
        self.pipe = ZImagePipeline.from_pretrained(
            model_path,
            torch_dtype=torch.float16,
        )
        self.pipe.to(device)
        self.device = device

    def generate_batch(self, config_file: str, output_dir: str):
        """Generate one image per entry of a JSON config file.

        Args:
            config_file: Path to a JSON file holding a list of objects. Each
                object needs a "prompt" key; "width", "height", "steps",
                "guidance_scale" and "seed" are optional.
            output_dir: Directory for the images (created when missing).
                Files are named ``img_0000.png``, ``img_0001.png``, ...

        A failure while generating one entry is printed and the loop moves on
        to the next entry.
        """
        # Read as UTF-8 explicitly so non-ASCII prompts do not depend on the
        # platform's default encoding.
        with open(config_file, encoding="utf-8") as f:
            configs = json.load(f)
        os.makedirs(output_dir, exist_ok=True)

        for i, cfg in enumerate(configs):
            prompt = cfg["prompt"]
            width = cfg.get("width", 1024)
            height = cfg.get("height", 1024)
            steps = cfg.get("steps", 30)
            cfg_scale = cfg.get("guidance_scale", 7.5)
            seed = cfg.get("seed", 42)
            generator = torch.Generator(device=self.device).manual_seed(seed)
            try:
                result = self.pipe(
                    prompt=prompt,
                    width=width,
                    height=height,
                    num_inference_steps=steps,
                    guidance_scale=cfg_scale,
                    generator=generator,
                )
                output_path = os.path.join(output_dir, f"img_{i:04d}.png")
                result.images[0].save(output_path)
                print(f"Generated: {output_path}")
            except Exception as e:
                # Best effort: one bad entry must not abort the whole batch.
                print(f"Error generating {i}: {e}")

    def generate_grid(self, prompt: str, seeds: list, output_path: str):
        """Render *prompt* once per seed and save the results as one grid image.

        Args:
            prompt: Text prompt shared by every tile.
            seeds: Seeds to compare; one 512x512 tile is generated per seed.
            output_path: Destination file for the combined grid.

        Raises:
            ValueError: If *seeds* is empty.
        """
        # The pipeline returns PIL images, so the grid is assembled with PIL.
        # torchvision's make_grid expects tensors and returns a tensor, which
        # has no save() method.
        from PIL import Image

        if not seeds:
            raise ValueError("seeds must contain at least one value")

        images = []
        for seed in seeds:
            generator = torch.Generator(device=self.device).manual_seed(seed)
            result = self.pipe(
                prompt=prompt,
                width=512,
                height=512,
                num_inference_steps=20,
                guidance_scale=7.5,
                generator=generator,
            )
            images.append(result.images[0].convert("RGB"))

        # Tiles per row follow the original choice of floor(sqrt(count));
        # the row count is the ceiling of count / columns.
        cols = max(1, int(len(images) ** 0.5))
        rows = -(-len(images) // cols)
        tile_w, tile_h = images[0].size

        grid = Image.new("RGB", (cols * tile_w, rows * tile_h))
        for idx, img in enumerate(images):
            grid.paste(img, ((idx % cols) * tile_w, (idx // cols) * tile_h))
        grid.save(output_path)
配置文件示例 (config.json)
[
{
"prompt": "a magical forest with glowing mushrooms, fantasy art",
"width": 1024,
"height": 1024,
"steps": 30,
"guidance_scale": 7.5,
"seed": 1001
},
{
"prompt": "a steampunk airship above clouds, cinematic lighting",
"width": 1344,
"height": 768,
"steps": 30,
"guidance_scale": 7.5,
"seed": 1002
},
{
"prompt": "underwater coral reef, vibrant colors, macro photography",
"width": 1024,
"height": 1024,
"steps": 25,
"guidance_scale": 7.0,
"seed": 1003
}
]
生产部署模式
FastAPI 服务
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from PIL import Image
import io
import torch
from diffusers import ZImagePipeline
app = FastAPI(title="Z-Image Generation API")
# 全局模型实例
pipe = None
# Request body for POST /generate; every field except `prompt` has a default.
class GenerateRequest(BaseModel):
    prompt: str
    negative_prompt: str = ""  # empty string is sent to the pipeline as None
    width: int = 1024
    height: int = 1024
    steps: int = 30  # forwarded as num_inference_steps
    guidance_scale: float = 7.5
    seed: int = 42


# Response body: the generated PNG plus the settings that produced it.
class GenerateResponse(BaseModel):
    image_bytes: str  # base64 encoded
    metadata: dict
@app.on_event("startup")
async def load_model():
    """Load the pipeline once at application startup and move it to the GPU."""
    global pipe
    pipe = ZImagePipeline.from_pretrained(
        "z-image/omni-base",
        torch_dtype=torch.float16,
    )
    pipe.to("cuda")
    # xFormers is an optional optimization: keep serving with the default
    # attention when it cannot be enabled, but report the reason instead of
    # swallowing it silently.
    try:
        pipe.enable_xformers_memory_efficient_attention()
    except Exception as err:
        print(f"xFormers not enabled, using default attention: {err}")
@app.post("/generate", response_model=GenerateResponse)
async def generate(req: GenerateRequest):
    """Generate one image for the request and return it base64-encoded.

    Returns:
        GenerateResponse with the PNG bytes (base64 text) and the settings used.

    Raises:
        HTTPException: 503 when the model has not been loaded yet; 500 when
            generation or encoding fails.
    """
    import base64

    # Fail with a clear status instead of a "'NoneType' is not callable" 500
    # when a request arrives before startup has finished loading the model.
    if pipe is None:
        raise HTTPException(status_code=503, detail="Model is not loaded yet")

    # NOTE(review): the pipeline call below is synchronous inside an async
    # handler, so it occupies the event loop while an image is generated.
    try:
        generator = torch.Generator(device="cuda").manual_seed(req.seed)
        result = pipe(
            prompt=req.prompt,
            negative_prompt=req.negative_prompt or None,
            width=req.width,
            height=req.height,
            num_inference_steps=req.steps,
            guidance_scale=req.guidance_scale,
            generator=generator,
        )

        # Encode the first image as PNG, then as base64 text.
        img = result.images[0]
        buf = io.BytesIO()
        img.save(buf, format="PNG")
        img_b64 = base64.b64encode(buf.getvalue()).decode("utf-8")

        return GenerateResponse(
            image_bytes=img_b64,
            metadata={
                "width": req.width,
                "height": req.height,
                "steps": req.steps,
                "guidance_scale": req.guidance_scale,
                "seed": req.seed,
            },
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
# 运行: uvicorn main:app --host 0.0.0.0 --port 8000
Flask 服务
from flask import Flask, request, jsonify, Response
from PIL import Image
import io
import torch
from diffusers import ZImagePipeline
app = Flask(__name__)
pipe = None
def initialize():
    """Load the pipeline on first use and move it to the GPU.

    The original registered this function with ``app.before_first_request``.
    That hook was removed in Flask 2.3, so the request handler calls this
    function instead; later calls return immediately once the model is loaded.
    """
    global pipe
    if pipe is None:
        loaded = ZImagePipeline.from_pretrained(
            "z-image/omni-base",
            torch_dtype=torch.float16,
        )
        loaded.to("cuda")
        pipe = loaded


@app.route("/generate", methods=["POST"])
def generate():
    """Generate an image from the JSON request body and return it as a PNG.

    Recognized keys: "prompt", "width", "height", "steps", "guidance_scale"
    and "seed"; each falls back to a default when absent.
    """
    initialize()

    # A missing or non-JSON body yields an empty dict, so the defaults apply
    # instead of failing on ``None.get``.
    data = request.get_json(silent=True) or {}
    prompt = data.get("prompt", "")
    width = data.get("width", 1024)
    height = data.get("height", 1024)
    steps = data.get("steps", 30)
    cfg = data.get("guidance_scale", 7.5)
    seed = data.get("seed", 42)

    generator = torch.Generator(device="cuda").manual_seed(seed)
    result = pipe(
        prompt=prompt,
        width=width,
        height=height,
        num_inference_steps=steps,
        guidance_scale=cfg,
        generator=generator,
    )

    # Serialize the first image to PNG in memory and send it as a download.
    img = result.images[0]
    buf = io.BytesIO()
    img.save(buf, format="PNG")
    return Response(
        buf.getvalue(),
        mimetype="image/png",
        headers={"Content-Disposition": "attachment; filename=output.png"},
    )
# 运行: flask --app flask_app run --host 0.0.0.0 --port 5000
错误处理
常见错误及处理
import traceback
def safe_generate(pipe, prompt: str, *, min_size: int = 256, **kwargs):
    """Call *pipe* and retry at half resolution when the GPU runs out of memory.

    Args:
        pipe: Callable pipeline, invoked as ``pipe(prompt=prompt, **kwargs)``.
        prompt: Text prompt forwarded to the pipeline.
        min_size: Smallest width/height the retry loop may fall back to.
        **kwargs: Extra pipeline arguments. "width" and "height" default to
            1024 when the retry logic has to halve them.

    Returns:
        The pipeline result, or None when generation fails or the resolution
        cannot be reduced any further.
    """
    params = dict(kwargs)
    floor = max(1, min_size)  # never let the retry loop reach a 0-pixel size

    # Iterative retry: the original recursed without a lower bound, halving
    # the size down to 0 when every attempt ran out of memory.
    while True:
        try:
            return pipe(prompt=prompt, **params)
        except torch.cuda.OutOfMemoryError as e:
            print(f"显存不足: {e}")
            torch.cuda.empty_cache()
            # Retry at half the resolution
            width = params.get("width", 1024) // 2
            height = params.get("height", 1024) // 2
            if width < floor or height < floor:
                print(f"已降至最小分辨率 {floor},放弃生成")
                return None
            params["width"] = width
            params["height"] = height
        except RuntimeError as e:
            if "CUDA" in str(e):
                print(f"CUDA 错误: {e}")
                torch.cuda.empty_cache()
            else:
                print(f"运行时错误: {e}")
            return None
        except Exception as e:
            print(f"未知错误: {e}")
            traceback.print_exc()
            return None
# 使用示例
result = safe_generate(
pipe,
prompt="a beautiful landscape",
width=1024,
height=1024,
num_inference_steps=30,
guidance_scale=7.5,
generator=torch.Generator(device="cuda").manual_seed(42)
)
模型加载错误处理
def load_model_safe(model_path: str, fallback_path: str = None):
    """Load a ZImagePipeline, trying *fallback_path* when *model_path* is unavailable.

    Args:
        model_path: Hub id or local directory to load first.
        fallback_path: Optional second location, tried once when the first
            cannot be found or read.

    Returns:
        The loaded pipeline, or None when loading fails.
    """
    try:
        pipe = ZImagePipeline.from_pretrained(
            model_path,
            torch_dtype=torch.float16,
            use_safetensors=True,
        )
        print(f"模型加载成功: {model_path}")
        return pipe
    except OSError:
        # OSError covers FileNotFoundError as well as the OSError/EnvironmentError
        # that loading raises for a missing repository or missing files, so
        # the fallback is reached in those cases too.
        print(f"模型文件不存在: {model_path}")
        if fallback_path:
            print(f"尝试备用路径: {fallback_path}")
            return load_model_safe(fallback_path)
        return None
    except Exception as e:
        print(f"模型加载失败: {e}")
        return None
实用示例
简单生成脚本
#!/usr/bin/env python3
"""Z-Image 简单图像生成脚本"""
import argparse
import torch
from diffusers import ZImagePipeline
def _build_parser():
    """Create the command-line parser for the generation script."""
    parser = argparse.ArgumentParser(description="Z-Image 图像生成")
    parser.add_argument("--prompt", type=str, required=True, help="生成提示词")
    parser.add_argument("--output", type=str, default="output.png", help="输出文件")
    # The remaining options share one shape: flag, type, default.
    plain_options = (
        ("--width", int, 1024),
        ("--height", int, 1024),
        ("--steps", int, 30),
        ("--cfg", float, 7.5),
        ("--seed", int, 42),
        ("--model", str, "z-image/omni-base"),
        ("--neg", str, ""),
    )
    for flag, kind, default in plain_options:
        parser.add_argument(flag, type=kind, default=default)
    return parser


def main():
    """Parse the command line, generate one image and save it to disk."""
    args = _build_parser().parse_args()

    # Load the model in half precision and place it on the GPU.
    pipe = ZImagePipeline.from_pretrained(args.model, torch_dtype=torch.float16)
    pipe.to("cuda")

    # Generate with a seeded generator; an empty --neg is passed as None.
    seeded = torch.Generator(device="cuda").manual_seed(args.seed)
    negative = args.neg or None
    result = pipe(
        prompt=args.prompt,
        negative_prompt=negative,
        width=args.width,
        height=args.height,
        num_inference_steps=args.steps,
        guidance_scale=args.cfg,
        generator=seeded,
    )

    first_image = result.images[0]
    first_image.save(args.output)
    print(f"图像已保存: {args.output}")
if __name__ == "__main__":
main()
# 使用: python generate.py --prompt "a cat" --output cat.png --seed 123
批量图像生成器
#!/usr/bin/env python3
"""批量图像生成器"""
import json
import os
import torch
from diffusers import ZImagePipeline
from pathlib import Path
import time
def generate_batch(prompts_file, output_dir, model_path="z-image/omni-base"):
    """Generate one image per entry of a JSON prompt file.

    Args:
        prompts_file: Path to a JSON list of objects. Each needs a "prompt"
            key; "seed", "width", "height", "steps" and "guidance_scale" are
            optional. A missing seed defaults to ``42 + index``.
        output_dir: Directory for the images (created when missing). Files
            are named ``img_0000.png``, ``img_0001.png``, ...
        model_path: Hub id or local path of the model to load.
    """
    os.makedirs(output_dir, exist_ok=True)

    pipe = ZImagePipeline.from_pretrained(
        model_path,
        torch_dtype=torch.float16,
    )
    pipe.to("cuda")

    # Read as UTF-8 explicitly so non-ASCII prompts do not depend on the
    # platform's default encoding.
    with open(prompts_file, encoding="utf-8") as f:
        prompts = json.load(f)

    total = len(prompts)
    for i, item in enumerate(prompts):
        start = time.time()
        prompt = item["prompt"]
        seed = item.get("seed", 42 + i)
        generator = torch.Generator(device="cuda").manual_seed(seed)
        result = pipe(
            prompt=prompt,
            width=item.get("width", 1024),
            height=item.get("height", 1024),
            num_inference_steps=item.get("steps", 30),
            guidance_scale=item.get("guidance_scale", 7.5),
            generator=generator,
        )
        output_path = os.path.join(output_dir, f"img_{i:04d}.png")
        result.images[0].save(output_path)
        elapsed = time.time() - start
        print(f"[{i+1}/{total}] Saved {output_path} ({elapsed:.1f}s)")

    print(f"完成!共生成 {total} 张图像")
if __name__ == "__main__":
generate_batch("prompts.json", "output/")
参考资源
- HuggingFace Diffusers 文档: https://huggingface.co/docs/diffusers
- Z-Image HuggingFace: https://huggingface.co/z-image
- Diffusers GitHub: https://github.com/huggingface/diffusers
- ComfyUI 项目: https://github.com/comfyanonymous/ComfyUI
- xFormers 文档: https://github.com/facebookresearch/xformers
- FastAPI 文档: https://fastapi.tiangolo.com/
- Torch.compile 文档: https://pytorch.org/docs/stable/generated/torch.compile.html