Wan 2.2 I2V Fast (5B)

This guide walks you through deploying the Pruna-optimized Wan 2.2 Image-to-Video (I2V) 5B model.

What are the prerequisites?

To run the model, you’ll need:

HuggingFace token (HF_TOKEN): Enables you to download the optimized model.
Pruna token (PRUNA_TOKEN): Enables you to load and run the model.
An environment with pruna_pro installed: pip install pruna_pro==0.2.9

What inputs does the model support?

prompt: Input text to generate video from.
image: Input image to generate video from.
num_frames: Number of video frames. 121 frames give the best results
resolution: Output resolution, either 480p or 720p (both run on a single H100).
frames_per_second: FPS of the saved video. The example script below defaults to 21.
seed: Random seed. Leave blank for random
go_fast: Chooses between a very fast option (6 inference steps, when enabled) and a more conservative option (12 inference steps, when disabled).

How do I load the model?

You can initialize the Pruna-optimized WAN 2.2 i2v model directly with PrunaProModel.from_pretrained:


from pruna_pro import PrunaProModel

# Excerpt from a class method (hence `self`). The two string values are
# placeholders: pass your Pruna token as `token` (loads and runs the model)
# and your Hugging Face token as `hf_token` (downloads the weights).
self.pipe = PrunaProModel.from_pretrained(
    "PrunaAI/Wan2.2-I2V-5B-Diffusers",
    token="PRUNA_TOKEN",
    hf_token="HF_TOKEN",
)

What does a minimal working example look like?

Below is a complete script that sets up the pipeline, loads an image, and generates a short video. This is the fastest way to test the model end-to-end.


import tempfile

import numpy as np
import torch
from diffusers.utils import export_to_video, load_image
from diffusers.schedulers import FlowMatchEulerDiscreteScheduler

from pruna_pro import PrunaProModel


class Predictor:
    """Image-to-video predictor around the Pruna-optimized Wan 2.2 I2V 5B pipeline.

    Call ``setup()`` once to load and configure the pipeline, then call
    ``predict()`` for every video to generate.
    """

    # Pixel budget (width * height) for each supported resolution preset.
    _MAX_AREA = {"480p": 480 * 832, "720p": 704 * 1280}

    # Negative prompt sent to the pipeline on every call.
    _NEGATIVE_PROMPT = "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留，丑陋的，残缺的，多余的手指，画得不好的手部，画得不好的脸部，畸形的，毁容的，形态畸形的肢体，手指融合，静止不动的画面，杂乱的背景，三条腿，背景人很多，倒着走"

    def setup(self):
        """Load the pipeline, compile the transformer and configure the scheduler."""
        import logging

        logging.basicConfig(level=logging.INFO)
        # "PRUNA_TOKEN" and "HF_TOKEN" are placeholders: replace them with
        # your actual Pruna and Hugging Face tokens before running.
        self.pipe = PrunaProModel.from_pretrained(
            "PrunaAI/Wan2.2-I2V-5B-Diffusers",
            token="PRUNA_TOKEN",
            hf_token="HF_TOKEN",
        )
        self.pipe.transformer.forward = torch.compile(
            self.pipe.transformer.forward, fullgraph=False
        )
        # Rebuild the scheduler from the pipeline's own config, then set the shift.
        self.pipe.scheduler = FlowMatchEulerDiscreteScheduler.from_config(
            self.pipe.scheduler.config
        )
        self.pipe.scheduler.set_shift(5.0)

    @staticmethod
    def _fit_size(max_area, aspect_ratio, mod_value):
        """Return ``(height, width)`` fitting ``max_area`` at ``aspect_ratio``.

        ``aspect_ratio`` is height divided by width. Both results are rounded
        down to a multiple of ``mod_value``.
        """
        height = int(round(np.sqrt(max_area * aspect_ratio))) // mod_value * mod_value
        width = int(round(np.sqrt(max_area / aspect_ratio))) // mod_value * mod_value
        return height, width

    def predict(
        self,
        prompt,
        image,
        num_frames=121,
        resolution="720p",
        frames_per_second=21,
        go_fast=True,
        seed=None,
    ):
        """Generate a video from ``image`` guided by ``prompt``.

        Args:
            prompt: Text prompt describing the video to generate.
            image: Input image reference; converted with ``str()`` and passed
                to ``diffusers.utils.load_image``.
            num_frames: Number of frames requested from the pipeline.
            resolution: ``"480p"`` selects the smaller pixel budget; any other
                value is treated as ``"720p"``.
            frames_per_second: FPS used when writing the output video.
            go_fast: Run 6 inference steps when true, 12 otherwise.
            seed: Seed for the CUDA random generator. ``None`` leaves the
                generator unset so the pipeline picks its own randomness.

        Returns:
            Path of the generated ``output.mp4`` inside a fresh temporary
            directory.
        """
        # Seed only when the caller asked for it; the caller's seed is honoured.
        generator = None
        if seed is not None:
            generator = torch.Generator(device="cuda").manual_seed(seed)

        max_area = self._MAX_AREA.get(resolution, self._MAX_AREA["720p"])

        image = load_image(str(image))
        mod_value = (
            self.pipe.vae_scale_factor_spatial
            * self.pipe.transformer.config.patch_size[1]
        )
        # Keep the input's aspect ratio while staying within the pixel budget.
        height, width = self._fit_size(
            max_area, image.height / image.width, mod_value
        )
        image = image.resize((width, height))

        num_inference_steps = 6 if go_fast else 12

        # inference_mode() already disables gradient tracking.
        with torch.inference_mode():
            frames = self.pipe(
                image=image,
                prompt=prompt,
                negative_prompt=self._NEGATIVE_PROMPT,
                height=height,
                width=width,
                num_frames=num_frames,
                guidance_scale=1.0,
                num_inference_steps=num_inference_steps,
                generator=generator,
            ).frames[0]

        # The first generated frame is skipped before export
        # (presumably it duplicates the input image; confirm).
        output_path = f"{tempfile.mkdtemp()}/output.mp4"
        export_to_video(frames[1:], output_path, fps=frames_per_second)
        return output_path


if __name__ == "__main__":
    # Smoke test: load the model once, generate one video from a local image
    # using the default settings, and print where the video was written.
    demo_prompt = "A white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside."
    runner = Predictor()
    runner.setup()
    print(runner.predict(prompt=demo_prompt, image="i2v_input.jpg"))