Skip to content

vllm_omni.diffusion.models.audiox.pipeline_audiox

logger module-attribute

logger = init_logger(__name__)

prepare_audio_reference module-attribute

prepare_audio_reference = (
    _audiox_transforms.prepare_audio_reference
)

prepare_video_reference module-attribute

prepare_video_reference = (
    _audiox_transforms.prepare_video_reference
)

AudioVaePromptAdapter

Bases: Module

pretransform instance-attribute

pretransform = _build_audiox_oobleck()

proj_features_128 instance-attribute

proj_features_128 = nn.Linear(latent_seq_len, 128)

proj_out instance-attribute

proj_out = (
    nn.Linear(in_ch, cond_dim)
    if in_ch != cond_dim
    else nn.Identity()
)

forward

forward(audio: Tensor) -> tuple[Tensor, Tensor]

AudioXPipeline

Bases: Module, SupportAudioOutput, DiffusionPipelineProfilerMixin

audio_channels class-attribute

audio_channels: int = 2

audio_sample_rate class-attribute

audio_sample_rate: int = 44100

audio_vae_adapter instance-attribute

audio_vae_adapter = AudioVaePromptAdapter(
    cond_dim=int(model_config["conditioning"]["cond_dim"]),
    latent_seq_len=int(
        cond_configs["audio_prompt"]["latent_seq_len"]
    ),
)

clip_empty_visual_feat instance-attribute

clip_empty_visual_feat = nn.Parameter(
    torch.zeros(1, self._clip_out_features, _DIM),
    requires_grad=False,
)

clip_encoder instance-attribute

clip_encoder = CLIPVisionModelWithProjection(
    clip_config.vision_config
)

clip_proj instance-attribute

clip_proj = nn.Linear(_in_features, self._clip_out_features)

clip_proj_sync instance-attribute

clip_proj_sync = nn.Linear(240, self._clip_out_features)

clip_sync_weight instance-attribute

clip_sync_weight = nn.Parameter(torch.tensor(0.0))

clip_temp_pos_embedding instance-attribute

clip_temp_pos_embedding = nn.Parameter(
    torch.randn(1, _VIDEO_FPS * _DURATION_SEC, _DIM)
)

clip_temp_transformer instance-attribute

clip_temp_transformer = SA_Transformer(
    _DIM, depth=4, heads=16, dim_head=64, mlp_dim=_DIM * 4
)

device instance-attribute

device = get_local_device()

diffusion_objective instance-attribute

diffusion_objective = 'v'

io_channels instance-attribute

io_channels = model_config['io_channels']

maf_block instance-attribute

maf_block = MAF_Block(
    dim=768,
    num_experts_per_modality=int(
        gate_type_config["num_experts_per_modality"]
    ),
    num_heads=int(gate_type_config["num_heads"]),
    num_fusion_layers=int(
        gate_type_config["num_fusion_layers"]
    ),
)

model instance-attribute

model = MMDiffusionTransformer(
    **(dict(diffusion_config["config"]))
)

od_config instance-attribute

od_config = od_config

pretransform instance-attribute

pretransform = _build_audiox_oobleck(
    scaling_factor=float(
        model_config["pretransform"].get("scale", 1.0)
    )
)

support_audio_output class-attribute

support_audio_output: bool = True

text_encoder instance-attribute

text_encoder = (
    T5EncoderModel(t5_config)
    .train(False)
    .requires_grad_(False)
    .to(torch.float16)
)

tokenizer instance-attribute

tokenizer = T5TokenizerFast.from_pretrained(t5_name)

weights_sources instance-attribute

weights_sources = [
    DiffusersPipelineLoader.ComponentSource(
        model_or_path=self._model_root,
        subfolder="transformer",
        revision=getattr(od_config, "revision", None),
        prefix="",
    )
]

diffuse

diffuse(
    *,
    steps: int,
    guidance_scale: float,
    conditioning_tensors: dict[str, Any],
    negative_conditioning_tensors: dict[str, Any] | None,
    batch_size: int,
    sigma_min: float,
    sigma_max: float,
    generator: Generator,
    cfg_rescale: float,
) -> Tensor

forward

get_conditioning_inputs

get_conditioning_inputs(
    conditioning_tensors: dict[str, Any],
    negative: bool = False,
) -> dict[str, Any]

load_weights

load_weights(
    weights: Iterable[tuple[str, Tensor]],
) -> set[str]

MAF_Block

Bases: Module

DIM class-attribute instance-attribute

DIM = 768

MLP_RATIO class-attribute instance-attribute

MLP_RATIO = 4.0

bypass_gate_a instance-attribute

bypass_gate_a = nn.Parameter(torch.tensor(-10.0))

bypass_gate_t instance-attribute

bypass_gate_t = nn.Parameter(torch.tensor(-10.0))

bypass_gate_v instance-attribute

bypass_gate_v = nn.Parameter(torch.tensor(-10.0))

cross_block instance-attribute

cross_block = _MAFCrossAttentionBlock(dim, num_heads)

fusion_blocks instance-attribute

fusion_blocks = nn.ModuleList(
    [
        (_MAFFusionBlock(dim, num_heads, mlp_ratio))
        for _ in (range(num_fusion_layers))
    ]
)

gating_network instance-attribute

gating_network = nn.Sequential(
    nn.Linear(dim * 3, dim),
    nn.GELU(),
    nn.Linear(dim, 3),
    nn.Sigmoid(),
)

norm1 instance-attribute

norm1 = nn.LayerNorm(dim)

norm_a2 instance-attribute

norm_a2 = nn.LayerNorm(dim)

norm_t2 instance-attribute

norm_t2 = nn.LayerNorm(dim)

norm_v2 instance-attribute

norm_v2 = nn.LayerNorm(dim)

unified_experts instance-attribute

unified_experts = nn.Parameter(
    torch.randn(total_experts, dim)
)

forward

forward(
    video_tokens: Tensor,
    text_tokens: Tensor,
    audio_tokens: Tensor,
) -> dict[str, Tensor]

SA_Attention

Bases: Module

heads instance-attribute

heads = heads

scale instance-attribute

scale = dim_head ** -0.5

to_out instance-attribute

to_out = (
    nn.Sequential(
        nn.Linear(inner_dim, dim), nn.Dropout(0.0)
    )
    if project_out
    else nn.Identity()
)

to_qkv instance-attribute

to_qkv = nn.Linear(dim, inner_dim * 3, bias=False)

forward

forward(x)

SA_FeedForward

Bases: Module

net instance-attribute

net = nn.Sequential(
    nn.Linear(dim, hidden_dim),
    nn.GELU(),
    nn.Dropout(0.0),
    nn.Linear(hidden_dim, dim),
    nn.Dropout(0.0),
)

forward

forward(x)

SA_PreNorm

Bases: Module

fn instance-attribute

fn = fn

norm instance-attribute

norm = nn.LayerNorm(dim)

forward

forward(x, **kwargs)

SA_Transformer

Bases: Module

layers instance-attribute

layers = nn.ModuleList([])

norm instance-attribute

norm = nn.LayerNorm(dim)

forward

forward(x)

get_audiox_post_process_func

get_audiox_post_process_func(
    od_config: OmniDiffusionConfig,
)

Convert the pipeline's float audio tensor to a CPU NumPy array for serving.