Module vllm_omni.diffusion.models.nextstep_1_1.modeling_nextstep

NextStepConfig

Bases: LlamaConfig

boi instance-attribute

boi = boi

eoi instance-attribute

eoi = eoi

fm_head_batch_mul instance-attribute

fm_head_batch_mul = fm_head_batch_mul

fm_head_dim instance-attribute

fm_head_dim = fm_head_dim

fm_head_layers instance-attribute

fm_head_layers = fm_head_layers

im_loss_weight instance-attribute

im_loss_weight = im_loss_weight

image_placeholder_id instance-attribute

image_placeholder_id = image_placeholder_id

latent_channels instance-attribute

latent_channels = latent_channels

latent_patch_size instance-attribute

latent_patch_size = latent_patch_size

latent_size instance-attribute

latent_size = latent_size

lm_loss_weight instance-attribute

lm_loss_weight = lm_loss_weight

model_type class-attribute instance-attribute

model_type = 'nextstep'

o_attention_bias instance-attribute

o_attention_bias = (
    attention_bias
    if o_attention_bias is None
    else o_attention_bias
)

pad_token_id_added instance-attribute

pad_token_id_added = pad_token_id_added

vae_name_or_path instance-attribute

vae_name_or_path = vae_name_or_path

from_json classmethod

from_json(path: str) -> NextStepConfig

NextStepModel

Bases: Module

config instance-attribute

config = config

embed_tokens instance-attribute

embed_tokens = Embedding(
    vocab_size, hidden_size, padding_idx
)

image_head instance-attribute

image_head = FlowMatchingHead(
    input_dim=token_dim,
    cond_dim=hidden_size,
    dim=fm_head_dim,
    layers=fm_head_layers,
)

image_in_projector instance-attribute

image_in_projector = Linear(token_dim, hidden_size)

image_out_projector instance-attribute

image_out_projector = Linear(hidden_size, hidden_size)

layers instance-attribute

layers = ModuleList(
    [
        LlamaDecoderLayer(config, layer_idx)
        for layer_idx in range(num_hidden_layers)
    ]
)

lm_head instance-attribute

lm_head = Linear(hidden_size, vocab_size, bias=False)

norm instance-attribute

norm = LlamaRMSNorm(hidden_size, eps=rms_norm_eps)

padding_idx instance-attribute

padding_idx = pad_token_id

rotary_emb instance-attribute

rotary_emb = LlamaRotaryEmbedding(config=config)

vocab_size instance-attribute

vocab_size = vocab_size

forward_model

forward_model(
    inputs_embeds: FloatTensor,
    attention_mask: Tensor | None = None,
    past_key_values: Cache | list[FloatTensor] | None = None,
    use_cache: bool | None = None,
    output_attentions: bool | None = None,
    output_hidden_states: bool | None = None,
    cache_position: LongTensor | None = None,
) -> BaseModelOutputWithPast

gen_pos_embed_with_ar

gen_pos_embed_with_ar(h: int, w: int) -> Tensor

load_weights

load_weights(
    weights: Iterable[tuple[str, Tensor]],
) -> set[str]

patchify

patchify(img: Tensor) -> Tensor

prepare_inputs_embeds

prepare_inputs_embeds(
    input_ids: LongTensor,
    latents: FloatTensor | None = None,
) -> Tensor

unpatchify

unpatchify(
    x: Tensor, h: int | None = None, w: int | None = None
) -> Tensor

get_2d_sincos_pos_embed

get_2d_sincos_pos_embed(
    embed_dim: int, grid_size: int
) -> ndarray