Skip to content

vllm_omni.diffusion.models.gr00t.modeling.modules.dit

AdaLayerNorm

Bases: Module

chunk_dim (instance attribute)

chunk_dim = chunk_dim

linear (instance attribute)

linear = nn.Linear(embedding_dim, output_dim)

norm (instance attribute)

norm = nn.LayerNorm(
    output_dim // 2, norm_eps, norm_elementwise_affine
)

silu (instance attribute)

silu = nn.SiLU()

forward

forward(x: Tensor, temb: Tensor | None = None) -> Tensor

AlternateVLDiT

Bases: DiT

attend_text_every_n_blocks (instance attribute)

attend_text_every_n_blocks = attend_text_every_n_blocks

forward

forward(
    hidden_states: Tensor,
    encoder_hidden_states: Tensor,
    timestep: LongTensor | None = None,
    encoder_attention_mask: Tensor | None = None,
    return_all_hidden_states: bool = False,
    image_mask: Tensor | None = None,
    backbone_attention_mask: Tensor | None = None,
)

BasicTransformerBlock

Bases: Module

attn1 (instance attribute)

attn1 = Attention(
    query_dim=dim,
    heads=num_attention_heads,
    dim_head=attention_head_dim,
    dropout=dropout,
    bias=attention_bias,
    cross_attention_dim=cross_attention_dim,
    upcast_attention=upcast_attention,
    out_bias=attention_out_bias,
)

ff (instance attribute)

ff = FeedForward(
    dim,
    dropout=dropout,
    activation_fn=activation_fn,
    final_dropout=final_dropout,
    inner_dim=ff_inner_dim,
    bias=ff_bias,
)

final_dropout (instance attribute)

final_dropout = (
    nn.Dropout(dropout) if final_dropout else None
)

norm1 (instance attribute)

norm1 = AdaLayerNorm(dim)

norm3 (instance attribute)

norm3 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)

norm_type (instance attribute)

norm_type = norm_type

pos_embed (instance attribute)

pos_embed = SinusoidalPositionalEmbedding(
    dim, max_seq_length=num_positional_embeddings
)

forward

forward(
    hidden_states: Tensor,
    attention_mask: Tensor | None = None,
    encoder_hidden_states: Tensor | None = None,
    encoder_attention_mask: Tensor | None = None,
    temb: LongTensor | None = None,
) -> Tensor

DiT

Bases: ModelMixin, ConfigMixin

gradient_checkpointing (instance attribute)

gradient_checkpointing = False

inner_dim (instance attribute)

inner_dim = num_attention_heads * attention_head_dim

norm_out (instance attribute)

norm_out = nn.LayerNorm(
    self.inner_dim, elementwise_affine=False, eps=1e-06
)

proj_out_1 (instance attribute)

proj_out_1 = nn.Linear(self.inner_dim, 2 * self.inner_dim)

proj_out_2 (instance attribute)

proj_out_2 = nn.Linear(self.inner_dim, output_dim)

timestep_encoder (instance attribute)

timestep_encoder = TimestepEncoder(
    embedding_dim=self.inner_dim
)

transformer_blocks (instance attribute)

transformer_blocks = nn.ModuleList(all_blocks)

forward

forward(
    hidden_states: Tensor,
    encoder_hidden_states: Tensor,
    timestep: LongTensor | None = None,
    encoder_attention_mask: Tensor | None = None,
    return_all_hidden_states: bool = False,
)

SelfAttentionTransformer

Bases: ModelMixin, ConfigMixin

gradient_checkpointing (instance attribute)

gradient_checkpointing = False

inner_dim (instance attribute)

inner_dim = num_attention_heads * attention_head_dim

transformer_blocks (instance attribute)

transformer_blocks = nn.ModuleList(
    [
        (
            BasicTransformerBlock(
                self.inner_dim,
                num_attention_heads,
                attention_head_dim,
                dropout=dropout,
                activation_fn=activation_fn,
                attention_bias=attention_bias,
                upcast_attention=upcast_attention,
                positional_embeddings=positional_embeddings,
                num_positional_embeddings=max_num_positional_embeddings,
                final_dropout=final_dropout,
            )
        )
        for _ in (range(num_layers))
    ]
)

forward

forward(
    hidden_states: Tensor,
    return_all_hidden_states: bool = False,
)

TimestepEncoder

Bases: Module

time_proj (instance attribute)

time_proj = Timesteps(
    num_channels=256,
    flip_sin_to_cos=True,
    downscale_freq_shift=1,
)

timestep_embedder (instance attribute)

timestep_embedder = TimestepEmbedding(
    in_channels=256, time_embed_dim=embedding_dim
)

forward

forward(timesteps: Tensor) -> Tensor