vllm_omni.diffusion.models.gr00t.modeling.modules.dit ¶
AdaLayerNorm ¶
Bases: Module
BasicTransformerBlock ¶
Bases: Module
attn1 instance-attribute ¶
attn1 = Attention(
query_dim=dim,
heads=num_attention_heads,
dim_head=attention_head_dim,
dropout=dropout,
bias=attention_bias,
cross_attention_dim=cross_attention_dim,
upcast_attention=upcast_attention,
out_bias=attention_out_bias,
)
ff instance-attribute ¶
ff = FeedForward(
dim,
dropout=dropout,
activation_fn=activation_fn,
final_dropout=final_dropout,
inner_dim=ff_inner_dim,
bias=ff_bias,
)
final_dropout instance-attribute ¶
pos_embed instance-attribute ¶
forward ¶
forward(
hidden_states: Tensor,
attention_mask: Tensor | None = None,
encoder_hidden_states: Tensor | None = None,
encoder_attention_mask: Tensor | None = None,
temb: LongTensor | None = None,
) -> Tensor
DiT ¶
Bases: ModelMixin, ConfigMixin
norm_out instance-attribute ¶
timestep_encoder instance-attribute ¶
timestep_encoder = TimestepEncoder(
embedding_dim=self.inner_dim
)
SelfAttentionTransformer ¶
Bases: ModelMixin, ConfigMixin
transformer_blocks instance-attribute ¶
transformer_blocks = nn.ModuleList(
[
(
BasicTransformerBlock(
self.inner_dim,
num_attention_heads,
attention_head_dim,
dropout=dropout,
activation_fn=activation_fn,
attention_bias=attention_bias,
upcast_attention=upcast_attention,
positional_embeddings=positional_embeddings,
num_positional_embeddings=max_num_positional_embeddings,
final_dropout=final_dropout,
)
)
for _ in (range(num_layers))
]
)
TimestepEncoder ¶
Bases: Module