SmolVLM Model Architecture

from transformers import AutoProcessor, AutoModelForImageTextToText
import torch

model_path = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
processor = AutoProcessor.from_pretrained(model_path)


model = AutoModelForImageTextToText.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    # use "flash_attention_2" instead if flash-attn is installed
    # _attn_implementation="flash_attention_2",
    _attn_implementation="sdpa"
).to("cuda")

print(model)
print(model.config)

"""
SmolVLMForConditionalGeneration(
  (model): SmolVLMModel(
    (vision_model): SmolVLMVisionTransformer(
      (embeddings): SmolVLMVisionEmbeddings(
        (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), padding=valid)
        (position_embedding): Embedding(1024, 768)
      )
      (encoder): SmolVLMEncoder(
        (layers): ModuleList(
          (0-11): 12 x SmolVLMEncoderLayer(
            (self_attn): SmolVLMVisionAttention(
              (k_proj): Linear(in_features=768, out_features=768, bias=True)
              (v_proj): Linear(in_features=768, out_features=768, bias=True)
              (q_proj): Linear(in_features=768, out_features=768, bias=True)
              (out_proj): Linear(in_features=768, out_features=768, bias=True)
            )
            (layer_norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
            (mlp): SmolVLMVisionMLP(
              (activation_fn): GELUTanh()
              (fc1): Linear(in_features=768, out_features=3072, bias=True)
              (fc2): Linear(in_features=3072, out_features=768, bias=True)
            )
            (layer_norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
          )
        )
      )
      (post_layernorm): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      # each patch token is a 768-dim vector
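      # the position table has 1024 slots = 32 x 32 patches, i.e. one 512 x 512 input per crop,
      # so each crop leaves the vision tower as a (1024, 768) sequence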
    )
    (connector): SmolVLMConnector(
      (modality_projection): SmolVLMSimpleMLP(
        (proj): Linear(in_features=12288, out_features=576, bias=False)
        # 16 patches are merged into one input token, so 768 x 16 = 12288
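        # pixel shuffle merges each 4 x 4 window of patches into one token:
        # (1024, 768) -> (64, 12288) -> proj -> (64, 576), i.e. 64 image tokens per crop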
      )
    )
    (text_model): LlamaModel(
      (embed_tokens): Embedding(49280, 576, padding_idx=2) 
      # this is not the shape of the whole LM input; it specifies how many dimensions each text token ID is mapped to
      # 49280 = vocabulary size = number of distinct tokens
      # 576   = hidden size    = length of the vector representing each token
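      # 576 also matches the connector's output size, so projected image tokens can be interleaved directly with these text embeddings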
      (layers): ModuleList(
        (0-29): 30 x LlamaDecoderLayer(
          (self_attn): LlamaAttention(
            (q_proj): Linear(in_features=576, out_features=576, bias=False)
            (k_proj): Linear(in_features=576, out_features=192, bias=False) # smaller K/V projections save memory and compute
            (v_proj): Linear(in_features=576, out_features=192, bias=False)
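            # grouped-query attention: 9 query heads x 64 dims = 576, but only 3 K/V heads x 64 dims = 192,
            # so each K/V head is shared by 3 query heads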
            (o_proj): Linear(in_features=576, out_features=576, bias=False)
            # what is o_proj:
            #   1. input x
            #   2. generate q, k, v
            #   3. compute attention scores
            #   4. weighted sum -> attention output
            #   5. project that result back to the hidden size  <-- this last step is o_proj

          )
          (mlp): LlamaMLP(
            (gate_proj): Linear(in_features=576, out_features=1536, bias=False)
            (up_proj): Linear(in_features=576, out_features=1536, bias=False)
            (down_proj): Linear(in_features=1536, out_features=576, bias=False)
            (act_fn): SiLUActivation()
          )
          (input_layernorm): LlamaRMSNorm((576,), eps=1e-05)
          (post_attention_layernorm): LlamaRMSNorm((576,), eps=1e-05)
        )
      )
      (norm): LlamaRMSNorm((576,), eps=1e-05)
      (rotary_emb): LlamaRotaryEmbedding()
    )
  )
  (lm_head): Linear(in_features=576, out_features=49280, bias=False)
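  # maps each 576-dim hidden state to next-token logits over the 49280-token vocabulary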
)


"""