from transformers import AutoProcessor, AutoModelForImageTextToText
import torch
model_path = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"
processor = AutoProcessor.from_pretrained(model_path)
model = AutoModelForImageTextToText.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",  # faster, but requires flash-attn to be installed
    attn_implementation="sdpa",
).to("cuda")
print(model)
print(model.config)
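
# Before reading the architecture dump below: a minimal generation sketch.
# This follows the standard transformers chat-template API for SmolVLM2;
# the image URL is just an example input, swap in your own.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"},
            {"type": "text", "text": "Can you describe this image?"},
        ],
    },
]
inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device, dtype=torch.bfloat16)
generated_ids = model.generate(**inputs, do_sample=False, max_new_tokens=64)
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])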
"""
SmolVLMForConditionalGeneration(
  (model): SmolVLMModel(
    (vision_model): SmolVLMVisionTransformer(
      (embeddings): SmolVLMVisionEmbeddings(
        (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), padding=valid)
        (position_embedding): Embedding(1024, 768)
      )
      (encoder): SmolVLMEncoder(
        (layers): ModuleList(
          (0-11): 12 x SmolVLMEncoderLayer(
            (self_attn): SmolVLMVisionAttention(
              (k_proj): Linear(in_features=768, out_features=768, bias=True)
              (v_proj): Linear(in_features=768, out_features=768, bias=True)
              (q_proj): Linear(in_features=768, out_features=768, bias=True)
              (out_proj): Linear(in_features=768, out_features=768, bias=True)
            )
            (layer_norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
            (mlp): SmolVLMVisionMLP(
              (activation_fn): GELUTanh()
              (fc1): Linear(in_features=768, out_features=3072, bias=True)
              (fc2): Linear(in_features=3072, out_features=768, bias=True)
            )
            (layer_norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
          )
        )
      )
      (post_layernorm): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      # each patch token is a 768-dim vector
    )
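    # position_embedding above has 1024 slots: with 16x16-pixel patches that is
    # a 32x32 patch grid, i.e. up to 512x512 pixels per image tile (assuming
    # standard SigLIP-style patching, which the printout matches)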
    (connector): SmolVLMConnector(
      (modality_projection): SmolVLMSimpleMLP(
        (proj): Linear(in_features=12288, out_features=576, bias=False)
        # pixel shuffle merges 16 patch tokens into one, so 768 x 16 = 12288
      )
    )
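    # net effect: every 16 patch tokens become ONE 576-dim token for the LM,
    # so 1024 patches / 16 = 64 visual tokens per 512x512 tile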
    (text_model): LlamaModel(
      (embed_tokens): Embedding(49280, 576, padding_idx=2)
      # not the shape of the whole LM input: this maps ONE text token ID to a vector
      # 49280 = vocabulary size = number of distinct tokens
      # 576   = hidden size    = length of the vector representing each token
      (layers): ModuleList(
        (0-29): 30 x LlamaDecoderLayer(
          (self_attn): LlamaAttention(
            (q_proj): Linear(in_features=576, out_features=576, bias=False)
            (k_proj): Linear(in_features=576, out_features=192, bias=False)  # smaller K/V projections save memory and compute
            (v_proj): Linear(in_features=576, out_features=192, bias=False)
            (o_proj): Linear(in_features=576, out_features=576, bias=False)
            # what o_proj is: 1. input x
            #                 2. generate q, k, v
            #                 3. compute attention scores
            #                 4. weighted sum of values -> attention_output
            #                 5. project that result back to hidden size: this last step is o_proj
          )
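          # head math (grouped-query attention): 576 / 64 = 9 query heads and
          # 192 / 64 = 3 KV heads, assuming head_dim = 64; each KV head is
          # shared by 3 query heads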
          (mlp): LlamaMLP(
            (gate_proj): Linear(in_features=576, out_features=1536, bias=False)
            (up_proj): Linear(in_features=576, out_features=1536, bias=False)
            (down_proj): Linear(in_features=1536, out_features=576, bias=False)
            (act_fn): SiLUActivation()
          )
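          # LlamaMLP is a gated (SwiGLU-style) MLP: down_proj(SiLU(gate_proj(x)) * up_proj(x))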
          (input_layernorm): LlamaRMSNorm((576,), eps=1e-05)
          (post_attention_layernorm): LlamaRMSNorm((576,), eps=1e-05)
        )
      )
      (norm): LlamaRMSNorm((576,), eps=1e-05)
      (rotary_emb): LlamaRotaryEmbedding()
    )
  )
  (lm_head): Linear(in_features=576, out_features=49280, bias=False)
)
"""