[CUDA, Torch] Vector Addition

kaeul·2025년 7월 25일

Easy
Implement a program that performs element-wise addition of two vectors containing 32-bit floating point numbers on a GPU. The program should take two input vectors of equal length and produce a single output vector containing their sum.

Implementation Requirements
External libraries are not permitted
The solve function signature must remain unchanged
The final result must be stored in vector C
Example 1:
Input:  A = [1.0, 2.0, 3.0, 4.0]
        B = [5.0, 6.0, 7.0, 8.0]
Output: C = [6.0, 8.0, 10.0, 12.0]
Example 2:
Input:  A = [1.5, 1.5, 1.5]
        B = [2.3, 2.3, 2.3]
Output: C = [3.8, 3.8, 3.8]
Constraints
Input vectors A and B have identical lengths
1 ≤ N ≤ 100,000,000

import torch
# 메모리 효율적 버전 
# A, B, C are tensors on the GPU

def solve(
    A: torch.Tensor,
    B: torch.Tensor,
    C: torch.Tensor,
    N: int
    ):
    """Store the element-wise sum of the first N elements of A and B in C.

    A, B and C are expected to be 1-D tensors that already live on the same
    device, each holding at least N elements. The addition is issued as a
    single ``torch.add`` call that writes straight into ``C`` through the
    ``out=`` argument, so:

    - no intermediate result tensor is allocated,
    - no data is copied to the host and back,
    - no per-element Python loop runs (N may be as large as 100,000,000).

    Args:
        A: first input vector.
        B: second input vector, same length as A.
        C: output vector; its first N elements are overwritten in place and
            any elements past N are left untouched.
        N: number of elements to add.

    Returns:
        None. The result is stored in C.
    """
    # Slicing yields views, so `out=C[:N]` writes directly into C's storage.
    torch.add(A[:N], B[:N], out=C[:N])
        ```
        
        
        ```
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

// Kernel: element-wise vector addition, C[i] = A[i] + B[i] for 0 <= i < N.
//
// Launch layout: 1-D grid of 1-D blocks. Each thread starts at its global
// index and advances by the total number of threads in the grid, so the
// kernel is correct for any grid/block size (including fewer threads than
// elements). No shared memory is used.
//
// A, B and C must be device pointers to at least N floats each.
__global__ void vector_add_optimized(const float* A, const float* B, float* C, int N) {
    // Global index of this thread:
    //   blockIdx.x  : index of the current block within the grid
    //   blockDim.x  : number of threads per block
    //   threadIdx.x : index of the current thread within its block
    // With 256 threads per block, block 0 starts at elements 0..255,
    // block 1 at 256..511, block 2 at 512..767, and so on.
    // 64-bit arithmetic keeps the index and stride from wrapping for large N.
    const long long first = (long long)blockIdx.x * blockDim.x + threadIdx.x;

    // Total number of threads in the grid; the distance between two
    // consecutive elements handled by the same thread.
    const long long stride = (long long)blockDim.x * gridDim.x;

    // The loop condition doubles as the bounds check: threads whose first
    // index is already >= N do nothing.
    for (long long i = first; i < N; i += stride) {
        C[i] = A[i] + B[i];
    }
}


// Memory-access-optimized variant: each thread adds four consecutive floats
// using 128-bit (float4) loads and stores.
//
// Launch layout: 1-D grid of 1-D blocks with at least ceil(N / 4) threads in
// total; thread t owns elements [4t, 4t + 3]. No shared memory is used.
//
// Preconditions: A, B and C are device pointers to at least N floats each and
// are 16-byte aligned, as the float4 accesses require. NOTE(review): pointers
// returned directly by cudaMalloc satisfy this; offset pointers may not.
__global__ void vector_add_float4(const float* A, const float* B, float* C, int N) {
    // Index of the first of the four elements owned by this thread.
    const long long idx = ((long long)blockIdx.x * blockDim.x + threadIdx.x) * 4;

    if (idx + 3 < N) {
        // Full group: load/store four elements at once (128 bits).
        const float4 a = *reinterpret_cast<const float4*>(&A[idx]);
        const float4 b = *reinterpret_cast<const float4*>(&B[idx]);
        float4 c;

        c.x = a.x + b.x;
        c.y = a.y + b.y;
        c.z = a.z + b.z;
        c.w = a.w + b.w;

        *reinterpret_cast<float4*>(&C[idx]) = c;
    } else {
        // Tail: when N is not a multiple of 4, the last 1-3 elements do not
        // form a full float4 and are added one by one. Threads that start at
        // or past N skip the loop entirely.
        for (long long i = idx; i < N; ++i) {
            C[i] = A[i] + B[i];
        }
    }
}
 

// Host entry point: computes C[i] = A[i] + B[i] for 0 <= i < N on the GPU.
//
// A, B and C are device pointers (i.e. pointers to memory on the GPU), each
// referring to at least N floats. The call blocks until the kernel has
// finished. Launch or execution failures are reported on stderr; the
// signature is fixed, so they cannot be returned to the caller.
extern "C" void solve(const float* A, const float* B, float* C, int N) {
    // A grid with zero blocks is an invalid launch configuration.
    if (N <= 0) {
        return;
    }

    const int threadsPerBlock = 256;
    // Ceiling division written without `N + threadsPerBlock - 1`, which could
    // overflow int for N close to INT_MAX.
    const int blocksPerGrid = N / threadsPerBlock + (N % threadsPerBlock != 0 ? 1 : 0);

    vector_add_optimized<<<blocksPerGrid, threadsPerBlock>>>(A, B, C, N);

    // A kernel launch returns nothing; configuration errors surface here.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "solve: kernel launch failed: %s\n", cudaGetErrorString(err));
        return;
    }

    // Wait for completion; errors raised while the kernel ran surface here.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "solve: kernel execution failed: %s\n", cudaGetErrorString(err));
    }
}


// Small driver: fills two host vectors with constants, adds them on the GPU
// through solve(), and prints the first five results.
// Returns 0 on success and 1 if a host allocation or a CUDA call fails.
int main() {
    const int N = 10000000;
    const size_t size = (size_t)N * sizeof(float);

    // Allocate host memory.
    float *h_A = (float*)malloc(size);
    float *h_B = (float*)malloc(size);
    float *h_C = (float*)malloc(size);
    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        fprintf(stderr, "host allocation failed\n");
        free(h_A); free(h_B); free(h_C);  // free(NULL) is a no-op
        return 1;
    }

    // Initialize the inputs.
    for (int i = 0; i < N; i++) {
        h_A[i] = 1.0f;
        h_B[i] = 2.0f;
    }

    // Allocate device memory. Each step runs only if the previous one
    // succeeded, so there is a single cleanup path at the end.
    float *d_A = NULL, *d_B = NULL, *d_C = NULL;
    cudaError_t err = cudaMalloc(&d_A, size);
    if (err == cudaSuccess) err = cudaMalloc(&d_B, size);
    if (err == cudaSuccess) err = cudaMalloc(&d_C, size);

    // Copy host -> device.
    if (err == cudaSuccess) err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    if (err == cudaSuccess) {
        // Run the addition on the GPU.
        solve(d_A, d_B, d_C, N);

        // Copy the result back to the host.
        err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
    }

    int status = 0;
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        status = 1;
    } else {
        // Show the result.
        printf("first 5 results: ");
        for (int i = 0; i < 5; i++) {
            printf("%.1f ", h_C[i]);
        }
        printf("\n");
    }

    // Release memory (cudaFree(NULL) is a no-op).
    free(h_A); free(h_B); free(h_C);
    cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);

    return status;
}



        
        ```

kaeul

Deep learning

이전 포스트

[CUDA, Torch] Vector Addition

Continuous Pre-Training (CPT)

0개의 댓글