[PIM] CPU/DPU Programming Code Review

[PIM] CPU/DPU Programming Code Review

2024. 7. 16. 00:42ㆍComputerScience/ProcessingInMemory

PrIM Benchmarks
Vector Addition Code Review

https://github.com/SohyeonKim-dev/prim-benchmarks

GitHub - SohyeonKim-dev/prim-benchmarks: PrIM (Processing-In-Memory benchmarks) is the first benchmark suite for a real-world pr

PrIM (Processing-In-Memory benchmarks) is the first benchmark suite for a real-world processing-in-memory (PIM) architecture. PrIM is developed to evaluate, analyze, and characterize the first publ...

github.com

# Makefile

# Source directories, output directory and build-time tunables.
# Variables assigned with `?=` can be overridden from the command line or the environment.
DPU_DIR := dpu
HOST_DIR := host
BUILDDIR ?= bin
NR_TASKLETS ?= 16
BL ?= 10
NR_DPUS ?= 1
TYPE ?= INT32
ENERGY ?= 0

# Name of the marker file that records the configuration of the current build.
# Arguments: $(1)=NR_DPUS, $(2)=NR_TASKLETS, $(3)=BL, $(4)=TYPE.
define conf_filename
	${BUILDDIR}/.NR_DPUS_$(1)_NR_TASKLETS_$(2)_BL_$(3)_TYPE_$(4).conf
endef
CONF := $(call conf_filename,${NR_DPUS},${NR_TASKLETS},${BL},${TYPE})

HOST_TARGET := ${BUILDDIR}/host_code
DPU_TARGET := ${BUILDDIR}/dpu_code

COMMON_INCLUDES := support
# $(wildcard ...) expands to every .c file found in the given directory.
HOST_SOURCES := $(wildcard ${HOST_DIR}/*.c)
DPU_SOURCES := $(wildcard ${DPU_DIR}/*.c)

.PHONY: all clean test

# Create the build directory while the Makefile is being parsed.
__dirs := $(shell mkdir -p ${BUILDDIR})

COMMON_FLAGS := -Wall -Wextra -g -I${COMMON_INCLUDES}
HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DBL=${BL} -D${TYPE} -DENERGY=${ENERGY}
DPU_FLAGS := ${COMMON_FLAGS} -O2 -DNR_TASKLETS=${NR_TASKLETS} -DBL=${BL} -D${TYPE}

all: ${HOST_TARGET} ${DPU_TARGET}

# Marker rule: remove the marker of any previous configuration and create the
# one for the current configuration. Both targets depend on it, so they are
# rebuilt when NR_DPUS, NR_TASKLETS, BL or TYPE changes.
# All four arguments must be wildcards, otherwise the pattern does not match
# the marker files produced by conf_filename.
${CONF}:
	$(RM) $(call conf_filename,*,*,*,*)
	touch ${CONF}

# HOST Compile
${HOST_TARGET}: ${HOST_SOURCES} ${COMMON_INCLUDES} ${CONF}
	$(CC) -o $@ ${HOST_SOURCES} ${HOST_FLAGS}

# DPU Compile
${DPU_TARGET}: ${DPU_SOURCES} ${COMMON_INCLUDES} ${CONF}
	dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES}

# Delete the build directory.
clean:
	$(RM) -r $(BUILDDIR)

# Build everything, then run the host program.
test: all
	./${HOST_TARGET}

+ makefile이란?

- make는 파일 관리 유틸리티 -> 수정된 소스파일 recompile 용이

- Makefile 내부에 작성된 규칙에 따라 make가 셸(SHELL) 명령어(예: 컴파일러 호출)를 순차적으로 실행

-> 소스파일을 자동으로 컴파일 가능

+ 쿠버네티스/도커 실습에서 활용한 Dockerfile, docker-compose.yaml 파일과 유사한 것 같다.

-> 해당 파일들은 이미지 or 도커 컨테이너를 위해서, makefile은 빌드 or 컴파일 자동화를 위해서

$@ : 목표 이름
$* : 목표 이름에서 확장자가 없는 이름
$< : 첫 번째 전제 조건의 파일 이름
$? : 목표 파일 보다 더 최근에 갱신된 파일 이름
$^: 현재 Target이 의존하는 대상들의 전체 목록
$?: 현재 Target이 의존하는 대상들 중 변경된 것들의 목록
$% : 아카이브 멤버의 이름 (해당 규칙의 대상이 아카이브 멤버인 경우)

# host - app.c

/**
* app.c
* VA Host Application Source File
*
*/
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <dpu.h>
#include <dpu_log.h>
#include <unistd.h>
#include <getopt.h>
#include <assert.h>

#include "../support/common.h"
#include "../support/timer.h"
#include "../support/params.h"

// Define the DPU Binary path as DPU_BINARY here
#ifndef DPU_BINARY
#define DPU_BINARY "./bin/dpu_code"
#endif

#if ENERGY
#include <dpu_probe.h>
#endif

// Pointer declaration - generic
// Host-side arrays of the generic element type T that hold the inputs and outputs.
static T* A; // first input vector
static T* B; // second input vector
static T* C; // sum computed on the host (reference result)
static T* C2; // sum read back from the DPUs; compared against C at the end of main()

// Create input arrays: fill A and B with pseudo-random values.
// The generator is re-seeded with 0 on every call, the element count is
// printed to stdout, and for each index A is drawn before B.
static void read_input(T* A, T* B, unsigned int nr_elements) {
    srand(0);
    printf("nr_elements\t%u\t", nr_elements);

    unsigned int idx = 0;
    while (idx < nr_elements) {
        A[idx] = (T) rand();
        B[idx] = (T) rand();
        idx++;
    }
}

// Compute output in the host: element-wise sum C[k] = A[k] + B[k] for the
// first nr_elements elements.
static void vector_addition_host(T* C, T* A, T* B, unsigned int nr_elements) {
    T *dst = C;
    T *lhs = A;
    T *rhs = B;

    for (unsigned int left = nr_elements; left > 0; left--) {
        *dst = *lhs + *rhs;
        dst++;
        lhs++;
        rhs++;
    }
}

// Main of the Host Application.
// Allocates NR_DPUS DPUs and loads DPU_BINARY, then for every warm-up and timed
// repetition: computes the reference sum on the host, copies the inputs to the
// DPUs, launches the kernel and reads the result back. Finally prints the four
// timers and compares the host output with the DPU output.
// Returns 0 when the outputs are equal, -1 when they differ, and EXIT_FAILURE
// when the host buffers cannot be allocated.
int main(int argc, char **argv) {

    struct Params p = input_params(argc, argv);

    struct dpu_set_t dpu_set, dpu;
    uint32_t nr_of_dpus;

#if ENERGY
    struct dpu_probe_t probe;
    DPU_ASSERT(dpu_probe_init("energy_probe", &probe));
#endif

    // Allocate DPUs and load binary
    DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
    DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
    DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
    printf("Allocated %u DPU(s)\n", nr_of_dpus);
    unsigned int i = 0;

    const unsigned int input_size = p.exp == 0 ? p.input_size * nr_of_dpus : p.input_size; // Total input size (weak or strong scaling)
    const unsigned int input_size_8bytes = 
        ((input_size * sizeof(T)) % 8) != 0 ? roundup(input_size, 8) : input_size; // Total input size, rounded up when its byte size is not a multiple of 8
    const unsigned int input_size_dpu = divceil(input_size, nr_of_dpus); // Input size per DPU (max.)
    const unsigned int input_size_dpu_8bytes = 
        ((input_size_dpu * sizeof(T)) % 8) != 0 ? roundup(input_size_dpu, 8) : input_size_dpu; // Input size per DPU (max.), 8-byte aligned

    // Input/output allocation: every array holds the padded per-DPU size for each DPU.
    // calloc() is used so that the padding past input_size, which read_input()
    // never writes but which is still transferred to the DPUs, has a defined value.
    const size_t buffer_elems = (size_t)input_size_dpu_8bytes * nr_of_dpus;
    A = calloc(buffer_elems, sizeof *A);
    B = calloc(buffer_elems, sizeof *B);
    C = calloc(buffer_elems, sizeof *C);
    C2 = calloc(buffer_elems, sizeof *C2);
    if (A == NULL || B == NULL || C == NULL || C2 == NULL) {
        fprintf(stderr, "Failed to allocate host buffers\n");
        free(A);
        free(B);
        free(C);
        free(C2);
        DPU_ASSERT(dpu_free(dpu_set));
        return EXIT_FAILURE;
    }

    T *bufferA = A;
    T *bufferB = B;
    T *bufferC = C2; // DPU results land in C2; C keeps the host reference

    // Create an input file with arbitrary data
    read_input(A, B, input_size);

    // Timer declaration
    Timer timer;

    printf("NR_TASKLETS\t%d\tBL\t%d\n", NR_TASKLETS, BL);

    // Loop over main kernel
    for(int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {

        // Compute output on CPU (performance comparison and verification purposes).
        // Timer 0 is only running for the repetitions after the warm-up ones.
        if(rep >= p.n_warmup)
            start(&timer, 0, rep - p.n_warmup);
        vector_addition_host(C, A, B, input_size);
        if(rep >= p.n_warmup)
            stop(&timer, 0);

        printf("Load input data\n");
        if(rep >= p.n_warmup)
            start(&timer, 1, rep - p.n_warmup);
        
        // Input arguments
        unsigned int kernel = 0;
        dpu_arguments_t input_arguments[NR_DPUS];

        // Every DPU except the last one processes a full padded slice.
        for(i=0; i<nr_of_dpus-1; i++) {
            input_arguments[i].size=input_size_dpu_8bytes * sizeof(T); 
            input_arguments[i].transfer_size=input_size_dpu_8bytes * sizeof(T); 
            input_arguments[i].kernel=kernel;
        }
        // The last DPU processes whatever is left of the padded total.
        // NOTE(review): this unsigned subtraction wraps if
        // input_size_dpu_8bytes * (nr_of_dpus - 1) exceeds input_size_8bytes -
        // confirm that the supported input sizes rule that out.
        input_arguments[nr_of_dpus-1].size=(input_size_8bytes - input_size_dpu_8bytes * (nr_of_dpus-1)) * sizeof(T); 
        input_arguments[nr_of_dpus-1].transfer_size=input_size_dpu_8bytes * sizeof(T); 
        input_arguments[nr_of_dpus-1].kernel=kernel;

        // Copy input arrays: first the per-DPU arguments, then each DPU's slice
        // of A at MRAM heap offset 0 and its slice of B directly after it.
        i = 0;
        DPU_FOREACH(dpu_set, dpu, i) {
            DPU_ASSERT(dpu_prepare_xfer(dpu, &input_arguments[i]));
        }
        DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0, sizeof(input_arguments[0]), DPU_XFER_DEFAULT));

        DPU_FOREACH(dpu_set, dpu, i) {
            DPU_ASSERT(dpu_prepare_xfer(dpu, bufferA + input_size_dpu_8bytes * i));
        }
        DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, 0, input_size_dpu_8bytes * sizeof(T), DPU_XFER_DEFAULT));
 
        DPU_FOREACH(dpu_set, dpu, i) {
            DPU_ASSERT(dpu_prepare_xfer(dpu, bufferB + input_size_dpu_8bytes * i));
        }
        DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, input_size_dpu_8bytes * sizeof(T), input_size_dpu_8bytes * sizeof(T), DPU_XFER_DEFAULT));
        if(rep >= p.n_warmup)
            stop(&timer, 1);

        printf("Run program on DPU(s) \n"); 
        // Run DPU kernel: the same vector addition, this time on the DPUs
        // (the earlier call ran on the host CPU).
        if(rep >= p.n_warmup) {
            start(&timer, 2, rep - p.n_warmup);
            #if ENERGY
            DPU_ASSERT(dpu_probe_start(&probe));
            #endif
        }
        DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
        if(rep >= p.n_warmup) {
            stop(&timer, 2);
            #if ENERGY
            DPU_ASSERT(dpu_probe_stop(&probe));
            #endif
        }

#if PRINT
        {
            unsigned int each_dpu = 0;
            printf("Display DPU Logs\n");
            DPU_FOREACH (dpu_set, dpu) {
                printf("DPU#%d:\n", each_dpu);
                DPU_ASSERT(dpulog_read_for_dpu(dpu.dpu, stdout));
                each_dpu++;
            }
        }
#endif

        printf("Retrieve results\n");
        if(rep >= p.n_warmup)
            start(&timer, 3, rep - p.n_warmup);
        i = 0;
        // PARALLEL RETRIEVE TRANSFER: read each DPU's result back from the
        // MRAM region that held its slice of B.
        DPU_FOREACH(dpu_set, dpu, i) {
            DPU_ASSERT(dpu_prepare_xfer(dpu, bufferC + input_size_dpu_8bytes * i));
        }
        DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_FROM_DPU, DPU_MRAM_HEAP_POINTER_NAME, input_size_dpu_8bytes * sizeof(T), input_size_dpu_8bytes * sizeof(T), DPU_XFER_DEFAULT));
        if(rep >= p.n_warmup)
            stop(&timer, 3);

    }

    // Print timing results: host compute, host-to-DPU copy, DPU kernel, DPU-to-host copy
    printf("CPU ");
    print(&timer, 0, p.n_reps);
    printf("CPU-DPU ");
    print(&timer, 1, p.n_reps);
    printf("DPU Kernel ");
    print(&timer, 2, p.n_reps);
    printf("DPU-CPU ");
    print(&timer, 3, p.n_reps);

// Optional energy report, compiled only when ENERGY is non-zero: the probe is
// started and stopped around each timed kernel launch above, and its average
// energy is printed here.
#if ENERGY
    double energy;
    DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &energy));
    printf("DPU Energy (J): %f\t", energy);
#endif	

    // Check output: only the first input_size elements are compared, the padding is ignored
    bool status = true;
    for (i = 0; i < input_size; i++) {
        if(C[i] != bufferC[i]){ 
            status = false;
#if PRINT
            printf("%u: %u -- %u\n", i, C[i], bufferC[i]);
#endif
        }
    }
    if (status) {
        printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] Outputs are equal\n");
    } else {
        printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET "] Outputs differ!\n");
    }

    // Deallocation: release the host buffers and the DPU set
    free(A);
    free(B);
    free(C);
    free(C2);
    DPU_ASSERT(dpu_free(dpu_set));
	
    return status ? 0 : -1;
}

# dpu - task.c

/*
* Vector addition with multiple tasklets
*
*/
#include <stdint.h>
#include <stdio.h>
#include <defs.h>
#include <mram.h>
#include <alloc.h>
#include <perfcounter.h>
#include <barrier.h>

#include "../support/common.h"

// Kernel arguments for this DPU (size, transfer_size and kernel are read below).
// The host application writes this symbol by name ("DPU_INPUT_ARGUMENTS")
// with dpu_push_xfer() before it launches the DPUs.
__host dpu_arguments_t DPU_INPUT_ARGUMENTS;

// vector_addition: Computes the vector addition of a cached block, in place:
// bufferB[k] += bufferA[k] for every k in [0, l_size).
static void vector_addition(T *bufferB, T *bufferA, unsigned int l_size) {
    unsigned int idx = 0;
    while (idx < l_size) {
        bufferB[idx] += bufferA[idx];
        idx++;
    }
}

// Barrier set up for NR_TASKLETS participants; main_kernel1() waits on it so
// that no tasklet continues before all of them have arrived.
BARRIER_INIT(my_barrier, NR_TASKLETS);

// Forward declaration of the kernel, which is defined further down in this file.
extern int main_kernel1(void);

// Table of kernel entry points (function pointers) with nr_kernels slots;
// main_kernel1 is the first entry.
// main() picks the entry to run with DPU_INPUT_ARGUMENTS.kernel, so several
// kernels can be listed here and selected at run time.
int (*kernels[nr_kernels])(void) = {main_kernel1};

// DPU entry point: look up the kernel selected through
// DPU_INPUT_ARGUMENTS.kernel in the kernels table and return its result.
int main(void) {
    int (*selected_kernel)(void) = kernels[DPU_INPUT_ARGUMENTS.kernel];
    return selected_kernel();
}

// main_kernel1
int main_kernel1() {
    unsigned int tasklet_id = me(); // 자기 자신의 id를 가져오기
    // 각각의 tasklet들은 고유한 id를 갖는다 

#if PRINT
    printf("tasklet_id = %u\n", tasklet_id);
#endif
    // print 구문을 매크로로 주는구나..!

    if (tasklet_id == 0){ // Initialize once the cycle counter
        mem_reset(); // Reset the heap
    }
    // Barrier - 다른 모든 tasklets 도달을 기다림 
    barrier_wait(&my_barrier);

    uint32_t input_size_dpu_bytes = DPU_INPUT_ARGUMENTS.size; // Input size per DPU in bytes
    uint32_t input_size_dpu_bytes_transfer = DPU_INPUT_ARGUMENTS.transfer_size; // Transfer input size per DPU in bytes

    // Address of the current processing block in MRAM -> MRAM에서 input 데이터를 저장할 base address 계산
    uint32_t base_tasklet = tasklet_id << BLOCK_SIZE_LOG2; // shift 개념
    uint32_t mram_base_addr_A = (uint32_t)DPU_MRAM_HEAP_POINTER;
    uint32_t mram_base_addr_B = (uint32_t)(DPU_MRAM_HEAP_POINTER + input_size_dpu_bytes_transfer);

    // Initialize a local cache to store the MRAM block 
    T *cache_A = (T *) mem_alloc(BLOCK_SIZE);
    T *cache_B = (T *) mem_alloc(BLOCK_SIZE);

    for(unsigned int byte_index = base_tasklet; byte_index < input_size_dpu_bytes; byte_index += BLOCK_SIZE * NR_TASKLETS){

        // Bound checking
        uint32_t l_size_bytes = (byte_index + BLOCK_SIZE >= input_size_dpu_bytes) ? (input_size_dpu_bytes - byte_index) : BLOCK_SIZE;

        // Load cache with current MRAM block
        mram_read((__mram_ptr void const*)(mram_base_addr_A + byte_index), cache_A, l_size_bytes);
        mram_read((__mram_ptr void const*)(mram_base_addr_B + byte_index), cache_B, l_size_bytes);

        // Computer vector addition - 벡터 덧셈 연산 at DPU 
        // l_size = l_size_bytes >> DIV
        // DIV = 데이터 타입 별로 정의된 size -> 얘로 나누어서, add 할 개수를 구하는 것 
        vector_addition(cache_B, cache_A, l_size_bytes >> DIV);

        // Write cache to current MRAM block - 계산 결과를 MRAM에 write하는 과정 
        mram_write(cache_B, (__mram_ptr void*)(mram_base_addr_B + byte_index), l_size_bytes);

    }

    return 0;
}

✍️ 참고자료

- https://junstar92.tistory.com/233

- https://eunbin00.tistory.com/87

- https://velog.io/@hidaehyunlee/Makefile-자주-사용하는-문법-정리

- https://github.com/CMU-SAFARI/prim-benchmarks

GitHub - CMU-SAFARI/prim-benchmarks: PrIM (Processing-In-Memory benchmarks) is the first benchmark suite for a real-world proces

github.com

'ComputerScience > ProcessingInMemory' 카테고리의 다른 글

[Pin] Encoding Memory Visualization (2)	2024.10.22
[Pin] CoreBPE Memory Tracing by pinatrace (4)	2024.10.15
[PIM] HEAM: Hashed Embedding Acceleration Using Processing-In-Memory (0)	2024.06.25
[PIM] Processing-in-memory: A workload-driven perspective (0)	2024.05.21
[PIM] Benchmarking a New Paradigm: An Experimental Analysis of a Real Processing-in-Memory Architecture (0)	2024.05.17

KimAnt 🥦

KimAnt 🥦

태그

최근글

댓글

공지사항

아카이브

PrIM Benchmarks
Vector Addition Code Review

'ComputerScience > ProcessingInMemory' 카테고리의 다른 글

관련글

티스토리툴바

KimAnt 🥦

태그

최근글

댓글

공지사항

아카이브

PrIM Benchmarks Vector Addition Code Review

'ComputerScience > ProcessingInMemory' 카테고리의 다른 글

관련글

티스토리툴바

PrIM Benchmarks
Vector Addition Code Review