[PIM] CPU/DPU Programming Code Review

2024. 7. 16. 00:42ㆍComputerScience/ComputerArchitecture

 

 

 

PrIM Benchmarks
Vector Addition Code Review

https://github.com/SohyeonKim-dev/prim-benchmarks

 

GitHub - SohyeonKim-dev/prim-benchmarks: PrIM (Processing-In-Memory benchmarks) is the first benchmark suite for a real-world pr

PrIM (Processing-In-Memory benchmarks) is the first benchmark suite for a real-world processing-in-memory (PIM) architecture. PrIM is developed to evaluate, analyze, and characterize the first publ...

github.com

 

 

 

barrier 구문을 새롭게 알게 되었습니다!

 

 

 

# Makefile

# Source directories and build parameters.
# Every `?=` value can be overridden from the command line or environment,
# e.g. `make NR_DPUS=4 NR_TASKLETS=8`.
DPU_DIR := dpu
HOST_DIR := host
BUILDDIR ?= bin
NR_TASKLETS ?= 16
BL ?= 10
NR_DPUS ?= 1
TYPE ?= INT32
ENERGY ?= 0

# Name of the marker file that encodes the current build parameters.
# Arguments: $(1)=NR_DPUS, $(2)=NR_TASKLETS, $(3)=BL, $(4)=TYPE.
define conf_filename
	${BUILDDIR}/.NR_DPUS_$(1)_NR_TASKLETS_$(2)_BL_$(3)_TYPE_$(4).conf
endef
CONF := $(call conf_filename,${NR_DPUS},${NR_TASKLETS},${BL},${TYPE})

HOST_TARGET := ${BUILDDIR}/host_code
DPU_TARGET := ${BUILDDIR}/dpu_code

COMMON_INCLUDES := support
# wildcard: selects several files at once (here, every .c source in the directory).
HOST_SOURCES := $(wildcard ${HOST_DIR}/*.c)
DPU_SOURCES := $(wildcard ${DPU_DIR}/*.c)

.PHONY: all clean test

# Create the build directory while the Makefile is being parsed.
__dirs := $(shell mkdir -p ${BUILDDIR})

COMMON_FLAGS := -Wall -Wextra -g -I${COMMON_INCLUDES}
HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DBL=${BL} -D${TYPE} -DENERGY=${ENERGY}
DPU_FLAGS := ${COMMON_FLAGS} -O2 -DNR_TASKLETS=${NR_TASKLETS} -DBL=${BL} -D${TYPE}

all: ${HOST_TARGET} ${DPU_TARGET}

# Remove the marker files of any previous configuration, then create the
# marker for the current one. Both binaries depend on ${CONF}, so changing a
# build parameter changes the marker name and forces a rebuild.
# All four placeholders must be wildcarded, otherwise the glob matches nothing.
${CONF}:
	$(RM) $(call conf_filename,*,*,*,*)
	touch ${CONF}

# HOST Compile
${HOST_TARGET}: ${HOST_SOURCES} ${COMMON_INCLUDES} ${CONF}
	$(CC) -o $@ ${HOST_SOURCES} ${HOST_FLAGS}

# DPU Compile
${DPU_TARGET}: ${DPU_SOURCES} ${COMMON_INCLUDES} ${CONF}
	dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES}

# Delete the build directory.
clean:
	$(RM) -r $(BUILDDIR)

# Build everything, then run the host binary.
test: all
	./${HOST_TARGET}

 

+ makefile์ด๋ž€? 

- make๋Š” ํŒŒ์ผ ๊ด€๋ฆฌ ์œ ํ‹ธ๋ฆฌํ‹ฐ -> ์ˆ˜์ •๋œ ์†Œ์ŠคํŒŒ์ผ recompile ์šฉ์ด

- Makefile ๋‚ด๋ถ€์— ์ž‘์„ฑ๋œ ์ฝ”๋“œ์— ๋”ฐ๋ผ make๊ฐ€ ์ปดํŒŒ์ผ๋Ÿฌ์—๊ฒŒ SHELL ๋ช…๋ น์–ด๋ฅผ ์ˆœ์ฐจ์ ์œผ๋กœ ์‹คํ–‰ํ•˜๋„๋ก ๋ช…๋ น

-> ์†Œ์ŠคํŒŒ์ผ์„ ์ž๋™์œผ๋กœ ์ปดํŒŒ์ผ ๊ฐ€๋Šฅ 

+ ์ฟ ๋ฒ„๋„คํ‹ฐ์Šค/๋„์ปค ์‹ค์Šต์—์„œ ํ™œ์šฉํ•œ Dockerfile, docker-compose.yaml ํŒŒ์ผ๊ณผ ์œ ์‚ฌํ•œ ๊ฒƒ ๊ฐ™๋‹ค. 

-> ํ•ด๋‹น ํŒŒ์ผ๋“ค์€ ์ด๋ฏธ์ง€ or ๋„์ปค ์ปจํ…Œ์ด๋„ˆ๋ฅผ ์œ„ํ•ด์„œ, makefile์€ ๋นŒ๋“œ or ์ปดํŒŒ์ผ ์ž๋™ํ™”๋ฅผ ์œ„ํ•ด์„œ 

 

  • $@ : ๋ชฉํ‘œ ์ด๋ฆ„
  • $* : ๋ชฉํ‘œ ์ด๋ฆ„์—์„œ ํ™•์žฅ์ž๊ฐ€ ์—†๋Š” ์ด๋ฆ„
  • $< : ์ฒซ ๋ฒˆ์งธ ์ „์ œ ์กฐ๊ฑด์˜ ํŒŒ์ผ ์ด๋ฆ„
  • $? : ๋ชฉํ‘œ ํŒŒ์ผ ๋ณด๋‹ค ๋” ์ตœ๊ทผ์— ๊ฐฑ์‹ ๋œ ํŒŒ์ผ ์ด๋ฆ„
  • $^: ํ˜„์žฌ Target์ด ์˜์กดํ•˜๋Š” ๋Œ€์ƒ๋“ค์˜ ์ „์ฒด ๋ชฉ๋ก
  • $?: ํ˜„์žฌ Target์ด ์˜์กดํ•˜๋Š” ๋Œ€์ƒ๋“ค ์ค‘ ๋ณ€๊ฒฝ๋œ ๊ฒƒ๋“ค์˜ ๋ชฉ๋ก
  • $% : ๋Œ€์ƒ์˜ ์ด๋ฆ„ (ํ•ด๋‹น ๊ทœ์น™ ๋Œ€์ƒ์ด ์•„์นด์ด๋ธŒ ์ธ ๊ฒฝ์šฐ)

 

 

# host - app.c 

/**
* app.c
* VA Host Application Source File
*
*/
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <dpu.h>
#include <dpu_log.h>
#include <unistd.h>
#include <getopt.h>
#include <assert.h>

#include "../support/common.h"
#include "../support/timer.h"
#include "../support/params.h"

// Define the DPU Binary path as DPU_BINARY here
#ifndef DPU_BINARY
#define DPU_BINARY "./bin/dpu_code"
#endif

#if ENERGY
#include <dpu_probe.h>
#endif

// Pointer declaration - generic
// Host-side arrays of the generic element type T (declared as pointers and
// allocated in main): A and B hold the inputs, C and C2 hold outputs.
static T* A;
static T* B;
static T* C;
static T* C2; // Why C2? -> main stores the host-computed sum in C and retrieves the DPU result into C2 (via bufferC) so the two can be compared

// Create input arrays.
// Seeds the C library generator with 0, prints the element count, and then
// fills A and B with values from rand(), drawing A's value before B's for
// each position.
//   A, B         destination arrays, each with room for nr_elements items
//   nr_elements  number of items written to each array
static void read_input(T* A, T* B, unsigned int nr_elements) {
    srand(0);
    printf("nr_elements\t%u\t", nr_elements);

    unsigned int idx = 0;
    while (idx < nr_elements) {
        A[idx] = (T) (rand());
        B[idx] = (T) (rand());
        idx++;
    }
}

// Compute output in the host.
// Element-wise vector addition on the CPU: C[k] = A[k] + B[k] for every
// k in [0, nr_elements).
static void vector_addition_host(T* C, T* A, T* B, unsigned int nr_elements) {
    unsigned int k = 0;
    while (k < nr_elements) {
        C[k] = A[k] + B[k];
        ++k;
    }
}

// Main of the Host Application
//
// Allocates NR_DPUS DPUs and loads DPU_BINARY, generates two input vectors,
// and for every warm-up/measured repetition:
//   1. computes the reference sum on the CPU,
//   2. pushes the per-DPU arguments and both input slices to the DPUs,
//   3. launches the DPU program synchronously,
//   4. retrieves the per-DPU result slices.
// Afterwards it prints the timers, compares the DPU result with the CPU
// reference and releases all resources.
//
// Returns 0 when both results match, -1 otherwise (including setup failures).
int main(int argc, char **argv) {

    struct Params p = input_params(argc, argv);

    struct dpu_set_t dpu_set, dpu;
    uint32_t nr_of_dpus;

#if ENERGY
    struct dpu_probe_t probe;
    DPU_ASSERT(dpu_probe_init("energy_probe", &probe));
#endif

    // Allocate DPUs and load binary
    DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
    DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
    DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
    printf("Allocated %u DPU(s)\n", (unsigned int) nr_of_dpus);
    unsigned int i = 0;

    // input_arguments[] below has NR_DPUS entries and the size computations
    // divide by nr_of_dpus, so reject a count outside [1, NR_DPUS].
    if (nr_of_dpus == 0 || nr_of_dpus > NR_DPUS) {
        fprintf(stderr, "Unexpected number of DPUs: %u\n", (unsigned int) nr_of_dpus);
        DPU_ASSERT(dpu_free(dpu_set));
        return -1;
    }

    const unsigned int input_size = p.exp == 0 ? p.input_size * nr_of_dpus : p.input_size; // Total input size (weak or strong scaling)
    const unsigned int input_size_8bytes =
        ((input_size * sizeof(T)) % 8) != 0 ? roundup(input_size, 8) : input_size; // Total input size, 8-byte aligned
    const unsigned int input_size_dpu = divceil(input_size, nr_of_dpus); // Input size per DPU (max.)
    const unsigned int input_size_dpu_8bytes =
        ((input_size_dpu * sizeof(T)) % 8) != 0 ? roundup(input_size_dpu, 8) : input_size_dpu; // Input size per DPU (max.), 8-byte aligned

    // Input/output allocation - every array holds one 8-byte-aligned slice per DPU.
    // The product is evaluated in size_t so it cannot wrap in unsigned int.
    const size_t buffer_bytes = (size_t) input_size_dpu_8bytes * nr_of_dpus * sizeof(T);
    A = malloc(buffer_bytes);
    B = malloc(buffer_bytes);
    C = malloc(buffer_bytes);
    C2 = malloc(buffer_bytes);
    if (A == NULL || B == NULL || C == NULL || C2 == NULL) {
        fprintf(stderr, "Failed to allocate host buffers\n");
        free(A);
        free(B);
        free(C);
        free(C2);
        DPU_ASSERT(dpu_free(dpu_set));
        return -1;
    }

    T *bufferA = A;
    T *bufferB = B;
    T *bufferC = C2; // DPU results land here; C keeps the CPU reference

    // Create an input file with arbitrary data
    read_input(A, B, input_size);

    // Timer declaration
    Timer timer;

    printf("NR_TASKLETS\t%d\tBL\t%d\n", NR_TASKLETS, BL);

    // Loop over main kernel
    for(int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {

        // Compute output on CPU (performance comparison and verification purposes)
        // Timer 0 measures the host-side vector addition.
        if(rep >= p.n_warmup)
            start(&timer, 0, rep - p.n_warmup);
        vector_addition_host(C, A, B, input_size);
        if(rep >= p.n_warmup)
            stop(&timer, 0);

        printf("Load input data\n");
        if(rep >= p.n_warmup)
            start(&timer, 1, rep - p.n_warmup);

        // Input arguments
        unsigned int kernel = 0;
        dpu_arguments_t input_arguments[NR_DPUS];

        for(i=0; i<nr_of_dpus-1; i++) {
            input_arguments[i].size=input_size_dpu_8bytes * sizeof(T);
            input_arguments[i].transfer_size=input_size_dpu_8bytes * sizeof(T);
            input_arguments[i].kernel=kernel;
        }

        // The last DPU processes whatever the preceding DPUs left over.
        // Clamp the element count to [0, input_size_dpu_8bytes]: the plain
        // subtraction wraps around when the per-DPU rounding already covers
        // the whole input, and it can exceed the transferred slice when only
        // the total size was rounded up.
        const unsigned int elements_before_last = input_size_dpu_8bytes * (nr_of_dpus - 1);
        unsigned int elements_last =
            input_size_8bytes > elements_before_last ? input_size_8bytes - elements_before_last : 0;
        if (elements_last > input_size_dpu_8bytes) {
            elements_last = input_size_dpu_8bytes;
        }
        input_arguments[nr_of_dpus-1].size=elements_last * sizeof(T);
        input_arguments[nr_of_dpus-1].transfer_size=input_size_dpu_8bytes * sizeof(T);
        input_arguments[nr_of_dpus-1].kernel=kernel;

        // Copy input arrays - send the arguments and both input slices to each DPU
        i = 0;
        DPU_FOREACH(dpu_set, dpu, i) {
            DPU_ASSERT(dpu_prepare_xfer(dpu, &input_arguments[i]));
        }
        DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0, sizeof(input_arguments[0]), DPU_XFER_DEFAULT));

        // Slice of A goes to heap offset 0.
        DPU_FOREACH(dpu_set, dpu, i) {
            DPU_ASSERT(dpu_prepare_xfer(dpu, bufferA + input_size_dpu_8bytes * i));
        }
        DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, 0, input_size_dpu_8bytes * sizeof(T), DPU_XFER_DEFAULT));

        // Slice of B goes right after the slice of A.
        DPU_FOREACH(dpu_set, dpu, i) {
            DPU_ASSERT(dpu_prepare_xfer(dpu, bufferB + input_size_dpu_8bytes * i));
        }
        DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, input_size_dpu_8bytes * sizeof(T), input_size_dpu_8bytes * sizeof(T), DPU_XFER_DEFAULT));
        if(rep >= p.n_warmup)
            stop(&timer, 1);

        printf("Run program on DPU(s) \n");
        // Vector addition on the DPUs (the earlier one ran on the host CPU)
        // Run DPU kernel
        if(rep >= p.n_warmup) {
            start(&timer, 2, rep - p.n_warmup);
            #if ENERGY
            DPU_ASSERT(dpu_probe_start(&probe));
            #endif
        }
        DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
        if(rep >= p.n_warmup) {
            stop(&timer, 2);
            #if ENERGY
            DPU_ASSERT(dpu_probe_stop(&probe));
            #endif
        }

#if PRINT
        {
            unsigned int each_dpu = 0;
            printf("Display DPU Logs\n");
            DPU_FOREACH (dpu_set, dpu) {
                printf("DPU#%u:\n", each_dpu);
                DPU_ASSERT(dpulog_read_for_dpu(dpu.dpu, stdout));
                each_dpu++;
            }
        }
#endif

        printf("Retrieve results\n");
        if(rep >= p.n_warmup)
            start(&timer, 3, rep - p.n_warmup);
        i = 0;
        // PARALLEL RETRIEVE TRANSFER - the DPU kernel overwrites the B slice
        // with the sum, so the result is read back from B's heap offset.
        DPU_FOREACH(dpu_set, dpu, i) {
            DPU_ASSERT(dpu_prepare_xfer(dpu, bufferC + input_size_dpu_8bytes * i));
        }
        DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_FROM_DPU, DPU_MRAM_HEAP_POINTER_NAME, input_size_dpu_8bytes * sizeof(T), input_size_dpu_8bytes * sizeof(T), DPU_XFER_DEFAULT));
        if(rep >= p.n_warmup)
            stop(&timer, 3);

    }

    // Print timing results - used to compare CPU time against DPU time
    printf("CPU ");
    print(&timer, 0, p.n_reps);
    printf("CPU-DPU ");
    print(&timer, 1, p.n_reps);
    printf("DPU Kernel ");
    print(&timer, 2, p.n_reps);
    printf("DPU-CPU ");
    print(&timer, 3, p.n_reps);

    // Optional energy measurement, compiled in only when ENERGY is non-zero.
#if ENERGY
    double energy;
    DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &energy));
    printf("DPU Energy (J): %f\t", energy);
#endif

    // Check output
    bool status = true;
    for (i = 0; i < input_size; i++) {
        if(C[i] != bufferC[i]){
            status = false;
#if PRINT
            printf("%u: %u -- %u\n", i, C[i], bufferC[i]);
#endif
        }
    }
    if (status) {
        printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] Outputs are equal\n");
    } else {
        printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET "] Outputs differ!\n");
    }

    // Deallocation - release the dynamically allocated host memory and the DPUs
    free(A);
    free(B);
    free(C);
    free(C2);
    DPU_ASSERT(dpu_free(dpu_set));

    return status ? 0 : -1;
}

 

 

 

# dpu - task.c

/*
* Vector addition with multiple tasklets
*
*/
#include <stdint.h>
#include <stdio.h>
#include <defs.h>
#include <mram.h>
#include <alloc.h>
#include <perfcounter.h>
#include <barrier.h>

#include "../support/common.h"

__host dpu_arguments_t DPU_INPUT_ARGUMENTS;

// vector_addition: Computes the vector addition of a cached block.
// Adds bufferA into bufferB in place, element by element, for the first
// l_size elements (the DPU-side counterpart of the host vector addition).
static void vector_addition(T *bufferB, T *bufferA, unsigned int l_size) {
    unsigned int pos = 0;
    while (pos < l_size) {
        bufferB[pos] = bufferB[pos] + bufferA[pos];
        pos++;
    }
}

// Barrier - the same concept as a thread barrier (cf. pthread barriers).
// Declared for NR_TASKLETS participants: presumably every tasklet (i.e. thread)
// must reach the barrier before any of them continues - confirm against the
// UPMEM barrier.h documentation.
BARRIER_INIT(my_barrier, NR_TASKLETS);

// Forward declaration of the kernel function so it can be referenced in the
// table below before its definition further down in this file.
extern int main_kernel1(void);

// Table of kernel entry points (an array of function pointers) with
// nr_kernels elements; main_kernel1 is the first entry.
// Registering several kernels here lets main pick one dynamically by index.
int (*kernels[nr_kernels])(void) = {main_kernel1};

int main(void) { 
    // kernel์„ ํ†ตํ•ด kernels array์˜ ์—ฌ๋Ÿฌ ์ปค๋„ ํ•จ์ˆ˜๋ฅผ ํ˜ธ์ถœํ•  ์ˆ˜ ์žˆ์Œ! wow
    return kernels[DPU_INPUT_ARGUMENTS.kernel](); 
}

// main_kernel1
int main_kernel1() {
    unsigned int tasklet_id = me(); // ์ž๊ธฐ ์ž์‹ ์˜ id๋ฅผ ๊ฐ€์ ธ์˜ค๊ธฐ
    // ๊ฐ๊ฐ์˜ tasklet๋“ค์€ ๊ณ ์œ ํ•œ id๋ฅผ ๊ฐ–๋Š”๋‹ค 

#if PRINT
    printf("tasklet_id = %u\n", tasklet_id);
#endif
    // print ๊ตฌ๋ฌธ์„ ๋งคํฌ๋กœ๋กœ ์ฃผ๋Š”๊ตฌ๋‚˜..!

    if (tasklet_id == 0){ // Initialize once the cycle counter
        mem_reset(); // Reset the heap
    }
    // Barrier - ๋‹ค๋ฅธ ๋ชจ๋“  tasklets ๋„๋‹ฌ์„ ๊ธฐ๋‹ค๋ฆผ 
    barrier_wait(&my_barrier);

    uint32_t input_size_dpu_bytes = DPU_INPUT_ARGUMENTS.size; // Input size per DPU in bytes
    uint32_t input_size_dpu_bytes_transfer = DPU_INPUT_ARGUMENTS.transfer_size; // Transfer input size per DPU in bytes

    // Address of the current processing block in MRAM -> MRAM์—์„œ input ๋ฐ์ดํ„ฐ๋ฅผ ์ €์žฅํ•  base address ๊ณ„์‚ฐ
    uint32_t base_tasklet = tasklet_id << BLOCK_SIZE_LOG2; // shift ๊ฐœ๋…
    uint32_t mram_base_addr_A = (uint32_t)DPU_MRAM_HEAP_POINTER;
    uint32_t mram_base_addr_B = (uint32_t)(DPU_MRAM_HEAP_POINTER + input_size_dpu_bytes_transfer);

    // Initialize a local cache to store the MRAM block 
    T *cache_A = (T *) mem_alloc(BLOCK_SIZE);
    T *cache_B = (T *) mem_alloc(BLOCK_SIZE);

    for(unsigned int byte_index = base_tasklet; byte_index < input_size_dpu_bytes; byte_index += BLOCK_SIZE * NR_TASKLETS){

        // Bound checking
        uint32_t l_size_bytes = (byte_index + BLOCK_SIZE >= input_size_dpu_bytes) ? (input_size_dpu_bytes - byte_index) : BLOCK_SIZE;

        // Load cache with current MRAM block
        mram_read((__mram_ptr void const*)(mram_base_addr_A + byte_index), cache_A, l_size_bytes);
        mram_read((__mram_ptr void const*)(mram_base_addr_B + byte_index), cache_B, l_size_bytes);

        // Computer vector addition - ๋ฒกํ„ฐ ๋ง์…ˆ ์—ฐ์‚ฐ at DPU 
        // l_size = l_size_bytes >> DIV
        // DIV = ๋ฐ์ดํ„ฐ ํƒ€์ž… ๋ณ„๋กœ ์ •์˜๋œ size -> ์–˜๋กœ ๋‚˜๋ˆ„์–ด์„œ, add ํ•  ๊ฐœ์ˆ˜๋ฅผ ๊ตฌํ•˜๋Š” ๊ฒƒ 
        vector_addition(cache_B, cache_A, l_size_bytes >> DIV);

        // Write cache to current MRAM block - ๊ณ„์‚ฐ ๊ฒฐ๊ณผ๋ฅผ MRAM์— writeํ•˜๋Š” ๊ณผ์ • 
        mram_write(cache_B, (__mram_ptr void*)(mram_base_addr_B + byte_index), l_size_bytes);

    }

    return 0;
}

 

 

 

 

 

โœ๏ธ ์ฐธ๊ณ ์ž๋ฃŒ

- https://junstar92.tistory.com/233

- https://eunbin00.tistory.com/87 

- https://velog.io/@hidaehyunlee/Makefile-자주-사용하는-문법-정리

- https://github.com/CMU-SAFARI/prim-benchmarks

 

GitHub - CMU-SAFARI/prim-benchmarks: PrIM (Processing-In-Memory benchmarks) is the first benchmark suite for a real-world proces

PrIM (Processing-In-Memory benchmarks) is the first benchmark suite for a real-world processing-in-memory (PIM) architecture. PrIM is developed to evaluate, analyze, and characterize the first publ...

github.com