2024. 7. 16. 00:42ㆍComputerScience/ComputerArchitecture
PrIM Benchmarks
Vector Addition Code Review
https://github.com/SohyeonKim-dev/prim-benchmarks
# Makefile
# Source directories for the DPU kernel and the host application.
DPU_DIR := dpu
HOST_DIR := host

# User-tunable build parameters; override on the command line, e.g. `make NR_DPUS=64`.
# NR_TASKLETS, BL and TYPE are passed to both compilers as -D defines,
# NR_DPUS and ENERGY only to the host compiler (see HOST_FLAGS / DPU_FLAGS below).
BUILDDIR ?= bin
NR_TASKLETS ?= 16
BL ?= 10
NR_DPUS ?= 1
TYPE ?= INT32
ENERGY ?= 0

# Name of the stamp file that encodes one build configuration.
# Arguments: $(1)=NR_DPUS  $(2)=NR_TASKLETS  $(3)=BL  $(4)=TYPE
define conf_filename
${BUILDDIR}/.NR_DPUS_$(1)_NR_TASKLETS_$(2)_BL_$(3)_TYPE_$(4).conf
endef
CONF := $(call conf_filename,${NR_DPUS},${NR_TASKLETS},${BL},${TYPE})

HOST_TARGET := ${BUILDDIR}/host_code
DPU_TARGET := ${BUILDDIR}/dpu_code

COMMON_INCLUDES := support

# Wildcard: names several files at once in a single expression
# (here: all the .c sources found in the respective directory).
HOST_SOURCES := $(wildcard ${HOST_DIR}/*.c)
DPU_SOURCES := $(wildcard ${DPU_DIR}/*.c)

.PHONY: all clean test

# Create the build directory while the Makefile is parsed.
# The variable exists only to trigger the shell command. The note stays on its own
# `#` line: text after the value on an assignment line would become part of the value.
__dirs := $(shell mkdir -p ${BUILDDIR})

COMMON_FLAGS := -Wall -Wextra -g -I${COMMON_INCLUDES}
HOST_FLAGS := ${COMMON_FLAGS} -std=c11 -O3 `dpu-pkg-config --cflags --libs dpu` -DNR_TASKLETS=${NR_TASKLETS} -DNR_DPUS=${NR_DPUS} -DBL=${BL} -D${TYPE} -DENERGY=${ENERGY}
DPU_FLAGS := ${COMMON_FLAGS} -O2 -DNR_TASKLETS=${NR_TASKLETS} -DBL=${BL} -D${TYPE}
# Default goal: build the host executable and the DPU binary.
all: ${HOST_TARGET} ${DPU_TARGET}

# Configuration stamp. Its file name encodes NR_DPUS/NR_TASKLETS/BL/TYPE, so building
# with different parameters asks for a stamp that does not exist yet; creating it makes
# both binaries (which list ${CONF} as a prerequisite) out of date.
# conf_filename takes four arguments, so four wildcards are needed for the glob to match
# the stamps left behind by earlier configurations.
${CONF}:
	$(RM) $(call conf_filename,*,*,*,*)
	touch $@
# HOST Compile
# Depend on the shared headers themselves rather than on the ${COMMON_INCLUDES} directory:
# a directory's timestamp only changes when entries are added or removed, so editing a
# header in place would not trigger a rebuild.
${HOST_TARGET}: ${HOST_SOURCES} $(wildcard ${COMMON_INCLUDES}/*.h) ${CONF}
	$(CC) -o $@ ${HOST_SOURCES} ${HOST_FLAGS}

# DPU Compile
# Same prerequisites as the host build, but compiled with the DPU toolchain and DPU_FLAGS.
${DPU_TARGET}: ${DPU_SOURCES} $(wildcard ${COMMON_INCLUDES}/*.h) ${CONF}
	dpu-upmem-dpurte-clang ${DPU_FLAGS} -o $@ ${DPU_SOURCES}
# Delete the build directory together with everything in it.
clean:
	$(RM) -r ${BUILDDIR}

# Build both binaries, then run the freshly built host program.
test: all
	./${HOST_TARGET}
+ makefile์ด๋?
- make๋ ํ์ผ ๊ด๋ฆฌ ์ ํธ๋ฆฌํฐ -> ์์ ๋ ์์คํ์ผ recompile ์ฉ์ด
- Makefile ๋ด๋ถ์ ์์ฑ๋ ์ฝ๋์ ๋ฐ๋ผ make๊ฐ ์ปดํ์ผ๋ฌ์๊ฒ SHELL ๋ช ๋ น์ด๋ฅผ ์์ฐจ์ ์ผ๋ก ์คํํ๋๋ก ๋ช ๋ น
-> ์์คํ์ผ์ ์๋์ผ๋ก ์ปดํ์ผ ๊ฐ๋ฅ
+ ์ฟ ๋ฒ๋คํฐ์ค/๋์ปค ์ค์ต์์ ํ์ฉํ Dockerfile, docker-compose.yaml ํ์ผ๊ณผ ์ ์ฌํ ๊ฒ ๊ฐ๋ค.
-> ํด๋น ํ์ผ๋ค์ ์ด๋ฏธ์ง or ๋์ปค ์ปจํ ์ด๋๋ฅผ ์ํด์, makefile์ ๋น๋ or ์ปดํ์ผ ์๋ํ๋ฅผ ์ํด์
- $@ : ๋ชฉํ ์ด๋ฆ
- $* : ๋ชฉํ ์ด๋ฆ์์ ํ์ฅ์๊ฐ ์๋ ์ด๋ฆ
- $< : ์ฒซ ๋ฒ์งธ ์ ์ ์กฐ๊ฑด์ ํ์ผ ์ด๋ฆ
- $? : ๋ชฉํ ํ์ผ ๋ณด๋ค ๋ ์ต๊ทผ์ ๊ฐฑ์ ๋ ํ์ผ ์ด๋ฆ
- $^: ํ์ฌ Target์ด ์์กดํ๋ ๋์๋ค์ ์ ์ฒด ๋ชฉ๋ก
- $?: ํ์ฌ Target์ด ์์กดํ๋ ๋์๋ค ์ค ๋ณ๊ฒฝ๋ ๊ฒ๋ค์ ๋ชฉ๋ก
- $% : ๋์์ ์ด๋ฆ (ํด๋น ๊ท์น ๋์์ด ์์นด์ด๋ธ ์ธ ๊ฒฝ์ฐ)
# host - app.c
/**
* app.c
* VA Host Application Source File
*
*/
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include <dpu.h>
#include <dpu_log.h>
#include <unistd.h>
#include <getopt.h>
#include <assert.h>
#include "../support/common.h"
#include "../support/timer.h"
#include "../support/params.h"
// Define the DPU Binary path as DPU_BINARY here
#ifndef DPU_BINARY
#define DPU_BINARY "./bin/dpu_code"
#endif
#if ENERGY
#include <dpu_probe.h>
#endif
// Pointer declaration - generic
// Arrays of the generic element type T, declared as pointers, that hold the inputs and outputs
static T* A;
static T* B;
static T* C;
static T* C2; // Why C2? -> second output buffer: main() retrieves the DPU results into it (through bufferC), while C holds the result computed on the host
// Create input arrays - random์ผ๋ก ์์ฑํ ๋ฐ์ดํฐ๋ฅผ A, B์ ๋์
ํ์ฌ ์ ์ฅ
static void read_input(T* A, T* B, unsigned int nr_elements) {
srand(0);
printf("nr_elements\t%u\t", nr_elements);
for (unsigned int i = 0; i < nr_elements; i++) {
A[i] = (T) (rand());
B[i] = (T) (rand());
}
}
// Compute output in the host - ํธ์คํธ์์ ๋ฒกํฐ ๋ง์
์ (์์๋ณ ๋ง์
) ์ํํ๋ ์ฝ๋
static void vector_addition_host(T* C, T* A, T* B, unsigned int nr_elements) {
for (unsigned int i = 0; i < nr_elements; i++) {
C[i] = A[i] + B[i];
}
}
// Main of the Host Application
int main(int argc, char **argv) {
struct Params p = input_params(argc, argv);
struct dpu_set_t dpu_set, dpu;
uint32_t nr_of_dpus;
#if ENERGY
struct dpu_probe_t probe;
DPU_ASSERT(dpu_probe_init("energy_probe", &probe));
#endif
// Allocate DPUs and load binary
DPU_ASSERT(dpu_alloc(NR_DPUS, NULL, &dpu_set));
DPU_ASSERT(dpu_load(dpu_set, DPU_BINARY, NULL));
DPU_ASSERT(dpu_get_nr_dpus(dpu_set, &nr_of_dpus));
printf("Allocated %d DPU(s)\n", nr_of_dpus);
unsigned int i = 0;
const unsigned int input_size = p.exp == 0 ? p.input_size * nr_of_dpus : p.input_size; // Total input size (weak or strong scaling)
const unsigned int input_size_8bytes =
((input_size * sizeof(T)) % 8) != 0 ? roundup(input_size, 8) : input_size; // Input size per DPU (max.), 8-byte aligned
const unsigned int input_size_dpu = divceil(input_size, nr_of_dpus); // Input size per DPU (max.)
const unsigned int input_size_dpu_8bytes =
((input_size_dpu * sizeof(T)) % 8) != 0 ? roundup(input_size_dpu, 8) : input_size_dpu; // Input size per DPU (max.), 8-byte aligned
// Input/output allocation - ๋ฐฐ์ด ํฌ๊ธฐ ๊ณ์ฐํ์ฌ, ๊ฐ ์ด๋ ์ด์ ๋ฉ๋ชจ๋ฆฌ๋ฅผ ํ ๋นํ๋ ๊ณผ์
A = malloc(input_size_dpu_8bytes * nr_of_dpus * sizeof(T));
B = malloc(input_size_dpu_8bytes * nr_of_dpus * sizeof(T));
C = malloc(input_size_dpu_8bytes * nr_of_dpus * sizeof(T));
C2 = malloc(input_size_dpu_8bytes * nr_of_dpus * sizeof(T));
T *bufferA = A;
T *bufferB = B;
T *bufferC = C2;
// Create an input file with arbitrary data
read_input(A, B, input_size);
// Timer declaration
Timer timer;
printf("NR_TASKLETS\t%d\tBL\t%d\n", NR_TASKLETS, BL);
// Loop over main kernel
for(int rep = 0; rep < p.n_warmup + p.n_reps; rep++) {
// Compute output on CPU (performance comparison and verification purposes)
// ํ์ด๋จธ + host์์ ๋ฒกํฐ ๋ง์
์ ์ํํ๋ ๊ณผ์ (๊ฒฐ๊ณผ ๋น๊ต๋ฅผ ์ํ ๋ชฉ์ ์ผ๋ก)
if(rep >= p.n_warmup)
start(&timer, 0, rep - p.n_warmup);
vector_addition_host(C, A, B, input_size);
if(rep >= p.n_warmup)
stop(&timer, 0);
printf("Load input data\n");
if(rep >= p.n_warmup)
start(&timer, 1, rep - p.n_warmup);
// Input arguments
unsigned int kernel = 0;
dpu_arguments_t input_arguments[NR_DPUS];
for(i=0; i<nr_of_dpus-1; i++) {
input_arguments[i].size=input_size_dpu_8bytes * sizeof(T);
input_arguments[i].transfer_size=input_size_dpu_8bytes * sizeof(T);
input_arguments[i].kernel=kernel;
}
input_arguments[nr_of_dpus-1].size=(input_size_8bytes - input_size_dpu_8bytes * (NR_DPUS-1)) * sizeof(T);
input_arguments[nr_of_dpus-1].transfer_size=input_size_dpu_8bytes * sizeof(T);
input_arguments[nr_of_dpus-1].kernel=kernel;
// Copy input arrays - DPU์ ๋ฐฐ์ด(๋ฐ์ดํฐ)๋ฅผ ์ ๋ฌํ๋ ๊ณผ์ ?
i = 0;
DPU_FOREACH(dpu_set, dpu, i) {
DPU_ASSERT(dpu_prepare_xfer(dpu, &input_arguments[i]));
}
DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, "DPU_INPUT_ARGUMENTS", 0, sizeof(input_arguments[0]), DPU_XFER_DEFAULT));
DPU_FOREACH(dpu_set, dpu, i) {
DPU_ASSERT(dpu_prepare_xfer(dpu, bufferA + input_size_dpu_8bytes * i));
}
DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, 0, input_size_dpu_8bytes * sizeof(T), DPU_XFER_DEFAULT));
DPU_FOREACH(dpu_set, dpu, i) {
DPU_ASSERT(dpu_prepare_xfer(dpu, bufferB + input_size_dpu_8bytes * i));
}
DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_TO_DPU, DPU_MRAM_HEAP_POINTER_NAME, input_size_dpu_8bytes * sizeof(T), input_size_dpu_8bytes * sizeof(T), DPU_XFER_DEFAULT));
if(rep >= p.n_warmup)
stop(&timer, 1);
printf("Run program on DPU(s) \n");
// DPU๋ก ๋ฒกํฐ ๋ง์
์ํ (์๊น๋ host - ์ฆ, CPU)
// Run DPU kernel
if(rep >= p.n_warmup) {
start(&timer, 2, rep - p.n_warmup);
#if ENERGY
DPU_ASSERT(dpu_probe_start(&probe));
#endif
}
DPU_ASSERT(dpu_launch(dpu_set, DPU_SYNCHRONOUS));
if(rep >= p.n_warmup) {
stop(&timer, 2);
#if ENERGY
DPU_ASSERT(dpu_probe_stop(&probe));
#endif
}
#if PRINT
{
unsigned int each_dpu = 0;
printf("Display DPU Logs\n");
DPU_FOREACH (dpu_set, dpu) {
printf("DPU#%d:\n", each_dpu);
DPU_ASSERT(dpulog_read_for_dpu(dpu.dpu, stdout));
each_dpu++;
}
}
#endif
printf("Retrieve results\n");
if(rep >= p.n_warmup)
start(&timer, 3, rep - p.n_warmup);
i = 0;
// PARALLEL RETRIEVE TRANSFER - DPU์์ ๊ณ์ฐ ๊ฒฐ๊ณผ๋ฅผ ์ ์กํ๋ค
DPU_FOREACH(dpu_set, dpu, i) {
DPU_ASSERT(dpu_prepare_xfer(dpu, bufferC + input_size_dpu_8bytes * i));
}
DPU_ASSERT(dpu_push_xfer(dpu_set, DPU_XFER_FROM_DPU, DPU_MRAM_HEAP_POINTER_NAME, input_size_dpu_8bytes * sizeof(T), input_size_dpu_8bytes * sizeof(T), DPU_XFER_DEFAULT));
if(rep >= p.n_warmup)
stop(&timer, 3);
}
// Print timing results - CPU vs DPU ์๊ฐ ๋น๊ต๋ฅผ ์ํ ์ฝ๋
printf("CPU ");
print(&timer, 0, p.n_reps);
printf("CPU-DPU ");
print(&timer, 1, p.n_reps);
printf("DPU Kernel ");
print(&timer, 2, p.n_reps);
printf("DPU-CPU ");
print(&timer, 3, p.n_reps);
// ์ค๊ฐ์ค๊ฐ ๋ค์ด๊ฐ๋ ์๋์ง ์ฝ๋๋ ๋ญ์ง?
#if ENERGY
double energy;
DPU_ASSERT(dpu_probe_get(&probe, DPU_ENERGY, DPU_AVERAGE, &energy));
printf("DPU Energy (J): %f\t", energy);
#endif
// Check output
bool status = true;
for (i = 0; i < input_size; i++) {
if(C[i] != bufferC[i]){
status = false;
#if PRINT
printf("%d: %u -- %u\n", i, C[i], bufferC[i]);
#endif
}
}
if (status) {
printf("[" ANSI_COLOR_GREEN "OK" ANSI_COLOR_RESET "] Outputs are equal\n");
} else {
printf("[" ANSI_COLOR_RED "ERROR" ANSI_COLOR_RESET "] Outputs differ!\n");
}
// Deallocation - ํ ๋น๋ ๋์ ๋ฉ๋ชจ๋ฆฌ ํด์
free(A);
free(B);
free(C);
free(C2);
DPU_ASSERT(dpu_free(dpu_set));
return status ? 0 : -1;
}
# dpu - task.c
/*
* Vector addition with multiple tasklets
*
*/
#include <stdint.h>
#include <stdio.h>
#include <defs.h>
#include <mram.h>
#include <alloc.h>
#include <perfcounter.h>
#include <barrier.h>
#include "../support/common.h"
// Arguments for this DPU (size, transfer_size, kernel). The host application transfers
// them to the symbol named "DPU_INPUT_ARGUMENTS" before it launches the program.
__host dpu_arguments_t DPU_INPUT_ARGUMENTS;
// vector_addition: Computes the vector addition of a cached block
// ๋ฒกํฐ ๋ง์
์ฐ์ฐ at DPU
static void vector_addition(T *bufferB, T *bufferA, unsigned int l_size) {
for (unsigned int i = 0; i < l_size; i++){
bufferB[i] += bufferA[i];
}
}
// Barrier - the same idea as a pthread barrier between threads
// Declared for NR_TASKLETS participants: the tasklets (i.e. threads) wait for each other to synchronise at the barrier -> then continue
BARRIER_INIT(my_barrier, NR_TASKLETS);
// Declaration of the kernel function with external linkage; its definition appears further down in this file
extern int main_kernel1(void);
// Initialise an array of function pointers
// kernels is an array with nr_kernels elements
// main_kernel1 is its first entry
// Listing several kernel functions in kernels allows the one to run to be chosen dynamically
int (*kernels[nr_kernels])(void) = {main_kernel1};
int main(void) {
    // DPU_INPUT_ARGUMENTS.kernel is the index into the kernels table; fetch that
    // entry and return whatever the selected kernel function returns.
    int (*selected_kernel)(void) = kernels[DPU_INPUT_ARGUMENTS.kernel];
    return selected_kernel();
}
// main_kernel1
int main_kernel1() {
unsigned int tasklet_id = me(); // ์๊ธฐ ์์ ์ id๋ฅผ ๊ฐ์ ธ์ค๊ธฐ
// ๊ฐ๊ฐ์ tasklet๋ค์ ๊ณ ์ ํ id๋ฅผ ๊ฐ๋๋ค
#if PRINT
printf("tasklet_id = %u\n", tasklet_id);
#endif
// print ๊ตฌ๋ฌธ์ ๋งคํฌ๋ก๋ก ์ฃผ๋๊ตฌ๋..!
if (tasklet_id == 0){ // Initialize once the cycle counter
mem_reset(); // Reset the heap
}
// Barrier - ๋ค๋ฅธ ๋ชจ๋ tasklets ๋๋ฌ์ ๊ธฐ๋ค๋ฆผ
barrier_wait(&my_barrier);
uint32_t input_size_dpu_bytes = DPU_INPUT_ARGUMENTS.size; // Input size per DPU in bytes
uint32_t input_size_dpu_bytes_transfer = DPU_INPUT_ARGUMENTS.transfer_size; // Transfer input size per DPU in bytes
// Address of the current processing block in MRAM -> MRAM์์ input ๋ฐ์ดํฐ๋ฅผ ์ ์ฅํ base address ๊ณ์ฐ
uint32_t base_tasklet = tasklet_id << BLOCK_SIZE_LOG2; // shift ๊ฐ๋
uint32_t mram_base_addr_A = (uint32_t)DPU_MRAM_HEAP_POINTER;
uint32_t mram_base_addr_B = (uint32_t)(DPU_MRAM_HEAP_POINTER + input_size_dpu_bytes_transfer);
// Initialize a local cache to store the MRAM block
T *cache_A = (T *) mem_alloc(BLOCK_SIZE);
T *cache_B = (T *) mem_alloc(BLOCK_SIZE);
for(unsigned int byte_index = base_tasklet; byte_index < input_size_dpu_bytes; byte_index += BLOCK_SIZE * NR_TASKLETS){
// Bound checking
uint32_t l_size_bytes = (byte_index + BLOCK_SIZE >= input_size_dpu_bytes) ? (input_size_dpu_bytes - byte_index) : BLOCK_SIZE;
// Load cache with current MRAM block
mram_read((__mram_ptr void const*)(mram_base_addr_A + byte_index), cache_A, l_size_bytes);
mram_read((__mram_ptr void const*)(mram_base_addr_B + byte_index), cache_B, l_size_bytes);
// Computer vector addition - ๋ฒกํฐ ๋ง์
์ฐ์ฐ at DPU
// l_size = l_size_bytes >> DIV
// DIV = ๋ฐ์ดํฐ ํ์
๋ณ๋ก ์ ์๋ size -> ์๋ก ๋๋์ด์, add ํ ๊ฐ์๋ฅผ ๊ตฌํ๋ ๊ฒ
vector_addition(cache_B, cache_A, l_size_bytes >> DIV);
// Write cache to current MRAM block - ๊ณ์ฐ ๊ฒฐ๊ณผ๋ฅผ MRAM์ writeํ๋ ๊ณผ์
mram_write(cache_B, (__mram_ptr void*)(mram_base_addr_B + byte_index), l_size_bytes);
}
return 0;
}
✏️ 참고 자료
- https://junstar92.tistory.com/233
- https://eunbin00.tistory.com/87
- https://velog.io/@hidaehyunlee/Makefile-자주-사용하는-문법-정리
- https://github.com/CMU-SAFARI/prim-benchmarks
'ComputerScience > ComputerArchitecture' 카테고리의 다른 글
[Pin] Encoding Memory Visualization (2) | 2024.10.22 |
---|---|
[Pin] CoreBPE Memory Tracing by pinatrace (4) | 2024.10.15 |