diff --git a/test/microbench/Makefile b/test/microbench/Makefile index af7d1bab8cc838956a3a9d63838fd182a2b55818..03e080ebd374e2021318bd1d2aa62a4c427cc211 100644 --- a/test/microbench/Makefile +++ b/test/microbench/Makefile @@ -59,6 +59,16 @@ ifeq ($(ARCH),riscv64) endif endif +ifeq ($(ARCH),aarch64) + cpumodel := $$(grep -m1 -i 'Model' /proc/cpuinfo | tr '\t' ' ' | tr -s ' ' | cut -d ' ' -f3- | tr -c '[a-zA-Z0-9\.]' '-' | tr -s '-' | sed -e 's/-$$//g' | cut -c1-60) + product := $$(dmesg | grep -i 'Machine model' | cut -d ':' -f2 | sed -e "s/^ //g" | tr ' ' -f2) + + # If the host os is not $(ARCH), it should be cross compiling + ifneq ($(ARCH),$(shell uname -m)) + CROSS_COMPILE ?= aarch64-linux-gnu- + endif +endif + ifneq ($(wildcard benchmark/test/$(TEST_BIN).cc),) ifeq ($(shell grep "define OPTIMIZE_LEVEL $(OPTIMIZE_LEVEL)" benchmark/test/$(TEST_BIN).cc),) dummy := $(shell sed -i -e "s/define OPTIMIZE_LEVEL.*/define OPTIMIZE_LEVEL $(OPTIMIZE_LEVEL)/g" benchmark/test/$(TEST_BIN).cc) diff --git a/test/microbench/logs/Raspberry-Pi-4-Model-B-Rev-1.4-Raspberry-Pi-4-Model-B-Rev-1.4-aarch64-20220406-100246-O1.log b/test/microbench/logs/Raspberry-Pi-4-Model-B-Rev-1.4-Raspberry-Pi-4-Model-B-Rev-1.4-aarch64-20220406-100246-O1.log new file mode 100644 index 0000000000000000000000000000000000000000..5d996c39d00f68439fa57c31fe5e71b2405de2f1 --- /dev/null +++ b/test/microbench/logs/Raspberry-Pi-4-Model-B-Rev-1.4-Raspberry-Pi-4-Model-B-Rev-1.4-aarch64-20220406-100246-O1.log @@ -0,0 +1,23 @@ +System: #1514 SMP PREEMPT Mon Jan 17 17:39:38 GMT 2022 +Gcc: gcc (Debian 10.2.1-6) 10.2.1 20210110 +G++: g++ (Debian 10.2.1-6) 10.2.1 20210110 +benchmark/build/test/aarch64 +2022-04-06T10:02:46+01:00 +Running benchmark/build/test/aarch64 +Run on (4 X 1800 MHz CPU s) +Load Average: 0.02, 0.12, 0.08 +------------------------------------------------------------------------- +Benchmark Time CPU Iterations +------------------------------------------------------------------------- +BM_nop 0.557 ns 0.556 ns 1000000000 +BM_ub 1.67 ns 1.67 ns 419445121 +BM_bnez 2.24 ns 2.23 ns 314672244 +BM_beqz 1.11 ns 1.11 ns 629290582 +BM_load_bnez 1.11 ns 1.11 ns 629354498 +BM_load_beqz 1.11 ns 1.11 ns 629360258 +BM_cache_miss_load_bnez 3.05 ns 3.05 ns 229187641 +BM_cache_miss_load_beqz 3.05 ns 3.04 ns 229336829 +BM_branch_miss_load_bnez 3.08 ns 3.08 ns 227048424 +BM_branch_miss_load_beqz 3.08 ns 3.08 ns 227054739 +BM_cache_branch_miss_load_bnez 2.94 ns 2.94 ns 238234420 +BM_cache_branch_miss_load_beqz 5.45 ns 5.45 ns 129712057 diff --git a/test/microbench/logs/Raspberry-Pi-4-Model-B-Rev-1.4-Raspberry-Pi-4-Model-B-Rev-1.4-aarch64-20220406-100352-O0.log b/test/microbench/logs/Raspberry-Pi-4-Model-B-Rev-1.4-Raspberry-Pi-4-Model-B-Rev-1.4-aarch64-20220406-100352-O0.log new file mode 100644 index 0000000000000000000000000000000000000000..23528ec4c35250d0e73a91522266cf17c42c2aae --- /dev/null +++ b/test/microbench/logs/Raspberry-Pi-4-Model-B-Rev-1.4-Raspberry-Pi-4-Model-B-Rev-1.4-aarch64-20220406-100352-O0.log @@ -0,0 +1,70 @@ +System: #1514 SMP PREEMPT Mon Jan 17 17:39:38 GMT 2022 +Gcc: gcc (Debian 10.2.1-6) 10.2.1 20210110 +G++: g++ (Debian 10.2.1-6) 10.2.1 20210110 +cd benchmark && \ +git checkout -- test/CMakeLists.txt && \ +sed -i -e "/compile_benchmark_test(basic_test)/icompile_benchmark_test(aarch64)" test/CMakeLists.txt && \ +sed -i -e "/compile_benchmark_test(basic_test)/iadd_test(NAME aarch64 COMMAND aarch64 --benchmark_min_time=0.01)\n" test/CMakeLists.txt +touch benchmark/test/aarch64.dep +cd benchmark && cmake -DCMAKE_BUILD_TYPE=Release -DBENCHMARK_DOWNLOAD_DEPENDENCIES=on -DGOOGLETEST_PATH=/home/pi/riscv-linux/test/microbench/benchmark/build/third_party/googletest -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DCMAKE_C_FLAGS="" -DCMAKE_CXX_FLAGS="" -S . -B "build" +-- git version: v1.6.1-38-g60b16f11-dirty normalized to 1.6.1.38 +-- Version: 1.6.1.38 +-- Performing Test HAVE_STD_REGEX -- success +-- Performing Test HAVE_GNU_POSIX_REGEX -- failed to compile +-- Performing Test HAVE_POSIX_REGEX -- success +-- Performing Test HAVE_STEADY_CLOCK -- success +-- Looking for Google Test sources +-- Looking for Google Test sources in /home/pi/riscv-linux/test/microbench/benchmark/build/third_party/googletest +CMake Warning at CMakeLists.txt:37 (message): + Did not find Google Test sources! Fetching from web... + + +-- Configuring done +-- Generating done +-- Build files have been written to: /home/pi/riscv-linux/test/microbench/benchmark/build/third_party/googletest +gmake[2]: Entering directory '/home/pi/riscv-linux/test/microbench/benchmark/build/third_party/googletest' +gmake[3]: Entering directory '/home/pi/riscv-linux/test/microbench/benchmark/build/third_party/googletest' +gmake[4]: Entering directory '/home/pi/riscv-linux/test/microbench/benchmark/build/third_party/googletest' +gmake[4]: Leaving directory '/home/pi/riscv-linux/test/microbench/benchmark/build/third_party/googletest' +gmake[4]: Entering directory '/home/pi/riscv-linux/test/microbench/benchmark/build/third_party/googletest' +[ 11%] 34m1mPerforming update step for 'googletest'0m +[ 22%] 34m1mNo patch step for 'googletest'0m +[ 33%] 34m1mNo configure step for 'googletest'0m +[ 44%] 34m1mNo build step for 'googletest'0m +[ 55%] 34m1mNo install step for 'googletest'0m +[ 66%] 34m1mNo test step for 'googletest'0m +[ 77%] 34m1mCompleted 'googletest'0m +gmake[4]: Leaving directory '/home/pi/riscv-linux/test/microbench/benchmark/build/third_party/googletest' +[100%] Built target googletest +gmake[3]: Leaving directory '/home/pi/riscv-linux/test/microbench/benchmark/build/third_party/googletest' +gmake[2]: Leaving directory '/home/pi/riscv-linux/test/microbench/benchmark/build/third_party/googletest' +-- Configuring done +-- Generating done +-- Build files have been written to: /home/pi/riscv-linux/test/microbench/benchmark/build +cd benchmark && cmake --build "build" --config Release --target aarch64 +[ 90%] Built target benchmark +[ 95%] Built target benchmark_main +35m1mScanning dependencies of target aarch640m +[ 95%] 32mBuilding CXX object test/CMakeFiles/aarch64.dir/aarch64.cc.o0m +[100%] 32m1mLinking CXX executable aarch640m +[100%] Built target aarch64 +benchmark/build/test/aarch64 +2022-04-06T10:03:56+01:00 +Running benchmark/build/test/aarch64 +Run on (4 X 1800 MHz CPU s) +Load Average: 0.17, 0.15, 0.09 +------------------------------------------------------------------------- +Benchmark Time CPU Iterations +------------------------------------------------------------------------- +BM_nop 1.67 ns 1.67 ns 419211317 +BM_ub 2.22 ns 2.22 ns 314660377 +BM_bnez 2.22 ns 2.22 ns 314636098 +BM_beqz 1.69 ns 1.67 ns 419554342 +BM_load_bnez 2.23 ns 2.23 ns 314667844 +BM_load_beqz 2.23 ns 2.22 ns 314532265 +BM_cache_miss_load_bnez 6.30 ns 6.30 ns 111356245 +BM_cache_miss_load_beqz 6.29 ns 6.29 ns 113000904 +BM_branch_miss_load_bnez 6.16 ns 6.16 ns 114630037 +BM_branch_miss_load_beqz 6.17 ns 6.16 ns 113586032 +BM_cache_branch_miss_load_bnez 5.80 ns 5.80 ns 120822668 +BM_cache_branch_miss_load_beqz 5.80 ns 5.80 ns 120919759 diff --git a/test/microbench/test/aarch64.cc b/test/microbench/test/aarch64.cc new file mode 100644 index 0000000000000000000000000000000000000000..0cc43e248422abf25a853ef7debfc1a44d143bb8 --- /dev/null +++ b/test/microbench/test/aarch64.cc @@ -0,0 +1,307 @@ +// Copyright (C) 2022 Wu Zhangjin , All Rights Reserved. +// +// Gcc Inline Assembly: https://www.ibiblio.org/gferg/ldp/GCC-Inline-Assembly-HOWTO.html +// https://www.cristal.univ-lille.fr/~marquet/ens/ctx/doc/l-ia.html +// https://wiki.osdev.org/Inline_assembly + +// X86_64 ISA: https://www.aldeid.com/wiki/X86-assembly/Instructions + +#include "benchmark/benchmark.h" + +#define OPTIMIZE_LEVEL 1 + +#if defined(OPTIMIZE_LEVEL) && (OPTIMIZE_LEVEL == 0) +#define benchmark_DoNotOptimize() benchmark::DoNotOptimize(state.iterations()); +#else +#define benchmark_DoNotOptimize() do { } while(0) +#endif + +volatile int enabled; + +void BM_nop(benchmark::State& state) { + for (auto _ : state) { + benchmark_DoNotOptimize(); + asm volatile ("nop":::"memory"); + } +} +BENCHMARK(BM_nop); +// BENCHMARK(BM_nop)->ThreadPerCpu(); + +void BM_ub(benchmark::State& state) { + for (auto _ : state) { + benchmark_DoNotOptimize(); + asm volatile ( + "1: b 2f \n" + "2:" + :::"memory"); + } +} +BENCHMARK(BM_ub); +// BENCHMARK(BM_ub)->ThreadPerCpu(); + +void BM_bnez(benchmark::State& state) { + register int x = 1; + + for (auto _ : state) { + benchmark_DoNotOptimize(); + asm volatile ( + "1: cbnz x0, 2f \n" + "2:" + : + :"r" (x) + :"memory"); + } +} +BENCHMARK(BM_bnez); +// BENCHMARK(BM_bnez)->ThreadPerCpu(); + +void BM_beqz(benchmark::State& state) { + register int x = 0; + + for (auto _ : state) { + benchmark_DoNotOptimize(); + asm volatile ( + "1: cbz x0, 2f \n" + "2:" + : + :"r" (x) + :"memory"); + } +} +BENCHMARK(BM_beqz); +// BENCHMARK(BM_beqz)->ThreadPerCpu(); + +void BM_load_bnez(benchmark::State& state) { + enabled = 1; + + for (auto _ : state) { + benchmark_DoNotOptimize(); + if (enabled != 0) + asm volatile ("":::"memory"); + } +} +BENCHMARK(BM_load_bnez); +// BENCHMARK(BM_load_bnez)->ThreadPerCpu(); + +void BM_load_beqz(benchmark::State& state) { + enabled = 0; + + for (auto _ : state) { + benchmark_DoNotOptimize(); + if (enabled == 0) + asm volatile ("":::"memory"); + } +} +BENCHMARK(BM_load_beqz); +// BENCHMARK(BM_load_beqz)->ThreadPerCpu(); + +volatile int thread_start; +volatile int thread_exit; +struct th_data { + int enabled; + int type; +}; +volatile struct th_data tdata; + +enum { + CACHE_MISS = 0, + CACHE_BRANCH_MISS = 1, + BRANCH_MISS = 2, + NO_MISS = 3, +}; + +static void *thread_handler (void *data) +{ + volatile int *ptr = &enabled; + struct th_data *td = (struct th_data *)data; + long i = td->enabled; + + thread_start = 1; + + while (!thread_exit) { + switch (td->type) { + case CACHE_MISS: + *ptr = i; + break; + case BRANCH_MISS: + *ptr = 1 - i; + break; + case CACHE_BRANCH_MISS: + i = 1 - i; + *ptr = i; + break; + default: + break; + } + } + + return NULL; +} + +void BM_cache_miss_load_bnez(benchmark::State& state) { + pthread_t th; + + enabled = 1; + thread_start = 0; + thread_exit = 0; + + tdata.enabled = enabled; + tdata.type = CACHE_MISS; + + pthread_create (&th, NULL, thread_handler, (void *)&tdata); + + while (!thread_start); + + for (auto _ : state) { + benchmark_DoNotOptimize(); + if (enabled != 0) + asm volatile ("":::"memory"); + } + + thread_exit = 1; + pthread_join (th, NULL); +} +BENCHMARK(BM_cache_miss_load_bnez); +//BENCHMARK(BM_cache_miss_load_bnez)->ThreadRange(1,3); +//BENCHMARK(BM_cache_miss_load_bnez)->ThreadPerCpu(); + +void BM_cache_miss_load_beqz(benchmark::State& state) { + pthread_t th; + long i; + + enabled = 0; + thread_start = 0; + thread_exit = 0; + + tdata.enabled = enabled; + tdata.type = CACHE_MISS; + + pthread_create (&th, NULL, thread_handler, (void *)&tdata); + + while (!thread_start); + + for (auto _ : state) { + benchmark_DoNotOptimize(); + if (enabled == 0) + asm volatile ("":::"memory"); + } + + thread_exit = 1; + pthread_join (th, NULL); +} +BENCHMARK(BM_cache_miss_load_beqz); +//BENCHMARK(BM_cache_miss_load_beqz)->ThreadRange(1,3); +//BENCHMARK(BM_cache_miss_load_beqz)->ThreadPerCpu(); + +void BM_branch_miss_load_bnez(benchmark::State& state) { + pthread_t th; + + enabled = 1; + thread_start = 0; + thread_exit = 0; + + tdata.enabled = enabled; + tdata.type = BRANCH_MISS; + + pthread_create (&th, NULL, thread_handler, (void *)&tdata); + + while (!thread_start); + + for (auto _ : state) { + benchmark_DoNotOptimize(); + if (enabled != 0) + asm volatile ("":::"memory"); + } + + thread_exit = 1; + pthread_join (th, NULL); +} +BENCHMARK(BM_branch_miss_load_bnez); +//BENCHMARK(BM_branch_miss_load_bnez)->ThreadRange(1,3); +//BENCHMARK(BM_branch_miss_load_bnez)->ThreadPerCpu(); + +void BM_branch_miss_load_beqz(benchmark::State& state) { + pthread_t th; + long i; + + enabled = 0; + thread_start = 0; + thread_exit = 0; + + tdata.enabled = enabled; + tdata.type = BRANCH_MISS; + + pthread_create (&th, NULL, thread_handler, (void *)&tdata); + + while (!thread_start); + + for (auto _ : state) { + benchmark_DoNotOptimize(); + if (enabled == 0) + asm volatile ("":::"memory"); + } + + thread_exit = 1; + pthread_join (th, NULL); +} +BENCHMARK(BM_branch_miss_load_beqz); +//BENCHMARK(BM_branch_miss_load_beqz)->ThreadRange(1,3); +//BENCHMARK(BM_branch_miss_load_beqz)->ThreadPerCpu(); + +void BM_cache_branch_miss_load_bnez(benchmark::State& state) { + pthread_t th; + + enabled = 1; + thread_start = 0; + thread_exit = 0; + + tdata.enabled = enabled; + tdata.type = CACHE_BRANCH_MISS; + + pthread_create (&th, NULL, thread_handler, (void *)&tdata); + + while (!thread_start); + + for (auto _ : state) { + benchmark_DoNotOptimize(); + if (enabled != 0) + asm volatile ("":::"memory"); + } + + thread_exit = 1; + pthread_join (th, NULL); +} +BENCHMARK(BM_cache_branch_miss_load_bnez); +//BENCHMARK(BM_cache_branch_miss_load_bnez)->ThreadRange(1,3); +//BENCHMARK(BM_cache_branch_miss_load_bnez)->ThreadPerCpu(); + +void BM_cache_branch_miss_load_beqz(benchmark::State& state) { + pthread_t th; + long i; + + enabled = 0; + thread_start = 0; + thread_exit = 0; + + tdata.enabled = enabled; + tdata.type = CACHE_BRANCH_MISS; + + pthread_create (&th, NULL, thread_handler, (void *)&tdata); + + while (!thread_start); + + for (auto _ : state) { + benchmark_DoNotOptimize(); + if (enabled == 0) + asm volatile ("":::"memory"); + } + + thread_exit = 1; + pthread_join (th, NULL); +} +BENCHMARK(BM_cache_branch_miss_load_beqz); +//BENCHMARK(BM_cache_branch_miss_load_beqz)->ThreadRange(1,3); +//BENCHMARK(BM_cache_branch_miss_load_beqz)->ThreadPerCpu(); + + +BENCHMARK_MAIN();