// Overview / Features / Requirements / Showcase / API / FAQ / User Guide / License
Warning
Under development
Performance is not a number!
Single
header/module performance library that combines the power of:
c++23, linux/perf, llvm/mca, gnuplot/sixel, ...
Profiling, Analyzing, Benchmarking
names description API infohardware/software info compiler,cpu,memory,sys,proc,bin[core]low-level utilities code,compiler,cpu,memoryprofilertiming (rdtsc/clock/chrono) tsc,process_time,thread_time,steady_time,profilercounting (linux/perf) instructions,cycles, ...,top_downprofilersampling (linux/perf) instructions,cycles, ...,mem_loads,mem_stores,top_downprofilertracing (linux/intel_pt) instructions,cycles,tscanalyzerdisassembling (llvm) mca::assembly,mca::address,mca::encoding,mca::size,mca::uops,mca::latency,mca::rthroughput,mca::may_load,mca::may_store,mca::has_side_effects, ...,mca::debug::sourceanalyzeranalyzing (llvm/mca) mca::timeline,mca::resource_pressure,mca::bottleneckrunnerbenchmarking bench::baseline,bench::latency,bench::throughput[io]logging/plotting (gnuplot/sixel) log,json,report,annotate,plot(hist,box,bar,line,ecdf)[] -
inline namespace
Optimal (All Features)
intel-12th+ with PEBS, IPT support
libipt-apt-get install libipt-dev- (
clang-19+|gcc-13+) /c++23+
llvm-19+-apt-get install llvm-devlinux-6.x+
perf-event-open-apt-get install linux-tools-commonterminalwithsixelsupport
gnuplot-apt-get install gnuplotAuxiliary (Enhancements)
prof-https://github.com/qlibs/prof
linux-perf - apt-get install linux-tools-common; intel-vtune - apt-get install intel-oneapi-vtune; amd-uprof - https://www.amd.com/en/developer/uprof.html#downloads; gperftools - apt-get install google-perftools; llvm-xray - apt-get install llvm; callgrind - apt-get install valgrind; ut - https://github.com/qlibs/ut; gh - apt-get install gh
Usage
/** * Info/Core */ import perf; // #include <perf> int main() { auto&& spec = perf::info::spec{{ {"time", std::chrono::system_clock::now()}, {"perf.version", perf::info::version()}, {"sys.triple", perf::info::sys::triple()}, {"cxx.name", perf::info::compiler::name()}, {"cxx.version", perf::info::compiler::version()}, {"cpu.name", perf::info::cpu::name()}, {"cpu.code_name", perf::info::cpu::code_name()}, {"cpu.version", perf::info::cpu::version()}, {"cache.dL1", perf::info::memory::dcache()[perf::info::memory::level::L1])}, // ... }}; perf::log(spec); }/** * Profiling/Analyzing */ import perf; // #include <perf> int main() { perf::profiler profiler{ perf::stat::tsc, perf::stat::cycles, perf::trace::instructions }; auto invoke = [&](auto&& fn, auto&&... ts) { profiler.start(); perf::compiler::prevent_elision(fn(ts...)); profiler.stop(); }; invoke(fizz_buzz, std::rand()); perf::log(profiler[]); perf::verify(profiler[perf::stat::tsc] > 0ns); perf::analyzer analyzer{ perf::mca::assembly, perf::mca::timeline }; analyzer << profiler[perf::trace::instructions]; perf::log(analyzer[]); perf::verify(analyzer[perf::mca::timeline][0u].cycle_dispatched); perf::verify(analyzer[perf::mca::assembly][0u].contains("add")); }/** * Benchmarking */ import perf; // #include <perf> int main() { perf::runner bench{perf::bench::latency{}}; static constexpr auto fizz_buzz = [](int n) { if (n % 15 == 0) { return "FizzBuzz"; } else if (n % 3 == 0) { return "Fizz"; } else if (n % 5 == 0) { return "Buzz"; } else { return "Unknown"; } }; bench(fizz_buzz, 15); bench(fizz_buzz, 3); bench(fizz_buzz, 5); bench(fizz_buzz, perf::data::unpredictable<int>); perf::report(bench[perf::stat::tsc, perf::bench::operations]); perf::plot::bar(bench[perf::stat::tsc]); perf::annotate<perf::vsplit>(bench[perf::mca::assembly]); }Build & Test
# module clang++ -std=c++23 -O3 -I. --precompile perf.cppm # -DNTEST disables compile-time tests clang++ -std=c++23 -O3 -fprebuilt-module-path=. perf.pcm <source_file> -lLLVM -lipt # header $CXX -std=c++23 -O3 -I. <source_file> -lLLVM -lipt # -DNTEST disables compile-time tests.github/scripts/tune.sh # See #FAQ for moreExport & Share
./a.out | .github/scripts/export.sh html | gh gist create --public
scripts/export.sh-html,markdown,notebook
gh-apt-get install gh
Configuration
/** * PERF version (read-only) # https://semver.org */ #define PERF (MAJOR, MINOR, PATCH) // ex. (1, 0, 0)/** * GNU # default: deduced based on `__GNUC__` * - 0 not compatible * - 1 compatible */ #define PERF_GNU 0/1 /** * Linux # default: deduced based on `__linux__` * - 0 not supported * - 1 supported */ #define PERF_LINUX 0/1 /** * UEFI # default: 0 * - 0 not supported * - 1 supported */ #define PERF_UEFI 0/1 /** * LLVM # default: deduced based on `llvm-dev` headers * - 0 not supported * - 1 supported */ #define PERF_LLVM 0/1 /** * Intel Processor Trace # default: deduced based on `intel_pt` headers * - 0 not supported * - 1 supported */ #define PERF_INTEL 0/1 /** * I/O support # default: 1 * - 0 not compiled * - 1 supported (`log, json, report, annotate, plot`) */ #define PERF_IO 0/1 /** * tests # default: not-defined * - defined: disables all compile-time, run-time tests * - not-defined: compile-time tests executed, * run-time tests available by `perf::self::test()` API */ #define NTEST/** * gnuplot terminal # see `gnuplot -> set terminal` # default: 'sixel' * - 'sixel' # console image # https://www.arewesixelyet.com * - 'wxt' # popup window * - 'canvas' # html * - 'dumb size 150,25 ansi' # console with colors * - 'dumb size 80,25' # console */ ENV:PERF_IO_PLOT_TERM /** * style # default: dark * - light * - dark */ ENV:PERF_IO_PLOT_STYLEInfo/Core
namespace perf::info { /** * static_assert(version().major == 1); * static_assert(version().minor == 0); * static_assert(version().patch == 0); */ inline constexpr auto version = [] -> sem_ver; } // namespace perf::infonamespace perf::info::compiler { /** * verify(name() == "clang"s); */ inline constexpr auto name = [] -> std::string_view; /** * static_assert(version().major == 20); * static_assert(version().minor == 0); * static_assert(version().patch == 0); */ inline constexpr auto version = [] -> sem_ver; } // namespace perf::info::compiler// perf::info::cpu::name assert(perf::info::cpu::name() == "12th Gen Intel(R) Core(TM) i7-12650"s); // perf::info::cpu::code_name assert(perf::info::cpu::code_name() == "alderlake"s); // perf::info::cpu::version assert(perf::info::cpu::version().family == 6); assert(perf::info::cpu::version().model == 154); assert(perf::info::cpu::version().stepping == 3); // perf::info::cpu::dispatch_width assert(perf::info::cpu::dispatch_width() == 6); // perf::info::cpu::features assert(perf::info::cpu::features() == std::vector{"avx", "avx2", "bmi", ...});// info::memory::icache assert(perf::info::memory::icache() == std::map{{level::L1, {.size = 448KiB, .line_size = 64, .assoc = 8}}} ); // info::memory::dcache assert(perf::info::memory::dcache() == std::map{ {level::L1, {.size = 416KiB, .line_size = 64, .assoc = 8}}, ...}, {level::L2, {.size = 9.5MiB, .line_size = 64, .assoc = 12}}, ...}, {level::L3, {.size = 24Mib, .line_size = 64, .assoc = 12}}, ...}, });// info::sys::name assert(perf::info::sys::name() == "linux"s); // info::sys::triple assert(perf::info::sys::triple() == "x86_64-pc-linux-gnu"s); // info::sys::page_size assert(perf::info::sys::page_size() == 4096b);// info::proc::name assert(perf::info::proc::self::name() == "/full/path/example.out"s); // info::proc::base_address assert(perf::info::proc::self::base_address());// info::bin::addr_to_fn_name static auto fn = [] {}; auto&& fn_name = perf::info::bin::addr_to_fn_name( 
perf::info::proc::self::name(), std::uint64_t(&fn) - perf::info::proc::self::base_address() ); assert(fn_name.has_value() and *fn_name == "fn"s); // info::bin::addr_to_name static auto var = 0; auto&& var_name = perf::info::bin::addr_to_name( perf::info::proc::self::name(), std::uint64_t(&var) - perf::info::proc::self::base_address() ); assert(var_name.has_value() and *var_name == "var"s); // info::bin::addr_to_line # requires debug symbols (-g) label:; auto&& source = perf::info::bin::addr_to_line( perf::info::proc::self::name(), std::uint64_t(&&label) - perf::info::proc::self::base_address() ); assert(source.has_value() and source->contains("label:;"));// code::align perf::code::align<std::align_val_t(64u)>(); for (...) { } // code::label perf::code::label<"begin">(); // begin: perf::code::label<"end">(); // end: assert(perf::code::labels["begin"] != perf::code::labels["end"]);// compiler::prevent_reorder # std::atomic_signal_fence perf::compiler::prevent_reorder(); // copmiler::prevent_elision int i{}; assert(perf::compiler::prevent_elision(i++)); // copiler::is_elided assert(perf::compiler::is_elided([] { })); assert(perf::compiler::is_elided([] { int i{}; i++; })); assert(not perf::compiler::is_elided([] { int i{}; perf::compiler::prevent_elision(i++); }));// cpu::pipeline::flush perf::cpu::pipeline::flush();// memory::align/memory::is_aligned auto aligned_addr = perf::memory::align<perf::memory::direction::up>( addr, std::align_val_t(64u) ); assert(perf::memory::is_aligned(aligned_addr, std::align_val_t(64u))); // memory::synchronize # std::atomic_thread_fence perf::memory::synchronize(); // memory::prefetch perf::memory::prefetch<perf::memory::operation::write, perf::memory::locality::high>(addr); // memory::lock # scoped{mlockall, munlockall} { perf::memory::lock _; } // memory::protect const std::array add{ // x86-64 0x89, 0xf8, // mov eax, edi 0x01, 0xf0, // add eax, esi 0xc3 // ret }; perf::memory::protect( std::span(add), perf::memory::protection::read 
| perf::memory::protection::write | perf::memory::protection::exec ); assert(invoke(add, 1, 2) == 3); assert(invoke(add, 2, 3) == 5); // memory::pollute # pollutes memory by making allocations perf::memory::pollute(1024u); // memory::pre_fault # touches all pages used by data perf::memory::pre_fault(std::span(...)); // memory::flush # clears all cache lines used by data perf::memory::cache::flush(std::span(...)); // sys::affinity perf::thread::affinity::set(perf::thread::self, 2u); assert(2u == perf::thread::affinity::get(perf::thread::self)); Profiling/Analyzing
Setup Guide
How to setup
perfdocker?docker build -t perf .docker run \ -it \ --privileged \ --network=host \ -e DISPLAY=${DISPLAY} \ -v ${PWD}:${PWD} \ -w ${PWD} \ perfHow to install
perf dependencies? apt-get install linux-tools-common # linux-perf (perf::stat/perf::record) apt-get install llvm-dev # llvm (perf::mc/perf::mca) apt-get install libipt-dev # libipt (perf::trace) apt-get install gnuplot # (perf::plot) How to setup
linux performance counters?.github/scripts/setup.sh --perf # --rdpmc --max-sample-rate 10000sudo mount -o remount,mode=755 /sys/kernel/debug sudo mount -o remount,mode=755 /sys/kernel/debug/tracing sudo chown `whoami` /sys/kernel/debug/tracing/uprobe_events sudo chmod a+rw /sys/kernel/debug/tracing/uprobe_events echo 0 | sudo tee /proc/sys/kernel/kptr_restrict echo -1 | sudo tee /proc/sys/kernel/perf_event_paranoid echo 1000 | sudo tee /proc/sys/kernel/perf_event_max_sample_rateecho 2 | sudo tee /sys/devices/cpu_core/rdpmcHow to reduce
execution variability?.github/scripts/tune.sh
pyperf - pip3 install pyperf sudo pyperf system tune sudo pyperf system show sudo pyperf system reset # Set Process CPU Affinity (apt install util-linux) taskset -c 0 ./a.out # Set Process Scheduling Priority (apt install coreutils) nice -n -20 taskset -c 0 ./a.out # -20..19 (most..less favorable to the process) # Disable CPU Frequency Scaling (apt install cpufrequtils) sudo cpupower frequency-set --governor performance # cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor # Disable Address Space Randomization echo 0 > /proc/sys/kernel/randomize_va_space # Disable Processor Boosting echo 0 | sudo tee /sys/devices/system/cpu/cpufreq/boost # Disable Turbo Mode echo 1 > /sys/devices/system/cpu/intel_pstate/no_turbo # Disable Hyperthreading/SMT echo off | sudo tee /sys/devices/system/cpu/smt/control # Restrict memory to a single socket numactl -m 0 -N 0 ./a.out # Enable Huge Pages sudo numactl --cpunodebind=1 --membind=1 hugeadm \ --obey-mempolicy --pool-pages-min=1G:64 sudo hugeadm --create-mounts # Enable Kernel Mode Task-Isolation (https://lwn.net/Articles/816298) # cat /sys/devices/system/cpu/isolated isolcpus=<cpu number>,...,<cpu number> # Disable P-states and C-states # cat /sys/devices/system/cpu/intel_pstate/status idle=poll intel_pstate=disable intel_idle.max_cstate=0 processor.max_cstate=1 # Disable NMI watchdog # cat /proc/sys/kernel/nmi_watchdog nmi_watchdog=0 clang++ -std=c++20 -I. \ -target x86_64-pc-win32-coff \ -fno-stack-protector \ -fshort-wchar \ -mno-red-zone \ -c uefi.cpp -o uefi.o lld-link \ -filealign:16 \ -subsystem:efi_application \ -nodefaultlib -dll \ -entry:efi_main \ -out:BOOTX64.EFI \ uefi.o mkdir -p efi/boot && cp BOOTX64.EFI /usr/share/ovmf/OVMF.fd efi/boot qemu-system-x86_64 \ -drive if=pflash,format=raw,file=efi/boot/OVMF.fd \ -drive format=raw,file=fat:rw:. \ -net none Usage Guide
How to compile
perfwithmodules?clang++ -std=c++23 -O3 -I. --precompile perf.cppm clang++ -std=c++23 -O3 -fprebuilt-module-path=. perf.pcm <source_file> -lLLVM -liptimport perf;How to change
assemblysyntax?perf::llvm llvm{ {.syntax = perf::arch::syntax::att} // default: intel };How to
analyzefor a different architecture?perf::llvm llvm{ .triple = "x86_64-pc-linux-gnu" // see `llvm-llc` for details };Which
terminalcan display images?Any terminal with sixel support - https://www.arewesixelyet.com
(Visual Studio Code images support in terminal - Terminal -> Enable images option) How to change plotting
terminal? PERF_IO_PLOT_TERM='sixel' # terminal - sixel PERF_IO_PLOT_TERM='dumb size 80,25' # terminal ascii PERF_IO_PLOT_TERM='dumb size 150,25 ansi' # terminal ansi PERF_IO_PLOT_TERM='wxt' # popup windows PERF_IO_PLOT_TERM='canvas' # html PERF_IO_PLOT_TERM='png' # png gnuplot: set terminal # available options How to change plot style?
PERF_IO_PLOT_STYLE='dark' # dark - default PERF_IO_PLOT_STYLE='light' # lightHow to save plot?
perf::plot::gnuplot plt{{.term = "png"}}; plt.send("set output 'output.png'"); perf::plot::bar(plt, ...);How to
exportresults?./a.out 2>&1 | .github/scripts/export.sh markdown > results.md ./a.out 2>&1 | .github/scripts/export.sh notebook > results.ipynb ./a.out 2>&1 | .github/scripts/export.sh html > results.htmlHow to
shareresults?
gh-apt-get install gh# https://jbt.github.io/markdown-editor gh gist create --public --web results.md# https://jupyter.org gh gist create --public --web results.ipynb# https://htmlpreview.github.io gh gist create --public --web results.htmlHow to write custom
profiler?struct my_profiler { constexpr auto start(); constexpr auto stop(); [[nodiscard]] constexpr auto operator[](Ts...) const; };static_assert(perf::profiler_like<my_profiler>);perf::runner bench{ [](auto&& fn, auto&&... ts) { my_profiler profiler{}; profiler.start(); perf::compiler::prevent_elision(fn(ts...)); profiler.stop(); } };How to integrate with
profilingtools?
prof supports the following profilers
linux-perf - apt-get install linux-tools-common; intel-vtune - apt-get install intel-oneapi-vtune; amd-uprof - https://www.amd.com/en/developer/uprof.html#downloads; gperftools - apt-get install google-perftools; llvm-xray - apt-get install llvm; callgrind - apt-get install valgrind
perf::runner bench{ [](auto&& fn, auto&&... ts) { prof::callgrind profiler{"callgrind"}; profiler.start(); perf::compiler::prevent_elision(fn(ts...)); profiler.stop(); } };bench(fn, ts...);valgrind --tool=callgrind --instr-atstart=no ./a.out kcachegrind callgrind.*How to integrate with
unit-testingframeworks?import perf; import ut; // https://github.com/qlibs/ut int main() { perf::runner bench{perf::bench::latency{}}; perf::scoped _ { .on_exit = [&] { perf::report(bench[perf::stat::cpu_time]); } }; "benchmark1"_test = [] { bench(fn1, ts1...); }; "benchmark1"_test = [] { bench(fn2, ts2...); }; }How
perftests are working?
compile-time tests are executed upon include/import (enabled by default)
run-time/sanity checktests can be executed at run-timeint main() { perf::self::test({.verbose = true}); // run-time/sanity check tests }
-DNTEST can be used to disable tests (not recommended) $CXX -DNTEST ... # tests will NOT be compiled
perftests execution model#ifndef NTEST "perf"_suite = [] { "run-time and compile-time"_test = [] constexpr { expect(3 == accumulate({1, 2, 3}, 0)); }; "run-time"_test = [] mutable { expect(std::rand() >= 0); }; "compile-time"_test = [] consteval { expect(sizeof(int) == sizeof(0)); }; }; #endifHow to integrate with
jupyter?
jupyter(apt install jupyter) can be used for data analysis (python)int main() { // ... perf perf::json("perf.json"); }# notebook.ipynb import pandas as pd df = pd.read_json("perf.json") print(df.head())jupyter notebook notebook.ipynbWhat is the difference between
latencyandthroughput?
latency is the time it takes for a single operation to complete (ns)
throughput is the total number of operations or tasks completed in a given amount of time (op/s) What is
top-down microarchitecture analysismethod?https://www.intel.com/content/www/us/en/docs/vtune-profiler/cookbook/2023-0/top-down-microarchitecture-analysis-method.html
https://github.com/andikleen/pmu-tools/wiki/toplev-manualWhat are
performance compilation flags? -O1 # optimizations (O1) [0] -O2 # optimizations (O1 + O2) [0] -O3 # optimizations (O1 + O2 + O3) [0] -march=native # architecture specific [1] -DNDEBUG # disables asserts, etc. -fno-omit-frame-pointer # keeps the frame pointer in a register -ffast-math # [unsafe] faster but non-conforming math [2] -fcf-protection=none # [unsafe] stops emitting `endbr64` [0] https://gcc.gnu.org/onlinedocs/gcc/Optimize-Options.html
[1] https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html
[2] https://gcc.gnu.org/wiki/FloatingPointMathWhat are
performancecompiler attributes?
gnu::target[[gnu::target("avx2")]] [[gnu::target("bmi")]]
gnu::optimize [[gnu::optimize("O3")]] [[gnu::optimize("ffast-math")]] https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html
MIT/Apache2+LLVM
license namespace guard description MIT perf::*- https://opensource.org/license/mit Apache2+LLVM perf::mca::*PERF_LLVM == 1https://llvm.org/LICENSE.txt
