|
156 | 156 | > import perf; // #include <perf> |
157 | 157 | > |
158 | 158 | > int main() { |
159 | | - > erf::profiler profiler{...}; |
| 159 | + > perf::profiler profiler{ |
| 160 | + > perf::stat::tsc, perf::stat::cycles, perf::trace::instructions |
| 161 | + > }; |
160 | 162 | > |
161 | | - > constexpr auto invoke = [](auto& profiler, auto&& fn, auto&&... ts) { |
| 163 | + > auto invoke = [&](auto&& fn, auto&&... ts) { |
162 | 164 | > profiler.start(); |
163 | 165 | > perf::compiler::prevent_elision(fn(ts...)); |
164 | 166 | > profiler.stop(); |
|
169 | 171 | > perf::log(profiler[]); |
170 | 172 | > perf::verify(profiler[perf::stat::tsc] > 0ns); |
171 | 173 | > |
172 | | - > perf::analyzer analyzer{perf::mca::assembly, perf::mca::timeline}; |
| 174 | + > perf::analyzer analyzer{ |
| 175 | + > perf::mca::assembly, perf::mca::timeline |
| 176 | + > }; |
173 | 177 | > analyzer << profiler[perf::trace::instructions]; |
174 | 178 | > |
175 | 179 | > perf::log(analyzer[]); |
|
528 | 532 | > <details> |
529 | 533 | > <summary>Profiling/Analyzing</summary> |
530 | 534 | > |
531 | | - > ```cpp |
532 | | - > // stat::timer |
533 | | - > perf::stat::timer t{perf::stat::steady_time, perf::stat::cpu}; |
534 | | - > |
535 | | - > t.start(); |
536 | | - > fn(); |
537 | | - > t.stop(); |
538 | | - > |
539 | | - > assert(t[perf::stat::steady_time] > 0ns); |
540 | | - > assert(t[perf::stat::cpu_time] > 0ns); |
541 | | - > |
542 | | - > // `t[]` - returns std::tuple of all timers |
543 | | - > assert(std::get<0u>(t[]) > 0ns); // steady_time |
544 | | - > assert(std::get<1u>(t[]) > 0ns); // cpu_time |
545 | | - > ``` |
546 | | - > |
547 | | - > ```cpp |
548 | | - > perf::stat::tsc - time-stamp-counter |
549 | | - > perf::stat::steady_time - monotonic time |
550 | | - > perf::stat::cpu_time - user time + sys time |
551 | | - > perf::stat::thread_time - cpu time for the current thread |
552 | | - > perf::stat::real_time - wall clock time |
553 | | - > ``` |
554 | | - > |
555 | | - > ```cpp |
556 | | - > // stat::counter |
557 | | - > |
558 | | - > // metrics/dsl |
559 | | - > // top_down |
560 | | - > ``` |
561 | | - > |
562 | | - > ```cpp |
563 | | - > // instruction per cycle (ipc) |
564 | | - > ipc = instructions / cycles; |
565 | | - > |
566 | | - > // cycles per instruction (cpi, inverse of ipc) |
567 | | - > cpi = cycles / instructions; |
568 | | - > |
569 | | - > // branch miss rate (branch misses per branch instruction) |
570 | | - > branch_miss_rate = branch_misses / branches; |
571 | | - > |
572 | | - > // cache miss rate (cache misses per cache reference) |
573 | | - > cache_miss_rate = cache_misses / cache_references; |
574 | | - > |
575 | | - > // llc miss rate |
576 | | - > llc_miss_rate = llc_misses / cache_references; |
577 | | - > |
578 | | - > // l1 data cache miss rate |
579 | | - > l1_dcache_miss_rate = l1_dcache_load_misses / l1_dcache_loads; |
580 | | - > |
581 | | - > // l1 instruction cache miss rate |
582 | | - > l1_icache_miss_rate = l1_icache_load_misses / l1_icache_loads; |
583 | | - > |
584 | | - > // dtlb miss rate |
585 | | - > dtlb_miss_rate = dtlb_load_misses / dtlb_loads; |
586 | | - > |
587 | | - > // itlb miss rate |
588 | | - > itlb_miss_rate = itlb_load_misses / itlb_loads; |
589 | | - > |
590 | | - > // stalled cycles rate (frontend) |
591 | | - > frontend_stall_rate = stalled_cycles_frontend / cycles; |
592 | | - > |
593 | | - > // stalled cycles rate (backend) |
594 | | - > backend_stall_rate = stalled_cycles_backend / cycles; |
595 | | - > |
596 | | - > // memory access rate |
597 | | - > memory_stall_ratio = stalled_cycles_backend / cycles; |
598 | | - > |
599 | | - > // overall stall rate |
600 | | - > total_stall_rate = (stalled_cycles_backend + stalled_cycles_frontend) / cycles; |
601 | | - > |
602 | | - > // cpu migrations per cycles |
603 | | - > cpu_migration_rate = cpu_migrations / cycles; |
604 | | - > |
605 | | - > // context switches per cycles |
606 | | - > context_switch_rate = context_switches / cycles; |
607 | | - > |
608 | | - > // page fault rate |
609 | | - > page_fault_rate = faults / cycles; |
610 | | - > |
611 | | - > // page fault rate (major faults per total faults) |
612 | | - > major_fault_rate = major_faults / cycles; |
613 | | - > |
614 | | - > // page fault rate (minor faults per total faults) |
615 | | - > minor_fault_rate = minor_faults / cycles; |
616 | | - > ``` |
617 | | - > |
618 | | - > ```cpp |
619 | | - > // record::sampler |
620 | | - > ``` |
621 | | - > |
622 | | - > ```cpp |
623 | | - > // https://github.com/qlibs/prof |
624 | | - > // prof::callgrind |
625 | | - > // prof::... |
626 | | - > ``` |
627 | | - > |
628 | | - > ```cpp |
629 | | - > // trace::tracer |
630 | | - > auto&& instructions = perf::trace::trace( |
631 | | - > [&] { return fizz_buzz(i); } |
632 | | - > )[perf::trace::instructions]; |
633 | | - > |
634 | | - > perf::analyzer analyzer{perf::mca::assembly}; |
635 | | - > for (auto&& i : (analyzer << instructions)[perf::mca::assembly]) { |
636 | | - > perf::log("{}", i); |
637 | | - > } |
638 | | - > ``` |
639 | | - > |
640 | | - > ```cpp |
641 | | - > // perf::analyzer |
642 | | - > ``` |
643 | | - > |
644 | | - > </details> |
645 | | - > |
646 | | - > <details> |
647 | | - > <summary>Benchmarking</summary> |
648 | | - > |
649 | | - > ```cpp |
650 | | - > // perf::runner |
651 | | - > auto&& runner = [](auto&& fn, auto&&... ts) { |
652 | | - > perf::dataset ds{}; |
653 | | - > setat::timer timer{perf::stat::steady_time}; |
654 | | - > timer.start(); |
655 | | - > for (auto i = 0u; i < 1'000; ++i) { |
656 | | - > compiler::prevent_elision(fn(ts...)); |
657 | | - > } |
658 | | - > timer.stop(); |
659 | | - > ds += timer; |
660 | | - > return ds; |
661 | | - > }; |
662 | | - > |
663 | | - > // `runner` deduces what to run based on the usage of `bench[...]` |
664 | | - > // unless otherwise explicitly specified |
665 | | - > perf::runner bench{perf::named("runner", runner}}; |
666 | | - > |
667 | | - > static auto fizz_buzz = [](int n) { |
668 | | - > if (n % 15 == 0) { |
669 | | - > return "FizzBuzz"; |
670 | | - > } else if (n % 3 == 0) { |
671 | | - > return "Fizz"; |
672 | | - > } else if (n % 5 == 0) { |
673 | | - > return "Buzz"; |
674 | | - > } else { |
675 | | - > return "Unknown"; |
676 | | - > } |
677 | | - > }; |
678 | | - > |
679 | | - > bench(fizz_buzz, 15); |
680 | | - > bench(fizz_buzz, 3); |
681 | | - > bench(fizz_buzz, 5); |
682 | | - > |
683 | | - > perf::report(bench[perf::stat::steady_time]); |
684 | | - > ``` |
685 | | - > |
686 | | - > ```cpp |
687 | | - > // perf::data |
688 | | - > bench(fizz_buzz, perf::data::sequence<int>{{3,5,15}}); |
689 | | - > bench(fizz_buzz, perf::data::uniform<int>{.min = 0, .max = 15}); |
690 | | - > // choice |
691 | | - > ``` |
692 | | - > |
693 | | - > ```cpp |
694 | | - > // perf::bench::latency |
695 | | - > auto add = [](int a, int b) { return a + b; }; |
696 | | - > auto sub = [](int a, int b) { return a - b; }; |
697 | | - > auto mult = [](int a, int b) { return a * b; }; |
698 | | - > auto div = [](int a, int b) { return a / b; }; |
699 | | - > |
700 | | - > perf::runner bench{perf::bench::latency{}}; |
701 | | - > |
702 | | - > bench(add, 0, 0); |
703 | | - > bench(sub, 0, 0); |
704 | | - > bench(mult, 0, 0); |
705 | | - > bench(div, 0, 0); |
706 | | - > |
707 | | - > using perf::metric::operator/; |
708 | | - > auto ops = perf::bench::operations; |
709 | | - > perf::report(bench[perf::stat::tsc / ops, perf::stat::cycles / ops]); |
710 | | - > ``` |
711 | | - > |
712 | | - > ```cpp |
713 | | - > // perf::bench::throughput |
714 | | - > ``` |
715 | | - > |
716 | | - > ```cpp |
717 | | - > latency = time / operations; |
718 | | - > throughput = operations / time; |
719 | | - > inverse_throughput = time / operations; |
720 | | - > ``` |
721 | | - > |
722 | | - > ```cpp |
723 | | - > // perf::bench::baseline |
724 | | - > // perf::bench::debug |
725 | | - > ``` |
726 | | - > |
727 | | - > ```cpp |
728 | | - > // perf::plot::hist |
729 | | - > // perf::plot::bar |
730 | | - > // perf::plot::box |
731 | | - > // perf::plot::line |
732 | | - > // perf::plot::ecdf |
733 | | - > // perf::plot::complexity |
734 | | - > ``` |
735 | | - > |
736 | 535 | > </details> |
737 | 536 | > |
738 | 537 | > [`Synopsis`](https://github.com/qlibs/perf/blob/main/perf.cppm#L39) |
|
0 commit comments