Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
50 commits
Select commit Hold shift + click to select a range
2a918e4
Allow Linux perf profiler to see Python calls
pablogsal Aug 7, 2022
cea1420
Add test
pablogsal Aug 20, 2022
4107c53
Update PCbuild/_freeze_module.vcxproj.filters
pablogsal Aug 20, 2022
5e34e66
munmap pages on shutdown, keep FILE open
tiran Aug 21, 2022
a26a850
Fix tests
pablogsal Aug 21, 2022
8170b24
Skip tests if sanitizer is active
pablogsal Aug 21, 2022
9df1c93
Add ARM64 code generated by aarch64-linux-gnu-gcc
tiran Aug 21, 2022
d8f396d
Address review comments
pablogsal Aug 21, 2022
d35c5d7
Secure fopen, use unraisable, continue on error
tiran Aug 22, 2022
2664b12
cleanup resources, set to uninit
tiran Aug 22, 2022
e6c365a
Allow to set custom callbacks
pablogsal Aug 22, 2022
5513fb1
Add comment to asm file
pablogsal Aug 22, 2022
76c7dc0
fixup! Merge pull request #36 from tiran/perf-file
pablogsal Aug 22, 2022
a545b3c
Add comments to the perf_trampoline file and format file
pablogsal Aug 22, 2022
5130c8d
Correct News entry
pablogsal Aug 22, 2022
991366b
Update Lib/test/test_perf_profiler.py
pablogsal Aug 22, 2022
0a0e53d
Rename perf macro
pablogsal Aug 22, 2022
7ea3371
Fix some typos
pablogsal Aug 22, 2022
680db66
Improve perf profiler tests
tiran Aug 22, 2022
1263a29
Add guard for initialization
pablogsal Aug 22, 2022
a42bde5
Add acks
pablogsal Aug 22, 2022
b780d2a
Initialize perf file lazily
pablogsal Aug 22, 2022
04bf416
Address review comments
pablogsal Aug 22, 2022
7558df2
Complain if there is already a evaluator frame when deactivating/acti…
pablogsal Aug 22, 2022
d1ebc88
Fix some errors on CI
pablogsal Aug 22, 2022
a83a31b
Reorder arguments to speed up trampoline
tiran Aug 22, 2022
0febd84
Preserve frame pointer
pablogsal Aug 22, 2022
dc5a6a5
Support perf backend and better handle forks
pablogsal Aug 22, 2022
be72b92
Fix more fork problems
pablogsal Aug 22, 2022
b5739f4
Update Lib/test/test_perf_profiler.py
pablogsal Aug 22, 2022
04c0c14
Handle missing backends
pablogsal Aug 22, 2022
e810ce6
Update Lib/test/test_perf_profiler.py
pablogsal Aug 22, 2022
bc8bf4e
clean up perf files
pablogsal Aug 22, 2022
0252845
Update Misc/NEWS.d/next/Core and Builtins/2022-08-20-18-36-40.gh-issu…
pablogsal Aug 22, 2022
264bed7
Test fork support, fix some fork problems and improve test file
pablogsal Aug 23, 2022
a31a498
Add more tests
pablogsal Aug 23, 2022
f591e8d
Update Objects/perf_trampoline.c
pablogsal Aug 23, 2022
0af2a08
make argument mandatory
pablogsal Aug 23, 2022
861ae09
Use struct for perf callbacks
tiran Aug 23, 2022
3058cf0
Rename macro to PY_HAVE_PERF_TRAMPOLINE
tiran Aug 23, 2022
07ee991
Merge pull request #39 from tiran/perf_callback_struct
pablogsal Aug 23, 2022
be612a9
Allow gdb to unwind
pablogsal Aug 23, 2022
f4e3fff
Merge remote-tracking branch 'upstream/main' into perf
pablogsal Aug 25, 2022
c27f8b1
Add docs
pablogsal Aug 25, 2022
e27a2c4
fixup! Add docs
pablogsal Aug 25, 2022
81c7f4b
fixup! fixup! Add docs
pablogsal Aug 25, 2022
ef0650b
Update sys API names in the NEWS entry.
gpshead Aug 29, 2022
d8932d2
Add environment variable
pablogsal Aug 29, 2022
0f303ff
Merge branch 'main' into perf
pablogsal Aug 29, 2022
e3f846e
Document the env var and the -X option
pablogsal Aug 29, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Add comments to the perf_trampoline file and format file
  • Loading branch information
pablogsal committed Aug 23, 2022
commit a545b3cc7d9a262b6231a7193953c4f8a00037cc
4 changes: 4 additions & 0 deletions Lib/test/test_perf_profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import sys
import sysconfig
import os
from test import support
from test.support.script_helper import make_script
from test.support.os_helper import temp_dir
from test.support import check_sanitizer
Expand All @@ -25,6 +26,9 @@ def get_perf_version():
raise Exception("unable to parse perf version: %r" % version)
return (version, match.group(1))

if not support.has_subprocess_support:
raise unittest.SkipTest("test module requires subprocess")


_, version = get_perf_version()

Expand Down
238 changes: 189 additions & 49 deletions Objects/perf_trampoline.c
Original file line number Diff line number Diff line change
@@ -1,3 +1,128 @@
/*

Perf trampoline instrumentation
===============================

This file contains instrumentation that allows associating
calls to the CPython eval loop back to the names of the Python
functions and filenames being executed.

Many native performance profilers like the Linux perf tools are
only able to 'see' the C stack when sampling from the profiled
process. This means that if we have the following Python code:

import time
def foo(n):
# Some CPU intensive code

def bar(n):
foo(n)

def baz(n):
bar(n)

baz(10000000)

A performance profiler that is only able to see native frames will
produce the following backtrace when sampling from foo():

_PyEval_EvalFrameDefault -----> Evaluation frame of foo()
_PyEval_Vector
_PyFunction_Vectorcall
PyObject_Vectorcall
call_function

_PyEval_EvalFrameDefault ------> Evaluation frame of bar()
_PyEval_EvalFrame
_PyEval_Vector
_PyFunction_Vectorcall
PyObject_Vectorcall
call_function

_PyEval_EvalFrameDefault -------> Evaluation frame of baz()
_PyEval_EvalFrame
_PyEval_Vector
_PyFunction_Vectorcall
PyObject_Vectorcall
call_function

...

Py_RunMain

Because the profiler is only able to see the native frames and the native
function that runs the evaluation loop is the same (_PyEval_EvalFrameDefault)
then the profiler and any reporter generated by it will not be able to
associate the names of the Python functions and the filenames associated with
those calls, rendering the results useless in the Python world.

To fix this problem, we introduce the concept of a trampoline frame. A
trampoline frame is a piece of code that is unique per Python code object and
is executed before entering the CPython eval loop. This piece of code just
calls the original Python evaluation function (_PyEval_EvalFrameDefault) and
forwards all the arguments received. In this way, when a profiler samples
frames from the previous example it will see:

_PyEval_EvalFrameDefault -----> Evaluation frame of foo()
[Jit compiled code 3]
_PyEval_Vector
_PyFunction_Vectorcall
PyObject_Vectorcall
call_function

_PyEval_EvalFrameDefault ------> Evaluation frame of bar()
[Jit compiled code 2]
_PyEval_EvalFrame
_PyEval_Vector
_PyFunction_Vectorcall
PyObject_Vectorcall
call_function

_PyEval_EvalFrameDefault -------> Evaluation frame of baz()
[Jit compiled code 1]
_PyEval_EvalFrame
_PyEval_Vector
_PyFunction_Vectorcall
PyObject_Vectorcall
call_function

...

Py_RunMain

When we generate every unique copy of the trampoline (what here we called "[Jit
compiled code N]") we write the relationship between the compiled code and the
Python function that is associated with it. Every profiler requires this
information in a different format. For example, the Linux "perf" profiler
requires a file in "/tmp/perf-PID.map" (name and location not configurable)
with the following format:

<compiled code address> <compiled code size> <name of the compiled code>

If this file is available when "perf" generates reports, it will automatically
associate every trampoline with the Python function that it is associated with
allowing it to generate reports that include Python information. These reports
then can also be filtered in a way that *only* Python information appears.

Notice that for this to work, there must be a unique copy of the trampoline
per Python code object even if the code in the trampoline is the same. To
achieve this we have an assembly template in Objects/asm_trampoline.S that is
compiled into the Python executable/shared library. This template generates a
symbol that marks the start of the assembly code and another that marks the end
of the assembly code for the trampoline. Then, every time we need a unique
trampoline for a Python code object, we copy the assembly code into a mmaped
area that has executable permissions and we return the start of that area as
our trampoline function.

Asking for a mmap-ed memory area for each trampoline is very wasteful, so we
allocate big arenas of memory in a single mmap call, we populate the entire
arena with copies of the trampoline (this allows us to not have to invalidate
the icache for the instructions in the page) and then we return the next
available chunk every time someone asks for a new trampoline. We keep a linked
list of arenas in case the current memory arena is exhausted and another one is
needed.
*/

#include "Python.h"
#include "pycore_ceval.h"
#include "pycore_frame.h"
Expand All @@ -16,31 +141,37 @@ typedef PyObject *(*py_evaluator)(PyThreadState *, _PyInterpreterFrame *,
int throwflag);
typedef PyObject *(*py_trampoline)(py_evaluator, PyThreadState *,
_PyInterpreterFrame *, int throwflag);
extern void *_Py_trampoline_func_start;
extern void *_Py_trampoline_func_end;

extern void *_Py_trampoline_func_start; // Start of the template of the
// assembly trampoline
extern void *
_Py_trampoline_func_end; // End of the template of the assembly trampoline

struct code_arena_st {
char *start_addr;
char *current_addr;
size_t size;
size_t size_left;
size_t code_size;
struct code_arena_st *prev;
char *start_addr; // Start of the memory arena
char *current_addr; // Address of the current trampoline within the arena
size_t size; // Size of the memory arena
size_t size_left; // Remaining size of the memory arena
size_t code_size; // Size of the code of every trampoline in the arena
struct code_arena_st
*prev; // Pointer to the arena or NULL if this is the first arena.
};

typedef enum {
PERF_STATUS_FAILED = -1,
PERF_STATUS_NO_INIT = 0,
PERF_STATUS_OK = 1,
PERF_STATUS_FAILED = -1, // Perf trampoline is in an invalid state
PERF_STATUS_NO_INIT = 0, // Perf trampoline is not initialized
PERF_STATUS_OK = 1, // Perf trampoline is ready to be executed
} perf_status_t;

typedef struct code_arena_st code_arena_t;

struct trampoline_api_st {
trampoline_state_init init_state;
trampoline_state_write write_state;
trampoline_state_free free_state;
void* state;
trampoline_state_init
init_state; // Callback to initialize the trampoline state
trampoline_state_write
write_state; // Callback to register every trampoline being created
trampoline_state_free free_state; // Callback to free the trampoline state
void *state;
};

typedef struct trampoline_api_st trampoline_api_t;
Expand All @@ -51,7 +182,7 @@ static code_arena_t *code_arena;
static trampoline_api_t trampoline_api;

static FILE *perf_map_file;
void*
void *
_Py_perf_map_get_file(void)
{
if (perf_map_file) {
Expand All @@ -62,7 +193,8 @@ _Py_perf_map_get_file(void)
// Location and file name of perf map is hard-coded in perf tool.
// Use exclusive create flag with nofollow to prevent symlink attacks.
int flags = O_WRONLY | O_CREAT | O_EXCL | O_NOFOLLOW | O_CLOEXEC;
snprintf(filename, sizeof(filename)-1, "/tmp/perf-%jd.map", (intmax_t)pid);
snprintf(filename, sizeof(filename) - 1, "/tmp/perf-%jd.map",
(intmax_t)pid);
int fd = open(filename, flags, 0600);
if (fd == -1) {
perf_status = PERF_STATUS_FAILED;
Expand All @@ -75,16 +207,17 @@ _Py_perf_map_get_file(void)
perf_status = PERF_STATUS_FAILED;
PyErr_SetFromErrnoWithFilename(PyExc_OSError, filename);
close(fd);
_PyErr_WriteUnraisableMsg("Failed to create perf map file handle", NULL);
_PyErr_WriteUnraisableMsg("Failed to create perf map file handle",
NULL);
return NULL;
}
return perf_map_file;
}

int
_Py_perf_map_close(void* state)
_Py_perf_map_close(void *state)
{
FILE *fp = (FILE*)state;
FILE *fp = (FILE *)state;
if (fp) {
return fclose(fp);
}
Expand All @@ -94,23 +227,25 @@ _Py_perf_map_close(void* state)
}

void
_Py_perf_map_write_entry(void* state, const void *code_addr,
unsigned int code_size, PyCodeObject* co)
_Py_perf_map_write_entry(void *state, const void *code_addr,
unsigned int code_size, PyCodeObject *co)
{
assert(state != NULL);
FILE *method_file = (FILE*)state;
const char* entry = PyUnicode_AsUTF8(co->co_qualname);
FILE *method_file = (FILE *)state;
const char *entry = PyUnicode_AsUTF8(co->co_qualname);
if (entry == NULL) {
_PyErr_WriteUnraisableMsg( "Failed to get qualname from code object", NULL);
_PyErr_WriteUnraisableMsg("Failed to get qualname from code object",
NULL);
return;
}
const char* filename = PyUnicode_AsUTF8(co->co_filename);
const char *filename = PyUnicode_AsUTF8(co->co_filename);
if (filename == NULL) {
_PyErr_WriteUnraisableMsg( "Failed to get filename from code object", NULL);
_PyErr_WriteUnraisableMsg("Failed to get filename from code object",
NULL);
return;
}
fprintf(method_file, "%p %x py::%s:%s\n", code_addr,
code_size, entry, filename);
fprintf(method_file, "%p %x py::%s:%s\n", code_addr, code_size, entry,
filename);
fflush(method_file);
}

Expand All @@ -120,12 +255,11 @@ new_code_arena(void)
// non-trivial programs typically need 64 to 256 kiB.
size_t mem_size = 4096 * 16;
assert(mem_size % sysconf(_SC_PAGESIZE) == 0);
char *memory = mmap(NULL, // address
mem_size,
PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS,
-1, // fd (not used here)
0); // offset (not used here)
char *memory =
mmap(NULL, // address
mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS,
-1, // fd (not used here)
0); // offset (not used here)
if (!memory) {
PyErr_SetFromErrno(PyExc_OSError);
_PyErr_WriteUnraisableMsg(
Expand All @@ -147,15 +281,16 @@ new_code_arena(void)
PyErr_SetFromErrno(PyExc_OSError);
munmap(memory, mem_size);
_PyErr_WriteUnraisableMsg(
"Failed to set mmap for perf trampoline to PROT_READ | PROT_EXEC", NULL);
"Failed to set mmap for perf trampoline to PROT_READ | PROT_EXEC",
NULL);
}

code_arena_t *new_arena = PyMem_RawCalloc(1, sizeof(code_arena_t));
if (new_arena == NULL) {
PyErr_NoMemory();
munmap(memory, mem_size);
_PyErr_WriteUnraisableMsg(
"Failed to allocate new code arena struct", NULL);
_PyErr_WriteUnraisableMsg("Failed to allocate new code arena struct",
NULL);
return -1;
}

Expand All @@ -174,8 +309,8 @@ free_code_arenas(void)
{
code_arena_t *cur = code_arena;
code_arena_t *prev;
code_arena = NULL; // invalid static pointer
while(cur) {
code_arena = NULL; // invalid static pointer
while (cur) {
munmap(cur->start_addr, cur->size);
prev = cur->prev;
PyMem_RawFree(cur);
Expand All @@ -195,7 +330,8 @@ code_arena_new_code(code_arena_t *code_arena)
static inline py_trampoline
compile_trampoline(void)
{
if ((code_arena == NULL) || (code_arena->size_left <= code_arena->code_size)) {
if ((code_arena == NULL) ||
(code_arena->size_left <= code_arena->code_size)) {
if (new_code_arena() < 0) {
return NULL;
}
Expand All @@ -208,13 +344,16 @@ static PyObject *
py_trampoline_evaluator(PyThreadState *ts, _PyInterpreterFrame *frame,
int throw)
{
if (perf_status == PERF_STATUS_FAILED || perf_status == PERF_STATUS_NO_INIT) {
if (perf_status == PERF_STATUS_FAILED ||
perf_status == PERF_STATUS_NO_INIT) {
goto default_eval;
}
PyCodeObject *co = frame->f_code;
py_trampoline f = NULL;
_PyCode_GetExtra((PyObject *)co, extra_code_index, (void **)&f);
if (f == NULL) {
// This is the first time we see this code object so we need
// to compile a trampoline for it.
if (extra_code_index == -1) {
extra_code_index = _PyEval_RequestCodeExtraIndex(NULL);
}
Expand All @@ -231,9 +370,10 @@ py_trampoline_evaluator(PyThreadState *ts, _PyInterpreterFrame *frame,
assert(f != NULL);
return f(_PyEval_EvalFrameDefault, ts, frame, throw);
default_eval:
// Something failed, fall back to the default evaluator.
return _PyEval_EvalFrameDefault(ts, frame, throw);
}
#endif // HAVE_PERF_TRAMPOLINE
#endif // HAVE_PERF_TRAMPOLINE

int
_PyIsPerfTrampolineActive(void)
Expand All @@ -245,11 +385,11 @@ _PyIsPerfTrampolineActive(void)
return 0;
}

int _PyPerfTrampoline_SetCallbacks(
trampoline_state_init init_state,
trampoline_state_write write_state,
trampoline_state_free free_state
) {
int
_PyPerfTrampoline_SetCallbacks(trampoline_state_init init_state,
trampoline_state_write write_state,
trampoline_state_free free_state)
{
#ifdef HAVE_PERF_TRAMPOLINE
if (trampoline_api.state) {
Py_FatalError("Trampoline state already initialized");
Expand All @@ -258,7 +398,7 @@ int _PyPerfTrampoline_SetCallbacks(
trampoline_api.init_state = init_state;
trampoline_api.write_state = write_state;
trampoline_api.free_state = free_state;
void* state = trampoline_api.init_state();
void *state = trampoline_api.init_state();
if (state == NULL) {
return -1;
}
Expand Down