Analysis of ptrace internals in linux kernel

First, running “nm vmlinux | grep ptrace” gives the following (for the 3.2.0-32 kernel on Ubuntu 12.04):

c1587edd t __ptrace_detach.part.5
c1064ff0 T __ptrace_link
c1065230 T __ptrace_may_access
c1065040 T __ptrace_unlink
c127a090 t apparmor_ptrace_access_check
c127a060 t apparmor_ptrace_traceme
c12447f0 T cap_ptrace_access_check
c1244870 T cap_ptrace_traceme
c1016e30 T flush_ptrace_hw_breakpoint
c1065990 T generic_ptrace_peekdata
c10659e0 T generic_ptrace_pokedata
c137c580 T proc_ptrace_connector
c1065130 T ptrace_check_attach
c101b540 T ptrace_disable
c106bc50 t ptrace_do_notify
c10660e0 T ptrace_get_breakpoints
c1065310 T ptrace_may_access
c101b0c0 t ptrace_modify_breakpoint.isra.16
c106c950 T ptrace_notify
c1066140 T ptrace_put_breakpoints
c1065490 T ptrace_readdata
c1065a20 T ptrace_request
c1064f40 t ptrace_resume
c1813448 d ptrace_scope
c101b2a0 T ptrace_set_debugreg
c106ba60 t ptrace_stop
c15880b5 t ptrace_trap_notify
c1064f20 t ptrace_trapping_sleep_fn
c101a870 t ptrace_triggered
c1065580 T ptrace_writedata
c1245e80 T security_ptrace_access_check
c1245ec0 T security_ptrace_traceme
c124bb50 t selinux_ptrace_access_check
c124c7f0 t selinux_ptrace_traceme
c1264410 t smack_ptrace_access_check
c1264390 t smack_ptrace_access_check.part.32
c1262e20 t smack_ptrace_traceme
c1262da0 t smack_ptrace_traceme.part.31
c127d6e0 T yama_ptrace_access_check

Then, focusing on the “ptrace_*” functions above, I wrote the following script:

#!/bin/bash
set -x
echo 0 >/debug/tracing/tracing_enabled
echo 'ptrace_*' > /debug/tracing/set_ftrace_filter
echo function >/debug/tracing/current_tracer
echo 1 >/debug/tracing/tracing_enabled

gdb /bin/ls <<EOF
run
EOF

echo 0 >/debug/tracing/tracing_enabled
cat /debug/tracing/trace

The result is (after lots of truncation):

+ cat /debug/tracing/trace
# tracer: function
#
# TASK-PID CPU# TIMESTAMP FUNCTION
# | | | | |
iconv-9761 [003] 15007.702161: ptrace_put_breakpoints <-do_exit
ls-9762 [001] 15007.743735: ptrace_signal.isra.26 <-get_signal_to_deliver
ls-9762 [001] 15007.743738: ptrace_stop <-ptrace_signal.isra.26
gdb-9760 [000] 15007.743778: ptrace_check_attach <-sys_ptrace
gdb-9760 [000] 15007.743781: ptrace_request <-arch_ptrace
gdb-9760 [000] 15007.743785: ptrace_check_attach <-sys_ptrace
gdb-9760 [000] 15007.743869: ptrace_may_access <-do_task_stat
gdb-9760 [000] 15007.743918: ptrace_check_attach <-sys_ptrace
gdb-9760 [000] 15007.743918: ptrace_request <-arch_ptrace
gdb-9760 [000] 15007.743918: ptrace_resume <-ptrace_request
ls-9762 [000] 15007.747867: ptrace_signal.isra.26 <-get_signal_to_deliver
ls-9762 [000] 15007.747868: ptrace_stop <-ptrace_signal.isra.26
gdb-9760 [001] 15007.747885: ptrace_check_attach <-sys_ptrace

gdb-9763 [000] 15007.748708: ptrace_notify <-do_fork
gdb-9763 [000] 15007.748708: ptrace_do_notify <-ptrace_notify
gdb-9763 [000] 15007.748709: ptrace_stop <-ptrace_do_notify
gdb-9764 [001] 15007.748712: ptrace_signal.isra.26 <-get_signal_to_deliver
gdb-9764 [001] 15007.748713: ptrace_stop <-ptrace_signal.isra.26
gdb-9760 [000] 15007.748715: ptrace_check_attach <-sys_ptrace
gdb-9760 [000] 15007.748715: ptrace_request <-arch_ptrace
gdb-9760 [000] 15007.748716: ptrace_check_attach <-sys_ptrace

gdb-9760 [000] 15007.749644: ptrace_request <-arch_ptrace
gdb-9760 [000] 15007.749645: ptrace_regset <-ptrace_request
gdb-9760 [000] 15007.749849: ptrace_check_attach <-sys_ptrace
gdb-9760 [000] 15007.749885: ptrace_may_access <-mm_access.part.6
gdb-9760 [000] 15007.749894: ptrace_may_access <-mm_access.part.6
gdb-9760 [000] 15007.749907: ptrace_may_access <-mm_access.part.6
gdb-9760 [000] 15007.749919: ptrace_may_access <-mm_access.part.6

gdb-9760 [002] 15007.764082: ptrace_request <-arch_ptrace
gdb-9760 [002] 15007.764084: ptrace_check_attach <-sys_ptrace
gdb-9760 [002] 15007.764100: ptrace_may_access <-do_task_stat
gdb-9760 [002] 15007.764121: ptrace_check_attach <-sys_ptrace
gdb-9760 [002] 15007.764185: ptrace_request <-arch_ptrace
gdb-9760 [002] 15007.764185: ptrace_resume <-ptrace_request
ls-9762 [001] 15007.766752: ptrace_put_breakpoints <-do_exit
gdb-9760 [002] 15007.767883: ptrace_put_breakpoints <-do_exit

From the above, “ftrace” clearly shows us who the caller and callee are when “gdb /bin/ls” is used to trigger the ptrace_*() APIs. Analysing the source code:

From userspace, the ptrace() syscall maps directly to sys_ptrace() in kernel/ptrace.c, as shown below:

SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
unsigned long, data)
{
struct task_struct *child;
long ret;

if (request == PTRACE_TRACEME) {
ret = ptrace_traceme();
if (!ret)
arch_ptrace_attach(current);
goto out;
}

From here onwards, the code delves into architecture-specific APIs. For example, on x86 these live in arch/x86/kernel/ptrace.c; below is the API that sets the x86 hardware debug register 7:

/*
* Handle ptrace writes to debug register 7.
*/
static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data)
{
struct thread_struct *thread = &(tsk->thread);
unsigned long old_dr7;
int i, orig_ret = 0, rc = 0;
int enabled, second_pass = 0;
unsigned len, type;
struct perf_event *bp;

Other hardware-related APIs include:

int regs_query_register_offset(const char *name)
const char *regs_query_register_name(unsigned int offset)
static inline bool invalid_selector(u16 value)
static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno)
static u16 get_segment_reg(struct task_struct *task, unsigned long offset)
static int set_segment_reg(struct task_struct *task,
static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long offset)
static u16 get_segment_reg(struct task_struct *task, unsigned long offset)
static int set_segment_reg(struct task_struct *task,
static unsigned long get_flags(struct task_struct *task)
static int set_flags(struct task_struct *task, unsigned long value)
static int putreg(struct task_struct *child,
static unsigned long getreg(struct task_struct *task, unsigned long offset)
static int genregs_get(struct task_struct *target,
static int genregs_set(struct task_struct *target,
static void ptrace_triggered(struct perf_event *bp,
static unsigned long ptrace_get_dr7(struct perf_event *bp[])
ptrace_modify_breakpoint(struct perf_event *bp, int len, int type,
static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data)
static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val)
static int ioperm_active(struct task_struct *target,
static int ioperm_get(struct task_struct *target,
void ptrace_disable(struct task_struct *child)
long arch_ptrace(struct task_struct *child, long request,
static int putreg32(struct task_struct *child, unsigned regno, u32 value)
static int getreg32(struct task_struct *child, unsigned regno, u32 *val)
static int genregs32_get(struct task_struct *target,
static int genregs32_set(struct task_struct *target,
long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
void update_regset_xstate_info(unsigned int size, u64 xstate_mask)
const struct user_regset_view *task_user_regset_view(struct task_struct *task)
static void fill_sigtrap_info(struct task_struct *tsk,
void user_single_step_siginfo(struct task_struct *tsk,
void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,

Essentially, two hardware features are needed: single-stepping, and debug registers for registering the different types of breakpoints. The debug registers also provide additional information after the breakpoints are reached during execution.

It is also inside arch/x86/kernel/ptrace.c that ptrace_request() (which is defined in kernel/ptrace.c) is called, and this is where the put_xxxx() functions are used to put data into userspace before returning to userspace.

To summarize, the entire ptrace operation inside the kernel starts here, in kernel/ptrace.c:

SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
unsigned long, data)
{
struct task_struct *child;
long ret;

Inside it, ptrace_attach() is called to set up all the architecture-independent kernel information, and then arch_ptrace_attach() (and arch_ptrace()) are called to set up the architecture-dependent information:

if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
ret = ptrace_attach(child, request, data);
/*
* Some architectures need to do book-keeping after
* a ptrace attach.
*/
if (!ret)
arch_ptrace_attach(child);
goto out_put_task_struct;
}

ret = ptrace_check_attach(child, request == PTRACE_KILL ||
request == PTRACE_INTERRUPT);
if (ret < 0)
goto out_put_task_struct;

ret = arch_ptrace(child, request, addr, data);

out_put_task_struct:
put_task_struct(child);

And finally, the put_xxxx() APIs are used to copy the results directly into userspace memory. Throughout, security checks are done inside ptrace_check_attach(), ptrace_may_access(), security_ptrace_access_check(), etc.

The following CVE:

http://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2010-0729

is an example where security was violated in the kernel because such checks were occasionally missed.

There are other good documentation sources on ptrace:

http://lwn.net/Articles/446593/

http://lwn.net/Articles/371501/

http://lwn.net/Articles/432114/

Advertisements

Leave a Reply

Please log in using one of these methods to post your comment:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s

%d bloggers like this: