eBPF Trace from Kernel to Userspace Gary Lin SUSE Labs Software Engineer Technology Sharing Day 2016
eBPF: Trace from Kernel to Userspace
Gary Lin, SUSE Labs, Software Engineer
Technology Sharing Day
2016
Tracer
tick_nohz_idle_enter
set_cpu_sd_state_idle
up_write
__tick_nohz_idle_enter
ktime_get
uprobe_mmap
read_hpet
vma_set_page_prot
vma_wants_writenotify
rcu_needs_cpu
fput
get_next_timer_interrupt
_raw_spin_lock
hrtimer_get_next_event
_raw_spin_lock_irqsave
_raw_spin_unlock_irqrestore
syscall_trace_leave
_raw_write_unlock_irqrestore
__audit_syscall_exit
path_put
dput
mntput
up_write
rax: 0x0000000000000000
rbx: 0xffff88012b5a5a28
rcx: 0xffff8800987c18e0
rdx: 0x0000000000000000
rsi: 0xffff88012b439f20
rdi: 0xffff88012b464628
rbp: 0xffff8800959e3d98
kprobe
Kernel
Userspace
uprobe
/sys/kernel/debug/tracing/kprobe_events
/sys/kernel/debug/tracing/uprobe_events
eBPF
BPF?
Berkeley Packet Filter
BPF
No RedBPF Program
The BSD Packet Filter: A New Architecture for User-level Packet Capture
December 19, 1992
SCO lawsuit, August 2003
Old
Stable
BPF ASM
ldh [12]
jne #0x800, drop
ldb [23]
jneq #1, drop
# get a random uint32 number
ld rand
mod #4
jneq #1, drop
ret #-1
drop: ret #0
BPF Bytecode
struct sock_filter code[] = { { 0x28, 0, 0, 0x0000000c }, { 0x15, 0, 8, 0x000086dd }, { 0x30, 0, 0, 0x00000014 }, { 0x15, 2, 0, 0x00000084 }, { 0x15, 1, 0, 0x00000006 }, { 0x15, 0, 17, 0x00000011 }, { 0x28, 0, 0, 0x00000036 }, { 0x15, 14, 0, 0x00000016 }, { 0x28, 0, 0, 0x00000038 }, { 0x15, 12, 13, 0x00000016 }, ...};
Virtual Machine (kind of)
BPF JIT
BPF Bytecode → BPF JIT → Native Machine Code
$ find arch/ -name bpf_jit*
arch/sparc/net/bpf_jit_comp.c
arch/sparc/net/bpf_jit_asm.S
arch/sparc/net/bpf_jit.h
arch/arm/net/bpf_jit_32.c
arch/arm/net/bpf_jit_32.h
arch/arm64/net/bpf_jit_comp.c
arch/arm64/net/bpf_jit.h
arch/powerpc/net/bpf_jit_comp.c
arch/powerpc/net/bpf_jit_asm.S
arch/powerpc/net/bpf_jit.h
arch/s390/net/bpf_jit_comp.c
arch/s390/net/bpf_jit.S
arch/s390/net/bpf_jit.h
arch/mips/net/bpf_jit.c
arch/mips/net/bpf_jit_asm.S
arch/mips/net/bpf_jit.h
arch/x86/net/bpf_jit_comp.c
arch/x86/net/bpf_jit.S
Stable and Efficient
eBPF
Extended BPF
eBPF
userspace
kernel
eBPF Program
BPF_PROG_LOAD
At most 4096 instructions
Extended Registers
eBPF Verifier
eBPF Map
Probe Event
Extended Registers
eBPF Verifier
eBPF Map
Probe Event
Classic BPF: 32 bit
Extended BPF: 64 bit
Classic BPF: A, X (2)
Extended BPF: R0 – R9 (10)
R10 (read-only)
For x86_64 JIT
R0 → rax
R1 → rdi
R2 → rsi
R3 → rdx
R4 → rcx
R5 → r8
R6 → rbx
R7 → r13
R8 → r14
R9 → r15
R10 → rbp
BPF Calling Convention
● R0
Return value from in-kernel function, and exit value for eBPF program
● R1 – R5
Arguments from eBPF program to in-kernel function
● R6 – R9
Callee saved registers that in-kernel function will preserve
● R10
Read-only frame pointer to access stack
Extended Registers
eBPF Verifier
eBPF Map
Probe Event
Two-Step Verification
Step 1
Directed Acyclic Graph Check
Loops
Unreachable Instructions
Loops
Unreachable Instructions
Step 2
Simulate the Execution
Read a never-written register
Do arithmetic on two valid pointers
Load/store registers of invalid types
Read stack before writing data into stack
Read a never-written register
Do arithmetic on two valid pointers
Load/store registers of invalid types
Read stack before writing data into stack
Extended Registers
eBPF Verifier
eBPF Map
Probe Event
eBPF
userspace
kernel
User Program
Map BPF_MAP_*
eBPF Map Types
● BPF_MAP_TYPE_HASH
● BPF_MAP_TYPE_ARRAY
● BPF_MAP_TYPE_PROG_ARRAY
● BPF_MAP_TYPE_PERF_EVENT_ARRAY
eBPF Map Syscalls
● BPF_MAP_CREATE
● BPF_MAP_LOOKUP_ELEM
● BPF_MAP_UPDATE_ELEM
● BPF_MAP_DELETE_ELEM
● BPF_MAP_GET_NEXT_KEY
Extended Registers
eBPF Verifier
eBPF Map
Probe Event
New ioctl request
PERF_EVENT_IOC_SET_BPF
Kprobe
BPF_PROG_LOAD
User Program
eBPF
userspace
kernel
KernelProgram
kprobe
Eventfd
fd
PERF_EVENT_IOC_SET_BPF
fd
Attach
Registration
perf_tp_event_init()      kernel/events/core.c
perf_trace_init()         kernel/trace/trace_event_perf.c
perf_trace_event_init()   kernel/trace/trace_event_perf.c
perf_trace_event_reg()    kernel/trace/trace_event_perf.c
ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER, NULL);
kprobe_register()         kernel/trace/trace_kprobe.c
enable_trace_kprobe()     kernel/trace/trace_kprobe.c
enable_kprobe()           kernel/kprobes.c
Attach
perf_ioctl()              kernel/events/core.c
_perf_ioctl()             kernel/events/core.c
case PERF_EVENT_IOC_SET_BPF: return perf_event_set_bpf_prog(event, arg);
perf_event_set_bpf_prog() kernel/events/core.c
prog = bpf_prog_get(prog_fd); event->tp_event->prog = prog;
Dispatch Event
kprobe_dispatcher()       kernel/trace/trace_kprobe.c
kprobe_perf_func()        kernel/trace/trace_kprobe.c
if (prog && !trace_call_bpf(prog, regs)) return;
trace_call_bpf()          kernel/trace/bpf_trace.c
BPF_PROG_RUN()            include/linux/filter.h
__bpf_prog_run()          kernel/bpf/core.c
kfree_skb(struct sk_buff *skb){ if (unlikely(!skb)) return; ….}
kprobe
eBPF
BPF bytecode Read Map
BPF bytecode Map
BPF_PROG_LOAD BPF_MAP_*
userspace
kernel
bpf_tracer.c
Uprobe
BPF_PROG_LOAD
User Program
eBPF
userspace
kernel
KernelProgram
uprobe
Eventfd
fd
PERF_EVENT_IOC_SET_BPF
fd
Attach
__libc_malloc(size_t *bytes){ arena_lookup(ar_ptr);
arena_lock(ar_ptr, bytes); ….}
uprobe
eBPF
BPF bytecode
BPF bytecode
userspace
kernel
bpf_tracer.c
glibc
How to use eBPF?
Linux Kernel >= 4.1
Kernel Config
● CONFIG_BPF=y
● CONFIG_BPF_SYSCALL=y
● CONFIG_BPF_JIT=y
● CONFIG_HAVE_BPF_JIT=y
● CONFIG_BPF_EVENTS=y
BPF ASM
BPF ASM
Restricted C
LLVM >= 3.7
clang: --emit-llvm
llc: --march=bpf
C code → (clang) → LLVM IR Bitcode → (llc) → BPF Bytecode
User Program
eBPF
userspace
kernel
eBPF MAP
KernelProgram
As simple as possible
Whatever you want
BPF Compiler Collection
obs://Base:System/bcc
C & Python Library
Built-in BPF compiler
Hello World
from bcc import BPF
bpf_prog = """
void kprobe__sys_clone(void *ctx) {
    bpf_trace_printk("Hello, World\\n");
}
"""
BPF(text=bpf_prog).trace_print()
Access Map
In bitehist.c:
BPF_HISTOGRAM(dist);
dist.increment(bpf_log2l(req->__data_len / 1024));
In bitehist.py:
b = BPF(src_file = "bitehist.c")
b["dist"].print_log2_hist("kbytes")
Access Map (Cont'd)
# ./bitehist.py
Tracing... Hit Ctrl-C to end.
^C
     kbytes          : count     distribution
       0 -> 1        : 8        |******                                  |
       2 -> 3        : 0        |                                        |
       4 -> 7        : 51       |****************************************|
       8 -> 15       : 8        |******                                  |
      16 -> 31       : 1        |                                        |
      32 -> 63       : 3        |**                                      |
      64 -> 127      : 2        |*                                       |
memleak.py
if not kernel_trace:
    print("Attaching to malloc and free in pid %d, "
          "Ctrl+C to quit." % pid)
    bpf_program.attach_uprobe(name="c", sym="malloc",
                              fn_name="alloc_enter", pid=pid)
    bpf_program.attach_uretprobe(name="c", sym="malloc",
                                 fn_name="alloc_exit", pid=pid)
    bpf_program.attach_uprobe(name="c", sym="free",
                              fn_name="free_enter", pid=pid)
else:
    print("Attaching to kmalloc and kfree, Ctrl+C to quit.")
    bpf_program.attach_kprobe(event="__kmalloc", fn_name="alloc_enter")
    bpf_program.attach_kretprobe(event="__kmalloc", fn_name="alloc_exit")
    bpf_program.attach_kprobe(event="kfree", fn_name="free_enter")
memleak.py (alloc_enter)
BPF_HASH(sizes, u64);BPF_HASH(allocs, u64, struct alloc_info_t);
int alloc_enter(struct pt_regs *ctx, size_t size) { ... u64 pid = bpf_get_current_pid_tgid(); u64 size64 = size; sizes.update(&pid, &size64); ...}
memleak.py (alloc_exit)
BPF_HASH(sizes, u64);BPF_HASH(allocs, u64, struct alloc_info_t);
int alloc_exit(struct pt_regs *ctx) { u64 address = ctx->ax; u64 pid = bpf_get_current_pid_tgid(); u64* size64 = sizes.lookup(&pid); struct alloc_info_t info = {0};
if (size64 == 0) return 0; // missed alloc entry
info.size = *size64; sizes.delete(&pid);
info.timestamp_ns = bpf_ktime_get_ns(); info.num_frames = grab_stack(ctx, &info) - 2; allocs.update(&address, &info); ...}
memleak.py (free)
BPF_HASH(sizes, u64);BPF_HASH(allocs, u64, struct alloc_info_t);
int free_enter(struct pt_regs *ctx, void *address){ u64 addr = (u64)address; struct alloc_info_t *info = allocs.lookup(&addr); if (info == 0) return 0;
allocs.delete(&addr); ...}
Demo
Question?
Thank You
References
● Documentation/networking/filter.txt
● http://www.brendangregg.com/blog/2015-05-15/ebpf-one-small-step.html
● https://suchakra.wordpress.com/2015/05/18/bpf-internals-i/
● https://suchakra.wordpress.com/2015/08/12/bpf-internals-ii/
● https://lkml.org/lkml/2013/9/30/627
● https://lwn.net/Articles/612878/
● https://lwn.net/Articles/650953/
● https://github.com/iovisor/bcc