Sending data from an eBPF program to userspace

In this article we will walk through creating a kprobe tracing event, attaching an eBPF program to it, and getting the data from that program to userspace. The BCC suite provides a very convenient interface for doing this, but it is a fairly high-level abstraction that hides the details of operation. That is nice for day-to-day use, but it is interesting to understand in detail how things are plumbed together - this article aims to help with that. We will use a kprobe on the "sock_sendmsg" kernel function as an example. Disclaimer: this is the result of experimentation, reading the code and playing with examples. There is no guarantee I am correct everywhere. If you find any mistakes, please let me know - @ayourtch on twitter. The overall diagram of how all of this works at the low level is as follows:

[sock_sendmsg kprobe] ----> efd
        ^
        | ioctl(efd, PERF_EVENT_IOC_SET_BPF, prog_fd);
        |
        |           
   [eBPF program] --------------> prog_fd
             ^
             |   BPF_LD_MAP_FD(BPF_REG_2, map_fd) 
             |
             |
             +-------- map_fd ---- [ bpf map ]
                                       ^
                                       |  bpf_update_elem(map_fd, &key, &efd2, BPF_ANY)
                                       |
      [ perf count output ] ---------> efd2-----------------+
                                       |                    |
                                       |                    | mmap() 
                                       |                    |
                                       |                    |
                                       v                    v
                                      [ poll ] -->*-| [mmapped ringbuffer]
                                                    |                 |
                                                    +---> read events | 
                                                                      |
                                                                      |
                                                                      |
                                                                      |
      [ process events ] <--------------------------------------------+
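
All of the userspace snippets below are plain C built on raw syscalls. For reference, this is roughly the set of includes and globals they assume (a sketch - the exact list may vary with your libc and kernel headers; the value of page_cnt is my choice here and only has to be a power of two):

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>
#include <poll.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <linux/bpf.h>
#include <linux/perf_event.h>
#include "libbpf.h"   /* the modified header listed at the end of this article */

int page_size;        /* filled in by getpagesize() */
int page_cnt = 8;     /* data pages in the mmapped ring buffer, must be a power of two */
int map_fd, prog_fd, efd, efd2, id;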
                                      
First of all, we need to create the map and get a file descriptor referencing it into map_fd, so that we can use it in the eBPF program:

int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size,
                   int max_entries, int map_flags)
{
        union bpf_attr attr = {
                .map_type = map_type,
                .key_size = key_size,
                .value_size = value_size,
                .max_entries = max_entries,
                /* .map_flags = map_flags,  -- this field only exists in newer kernel headers */
        };

        return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
}

  map_fd = bpf_create_map(BPF_MAP_TYPE_PERF_EVENT_ARRAY, sizeof(int),
                          sizeof(uint32_t), 256, 0);
  printf("map_fd: %d\n", map_fd);
  if (map_fd < 0) {
    perror("map_fd");
  }
This map will store references to the perf event file descriptors used to send the data to userspace. Next we can move on to the eBPF program itself; here is a test example:

char bpf_log_buf[LOG_BUF_SIZE];

static __u64 ptr_to_u64(void *ptr)
{
        return (__u64) (unsigned long) ptr;
}

int bpf_prog_load(enum bpf_prog_type prog_type,
                  const struct bpf_insn *insns, int prog_len,
                  const char *license, int kern_version)
{
        union bpf_attr attr = {
                .prog_type = prog_type,
                .insns = ptr_to_u64((void *) insns),
                .insn_cnt = prog_len / sizeof(struct bpf_insn),
                .license = ptr_to_u64((void *) license),
                .log_buf = ptr_to_u64(bpf_log_buf),
                .log_size = LOG_BUF_SIZE,
                .log_level = 1,
        };

        /* assign one field outside of struct init to make sure any
         * padding is zero initialized
         */
        attr.kern_version = 263174; /* hard-coded == KERNEL_VERSION(4, 4, 6); the kern_version argument is ignored here */

        bpf_log_buf[0] = 0;

        int ret = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
        if (ret < 0) {
          printf("ERROR loading bpf program: %s\n", bpf_log_buf);
        }
        return ret;
}
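
One note on the kern_version field above: for BPF_PROG_TYPE_KPROBE programs the kernel refuses to load the program unless attr.kern_version matches the version code of the running kernel, which is why it is hard-coded to 263174 - that is KERNEL_VERSION(4, 4, 6), i.e. kernel 4.4.6. A slightly less fragile variant, assuming the headers you compile against match the running kernel, would be:

#include <linux/version.h>

        attr.kern_version = LINUX_VERSION_CODE;   /* 263174 == KERNEL_VERSION(4, 4, 6) */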



struct bpf_insn prog[] = {

        /*
         * bpf_perf_event_output(ctx, map, index, data, size) - output perf raw sample
         * @ctx: struct pt_regs*
         * @map: pointer to perf_event_array map
         * @index: index of event in the map
         * @data: data on stack to be output as raw data
         * @size: size of data
         * Return: 0 on success
         */

        /* Actually, I should not have clobbered R8. FIXME here. */
        /* See: https://github.com/iovisor/bpf-docs/blob/master/bpf-internals-2.md */

        BPF_MOV64_REG(BPF_REG_8, BPF_REG_1),          /* r8 = ctx; r8 survives the helper calls */

        /* build 16 bytes of sample data on the stack: { 0, 42, 42, 42 } */
        BPF_MOV64_IMM(BPF_REG_6, 42),
        BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_6, -4),
        BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_6, -8),
        BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_6, -12),
        BPF_MOV64_IMM(BPF_REG_6, 0),
        BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_6, -16),

        BPF_CALL_FUNC(BPF_FUNC_get_smp_processor_id), /* r0 = current cpu */

        BPF_MOV64_REG(BPF_REG_1, BPF_REG_8),          /* r1 = ctx */
        BPF_LD_MAP_FD(BPF_REG_2, map_fd),             /* r2 = map_fd */
        BPF_MOV64_REG(BPF_REG_3, BPF_REG_0),          /* r3 = index (cpu id) */
        BPF_MOV64_REG(BPF_REG_4, BPF_REG_10),         /* r4 = frame pointer ... */
        BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -16),       /* ... - 16, points at the data */
        BPF_MOV64_IMM(BPF_REG_5, 16),                 /* r5 = data len */
        BPF_CALL_FUNC(BPF_FUNC_perf_event_output),

        /* leftover debug code: an option to also stuff some data into the trace buffer */
        //BPF_MOV64_IMM(BPF_REG_6, 0x62636464),
        BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
        BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 0x50505050),
        BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_6, -20),
        BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
        BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -20),
        BPF_MOV64_IMM(BPF_REG_2, 5),
        // BPF_CALL_FUNC(BPF_FUNC_trace_printk),

        BPF_MOV64_IMM(BPF_REG_0, 0),                  /* r0 = 0 */
        BPF_EXIT_INSN(),                              /* return r0 */
};

  prog_fd = bpf_prog_load(BPF_PROG_TYPE_KPROBE, prog, sizeof(prog), "GPL", 0);
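
For comparison, here is roughly what the same logic looks like when written in restricted C and compiled with LLVM, using the helper and section conventions from samples/bpf (a sketch of an equivalent program, not the code actually used in this article):

struct bpf_map_def SEC("maps") my_map = {
        .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
        .key_size = sizeof(int),
        .value_size = sizeof(u32),
        .max_entries = 256,
};

SEC("kprobe/sock_sendmsg")
int bpf_prog(struct pt_regs *ctx)
{
        /* the same 16 bytes that the hand-assembled version builds on the stack */
        u32 data[4] = { 0, 42, 42, 42 };

        bpf_perf_event_output(ctx, &my_map, bpf_get_smp_processor_id(),
                              &data, sizeof(data));
        return 0;
}

char _license[] SEC("license") = "GPL";
u32 _version SEC("version") = LINUX_VERSION_CODE;

The rest of this article sticks to the hand-assembled instructions, since the whole point is to see the low-level plumbing.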
Now we need to create the kprobe to attach the eBPF program to. In this example we will use a kprobe attached to the "sock_sendmsg" function. Before doing that, let's clear all the existing kprobes:
echo "" >/sys/kernel/debug/tracing/kprobe_events
echo "p:myprobe_sock_sendmsg sock_sendmsg" >>/sys/kernel/debug/tracing/kprobe_events
Now we can read the ID of the kprobe; we will use it later on as the reference when attaching the eBPF program:
cat /sys/kernel/debug/tracing/events/kprobes/myprobe_sock_sendmsg/id
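The value printed by the command above needs to end up in the integer variable "id" used in the snippets below; reading it from the same file in C could look roughly like this (a sketch):

  char idbuf[64] = { 0 };
  int idfd = open("/sys/kernel/debug/tracing/events/kprobes/myprobe_sock_sendmsg/id", O_RDONLY);
  if (idfd < 0 || read(idfd, idbuf, sizeof(idbuf) - 1) <= 0) {
    perror("kprobe id");
    return -1;
  }
  close(idfd);
  id = atoi(idbuf);
  printf("ID: %d\n", id);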
In order to attach the program, we need to get a perf event file descriptor associated with this kprobe, as follows:

int
perf_event_open (struct perf_event_attr *attr, int pid, int cpu,
                 int group_fd, unsigned long flags)
{
  return syscall (__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

struct perf_event_attr attr = { };

attr.type = PERF_TYPE_TRACEPOINT;
attr.sample_type = PERF_SAMPLE_RAW;
attr.sample_period = 1;
attr.wakeup_events = 1;
attr.config = id;

efd = perf_event_open (&attr, -1 /*pid */ , 0 /*cpu */ , -1 /*group_fd */ , 0);
if (efd < 0) {
  printf ("event %d fd %d err %s\n", id, efd, strerror (errno));
  return -1;
}
printf("event FD: %d\n", efd);

Now we can attach the eBPF program to this kprobe and enable it:
  int ret = ioctl(efd, PERF_EVENT_IOC_SET_BPF, prog_fd);
  printf("SET BPF result: %d\n", ret);
  ret = ioctl(efd, PERF_EVENT_IOC_ENABLE, 0);
  printf("IOC enable result: %d\n", ret);
However, the map does not yet contain any perf event file descriptor that bpf_perf_event_output() could write to, so we will not see any events just yet. To fix that, let's open a second perf event descriptor, this time a software event of type PERF_COUNT_SW_BPF_OUTPUT, and store it into the map:

int bpf_update_elem(int fd, void *key, void *value, unsigned long long flags)
{
        union bpf_attr attr = {
                .map_fd = fd,
                .key = ptr_to_u64(key),
                .value = ptr_to_u64(value),
                .flags = flags,
        };

        return syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
}



  struct perf_event_attr attr2 = {
                .sample_type = PERF_SAMPLE_RAW,
                .type = PERF_TYPE_SOFTWARE,
                .config = PERF_COUNT_SW_BPF_OUTPUT,
        };

  efd2 =
    perf_event_open (&attr2, -1 /*pid */ , 0 /*cpu */ , -1 /*group_fd */ , 0);
  if (efd2 < 0) {
    printf ("event2 %d fd %d err %s\n", id, efd2, strerror (errno));
    return -1;
  }
  printf("event2 FD: %d\n", efd2);

  /* stuff the efd2 into the map at key 0 */
  int key = 0;
  int ret0 = bpf_update_elem(map_fd, &key, &efd2, BPF_ANY);
  printf("map fd: %d Update result: %d\n", map_fd, ret0);

Now, if we poll on efd2, we will know when events are ready to be read. But before we can read them, we need to mmap the circular buffer:

volatile struct perf_event_mmap_page *header;

static int perf_event_mmap(int fd)
{
        void *base;
        int mmap_size;

        page_size = getpagesize();
        mmap_size = page_size * (page_cnt + 1);

        base = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (base == MAP_FAILED) {
                printf("mmap err\n");
                return -1;
        }

        header = base;
        return 0;
}
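The main code needs to map the ring buffer of efd2 (the output event) before entering the poll loop, with something along these lines (sketch):

  if (perf_event_mmap(efd2) < 0) {
    return -1;
  }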
Now we can do the actual poll:
static int perf_event_poll(int fd)
{
        struct pollfd pfd = { .fd = fd, .events = POLLIN };

        return poll(&pfd, 1, 1000);
}
In this example, to read the events we will simply print a hex dump of each one:

struct perf_event_sample {
        struct perf_event_header header;
        __u32 size;
        char data[];
};

void hexdump(void *addr, int size) {
  uint8_t *p = addr;
  while (size--) {
    if (0 == ((p - (uint8_t *)addr) % 16)) {
      printf("\n");
    }
    printf("%02x ", *p++);
  }
  printf("\n");
}


void perf_event_read()
{
        __u64 data_tail = header->data_tail;
        __u64 data_head = header->data_head;
        __u64 buffer_size = page_cnt * page_size;
        void *base, *begin, *end;
        char buf[256];

        asm volatile("" ::: "memory"); /* in real code it should be smp_rmb() */
        if (data_head == data_tail)
                return;

        base = ((char *)header) + page_size;

        begin = base + data_tail % buffer_size;
        end = base + data_head % buffer_size;

        while (begin != end) {
                struct perf_event_sample *e;

                e = begin;
                if (begin + e->header.size > base + buffer_size) {
                        long len = base + buffer_size - begin;

                        // assert(len < e->header.size);
                        memcpy(buf, begin, len);
                        memcpy(buf + len, base, e->header.size - len);
                        e = (void *) buf;
                        begin = base + e->header.size - len;
                } else if (begin + e->header.size == base + buffer_size) {
                        begin = base;
                } else {
                        begin += e->header.size;
                }

                if (e->header.type == PERF_RECORD_SAMPLE) {
                        printf("%p %d ", e->data, e->size);
                        hexdump(e->data, e->size);
                } else if (e->header.type == PERF_RECORD_LOST) {
                        struct {
                                struct perf_event_header header;
                                __u64 id;
                                __u64 lost;
                        } *lost = (void *) e;
                        printf("lost %lld events\n", lost->lost);
                } else {
                        printf("unknown event type=%d size=%d\n",
                               e->header.type, e->header.size);
                }
        }

        __sync_synchronize(); /* smp_mb() */
        header->data_tail = data_head;
}

And this is the actual poll loop:
  while (1) {
    if (perf_event_poll(efd2)) {
      printf("%llx %llx\n", header->data_head, header->data_tail);
      perf_event_read();
    }
  }
If you put all of this together and run it, you will get:
ID: 1284
map_fd: 3
Prog fd: 4
event FD: 5
event2 FD: 6
map fd: 3 Update result: 0
SET BPF result: 0
IOC enable result: 0
0x7fc53edf0fac 20
00 00 00 00 2a 00 00 00 2a 00 00 00 2a 00 00 00
00 00 00 00
0x7fc53edf0fcc 20
00 00 00 00 2a 00 00 00 2a 00 00 00 2a 00 00 00
00 00 00 00
0x7fc53edf0fec 20
00 00 00 00 2a 00 00 00 2a 00 00 00 2a 00 00 00
00 00 00 00
0x7fc53edf100c 20
00 00 00 00 2a 00 00 00 2a 00 00 00 2a 00 00 00
00 00 00 00
0x7fc53edf102c 20
00 00 00 00 2a 00 00 00 2a 00 00 00 2a 00 00 00
00 00 00 00
0x7fc53edf104c 20
00 00 00 00 2a 00 00 00 2a 00 00 00 2a 00 00 00
00 00 00 00
Not a whole lot of useful data, but it shows how things work.

For the eBPF operations you will need libbpf.h; I used the one from the samples/bpf directory in the kernel sources, with a slight modification.


/* eBPF mini library */
#ifndef __LIBBPF_H
#define __LIBBPF_H

struct bpf_insn;

int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size,
		   int max_entries, int map_flags);
int bpf_update_elem(int fd, void *key, void *value, unsigned long long flags);
int bpf_lookup_elem(int fd, void *key, void *value);
int bpf_delete_elem(int fd, void *key);
int bpf_get_next_key(int fd, void *key, void *next_key);

int bpf_prog_load(enum bpf_prog_type prog_type,
		  const struct bpf_insn *insns, int insn_len,
		  const char *license, int kern_version);

int bpf_obj_pin(int fd, const char *pathname);
int bpf_obj_get(const char *pathname);

#define LOG_BUF_SIZE 65536
extern char bpf_log_buf[LOG_BUF_SIZE];

/* Call a BPF helper function, imm32 = BPF_FUNC_* helper id */

#define BPF_CALL_FUNC(FUNC)					\
	((struct bpf_insn) {					\
		.code  = BPF_JMP | BPF_CALL | BPF_K,		\
		.dst_reg = 0,					\
		.src_reg = 0,					\
		.off   = 0,					\
		.imm   = FUNC })

/* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */

#define BPF_ALU64_REG(OP, DST, SRC)				\
	((struct bpf_insn) {					\
		.code  = BPF_ALU64 | BPF_OP(OP) | BPF_X,	\
		.dst_reg = DST,					\
		.src_reg = SRC,					\
		.off   = 0,					\
		.imm   = 0 })

#define BPF_ALU32_REG(OP, DST, SRC)				\
	((struct bpf_insn) {					\
		.code  = BPF_ALU | BPF_OP(OP) | BPF_X,		\
		.dst_reg = DST,					\
		.src_reg = SRC,					\
		.off   = 0,					\
		.imm   = 0 })

/* ALU ops on immediates, bpf_add|sub|...: dst_reg += imm32 */

#define BPF_ALU64_IMM(OP, DST, IMM)				\
	((struct bpf_insn) {					\
		.code  = BPF_ALU64 | BPF_OP(OP) | BPF_K,	\
		.dst_reg = DST,					\
		.src_reg = 0,					\
		.off   = 0,					\
		.imm   = IMM })

#define BPF_ALU32_IMM(OP, DST, IMM)				\
	((struct bpf_insn) {					\
		.code  = BPF_ALU | BPF_OP(OP) | BPF_K,		\
		.dst_reg = DST,					\
		.src_reg = 0,					\
		.off   = 0,					\
		.imm   = IMM })

/* Short form of mov, dst_reg = src_reg */

#define BPF_MOV64_REG(DST, SRC)					\
	((struct bpf_insn) {					\
		.code  = BPF_ALU64 | BPF_MOV | BPF_X,		\
		.dst_reg = DST,					\
		.src_reg = SRC,					\
		.off   = 0,					\
		.imm   = 0 })

#define BPF_MOV32_REG(DST, SRC)					\
	((struct bpf_insn) {					\
		.code  = BPF_ALU | BPF_MOV | BPF_X,		\
		.dst_reg = DST,					\
		.src_reg = SRC,					\
		.off   = 0,					\
		.imm   = 0 })

/* Short form of mov, dst_reg = imm32 */

#define BPF_MOV64_IMM(DST, IMM)					\
	((struct bpf_insn) {					\
		.code  = BPF_ALU64 | BPF_MOV | BPF_K,		\
		.dst_reg = DST,					\
		.src_reg = 0,					\
		.off   = 0,					\
		.imm   = IMM })

/* BPF_LD_IMM64 macro encodes single 'load 64-bit immediate' insn */
#define BPF_LD_IMM64(DST, IMM)					\
	BPF_LD_IMM64_RAW(DST, 0, IMM)

#define BPF_LD_IMM64_RAW(DST, SRC, IMM)				\
	((struct bpf_insn) {					\
		.code  = BPF_LD | BPF_DW | BPF_IMM,		\
		.dst_reg = DST,					\
		.src_reg = SRC,					\
		.off   = 0,					\
		.imm   = (__u32) (IMM) }),			\
	((struct bpf_insn) {					\
		.code  = 0, /* zero is reserved opcode */	\
		.dst_reg = 0,					\
		.src_reg = 0,					\
		.off   = 0,					\
		.imm   = ((__u64) (IMM)) >> 32 })

#ifndef BPF_PSEUDO_MAP_FD
# define BPF_PSEUDO_MAP_FD	1
#endif

/* pseudo BPF_LD_IMM64 insn used to refer to process-local map_fd */
#define BPF_LD_MAP_FD(DST, MAP_FD)				\
	BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD)


/* Direct packet access, R0 = *(uint *) (skb->data + imm32) */

#define BPF_LD_ABS(SIZE, IMM)					\
	((struct bpf_insn) {					\
		.code  = BPF_LD | BPF_SIZE(SIZE) | BPF_ABS,	\
		.dst_reg = 0,					\
		.src_reg = 0,					\
		.off   = 0,					\
		.imm   = IMM })

/* Memory load, dst_reg = *(uint *) (src_reg + off16) */

#define BPF_LDX_MEM(SIZE, DST, SRC, OFF)			\
	((struct bpf_insn) {					\
		.code  = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM,	\
		.dst_reg = DST,					\
		.src_reg = SRC,					\
		.off   = OFF,					\
		.imm   = 0 })

/* Memory store, *(uint *) (dst_reg + off16) = src_reg */

#define BPF_STX_MEM(SIZE, DST, SRC, OFF)			\
	((struct bpf_insn) {					\
		.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_MEM,	\
		.dst_reg = DST,					\
		.src_reg = SRC,					\
		.off   = OFF,					\
		.imm   = 0 })

/* Memory store, *(uint *) (dst_reg + off16) = imm32 */

#define BPF_ST_MEM(SIZE, DST, OFF, IMM)				\
	((struct bpf_insn) {					\
		.code  = BPF_ST | BPF_SIZE(SIZE) | BPF_MEM,	\
		.dst_reg = DST,					\
		.src_reg = 0,					\
		.off   = OFF,					\
		.imm   = IMM })

/* Conditional jumps against registers, if (dst_reg 'op' src_reg) goto pc + off16 */

#define BPF_JMP_REG(OP, DST, SRC, OFF)				\
	((struct bpf_insn) {					\
		.code  = BPF_JMP | BPF_OP(OP) | BPF_X,		\
		.dst_reg = DST,					\
		.src_reg = SRC,					\
		.off   = OFF,					\
		.imm   = 0 })

/* Conditional jumps against immediates, if (dst_reg 'op' imm32) goto pc + off16 */

#define BPF_JMP_IMM(OP, DST, IMM, OFF)				\
	((struct bpf_insn) {					\
		.code  = BPF_JMP | BPF_OP(OP) | BPF_K,		\
		.dst_reg = DST,					\
		.src_reg = 0,					\
		.off   = OFF,					\
		.imm   = IMM })

/* Raw code statement block */

#define BPF_RAW_INSN(CODE, DST, SRC, OFF, IMM)			\
	((struct bpf_insn) {					\
		.code  = CODE,					\
		.dst_reg = DST,					\
		.src_reg = SRC,					\
		.off   = OFF,					\
		.imm   = IMM })

/* Program exit */

#define BPF_EXIT_INSN()						\
	((struct bpf_insn) {					\
		.code  = BPF_JMP | BPF_EXIT,			\
		.dst_reg = 0,					\
		.src_reg = 0,					\
		.off   = 0,					\
		.imm   = 0 })

/* create RAW socket and bind to interface 'name' */
int open_raw_sock(const char *name);

struct perf_event_attr;
int perf_event_open(struct perf_event_attr *attr, int pid, int cpu,
		    int group_fd, unsigned long flags);
#endif
