CVE-2022-0847-DirtyPipe分析

漏洞成因

  • pipe维护了一个struct pipe_buffer的数组,每个pipe_buffer指向一个page,page里存的就是pipe的数据
  • 正常情况下,往pipe里写数据时会申请一个page,把数据拷贝到page里后再让pipe_buffer指向这个page。splice系统调用实现了一种零拷贝的技术,直接让pipe_buffer指向这个原始的数据page,这样就省去了内存拷贝的过程,提升效率
  • 往pipe里写数据时不可能每次都正好是page_size的整数倍,如果每次写数据都要重新分配一个新的page来存,必然会造成空间的浪费。但是如果pipe_buffer的PIPE_BUF_FLAG_CAN_MERGE flag被置位,数据就会接着上一次的数据在同一个page中写入,而不是申请新的page,减少了空间的浪费
  • 但是splice在给pipe_buffer赋值时没有初始化flag,这就造成之前被置位的PIPE_BUF_FLAG_CAN_MERGE flag不会被清除,所以只要先让所有的pipe_buffer的PIPE_BUF_FLAG_CAN_MERGE flag被置位,然后调用splice让pipe_buffer指向目标文件page cache,这时再向pipe里写数据就会直接修改page cache里的内容,造成任意文件覆盖漏洞

源码分析

以下源码来自Linux5.8.1

pipe

关键数据结构

pipe_inode_info

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
/**
 * struct pipe_inode_info - a linux kernel pipe
 * @mutex: mutex protecting the whole thing
 * @rd_wait: reader wait point in case of empty pipe
 * @wr_wait: writer wait point in case of full pipe
 * @head: The point of buffer production
 * @tail: The point of buffer consumption
 * @note_loss: The next read() should insert a data-lost message
 * @max_usage: The maximum number of slots that may be used in the ring
 * @ring_size: total number of buffers (should be a power of 2)
 * @nr_accounted: The amount this pipe accounts for in user->pipe_bufs
 * @tmp_page: cached released page
 * @readers: number of current readers of this pipe
 * @writers: number of current writers of this pipe
 * @files: number of struct file referring this pipe (protected by ->i_lock)
 * @r_counter: reader counter
 * @w_counter: writer counter
 * @fasync_readers: reader side fasync
 * @fasync_writers: writer side fasync
 * @bufs: the circular array of pipe buffers
 * @user: the user who created this pipe
 * @watch_queue: If this pipe is a watch_queue, this is the stuff for that
 **/
struct pipe_inode_info {
	struct mutex mutex;
	wait_queue_head_t rd_wait, wr_wait;
	/*
	 * Ring positions. pipe_write() turns them into a slot of @bufs by
	 * masking with (ring_size - 1), e.g. bufs[head & (ring_size - 1)].
	 */
	unsigned int head;
	unsigned int tail;
	unsigned int max_usage;
	unsigned int ring_size;
#ifdef CONFIG_WATCH_QUEUE
	bool note_loss;
#endif
	unsigned int nr_accounted;
	unsigned int readers;	/* pipe_write() raises SIGPIPE when this is 0 */
	unsigned int writers;
	unsigned int files;
	unsigned int r_counter;
	unsigned int w_counter;
	/* Spare page; pipe_write() allocates it on demand and hands it to a slot. */
	struct page *tmp_page;
	struct fasync_struct *fasync_readers;
	struct fasync_struct *fasync_writers;
	struct pipe_buffer *bufs;
	struct user_struct *user;
#ifdef CONFIG_WATCH_QUEUE
	/* pipe_write() rejects writes with -EXDEV when this is set. */
	struct watch_queue *watch_queue;
#endif
};

其中bufs是一个struct pipe_buffer的数组,默认数量为16,每个pipe_buffer能存储一个page的数据。这16个page组成一个环形缓冲区,用来存储管道里的数据。

pipe_buffer

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
/**
 * struct pipe_buffer - a linux kernel pipe buffer
 * @page: the page containing the data for the pipe buffer
 * @offset: offset of data inside the @page
 * @len: length of data inside the @page
 * @ops: operations associated with this buffer. See @pipe_buf_operations.
 * @flags: pipe buffer flags. See above.
 * @private: private data owned by the ops.
 **/
struct pipe_buffer {
	/*
	 * Either a page allocated by pipe_write() or, after splice, a page
	 * handed in by copy_page_to_iter_pipe() (the file's page cache page).
	 */
	struct page *page;
	unsigned int offset, len;
	const struct pipe_buf_operations *ops;
	/*
	 * pipe_write() stores PIPE_BUF_FLAG_CAN_MERGE or PIPE_BUF_FLAG_PACKET
	 * here and later tests PIPE_BUF_FLAG_CAN_MERGE to decide whether a
	 * write may be appended to @page.
	 */
	unsigned int flags;
	unsigned long private;
};

PIPE_BUF_FLAG_CAN_MERGE就保存在flags字段中,它决定后续写入的数据能否直接追加到page所指向的内存页中

写pipe

调用write向pipe里写数据时会经过层层调用,最终实际调用pipe_write

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
/*
 * pipe_write - copy the data described by @from into the pipe behind @iocb.
 *
 * Visible behaviour in this excerpt:
 *  - a zero-length write returns 0 immediately;
 *  - with no readers the caller gets SIGPIPE and ret is set to -EPIPE;
 *  - a pipe that has a watch_queue is rejected with -EXDEV;
 *  - if the pipe is not empty, the sub-page remainder of the write is first
 *    appended to the most recent buffer when that buffer has
 *    PIPE_BUF_FLAG_CAN_MERGE set and the data fits in its page;
 *  - the remaining data is copied into freshly claimed ring slots, at most
 *    one page per slot.
 *
 * The excerpt is truncated: the rest of the loop, the "out" label and the
 * final return are not shown here.
 */
static ssize_t
pipe_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *filp = iocb->ki_filp;
	struct pipe_inode_info *pipe = filp->private_data;
	unsigned int head;
	ssize_t ret = 0;
	size_t total_len = iov_iter_count(from);
	ssize_t chars;
	bool was_empty = false;
	bool wake_next_writer = false;

	/* Null write succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	__pipe_lock(pipe);

	// Make sure the pipe has at least one reader
	if (!pipe->readers) {
		send_sig(SIGPIPE, current, 0);
		ret = -EPIPE;
		goto out;
	}

#ifdef CONFIG_WATCH_QUEUE
	if (pipe->watch_queue) {
		ret = -EXDEV;
		goto out;
	}
#endif

	/*
	 * Only wake up if the pipe started out empty, since
	 * otherwise there should be no readers waiting.
	 *
	 * If it wasn't empty we try to merge new data into
	 * the last buffer.
	 *
	 * That naturally merges small writes, but it also
	 * page-aligs the rest of the writes for large writes
	 * spanning multiple pages.
	 */
	head = pipe->head;
	was_empty = pipe_empty(head, pipe->tail);
	chars = total_len & (PAGE_SIZE-1); // size of the write modulo the page size
	// Only try to merge when there is a remainder and the pipe is not empty
	if (chars && !was_empty) {
		unsigned int mask = pipe->ring_size - 1;
		// The buffer just before head, i.e. the most recently filled one,
		// because the remainder is merged with the previously written data
		struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask];
		int offset = buf->offset + buf->len;

		// Merge only if PIPE_BUF_FLAG_CAN_MERGE is set and the page still
		// has room for chars more bytes
		if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) &&
		    offset + chars <= PAGE_SIZE) {
			ret = pipe_buf_confirm(pipe, buf);
			if (ret)
				goto out;

			// Append chars bytes to the existing page; no check is made
			// here of who owns buf->page
			ret = copy_page_from_iter(buf->page, offset, chars, from);
			if (unlikely(ret < chars)) {
				ret = -EFAULT;
				goto out;
			}

			buf->len += ret;
			// Nothing left to write: done
			if (!iov_iter_count(from))
				goto out;
		}
	}

	for (;;) {
		// Make sure the pipe still has at least one reader
		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		head = pipe->head;
		// The pipe is not full
		if (!pipe_full(head, pipe->tail, pipe->max_usage)) {
			unsigned int mask = pipe->ring_size - 1;
			struct pipe_buffer *buf = &pipe->bufs[head & mask];
			struct page *page = pipe->tmp_page; // tmp_page is a spare page kept on the pipe
			int copied;

			// No spare page yet: allocate one with alloc_page and cache it
			if (!page) {
				page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
				if (unlikely(!page)) {
					ret = ret ? : -ENOMEM;
					break;
				}
				pipe->tmp_page = page;
			}

			/* Allocate a slot in the ring in advance and attach an
			 * empty buffer. If we fault or otherwise fail to use
			 * it, either the reader will consume it or it'll still
			 * be there for the next write.
			 */
			// Take the spinlock of the reader wait queue
			spin_lock_irq(&pipe->rd_wait.lock);

			head = pipe->head;
			// The pipe filled up in the meantime: retry the loop
			if (pipe_full(head, pipe->tail, pipe->max_usage)) {
				spin_unlock_irq(&pipe->rd_wait.lock);
				continue;
			}

			// Claim the slot first by advancing head to the next buffer
			pipe->head = head + 1;
			spin_unlock_irq(&pipe->rd_wait.lock);// release the spinlock

			/* Insert it into the buffer array */
			buf = &pipe->bufs[head & mask];
			buf->page = page; // attach the page obtained above to the slot
			buf->ops = &anon_pipe_buf_ops;
			buf->offset = 0;
			buf->len = 0;
			// Per the article: unless the pipe was created with O_DIRECT
			// (the is_packetized() case, helper not shown here), the slot is
			// marked PIPE_BUF_FLAG_CAN_MERGE. So an ordinary pipe created
			// without flags gets PIPE_BUF_FLAG_CAN_MERGE set on every slot
			// it writes.
			if (is_packetized(filp))
				buf->flags = PIPE_BUF_FLAG_PACKET;
			else
				buf->flags = PIPE_BUF_FLAG_CAN_MERGE;
			pipe->tmp_page = NULL; // the spare page is now owned by the slot

			// Copy up to one page of data into the page
			copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
			if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
				if (!ret)
					ret = -EFAULT;
				break;
			}
			ret += copied;
			buf->offset = 0;
			buf->len = copied;

			if (!iov_iter_count(from))
				break;
		}
		......
}

splice

splice系统调用主要由do_splice函数完成,do_splice根据输入的文件描述符进入不同的分支,在本次漏洞利用中因为in是普通文件,out是pipe,所以会进入if (opipe)这个分支

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
/*
 * Determine where to splice to/from.
 *
 * Requires @in to be open for reading and @out for writing (-EBADF
 * otherwise), then dispatches on which side is a pipe. The pipe-to-pipe and
 * pipe-to-file branches are elided in this excerpt. In the file-to-pipe
 * branch shown here:
 *  - @off_out must be NULL (-ESPIPE);
 *  - the read offset comes from @off_in (needs FMODE_PREAD, else -EINVAL;
 *    -EFAULT if it cannot be copied from user space) or from in->f_pos;
 *  - @len is clamped to the free space of the pipe before do_splice_to()
 *    is called with the pipe locked;
 *  - the updated offset is written back to in->f_pos or to @off_in.
 * Returns -EINVAL when neither side is a pipe.
 */
long do_splice(struct file *in, loff_t __user *off_in,
	       struct file *out, loff_t __user *off_out,
	       size_t len, unsigned int flags)
{
	struct pipe_inode_info *ipipe;
	struct pipe_inode_info *opipe;
	loff_t offset;
	long ret;

	if (unlikely(!(in->f_mode & FMODE_READ) ||
		     !(out->f_mode & FMODE_WRITE)))
		return -EBADF;

	ipipe = get_pipe_info(in, true);
	opipe = get_pipe_info(out, true);

	// Both in and out are pipes
	if (ipipe && opipe) {
		......
	}

	// Only in is a pipe (presumably the elided branch above returns)
	if (ipipe) {
		......
	}
	// Only out is a pipe
	if (opipe) {
		// Work out the offsets for in and out
		if (off_out)
			return -ESPIPE;
		if (off_in) {
			if (!(in->f_mode & FMODE_PREAD))
				return -EINVAL;
			if (copy_from_user(&offset, off_in, sizeof(loff_t)))
				return -EFAULT;
		} else {
			offset = in->f_pos;
		}

		if (out->f_flags & O_NONBLOCK)
			flags |= SPLICE_F_NONBLOCK;

		pipe_lock(opipe);
		// Wait until the pipe has a usable buffer
		ret = wait_for_space(opipe, flags);
		if (!ret) {
			unsigned int p_space;

			/* Don't try to read more the pipe has space for. */
			p_space = opipe->max_usage - pipe_occupancy(opipe->head, opipe->tail);// free slots in the pipe
			len = min_t(size_t, len, p_space << PAGE_SHIFT);// never read more than the pipe can hold

			ret = do_splice_to(in, &offset, opipe, len, flags); // do_splice_to does the main work
		}
		pipe_unlock(opipe);
		if (ret > 0)
			wakeup_pipe_readers(opipe);
		if (!off_in)
			in->f_pos = offset;
		else if (copy_to_user(off_in, &offset, sizeof(loff_t)))
			ret = -EFAULT;

		return ret;
	}

	return -EINVAL;
}

在do_splice_to中又调用了输入文件的splice_read函数,之后又经过一系列的调用,最终由copy_page_to_iter_pipe完成关联page cache和pipe缓冲区的工作

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
tatic size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes,
struct iov_iter *i)
{
struct pipe_inode_info *pipe = i->pipe;
struct pipe_buffer *buf;
unsigned int p_tail = pipe->tail;
unsigned int p_mask = pipe->ring_size - 1;
unsigned int i_head = i->head;
size_t off;

if (unlikely(bytes > i->count))
bytes = i->count;

if (unlikely(!bytes))
return 0;

if (!sanity(i))
return 0;

off = i->iov_offset;
buf = &pipe->bufs[i_head & p_mask];
if (off) {
// 如果要求的offset和实际的offset相同,且头部的buffer指向的就是当前的page cache
// 则直接移动offset即可
if (offset == off && buf->page == page) {
/* merge with the last one */
buf->len += bytes;
i->iov_offset += bytes;
goto out;
}
i_head++;
buf = &pipe->bufs[i_head & p_mask];
}
if (pipe_full(i_head, p_tail, pipe->max_usage))
return 0;

buf->ops = &page_cache_pipe_buf_ops;
// 增加page的应用计数
get_page(page);
// 将pipe缓冲区的page指针指向文件的page cache
buf->page = page;
buf->offset = offset;
buf->len = bytes;

pipe->head = i_head + 1;
i->iov_offset = offset + bytes;
i->head = i_head;
out:
i->count -= bytes;
return bytes;
}

可以看到copy_page_to_iter_pipe函数直接将page cache赋值给对应buffer的page指针,而没有对buffer的flags做初始化,使得之前被设置的PIPE_BUF_FLAG_CAN_MERGE仍然有效

能真正地覆盖文件内容吗

由上面的漏洞分析可知,最终完成的是对page cache的覆写,而不是覆盖磁盘上的文件内容。当文件的page cache存在时,之后读取该文件都将直接从page cache中获取,所以只要该page cache存在,就相当于覆盖了文件内容。经测试,重启系统后page cache就会消失,此时再读取文件将会得到原文件内容。

但是page cache不是有writeback机制吗,只要触发该机制是不是就能将覆写后的page cache写回磁盘呢?

为了验证这个问题,我调用sync来手动触发writeback

image-20220316174559026

观察程序输出结果发现,调用sync之后读取文件内容仍然是篡改过后的内容,看起来sync似乎真的把page cache里的内容写回到了磁盘里

image-20220316174717827

但当我重启系统之后发现文件内容又复原了,说明sync既没有把page cache写回到磁盘,也没有清除缓存中的内容,相当于直接忽略了这个被篡改过的page,这是为什么呢?

经过调试发现,在向普通文件写入数据时,调用的是generic_file_write_iter函数

image-20220316160455049

经过如下图所示的调用,最终会调用set_page_dirty函数将该page置为dirty状态,所以最终会被writeback机制写回到磁盘中

image-20220316164628152

正如源码分析中所说的,向pipe中写入数据时调用的是pipe_write,这时我给set_page_dirty函数设置断点发现,程序之后都没有调用这个函数,这点从源码中也可以证明。

image-20220316173149686

这说明当我们利用漏洞修改page cache中的内容时,系统并没有将对应的page设置为dirty,所以这个修改对writeback机制来说是不可见的,自然会被忽略掉。

那为什么重启系统文件内容又会恢复呢?那是因为重启系统将所有的缓存都回收了,执行echo 1 > /proc/sys/vm/drop_caches命令能手动回收缓存,也能将文件内容恢复

image-20220316181600303

参考资料

The Dirty Pipe Vulnerability

CVE-2022-0847-DirtyPipe-Exploit

Linux5.8.1源码

CVE-2022-0847 漏洞分析

VFS源码分析-Page Cache Writeback机制

文章目录
  1. 1. 漏洞成因
  2. 2. 源码分析
    1. 2.1. pipe
      1. 2.1.1. 关键数据结构
        1. 2.1.1.1. pipe_inode_info
        2. 2.1.1.2. pipe_buffer
      2. 2.1.2. 写pipe
    2. 2.2. splice
  3. 3. 能真正地覆盖文件内容吗
  4. 4. 参考资料
|