Cgroup －从 CPU 的资源隔离说起（四）

V2EX = way to explore

V2EX 是一个关于分享和探索的地方

现在注册

已注册用户请登录

Distributions

› Ubuntu

› Fedora

› CentOS

中文资源站

› 网易开源镜像站

这是一个创建于 3328 天前的主题，其中的信息可能已经有所发展或是发生改变。

CPU 资源隔离在 sys 较高的情况下是什么表现？

内核资源不冲突的情况

首先我们简单说一下什么叫 sys 较高。先看 mpstat 命令的输出：

[root@zorrozou-pc ~]# mpstat 1
Linux 3.10.90-1-linux (zorrozou-pc)         12/24/15    _x86_64_    (24 CPU)

16:08:52     CPU    %usr   %nice    %sys %iowait    %irq   %soft  %steal  %guest  %gnice   %idle
16:08:53     all    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00  100.00
16:08:54     all    0.00    0.00    0.04    0.00    0.04    0.00    0.00    0.00    0.00   99.92
16:08:55     all    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00  100.00
16:08:56     all    0.04    0.00    0.04    0.00    0.00    0.00    0.00    0.00    0.00   99.92
16:08:57     all    0.04    0.00    0.04    0.00    0.00    0.00    0.00    0.00    0.00   99.92
16:08:58     all    0.00    0.00    0.04    0.00    0.00    0.00    0.00    0.00    0.00   99.96

Average:     all    0.01    0.00    0.03    0.00    0.01    0.00    0.00    0.00    0.00   99.95

这里面我们看到 cpu 的使用比率分了很多栏目，我们一般评估进程占用 CPU 的时候，最重要的是 %usr 和 %sys 。 %sys 一般是指，进程陷入内核执行时所占用的时间，这些时间是内核在工作。常见的情况是，进程执行过程中执行了某个系统调用，而陷入内核态执行所产生的 cpu 占用。

所以在这一部分，我们需要重新提供一个测试用例，让 sys 部分的 cpu 占用变高。基于筛质数进行改造即可，我们这次让每个筛质数的线程，在做运算之前都用非阻塞方式 open()打开一个文件，每次拿到一个数运算的时候，循环中都用系统调用 read()读一下文件。以此来增加 sys 占用时间的比率。先来改程序：

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <strings.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <fcntl.h>

/* Number of worker threads created by main(). */
#define NUM 48
/* First candidate handed to the workers; main() advances in steps of 2. */
#define START 1010001
/* Exclusive upper bound of the candidate range. */
#define END 1020000

/* Protects `count`; every read or write of `count` happens with it held. */
pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
/* Broadcast whenever `count` changes, by both main() and the workers. */
pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
/*
 * Single-slot hand-off between main() and the workers:
 *    0  -> slot is empty (main() may store the next candidate)
 *   -1  -> no more candidates, workers leave their loop
 *   any other value -> candidate waiting to be taken by one worker
 */
static int count = 0;

void *prime(void *p)
{
    int n, i, flag;
    int num, fd, ret;
    char name[BUFSIZ];
    char buf[BUFSIZ];

    bzero(name, BUFSIZ);

    num = (int *)p;
    sprintf(name, "/tmp/tmpfilezorro%d", num);

    fd = open(name, O_RDWR|O_CREAT|O_TRUNC|O_NONBLOCK , 0644);
    if (fd < 0) {
        perror("open()");
        exit(1);
    }

    while (1) {
        if (pthread_mutex_lock(&mutex) != 0) {
            perror("pthread_mutex_lock()");
            pthread_exit(NULL);
        }
        while (count == 0) {
            if (pthread_cond_wait(&cond, &mutex) != 0) {
                perror("pthread_cond_wait()");
                pthread_exit(NULL);
            }
        }
        if (count == -1) {
            if (pthread_mutex_unlock(&mutex) != 0) {
                perror("pthread_mutex_unlock()");
                pthread_exit(NULL);
            }
            break;
        }
        n = count;
        count = 0;
        if (pthread_cond_broadcast(&cond) != 0) {
            perror("pthread_cond_broadcast()");
            pthread_exit(NULL);
        }
        if (pthread_mutex_unlock(&mutex) != 0) {
            perror("pthread_mutex_unlock()");
            pthread_exit(NULL);
        }
        flag = 1;
        for (i=2;i<n/2;i++) {
            ret = read(fd, buf, BUFSIZ);
            if (ret < 0) {
                perror("read()");
            }
            if (n%i == 0) {
                flag = 0;
                break;
            }
        }
        if (flag == 1) {
            printf("%d is a prime form %d!\n", n, pthread_self());
        }
    }

    close(fd);
    pthread_exit(NULL);
}

/*
 * Producer: starts NUM worker threads running prime(), feeds them every
 * second integer from START up to (but excluding) END through the shared
 * `count` slot, then publishes -1 to tell the workers to stop and joins them.
 *
 * Exits with status 0 on success and 1 on any pthread failure.
 */
int main(void)
{
    pthread_t tid[NUM];
    int ret, i, next;

    for (i = 0; i < NUM; i++) {
        /* The index is passed by value inside the pointer; widen to long
         * first so the int-to-pointer conversion is well defined in size. */
        ret = pthread_create(&tid[i], NULL, prime, (void *)(long)i);
        if (ret != 0) {
            /* pthread_* functions return the error number and leave errno
             * alone, so print the code instead of calling perror(). */
            fprintf(stderr, "pthread_create(): error %d\n", ret);
            exit(1);
        }
    }

    /*
     * Publish each candidate, then the -1 termination marker, with one and
     * the same lock / wait-for-empty / store / broadcast / unlock sequence.
     * On failure the process exits: calling pthread_exit() from main would
     * leave the workers blocked on the condition variable forever.
     */
    for (i = START; ; i += 2) {
        next = (i < END) ? i : -1;

        ret = pthread_mutex_lock(&mutex);
        if (ret != 0) {
            fprintf(stderr, "pthread_mutex_lock(): error %d\n", ret);
            exit(1);
        }
        /* Wait until a worker has taken the previous value (slot == 0). */
        while (count != 0) {
            ret = pthread_cond_wait(&cond, &mutex);
            if (ret != 0) {
                fprintf(stderr, "pthread_cond_wait(): error %d\n", ret);
                exit(1);
            }
        }
        count = next;
        ret = pthread_cond_broadcast(&cond);
        if (ret != 0) {
            fprintf(stderr, "pthread_cond_broadcast(): error %d\n", ret);
            exit(1);
        }
        ret = pthread_mutex_unlock(&mutex);
        if (ret != 0) {
            fprintf(stderr, "pthread_mutex_unlock(): error %d\n", ret);
            exit(1);
        }

        if (next == -1) {
            break;
        }
    }

    for (i = 0; i < NUM; i++) {
        ret = pthread_join(tid[i], NULL);
        if (ret != 0) {
            fprintf(stderr, "pthread_join(): error %d\n", ret);
            exit(1);
        }
    }

    exit(0);
}

我们将筛质数的范围缩小了两个数量级，并且每个线程都打开一个文件，每次计算的循环中都 read 一遍。此时这个进程执行的时候的 cpu 使用状态是这样的：

17:20:46     CPU    %usr   %nice    %sys %iowait    %irq   %soft  %steal  %guest  %gnice   %idle
17:20:47     all   53.04    0.00   46.96    0.00    0.00    0.00    0.00    0.00    0.00    0.00
17:20:47       0   53.00    0.00   47.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00
17:20:47       1   53.00    0.00   47.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00
17:20:47       2   53.00    0.00   47.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00
17:20:47       3   53.00    0.00   47.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00
17:20:47       4   53.00    0.00   47.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00
17:20:47       5   53.00    0.00   47.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00
17:20:47       6   53.00    0.00   47.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00
17:20:47       7   53.00    0.00   47.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00
17:20:47       8   53.00    0.00   47.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00
17:20:47       9   53.00    0.00   47.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00
17:20:47      10   53.00    0.00   47.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00
17:20:47      11   53.47    0.00   46.53    0.00    0.00    0.00    0.00    0.00    0.00    0.00
17:20:47      12   52.00    0.00   48.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00
17:20:47      13   53.00    0.00   47.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00
17:20:47      14   53.47    0.00   46.53    0.00    0.00    0.00    0.00    0.00    0.00    0.00
17:20:47      15   53.00    0.00   47.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00
17:20:47      16   53.00    0.00   47.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00
17:20:47      17   53.00    0.00   47.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00
17:20:47      18   53.00    0.00   47.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00
17:20:47      19   53.00    0.00   47.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00
17:20:47      20   53.00    0.00   47.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00
17:20:47      21   53.00    0.00   47.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00
17:20:47      22   53.00    0.00   47.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00
17:20:47      23   53.00    0.00   47.00    0.00    0.00    0.00    0.00    0.00    0.00    0.00

[zorro@zorrozou-pc ~/test]$ time ./prime_sys &> /dev/null

real    0m12.227s
user    2m34.869s
sys 2m17.239s

测试用例已经基本符合我们的测试条件，可以达到近 50%的 sys 占用，下面开始进行对比测试。测试方法跟上一轮一样，仍然用 jerry 账户运行一个相同的程序在另一个 cgroup 不断的循环，然后分别看在不同资源分配比率下的 zorro 用户筛质数程序运行的时间。以下是测试结果：

shares zorro/shares jerry （核心数）	cpuset realtime	cpushare realtime	cpuquota realtime
2000/22000(2)	2m27.666s	2m27.599s	2m27.918s
4000/20000(4)	1m12.621s	1m14.345s	1m13.581s
6000/18000(6)	0m48.612s	0m49.474s	0m48.730s
8000/16000(8)	0m36.412s	0m37.269s	0m36.784s
12000/12000(12)	0m24.611s	0m24.624s	0m24.628s
16000/8000(16)	0m18.401s	0m18.688s	0m18.480s
24000/0(24)	0m12.188s	0m12.487s	0m12.147s

shares zorro/shares jerry （核心数）	cpuset systime	cpushare systime	cpuquota systime
2000/22000(2)	2m20.115s	2m21.024s	2m21.854s
4000/20000(4)	2m16.450s	2m21.103s	2m20.352s
6000/18000(6)	2m18.273s	2m20.455s	2m20.039s
8000/16000(8)	2m18.054s	2m20.611s	2m19.891s
12000/12000(12)	2m20.358s	2m18.331s	2m20.363s
16000/8000(16)	2m17.724s	2m18.958s	2m18.637s
24000/0(24)	2m16.723s	2m17.707s	2m16.176s

这次我们多了一个表格专门记录 systime 时间占用。根据数据结果我们会发现，在这次测试循环中，三种隔离方式都呈现出随着资源的增加进程执行的总时间线性下降，并且隔离效果区别不大。由于调用 read 的次数一样， systime 的使用基本都稳定在一个固定的时间范围内。这说明，在 sys 占用较高的情况下，各种 cpu 资源隔离手段都表现出比较理想的效果。

内核资源冲突的情况

但是现实的生产环境往往并不是这么理想的，有没有可能在某种情况下，各种 CPU 资源隔离的手段并不会表现出这么理想的效果呢？有没有可能不同的隔离方式会导致进程的执行会有影响呢？其实这是很可能发生的。我们上一轮测试中，每个 cgroup 中的线程打开的文件都不是同一个文件，内核在处理这种场景的时候，并不需要使用内核中的一些互斥资源(比如自旋锁或者屏障)进行竞争条件的处理。如果环境变成大家 read 的是同一个文件，那么情况就可能有很大不同了。下面我们来测试一下每个 zorro 组中的所有线程都 open 同一个文件并且 read 时的执行效果，我们照例把测试用例代码贴出来：

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <strings.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <fcntl.h>

/* Number of worker threads created by main(). */
#define NUM 48
/* First candidate handed to the workers; main() advances in steps of 2. */
#define START 1010001
/* Exclusive upper bound of the candidate range. */
#define END 1020000

/* Protects `count`; every read or write of `count` happens with it held. */
pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
/* Broadcast whenever `count` changes, by both main() and the workers. */
pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
/*
 * Single-slot hand-off between main() and the workers:
 *    0  -> slot is empty (main() may store the next candidate)
 *   -1  -> no more candidates, workers leave their loop
 *   any other value -> candidate waiting to be taken by one worker
 */
static int count = 0;
/* The one file that every worker thread opens and reads. */
#define PATH "/etc/passwd"

void *prime(void *p)
{
    int n, i, flag;
    int num, fd, ret;
    char name[BUFSIZ];
    char buf[BUFSIZ];

    fd = open(PATH, O_RDONLY|O_NONBLOCK);
    if (fd < 0) {
        perror("open()");
        exit(1);
    }

    while (1) {
        if (pthread_mutex_lock(&mutex) != 0) {
            perror("pthread_mutex_lock()");
            pthread_exit(NULL);
        }
        while (count == 0) {
            if (pthread_cond_wait(&cond, &mutex) != 0) {
                perror("pthread_cond_wait()");
                pthread_exit(NULL);
            }
        }
        if (count == -1) {
            if (pthread_mutex_unlock(&mutex) != 0) {
                perror("pthread_mutex_unlock()");
                pthread_exit(NULL);
            }
            break;
        }
        n = count;
        count = 0;
        if (pthread_cond_broadcast(&cond) != 0) {
            perror("pthread_cond_broadcast()");
            pthread_exit(NULL);
        }
        if (pthread_mutex_unlock(&mutex) != 0) {
            perror("pthread_mutex_unlock()");
            pthread_exit(NULL);
        }
        flag = 1;
        for (i=2;i<n/2;i++) {
            ret = read(fd, buf, BUFSIZ);
            if (ret < 0) {
                perror("read()");
            }
            if (n%i == 0) {
                flag = 0;
                break;
            }
        }
        if (flag == 1) {
            printf("%d is a prime form %d!\n", n, pthread_self());
        }
    }

    close(fd);
    pthread_exit(NULL);
}

/*
 * Producer: starts NUM worker threads running prime(), feeds them every
 * second integer from START up to (but excluding) END through the shared
 * `count` slot, then publishes -1 to tell the workers to stop and joins them.
 *
 * Exits with status 0 on success and 1 on any pthread failure.
 */
int main(void)
{
    pthread_t tid[NUM];
    int ret, i, next;

    for (i = 0; i < NUM; i++) {
        /* The index is passed by value inside the pointer; widen to long
         * first so the int-to-pointer conversion is well defined in size. */
        ret = pthread_create(&tid[i], NULL, prime, (void *)(long)i);
        if (ret != 0) {
            /* pthread_* functions return the error number and leave errno
             * alone, so print the code instead of calling perror(). */
            fprintf(stderr, "pthread_create(): error %d\n", ret);
            exit(1);
        }
    }

    /*
     * Publish each candidate, then the -1 termination marker, with one and
     * the same lock / wait-for-empty / store / broadcast / unlock sequence.
     * On failure the process exits: calling pthread_exit() from main would
     * leave the workers blocked on the condition variable forever.
     */
    for (i = START; ; i += 2) {
        next = (i < END) ? i : -1;

        ret = pthread_mutex_lock(&mutex);
        if (ret != 0) {
            fprintf(stderr, "pthread_mutex_lock(): error %d\n", ret);
            exit(1);
        }
        /* Wait until a worker has taken the previous value (slot == 0). */
        while (count != 0) {
            ret = pthread_cond_wait(&cond, &mutex);
            if (ret != 0) {
                fprintf(stderr, "pthread_cond_wait(): error %d\n", ret);
                exit(1);
            }
        }
        count = next;
        ret = pthread_cond_broadcast(&cond);
        if (ret != 0) {
            fprintf(stderr, "pthread_cond_broadcast(): error %d\n", ret);
            exit(1);
        }
        ret = pthread_mutex_unlock(&mutex);
        if (ret != 0) {
            fprintf(stderr, "pthread_mutex_unlock(): error %d\n", ret);
            exit(1);
        }

        if (next == -1) {
            break;
        }
    }

    for (i = 0; i < NUM; i++) {
        ret = pthread_join(tid[i], NULL);
        if (ret != 0) {
            fprintf(stderr, "pthread_join(): error %d\n", ret);
            exit(1);
        }
    }

    exit(0);
}

此时 jerry 组中的所有线程仍然是每个线程一个文件，与上一轮测试一样。测试结果如下：

shares zorro/shares jerry （核心数）	cpuset realtime	cpushare realtime	cpuquota realtime
2000/22000(2)	2m27.402s	2m41.015s	4m37.149s
4000/20000(4)	1m18.178s	1m25.214s	2m42.455s
6000/18000(6)	0m52.592s	1m2.691s	1m48.492s
8000/16000(8)	0m43.598s	0m57.000s	1m21.044s
12000/12000(12)	0m52.182s	0m59.613s	0m58.004s
16000/8000(16)	0m50.712s	0m54.371s	0m56.911s
24000/0(24)	0m50.599s	0m50.550s	0m50.496s

shares zorro/shares jerry （核心数）	cpuset systime	cpushare systime	cpuquota systime
2000/22000(2)	2m19.829s	2m47.706s	6m39.800s
4000/20000(4)	2m41.928s	3m6.575s	8m14.087s
6000/18000(6)	2m45.671s	3m38.722s	8m13.668s
8000/16000(8)	3m14.434s	4m54.451s	8m12.904s
12000/12000(12)	7m39.542s	9m7.751s	8m57.332s
16000/8000(16)	10m47.425s	11m41.443s	12m21.056s
24000/0(24)	17m17.661s	17m7.311s	17m14.788s

观察这轮测试的结果我们会发现，当线程同时 read 同一个文件时，时间的消耗并不再呈现线性下降的趋势了，而且，随着分配的资源越来越多， sys 占用时间也越来越高，这种现象如何解释呢？本质上来讲，使用 cgroup 进行资源隔离时，内核资源仍然是共享的。如果业务使用内核资源时没有产生冲突，那么隔离效果应该会比较理想，但是业务一旦使用了会导致内核资源冲突的逻辑时，那么业务的执行效率就会下降，此时可能所有进程在内核中处理的时候都可能会在竞争的资源上忙等（如果使用了 spinlock ）。自然的，如果多个 cgroup 的进程之间也正好使用了可能会导致内核触发竞争条件的资源时，自然也会发生所谓的 cgroup 之间的相互影响。可能的现象就是，当某一个业务 A 的 cgroup 正在运行着，突然 B 业务的 cgroup 有请求要处理，会导致 A 业务的响应速度和处理能力下降。而这种相互干扰，正是资源隔离手段想要尽量避免的。我们认为，如果出现了上述效果，那么资源隔离手段就是打了折扣的。

根据我们的实验结果可以推论，在内核资源有竞争条件的情况下， cpuset 的资源隔离方式表现出了相对其他方式的优势， cpushare 方式的性能折损尚可接受，而 cpuquota 表现出了最差的性能，或者说在 cpuquota 的隔离条件下， cgroup 之间进程相互影响的可能性最大。

那么在内核资源存在竞争的时候， cgroup 的 cpu 资源隔离会有相互干扰。结论就是这样了么？这个推断靠谱么？我们再来做一轮实验，这次只对比 cpuset 和 cpuquota 。这次我们不用 jerry 来运行干扰程序测试隔离性，我们让 zorro 只在单纯的隔离状态下，在有内核资源竞争的条件下进行运算效率测试，就是说这个环境没有多个 cgroup 可能造成的相互影响。先来看数据：

cpu 比率（核心数）	cpuset realtime	cpuquota realtime
8.3%(2)	2m26.815s	9m4.490s
16.7%(4)	1m17.894s	4m49.167s
25%(6)	0m52.356s	3m13.144s
33.3%(8)	0m42.946s	2m23.010s
50%(12)	0m52.014s	1m33.571s
66.7%(16)	0m50.903s	1m10.553s
100%(24)	0m50.331s	0m50.304s

cpu 比率（核心数）	cpuset systime	cpuquota systime
8.3%(2)	2m18.713s	15m27.738s
16.7%(4)	2m41.172s	16m30.741s
25%(6)	2m44.618s	16m30.964s
33.3%(8)	3m12.587s	16m18.366s
50%(12)	7m36.929s	15m55.407s
66.7%(16)	10m49.327s	16m1.463s
100%(24)	17m9.482s	17m9.533s

不知道看完这组数据之后，大家会不会困惑？ cpuset 的测试结果跟上一轮基本一样，这可以理解。但是为什么 cpuquota 这轮测试反倒比刚才有 jerry 用户进程占用 cpu 进行干扰的时候的性能更差了？

如果了解了内核在这种资源竞争条件的原理的话，这个现象并不难解释。可以这样想，如果某一个资源存在竞争的话，那么是不是同时竞争的人越多，那么对于每个人来说，单次得到资源的可能性更低？比如说，老师给学生发苹果，每次只发一个，但是同时有 10 个人一起抢，每个人每次抢到苹果的几率是 10%，如果 20 个人一起抢，那么每次每人抢到苹果的几率就只有 5% 了。在内核竞争条件下，也是一样的道理，资源只有一个，当抢的进程少的时候，每个进程抢到资源的概率大，于是浪费在忙等上的时间就少。本轮测试的 cpuset 就可以说明这个现象，可以观察到， cpuset systime 随着分配的核心数的增多而上升，就是同时跑的进程越多， sys 消耗在忙等资源上的时间就越大。而 cpuquota systime 消耗从头到尾都基本变化不大，意味着在以 quota 方式分配 cpu 的时候，所有核心都是用得上的，所以一直都有 24 个进程在抢资源，大家消耗在忙等上的时间是一样的。
为什么有 jerry 进程同时占用 cpu 的情况下， cpuquota 反倒效率要快些呢？这个其实也好理解。在 jerry 进程执行的时候，这个 cgroup 的相关线程打开的是不同的文件，所以从内核竞争上没有冲突。另外， jerry 消耗了部分 cpu ，导致内核会在 zorro 的进程和 jerry 的进程之间发生调度，这意味着，同一时刻核心数只有 24 个，可能有 18 个在给 jerry 的线程使用，另外 6 个在给 zorro 的进程使用，这导致 zorro 同时争抢资源的进程个数不能始终保持 24 个，所以内核资源冲突反倒减小了。这导致，使用 cpuquota 的情况下，有其他 cgroup 执行的时候，还可能会使某些业务的执行效率提升，而不是下降。这种相互影响实在太让人意外了！但这确实是事实！

那么什么情况下会导致 cgroup 之间的相互影响使性能下降呢？也好理解，当多个 cgroup 的应用之间使用了相同的内核资源的时候。请大家思考一个问题：现实情况是同一种业务使用冲突资源的可能性更大还是不同业务使用冲突资源的可能性更大呢？从概率上说应该是同一种业务。从这个角度出发来看，如果我们有两台多核服务器，有两个跟我们测试逻辑类似的业务 A 、 B ，让你选择一种部署方案，你是选择让 A 、 B 两个业务分别独占一个服务器？还是让 A 、 B 业务使用资源隔离分别在两个服务器上占用 50%的资源？通过这轮分析我想答案很明确了：

从容灾的角度说，让某一个业务使用多台服务器肯定会增加容灾能力。
从资源利用率的角度说，如果让一个业务放在一个服务器上，那么他在某些资源冲突的情况下并不能发挥最大效率。然而如果使用 cgroup 分布在两个不同的服务器上，无论你用 cpuset ，还是 cpushare ，又或是 cpuquota ，它的 cpu 性能表现都应该强于在一个独立的服务器上部署。况且 cgroup 的 cpu 隔离是在 cfs 中实现的，这种隔离几乎是不会浪费额外的计算能力的，就是说，做隔离相比不做隔离，系统本身的性能损耗都可以忽略不计。

那么，究竟还有什么会妨碍我们使用 cgroup 的 cpu 资源隔离呢？

5 条回复 • 2015-12-30 00:40:51 +08:00