系统调用eventfd介绍

eventfd是一个基于匿名文件实现的系统调用，可用于高效的进程间通信。本文介绍eventfd的内核实现和用户态测试，方便后续编程时考虑使用eventfd。

内核实现

eventfd是通过syscall实现，如下


/*
 * eventfd2 system call entry point: forwards the initial counter value and
 * the caller-supplied flags unchanged to do_eventfd().
 */
SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
{
        return do_eventfd(count, flags);
}
 
/*
 * eventfd system call entry point: takes only the initial counter value and
 * calls do_eventfd() with flags fixed to 0.
 */
SYSCALL_DEFINE1(eventfd, unsigned int, count)
{
        return do_eventfd(count, 0);
}

其实现如下：


/*
 * Common implementation behind the eventfd and eventfd2 system calls.
 *
 * Allocates and initialises an eventfd_ctx, reserves a file descriptor,
 * creates the "[eventfd]" anonymous-inode file backed by eventfd_fops with
 * the context as its private data, and installs the file on the descriptor.
 *
 * @count: initial value stored in ctx->count.
 * @flags: EFD_* flags; any bit outside EFD_FLAGS_SET is rejected.
 *
 * Returns the new file descriptor on success, or a negative error code:
 * -EINVAL for unknown flags, -ENOMEM if the context cannot be allocated,
 * or the error produced by get_unused_fd_flags()/anon_inode_getfile().
 */
static int do_eventfd(unsigned int count, int flags)
{
        struct eventfd_ctx *ctx;
        struct file *file;
        int fd;
 
        /* Check the EFD_* constants for consistency.  */
        BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
        BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);
 
        /* Refuse any flag bit that is not part of EFD_FLAGS_SET. */
        if (flags & ~EFD_FLAGS_SET)
                return -EINVAL;
 
        ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
        if (!ctx)
                return -ENOMEM;
 
        kref_init(&ctx->kref);
        init_waitqueue_head(&ctx->wqh);
        ctx->count = count;
        /* The context keeps the full flag set, before it is masked below. */
        ctx->flags = flags;
        /*
         * NOTE(review): the result of ida_simple_get() is stored without a
         * check here; presumably eventfd_free_ctx() copes with a negative
         * id -- confirm against its definition.
         */
        ctx->id = ida_simple_get(&eventfd_ida, 0, 0, GFP_KERNEL);
 
        /*
         * Only the bits in EFD_SHARED_FCNTL_FLAGS, plus O_RDWR, are passed
         * on to the descriptor and file creation calls.
         */
        flags &= EFD_SHARED_FCNTL_FLAGS;
        flags |= O_RDWR;
        fd = get_unused_fd_flags(flags);
        if (fd < 0)
                goto err;
 
        file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx, flags);
        if (IS_ERR(file)) {
                /* Give the reserved descriptor back and report the file error. */
                put_unused_fd(fd);
                fd = PTR_ERR(file);
                goto err;
        }
 
        file->f_mode |= FMODE_NOWAIT;
        fd_install(fd, file);
        return fd;
err:
        /* Both failure paths arrive here with fd holding a negative error code. */
        eventfd_free_ctx(ctx);
        return fd;
}

do_eventfd比较重要的点在于anon_inode_getfile，这里通过匿名inode创建了一个名为"[eventfd]"的文件，并把eventfd_ctx作为该文件的私有数据（private_data）。

再重要的就是eventfd_ctx结构，如下：


/*
 * Per-eventfd state. One instance is allocated by do_eventfd() and handed to
 * the anonymous file as its private data.
 */
struct eventfd_ctx {
        /* Reference count; set up with kref_init() in do_eventfd(). */
        struct kref kref;
        /* Wait queue for readers/pollers; its lock also guards "count". */
        wait_queue_head_t wqh;
        /*
         * Every time that a write(2) is performed on an eventfd, the
         * value of the __u64 being written is added to "count" and a
         * wakeup is performed on "wqh". A read(2) will return the "count"
         * value to userspace, and will reset "count" to zero. The kernel
         * side eventfd_signal() also, adds to the "count" counter and
         * issue a wakeup.
         */
        __u64 count;
        /* EFD_* flags as passed to do_eventfd() (e.g. EFD_SEMAPHORE). */
        unsigned int flags;
        /* Identifier obtained from eventfd_ida in do_eventfd(). */
        int id;
};

这里看到了我们read和write作用的是count值，所以这个fd只能通过count来传递信息。而read/write是通过标准的fops实现，如下


/*
 * File operations attached to the "[eventfd]" anonymous file created in
 * do_eventfd(). Reads go through the read_iter hook and writes through the
 * plain write hook; llseek is a no-op. The fdinfo hook is only present when
 * CONFIG_PROC_FS is enabled.
 */
static const struct file_operations eventfd_fops = {
#ifdef CONFIG_PROC_FS
        .show_fdinfo    = eventfd_show_fdinfo,
#endif
        .release        = eventfd_release,
        .poll           = eventfd_poll,
        .read_iter      = eventfd_read,
        .write          = eventfd_write,
        .llseek         = noop_llseek,
};

read操作的核心实现是eventfd_ctx_do_read：如果flags设置了EFD_SEMAPHORE且count不为0，则只读出1并把count减1；否则读出整个count值并把count清零，如下：


/*
 * Consume the counter of an eventfd context.
 *
 * @ctx: context whose counter is read; ctx->wqh.lock must be held.
 * @cnt: receives the amount taken from the counter.
 *
 * With EFD_SEMAPHORE set and a non-zero counter, exactly 1 is taken.
 * Otherwise the whole counter is taken, which leaves it at zero.
 */
void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
{
        lockdep_assert_held(&ctx->wqh.lock);

        if ((ctx->flags & EFD_SEMAPHORE) && ctx->count)
                *cnt = 1;
        else
                *cnt = ctx->count;

        /* Remove what was just handed out from the counter. */
        ctx->count -= *cnt;
}

write操作直接在eventfd_write中实现，每次write都会把用户写入的值累加到count上，也就是


ctx->count += ucnt

poll操作通过poll_wait把自己挂到等待队列上，再用READ_ONCE保证count只被读取一次。根据注释我们可以知道，poll和write使用的是同一把ctx->wqh.lock，count的读取发生在poll_wait加锁之后，因此下面这种时序是安全的，不会丢失唤醒。


*     poll                               write
*     -----------------                  ------------
*     lock ctx->wqh.lock (in poll_wait)
*     count = ctx->count
*     __add_wait_queue
*     unlock ctx->wqh.lock
*                                        lock ctx->wqh.lock
*                                        ctx->count += n
*                                        if (waitqueue_active)
*                                          wake_up_locked_poll
*                                        unlock ctx->wqh.lock
*     eventfd_poll returns 0

其代码实现如下


/*
 * poll handler for an eventfd file.
 *
 * Registers the caller on the context's wait queue, then takes a single
 * snapshot of the counter and derives the ready mask from it:
 *   - POLLIN  when the counter is non-zero,
 *   - POLLERR when the counter equals ULLONG_MAX,
 *   - POLLOUT when the counter is below ULLONG_MAX - 1.
 */
static unsigned int eventfd_poll(struct file *file, poll_table *wait)
{
        struct eventfd_ctx *ctx = file->private_data;
        unsigned int mask;
        u64 snapshot;

        /* The counter must be sampled only after poll_wait() has run. */
        poll_wait(file, &ctx->wqh, wait);
        snapshot = READ_ONCE(ctx->count);

        mask = snapshot > 0 ? POLLIN : 0;
        if (snapshot == ULLONG_MAX)
                mask |= POLLERR;
        if (snapshot < ULLONG_MAX - 1)
                mask |= POLLOUT;

        return mask;
}

而对于内核空间对eventfd的调用，可以通过eventfd_signal函数，其实现是eventfd_signal_mask，这里同样是对count做累加，如下：


/*
 * Kernel-side signalling of an eventfd: add to the counter and wake waiters.
 *
 * @ctx:  eventfd context to signal.
 * @n:    amount to add to ctx->count.
 * @mask: extra poll bits OR-ed with EPOLLIN for the wakeup.
 *
 * Returns the amount actually added. This is smaller than @n when adding all
 * of it would take the counter past ULLONG_MAX, and 0 when the call is
 * refused because eventfd_wake_count is already non-zero on this CPU.
 */
__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, unsigned mask)
{
        unsigned long flags;
 
        /*
         * Deadlock or stack overflow issues can happen if we recurse here
         * through waitqueue wakeup handlers. If the caller uses potentially
         * nested waitqueues with custom wakeup handlers, then it should
         * check eventfd_signal_count() before calling this function. If
         * it returns true, the eventfd_signal() call should be deferred to a
         * safe context.
         */
        if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count)))
                return 0;
 
        spin_lock_irqsave(&ctx->wqh.lock, flags);
        /* Raised for the duration of the wakeup so the check above can see it. */
        this_cpu_inc(eventfd_wake_count);
        /* Clamp n so that the counter stops at ULLONG_MAX instead of wrapping. */
        if (ULLONG_MAX - ctx->count < n)
                n = ULLONG_MAX - ctx->count;
        ctx->count += n;
        if (waitqueue_active(&ctx->wqh))
                wake_up_locked_poll(&ctx->wqh, EPOLLIN | mask);
        this_cpu_dec(eventfd_wake_count);
        spin_unlock_irqrestore(&ctx->wqh.lock, flags);
 
        return n;
}

应用测试

为了使用eventfd，我们可以直接使用c库封装的eventfd函数，示例如下：


#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/eventfd.h>
#include <unistd.h>
 
/*
 * Minimal eventfd demo: create an eventfd with an initial counter of 0,
 * write the value 1 to it twice, then read the counter back and print it.
 *
 * Returns 0 on success, or 1 if eventfd(), write() or read() fails; the
 * failing call is reported through perror().
 */
int main(void) {
    uint64_t value = 1;
    int rc = 1;
    int efd;

    efd = eventfd(0, 0);
    if (efd == -1) {
        perror("eventfd");
        return 1;
    }

    /* Each write adds the 8-byte value to the eventfd counter. */
    if (write(efd, &value, sizeof(value)) == -1) {
        perror("write");
        goto out;
    }

    if (write(efd, &value, sizeof(value)) == -1) {
        perror("write");
        goto out;
    }

    /* The read hands back the accumulated counter in value. */
    if (read(efd, &value, sizeof(value)) == -1) {
        perror("read");
        goto out;
    }
    /* uint64_t is not unsigned long on every ABI, so use PRIu64, not %lu. */
    printf("[kylin]: read value: %" PRIu64 "\n", value);
    rc = 0;

out:
    /* Single exit path so the descriptor is closed on errors as well. */
    close(efd);
    return rc;
}

运行后结果如下：


[kylin]: read value: 2

至此eventfd介绍完成，相信大家在使用高性能的进程间通信的时候，可以适当考虑eventfd。

目录

系统调用eventfd介绍

内核实现

应用测试