引言

在《Linux 系统编程手册》5.4 节，关于文件描述符和打开的文件关系是这样描述的：「内核为所有打开的文件维护了一个系统级的描述表（open file description table），有时也称之为打开文件表（open file table），并将表中的每个条目称为打开文件句柄（open file handle）。而针对每个进程，内核又为其维护了打开文件的描述符表（open file descriptor table）」。

后来在《Linux 文件句柄的这些技术内幕，只有 1% 的人知道》这篇文章中，看到了这样的描述：「简单来说，每个进程都有一个打开的文件表（fdtable)。表中的每一项是struct file类型，包含了打开文件的一些属性比如偏移量，读写访问模式等，这是真正意义上的文件句柄」。

那么这里提到的「打开的文件表」又是什么呢，怎么又变成了每个进程都有的呢？《Linux 系统编程手册》中不是说「打开文件表（open file table）」是独立于进程的系统级表吗？是不是觉得有点困惑和矛盾呢？

为了能够解答困惑，加深对文件描述符的理解，特地深扒了下 Linux 内核的相关源码。接下来，我们将会看到上文提到的描述表（open file description table）、打开文件表（open file table）、打开文件句柄（open file handle）这三种抽象的数据结构具体是怎么实现的，以便能够更好地理解书中的概念。

文件描述符与打开的文件关系

在区分这些概念前，我们先来看看 open() 系统调用的 man page 中提到的一段说明：

A call to open() creates a new open file description, an entry in the
system-wide table of open files. The open file description records
the file offset and the file status flags (see below). A file
descriptor is a reference to an open file description; this reference
is unaffected if pathname is subsequently removed or modified to
refer to a different file…

在看完上面的介绍后，结合《Linux 系统编程手册》提到的名词，我们可以作出这样的映射：

open file description: 打开的文件句柄（open file handle），它才会关联到真正的文件 inode
table of open files: 就是书中提到的系统级描述表（open file table）
file descriptor: 其实就是一个针对文件句柄的引用

好啦，下面来看看与文件描述符相关的实现细节，并了解几个重要的系统调用实现。

实现细节

以下代码摘自 Linux Kernel 5.4，考虑到内核代码非常复杂，处理细节也很多，这里并没有把每个函数或数据结构所有代码都贴出来，只保留了一些和本文焦点有关的代码行。

Linux 内核中相关的数据结构

每个进程都关联指向了一个 files_struct，即打开的文件信息：

/*
 * Excerpt of the per-task descriptor. Only the two members relevant to
 * file handling are shown; everything else is elided.
 */
struct task_struct {
    // ...
    /* Filesystem information */
    struct fs_struct *fs;
    /* Open file information */
    struct files_struct *files;
    // ...
};

那么，打开的文件信息长什么样呢？

/*
 * Open file information of a task; task_struct refers to one through its
 * `files` member.
 */
struct files_struct {
  /*
   * read mostly part
   */
    // Reference count; a files_struct can be shared with other tasks
    atomic_t count;
    bool resize_in_progress;
    wait_queue_head_t resize_wait;
    // fdtable is the per-process file descriptor table
    struct fdtable __rcu *fdt;
    struct fdtable fdtab;
  /*
   * written part on a separate cache line in SMP
   */
    spinlock_t file_lock ____cacheline_aligned_in_smp;
    unsigned int next_fd;
    unsigned long close_on_exec_init[1];
    unsigned long open_fds_init[1];
    unsigned long full_fds_bits_init[1];
    // Inline array of NR_OPEN_DEFAULT struct file pointers
    struct file __rcu * fd_array[NR_OPEN_DEFAULT];
};

而关于 fdtable （这个可以理解为进程独立的打开文件的描述符表（open file descriptor table））的定义如下：

/*
 * Per-process file descriptor table: maps a file descriptor (the array
 * index) to the struct file it refers to.
 */
struct fdtable {
    // Upper bound for valid descriptors; __close_fd rejects fd >= max_fds
    unsigned int max_fds;
    // The fd array holds, for each file descriptor of the process, a
    // pointer to its struct file (the open file description).
    // A struct file can be shared between descriptors (e.g. via dup),
    // whereas open() creates a new struct file.
    struct file __rcu **fd; /* current fd array */
    unsigned long *close_on_exec;
    unsigned long *open_fds;
    unsigned long *full_fds_bits;
    struct rcu_head rcu;
};

再来看看文件句柄（即 open file description）是什么？它维护了和打开文件有关的重要信息：

/*
 * Excerpt of struct file, the kernel object behind an "open file
 * description": it carries the state of one open of a file (path, inode,
 * operations, flags, offset, ...). Members not relevant here are elided.
 */
struct file {
    // ...
    // Path of the file
    struct path        f_path;
    // Pointer to the inode, i.e. the actual file
    struct inode        *f_inode;    /* cached value */
    // Operations available on this file
    const struct file_operations    *f_op;
    /*
     * Protects f_ep_links, f_flags.
     * Must not be taken from IRQ context.
     */
    spinlock_t        f_lock;
    enum rw_hint        f_write_hint;
    // Reference count; the object is only reclaimed once it drops to 0
    atomic_long_t        f_count;
    unsigned int         f_flags;
    fmode_t            f_mode;
    struct mutex        f_pos_lock;
    // Current file offset
    loff_t            f_pos;
    struct fown_struct    f_owner;
    const struct cred    *f_cred;
    struct file_ra_state    f_ra;
    u64            f_version;
    // ...
};

画了一个图，方便了解上述数据结构关系：
file-descriptor-file-description

三个重要的系统调用实现

open

open() 系统调用会分配新的文件句柄（file description），用来维护与打开文件相关的元信息（如偏移量、路径、操作方法等），并会给进程返回一个文件描述符（其实就是个小整数）。它对应的实现流程如下：

// fs/open.c
/*
 * Core of open(): reserves a file descriptor, creates the struct file for
 * `filename` and installs it in the current process's descriptor table.
 *
 * Returns the new file descriptor, or the error value produced by
 * build_open_flags(), getname(), get_unused_fd_flags() or do_filp_open().
 */
long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
{
    struct open_flags op;
    // Do not be misled by the name: at this point `fd` holds the status of
    // build_open_flags(); a non-zero value is an error and is returned as-is.
    int fd = build_open_flags(flags, mode, &op);
    struct filename *tmp;
    if (fd)
        return fd;
    tmp = getname(filename);
    if (IS_ERR(tmp))
        return PTR_ERR(tmp);

    // Reserve a file descriptor
    fd = get_unused_fd_flags(flags);
    if (fd >= 0) {
        // Create the struct file (open file description)
        struct file *f = do_filp_open(dfd, tmp, &op);
        if (IS_ERR(f)) {
            // Opening failed: give the reserved descriptor back and
            // report the error carried in the pointer instead.
            put_unused_fd(fd);
            fd = PTR_ERR(f);
        } else {
            fsnotify_open(f);
            // Register the file in the process's fdtable
            fd_install(fd, f);
        }
    }
    putname(tmp);
    return fd;
}

// fs/namei.c
/*
 * Returns the struct file obtained from path_openat() for `pathname`.
 * This is an excerpt: the set-up of `nd` and `flags`, and any handling
 * after the path_openat() call, are elided.
 */
struct file *do_filp_open(int dfd, struct filename *pathname,
        const struct open_flags *op)
{
    // ...
    struct file *filp;
    filp = path_openat(&nd, op, flags | LOOKUP_RCU);
    // ...
    return filp;
}

/*
 * Allocates the struct file for an open request and returns it.
 * This is an excerpt: the path walk and the actual open that happen
 * between allocation and return are elided.
 */
static struct file *path_openat(struct nameidata *nd,
            const struct open_flags *op, unsigned flags)
{
    struct file *file;
    // A fresh struct file is allocated for every open
    file = alloc_empty_file(op->open_flag, current_cred());
    // ...
    return file;
}

/*
 * Installs `file` at descriptor `fd` in the files_struct of the current
 * task by delegating to __fd_install().
 */
void fd_install(unsigned int fd, struct file *file)
{
    __fd_install(current->files, fd, file);
}

/*
 * Stores `file` in slot `fd` of a descriptor table.
 * This is an excerpt: the code that obtains `fdt` (presumably from
 * `files`) is elided.
 */
void __fd_install(struct files_struct *files, unsigned int fd,
        struct file *file)
{
    struct fdtable *fdt;
    // ...
    rcu_assign_pointer(fdt->fd[fd], file);
}

dup

dup() 系统调用实际上是会分配一个新的文件描述符，但是底层还是会指向传入的文件描述符关联的文件句柄（file description）。

// fs/file.c
/*
 * dup(fildes): allocates a new file descriptor that refers to the same
 * struct file (open file description) as `fildes`.
 *
 * Returns the new descriptor, -EBADF when `fildes` does not refer to an
 * open file, or the error from get_unused_fd_flags() when no descriptor
 * could be allocated.
 */
SYSCALL_DEFINE1(dup, unsigned int, fildes)
{
    int ret = -EBADF;
    // Look up the struct file behind the given file descriptor.
    // fget_raw() goes through __fget(), which takes a reference on it.
    struct file *file = fget_raw(fildes);

    if (file) {
        ret = get_unused_fd_flags(0);
        if (ret >= 0)
            // The new descriptor now points at the same struct file
            fd_install(ret, file);
        else
            // No descriptor available: drop the reference taken above
            fput(file);
    }
    return ret;
}

// fs/file.c
/*
 * Looks up the struct file that descriptor `fd` refers to in the current
 * task's files_struct and takes `refs` references on it.
 *
 * Returns the struct file, or NULL when no file is found for `fd` or the
 * file's f_mode has any bit of `mask` set.
 */
static struct file *__fget(unsigned int fd, fmode_t mask, unsigned int refs)
{
    // Open file information (files_struct) of the current task
    struct files_struct *files = current->files;
    struct file *file;

    // The lookup is done inside an RCU read-side critical section
    rcu_read_lock();
loop:
    // Map fd -> struct file
    file = fcheck_files(files, fd);
    if (file) {
        /* File object ref couldn't be taken.
         * dup2() atomicity guarantee is the reason
         * we loop to catch the new file (or NULL pointer)
         */
        if (file->f_mode & mask)
            file = NULL;
        else if (!get_file_rcu_many(file, refs))
            goto loop;
    }
    rcu_read_unlock();

    return file;
}

close

close() 系统调用会回收文件描述符，同时会给文件描述符指向的文件句柄（file description）的引用计数减 1，并在需要的时候进行回收。该系统调用的实现流程总结如下：

其对应的代码实现如下：

/*
 * close(fd): closes descriptor `fd` in the current task's files_struct
 * via __close_fd(). This is an excerpt: the handling of `retval` after
 * the call is elided.
 */
SYSCALL_DEFINE1(close, unsigned int, fd)
{
    int retval = __close_fd(current->files, fd);
    // ...
}

// fs/file.c
/*
 * __close_fd closes file descriptor `fd`; `files` is the open file
 * information (files_struct) the descriptor belongs to, e.g. that of the
 * current process.
 *
 * Returns the result of filp_close(), or -EBADF when `fd` is out of range
 * or has no file installed.
 */
int __close_fd(struct files_struct *files, unsigned fd)
{
    struct file *file;
    struct fdtable *fdt;

    spin_lock(&files->file_lock);
    fdt = files_fdtable(files);
    // Validate the file descriptor against the table size
    if (fd >= fdt->max_fds)
        goto out_unlock;
    // Look up the associated struct file (open file description)
    file = fdt->fd[fd];
    // An empty slot means the descriptor is not open: nothing to close,
    // and filp_close() must not be handed a NULL pointer.
    if (!file)
        goto out_unlock;
    // Release the file descriptor
    rcu_assign_pointer(fdt->fd[fd], NULL);
    __put_unused_fd(files, fd);
    spin_unlock(&files->file_lock);
    // Close the open file description
    return filp_close(file, files);

out_unlock:
    spin_unlock(&files->file_lock);
    return -EBADF;
}

// filp_close closes the given struct file (open file description).
// `id` is passed on to the file's flush operation; the caller __close_fd
// passes the files_struct pointer here, so it is not a thread ID
// (presumably it identifies the owner for POSIX locks; confirm against
// the kernel source).
// Returns the result of f_op->flush when that operation exists, else 0.
int filp_close(struct file *filp, fl_owner_t id)
{
    int retval = 0;
    // A reference count of 0 indicates an error: the file cannot be
    // closed. The problem is logged and 0 is returned.
    if (!file_count(filp)) {
        printk(KERN_ERR "VFS: Close: file count is 0\n");
        return 0;
    }
    if (filp->f_op->flush)
        retval = filp->f_op->flush(filp, id);
    // Drops one reference; the struct file may be reclaimed, depending on
    // its reference count
    fput(filp);
    return retval;
}

// The following are defined in fs/file_table.c
// fput drops a single reference on `file`.
void fput(struct file *file)
{
    // Decrement the reference count of the struct file by 1
    fput_many(file, 1);
}

// fput_many drops `refs` references on `file`.
// This is an excerpt: the body of the branch below is elided.
void fput_many(struct file *file, unsigned int refs)
{
    // Atomic operation: file->f_count -= refs.
    // The branch runs when atomic_long_sub_and_test() reports true,
    // presumably when the count has reached zero (confirm against the
    // kernel atomic API).
    if (atomic_long_sub_and_test(refs, &file->f_count)) {
        // ...
    }
}

总结

本文介绍了下文件描述符和打开文件的关系，并简要讲解了 Linux 内核中关于这些抽象概念的具体实现。同时，还简单介绍了下 open(), dup() 和 close() 系统调用的实现。相信在看完这些后，能够更加深入和清晰地理解这三个概念：系统级打开文件表（open file table）/描述表（open file description table）、文件句柄（open file handle）/文件描述（file description）以及文件描述符（file descriptor）。

最后，回答下本文开头提到的问题。其实，站在进程的角度来看，作者在《Linux 文件句柄的这些技术内幕，只有 1% 的人知道》中提到「每个进程都有一个打开的文件表（fdtable)」这样的说法其实也没什么问题。只是此处打开的文件表（fdtable）和《Linux 系统编程手册》中提到的系统级打开文件表（open file table）并非一个概念。作者提到的打开的文件表（fdtable）其实正是抽象的进程文件描述符（file descriptor）表。当然，我们没必要为此纠结，咬文嚼字也没什么意义，不过对于困惑的东西还是能够搞清楚才好。

Valar Morghulis

理解文件描述符与文件句柄

引言