结构体
fd
fd也就是文件描述符,用于标识已经打开的文件、管道、socket等。是进程和内核的桥梁,允许进程执行各种文件操作
/*
 * Lightweight handle pairing an open struct file with lookup flags
 * (e.g. whether a reference/lock was taken); returned by fdget()/fdget_pos().
 */
struct fd {
// the open file this descriptor refers to
struct file *file;
// FDPUT_* flags describing how the reference was obtained
unsigned int flags;
};
file
Linux内核中表示打开文件的结构体,包含了文件操作所需的各种信息和元数据。这是文件系统操作的核心结构之一,允许内核跟踪每个打开的文件及其相关的状态。
/*
 * In-kernel representation of an open file (file, pipe, socket, ...).
 * One instance per successful open; shared across dup'd descriptors.
 * It carries all state needed to perform I/O on that open file.
 */
struct file {
// freeing machinery: either queued for delayed fput or freed via RCU
union {
// lockless list node for the delayed-fput work list
struct llist_node fu_llist;
// RCU callback head used to free this struct after a grace period
struct rcu_head fu_rcuhead;
} f_u;
// mount + dentry identifying the opened path
struct path f_path;
// inode backing this file (cache of f_path.dentry->d_inode)
struct inode *f_inode; /* cached value */
// filesystem-supplied operation table (read, write, open, release, ...)
const struct file_operations *f_op;
/*
 * Protects f_ep_links, f_flags.
 * Must not be taken from IRQ context.
 */
spinlock_t f_lock;
// per-file write lifetime hint (fcntl F_SET_FILE_RW_HINT)
enum rw_hint f_write_hint;
// reference count; struct is released when it drops to zero
atomic_long_t f_count;
// open flags (O_RDONLY, O_NONBLOCK, O_APPEND, ...)
unsigned int f_flags;
// access-mode bits (FMODE_READ, FMODE_WRITE, FMODE_CAN_WRITE, ...)
fmode_t f_mode;
// serializes f_pos updates between threads sharing this open file
struct mutex f_pos_lock;
// current read/write offset
loff_t f_pos;
// owner info for async-I/O signal delivery (fcntl F_SETOWN)
struct fown_struct f_owner;
// credentials of the opener, used for later permission checks
const struct cred *f_cred;
// readahead state for this open file
struct file_ra_state f_ra;
// version counter; lets filesystems detect concurrent changes
u64 f_version;
#ifdef CONFIG_SECURITY
void *f_security;
#endif
/* needed for tty driver, and maybe others */
void *private_data;
#ifdef CONFIG_EPOLL
/* Used by fs/eventpoll.c to link all the hooks to this file */
// list of epoll hooks watching this file
struct list_head f_ep_links;
struct list_head f_tfile_llink;
#endif /* #ifdef CONFIG_EPOLL */
// page-cache address space holding the file's cached data
struct address_space *f_mapping;
// writeback error sequence, for reporting writeback errors via fsync()
errseq_t f_wb_err;
} __randomize_layout
__attribute__((aligned(4))); /* lest something weird decides that 2 is OK */
inode
inode包含文件的所有元数据,支撑访问控制、文件操作、同步、状态管理和特定文件类型支持
/*
* Keep mostly read-only and often accessed (especially for
* the RCU path lookup and 'stat' data) fields at the beginning
* of the 'struct inode'
*/
/*
 * Keep mostly read-only and often accessed (especially for
 * the RCU path lookup and 'stat' data) fields at the beginning
 * of the 'struct inode'
 */
struct inode {
// file type and permission bits
umode_t i_mode;
// filesystem-specific operation flags
unsigned short i_opflags;
// owner's user id
kuid_t i_uid;
// owner's group id
kgid_t i_gid;
// inode flags (S_IMMUTABLE, S_APPEND, ...)
unsigned int i_flags;
#ifdef CONFIG_FS_POSIX_ACL
struct posix_acl *i_acl;
struct posix_acl *i_default_acl;
#endif
// inode operations table (lookup, create, permission, ...)
const struct inode_operations *i_op;
// superblock of the owning filesystem
struct super_block *i_sb;
// page-cache address space for the file's data
struct address_space *i_mapping;
#ifdef CONFIG_SECURITY
void *i_security;
#endif
/* Stat data, not accessed from path walking */
// inode number, unique within the filesystem
unsigned long i_ino;
/*
 * Filesystems may only read i_nlink directly. They shall use the
 * following functions for modification:
 *
 * (set|clear|inc|drop)_nlink
 * inode_(inc|dec)_link_count
 */
// hard-link count: number of directory entries referring to this inode
union {
const unsigned int i_nlink;
unsigned int __i_nlink;
};
// device number; meaningful for device special files
dev_t i_rdev;
// file size in bytes
loff_t i_size;
// access / modification / inode-change timestamps
struct timespec64 i_atime;
struct timespec64 i_mtime;
struct timespec64 i_ctime;
spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */
// byte remainder, log2(block size), write hint, and allocated block count
unsigned short i_bytes;
u8 i_blkbits;
u8 i_write_hint;
blkcnt_t i_blocks;
#ifdef __NEED_I_SIZE_ORDERED
seqcount_t i_size_seqcount;
#endif
/* Misc */
// inode state flags (I_DIRTY, I_NEW, ...)
unsigned long i_state;
// rw-semaphore serializing inode modifications
struct rw_semaphore i_rwsem;
unsigned long dirtied_when; /* jiffies of first dirtying */
unsigned long dirtied_time_when;
// inode hash-table linkage and backing-device IO list
struct hlist_node i_hash;
struct list_head i_io_list; /* backing dev IO list */
#ifdef CONFIG_CGROUP_WRITEBACK
struct bdi_writeback *i_wb; /* the associated cgroup wb */
/* foreign inode detection, see wbc_detach_inode() */
int i_wb_frn_winner;
u16 i_wb_frn_avg_time;
u16 i_wb_frn_history;
#endif
// LRU list used for inode-cache reclaim
struct list_head i_lru; /* inode LRU list */
// linkage on the superblock's list of all its inodes
struct list_head i_sb_list;
// linkage on the backing device's writeback list
struct list_head i_wb_list; /* backing dev writeback list */
union {
struct hlist_head i_dentry;
struct rcu_head i_rcu;
};
// change counter exposed to filesystems/NFS
atomic64_t i_version;
// reference count
atomic_t i_count;
// count of in-flight direct-I/O operations
atomic_t i_dio_count;
// count of writers (files open for write, shared writable mappings)
atomic_t i_writecount;
#if defined(CONFIG_IMA) || defined(CONFIG_FILE_LOCKING)
atomic_t i_readcount; /* struct files open RO */
#endif
union {
const struct file_operations *i_fop; /* former ->i_op->default_file_ops */
void (*free_inode)(struct inode *);
};
struct file_lock_context *i_flctx;
struct address_space i_data;
struct list_head i_devices;
union {
struct pipe_inode_info *i_pipe;
struct block_device *i_bdev;
struct cdev *i_cdev;
char *i_link;
unsigned i_dir_seq;
};
__u32 i_generation;
#ifdef CONFIG_FSNOTIFY
__u32 i_fsnotify_mask; /* all events this inode cares about */
struct fsnotify_mark_connector __rcu *i_fsnotify_marks;
#endif
#ifdef CONFIG_FS_ENCRYPTION
struct fscrypt_info *i_crypt_info;
#endif
#ifdef CONFIG_FS_VERITY
struct fsverity_info *i_verity_info;
#endif
void *i_private; /* fs or device private pointer */
} __randomize_layout;
写入——从write()到vfs
write()系统调用在内核中的入口为sys_write,其核心逻辑由下面的ksys_write实现。
本部分在真正文件系统操作调用之外,只是获取释放文件描述符、更新位置指针、写入前检查等操作
/*
 * Kernel entry for the write() syscall: resolve fd to a struct file,
 * snapshot the file position, delegate to vfs_write(), then commit the
 * updated position and drop the fd reference.
 */
ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
{
// look up the struct file for fd, serializing f_pos access
struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;
if (f.file) {
// file_ppos() returns NULL for positionless (FMODE_STREAM) files
loff_t pos, *ppos = file_ppos(f.file);
if (ppos) {
// work on a local copy so f_pos is only updated on success
pos = *ppos;
ppos = &pos;
}
// let the VFS perform the actual write
ret = vfs_write(f.file, buf, count, ppos);
// commit the advanced position unless the write failed outright
if (ret >= 0 && ppos)
f.file->f_pos = pos;
// drop the reference (and pos lock) taken by fdget_pos()
fdput_pos(f);
}
return ret;
}
接着进入vfs,vfs实际也是调用真正文件系统的接口实现
/*
 * VFS-level write: validate mode/buffer/range, clamp the byte count,
 * bracket the filesystem write with freeze protection, and emit
 * fsnotify events plus accounting on success.
 */
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
ssize_t ret;
// file must have been opened for writing
if (!(file->f_mode & FMODE_WRITE))
return -EBADF; // not opened for write
if (!(file->f_mode & FMODE_CAN_WRITE))
return -EINVAL; // no write method available for this file
// cheap sanity check that the buffer lies in the user address range
if (unlikely(!access_ok(buf, count)))
return -EFAULT; // invalid user-space pointer
// mandatory-lock / overflow checks for [*pos, *pos+count)
ret = rw_verify_area(WRITE, file, pos, count);
if (!ret) {
// clamp to the per-call maximum
if (count > MAX_RW_COUNT)
count = MAX_RW_COUNT;
// take superblock freeze protection for the duration of the write
file_start_write(file);
// dispatch to the filesystem's write implementation
ret = __vfs_write(file, buf, count, pos);
// on success, notify watchers and account bytes written
if (ret > 0) {
fsnotify_modify(file);
add_wchar(current, ret);
}
// bump the task's write-syscall counter
inc_syscw(current);
// release freeze protection
file_end_write(file);
}
return ret;
}
/*
 * Dispatch a write to the filesystem: prefer the classic ->write
 * method, fall back to ->write_iter via new_sync_write(), and report
 * -EINVAL when the file provides neither.
 */
static ssize_t __vfs_write(struct file *file, const char __user *p,
size_t count, loff_t *pos)
{
const struct file_operations *fops = file->f_op;

if (fops->write)
return fops->write(file, p, count, pos);
if (fops->write_iter)
return new_sync_write(file, p, count, pos);
return -EINVAL;
}
以下是ext4文件系统实现vfs接口的方法
/*
 * ext4's implementation of the VFS file_operations interface.
 * Note there is no .write method: synchronous writes are routed through
 * .write_iter (ext4_file_write_iter) by new_sync_write().
 */
const struct file_operations ext4_file_operations = {
.llseek = ext4_llseek,
.read_iter = ext4_file_read_iter,
.write_iter = ext4_file_write_iter,
.unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext4_compat_ioctl,
#endif
.mmap = ext4_file_mmap,
.mmap_supported_flags = MAP_SYNC,
.open = ext4_file_open,
.release = ext4_release_file,
.fsync = ext4_sync_file,
.get_unmapped_area = thp_get_unmapped_area,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
.fallocate = ext4_fallocate,
};
ext4 buffered or direct
在Linux中存在几种不同的IO写入方式
-
DAX: 字节级别的操作。要求额外的硬件支持
-
DIO:直接从用户态写入数据到硬盘中,跳过内核缓冲区,减少了上下文切换和数据复制开销
块级别操作,数据的读写需要是设备的块大小和linux系统的页大小的整数倍
-
BIO:默认标准方式。数据会先从应用程序的地址空间拷贝到 操作系统内核地址空间的页缓存,然后再写入磁盘。根据Linux的延迟写机制,当数据写到操作系统内核地址空间的页缓存,write调用就已经返回成功,数据何时真正落盘由内核稍后决定
缓冲写入操作通常是异步的,数据首先写入页缓存,后续由内核的pdflush守护进程或kworker线程将缓存数据写入磁盘。直接I/O则是同步的,数据直接写入磁盘。
/*
 * ext4 ->write_iter entry point: bail out if the filesystem was
 * forcibly shut down, then route the write to the DAX, direct-I/O,
 * or buffered path.
 */
static ssize_t
ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
// inode backing the file being written
struct inode *inode = file_inode(iocb->ki_filp);
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
return -EIO;
// DAX path: byte-addressable persistent memory, if built in and enabled on the inode
#ifdef CONFIG_FS_DAX
if (IS_DAX(inode))
return ext4_dax_write_iter(iocb, from);
#endif
// O_DIRECT path: bypass the page cache
if (iocb->ki_flags & IOCB_DIRECT)
return ext4_dio_write_iter(iocb, from);
// default: buffered write through the page cache
return ext4_buffered_write_iter(iocb, from);
}
extent
在以下代码中出现了extent,那么extent是什么呢?
extent是一段连续的物理块,表示文件数据在磁盘上的位置和长度。
- 起始逻辑块(文件内的块偏移)
- 起始物理块(磁盘上的块位置)
- 长度(连续块的数量)
每个文件都有一个与之关联的 extent 树,其根节点存储在 inode 中。树中的节点包含 extent 或指向子节点的指针。
叶子节点:存储实际的 extent 信息(起始块、物理块和长度)
内部节点:存储指向下一级节点的指针。
内联数据
内联数据适用于包含大量小文件场景,将小文件数据直接储存到文件系统的元数据结构中,可以减少空间浪费
孤儿列表
孤儿列表用于跟踪在文件操作中可能会被中途删除或者截断的文件,确保即使在系统崩溃的情况下也能被正确处理
比如,在文件删除中,inode被更新表示文件被删除了,但是系统中途崩溃了,而实际删除工作在后面进行,就会导致这些文件变为孤儿,文件元数据仍然存在,可是文件本身被逻辑删除了
ext4 buffered IO
buffered IO部分主要做了以下事情
- 锁定inode,防止并发修改,保证page缓存的一致性
- 检查写入操作是否合法,并进行一些预处理
- 写入
/*
 * ext4 buffered write: serialize writers on the inode lock, run
 * pre-write checks, hand the copy to generic_perform_write(), then
 * advance ki_pos and honor O_SYNC/O_DSYNC via generic_write_sync().
 */
static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
struct iov_iter *from)
{
ssize_t ret;
struct inode *inode = file_inode(iocb->ki_filp);
// non-blocking (IOCB_NOWAIT) writes are not supported on this path
if (iocb->ki_flags & IOCB_NOWAIT)
return -EOPNOTSUPP;
// exclusive inode lock: prevents concurrent modification, keeps page cache consistent
inode_lock(inode);
// validity checks and pre-processing (limits, suid stripping, ...)
ret = ext4_write_checks(iocb, from);
if (ret <= 0)
goto out;
// associate the task with the inode's backing device for writeback accounting
current->backing_dev_info = inode_to_bdi(inode);
// generic VFS loop that copies the data into the page cache
ret = generic_perform_write(iocb->ki_filp, from, iocb->ki_pos);
// detach from the backing device again
current->backing_dev_info = NULL;
out:
// release the inode lock
inode_unlock(inode);
// on success: advance the file position and sync if O_(D)SYNC was requested
if (likely(ret > 0)) {
iocb->ki_pos += ret;
ret = generic_write_sync(iocb, ret);
}
// bytes written, or a negative errno
return ret;
}
写入的执行最后还是回到了VFS。generic_perform_write处理从用户空间到文件的写入数据,方法是遍历数据块、与页面缓存交互以定位或分配页面、将数据复制到这些页面、更新文件的元数据、将页面标记为脏页面以便稍后回写到存储,以及确保整个过程中的数据完整性和错误处理。
/*
 * Generic buffered-write loop: for each page-sized chunk, fault in the
 * source user page, let the filesystem prepare the page-cache page
 * (->write_begin), copy the user data into it, let the filesystem
 * finish (->write_end), and throttle dirty pages along the way.
 * Returns bytes written, or the error if nothing was written.
 */
ssize_t generic_perform_write(struct file *file,
struct iov_iter *i, loff_t pos)
{
struct address_space *mapping = file->f_mapping;
const struct address_space_operations *a_ops = mapping->a_ops;
long status = 0;
ssize_t written = 0;
unsigned int flags = 0;
do {
struct page *page;
unsigned long offset; /* Offset into pagecache page */
unsigned long bytes; /* Bytes to write to page */
size_t copied; /* Bytes copied from user */
void *fsdata;
offset = (pos & (PAGE_SIZE - 1));
bytes = min_t(unsigned long, PAGE_SIZE - offset,
iov_iter_count(i));
again:
/*
 * Bring in the user page that we will copy from _first_.
 * Otherwise there's a nasty deadlock on copying from the
 * same page as we're writing to, without it being marked
 * up-to-date.
 *
 * Not only is this an optimisation, but it is also required
 * to check that the address is actually valid, when atomic
 * usercopies are used, below.
 */
// fault in the source page; bail out on an unreadable user buffer
if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
status = -EFAULT;
break;
}
// abort promptly if a fatal signal is pending
if (fatal_signal_pending(current)) {
status = -EINTR;
break;
}
// filesystem hook: find/allocate the page-cache page and do any
// fs-specific prep (block allocation, journaling, locking)
status = a_ops->write_begin(file, mapping, pos, bytes, flags,
&page, &fsdata);
if (unlikely(status < 0))
break;
// keep caches coherent if the page is writably mapped into user space
if (mapping_writably_mapped(mapping))
flush_dcache_page(page);
// copy user data into the page-cache page (atomic usercopy)
copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
flush_dcache_page(page);
// filesystem hook: update i_size, mark the page dirty, unlock it, etc.
status = a_ops->write_end(file, mapping, pos, bytes, copied,
page, fsdata);
if (unlikely(status < 0))
break;
copied = status;
cond_resched();
iov_iter_advance(i, copied);
if (unlikely(copied == 0)) {
/*
 * If we were unable to copy any data at all, we must
 * fall back to a single segment length write.
 *
 * If we didn't fallback here, we could livelock
 * because not all segments in the iov can be copied at
 * once without a pagefault.
 */
bytes = min_t(unsigned long, PAGE_SIZE - offset,
iov_iter_single_seg_count(i));
goto again;
}
pos += copied;
written += copied;
// throttle the writer if too many pages are dirty
balance_dirty_pages_ratelimited(mapping);
} while (iov_iter_count(i));
return written ? written : status;
}
EXPORT_SYMBOL(generic_perform_write);
ext4 write begin
ext4_write_begin处理将数据写入文件的准备工作。确保正确设置数据结构和状态,以便实际的数据写入操作顺利进行
锁定inode、在页面缓存中分配页面以及初始化日志事务以确保文件系统的一致性、确定需要修改的特定块,并在必要时从磁盘读取任何现有数据,以避免覆盖块的未初始化部分。
/*
 * ext4 ->write_begin: prepare for a buffered write of [pos, pos+len).
 * Grabs the page-cache page, starts a journal transaction, maps the
 * blocks to be written (reading existing data where the write is
 * partial), and returns the locked page in *pagep for write_end.
 */
static int ext4_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
{
struct inode *inode = mapping->host;
int ret, needed_blocks;
handle_t *handle;
int retries = 0;
struct page *page;
pgoff_t index;
unsigned from, to;
// refuse I/O if the filesystem has been forcibly shut down
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
return -EIO;
// tracepoint recording this write_begin
trace_ext4_write_begin(inode, pos, len, flags);
/*
 * Reserve one block more for addition to orphan list in case
 * we allocate blocks but write fails for some reason
 */
// journal credits for the write, plus one for the orphan-list insertion
needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
// page index and the [from, to) byte range inside that page
index = pos >> PAGE_SHIFT;
from = pos & (PAGE_SIZE - 1);
to = from + len;
// small files may live inline in the inode; try that path first
if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
ret = ext4_try_to_write_inline_data(mapping, inode, pos, len,
flags, pagep);
if (ret < 0)
return ret;
if (ret == 1)
return 0;
}
/*
 * Grab the page before starting the journal handle: page allocation
 * may block under memory pressure, and holding a running transaction
 * across that wait risks deadlocking the journal. Doing it in this
 * order keeps the write path efficient and deadlock-free.
 */
retry_grab:
// get (or create) the page-cache page to write into; ENOMEM on failure
page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
return -ENOMEM;
unlock_page(page);
retry_journal:
// start a new journal transaction for this page write
handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
if (IS_ERR(handle)) {
put_page(page);
return PTR_ERR(handle);
}
// relock the page; if it was truncated meanwhile, start over
lock_page(page);
if (page->mapping != mapping) {
/* The page got truncated from under us */
unlock_page(page);
put_page(page);
ext4_journal_stop(handle);
goto retry_grab;
}
/* In case writeback began while the page was unlocked */
wait_for_stable_page(page);
// map the blocks to write, choosing unwritten extents when dioread_nolock applies
#ifdef CONFIG_FS_ENCRYPTION
if (ext4_should_dioread_nolock(inode))
ret = ext4_block_write_begin(page, pos, len,
ext4_get_block_unwritten);
else
ret = ext4_block_write_begin(page, pos, len,
ext4_get_block);
#else
if (ext4_should_dioread_nolock(inode))
ret = __block_write_begin(page, pos, len,
ext4_get_block_unwritten);
else
ret = __block_write_begin(page, pos, len, ext4_get_block);
#endif
if (!ret && ext4_should_journal_data(inode)) {
ret = ext4_walk_page_buffers(handle, page_buffers(page),
from, to, NULL,
do_journal_get_write_access);
}
// error path: undo blocks allocated past i_size, retry on ENOSPC if allowed
if (ret) {
bool extended = (pos + len > inode->i_size) &&
!ext4_verity_in_progress(inode);
unlock_page(page);
/*
 * __block_write_begin may have instantiated a few blocks
 * outside i_size. Trim these off again. Don't need
 * i_size_read because we hold i_mutex.
 *
 * Add inode to orphan list in case we crash before
 * truncate finishes
 */
if (extended && ext4_can_truncate(inode))
ext4_orphan_add(handle, inode);
ext4_journal_stop(handle);
if (extended) {
ext4_truncate_failed_write(inode);
/*
 * If truncate failed early the inode might
 * still be on the orphan list; we need to
 * make sure the inode is removed from the
 * orphan list in that case.
 */
if (inode->i_nlink)
ext4_orphan_del(NULL, inode);
}
if (ret == -ENOSPC &&
ext4_should_retry_alloc(inode->i_sb, &retries))
goto retry_journal;
put_page(page);
return ret;
}
*pagep = page;
return ret;
}
block_write_begin通过映射页中必要的块来准备要写的页。遍历每个块,确保将其映射并标记为最新的,如果有必要,还会对需要从磁盘读取的块发起读取,以避免覆盖未初始化的数据
// • page: 需要写入数据的页面。
// • pos: 写操作的起始位置。
// • len: 写入数据的长度。
// • get_block: 用于映射逻辑块号到物理块号的回调函数。
// • iomap: I/O 映射结构体,用于描述 I/O 操作(仅 __block_write_begin_int 接收;__block_write_begin 固定传入 NULL)。
/*
 * Prepare the blocks of @page covering [pos, pos+len) for a write,
 * using @get_block to map logical to physical blocks. Thin wrapper
 * around __block_write_begin_int() with no iomap.
 */
int __block_write_begin(struct page *page, loff_t pos, unsigned len,
get_block_t *get_block)
{
return __block_write_begin_int(page, pos, len, get_block, NULL);
}
EXPORT_SYMBOL(__block_write_begin);
/*
 * Core of block_write_begin(): walk the buffer_heads covering @page,
 * map any unmapped blocks in [pos, pos+len) via @get_block (or @iomap
 * when @get_block is NULL), zero the parts of newly-allocated blocks
 * the write will not cover, and read in partially-overwritten blocks
 * from disk so existing data is not clobbered.
 */
int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
get_block_t *get_block, struct iomap *iomap)
{
unsigned from = pos & (PAGE_SIZE - 1);
unsigned to = from + len;
struct inode *inode = page->mapping->host;
unsigned block_start, block_end;
sector_t block;
int err = 0;
unsigned blocksize, bbits;
struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
BUG_ON(!PageLocked(page));
BUG_ON(from > PAGE_SIZE);
BUG_ON(to > PAGE_SIZE);
BUG_ON(from > to);
// attach buffer heads to the page and compute block-size parameters
head = create_page_buffers(page, inode, 0);
blocksize = head->b_size;
bbits = block_size_bits(blocksize);
block = (sector_t)page->index << (PAGE_SHIFT - bbits);
// walk every buffer head (one per block) of the page
for(bh = head, block_start = 0; bh != head || !block_start;
block++, block_start=block_end, bh = bh->b_this_page) {
block_end = block_start + blocksize;
// blocks entirely outside the write range: only propagate uptodate state
if (block_end <= from || block_start >= to) {
if (PageUptodate(page)) {
if (!buffer_uptodate(bh))
set_buffer_uptodate(bh);
}
continue;
}
if (buffer_new(bh))
clear_buffer_new(bh);
// unmapped block: map it via get_block(), or translate from the iomap
if (!buffer_mapped(bh)) {
WARN_ON(bh->b_size != blocksize);
if (get_block) {
err = get_block(inode, block, bh, 1);
if (err)
break;
} else {
iomap_to_bh(inode, block, bh, iomap);
}
// freshly allocated block: drop stale device aliases and zero
// the portions the upcoming write will not overwrite
if (buffer_new(bh)) {
clean_bdev_bh_alias(bh);
if (PageUptodate(page)) {
clear_buffer_new(bh);
set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
continue;
}
if (block_end > to || block_start < from)
zero_user_segments(page,
to, block_end,
block_start, from);
continue;
}
}
if (PageUptodate(page)) {
if (!buffer_uptodate(bh))
set_buffer_uptodate(bh);
continue;
}
// partially-overwritten block with stale contents: read it from disk
if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
!buffer_unwritten(bh) &&
(block_start < from || block_end > to)) {
ll_rw_block(REQ_OP_READ, 0, 1, &bh);
*wait_bh++=bh;
}
}
/*
 * If we issued read requests - let them complete.
 */
// wait for the reads issued above; EIO if any buffer never became uptodate
while(wait_bh > wait) {
wait_on_buffer(*--wait_bh);
if (!buffer_uptodate(*wait_bh))
err = -EIO;
}
if (unlikely(err))
page_zero_new_buffers(page, from, to);
return err;
}
ext4 write end
ext4_write_end对页的数据写入做收尾工作。
如果写入扩展了文件,则更新inode大小,必要时将inode标记为脏的,并处理任何清理,包括处理日志事务,如果写入部分失败,则截断超出新文件大小的未初始化块。保证写操作后数据的完整性和一致性。
/*
* We need to pick up the new inode size which generic_commit_write gave us
* `file' can be NULL - eg, when called from page_symlink().
*
* ext4 never places buffers on inode->i_mapping->private_list. metadata
* buffers are managed internally.
*/
/*
 * We need to pick up the new inode size which generic_commit_write gave us
 * `file' can be NULL - eg, when called from page_symlink().
 *
 * ext4 never places buffers on inode->i_mapping->private_list. metadata
 * buffers are managed internally.
 */
// ext4 ->write_end: commit the copied data (inline or via buffer heads),
// update i_size while still holding the page lock, stop the journal
// handle, and truncate blocks allocated beyond the final size if the
// write fell short — preserving consistency after partial failures.
static int ext4_write_end(struct file *file,
struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
{
handle_t *handle = ext4_journal_current_handle();
struct inode *inode = mapping->host;
loff_t old_size = inode->i_size;
int ret = 0, ret2;
int i_size_changed = 0;
int inline_data = ext4_has_inline_data(inode);
bool verity = ext4_verity_in_progress(inode);
trace_ext4_write_end(inode, pos, len, copied);
// inline data: finish the inline write; otherwise commit via buffer heads
if (inline_data) {
ret = ext4_write_inline_data_end(inode, pos, len,
copied, page);
if (ret < 0) {
unlock_page(page);
put_page(page);
goto errout;
}
copied = ret;
} else
copied = block_write_end(file, mapping, pos,
len, copied, page, fsdata);
/*
 * it's important to update i_size while still holding page lock:
 * page writeout could otherwise come in and zero beyond i_size.
 *
 * If FS_IOC_ENABLE_VERITY is running on this inode, then Merkle tree
 * blocks are being written past EOF, so skip the i_size update.
 */
if (!verity)
i_size_changed = ext4_update_inode_size(inode, pos + copied);
unlock_page(page);
put_page(page);
// if the file grew past its old size, extend the page-cache view of i_size
if (old_size < pos && !verity)
pagecache_isize_extended(inode, old_size, pos);
/*
 * Don't mark the inode dirty under page lock. First, it unnecessarily
 * makes the holding time of page lock longer. Second, it forces lock
 * ordering of page lock and transaction start for journaling
 * filesystems.
 */
// mark the inode dirty if its size changed or it holds inline data
if (i_size_changed || inline_data)
ext4_mark_inode_dirty(handle, inode);
// write ended short of pos+len: park the inode on the orphan list so a
// crash before the truncate below cannot leak blocks past i_size
if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode))
/* if we have allocated more blocks and copied
 * less. We will have blocks allocated outside
 * inode->i_size. So truncate them
 */
ext4_orphan_add(handle, inode);
errout:
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
if (pos + len > inode->i_size && !verity) {
ext4_truncate_failed_write(inode);
/*
 * If truncate failed early the inode might still be
 * on the orphan list; we need to make sure the inode
 * is removed from the orphan list in that case.
 */
if (inode->i_nlink)
ext4_orphan_del(NULL, inode);
}
return ret ? ret : copied;
}
ext4 direct IO
/*
 * ext4 direct-I/O write: lock the inode (optionally non-blocking),
 * fall back to buffered I/O when DIO is unsupported, serialize
 * unaligned AIO, handle size-extending writes via the orphan list,
 * issue the write through iomap, and buffer-write any remainder.
 */
static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
ssize_t ret;
size_t count;
loff_t offset;
handle_t *handle;
struct inode *inode = file_inode(iocb->ki_filp);
bool extend = false, overwrite = false, unaligned_aio = false;
// lock the inode; with IOCB_NOWAIT, return -EAGAIN instead of blocking
if (iocb->ki_flags & IOCB_NOWAIT) {
if (!inode_trylock(inode))
return -EAGAIN;
} else {
inode_lock(inode);
}
// verify the inode supports direct I/O
if (!ext4_dio_supported(inode)) {
inode_unlock(inode);
/*
 * Fallback to buffered I/O if the inode does not support
 * direct I/O.
 */
return ext4_buffered_write_iter(iocb, from);
}
// pre-write validity checks
ret = ext4_write_checks(iocb, from);
if (ret <= 0) {
inode_unlock(inode);
return ret;
}
// unaligned async DIO must wait for in-flight DIO to avoid data corruption
offset = iocb->ki_pos;
count = iov_iter_count(from);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
!is_sync_kiocb(iocb) && ext4_unaligned_aio(inode, from, offset)) {
unaligned_aio = true;
inode_dio_wait(inode);
}
/*
 * Determine whether the I/O will overwrite allocated and initialized
 * blocks. If so, check to see whether it is possible to take the
 * dioread_nolock path.
 */
// aligned overwrite of initialized blocks with dioread_nolock:
// mark it as an overwrite and downgrade to a shared lock
if (!unaligned_aio && ext4_overwrite_io(inode, offset, count) &&
ext4_should_dioread_nolock(inode)) {
overwrite = true;
downgrade_write(&inode->i_rwsem);
}
// write extends past the on-disk size: start a short journal handle,
// add the inode to the orphan list (crash safety), note the extension,
// and stop the handle before issuing the I/O
if (offset + count > EXT4_I(inode)->i_disksize) {
handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
goto out;
}
ret = ext4_orphan_add(handle, inode);
if (ret) {
ext4_journal_stop(handle);
goto out;
}
extend = true;
ext4_journal_stop(handle);
}
// perform the direct I/O write through the iomap infrastructure
ret = iomap_dio_rw(iocb, from, &ext4_iomap_ops, &ext4_dio_write_ops,
is_sync_kiocb(iocb) || unaligned_aio || extend);
// if we extended the file, finalize i_disksize / orphan-list state
if (extend)
ret = ext4_handle_inode_extension(inode, offset, ret, count);
out:
if (overwrite)
inode_unlock_shared(inode);
else
inode_unlock(inode);
if (ret >= 0 && iov_iter_count(from)) {
ssize_t err;
loff_t endbyte;
// DIO stopped short: write the remainder via the buffered path
offset = iocb->ki_pos;
err = ext4_buffered_write_iter(iocb, from);
if (err < 0)
return err;
// flush and invalidate the page-cache range covered by the buffered
// fallback so direct-I/O semantics are preserved as far as possible
ret += err;
endbyte = offset + err - 1;
err = filemap_write_and_wait_range(iocb->ki_filp->f_mapping,
offset, endbyte);
if (!err)
invalidate_mapping_pages(iocb->ki_filp->f_mapping,
offset >> PAGE_SHIFT,
endbyte >> PAGE_SHIFT);
}
return ret;
}
ext4 BIO与DIO代码有感
ext4 BIO(Buffered IO)与DIO(Direct IO)
-
ext4 BIO与DIO都尝试对inode进行锁定。不同的是DIO还允许无等待,也就是在锁已经被获取的情况下,直接返回
-
BIO经过内核page缓存,而DIO则直接从用户空间写入到设备
-
DIO还确保写入操作覆盖范围内的缓存页面被写入磁盘并失效,以保证直接 I/O 语义,和未对齐的异步直接 I/O 写入,防止数据损坏
Ref
- https://elixir.bootlin.com/linux/v5.5-rc2/source