学习I/O栈的一环,分析pread的代码。
流程
系统调用
用户通过syscall指令触发系统调用,CPU切换到内核态并跳转到入口函数entry_SYSCALL_64
entry_SYSCALL_64通过汇编call do_syscall_64进行系统调用,里面大概有这样的代码
if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) {
/* Invalid system call, but still a system call. */
regs->ax = __x64_sys_ni_syscall(regs);
}可以发现进行了系统调用,如果x64和x32先后都失败了,则给函数返回值,也就是存放在寄存器%rax处的值赋为__x64_sys_ni_syscall(regs)。
然后查看x64的调用,大概如下
static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
{
/*
* Convert negative numbers to very high and thus out of range
* numbers for comparisons.
*/
unsigned int unr = nr;
if (likely(unr < NR_syscalls)) {
unr = array_index_nospec(unr, NR_syscalls);
regs->ax = sys_call_table[unr](regs);
return true;
}
return false;
}可以发现根据nr查表,找到对应的系统调用,也就是下文的ksys_pread64进行调用,然后返回值放在%rax。
ksys_pread64
首先是进入系统调用的第一步,也就是ksys_pread64函数
ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count,
loff_t pos)
{
struct fd f;
ssize_t ret = -EBADF;
if (pos < 0)
return -EINVAL;
f = fdget(fd);
if (f.file) {
ret = -ESPIPE;
if (f.file->f_mode & FMODE_PREAD)
ret = vfs_read(f.file, buf, count, &pos);
fdput(f);
}
return ret;
}首先是fd,结构如下,是用户空间的文件描述符
struct fd {
struct file *file;
unsigned int flags;
};其中包含了file类型,file类型存储了打开的文件的相关所有信息,具体如下所示
struct file {
union {
struct llist_node fu_llist;
struct rcu_head fu_rcuhead;
} f_u;
struct path f_path;
struct inode *f_inode; /* 缓存的inode指针 */
const struct file_operations *f_op;
spinlock_t f_lock; /* 保护f_ep和f_flags的锁 */
enum rw_hint f_write_hint;
atomic_long_t f_count; /* 引用计数 */
unsigned int f_flags; /* 打开标志,如O_RDONLY等 */
fmode_t f_mode; /* 文件访问模式 */
struct mutex f_pos_lock; /* 文件位置锁 */
loff_t f_pos; /* 文件当前位置指针 */
struct fown_struct f_owner; /* 文件所有权信息 */
const struct cred *f_cred; /* 凭证 */
struct file_ra_state f_ra; /* 预读状态 */
u64 f_version;
void *private_data;/* 私有数据,如tty驱动使用 */
struct address_space *f_mapping; /* 页缓存映射 */
errseq_t f_wb_err; /* 回写错误序列 */
errseq_t f_sb_err; /* 超级块错误序列 */
}然后是返回值ret,其中初值为-EBADF,表示是一个无效文件;如果pos小于零,则返回-EINVAL表示偏移量无效。
然后通过fdget获取对应文件描述符fd获取f,也就是打开文件状态,fdget会增加文件的引用计数,关闭时通过fdput释放。
如果f.file不为空,则成功打开了文件,先把ret变为-ESPIPE,表示不支持随机访问,然后判断打开文件的mode是否支持read,调用vfs接口vfs_read去读取文件内容并更新ret,最后通过fdput,减少文件的引用计数。
最后返回读取结果ret。
vfs_read
ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
ssize_t ret;
if (!(file->f_mode & FMODE_READ))
return -EBADF;
if (!(file->f_mode & FMODE_CAN_READ))
return -EINVAL;
if (unlikely(!access_ok(buf, count)))
return -EFAULT;
ret = rw_verify_area(READ, file, pos, count);
if (ret)
return ret;
if (count > MAX_RW_COUNT)
count = MAX_RW_COUNT;
if (file->f_op->read)
ret = file->f_op->read(file, buf, count, pos);
else if (file->f_op->read_iter)
ret = new_sync_read(file, buf, count, pos);
else
ret = -EINVAL;
if (ret > 0) {
fsnotify_access(file);
add_rchar(current, ret);
}
inc_syscr(current);
return ret;
}首先是检查权限
- 读取权限:FMODE_READ表示文件是以读权限打开的。
- 能力检查:FMODE_CAN_READ表示文件系统允许读取。
- 内存安全:access_ok()验证用户空间缓冲区的有效性。
然后判断了溢出和安全
- 根据
count和pos进行一些判断,判断pos+count的大小等表示是否会出现越界或其他情况。 - 获取
sid,也就是文件的安全标识符,判断当前sid是否与刚打开时相同,如果不同则重新进行权限检查。
然后判断当前读取字节数是否超过上限MAX_RW_COUNT,通常是2GB,超过则只读2GB。
而后尝试先调用read,如不存在,则调用read_iter。
read_iter
static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
struct iovec iov = { .iov_base = buf, .iov_len = len };
struct kiocb kiocb;
struct iov_iter iter;
ssize_t ret;
init_sync_kiocb(&kiocb, filp);
kiocb.ki_pos = (ppos ? *ppos : 0);
iov_iter_init(&iter, READ, &iov, 1, len);
ret = call_read_iter(filp, &kiocb, &iter);
BUG_ON(ret == -EIOCBQUEUED);
if (ppos)
*ppos = kiocb.ki_pos;
return ret;
}创建一个I/O向量结构体(iovec),将用户缓冲区信息封装起来。这是Linux中进行I/O操作的标准方式:
iov_base: 指向目标缓冲区的指针iov_len: 缓冲区长度
kiocb(Kernel I/O Control Block)是内核中表示I/O操作的结构:
init_sync_kiocb初始化为同步操作模式
设置读取位置(ki_pos),如果提供了位置指针就使用它,否则从位置0开始
然后进行初始化
static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp)
{
*kiocb = (struct kiocb) {
.ki_filp = filp,
.ki_flags = iocb_flags(filp),
.ki_hint = ki_hint_validate(file_write_hint(filp)),
.ki_ioprio = get_current_ioprio(),
};
}参数的意义为
ki_filp: 表示该I/O操作关联的文件对象ki_flags: 根据文件特性设置适当的I/O控制标志,可能包含诸如直接I/O、同步I/O、追加I/O等标志ki_hint: 提供文件访问模式的提示信息(如顺序读取、随机访问等)ki_ioprio: 设置当前进程的I/O优先级,影响调度器对该I/O请求的处理优先级
然后初始化iter
void iov_iter_init(struct iov_iter *i, unsigned int direction,
const struct iovec *iov, unsigned long nr_segs,
size_t count)
{
WARN_ON(direction & ~(READ | WRITE));
*i = (struct iov_iter) {
.iter_type = ITER_IOVEC,
.nofault = false,
.data_source = direction,
.iov = iov,
.nr_segs = nr_segs,
.iov_offset = 0,
.count = count
};
}下面就是开始同步读的调用
call_read_iter
具体只是一个调用,会调用ext4_file_read_iter
static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct inode *inode = file_inode(iocb->ki_filp);
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
return -EIO;
if (!iov_iter_count(to))
return 0; /* skip atime */
#ifdef CONFIG_FS_DAX
if (IS_DAX(inode))
return ext4_dax_read_iter(iocb, to);
#endif
if (iocb->ki_flags & IOCB_DIRECT)
return ext4_dio_read_iter(iocb, to);
return generic_file_read_iter(iocb, to);
}然后是generic_file_read_iter,可以参考这里
ssize_t
generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
size_t count = iov_iter_count(iter);
ssize_t retval = 0;
if (!count)
return 0; /* skip atime */
if (iocb->ki_flags & IOCB_DIRECT) {
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
loff_t size;
size = i_size_read(inode);
if (iocb->ki_flags & IOCB_NOWAIT) {
if (filemap_range_needs_writeback(mapping, iocb->ki_pos,
iocb->ki_pos + count - 1))
return -EAGAIN;
} else {
retval = filemap_write_and_wait_range(mapping,
iocb->ki_pos,
iocb->ki_pos + count - 1);
if (retval < 0)
return retval;
}
file_accessed(file);
retval = mapping->a_ops->direct_IO(iocb, iter);
if (retval >= 0) {
iocb->ki_pos += retval;
count -= retval;
}
if (retval != -EIOCBQUEUED)
iov_iter_revert(iter, count - iov_iter_count(iter));
/*
* Btrfs can have a short DIO read if we encounter
* compressed extents, so if there was an error, or if
* we've already read everything we wanted to, or if
* there was a short read because we hit EOF, go ahead
* and return. Otherwise fallthrough to buffered io for
* the rest of the read. Buffered reads will not work for
* DAX files, so don't bother trying.
*/
if (retval < 0 || !count || iocb->ki_pos >= size ||
IS_DAX(inode))
return retval;
}
return filemap_read(iocb, iter, retval);
}
EXPORT_SYMBOL(generic_file_read_iter);filemap_read上面都是直接I/O,即跳过pagecache进行读取,这里主要讨论filemap_read的具体操作。
filemap_read
filemap_read是基于folio的页缓存核心读取函数
filemap_read的目的是从文件系统的page cache中读取数据并将其复制到用户空间。如果数据不在page cache中,则会触发预读或者加载缺失的页。
然后调用filemap_get_pages函数
static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter,
struct pagevec *pvec)
{
struct file *filp = iocb->ki_filp;
struct address_space *mapping = filp->f_mapping;
struct file_ra_state *ra = &filp->f_ra;
pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
pgoff_t last_index;
struct page *page;
int err = 0;
last_index = DIV_ROUND_UP(iocb->ki_pos + iter->count, PAGE_SIZE);
retry:
if (fatal_signal_pending(current))
return -EINTR;
filemap_get_read_batch(mapping, index, last_index, pvec);
...
}可以发现首先调用了filemap_get_read_batch,功能是尝试从page cache中获得page,传入的参数为
mapping:文件的地址空间对象,管理文件在页缓存中的存在。index:起始页面索引,要获取的第一个页面。max:最大页面索引,获取操作的上限。pvec:页面向量,用于存储获取的页面引用。
可以发现max是这样计算的
last_index = DIV_ROUND_UP(iocb->ki_pos + iter->count, PAGE_SIZE);即(当前位置+偏移量)/PAGE_SIZE。
/*
* filemap_get_read_batch - Get a batch of pages for read
*
* Get a batch of pages which represent a contiguous range of bytes
* in the file. No tail pages will be returned. If @index is in the
* middle of a THP, the entire THP will be returned. The last page in
* the batch may have Readahead set or be not Uptodate so that the
* caller can take the appropriate action.
*/
static void filemap_get_read_batch(struct address_space *mapping,
pgoff_t index, pgoff_t max, struct pagevec *pvec)
{
XA_STATE(xas, &mapping->i_pages, index);
struct page *head;
rcu_read_lock();
for (head = xas_load(&xas); head; head = xas_next(&xas)) {
if (xas_retry(&xas, head))
continue;
if (xas.xa_index > max || xa_is_value(head))
break;
if (!page_cache_get_speculative(head))
goto retry;
/* Has the page moved or been split? */
if (unlikely(head != xas_reload(&xas)))
goto put_page;
if (!pagevec_add(pvec, head))
break;
if (!PageUptodate(head))
break;
if (PageReadahead(head))
break;
if (PageHead(head)) {
xas_set(&xas, head->index + thp_nr_pages(head));
/* Handle wrap correctly */
if (xas.xa_index - 1 >= max)
break;
}
continue;
put_page:
put_page(head);
retry:
xas_reset(&xas);
}
rcu_read_unlock();
}可以发现先用rcu上锁,然后从基向树中读取index代表的head,接着向后挨个遍历基向树中的page。其中index代表的是页的标号,max是最大的页的标号。读取的过程中如果超过了max则退出循环,否则通过pagevec_add将head对应的页加到pvec中。
从page cache中尝试读取后判断是否读取到了页,即判断pvec->nr是否等于0,如果等于0则触发预读
if (!pagevec_count(pvec)) {
if (iocb->ki_flags & IOCB_NOIO)
return -EAGAIN;
page_cache_sync_readahead(mapping, ra, filp, index,
last_index - index);
filemap_get_read_batch(mapping, index, last_index, pvec);
}
static inline
void page_cache_sync_readahead(struct address_space *mapping,
struct file_ra_state *ra, struct file *file, pgoff_t index,
unsigned long req_count)
{
DEFINE_READAHEAD(ractl, file, ra, mapping, index);
page_cache_sync_ra(&ractl, req_count);
}
void page_cache_sync_ra(struct readahead_control *ractl,
unsigned long req_count)
{
bool do_forced_ra = ractl->file && (ractl->file->f_mode & FMODE_RANDOM);
/*
* Even if read-ahead is disabled, issue this request as read-ahead
* as we'll need it to satisfy the requested range. The forced
* read-ahead will do the right thing and limit the read to just the
* requested range, which we'll set to 1 page for this case.
*/
if (!ractl->ra->ra_pages || blk_cgroup_congested()) {
if (!ractl->file)
return;
req_count = 1;
do_forced_ra = true;
}
/* be dumb */
if (do_forced_ra) {
force_page_cache_ra(ractl, req_count);
return;
}
/* do read-ahead */
ondemand_readahead(ractl, false, req_count);
}
EXPORT_SYMBOL_GPL(page_cache_sync_ra);可以发现最终是调用了page_cache_sync_ra
- 如果最大预读页数为
0,则没有预读 - 如果是随机读,则调用
force_page_cache_readahead() - 否则调用
ondemand_readahead开始预读
ondemand_readahead
static void ondemand_readahead(struct readahead_control *ractl,
bool hit_readahead_marker, unsigned long req_size)
{
struct backing_dev_info *bdi = inode_to_bdi(ractl->mapping->host);
struct file_ra_state *ra = ractl->ra;
unsigned long max_pages = ra->ra_pages;
unsigned long add_pages;
unsigned long index = readahead_index(ractl);
pgoff_t prev_index;
/*
* If the request exceeds the readahead window, allow the read to
* be up to the optimal hardware IO size
*/
if (req_size > max_pages && bdi->io_pages > max_pages)
max_pages = min(req_size, bdi->io_pages);
/*
* start of file
*/
if (!index)
goto initial_readahead;
/*
* It's the expected callback index, assume sequential access.
* Ramp up sizes, and push forward the readahead window.
*/
if ((index == (ra->start + ra->size - ra->async_size) ||
index == (ra->start + ra->size))) {
ra->start += ra->size;
ra->size = get_next_ra_size(ra, max_pages);
ra->async_size = ra->size;
goto readit;
}
/*
* Hit a marked page without valid readahead state.
* E.g. interleaved reads.
* Query the pagecache for async_size, which normally equals to
* readahead size. Ramp it up and use it as the new readahead size.
*/
if (hit_readahead_marker) {
pgoff_t start;
rcu_read_lock();
start = page_cache_next_miss(ractl->mapping, index + 1,
max_pages);
rcu_read_unlock();
if (!start || start - index > max_pages)
return;
ra->start = start;
ra->size = start - index; /* old async_size */
ra->size += req_size;
ra->size = get_next_ra_size(ra, max_pages);
ra->async_size = ra->size;
goto readit;
}
/*
* oversize read
*/
if (req_size > max_pages)
goto initial_readahead;
/*
* sequential cache miss
* trivial case: (index - prev_index) == 1
* unaligned reads: (index - prev_index) == 0
*/
prev_index = (unsigned long long)ra->prev_pos >> PAGE_SHIFT;
if (index - prev_index <= 1UL)
goto initial_readahead;
/*
* Query the page cache and look for the traces(cached history pages)
* that a sequential stream would leave behind.
*/
if (try_context_readahead(ractl->mapping, ra, index, req_size,
max_pages))
goto readit;
/*
* standalone, small random read
* Read as is, and do not pollute the readahead state.
*/
do_page_cache_ra(ractl, req_size, 0);
return;
initial_readahead:
ra->start = index;
ra->size = get_init_ra_size(req_size, max_pages);
ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;
readit:
/*
* Will this read hit the readahead marker made by itself?
* If so, trigger the readahead marker hit now, and merge
* the resulted next readahead window into the current one.
* Take care of maximum IO pages as above.
*/
if (index == ra->start && ra->size == ra->async_size) {
add_pages = get_next_ra_size(ra, max_pages);
if (ra->size + add_pages <= max_pages) {
ra->async_size = add_pages;
ra->size += add_pages;
} else {
ra->size = max_pages;
ra->async_size = max_pages >> 1;
}
}
ractl->_index = ra->start;
do_page_cache_ra(ractl, ra->size, ra->async_size);
}主要逻辑大概是
- 首先判断如果是从头开始读,则认为是顺序读,初始化预读信息。默认设置预读为4个页
- 如果不是从文件头开始读,则判断请求是否是连续请求,如果是,则扩大预读数量,一般等于上次预读*2
- 否则就是随机的读取,不适合预读
- 调用
do_page_cache_ra提交读请求
rdo_page_cache_ra
实际上调用了page_cache_ra_unbounded,先对预读取的页的数量判断了一下,然后调用
void page_cache_ra_unbounded(struct readahead_control *ractl,
unsigned long nr_to_read, unsigned long lookahead_size)
{
struct address_space *mapping = ractl->mapping;
unsigned long index = readahead_index(ractl);
LIST_HEAD(page_pool);
gfp_t gfp_mask = readahead_gfp_mask(mapping);
unsigned long i;
/*
* Partway through the readahead operation, we will have added
* locked pages to the page cache, but will not yet have submitted
* them for I/O. Adding another page may need to allocate memory,
* which can trigger memory reclaim. Telling the VM we're in
* the middle of a filesystem operation will cause it to not
* touch file-backed pages, preventing a deadlock. Most (all?)
* filesystems already specify __GFP_NOFS in their mapping's
* gfp_mask, but let's be explicit here.
*/
unsigned int nofs = memalloc_nofs_save();
filemap_invalidate_lock_shared(mapping);
/*
* Preallocate as many pages as we will need.
*/
for (i = 0; i < nr_to_read; i++) {
struct page *page = xa_load(&mapping->i_pages, index + i);
if (page && !xa_is_value(page)) {
/*
* Page already present? Kick off the current batch
* of contiguous pages before continuing with the
* next batch. This page may be the one we would
* have intended to mark as Readahead, but we don't
* have a stable reference to this page, and it's
* not worth getting one just for that.
*/
read_pages(ractl, &page_pool, true);
i = ractl->_index + ractl->_nr_pages - index - 1;
continue;
}
page = __page_cache_alloc(gfp_mask);
if (!page)
break;
if (mapping->a_ops->readpages) {
page->index = index + i;
list_add(&page->lru, &page_pool);
} else if (add_to_page_cache_lru(page, mapping, index + i,
gfp_mask) < 0) {
put_page(page);
read_pages(ractl, &page_pool, true);
i = ractl->_index + ractl->_nr_pages - index - 1;
continue;
}
if (i == nr_to_read - lookahead_size)
SetPageReadahead(page);
ractl->_nr_pages++;
}
/*
* Now start the IO. We ignore I/O errors - if the page is not
* uptodate then the caller will launch readpage again, and
* will then handle the error.
*/
read_pages(ractl, &page_pool, false);
filemap_invalidate_unlock_shared(mapping);
memalloc_nofs_restore(nofs);
}- 在分配内存前,首先检查页是否已经放进了
page cache,因为别的进程可能将这个页读进来 - 若
cache中没有这个页,分配内存页,读进page pool - 当分配到第
nr_to_read-lookahead_size个页面时,就设置该页面标志PG_readahead,以让下一次进行异步预读 - 页准备好后,调用
read_pages读取文件数据
read_pages
static void read_pages(struct readahead_control *rac, struct list_head *pages,
bool skip_page)
{
const struct address_space_operations *aops = rac->mapping->a_ops;
struct page *page;
struct blk_plug plug;
if (!readahead_count(rac))
goto out;
blk_start_plug(&plug);
if (aops->readahead) {
aops->readahead(rac);
/* Clean up the remaining pages */
while ((page = readahead_page(rac))) {
unlock_page(page);
put_page(page);
}
} else if (aops->readpages) {
aops->readpages(rac->file, rac->mapping, pages,
readahead_count(rac));
/* Clean up the remaining pages */
put_pages_list(pages);
rac->_index += rac->_nr_pages;
rac->_nr_pages = 0;
} else {
while ((page = readahead_page(rac))) {
aops->readpage(rac->file, page);
put_page(page);
}
}
blk_finish_plug(&plug);
BUG_ON(!list_empty(pages));
BUG_ON(readahead_count(rac));
out:
if (skip_page)
rac->_index++;
}核心点在于会调用指定文件系统的readpages函数,这里是ext4函数,readpages是属于inode的功能
static int ext4_readpage(struct file *file, struct page *page)
{
int ret = -EAGAIN;
struct inode *inode = page->mapping->host;
trace_ext4_readpage(page);
if (ext4_has_inline_data(inode))
ret = ext4_readpage_inline(inode, page);
if (ret == -EAGAIN)
return ext4_mpage_readpages(inode, NULL, page);
return ret;
}首先判断是否有内联优化,即数据是否直接存在inode上,是则调用inline读取,否则调用ext4_mpage_readpages读取。
大体流程是根据要读取文件的逻辑地址,转换得到对应的磁盘物理地址,文件数据就保存在磁盘物理地址,最后调用submit_bio根据这些磁盘地址读取文件数据到page文件页。
int ext4_mpage_readpages(struct inode *inode,
struct readahead_control *rac, struct page *page)
{
struct bio *bio = NULL;
sector_t last_block_in_bio = 0;
const unsigned blkbits = inode->i_blkbits;
const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
const unsigned blocksize = 1 << blkbits;
sector_t next_block;
sector_t block_in_file;
sector_t last_block;
sector_t last_block_in_file;
sector_t blocks[MAX_BUF_PER_PAGE];
unsigned page_block;
struct block_device *bdev = inode->i_sb->s_bdev;
int length;
unsigned relative_block = 0;
struct ext4_map_blocks map;
unsigned int nr_pages = rac ? readahead_count(rac) : 1;
map.m_pblk = 0;
map.m_lblk = 0;
map.m_len = 0;
map.m_flags = 0;
for (; nr_pages; nr_pages--) {
int fully_mapped = 1;
unsigned first_hole = blocks_per_page;
if (rac) {
page = readahead_page(rac);
prefetchw(&page->flags);
}
if (page_has_buffers(page))
goto confused;
block_in_file = next_block =
(sector_t)page->index << (PAGE_SHIFT - blkbits);
last_block = block_in_file + nr_pages * blocks_per_page;
last_block_in_file = (ext4_readpage_limit(inode) +
blocksize - 1) >> blkbits;
if (last_block > last_block_in_file)
last_block = last_block_in_file;
page_block = 0;
/*
* Map blocks using the previous result first.
*/
if ((map.m_flags & EXT4_MAP_MAPPED) &&
block_in_file > map.m_lblk &&
block_in_file < (map.m_lblk + map.m_len)) {
unsigned map_offset = block_in_file - map.m_lblk;
unsigned last = map.m_len - map_offset;
for (relative_block = 0; ; relative_block++) {
if (relative_block == last) {
/* needed? */
map.m_flags &= ~EXT4_MAP_MAPPED;
break;
}
if (page_block == blocks_per_page)
break;
blocks[page_block] = map.m_pblk + map_offset +
relative_block;
page_block++;
block_in_file++;
}
}
/*
* Then do more ext4_map_blocks() calls until we are
* done with this page.
*/
while (page_block < blocks_per_page) {
if (block_in_file < last_block) {
map.m_lblk = block_in_file;
map.m_len = last_block - block_in_file;
if (ext4_map_blocks(NULL, inode, &map, 0) < 0) {
set_error_page:
SetPageError(page);
zero_user_segment(page, 0,
PAGE_SIZE);
unlock_page(page);
goto next_page;
}
}
if ((map.m_flags & EXT4_MAP_MAPPED) == 0) {
fully_mapped = 0;
if (first_hole == blocks_per_page)
first_hole = page_block;
page_block++;
block_in_file++;
continue;
}
if (first_hole != blocks_per_page)
goto confused; /* hole -> non-hole */
/* Contiguous blocks? */
if (page_block && blocks[page_block-1] != map.m_pblk-1)
goto confused;
for (relative_block = 0; ; relative_block++) {
if (relative_block == map.m_len) {
/* needed? */
map.m_flags &= ~EXT4_MAP_MAPPED;
break;
} else if (page_block == blocks_per_page)
break;
blocks[page_block] = map.m_pblk+relative_block;
page_block++;
block_in_file++;
}
}
if (first_hole != blocks_per_page) {
zero_user_segment(page, first_hole << blkbits,
PAGE_SIZE);
if (first_hole == 0) {
if (ext4_need_verity(inode, page->index) &&
!fsverity_verify_page(page))
goto set_error_page;
SetPageUptodate(page);
unlock_page(page);
goto next_page;
}
} else if (fully_mapped) {
SetPageMappedToDisk(page);
}
if (fully_mapped && blocks_per_page == 1 &&
!PageUptodate(page) && cleancache_get_page(page) == 0) {
SetPageUptodate(page);
goto confused;
}
/*
* This page will go to BIO. Do we need to send this
* BIO off first?
*/
if (bio && (last_block_in_bio != blocks[0] - 1 ||
!fscrypt_mergeable_bio(bio, inode, next_block))) {
submit_and_realloc:
submit_bio(bio);
bio = NULL;
}
if (bio == NULL) {
/*
* bio_alloc will _always_ be able to allocate a bio if
* __GFP_DIRECT_RECLAIM is set, see bio_alloc_bioset().
*/
bio = bio_alloc(GFP_KERNEL, bio_max_segs(nr_pages));
fscrypt_set_bio_crypt_ctx(bio, inode, next_block,
GFP_KERNEL);
ext4_set_bio_post_read_ctx(bio, inode, page->index);
bio_set_dev(bio, bdev);
bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
bio->bi_end_io = mpage_end_io;
bio_set_op_attrs(bio, REQ_OP_READ,
rac ? REQ_RAHEAD : 0);
}
length = first_hole << blkbits;
if (bio_add_page(bio, page, length, 0) < length)
goto submit_and_realloc;
if (((map.m_flags & EXT4_MAP_BOUNDARY) &&
(relative_block == map.m_len)) ||
(first_hole != blocks_per_page)) {
submit_bio(bio);
bio = NULL;
} else
last_block_in_bio = blocks[blocks_per_page - 1];
goto next_page;
confused:
if (bio) {
submit_bio(bio);
bio = NULL;
}
if (!PageUptodate(page))
block_read_full_page(page, ext4_get_block);
else
unlock_page(page);
next_page:
if (rac)
put_page(page);
}
if (bio)
submit_bio(bio);
return 0;
}构造请求,向块设备提交bio请求读取数据。
首先设置内存的预取,然后计算读取块的范围,逻辑映射到对应的块上。注意复用之前的块结果,避免重复调用ext4_map_blocks。
然后将连续的block作为请求,要处理页面存在空洞的情况
- 页面部分是空洞:将空洞部分填充为0
- 页面完全是空洞:直接标记页面为最新并解锁
- 页面完全映射:设置页面已映射到磁盘标志
最后提交bio请求。
struct bio {
struct bio *bi_next; /* request queue link */
struct block_device *bi_bdev;
unsigned int bi_opf; /* bottom bits REQ_OP, top bits
* req_flags.
*/
unsigned short bi_flags; /* BIO_* below */
unsigned short bi_ioprio;
unsigned short bi_write_hint;
blk_status_t bi_status;
atomic_t __bi_remaining;
struct bvec_iter bi_iter;
bio_end_io_t *bi_end_io;
void *bi_private;
#ifdef CONFIG_BLK_CGROUP
/*
* Represents the association of the css and request_queue for the bio.
* If a bio goes direct to device, it will not have a blkg as it will
* not have a request_queue associated with it. The reference is put
* on release of the bio.
*/
struct blkcg_gq *bi_blkg;
struct bio_issue bi_issue;
#ifdef CONFIG_BLK_CGROUP_IOCOST
u64 bi_iocost_cost;
#endif
#endif
#ifdef CONFIG_BLK_INLINE_ENCRYPTION
struct bio_crypt_ctx *bi_crypt_context;
#endif
union {
#if defined(CONFIG_BLK_DEV_INTEGRITY)
struct bio_integrity_payload *bi_integrity; /* data integrity */
#endif
};
unsigned short bi_vcnt; /* how many bio_vec's */
/*
* Everything starting with bi_max_vecs will be preserved by bio_reset()
*/
unsigned short bi_max_vecs; /* max bvl_vecs we can hold */
atomic_t __bi_cnt; /* pin count */
struct bio_vec *bi_io_vec; /* the actual vec list */
struct bio_set *bi_pool;
/*
* We can inline a number of vecs at the end of the bio, to avoid
* double allocations for a small number of bio_vecs. This member
* MUST obviously be kept at the very end of the bio.
*/
struct bio_vec bi_inline_vecs[];
};submit_bio
再往后的流程就暂时不解析了,对于上层存储应用过于底层,无法优化。
blk_qc_t submit_bio(struct bio *bio)
{
if (blkcg_punt_bio_submit(bio))
return BLK_QC_T_NONE;
/*
* If it's a regular read/write or a barrier with data attached,
* go through the normal accounting stuff before submission.
*/
if (bio_has_data(bio)) {
unsigned int count;
if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME))
count = queue_logical_block_size(
bio->bi_bdev->bd_disk->queue) >> 9;
else
count = bio_sectors(bio);
if (op_is_write(bio_op(bio))) {
count_vm_events(PGPGOUT, count);
} else {
task_io_account_read(bio->bi_iter.bi_size);
count_vm_events(PGPGIN, count);
}
}
/*
* If we're reading data that is part of the userspace workingset, count
* submission time as memory stall. When the device is congested, or
* the submitting cgroup IO-throttled, submission can be a significant
* part of overall IO time.
*/
if (unlikely(bio_op(bio) == REQ_OP_READ &&
bio_flagged(bio, BIO_WORKINGSET))) {
unsigned long pflags;
blk_qc_t ret;
psi_memstall_enter(&pflags);
ret = submit_bio_noacct(bio);
psi_memstall_leave(&pflags);
return ret;
}
return submit_bio_noacct(bio);
}