As part of studying the I/O stack, this note analyzes the code path of pread.

Flow

System call

User space triggers a system call with the syscall instruction; the CPU switches to kernel mode and jumps to the entry function entry_SYSCALL_64.

entry_SYSCALL_64 calls do_syscall_64 from assembly to dispatch the system call; inside it there is roughly this code:

if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) {
	/* Invalid system call, but still a system call. */
	regs->ax = __x64_sys_ni_syscall(regs);
}

So the dispatch is attempted here: if both the x64 and the x32 paths fail and nr is not -1, the function's return value, i.e. the value stored in register %rax, is set to __x64_sys_ni_syscall(regs).

Then look at the x64 dispatch, which is roughly as follows:

static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
{
	/*
	 * Convert negative numbers to very high and thus out of range
	 * numbers for comparisons.
	 */
	unsigned int unr = nr;
 
	if (likely(unr < NR_syscalls)) {
		unr = array_index_nospec(unr, NR_syscalls);
		regs->ax = sys_call_table[unr](regs);
		return true;
	}
	return false;
}

Here nr indexes sys_call_table to find the corresponding system call handler, which for pread64 eventually reaches the ksys_pread64 discussed below, and the return value is placed in %rax.
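
For pread64, the table entry is the SYSCALL_DEFINE4 wrapper generated in fs/read_write.c, which (roughly, modulo version differences) just forwards to ksys_pread64:

SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
			size_t, count, loff_t, pos)
{
	return ksys_pread64(fd, buf, count, pos);
}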

ksys_pread64

The first step inside the system call proper is the ksys_pread64 function:

ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count,
		     loff_t pos)
{
	struct fd f;
	ssize_t ret = -EBADF;
 
	if (pos < 0)
		return -EINVAL;
 
	f = fdget(fd);
	if (f.file) {
		ret = -ESPIPE;
		if (f.file->f_mode & FMODE_PREAD)
			ret = vfs_read(f.file, buf, count, &pos);
		fdput(f);
	}
 
	return ret;
}

First, struct fd: fdget() resolves the user-space file descriptor number into this small kernel-side wrapper:

struct fd {
	struct file *file;
	unsigned int flags;
};

It wraps a struct file, which holds all the state of an open file, as shown below:

struct file {
    union {
        struct llist_node    fu_llist;
        struct rcu_head      fu_rcuhead;
    } f_u;
    struct path        f_path;
    struct inode       *f_inode;    /* cached inode pointer */
    const struct file_operations    *f_op;
 
    spinlock_t         f_lock;      /* protects f_ep and f_flags */
    enum rw_hint       f_write_hint;
    atomic_long_t      f_count;     /* reference count */
    unsigned int       f_flags;     /* open flags, e.g. O_RDONLY */
    fmode_t            f_mode;      /* file access mode */
    struct mutex       f_pos_lock;  /* file position lock */
    loff_t             f_pos;       /* current file position */
    struct fown_struct f_owner;     /* file ownership info */
    const struct cred  *f_cred;     /* credentials */
    struct file_ra_state f_ra;      /* readahead state */
 
    u64                f_version;
    void               *private_data;/* private data, e.g. used by tty drivers */
    struct address_space *f_mapping; /* page cache mapping */
    errseq_t           f_wb_err;    /* writeback error sequence */
    errseq_t           f_sb_err;    /* superblock error sequence */
};

The return value ret starts as -EBADF (bad file descriptor). If pos is negative, -EINVAL is returned immediately (invalid offset). fdget(fd) then resolves the descriptor into f, the open-file state; fdget takes a reference on the file, which fdput drops when the operation is done.

If f.file is non-NULL, the lookup succeeded. ret is first set to -ESPIPE (illegal seek, i.e. positional reads not supported); then, if the file's mode has FMODE_PREAD set, the VFS interface vfs_read is called to read the file contents and update ret. Finally fdput drops the file reference.

Finally the read result ret is returned.
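
For reference, a minimal user-space caller of the path analyzed here (the file name is just an example):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[64];
	int fd = open("/etc/hostname", O_RDONLY);

	if (fd < 0)
		return 1;
	/* pread(2) reads at an explicit offset and, unlike read(2), does not
	 * move the file position; it enters the kernel through the pread64
	 * syscall and reaches ksys_pread64 above. */
	ssize_t n = pread(fd, buf, sizeof(buf) - 1, 0);
	if (n > 0) {
		buf[n] = '\0';
		printf("%s", buf);
	}
	close(fd);
	return 0;
}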

vfs_read

ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
	ssize_t ret;
 
	if (!(file->f_mode & FMODE_READ))
		return -EBADF;
	if (!(file->f_mode & FMODE_CAN_READ))
		return -EINVAL;
	if (unlikely(!access_ok(buf, count)))
		return -EFAULT;
 
	ret = rw_verify_area(READ, file, pos, count);
	if (ret)
		return ret;
	if (count > MAX_RW_COUNT)
		count =  MAX_RW_COUNT;
 
	if (file->f_op->read)
		ret = file->f_op->read(file, buf, count, pos);
	else if (file->f_op->read_iter)
		ret = new_sync_read(file, buf, count, pos);
	else
		ret = -EINVAL;
	if (ret > 0) {
		fsnotify_access(file);
		add_rchar(current, ret);
	}
	inc_syscr(current);
	return ret;
}

First come the permission checks:

  • Read permission: FMODE_READ means the file was opened for reading.
  • Capability check: FMODE_CAN_READ means the file actually has a usable read method.
  • Memory safety: access_ok() validates the user-space buffer range.

Then come the overflow and security checks (rw_verify_area):

  • Based on count and pos, it checks whether pos + count would overflow or otherwise go out of range.
  • It fetches the file's security identifier (sid) and, if it differs from the one recorded when the file was opened, re-runs the permission check.

Next, the request size is capped at MAX_RW_COUNT, roughly 2 GB (INT_MAX rounded down to a page boundary); anything larger is truncated to that limit.
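
MAX_RW_COUNT is defined in include/linux/fs.h as:

#define MAX_RW_COUNT (INT_MAX & PAGE_MASK)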

After that, the file's ->read method is tried first; if it is absent but ->read_iter exists, new_sync_read is called instead.

new_sync_read

static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct kiocb kiocb;
	struct iov_iter iter;
	ssize_t ret;
 
	init_sync_kiocb(&kiocb, filp);
	kiocb.ki_pos = (ppos ? *ppos : 0);
	iov_iter_init(&iter, READ, &iov, 1, len);
 
	ret = call_read_iter(filp, &kiocb, &iter);
	BUG_ON(ret == -EIOCBQUEUED);
	if (ppos)
		*ppos = kiocb.ki_pos;
	return ret;
}

An I/O vector (struct iovec) is created to wrap the user buffer; this is the standard way Linux describes I/O:

  • iov_base: pointer to the destination buffer
  • iov_len: length of the buffer

A kiocb (Kernel I/O Control Block) is the kernel's representation of an I/O operation: init_sync_kiocb initializes it for synchronous operation, and the read position (ki_pos) is set from the supplied position pointer if one was given, otherwise 0.

The initialization looks like this:

static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp)
{
	*kiocb = (struct kiocb) {
		.ki_filp = filp,
		.ki_flags = iocb_flags(filp),
		.ki_hint = ki_hint_validate(file_write_hint(filp)),
		.ki_ioprio = get_current_ioprio(),
	};
}

The fields mean:

  • ki_filp: the file object this I/O operation is associated with
  • ki_flags: I/O control flags derived from the file's properties, possibly including direct I/O, sync I/O, append, etc.
  • ki_hint: a hint about the file access pattern (e.g. sequential vs. random)
  • ki_ioprio: the current task's I/O priority, which affects how the scheduler treats this request

Then the iov_iter is initialized:

void iov_iter_init(struct iov_iter *i, unsigned int direction,
            const struct iovec *iov, unsigned long nr_segs,
            size_t count)
{
    WARN_ON(direction & ~(READ | WRITE));
    *i = (struct iov_iter) {
        .iter_type = ITER_IOVEC,
        .nofault = false,
        .data_source = direction,
        .iov = iov,
        .nr_segs = nr_segs,
        .iov_offset = 0,
        .count = count
    };
}

Next comes the call that actually starts the synchronous read.

call_read_iter

call_read_iter itself is just a thin wrapper that invokes file->f_op->read_iter; for an ext4 file this is ext4_file_read_iter.
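
The wrapper is a one-liner (roughly as in include/linux/fs.h):

static inline ssize_t call_read_iter(struct file *file, struct kiocb *kio,
				     struct iov_iter *iter)
{
	return file->f_op->read_iter(kio, iter);
}

For ext4, f_op->read_iter is ext4_file_read_iter: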

static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = file_inode(iocb->ki_filp);
 
	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
		return -EIO;
 
	if (!iov_iter_count(to))
		return 0; /* skip atime */
 
#ifdef CONFIG_FS_DAX
	if (IS_DAX(inode))
		return ext4_dax_read_iter(iocb, to);
#endif
	if (iocb->ki_flags & IOCB_DIRECT)
		return ext4_dio_read_iter(iocb, to);
 
	return generic_file_read_iter(iocb, to);
}

Then comes generic_file_read_iter:

ssize_t
generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
	size_t count = iov_iter_count(iter);
	ssize_t retval = 0;
 
	if (!count)
		return 0; /* skip atime */
 
	if (iocb->ki_flags & IOCB_DIRECT) {
		struct file *file = iocb->ki_filp;
		struct address_space *mapping = file->f_mapping;
		struct inode *inode = mapping->host;
		loff_t size;
 
		size = i_size_read(inode);
		if (iocb->ki_flags & IOCB_NOWAIT) {
			if (filemap_range_needs_writeback(mapping, iocb->ki_pos,
						iocb->ki_pos + count - 1))
				return -EAGAIN;
		} else {
			retval = filemap_write_and_wait_range(mapping,
						iocb->ki_pos,
					        iocb->ki_pos + count - 1);
			if (retval < 0)
				return retval;
		}
 
		file_accessed(file);
 
		retval = mapping->a_ops->direct_IO(iocb, iter);
		if (retval >= 0) {
			iocb->ki_pos += retval;
			count -= retval;
		}
		if (retval != -EIOCBQUEUED)
			iov_iter_revert(iter, count - iov_iter_count(iter));
 
		/*
		 * Btrfs can have a short DIO read if we encounter
		 * compressed extents, so if there was an error, or if
		 * we've already read everything we wanted to, or if
		 * there was a short read because we hit EOF, go ahead
		 * and return.  Otherwise fallthrough to buffered io for
		 * the rest of the read.  Buffered reads will not work for
		 * DAX files, so don't bother trying.
		 */
		if (retval < 0 || !count || iocb->ki_pos >= size ||
		    IS_DAX(inode))
			return retval;
	}
 
	return filemap_read(iocb, iter, retval);
}
EXPORT_SYMBOL(generic_file_read_iter);

Everything in generic_file_read_iter before the filemap_read call handles direct I/O, which bypasses the page cache; here we focus on what filemap_read actually does.

filemap_read

filemap_read is the core routine for reading through the page cache (folio-based in newer kernels; the code quoted here still operates on struct page and pagevec).

  • filemap_read copies data from the file's page cache to user space. If the data is not present in the page cache, it triggers readahead or reads the missing pages in.

It then calls filemap_get_pages:

static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter,
		struct pagevec *pvec)
{
	struct file *filp = iocb->ki_filp;
	struct address_space *mapping = filp->f_mapping;
	struct file_ra_state *ra = &filp->f_ra;
	pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
	pgoff_t last_index;
	struct page *page;
	int err = 0;
 
	last_index = DIV_ROUND_UP(iocb->ki_pos + iter->count, PAGE_SIZE);
retry:
	if (fatal_signal_pending(current))
		return -EINTR;
 
	filemap_get_read_batch(mapping, index, last_index, pvec);
  ...
}

The first thing it does is call filemap_get_read_batch, which tries to fetch pages from the page cache. Its arguments are:

  • mapping: the file's address_space object, which tracks the file's presence in the page cache.
  • index: the starting page index, i.e. the first page to fetch.
  • max: the maximum page index, the upper bound of the batch.
  • pvec: the pagevec that receives references to the fetched pages.

max (i.e. last_index) is computed as:

last_index = DIV_ROUND_UP(iocb->ki_pos + iter->count, PAGE_SIZE);

i.e. ceil((current position + requested byte count) / PAGE_SIZE).
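
A quick worked example (assuming PAGE_SIZE = 4096): for ki_pos = 5000 and a 10000-byte request, index = 5000 >> PAGE_SHIFT = 1 and last_index = DIV_ROUND_UP(15000, 4096) = 4, so the request spans file pages 1 through 3.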

/*
 * filemap_get_read_batch - Get a batch of pages for read
 *
 * Get a batch of pages which represent a contiguous range of bytes
 * in the file.  No tail pages will be returned.  If @index is in the
 * middle of a THP, the entire THP will be returned.  The last page in
 * the batch may have Readahead set or be not Uptodate so that the
 * caller can take the appropriate action.
 */
static void filemap_get_read_batch(struct address_space *mapping,
		pgoff_t index, pgoff_t max, struct pagevec *pvec)
{
	XA_STATE(xas, &mapping->i_pages, index);
	struct page *head;
 
	rcu_read_lock();
	for (head = xas_load(&xas); head; head = xas_next(&xas)) {
		if (xas_retry(&xas, head))
			continue;
		if (xas.xa_index > max || xa_is_value(head))
			break;
		if (!page_cache_get_speculative(head))
			goto retry;
 
		/* Has the page moved or been split? */
		if (unlikely(head != xas_reload(&xas)))
			goto put_page;
 
		if (!pagevec_add(pvec, head))
			break;
		if (!PageUptodate(head))
			break;
		if (PageReadahead(head))
			break;
		if (PageHead(head)) {
			xas_set(&xas, head->index + thp_nr_pages(head));
			/* Handle wrap correctly */
			if (xas.xa_index - 1 >= max)
				break;
		}
		continue;
put_page:
		put_page(head);
retry:
		xas_reset(&xas);
	}
	rcu_read_unlock();
}

So it takes the RCU read lock, loads the head page at index from the xarray (the page-cache radix tree), and then walks the tree page by page. Here index is the page index within the file and max is the largest page index to fetch. The loop exits once it walks past max; otherwise each head page is added to pvec via pagevec_add. It also stops early on a page that is not Uptodate or that has the Readahead flag set, so the caller can react appropriately.

After this attempt on the page cache, the code checks whether any page was actually found, i.e. whether pvec->nr is 0; if it is, synchronous readahead is triggered:

if (!pagevec_count(pvec)) {
	if (iocb->ki_flags & IOCB_NOIO)
		return -EAGAIN;
	page_cache_sync_readahead(mapping, ra, filp, index,
		last_index - index);
	filemap_get_read_batch(mapping, index, last_index, pvec);
}
 
static inline
void page_cache_sync_readahead(struct address_space *mapping,
		struct file_ra_state *ra, struct file *file, pgoff_t index,
		unsigned long req_count)
{
	DEFINE_READAHEAD(ractl, file, ra, mapping, index);
	page_cache_sync_ra(&ractl, req_count);
}
 
void page_cache_sync_ra(struct readahead_control *ractl,
		unsigned long req_count)
{
	bool do_forced_ra = ractl->file && (ractl->file->f_mode & FMODE_RANDOM);
 
	/*
	 * Even if read-ahead is disabled, issue this request as read-ahead
	 * as we'll need it to satisfy the requested range. The forced
	 * read-ahead will do the right thing and limit the read to just the
	 * requested range, which we'll set to 1 page for this case.
	 */
	if (!ractl->ra->ra_pages || blk_cgroup_congested()) {
		if (!ractl->file)
			return;
		req_count = 1;
		do_forced_ra = true;
	}
 
	/* be dumb */
	if (do_forced_ra) {
		force_page_cache_ra(ractl, req_count);
		return;
	}
 
	/* do read-ahead */
	ondemand_readahead(ractl, false, req_count);
}
EXPORT_SYMBOL_GPL(page_cache_sync_ra);

So ultimately page_cache_sync_ra is called:

  • If the maximum readahead size (ra_pages) is 0, or the blk cgroup is congested, the request is shrunk to a single page and issued as forced readahead (or skipped entirely if there is no file).
  • If the file is marked for random access (FMODE_RANDOM), force_page_cache_ra() is called to read just the requested range (see the user-space example after this list).
  • Otherwise ondemand_readahead performs the actual readahead.
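
For completeness, a user-space way to land on the forced-readahead path: posix_fadvise(POSIX_FADV_RANDOM) sets FMODE_RANDOM on the struct file, which is exactly the bit page_cache_sync_ra tests. A minimal sketch (open_random_access is a hypothetical helper name):

#include <fcntl.h>
#include <unistd.h>

/* Open a file and hint the kernel that access will be random,
 * so readahead is limited to the requested range. */
int open_random_access(const char *path)
{
	int fd = open(path, O_RDONLY);

	if (fd >= 0)
		posix_fadvise(fd, 0, 0, POSIX_FADV_RANDOM);
	return fd;
}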

ondemand_readahead

static void ondemand_readahead(struct readahead_control *ractl,
		bool hit_readahead_marker, unsigned long req_size)
{
	struct backing_dev_info *bdi = inode_to_bdi(ractl->mapping->host);
	struct file_ra_state *ra = ractl->ra;
	unsigned long max_pages = ra->ra_pages;
	unsigned long add_pages;
	unsigned long index = readahead_index(ractl);
	pgoff_t prev_index;
 
	/*
	 * If the request exceeds the readahead window, allow the read to
	 * be up to the optimal hardware IO size
	 */
	if (req_size > max_pages && bdi->io_pages > max_pages)
		max_pages = min(req_size, bdi->io_pages);
 
	/*
	 * start of file
	 */
	if (!index)
		goto initial_readahead;
 
	/*
	 * It's the expected callback index, assume sequential access.
	 * Ramp up sizes, and push forward the readahead window.
	 */
	if ((index == (ra->start + ra->size - ra->async_size) ||
	     index == (ra->start + ra->size))) {
		ra->start += ra->size;
		ra->size = get_next_ra_size(ra, max_pages);
		ra->async_size = ra->size;
		goto readit;
	}
 
	/*
	 * Hit a marked page without valid readahead state.
	 * E.g. interleaved reads.
	 * Query the pagecache for async_size, which normally equals to
	 * readahead size. Ramp it up and use it as the new readahead size.
	 */
	if (hit_readahead_marker) {
		pgoff_t start;
 
		rcu_read_lock();
		start = page_cache_next_miss(ractl->mapping, index + 1,
				max_pages);
		rcu_read_unlock();
 
		if (!start || start - index > max_pages)
			return;
 
		ra->start = start;
		ra->size = start - index;	/* old async_size */
		ra->size += req_size;
		ra->size = get_next_ra_size(ra, max_pages);
		ra->async_size = ra->size;
		goto readit;
	}
 
	/*
	 * oversize read
	 */
	if (req_size > max_pages)
		goto initial_readahead;
 
	/*
	 * sequential cache miss
	 * trivial case: (index - prev_index) == 1
	 * unaligned reads: (index - prev_index) == 0
	 */
	prev_index = (unsigned long long)ra->prev_pos >> PAGE_SHIFT;
	if (index - prev_index <= 1UL)
		goto initial_readahead;
 
	/*
	 * Query the page cache and look for the traces(cached history pages)
	 * that a sequential stream would leave behind.
	 */
	if (try_context_readahead(ractl->mapping, ra, index, req_size,
			max_pages))
		goto readit;
 
	/*
	 * standalone, small random read
	 * Read as is, and do not pollute the readahead state.
	 */
	do_page_cache_ra(ractl, req_size, 0);
	return;
 
initial_readahead:
	ra->start = index;
	ra->size = get_init_ra_size(req_size, max_pages);
	ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;
 
readit:
	/*
	 * Will this read hit the readahead marker made by itself?
	 * If so, trigger the readahead marker hit now, and merge
	 * the resulted next readahead window into the current one.
	 * Take care of maximum IO pages as above.
	 */
	if (index == ra->start && ra->size == ra->async_size) {
		add_pages = get_next_ra_size(ra, max_pages);
		if (ra->size + add_pages <= max_pages) {
			ra->async_size = add_pages;
			ra->size += add_pages;
		} else {
			ra->size = max_pages;
			ra->async_size = max_pages >> 1;
		}
	}
 
	ractl->_index = ra->start;
	do_page_cache_ra(ractl, ra->size, ra->async_size);
}

The main logic is roughly:

  • If the read starts at the beginning of the file, it is treated as sequential and the readahead state is initialized; the initial window is small, typically 4 pages.
  • If it does not start at the file head, check whether the request continues the previous readahead window; if so, the window is pushed forward and enlarged, typically to twice the previous size (see the helper sketches after this list).
  • Otherwise the access is treated as random and not worth building readahead state for; the request is read as-is.
  • Finally do_page_cache_ra is called to submit the read request.
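
The window-sizing helpers used above look roughly like this in mm/readahead.c of the same era (exact thresholds may differ between versions):

static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
{
	unsigned long newsize = roundup_pow_of_two(size);

	if (newsize <= max / 32)
		newsize = newsize * 4;
	else if (newsize <= max / 4)
		newsize = newsize * 2;
	else
		newsize = max;

	return newsize;
}

static unsigned long get_next_ra_size(struct file_ra_state *ra,
				      unsigned long max)
{
	unsigned long cur = ra->size;

	if (cur < max / 16)
		return 4 * cur;
	if (cur <= max / 2)
		return 2 * cur;
	return max;
}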

do_page_cache_ra

In effect this calls page_cache_ra_unbounded: do_page_cache_ra first clamps the number of pages to read against the file size, then hands off.
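
The clamp looks roughly like this (paraphrased from mm/readahead.c of the same era):

void do_page_cache_ra(struct readahead_control *ractl,
		unsigned long nr_to_read, unsigned long lookahead_size)
{
	struct inode *inode = ractl->mapping->host;
	unsigned long index = readahead_index(ractl);
	loff_t isize = i_size_read(inode);
	pgoff_t end_index;	/* the last page we want to read */

	if (isize == 0)
		return;

	end_index = (isize - 1) >> PAGE_SHIFT;
	if (index > end_index)
		return;
	/* Don't read past the page containing the last byte of the file */
	if (nr_to_read > end_index - index + 1)
		nr_to_read = end_index - index + 1;

	page_cache_ra_unbounded(ractl, nr_to_read, lookahead_size);
}

It then hands off to page_cache_ra_unbounded: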

void page_cache_ra_unbounded(struct readahead_control *ractl,
		unsigned long nr_to_read, unsigned long lookahead_size)
{
	struct address_space *mapping = ractl->mapping;
	unsigned long index = readahead_index(ractl);
	LIST_HEAD(page_pool);
	gfp_t gfp_mask = readahead_gfp_mask(mapping);
	unsigned long i;
 
	/*
	 * Partway through the readahead operation, we will have added
	 * locked pages to the page cache, but will not yet have submitted
	 * them for I/O.  Adding another page may need to allocate memory,
	 * which can trigger memory reclaim.  Telling the VM we're in
	 * the middle of a filesystem operation will cause it to not
	 * touch file-backed pages, preventing a deadlock.  Most (all?)
	 * filesystems already specify __GFP_NOFS in their mapping's
	 * gfp_mask, but let's be explicit here.
	 */
	unsigned int nofs = memalloc_nofs_save();
 
	filemap_invalidate_lock_shared(mapping);
	/*
	 * Preallocate as many pages as we will need.
	 */
	for (i = 0; i < nr_to_read; i++) {
		struct page *page = xa_load(&mapping->i_pages, index + i);
 
		if (page && !xa_is_value(page)) {
			/*
			 * Page already present?  Kick off the current batch
			 * of contiguous pages before continuing with the
			 * next batch.  This page may be the one we would
			 * have intended to mark as Readahead, but we don't
			 * have a stable reference to this page, and it's
			 * not worth getting one just for that.
			 */
			read_pages(ractl, &page_pool, true);
			i = ractl->_index + ractl->_nr_pages - index - 1;
			continue;
		}
 
		page = __page_cache_alloc(gfp_mask);
		if (!page)
			break;
		if (mapping->a_ops->readpages) {
			page->index = index + i;
			list_add(&page->lru, &page_pool);
		} else if (add_to_page_cache_lru(page, mapping, index + i,
					gfp_mask) < 0) {
			put_page(page);
			read_pages(ractl, &page_pool, true);
			i = ractl->_index + ractl->_nr_pages - index - 1;
			continue;
		}
		if (i == nr_to_read - lookahead_size)
			SetPageReadahead(page);
		ractl->_nr_pages++;
	}
 
	/*
	 * Now start the IO.  We ignore I/O errors - if the page is not
	 * uptodate then the caller will launch readpage again, and
	 * will then handle the error.
	 */
	read_pages(ractl, &page_pool, false);
	filemap_invalidate_unlock_shared(mapping);
	memalloc_nofs_restore(nofs);
}

  • Before allocating memory, it first checks whether the page is already in the page cache, since another process may have read it in; if so, the current batch is kicked off with read_pages and the loop continues with the next batch.
  • If the page is not in the cache, a memory page is allocated and added to the page pool (or directly to the page-cache LRU when ->readpages is not provided).
  • When the (nr_to_read - lookahead_size)-th page is reached, its PG_readahead flag is set so that a later read hitting it triggers asynchronous readahead.
  • Once the pages are prepared, read_pages is called to read the file data.

read_pages

static void read_pages(struct readahead_control *rac, struct list_head *pages,
		bool skip_page)
{
	const struct address_space_operations *aops = rac->mapping->a_ops;
	struct page *page;
	struct blk_plug plug;
 
	if (!readahead_count(rac))
		goto out;
 
	blk_start_plug(&plug);
 
	if (aops->readahead) {
		aops->readahead(rac);
		/* Clean up the remaining pages */
		while ((page = readahead_page(rac))) {
			unlock_page(page);
			put_page(page);
		}
	} else if (aops->readpages) {
		aops->readpages(rac->file, rac->mapping, pages,
				readahead_count(rac));
		/* Clean up the remaining pages */
		put_pages_list(pages);
		rac->_index += rac->_nr_pages;
		rac->_nr_pages = 0;
	} else {
		while ((page = readahead_page(rac))) {
			aops->readpage(rac->file, page);
			put_page(page);
		}
	}
 
	blk_finish_plug(&plug);
 
	BUG_ON(!list_empty(pages));
	BUG_ON(readahead_count(rac));
 
out:
	if (skip_page)
		rac->_index++;
}

The key point is that read_pages drives the read through the filesystem's address_space operations (a_ops, attached to the inode's mapping): it prefers ->readahead, falls back to ->readpages, and otherwise calls ->readpage one page at a time. For ext4 all of these end up in ext4_mpage_readpages.
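
For ext4 the ->readahead entry is ext4_readahead, which (roughly as in fs/ext4/inode.c) just forwards to ext4_mpage_readpages unless the data is inline:

static void ext4_readahead(struct readahead_control *rac)
{
	struct inode *inode = rac->mapping->host;

	/* If the file has inline data, no need to do readahead. */
	if (ext4_has_inline_data(inode))
		return;

	ext4_mpage_readpages(inode, rac, NULL);
}

The single-page ->readpage entry, ext4_readpage, takes the same route: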

static int ext4_readpage(struct file *file, struct page *page)
{
	int ret = -EAGAIN;
	struct inode *inode = page->mapping->host;
 
	trace_ext4_readpage(page);
 
	if (ext4_has_inline_data(inode))
		ret = ext4_readpage_inline(inode, page);
 
	if (ret == -EAGAIN)
		return ext4_mpage_readpages(inode, NULL, page);
 
	return ret;
}

It first checks for inline data, i.e. data stored directly in the inode; if present, ext4_readpage_inline handles the read, otherwise ext4_mpage_readpages does.

The overall flow maps the file's logical block addresses to the corresponding physical block addresses on disk, where the file data actually lives, and finally calls submit_bio to read those disk blocks into the page-cache pages.
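
The logical-to-physical mapping is expressed through struct ext4_map_blocks (fs/ext4/ext4.h): m_lblk is the logical block within the file, m_pblk the physical block on disk, m_len the number of contiguous blocks, and m_flags the mapping state:

struct ext4_map_blocks {
	ext4_fsblk_t m_pblk;
	ext4_lblk_t m_lblk;
	unsigned int m_len;
	unsigned int m_flags;
};

With that in mind, here is ext4_mpage_readpages itself: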

int ext4_mpage_readpages(struct inode *inode,
		struct readahead_control *rac, struct page *page)
{
	struct bio *bio = NULL;
	sector_t last_block_in_bio = 0;
 
	const unsigned blkbits = inode->i_blkbits;
	const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
	const unsigned blocksize = 1 << blkbits;
	sector_t next_block;
	sector_t block_in_file;
	sector_t last_block;
	sector_t last_block_in_file;
	sector_t blocks[MAX_BUF_PER_PAGE];
	unsigned page_block;
	struct block_device *bdev = inode->i_sb->s_bdev;
	int length;
	unsigned relative_block = 0;
	struct ext4_map_blocks map;
	unsigned int nr_pages = rac ? readahead_count(rac) : 1;
 
	map.m_pblk = 0;
	map.m_lblk = 0;
	map.m_len = 0;
	map.m_flags = 0;
 
	for (; nr_pages; nr_pages--) {
		int fully_mapped = 1;
		unsigned first_hole = blocks_per_page;
 
		if (rac) {
			page = readahead_page(rac);
			prefetchw(&page->flags);
		}
 
		if (page_has_buffers(page))
			goto confused;
 
		block_in_file = next_block =
			(sector_t)page->index << (PAGE_SHIFT - blkbits);
		last_block = block_in_file + nr_pages * blocks_per_page;
		last_block_in_file = (ext4_readpage_limit(inode) +
				      blocksize - 1) >> blkbits;
		if (last_block > last_block_in_file)
			last_block = last_block_in_file;
		page_block = 0;
 
		/*
		 * Map blocks using the previous result first.
		 */
		if ((map.m_flags & EXT4_MAP_MAPPED) &&
		    block_in_file > map.m_lblk &&
		    block_in_file < (map.m_lblk + map.m_len)) {
			unsigned map_offset = block_in_file - map.m_lblk;
			unsigned last = map.m_len - map_offset;
 
			for (relative_block = 0; ; relative_block++) {
				if (relative_block == last) {
					/* needed? */
					map.m_flags &= ~EXT4_MAP_MAPPED;
					break;
				}
				if (page_block == blocks_per_page)
					break;
				blocks[page_block] = map.m_pblk + map_offset +
					relative_block;
				page_block++;
				block_in_file++;
			}
		}
 
		/*
		 * Then do more ext4_map_blocks() calls until we are
		 * done with this page.
		 */
		while (page_block < blocks_per_page) {
			if (block_in_file < last_block) {
				map.m_lblk = block_in_file;
				map.m_len = last_block - block_in_file;
 
				if (ext4_map_blocks(NULL, inode, &map, 0) < 0) {
				set_error_page:
					SetPageError(page);
					zero_user_segment(page, 0,
							  PAGE_SIZE);
					unlock_page(page);
					goto next_page;
				}
			}
			if ((map.m_flags & EXT4_MAP_MAPPED) == 0) {
				fully_mapped = 0;
				if (first_hole == blocks_per_page)
					first_hole = page_block;
				page_block++;
				block_in_file++;
				continue;
			}
			if (first_hole != blocks_per_page)
				goto confused;		/* hole -> non-hole */
 
			/* Contiguous blocks? */
			if (page_block && blocks[page_block-1] != map.m_pblk-1)
				goto confused;
			for (relative_block = 0; ; relative_block++) {
				if (relative_block == map.m_len) {
					/* needed? */
					map.m_flags &= ~EXT4_MAP_MAPPED;
					break;
				} else if (page_block == blocks_per_page)
					break;
				blocks[page_block] = map.m_pblk+relative_block;
				page_block++;
				block_in_file++;
			}
		}
		if (first_hole != blocks_per_page) {
			zero_user_segment(page, first_hole << blkbits,
					  PAGE_SIZE);
			if (first_hole == 0) {
				if (ext4_need_verity(inode, page->index) &&
				    !fsverity_verify_page(page))
					goto set_error_page;
				SetPageUptodate(page);
				unlock_page(page);
				goto next_page;
			}
		} else if (fully_mapped) {
			SetPageMappedToDisk(page);
		}
		if (fully_mapped && blocks_per_page == 1 &&
		    !PageUptodate(page) && cleancache_get_page(page) == 0) {
			SetPageUptodate(page);
			goto confused;
		}
 
		/*
		 * This page will go to BIO.  Do we need to send this
		 * BIO off first?
		 */
		if (bio && (last_block_in_bio != blocks[0] - 1 ||
			    !fscrypt_mergeable_bio(bio, inode, next_block))) {
		submit_and_realloc:
			submit_bio(bio);
			bio = NULL;
		}
		if (bio == NULL) {
			/*
			 * bio_alloc will _always_ be able to allocate a bio if
			 * __GFP_DIRECT_RECLAIM is set, see bio_alloc_bioset().
			 */
			bio = bio_alloc(GFP_KERNEL, bio_max_segs(nr_pages));
			fscrypt_set_bio_crypt_ctx(bio, inode, next_block,
						  GFP_KERNEL);
			ext4_set_bio_post_read_ctx(bio, inode, page->index);
			bio_set_dev(bio, bdev);
			bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
			bio->bi_end_io = mpage_end_io;
			bio_set_op_attrs(bio, REQ_OP_READ,
						rac ? REQ_RAHEAD : 0);
		}
 
		length = first_hole << blkbits;
		if (bio_add_page(bio, page, length, 0) < length)
			goto submit_and_realloc;
 
		if (((map.m_flags & EXT4_MAP_BOUNDARY) &&
		     (relative_block == map.m_len)) ||
		    (first_hole != blocks_per_page)) {
			submit_bio(bio);
			bio = NULL;
		} else
			last_block_in_bio = blocks[blocks_per_page - 1];
		goto next_page;
	confused:
		if (bio) {
			submit_bio(bio);
			bio = NULL;
		}
		if (!PageUptodate(page))
			block_read_full_page(page, ext4_get_block);
		else
			unlock_page(page);
	next_page:
		if (rac)
			put_page(page);
	}
	if (bio)
		submit_bio(bio);
	return 0;
}

This builds the requests and submits bio requests to the block device to read the data.

For each page it first prefetches the page structure (prefetchw), then computes the range of blocks to read and maps the logical blocks onto physical blocks, reusing the previous ext4_map_blocks result where possible to avoid redundant calls.

Contiguous blocks are then batched into a single request, and pages that contain holes need special handling:

  • Page partially a hole: the hole portion is zero-filled.
  • Page entirely a hole: the page is marked Uptodate and unlocked directly.
  • Page fully mapped: the PageMappedToDisk flag is set.

Finally the bio is submitted.
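
Condensing the bio-related calls seen above into a minimal sketch for a single page (all names are placeholders, not actual ext4 code; the API shown matches the kernel version quoted here):

/* Build and submit one read bio for a single page. */
static void submit_one_page_read(struct block_device *bdev, struct page *page,
				 sector_t phys_block, unsigned int blkbits,
				 bio_end_io_t *end_io)
{
	struct bio *bio = bio_alloc(GFP_KERNEL, 1);	/* one bio_vec */

	bio_set_dev(bio, bdev);
	/* bi_sector is in 512-byte units, hence the (blkbits - 9) shift */
	bio->bi_iter.bi_sector = phys_block << (blkbits - 9);
	bio->bi_end_io = end_io;			/* completion callback */
	bio_set_op_attrs(bio, REQ_OP_READ, 0);
	bio_add_page(bio, page, PAGE_SIZE, 0);
	submit_bio(bio);
}

For reference, the struct bio being built and submitted: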

struct bio {
	struct bio		*bi_next;	/* request queue link */
	struct block_device	*bi_bdev;
	unsigned int		bi_opf;		/* bottom bits REQ_OP, top bits
						 * req_flags.
						 */
	unsigned short		bi_flags;	/* BIO_* below */
	unsigned short		bi_ioprio;
	unsigned short		bi_write_hint;
	blk_status_t		bi_status;
	atomic_t		__bi_remaining;
 
	struct bvec_iter	bi_iter;
 
	bio_end_io_t		*bi_end_io;
 
	void			*bi_private;
#ifdef CONFIG_BLK_CGROUP
	/*
	 * Represents the association of the css and request_queue for the bio.
	 * If a bio goes direct to device, it will not have a blkg as it will
	 * not have a request_queue associated with it.  The reference is put
	 * on release of the bio.
	 */
	struct blkcg_gq		*bi_blkg;
	struct bio_issue	bi_issue;
#ifdef CONFIG_BLK_CGROUP_IOCOST
	u64			bi_iocost_cost;
#endif
#endif
 
#ifdef CONFIG_BLK_INLINE_ENCRYPTION
	struct bio_crypt_ctx	*bi_crypt_context;
#endif
 
	union {
#if defined(CONFIG_BLK_DEV_INTEGRITY)
		struct bio_integrity_payload *bi_integrity; /* data integrity */
#endif
	};
 
	unsigned short		bi_vcnt;	/* how many bio_vec's */
 
	/*
	 * Everything starting with bi_max_vecs will be preserved by bio_reset()
	 */
 
	unsigned short		bi_max_vecs;	/* max bvl_vecs we can hold */
 
	atomic_t		__bi_cnt;	/* pin count */
 
	struct bio_vec		*bi_io_vec;	/* the actual vec list */
 
	struct bio_set		*bi_pool;
 
	/*
	 * We can inline a number of vecs at the end of the bio, to avoid
	 * double allocations for a small number of bio_vecs. This member
	 * MUST obviously be kept at the very end of the bio.
	 */
	struct bio_vec		bi_inline_vecs[];
};

submit_bio

The flow beyond this point is not analyzed for now; it sits too far below the layers that upper-level storage applications can influence or optimize.

blk_qc_t submit_bio(struct bio *bio)
{
	if (blkcg_punt_bio_submit(bio))
		return BLK_QC_T_NONE;
 
	/*
	 * If it's a regular read/write or a barrier with data attached,
	 * go through the normal accounting stuff before submission.
	 */
	if (bio_has_data(bio)) {
		unsigned int count;
 
		if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME))
			count = queue_logical_block_size(
					bio->bi_bdev->bd_disk->queue) >> 9;
		else
			count = bio_sectors(bio);
 
		if (op_is_write(bio_op(bio))) {
			count_vm_events(PGPGOUT, count);
		} else {
			task_io_account_read(bio->bi_iter.bi_size);
			count_vm_events(PGPGIN, count);
		}
	}
 
	/*
	 * If we're reading data that is part of the userspace workingset, count
	 * submission time as memory stall.  When the device is congested, or
	 * the submitting cgroup IO-throttled, submission can be a significant
	 * part of overall IO time.
	 */
	if (unlikely(bio_op(bio) == REQ_OP_READ &&
	    bio_flagged(bio, BIO_WORKINGSET))) {
		unsigned long pflags;
		blk_qc_t ret;
 
		psi_memstall_enter(&pflags);
		ret = submit_bio_noacct(bio);
		psi_memstall_leave(&pflags);
 
		return ret;
	}
 
	return submit_bio_noacct(bio);
}