Data-path analysis of an NVMe I/O request
The main function is nvme_queue_rq:
static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) {<!-- --> struct nvme_ns *ns = hctx->queue->queuedata; struct nvme_queue *nvmeq = hctx->driver_data; struct nvme_dev *dev = nvmeq->dev; struct request *req = bd->rq; struct nvme_command cmnd; blk_status_t ret; if (unlikely(nvmeq->cq_vector < 0)) return BLK_STS_IOERR; ret = nvme_setup_cmd(ns, req, & amp;cmnd); if (ret) return ret; ret = nvme_init_iod(req, dev); if (ret) goto out_free_cmd; if (blk_rq_nr_phys_segments(req)) {<!-- --> //The number of physical segments, the length of each physical segment is not necessarily 4096 ret = nvme_map_data(dev, req, & amp;cmnd); if (ret) goto out_cleanup_iod; } blk_mq_start_request(req); nvme_submit_cmd(nvmeq, & amp;cmnd);//Submit the command to the sq queue and then write to the db register return BLK_STS_OK; out_cleanup_iod: nvme_free_iod(dev, req); out_free_cmd: nvme_cleanup_cmd(req); return ret; }
The focus here is the nvme_map_data function. Before analyzing it, first take a look at nvme_init_iod, which will help with the later discussion.
nvme_init_iod function:
static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev) {<!-- --> struct nvme_iod *iod = blk_mq_rq_to_pdu(rq); int nseg = blk_rq_nr_phys_segments(rq); unsigned int size = blk_rq_payload_bytes(rq); iod->use_sgl = nvme_pci_use_sgls(dev, rq);//Determine whether to use sgl or prp //nseg > 2 || size > 2 * (dev)->ctrl.page_size (assuming the value is 4096) if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) {<!-- --> iod->sg = mempool_alloc(dev->iod_mempool, GFP_ATOMIC);//The size is alloc_size if (!iod->sg) return BLK_STS_RESOURCE; } else {<!-- --> iod->sg = iod->inline_sg; //This feels a bit strange. This variable is a pointer. Can it be used directly without applying for memory space later? } iod->aborted = 0; iod->npages = -1; iod->nents = 0; iod->length = size; return BLK_STS_OK; }
Assume for now that iod->sg takes the first branch, i.e. the memory is allocated from the memory pool with size alloc_size.
Where does that value come from?
In the nvme_probe function, there is this piece of code:
alloc_size = nvme_pci_iod_alloc_size(dev, NVME_MAX_KB_SZ, NVME_MAX_SEGS, true); WARN_ON_ONCE(alloc_size > PAGE_SIZE); //Only print exception information once alloc_size = 2040 dev->iod_mempool = mempool_create_node(1, mempool_kmalloc, mempool_kfree, (void *) alloc_size, GFP_KERNEL, node); if (!dev->iod_mempool) {<!-- --> result = -ENOMEM; goto release_pools; }
Among them, NVME_MAX_KB_SZ is 4096 and NVME_MAX_SEGS is 127. Let’s look at the nvme_pci_iod_alloc_size function.
static int nvme_npages(unsigned size, struct nvme_dev *dev) {<!-- --> unsigned nprps = DIV_ROUND_UP(size + dev->ctrl.page_size, dev->ctrl.page_size); return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); //Some memory may be wasted } static int nvme_pci_npages_sgl(unsigned int num_seg) //Calculate the number of pages required for the SGL segment. For example, a 4k page can hold 256 SGL descriptors. {<!-- --> return DIV_ROUND_UP(num_seg * sizeof(struct nvme_sgl_desc), PAGE_SIZE); //int(A/B) + 1 ->int(127 * 16)/4096 + 1 } static unsigned int nvme_pci_iod_alloc_size(struct nvme_dev *dev, unsigned int size, unsigned int nseg, bool use_sgl) {<!-- --> size_t alloc_size; if (use_sgl) alloc_size = sizeof(__le64 *) * nvme_pci_npages_sgl(nseg);//8 * 1 else alloc_size = sizeof(__le64 *) * nvme_npages(size, dev); return alloc_size + sizeof(struct scatterlist) * nseg; //alloc_size + 16 * 127 mapping + record }
Because use_sgl is passed in as true, the first branch is taken, so alloc_size ends up as the expression alloc_size = sizeof(__le64 *) * nvme_pci_npages_sgl(nseg). What does this mean? In short: this space is used later to record the SGL list pages allocated from the DMA pool, so that they can be freed afterwards. Each recorded address is a 64-bit pointer, hence sizeof(__le64 *) multiplied by nvme_pci_npages_sgl(nseg); the nvme_pci_npages_sgl() function itself is straightforward to read. Note that NVME_MAX_KB_SZ and NVME_MAX_SEGS are tunable values, and the scatterlist portion (nseg entries, up to 127) is the memory later used for the SGL mapping — the mapped SGL entries are recorded there.
Then come back and look at the nvme_map_data function.
static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, struct nvme_command *cmnd) {<!-- --> struct nvme_iod *iod = blk_mq_rq_to_pdu(req); struct request_queue *q = req->q; enum dma_data_direction dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;//Data transfer direction blk_status_t ret = BLK_STS_IOERR; int nr_mapped; //Mainly initializing iod->sg, the number of sge segments returned by blk_rq_nr_phys_segments(req) sg_init_table(iod->sg, blk_rq_nr_phys_segments(req)); iod->nents = blk_rq_map_sg(q, req, iod->sg);//This function mainly transfers the data in bio to iod->sg if (!iod->nents) goto out; ret = BLK_STS_RESOURCE; //With the data here, we can do dma mapping nr_mapped = dma_map_sg_attrs(dev->dev, iod->sg, iod->nents, dma_dir, DMA_ATTR_NO_WARN);//iod->sg dma mapping if (!nr_mapped) goto out; if (iod->use_sgl) ret = nvme_pci_setup_sgls(dev, req, & amp;cmnd->rw, nr_mapped); else ret = nvme_pci_setup_prps(dev, req, & amp;cmnd->rw); if (ret != BLK_STS_OK) goto out_unmap; ret = BLK_STS_IOERR; //This if statement performs a map operation for metadata, and then gives the dma address to the cmnd->rw.metadata member. It seems that the amount of data should not be too large. if (blk_integrity_rq(req)) {<!-- --> if (blk_rq_count_integrity_sg(q, req->bio) != 1) goto out_unmap; sg_init_table( & amp;iod->meta_sg, 1); if (blk_rq_map_integrity_sg(q, req->bio, & amp;iod->meta_sg) != 1) goto out_unmap; if (!dma_map_sg(dev->dev, & amp;iod->meta_sg, 1, dma_dir)) goto out_unmap; } if (blk_integrity_rq(req)) cmnd->rw.metadata = cpu_to_le64(sg_dma_address( & amp;iod->meta_sg)); return BLK_STS_OK; out_unmap: dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir); out: return ret; }
Then look at the nvme_pci_setup_sgls function first, and then look at the nvme_pci_setup_prps function.
static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev, struct request *req, struct nvme_rw_command *cmd, int entries) {<!-- --> struct nvme_iod *iod = blk_mq_rq_to_pdu(req); struct dma_pool *pool; struct nvme_sgl_desc *sg_list; struct scatterlist *sg = iod->sg; dma_addr_t sgl_dma; int i = 0; cmd->flags = NVME_CMD_SGL_METABUF; //setting the transfer type as SGL if (entries == 1) {<!-- --> nvme_pci_sgl_set_data( & amp;cmd->dptr.sgl, sg); //If you use sgl, use the struct nvme_sgl_desc structure return BLK_STS_OK; } //This is based on the number of sges to specify which branch to take to avoid wasting memory. if (entries <= (256 / sizeof(struct nvme_sgl_desc))) {<!-- --> //256 / 16 = 16 The total size is 256, divided by 16 means how many struct nvme_sgl_desc can be placed pool = dev->prp_small_pool; //size 256 (if you take this branch, the size of the pool is 256, which can represent 16 struct nvme_sgl_desc) iod->npages = 0; } else {<!-- --> pool = dev->prp_page_pool; //Size 4096/16 = 256 so it can represent 256 struct nvme_sgl_desc iod->npages = 1; } sg_list = dma_pool_alloc(pool, GFP_ATOMIC, & amp;sgl_dma); if (!sg_list) {<!-- --> iod->npages = -1; return BLK_STS_RESOURCE; } //Are these two operations for later release? Looks like yes /* Record this value, similar to *(iod->sg + blk_rq_nr_phys_segments) = sg_list Using a secondary pointer can record a larger range. 
If a primary pointer is used to record an incomplete range of values, */ nvme_pci_iod_list(req)[0] = sg_list; iod->first_dma = sgl_dma; nvme_pci_sgl_set_seg( & amp;cmd->dptr.sgl, sgl_dma, entries); //Set the starting address of the chain for rw command sgl do {<!-- --> if (i == SGES_PER_PAGE) {<!-- --> //256 pool = dev->prp_page_pool; this branch will be taken struct nvme_sgl_desc *old_sg_desc = sg_list; struct nvme_sgl_desc *link = & amp;old_sg_desc[i - 1]; sg_list = dma_pool_alloc(pool, GFP_ATOMIC, & amp;sgl_dma); if (!sg_list) return BLK_STS_RESOURCE; i = 0; nvme_pci_iod_list(req)[iod->npages + + ] = sg_list;//Record the applied dma addr for later release /*Because the last sg_desc of the previous sg_list is used to record the linked list, *link the last record data of the previous list place, change to the first position of the next list. */ sg_list[i + + ] = *link; nvme_pci_sgl_set_seg(link, sgl_dma, entries); } nvme_pci_sgl_set_data( & amp;sg_list[i + + ], sg); sg = sg_next(sg); } while (--entries > 0); return BLK_STS_OK; }
Let’s first look at the data structure of read and write commands. NVME commands are all 64 bytes.
struct nvme_sgl_desc {<!-- --> __le64 addr; __le32 length; __u8 rsvd[3]; __u8 type; }; struct nvme_keyed_sgl_desc {<!-- --> __le64 addr; __u8 length[3]; __u8 key[4]; __u8 type; }; union nvme_data_ptr {<!-- --> struct {<!-- --> __le64 prp1; __le64 prp2; }; struct nvme_sgl_desc sgl; struct nvme_keyed_sgl_desc ksgl; }; struct nvme_rw_command {<!-- --> __u8 opcode; __u8 flags; __u16 command_id; __le32 nsid; __u64 rsvd2; __le64 metadata; union nvme_data_ptr dptr; __le64 slba; __le16 length; __le16 control; __le32 dsmgmt; __le32 reftag; __le16 apptag; __le16 appmask; };
With this layout, an SGL descriptor records both the address (addr) and the length of an SGE (length), whereas the PRP form has only two 64-bit pointers (prp1/prp2) to express both the address and the amount of data to transfer — which makes it somewhat more cumbersome.
static void nvme_pci_sgl_set_data(struct nvme_sgl_desc *sge, struct scatterlist *sg) {<!-- --> sge->addr = cpu_to_le64(sg_dma_address(sg)); sge->length = cpu_to_le32(sg_dma_len(sg)); sge->type = NVME_SGL_FMT_DATA_DESC << 4; } static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge, dma_addr_t dma_addr, int entries) {<!-- --> sge->addr = cpu_to_le64(dma_addr); if (entries < SGES_PER_PAGE) {<!-- --> sge->length = cpu_to_le32(entries * sizeof(*sge)); sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4; } else {<!-- --> sge->length = cpu_to_le32(PAGE_SIZE); sge->type = NVME_SGL_FMT_SEG_DESC << 4; } } static void **nvme_pci_iod_list(struct request *req) {<!-- --> struct nvme_iod *iod = blk_mq_rq_to_pdu(req); //The mapped address is recorded in the front, so it is iod->sg plus blk_rq_nr_phys_segments(req) return (void **)(iod->sg + blk_rq_nr_phys_segments(req)); }
Finally, let’s take a look at the released code:
static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) {<!-- --> struct nvme_iod *iod = blk_mq_rq_to_pdu(req); enum dma_data_direction dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE; if (iod->nents) {<!-- --> dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir); if (blk_integrity_rq(req)) dma_unmap_sg(dev->dev, & amp;iod->meta_sg, 1, dma_dir); } nvme_cleanup_cmd(req); nvme_free_iod(dev, req); }
static void nvme_free_iod(struct nvme_dev *dev, struct request *req) {<!-- --> struct nvme_iod *iod = blk_mq_rq_to_pdu(req); const int last_prp = dev->ctrl.page_size / sizeof(__le64) - 1; dma_addr_t dma_addr = iod->first_dma, next_dma_addr; int i; if (iod->npages == 0) dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0], dma_addr); for (i = 0; i < iod->npages; i + + ) {<!-- --> void *addr = nvme_pci_iod_list(req)[i];//This is a virtual address if (iod->use_sgl) {<!-- --> struct nvme_sgl_desc *sg_list = addr; //256 - 1 The addr of the last descriptor records the starting dma address of the next list dma pool. next_dma_addr = le64_to_cpu((sg_list[SGES_PER_PAGE - 1]).addr); } else {<!-- --> __le64 *prp_list = addr; next_dma_addr = le64_to_cpu(prp_list[last_prp]); } dma_pool_free(dev->prp_page_pool, addr, dma_addr); dma_addr = next_dma_addr; } //Inequality indicates that iod->sg applies for memory through mempool_alloc, and is released through mempool_free here. if (iod->sg != iod->inline_sg) mempool_free(iod->sg, dev->iod_mempool); }
The next article will analyze the prp code.