nvme_queue_rq function analysis, part one

Data interaction during an NVMe I/O request

The main function is nvme_queue_rq:

static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd)
{
	struct nvme_ns *ns = hctx->queue->queuedata;
	struct nvme_queue *nvmeq = hctx->driver_data;
	struct nvme_dev *dev = nvmeq->dev;
	struct request *req = bd->rq;
	struct nvme_command cmnd;
	blk_status_t ret;

	if (unlikely(nvmeq->cq_vector < 0))
		return BLK_STS_IOERR;

	ret = nvme_setup_cmd(ns, req, &cmnd);
	if (ret)
		return ret;

	ret = nvme_init_iod(req, dev);
	if (ret)
		goto out_free_cmd;

	if (blk_rq_nr_phys_segments(req)) { //number of physical segments; each segment is not necessarily 4096 bytes long
		ret = nvme_map_data(dev, req, &cmnd);
		if (ret)
			goto out_cleanup_iod;
	}

	blk_mq_start_request(req);
	nvme_submit_cmd(nvmeq, &cmnd); //copy the command into the SQ, then write the doorbell register
	return BLK_STS_OK;
out_cleanup_iod:
	nvme_free_iod(dev, req);
out_free_cmd:
	nvme_cleanup_cmd(req);
	return ret;
}
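As an aside, the doorbell write mentioned in the comment happens inside nvme_submit_cmd. A simplified sketch of it is shown below; the shadow-doorbell (dbbuf) handling and CMB submission-queue variants present in real kernels are omitted, so treat this as an approximation rather than the exact driver code:

static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
{
	spin_lock(&nvmeq->sq_lock);
	//copy the 64-byte command into the submission queue slot at sq_tail
	memcpy(&nvmeq->sq_cmds[nvmeq->sq_tail], cmd, sizeof(*cmd));
	//advance the tail, wrapping at the queue depth
	if (++nvmeq->sq_tail == nvmeq->q_depth)
		nvmeq->sq_tail = 0;
	//ring the SQ doorbell so the controller fetches the new command
	writel(nvmeq->sq_tail, nvmeq->q_db);
	spin_unlock(&nvmeq->sq_lock);
}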

The focus here is the nvme_map_data function. Before digging into it, first look at nvme_init_iod, which helps with the later analysis.

nvme_init_iod function:

static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(rq);
	int nseg = blk_rq_nr_phys_segments(rq);
	unsigned int size = blk_rq_payload_bytes(rq);

	iod->use_sgl = nvme_pci_use_sgls(dev, rq); //decide whether to use SGLs or PRPs
	//the condition expands to: nseg > 2 || size > 2 * (dev)->ctrl.page_size (8192 with a 4096-byte page)
	if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) {
		iod->sg = mempool_alloc(dev->iod_mempool, GFP_ATOMIC); //allocation of alloc_size bytes, see nvme_probe below
		if (!iod->sg)
			return BLK_STS_RESOURCE;
	} else {
		iod->sg = iod->inline_sg; //inline_sg is a zero-length array at the end of struct nvme_iod, so it points into spare per-request PDU space reserved at queue setup; no separate allocation is needed for small requests
	}

	iod->aborted = 0;
	iod->npages = -1;
	iod->nents = 0;
	iod->length = size;
	return BLK_STS_OK;
}
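For reference, the decision made by nvme_pci_use_sgls roughly looks as follows. This is a paraphrased sketch from the 4.x driver: sgl_threshold is the real module parameter and dev->ctrl.sgls the real identify-controller field, but the exact checks vary by kernel version, and the real function also refuses SGLs on the admin queue.

//paraphrased sketch of nvme_pci_use_sgls(); details differ between kernel versions
static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
{
	int nseg = blk_rq_nr_phys_segments(req);
	unsigned int avg_seg_size;

	if (nseg == 0)
		return false;
	avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg);

	if (!(dev->ctrl.sgls & ((1 << 0) | (1 << 1))))
		return false; //controller does not advertise SGL support
	if (!sgl_threshold || avg_seg_size < sgl_threshold)
		return false; //small average segments: PRPs are good enough
	return true;
}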

Assume for now that iod->sg takes the first branch, i.e. the memory comes from the mempool and its size is alloc_size.
Where does that value come from?
In the nvme_probe function there is this piece of code:

	alloc_size = nvme_pci_iod_alloc_size(dev, NVME_MAX_KB_SZ, NVME_MAX_SEGS, true);
	WARN_ON_ONCE(alloc_size > PAGE_SIZE); //warn (only once) if the iod allocation would exceed a page; here alloc_size works out to 2040

	dev->iod_mempool = mempool_create_node(1, mempool_kmalloc, mempool_kfree,
					(void *) alloc_size, GFP_KERNEL, node);
	if (!dev->iod_mempool) {
		result = -ENOMEM;
		goto release_pools;
	}

Among them, NVME_MAX_KB_SZ is 4096 and NVME_MAX_SEGS is 127. Let’s look at the nvme_pci_iod_alloc_size function.

static int nvme_npages(unsigned size, struct nvme_dev *dev)
{
	unsigned nprps = DIV_ROUND_UP(size + dev->ctrl.page_size, dev->ctrl.page_size);
	return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); //some memory may be wasted
}

//number of pages needed for the SGL descriptors; a 4K page holds 256 SGL descriptors
static int nvme_pci_npages_sgl(unsigned int num_seg)
{
	return DIV_ROUND_UP(num_seg * sizeof(struct nvme_sgl_desc), PAGE_SIZE); //DIV_ROUND_UP(127 * 16, 4096) = 1 here
}

static unsigned int nvme_pci_iod_alloc_size(struct nvme_dev *dev, unsigned int size, unsigned int nseg, bool use_sgl)
{
	size_t alloc_size;

	if (use_sgl)
		alloc_size = sizeof(__le64 *) * nvme_pci_npages_sgl(nseg); //8 * 1
	else
		alloc_size = sizeof(__le64 *) * nvme_npages(size, dev);

	return alloc_size + sizeof(struct scatterlist) * nseg; //alloc_size (records the descriptor pages) plus the scatterlist array used for the mapping
}

Because use_sgl is passed as true, the first branch is taken. So what does alloc_size = sizeof(__le64 *) * nvme_pci_npages_sgl(nseg); mean? Conclusion first: this part of the buffer records the descriptor pages that will later be allocated from the DMA pools, so that they can be freed afterwards. Each record is a 64-bit pointer, hence sizeof(__le64 *) multiplied by nvme_pci_npages_sgl(nseg), the number of descriptor pages that nseg segments can require (the function itself is easy to follow). Note that NVME_MAX_KB_SZ and NVME_MAX_SEGS are values that can be tuned upwards. The sizeof(struct scatterlist) * nseg part is the scatterlist array (up to 127 entries here) that the SGL mapping is built from later; the pointer slots behind it record the mapped descriptor pages.
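Plugging in the numbers (4096-byte pages, use_sgl == true, and taking sizeof(struct scatterlist) as 16 bytes as the comments above do; on configurations where struct scatterlist is larger the result grows accordingly):

	nvme_pci_npages_sgl(127) = DIV_ROUND_UP(127 * 16, 4096) = 1
	alloc_size               = sizeof(__le64 *) * 1         = 8
	returned allocation size = 8 + 16 * 127                 = 2040

This matches the alloc_size = 2040 noted next to the WARN_ON_ONCE in nvme_probe.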

Then come back and look at the nvme_map_data function.

static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, struct nvme_command *cmnd)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct request_queue *q = req->q;
	enum dma_data_direction dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE; //data transfer direction
	blk_status_t ret = BLK_STS_IOERR;
	int nr_mapped;

	//initialize iod->sg; blk_rq_nr_phys_segments(req) gives the number of sg entries
	sg_init_table(iod->sg, blk_rq_nr_phys_segments(req));
	iod->nents = blk_rq_map_sg(q, req, iod->sg); //fill iod->sg from the request's bios
	if (!iod->nents)
		goto out;

	ret = BLK_STS_RESOURCE;
	//with the scatterlist filled in, the DMA mapping can be done
	nr_mapped = dma_map_sg_attrs(dev->dev, iod->sg, iod->nents, dma_dir, DMA_ATTR_NO_WARN); //DMA-map iod->sg
	if (!nr_mapped)
		goto out;

	if (iod->use_sgl)
		ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw, nr_mapped);
	else
		ret = nvme_pci_setup_prps(dev, req, &cmnd->rw);

	if (ret != BLK_STS_OK)
		goto out_unmap;

	ret = BLK_STS_IOERR;
	//map the integrity metadata (a single scatterlist entry) and hand its DMA address to cmnd->rw.metadata below
	if (blk_integrity_rq(req)) {
		if (blk_rq_count_integrity_sg(q, req->bio) != 1)
			goto out_unmap;

		sg_init_table(&iod->meta_sg, 1);
		if (blk_rq_map_integrity_sg(q, req->bio, &iod->meta_sg) != 1)
			goto out_unmap;

		if (!dma_map_sg(dev->dev, &iod->meta_sg, 1, dma_dir))
			goto out_unmap;
	}

	if (blk_integrity_rq(req))
		cmnd->rw.metadata = cpu_to_le64(sg_dma_address(&iod->meta_sg));
	return BLK_STS_OK;
out_unmap:
	dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir);
out:
	return ret;
}
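A side note on the scatterlist API used here: after dma_map_sg_attrs() returns nr_mapped entries, each mapped entry is read back with sg_dma_address()/sg_dma_len(). The sketch below shows that generic consumption pattern; walk_mapped_sg is only an illustrative helper, not driver code, and nvme_pci_setup_sgls() below does the same walk using sg_next().

//illustrative only: how a DMA-mapped scatterlist is consumed
static void walk_mapped_sg(struct scatterlist *sgl, int nr_mapped)
{
	struct scatterlist *sg;
	int i;

	for_each_sg(sgl, sg, nr_mapped, i) {
		dma_addr_t addr = sg_dma_address(sg); //bus address the device will use
		unsigned int len = sg_dma_len(sg);    //length of this mapped segment

		//build one SGL data descriptor (or a run of PRP entries) from addr/len
		(void)addr;
		(void)len;
	}
}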

Now look at the nvme_pci_setup_sgls function; nvme_pci_setup_prps is left for the next article.

static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev, struct request *req, struct nvme_rw_command *cmd, int entries)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct dma_pool *pool;
	struct nvme_sgl_desc *sg_list;
	struct scatterlist *sg = iod->sg;
	dma_addr_t sgl_dma;
	int i = 0;

	cmd->flags = NVME_CMD_SGL_METABUF; //setting the transfer type as SGL
	if (entries == 1) {
		nvme_pci_sgl_set_data(&cmd->dptr.sgl, sg); //a single entry: write the data descriptor straight into the command's dptr.sgl
		return BLK_STS_OK;
	}

	//choose a pool based on the number of SGEs so memory is not wasted
	if (entries <= (256 / sizeof(struct nvme_sgl_desc))) { //256 / 16 = 16: how many struct nvme_sgl_desc fit in 256 bytes
		pool = dev->prp_small_pool; //each element of this pool is 256 bytes, room for 16 descriptors
		iod->npages = 0;
	} else {
		pool = dev->prp_page_pool; //each element is a 4096-byte page: 4096 / 16 = 256 descriptors
		iod->npages = 1;
	}

	sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
	if (!sg_list) {
		iod->npages = -1;
		return BLK_STS_RESOURCE;
	}
	/*
	 * These two records exist so the allocation can be freed later in
	 * nvme_free_iod().  The first is equivalent to
	 * *((void **)(iod->sg + blk_rq_nr_phys_segments(req))) = sg_list,
	 * i.e. the list's virtual address is stored in the pointer area
	 * that sits behind the scatterlist array.
	 */
	nvme_pci_iod_list(req)[0] = sg_list;
	iod->first_dma = sgl_dma;

	nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries); //point the command's dptr.sgl segment descriptor at the first list
	do {
		if (i == SGES_PER_PAGE) { //SGES_PER_PAGE is 256; this can only happen on the prp_page_pool branch
			struct nvme_sgl_desc *old_sg_desc = sg_list;
			struct nvme_sgl_desc *link = &old_sg_desc[i - 1];

			sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
			if (!sg_list)
				return BLK_STS_RESOURCE;
			i = 0;
			nvme_pci_iod_list(req)[iod->npages++] = sg_list; //record the new list's virtual address for later freeing
			/*
			 * The last slot of the previous list is about to be turned
			 * into a segment descriptor linking to this new list, so the
			 * data descriptor that was stored there is moved to the first
			 * slot of the new list.
			 */
			sg_list[i++] = *link;
			nvme_pci_sgl_set_seg(link, sgl_dma, entries);
		}
		nvme_pci_sgl_set_data(&sg_list[i++], sg);
		sg = sg_next(sg);
	} while (--entries > 0);
	return BLK_STS_OK;
}
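Two remarks on the chaining loop. First, with the NVME_MAX_SEGS = 127 cap set up in nvme_probe, entries can never reach SGES_PER_PAGE (256), so on this kernel the i == SGES_PER_PAGE branch is effectively never taken: a request either uses the single-descriptor fast path, fits its descriptors into one 256-byte list from prp_small_pool (up to 16 entries), or fits them into one 4096-byte list from prp_page_pool (up to 256 entries). Second, when the branch does run (with larger segment limits), the last slot of each full list is rewritten by nvme_pci_sgl_set_seg() into a segment descriptor pointing at the next list, the data descriptor it displaced moves to slot 0 of that next list, and every list's virtual address is recorded through nvme_pci_iod_list() so that nvme_free_iod() can walk the chain and free it.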

Now look at the data structures behind the read/write command. NVMe commands are all 64 bytes.

struct nvme_sgl_desc {
	__le64	addr;
	__le32	length;
	__u8	rsvd[3];
	__u8	type;
};

struct nvme_keyed_sgl_desc {
	__le64	addr;
	__u8	length[3];
	__u8	key[4];
	__u8	type;
};

union nvme_data_ptr {
	struct {
		__le64	prp1;
		__le64	prp2;
	};
	struct nvme_sgl_desc	sgl;
	struct nvme_keyed_sgl_desc ksgl;
};

struct nvme_rw_command {
	__u8			opcode;
	__u8			flags;
	__u16			command_id;
	__le32			nsid;
	__u64			rsvd2;
	__le64			metadata;
	union nvme_data_ptr	dptr;
	__le64			slba;
	__le16			length;
	__le16			control;
	__le32			dsmgmt;
	__le32			reftag;
	__le16			apptag;
	__le16			appmask;
};

This is how the data pointer is expressed: with SGLs, each descriptor's addr records the address and length records the byte length of one SGE; with PRPs there are only two 64-bit pointers (prp1 and prp2), and the addresses plus the transfer length have to be expressed through them, which is more awkward.
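As an aside, the 64-byte command size and the 16-byte descriptor size relied on in the arithmetic above are enforced at compile time. Below is a trimmed-down illustration of the driver's checks; the real _nvme_check_size() in pci.c covers many more structures, and nvme_check_size_example is just a name chosen here:

//trimmed-down illustration of the driver's compile-time size checks
static inline void nvme_check_size_example(void)
{
	BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_sgl_desc) != 16);
	BUILD_BUG_ON(sizeof(struct nvme_keyed_sgl_desc) != 16);
}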

static void nvme_pci_sgl_set_data(struct nvme_sgl_desc *sge, struct scatterlist *sg)
{
	sge->addr = cpu_to_le64(sg_dma_address(sg));
	sge->length = cpu_to_le32(sg_dma_len(sg));
	sge->type = NVME_SGL_FMT_DATA_DESC << 4;
}

static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge, dma_addr_t dma_addr, int entries)
{
	sge->addr = cpu_to_le64(dma_addr);
	if (entries < SGES_PER_PAGE) {
		sge->length = cpu_to_le32(entries * sizeof(*sge));
		sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4;
	} else {
		sge->length = cpu_to_le32(PAGE_SIZE);
		sge->type = NVME_SGL_FMT_SEG_DESC << 4;
	}
}
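For reference, the descriptor type values used above are defined in include/linux/nvme.h; the numbers below are as I recall them from 4.x/5.x kernels, so double-check your tree. The << 4 shift puts the type into the upper four bits of the type byte, with the lower four bits holding the descriptor sub type.

enum {
	NVME_SGL_FMT_DATA_DESC     = 0x00, //plain data block descriptor
	NVME_SGL_FMT_SEG_DESC      = 0x02, //points to a further SGL segment
	NVME_SGL_FMT_LAST_SEG_DESC = 0x03, //points to the final SGL segment
};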
static void **nvme_pci_iod_list(struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	//the scatterlist array sits at the front of the buffer, so the page-pointer list starts right behind it, at iod->sg + blk_rq_nr_phys_segments(req)
	return (void **)(iod->sg + blk_rq_nr_phys_segments(req));
}
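Putting nvme_init_iod and nvme_pci_iod_list together, the single buffer behind iod->sg is used as two back-to-back arrays (layout sketch):

	iod->sg
	  |
	  v
	+----------------+----------------+-----+---------------------+-----------+-----------+-----+
	| scatterlist[0] | scatterlist[1] | ... | scatterlist[nseg-1] | void *[0] | void *[1] | ... |
	+----------------+----------------+-----+---------------------+-----------+-----------+-----+
	                                                               ^
	                        nvme_pci_iod_list(req) points here: each void * slot stores the
	                        virtual address of one descriptor page taken from the dma_pool,
	                        so nvme_free_iod() can return it later.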

Finally, look at the code that releases everything:

static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	enum dma_data_direction dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;

	if (iod->nents) {
		dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir);
		if (blk_integrity_rq(req))
			dma_unmap_sg(dev->dev, &iod->meta_sg, 1, dma_dir);
	}

	nvme_cleanup_cmd(req);
	nvme_free_iod(dev, req);
}

static void nvme_free_iod(struct nvme_dev *dev, struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	const int last_prp = dev->ctrl.page_size / sizeof(__le64) - 1;
	dma_addr_t dma_addr = iod->first_dma, next_dma_addr;
	int i;

	if (iod->npages == 0)
		dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0], dma_addr);

	for (i = 0; i < iod->npages; i++) {
		void *addr = nvme_pci_iod_list(req)[i]; //this is a virtual address

		if (iod->use_sgl) {
			struct nvme_sgl_desc *sg_list = addr;
			//the addr of the last descriptor (index SGES_PER_PAGE - 1 = 255) holds the DMA address of the next list
			next_dma_addr = le64_to_cpu((sg_list[SGES_PER_PAGE - 1]).addr);
		} else {
			__le64 *prp_list = addr;
			next_dma_addr = le64_to_cpu(prp_list[last_prp]);
		}
		dma_pool_free(dev->prp_page_pool, addr, dma_addr);
		dma_addr = next_dma_addr;
	}

	//if they differ, iod->sg was obtained from mempool_alloc(), so give it back to the mempool here
	if (iod->sg != iod->inline_sg)
		mempool_free(iod->sg, dev->iod_mempool);
}
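Tying this back to the SGL setup path: under the 127-segment cap, iod->npages ends up as -1 (a single descriptor written directly into the command, nothing taken from the pools), 0 (one 256-byte list from prp_small_pool, freed by the first branch using iod->first_dma), or 1 (one page from prp_page_pool, freed by a single pass of the loop). The walk through sg_list[SGES_PER_PAGE - 1].addr only comes into play when more than one descriptor page was chained.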

The next article will analyze the prp code.