SPDK NVMe/TCP Buffer Handling

Posted on June 11, 2023

Structures

There is req->iov, which manages the request's buffers, and pdu->iov / pdu->data_iov, which manage the PDU's data.

req->iov manages the buffer region that is allocated in io_unit_size chunks before the R2T is sent.

struct nvme_tcp_pdu {
    union {
        /* to hold error pdu data */
        uint8_t                 raw[SPDK_NVME_TCP_TERM_REQ_PDU_MAX_SIZE];
        struct spdk_nvme_tcp_common_pdu_hdr common;
        struct spdk_nvme_tcp_ic_req     ic_req;
        struct spdk_nvme_tcp_term_req_hdr   term_req;
        struct spdk_nvme_tcp_cmd        capsule_cmd;
        struct spdk_nvme_tcp_h2c_data_hdr   h2c_data;
        struct spdk_nvme_tcp_ic_resp        ic_resp;
        struct spdk_nvme_tcp_rsp        capsule_resp;
        struct spdk_nvme_tcp_c2h_data_hdr   c2h_data;
        struct spdk_nvme_tcp_r2t_hdr        r2t;

    } hdr;

    bool                        has_hdgst;
    bool                        ddgst_enable;
    uint32_t                    data_digest_crc32;
    uint8_t                     data_digest[SPDK_NVME_TCP_DIGEST_LEN];

    uint8_t                     ch_valid_bytes;
    uint8_t                     psh_valid_bytes;
    uint8_t                     psh_len;

    nvme_tcp_qpair_xfer_complete_cb         cb_fn;
    void                        *cb_arg;

    /* The sock request ends with a 0 length iovec. Place the actual iovec immediately
     * after it. There is a static assert below to check if the compiler inserted
     * any unwanted padding */
    struct spdk_sock_request            sock_req;
    struct iovec                    iov[NVME_TCP_MAX_SGL_DESCRIPTORS * 2];★
    struct iovec                    data_iov[NVME_TCP_MAX_SGL_DESCRIPTORS];★
    uint32_t                    data_iovcnt;
    uint32_t                    data_len;

    uint32_t                    rw_offset;
    TAILQ_ENTRY(nvme_tcp_pdu)           tailq;
    uint32_t                    remaining;
    uint32_t                    padding_len;
    struct spdk_iov_sgl             sgl;

    struct spdk_dif_ctx             *dif_ctx;

    void                        *req; /* data tied to a tcp request */
    void                        *qpair;
    SLIST_ENTRY(nvme_tcp_pdu)           slist;
};
struct spdk_nvmf_request {
    struct spdk_nvmf_qpair      *qpair;
    uint32_t            length;
    uint8_t             xfer; /* type enum spdk_nvme_data_transfer */
    bool                data_from_pool;
    bool                dif_enabled;
    void                *data;
    union nvmf_h2c_msg      *cmd;
    union nvmf_c2h_msg      *rsp;
    STAILQ_ENTRY(spdk_nvmf_request) buf_link;
    uint64_t            timeout_tsc;

    uint32_t            iovcnt;
    struct iovec            iov[NVMF_REQ_MAX_BUFFERS];
    void                *buffers[NVMF_REQ_MAX_BUFFERS];
    struct spdk_nvmf_stripped_data  *stripped_data;

    struct spdk_nvmf_dif_info   dif;

    struct spdk_bdev_io_wait_entry  bdev_io_wait;
    spdk_nvmf_nvme_passthru_cmd_cb  cmd_cb_fn;
    struct spdk_nvmf_request    *first_fused_req;
    struct spdk_nvmf_request    *req_to_abort;
    struct spdk_poller      *poller;
    struct spdk_bdev_io     *zcopy_bdev_io; /* Contains the bdev_io when using ZCOPY */
    enum spdk_nvmf_zcopy_phase  zcopy_phase;

    TAILQ_ENTRY(spdk_nvmf_request)  link;
};

Each of these iov arrays is managed through an sgl structure:

struct spdk_iov_sgl {
    struct iovec    *iov;
    int             iovcnt;
    uint32_t        iov_offset;
    uint32_t        total_size;
};
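
To get a feel for how this structure is used, here is a minimal usage sketch of my own (not SPDK code; it assumes the spdk_iov_sgl_init and spdk_iov_sgl_append definitions quoted later in this post):

#include <sys/uio.h>
#include <stdint.h>

static uint8_t buf_a[4096], buf_b[4096];

void
iov_sgl_example(void)
{
    struct iovec iov[4];
    struct spdk_iov_sgl sgl;

    spdk_iov_sgl_init(&sgl, iov, 4, 0);      /* wrap iov[], nothing to skip */
    spdk_iov_sgl_append(&sgl, buf_a, 4096);  /* fills iov[0] = {buf_a, 4096} */
    spdk_iov_sgl_append(&sgl, buf_b, 4096);  /* fills iov[1] = {buf_b, 4096} */

    /* At this point sgl.total_size == 8192 and two of the four entries
     * are filled (iovcnt counts down, so 4 - sgl.iovcnt == 2). */
}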

Initializing req->iov

Before sending the R2T, nvmf_tcp_req_process allocates receive buffers into req->iov.

static bool
nvmf_tcp_req_process(struct spdk_nvmf_tcp_transport *ttransport,
             struct spdk_nvmf_tcp_req *tcp_req)
{
(..)
        case TCP_REQUEST_STATE_NEED_BUFFER:
            spdk_trace_record(TRACE_TCP_REQUEST_STATE_NEED_BUFFER, tqpair->qpair.qid, 0, (uintptr_t)tcp_req,
                      tqpair);

            assert(tcp_req->req.xfer != SPDK_NVME_DATA_NONE);

            if (!tcp_req->has_in_capsule_data && (&tcp_req->req != STAILQ_FIRST(&group->pending_buf_queue))) {
                SPDK_DEBUGLOG(nvmf_tcp,
                          "Not the first element to wait for the buf for tcp_req(%p) on tqpair=%p\n",
                          tcp_req, tqpair);
                /* This request needs to wait in line to obtain a buffer */
                break;
            }

            /* Try to get a data buffer */
            if (nvmf_tcp_req_parse_sgl(tcp_req, transport, group) < 0) {
                break;
            }
static int
nvmf_tcp_req_parse_sgl(struct spdk_nvmf_tcp_req *tcp_req,
               struct spdk_nvmf_transport *transport,
               struct spdk_nvmf_transport_poll_group *group)
{
    struct spdk_nvmf_request        *req = &tcp_req->req;
    struct spdk_nvme_cmd            *cmd;
    struct spdk_nvme_sgl_descriptor     *sgl;
    struct spdk_nvmf_tcp_poll_group     *tgroup;
    enum spdk_nvme_tcp_term_req_fes     fes;
    struct nvme_tcp_pdu         *pdu;
    struct spdk_nvmf_tcp_qpair      *tqpair;
    uint32_t                length, error_offset = 0;

    cmd = &req->cmd->nvme_cmd;
    sgl = &cmd->dptr.sgl1;

    if (sgl->generic.type == SPDK_NVME_SGL_TYPE_TRANSPORT_DATA_BLOCK &&
        sgl->unkeyed.subtype == SPDK_NVME_SGL_SUBTYPE_TRANSPORT) {
        /* get request length from sgl */
        length = sgl->unkeyed.length;
        if (spdk_unlikely(length > transport->opts.max_io_size)) {
            SPDK_ERRLOG("SGL length 0x%x exceeds max io size 0x%x\n",
                    length, transport->opts.max_io_size);
            fes = SPDK_NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_LIMIT_EXCEEDED;
            goto fatal_err;
        }

        /* fill request length and populate iovs */
        req->length = length;

        SPDK_DEBUGLOG(nvmf_tcp, "Data requested length= 0x%x\n", length);

        if (spdk_unlikely(req->dif_enabled)) {
            req->dif.orig_length = length;
            length = spdk_dif_get_length_with_md(length, &req->dif.dif_ctx);
            req->dif.elba_length = length;
        }

        if (nvmf_ctrlr_use_zcopy(req)) {
            SPDK_DEBUGLOG(nvmf_tcp, "Using zero-copy to execute request %p\n", tcp_req);
            req->data_from_pool = false;
            return 0;
        }

        if (spdk_nvmf_request_get_buffers(req, group, transport, length)) {★Here io_unit_size-sized buffers are allocated into req->iov.
            /* No available buffers. Queue this request up. */
            SPDK_DEBUGLOG(nvmf_tcp, "No available large data buffers. Queueing request %p\n",
                      tcp_req);
            return 0;
        }

        /* backward compatible */
        req->data = req->iov[0].iov_base;

        SPDK_DEBUGLOG(nvmf_tcp, "Request %p took %d buffer/s from central pool, and data=%p\n",
                  tcp_req, req->iovcnt, req->data);

        return 0;
    } else if (sgl->generic.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK &&
           sgl->unkeyed.subtype == SPDK_NVME_SGL_SUBTYPE_OFFSET) {
        uint64_t offset = sgl->address;
        uint32_t max_len = transport->opts.in_capsule_data_size;

        assert(tcp_req->has_in_capsule_data);
        /* Capsule Cmd with In-capsule Data should get data length from pdu header */
        tqpair = tcp_req->pdu->qpair;
        /* receiving pdu is not same with the pdu in tcp_req */
        pdu = tqpair->pdu_in_progress;
        length = pdu->hdr.common.plen - pdu->psh_len - sizeof(struct spdk_nvme_tcp_common_pdu_hdr);
        if (tqpair->host_ddgst_enable) {
            length -= SPDK_NVME_TCP_DIGEST_LEN;
        }
        /* This error is not defined in NVMe/TCP spec, take this error as fatal error */
        if (spdk_unlikely(length != sgl->unkeyed.length)) {
            SPDK_ERRLOG("In-Capsule Data length 0x%x is not equal to SGL data length 0x%x\n",
                    length, sgl->unkeyed.length);
            fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD;
            error_offset = offsetof(struct spdk_nvme_tcp_common_pdu_hdr, plen);
            goto fatal_err;
        }

        SPDK_DEBUGLOG(nvmf_tcp, "In-capsule data: offset 0x%" PRIx64 ", length 0x%x\n",
                  offset, length);

        /* The NVMe/TCP transport does not use ICDOFF to control the in-capsule data offset. ICDOFF should be '0' */
        if (spdk_unlikely(offset != 0)) {
            /* Not defined fatal error in NVMe/TCP spec, handle this error as a fatal error */
            SPDK_ERRLOG("In-capsule offset 0x%" PRIx64 " should be ZERO in NVMe/TCP\n", offset);
            fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER;
            error_offset = offsetof(struct spdk_nvme_tcp_cmd, ccsqe.dptr.sgl1.address);
            goto fatal_err;
        }

        if (spdk_unlikely(length > max_len)) {
            /* According to the SPEC we should support ICD up to 8192 bytes for admin and fabric commands */
            if (length <= SPDK_NVME_TCP_IN_CAPSULE_DATA_MAX_SIZE &&
                (cmd->opc == SPDK_NVME_OPC_FABRIC || req->qpair->qid == 0)) {

                /* Get a buffer from dedicated list */
                SPDK_DEBUGLOG(nvmf_tcp, "Getting a buffer from control msg list\n");
                tgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_tcp_poll_group, group);
                assert(tgroup->control_msg_list);
                req->data = nvmf_tcp_control_msg_get(tgroup->control_msg_list);
                if (!req->data) {
                    /* No available buffers. Queue this request up. */
                    SPDK_DEBUGLOG(nvmf_tcp, "No available ICD buffers. Queueing request %p\n", tcp_req);
                    return 0;
                }
            } else {
                SPDK_ERRLOG("In-capsule data length 0x%x exceeds capsule length 0x%x\n",
                        length, max_len);
                fes = SPDK_NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_LIMIT_EXCEEDED;
                goto fatal_err;
            }
        } else {
            req->data = tcp_req->buf;
        }

        req->length = length;
        req->data_from_pool = false;

        if (spdk_unlikely(req->dif_enabled)) {
            length = spdk_dif_get_length_with_md(length, &req->dif.dif_ctx);
            req->dif.elba_length = length;
        }

        req->iov[0].iov_base = req->data;
        req->iov[0].iov_len = length;
        req->iovcnt = 1;

        return 0;
    }
    /* If we want to handle the problem here, then we can't skip the following data segment.
     * Because this function runs before reading data part, now handle all errors as fatal errors. */
    SPDK_ERRLOG("Invalid NVMf I/O Command SGL:  Type 0x%x, Subtype 0x%x\n",
            sgl->generic.type, sgl->generic.subtype);
    fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER;
    error_offset = offsetof(struct spdk_nvme_tcp_cmd, ccsqe.dptr.sgl1.generic);
fatal_err:
    nvmf_tcp_send_c2h_term_req(tcp_req->pdu->qpair, tcp_req->pdu, fes, error_offset);
    return -1;
}
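
The effect of spdk_nvmf_request_get_buffers can be pictured roughly as follows. This is a simplified sketch under my own assumptions, not the actual implementation; buffer_pool_get is a hypothetical stand-in for the real logic, which pulls buffers from the poll group's cache and falls back to the transport's buffer pool:

/* Sketch: split a request of `length` bytes into io_unit_size buffers
 * across req->iov. Hypothetical; not the real SPDK code. */
static int
get_buffers_sketch(struct spdk_nvmf_request *req, uint32_t length,
                   uint32_t io_unit_size)
{
    uint32_t remaining = length;

    req->iovcnt = 0;
    while (remaining > 0) {
        void *buf = buffer_pool_get(io_unit_size); /* hypothetical pool getter */
        if (buf == NULL) {
            return -ENOMEM; /* no buffers: the caller queues the request */
        }
        req->buffers[req->iovcnt] = buf;
        req->iov[req->iovcnt].iov_base = buf;
        req->iov[req->iovcnt].iov_len = spdk_min(remaining, io_unit_size);
        remaining -= req->iov[req->iovcnt].iov_len;
        req->iovcnt++;
    }
    req->data_from_pool = true;
    return 0;
}

With io_unit_size = 8 KiB, a 32 KiB write would therefore end up with req->iovcnt = 4 and four 8 KiB pool buffers in req->iov.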

Buffer management during H2C Data processing

During PSH processing, nvmf_tcp_h2c_data_hdr_handle assigns the buffers previously allocated in req->iov to pdu->data_iov, so the PDU can access the already-allocated buffer region through pdu->data_iov. The assignment is done by nvme_tcp_pdu_set_data_buf.

static int
nvmf_tcp_sock_process(struct spdk_nvmf_tcp_qpair *tqpair)
{
(..)
        case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PSH:
            rc = nvme_tcp_read_data(tqpair->sock,
                        pdu->psh_len - pdu->psh_valid_bytes,
                        (void *)&pdu->hdr.raw + sizeof(struct spdk_nvme_tcp_common_pdu_hdr) + pdu->psh_valid_bytes);
            if (rc < 0) {
                return NVME_TCP_PDU_FATAL;
            } else if (rc > 0) {
                spdk_trace_record(TRACE_TCP_READ_FROM_SOCKET_DONE, tqpair->qpair.qid, rc, 0, tqpair);
                pdu->psh_valid_bytes += rc;
            }

            if (pdu->psh_valid_bytes < pdu->psh_len) {
                return NVME_TCP_PDU_IN_PROGRESS;
            }

            /* All headers (ch, psh, header digest) of this PDU have now been read from the socket. */
            nvmf_tcp_pdu_psh_handle(tqpair, ttransport);★Here the buffer region for the H2C data is assigned.
            break;
(..)
        case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD:
            /* check whether the data is valid, if not we just return */
            if (!pdu->data_len) {
                return NVME_TCP_PDU_IN_PROGRESS;
            }

            data_len = pdu->data_len;
            /* data digest */
            if (spdk_unlikely((pdu->hdr.common.pdu_type != SPDK_NVME_TCP_PDU_TYPE_H2C_TERM_REQ) &&
                      tqpair->host_ddgst_enable)) {
                data_len += SPDK_NVME_TCP_DIGEST_LEN;
                pdu->ddgst_enable = true;
            }


            rc = nvme_tcp_read_payload_data(tqpair->sock, pdu);★Read the received payload data here.
            if (rc < 0) {
                return NVME_TCP_PDU_FATAL;
            }
            pdu->rw_offset += rc;

            if (pdu->rw_offset < data_len) {
                return NVME_TCP_PDU_IN_PROGRESS;
            }

            /* Generate and insert DIF to whole data block received if DIF is enabled */
            if (spdk_unlikely(pdu->dif_ctx != NULL) &&
                spdk_dif_generate_stream(pdu->data_iov, pdu->data_iovcnt, 0, data_len,
                             pdu->dif_ctx) != 0) {
                SPDK_ERRLOG("DIF generate failed\n");
                return NVME_TCP_PDU_FATAL;
            }

            /* All of this PDU has now been read from the socket. */
            nvmf_tcp_pdu_payload_handle(tqpair, pdu);
            break;
static void
nvmf_tcp_h2c_data_hdr_handle(struct spdk_nvmf_tcp_transport *ttransport,
                 struct spdk_nvmf_tcp_qpair *tqpair,
                 struct nvme_tcp_pdu *pdu)
{
    struct spdk_nvmf_tcp_req *tcp_req;
    uint32_t error_offset = 0;
    enum spdk_nvme_tcp_term_req_fes fes = 0;
    struct spdk_nvme_tcp_h2c_data_hdr *h2c_data;

    h2c_data = &pdu->hdr.h2c_data;

    SPDK_DEBUGLOG(nvmf_tcp, "tqpair=%p, r2t_info: datao=%u, datal=%u, cccid=%u, ttag=%u\n",
              tqpair, h2c_data->datao, h2c_data->datal, h2c_data->cccid, h2c_data->ttag);

    if (h2c_data->ttag > tqpair->resource_count) {
        SPDK_DEBUGLOG(nvmf_tcp, "ttag %u is larger than allowed %u.\n", h2c_data->ttag,
                  tqpair->resource_count);
        fes = SPDK_NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR;
        error_offset = offsetof(struct spdk_nvme_tcp_h2c_data_hdr, ttag);
        goto err;
    }

    tcp_req = &tqpair->reqs[h2c_data->ttag - 1];★Look up the req structure managed by the qpair from the H2C ttag.

    if (spdk_unlikely(tcp_req->state != TCP_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER &&
              tcp_req->state != TCP_REQUEST_STATE_AWAITING_R2T_ACK)) {
        SPDK_DEBUGLOG(nvmf_tcp, "tcp_req(%p), tqpair=%p, has error state in %d\n", tcp_req, tqpair,
                  tcp_req->state);
        fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD;
        error_offset = offsetof(struct spdk_nvme_tcp_h2c_data_hdr, ttag);
        goto err;
    }

    if (spdk_unlikely(tcp_req->req.cmd->nvme_cmd.cid != h2c_data->cccid)) {
        SPDK_DEBUGLOG(nvmf_tcp, "tcp_req(%p), tqpair=%p, expected %u but %u for cccid.\n", tcp_req, tqpair,
                  tcp_req->req.cmd->nvme_cmd.cid, h2c_data->cccid);
        fes = SPDK_NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR;
        error_offset = offsetof(struct spdk_nvme_tcp_h2c_data_hdr, cccid);
        goto err;
    }

    if (tcp_req->h2c_offset != h2c_data->datao) {
        SPDK_DEBUGLOG(nvmf_tcp,
                  "tcp_req(%p), tqpair=%p, expected data offset %u, but data offset is %u\n",
                  tcp_req, tqpair, tcp_req->h2c_offset, h2c_data->datao);
        fes = SPDK_NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE;
        goto err;
    }

    if ((h2c_data->datao + h2c_data->datal) > tcp_req->req.length) {
        SPDK_DEBUGLOG(nvmf_tcp,
                  "tcp_req(%p), tqpair=%p,  (datao=%u + datal=%u) exceeds requested length=%u\n",
                  tcp_req, tqpair, h2c_data->datao, h2c_data->datal, tcp_req->req.length);
        fes = SPDK_NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE;
        goto err;
    }

    pdu->req = tcp_req;

    if (spdk_unlikely(tcp_req->req.dif_enabled)) {
        pdu->dif_ctx = &tcp_req->req.dif.dif_ctx;
    }

    nvme_tcp_pdu_set_data_buf(pdu, tcp_req->req.iov, tcp_req->req.iovcnt,
              h2c_data->datao, h2c_data->datal);★Assign tcp_req->req.iov to pdu->data_iov.

    nvmf_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD);
    return;

err:
    nvmf_tcp_send_c2h_term_req(tqpair, pdu, fes, error_offset);
}
static void
nvme_tcp_pdu_set_data_buf(struct nvme_tcp_pdu *pdu,
              struct iovec *iov /* tcp_req->req.iov */, int iovcnt /* tcp_req->req.iovcnt */,
              uint32_t data_offset /* = h2c_data->datao */, uint32_t data_len /* = h2c_data->datal */)
{  
    uint32_t buf_offset, buf_len, remain_len, len;
    uint8_t *buf;
    struct spdk_iov_sgl *pdu_sgl, buf_sgl;

    pdu->data_len = data_len; ★Assign h2c_data->datal to pdu->data_len.

    if (spdk_likely(!pdu->dif_ctx)) {
        buf_offset = data_offset;
        buf_len = data_len;
    } else {
        spdk_dif_ctx_set_data_offset(pdu->dif_ctx, data_offset);
        spdk_dif_get_range_with_md(data_offset, data_len,
                       &buf_offset, &buf_len, pdu->dif_ctx);
    }
   
    if (iovcnt == 1) {
        _nvme_tcp_pdu_set_data(pdu, (void *)((uint64_t)iov[0].iov_base + buf_offset), buf_len);
    } else {
        pdu_sgl = &pdu->sgl;
   
        spdk_iov_sgl_init(pdu_sgl, pdu->data_iov, NVME_TCP_MAX_SGL_DESCRIPTORS, 0);★Initialize pdu_sgl with pdu->data_iov.
        spdk_iov_sgl_init(&buf_sgl, iov, iovcnt, 0);

        spdk_iov_sgl_advance(&buf_sgl, buf_offset);
        remain_len = buf_len;

        while (remain_len > 0) {
            _nvme_tcp_sgl_get_buf(&buf_sgl, (void *)&buf, &len);
            len = spdk_min(len, remain_len);

            spdk_iov_sgl_advance(&buf_sgl, len);
            remain_len -= len;

            if (!spdk_iov_sgl_append(pdu_sgl, buf, len)) {
                break;
            }
        }

        assert(remain_len == 0);
        assert(pdu_sgl->total_size == buf_len);

        pdu->data_iovcnt = NVME_TCP_MAX_SGL_DESCRIPTORS - pdu_sgl->iovcnt;
    }
}
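
To make the carving concrete, here is a worked example with my own numbers, assuming io_unit_size = 8 KiB, no DIF, and an H2C PDU with datao = 8192 and datal = 16384:

/*
 * req->iov[0] = {buf0, 8192}  <- skipped by spdk_iov_sgl_advance(&buf_sgl, 8192)
 * req->iov[1] = {buf1, 8192}  <- appended: pdu->data_iov[0] = {buf1, 8192}
 * req->iov[2] = {buf2, 8192}  <- appended: pdu->data_iov[1] = {buf2, 8192}
 * req->iov[3] = {buf3, 8192}  <- not touched by this PDU
 *
 * Result: pdu->data_len = 16384, pdu->data_iovcnt = 2, so the payload of
 * this H2C PDU lands directly in buf1 and buf2.
 */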
/**
 * Initialize struct spdk_iov_sgl with iov, iovcnt and iov_offset.
 *
 * \param s the spdk_iov_sgl to be filled.
 * \param iov the io vector to fill the s
 * \param iovcnt the size the iov
 * \param iov_offset the current filled iov_offset for s.
 */

static inline void
spdk_iov_sgl_init(struct spdk_iov_sgl *s, struct iovec *iov, int iovcnt,
          uint32_t iov_offset)
{
    s->iov = iov;
    s->iovcnt = iovcnt;
    s->iov_offset = iov_offset;
    s->total_size = 0;
}

Receiving payload data

static int
nvme_tcp_read_payload_data(struct spdk_sock *sock /* = tqpair->sock */, struct nvme_tcp_pdu *pdu)
{
    struct iovec iov[NVME_TCP_MAX_SGL_DESCRIPTORS + 1];
    int iovcnt;

    iovcnt = nvme_tcp_build_payload_iovs(iov, NVME_TCP_MAX_SGL_DESCRIPTORS + 1, pdu,
                         pdu->ddgst_enable, NULL);★Build the iov.
    assert(iovcnt >= 0);

    return nvme_tcp_readv_data(sock, iov, iovcnt);★Receive the data.
}
static int
nvme_tcp_build_payload_iovs(struct iovec *iov /* = iov[] on the caller's stack */, int iovcnt /* = NVME_TCP_MAX_SGL_DESCRIPTORS + 1 */, struct nvme_tcp_pdu *pdu,
                bool ddgst_enable, uint32_t *_mapped_length)
{
    struct spdk_iov_sgl *sgl;

    if (iovcnt == 0) {
        return 0;
    }

    sgl = &pdu->sgl;
    spdk_iov_sgl_init(sgl, iov, iovcnt, pdu->rw_offset);

    if (spdk_likely(!pdu->dif_ctx)) {
        if (!_nvme_tcp_sgl_append_multi(sgl, pdu->data_iov, pdu->data_iovcnt)) {
            goto end;
        }
    } else {
        if (!_nvme_tcp_sgl_append_multi_with_md(sgl, pdu->data_iov, pdu->data_iovcnt,
                            pdu->data_len, pdu->dif_ctx)) {
            goto end;
        }
    }

    /* Data Digest */
    if (ddgst_enable) {
        spdk_iov_sgl_append(sgl, pdu->data_digest, SPDK_NVME_TCP_DIGEST_LEN);
    }

end:
    if (_mapped_length != NULL) {
        *_mapped_length = sgl->total_size;
    }
    return iovcnt - sgl->iovcnt;
}
static inline bool
_nvme_tcp_sgl_append_multi(struct spdk_iov_sgl *s, struct iovec *iov /* = pdu->data_iov */, int iovcnt /* = pdu->data_iovcnt */)
{
    int i;

    for (i = 0; i < iovcnt; i++) {
        if (!spdk_iov_sgl_append(s, iov[i].iov_base, iov[i].iov_len)) {
            return false;
        }
    }

    return true;
}
/** 
 * Append the data to the struct spdk_iov_sgl pointed by s
 *
 * \param s the address of the struct spdk_iov_sgl
 * \param data the data buffer to be appended
 * \param data_len the length of the data.
 *
 * \return true if all the data is appended.
 */

static inline bool
spdk_iov_sgl_append(struct spdk_iov_sgl *s, uint8_t *data, uint32_t data_len)
{
    if (s->iov_offset >= data_len) {
        s->iov_offset -= data_len;
    } else {
        assert(s->iovcnt > 0);
        s->iov->iov_base = data + s->iov_offset;
        s->iov->iov_len = data_len - s->iov_offset;
        s->total_size += data_len - s->iov_offset;
        s->iov_offset = 0;
        s->iov++;
        s->iovcnt--;
        if (s->iovcnt == 0) {
            return false;
        }
    }

    return true;
}
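
Note how iov_offset makes partial reads resumable: nvme_tcp_build_payload_iovs initializes the sgl with pdu->rw_offset, so bytes received on a previous pass are skipped when the iovs are rebuilt. A worked example with my own numbers (two 8192-byte entries in pdu->data_iov, 10000 bytes already received, so rw_offset = 10000):

/*
 * spdk_iov_sgl_init(sgl, iov, iovcnt, 10000);
 * append(buf0, 8192): iov_offset (10000) >= 8192 -> skip buf0, iov_offset = 1808
 * append(buf1, 8192): iov_offset (1808)  <  8192 -> iov[0] = {buf1 + 1808, 6384}
 *
 * The next spdk_sock_readv() therefore continues exactly where the previous
 * partial read left off.
 */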
static int
nvme_tcp_readv_data(struct spdk_sock *sock, struct iovec *iov, int iovcnt)
{
    int ret;

    assert(sock != NULL);
    if (iov == NULL || iovcnt == 0) {
        return 0;
    }

    if (iovcnt == 1) {
        return nvme_tcp_read_data(sock, iov->iov_len, iov->iov_base);
    }

    ret = spdk_sock_readv(sock, iov, iovcnt);
    
    if (ret > 0) {
        return ret;
    }

    if (ret < 0) {
        if (errno == EAGAIN || errno == EWOULDBLOCK) {
            return 0;
        }

        /* For connect reset issue, do not output error log */
        if (errno != ECONNRESET) {
            SPDK_ERRLOG("spdk_sock_readv() failed, errno %d: %s\n",
                    errno, spdk_strerror(errno));
        }
    }

    /* connection closed */
    return NVME_TCP_CONNECTION_FATAL;
}

■ Original behavior
① While processing the capsule command PDU: nvmf_tcp_req_parse_sgl prepares the required number of io_unit_size buffers for tcp_req via spdk_nvmf_request_get_buffers.
② On receiving an H2C Data PDU: nvmf_tcp_h2c_data_hdr_handle calls nvme_tcp_pdu_set_data_buf, which initializes pdu->sgl over pdu->data_iov and buf_sgl over tcp_req's iov (both via spdk_iov_sgl_init), then assigns tcp_req's buffer region to the PDU.※
③ case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD → nvmf_tcp_h2c_data_payload_handle.
④ Once all the data has been received, nvmf_tcp_h2c_data_payload_handle moves the request to TCP_REQUEST_STATE_READY_TO_EXECUTE.
 → the data is copied from the kernel receive buffer into the PDU's buffers (in reality tcp_req's buffers).
If the data has only been partially received, processing returns to ③.

※ What is an sgl?
struct spdk_iov_sgl {
    struct iovec *iov;    /* the I/O vector */
    int iovcnt;           /* number of elements in the I/O vector */
    uint32_t iov_offset;  /* offset filled so far into the I/O vector */
    uint32_t total_size;  /* total size of the data in the I/O vector */
};

② allocated buf (8 KB) ← copy
② allocated buf (8 KB) ← copy
② allocated buf (8 KB) ← copy
② allocated buf (8 KB) ← copy
