Files
mckernel/test/rusage/verbs/post.c
2017-09-20 19:48:32 +09:00

454 lines
12 KiB
C
Executable File

#include <stdio.h>
#include <stdlib.h>
#include <memory.h>
#include "ibcomm.h"
#include "debug.h"
/*
 * Debug tracing switch for this file.
 * Uncomment DEBUG_POST to make dprintf() an alias of printf(); otherwise
 * dprintf(...) expands to nothing, so its arguments are not evaluated.
 */
//#define DEBUG_POST
#ifdef DEBUG_POST
#define dprintf printf
#else
#define dprintf(...)
#endif
/*
 * Read the x86-64 time-stamp counter, bracketed by CPUID instructions so
 * that the read is not executed before preceding code, and following code
 * is not executed before the read.
 *
 * Returns the 64-bit counter value (EDX:EAX combined).
 *
 * The counter halves are taken through explicit "=a"/"=d" output operands,
 * so the compiler knows that both EAX and EDX are written by RDTSC.
 */
static unsigned long rdtsc(void) {
	unsigned int lo, hi;

	/* rdtsc cannot be executed earlier than this */
	__asm__ __volatile__("xorl %%eax, %%eax; cpuid;" : : : "%rax", "%rbx", "%rcx", "%rdx");
	/* RDTSC writes the low 32 bits to EAX and the high 32 bits to EDX */
	__asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi) : : "memory");
	/* following instructions cannot be executed earlier than this */
	__asm__ __volatile__("xorl %%eax, %%eax; cpuid;" : : : "%rax", "%rbx", "%rcx", "%rdx");

	return ((unsigned long)hi << 32) | lo;
}
/*
 * Upper bound (10^12) on the number of ibv_poll_cq() attempts poll_cq()
 * makes before giving up. The constant has type unsigned long long and is
 * larger than INT_MAX.
 */
#define MAX_POLL_TIME (1000000ULL * 1000000)
/*
 * wr_id -> tag lookup tables, one for send work requests and one for
 * receive work requests.
 *
 * The tables are fixed-size while callers derive wr_id from counters that
 * only ever grow, so every access goes through wr_id_tag_slot(), which
 * wraps the id into the table. A stored tag therefore stays retrievable
 * until WR_ID_TAG_MAP_SIZE newer ids have been recorded in the same table.
 */
enum { WR_ID_TAG_MAP_SIZE = 1000 };

int swr_id_tag_map[WR_ID_TAG_MAP_SIZE];
int rwr_id_tag_map[WR_ID_TAG_MAP_SIZE];

/* Fold any wr_id (including negative or >= table size) into a valid index. */
static unsigned int wr_id_tag_slot(int wr_id) {
	return (unsigned int)wr_id % (unsigned int)WR_ID_TAG_MAP_SIZE;
}

/* Remember the tag associated with send work request wr_id. */
void put_swr_id_tag(int wr_id, int tag){
	swr_id_tag_map[wr_id_tag_slot(wr_id)] = tag;
}

/* Return the tag last recorded for send work request wr_id. */
int get_swr_id_tag(int wr_id){
	return swr_id_tag_map[wr_id_tag_slot(wr_id)];
}

/* Remember the tag associated with receive work request wr_id. */
void put_rwr_id_tag(int wr_id, int tag){
	rwr_id_tag_map[wr_id_tag_slot(wr_id)] = tag;
}

/* Return the tag last recorded for receive work request wr_id. */
int get_rwr_id_tag(int wr_id){
	return rwr_id_tag_map[wr_id_tag_slot(wr_id)];
}
/*
 * Post one signaled send work request that covers the whole buffer
 * described by mrinfo.
 *
 * qpinfo            queue pair to post on; its sr_num counter is incremented
 *                   and the new value becomes the work request id
 * mrinfo            registered buffer used as the single scatter/gather entry
 * opcode            verbs opcode placed in the work request
 * tag               value recorded against the work request id via
 *                   put_swr_id_tag()
 * remote_conn_info  supplies the remote address and rkey; read only when
 *                   opcode is not IBV_WR_SEND
 * imm_data          immediate data copied into the work request
 *
 * Returns 0 on success, IBCOMM_ERR_CODE when ibv_post_send() fails.
 */
int post_send_req(qpinfo_t *qpinfo, mrinfo_t *mrinfo, int opcode, int tag, qp_conn_info_t* remote_conn_info, uint32_t imm_data){
	struct ibv_sge sg_entry;
	struct ibv_send_wr send_wr;
	struct ibv_send_wr *rejected = NULL;
	int post_rc;

	/* One scatter/gather element spanning the entire buffer */
	sg_entry.addr = (uintptr_t)mrinfo->buf;
	sg_entry.length = mrinfo->buf_size;
	sg_entry.lkey = mrinfo->mr->lkey;

	/* Build the send request from a zeroed structure */
	memset(&send_wr, 0, sizeof(send_wr));
	send_wr.next = NULL;
	send_wr.wr_id = ++qpinfo->sr_num;
	send_wr.sg_list = &sg_entry;
	send_wr.num_sge = 1;
	send_wr.opcode = opcode;
	send_wr.imm_data = imm_data;
	send_wr.send_flags = IBV_SEND_SIGNALED;

	/* Inline is requested for everything but RDMA reads, when the buffer fits */
	if (!(opcode == IBV_WR_RDMA_READ || mrinfo->buf_size > qpinfo->max_inline_data)) {
		send_wr.send_flags |= IBV_SEND_INLINE;
	}

	put_swr_id_tag(send_wr.wr_id, tag);

	/* Every opcode other than a plain send gets the remote address and key */
	if (opcode != IBV_WR_SEND) {
		send_wr.wr.rdma.remote_addr = remote_conn_info->addr;
		send_wr.wr.rdma.rkey = remote_conn_info->rkey;
	}

	/* Hand the request to the send queue */
	post_rc = ibv_post_send(qpinfo->qp, &send_wr, &rejected);
	if (post_rc == 0) {
		return 0;
	}

	error_perror("ibv_post_send");
	error_printf("ibv_post_send return %d\n", post_rc);
	return IBCOMM_ERR_CODE;
}
/*
 * Post one signaled send work request covering the whole buffer described
 * by mrinfo. For RDMA write opcodes the remote target is
 * remote_conn_info->addr + IBCOM_RDMABUF_SZSEG * seq_num.
 *
 * qpinfo            queue pair to post on
 * mrinfo            registered buffer used as the single scatter/gather entry
 * opcode            verbs opcode placed in the work request
 * remote_conn_info  remote address and rkey; read only for
 *                   IBV_WR_RDMA_WRITE and IBV_WR_RDMA_WRITE_WITH_IMM
 * imm_data          immediate data copied into the work request
 * seq_num           segment index used to offset the remote address
 *
 * The work request id is fixed at 0 and no tag is recorded for it.
 *
 * Returns 0 on success, IBCOMM_ERR_CODE when ibv_post_send() fails.
 */
int post_send_req2(qpinfo_t *qpinfo, mrinfo_t *mrinfo, int opcode, qp_conn_info_t* remote_conn_info, uint32_t imm_data, uint32_t seq_num) {
	struct ibv_send_wr sr, *bad_wr = NULL;
	struct ibv_sge sge[1];
	int ret = 0;

	/* prepare sge */
	sge[0].addr = (uintptr_t)mrinfo->buf;
	sge[0].length = mrinfo->buf_size;
	sge[0].lkey = mrinfo->mr->lkey;
	dprintf("post_send_req2,sge[0].addr=%lx,sz=%d\n", (unsigned long)sge[0].addr, sge[0].length);

	/*
	 * Prepare the work request. It is zeroed first so that every field not
	 * assigned below (in particular wr.rdma for opcodes other than the two
	 * RDMA writes) has a defined value when handed to ibv_post_send().
	 */
	memset(&sr, 0, sizeof(sr));
	sr.next = NULL;
	sr.wr_id = 0;
	sr.sg_list = sge;
	sr.num_sge = 1;
	sr.opcode = opcode;
	sr.imm_data = imm_data;
	sr.send_flags = IBV_SEND_SIGNALED;
	if(opcode != IBV_WR_RDMA_READ && mrinfo->buf_size <= qpinfo->max_inline_data) {
		sr.send_flags |= IBV_SEND_INLINE;
	}

	if(opcode == IBV_WR_RDMA_WRITE || opcode == IBV_WR_RDMA_WRITE_WITH_IMM) {
		sr.wr.rdma.remote_addr = remote_conn_info->addr + IBCOM_RDMABUF_SZSEG * seq_num;
		sr.wr.rdma.rkey = remote_conn_info->rkey;
		dprintf("post_send_req2,raddr=%lx\n", sr.wr.rdma.remote_addr);
	}

	ret = ibv_post_send(qpinfo->qp, &sr, &bad_wr);
	if(ret){
		printf("ibv_post_send return %d\n", ret);
		return IBCOMM_ERR_CODE;
	}
	return 0;
}
/*
 * Post a chain of NCHAIN linked, signaled, inline work requests with a
 * single ibv_post_send() call.
 *
 * Request i sends IBCOM_INLINE_DATA bytes starting at
 *   mrinfo->buf + IBCOM_INLINE_DATA * i
 * to the remote address
 *   remote_conn_info->addr + IBCOM_INLINE_DATA * NCHAIN * seq_num
 *                          + IBCOM_INLINE_DATA * i.
 *
 * qpinfo            queue pair to post on
 * mrinfo            registered local buffer holding the NCHAIN pieces
 * opcode            opcode used for every request while SKIP_POLL_RCQ is
 *                   defined (it is defined inside this function)
 * remote_conn_info  remote base address and rkey
 * imm_data          immediate data copied into every request
 * seq_num           index of the remote NCHAIN-sized slot to write
 *
 * Returns ibcom_errno, which starts at 0. NOTE(review): the failure value
 * is set by IBCOM_ERR_CHKANDJUMP, defined outside this file - presumably it
 * stores -1 and jumps to fn_fail when ib_errno is non-zero; confirm.
 */
int ibcom_isend_chain(qpinfo_t *qpinfo, mrinfo_t *mrinfo, int opcode, qp_conn_info_t* remote_conn_info, uint32_t imm_data, uint32_t seq_num) {
	int ibcom_errno = 0;
	int ib_errno;
	int i;
	struct ibv_send_wr sr[NCHAIN], *bad_wr = NULL;
	struct ibv_sge sge[NCHAIN];

	/*
	 * Zero all requests up front so that fields not assigned in the loop
	 * (wr_id among them) are 0 rather than indeterminate stack contents.
	 */
	memset(sr, 0, sizeof(sr));

	for(i = 0; i < NCHAIN; i++) {
		sge[i].addr = (uintptr_t)mrinfo->buf + IBCOM_INLINE_DATA * i;
		sge[i].length = IBCOM_INLINE_DATA;
		sge[i].lkey = mrinfo->mr->lkey;

		/* Link each request to the next; the last one terminates the chain */
		sr[i].next = (i == NCHAIN - 1) ? NULL : &sr[i+1];
		sr[i].sg_list = &sge[i];
		sr[i].num_sge = 1;
#define SKIP_POLL_RCQ
#ifdef SKIP_POLL_RCQ /* if you want all to be IBV_WR_RDMA_WRITE */
		sr[i].opcode = opcode;
#else
		sr[i].opcode = (i == NCHAIN - 1) ? IBV_WR_RDMA_WRITE_WITH_IMM : IBV_WR_RDMA_WRITE;
#endif
		sr[i].imm_data = imm_data;
		sr[i].send_flags = IBV_SEND_SIGNALED | IBV_SEND_INLINE;
		sr[i].wr.rdma.remote_addr = remote_conn_info->addr + IBCOM_INLINE_DATA * NCHAIN * seq_num + IBCOM_INLINE_DATA * i;
		sr[i].wr.rdma.rkey = remote_conn_info->rkey;
	}

	ib_errno = ibv_post_send(qpinfo->qp, &sr[0], &bad_wr);
	IBCOM_ERR_CHKANDJUMP(ib_errno, -1, printf("ibv_post_send\n"));

fn_exit:
	return ibcom_errno;
fn_fail:
	goto fn_exit;
}
/*
 * Post one signaled, inline work request that sends IBCOM_INLINE_DATA bytes
 * from mrinfo->buf + offset to the remote address
 * remote_conn_info->addr + IBCOM_INLINE_DATA * seq_num.
 *
 * qpinfo            queue pair to post on
 * mrinfo            registered local buffer
 * opcode            verbs opcode placed in the work request
 * remote_conn_info  remote base address and rkey
 * imm_data          immediate data copied into the work request
 * seq_num           index of the remote IBCOM_INLINE_DATA-sized slot
 * offset            byte offset into the local buffer
 *
 * Returns ibcom_errno, which starts at 0. NOTE(review): the failure value
 * is set by IBCOM_ERR_CHKANDJUMP, defined outside this file - presumably it
 * stores -1 and jumps to fn_fail when ib_errno is non-zero; confirm.
 */
int post_send_req4(qpinfo_t *qpinfo, mrinfo_t *mrinfo, int opcode, qp_conn_info_t* remote_conn_info, uint32_t imm_data, uint32_t seq_num, uint32_t offset) {
	int ibcom_errno = 0;
	int ib_errno;
	struct ibv_send_wr sr, *bad_wr = NULL;
	struct ibv_sge sge[1];

	sge[0].addr = (uintptr_t)mrinfo->buf + offset;
	sge[0].length = IBCOM_INLINE_DATA;
	sge[0].lkey = mrinfo->mr->lkey;

	/*
	 * Start from a zeroed request so that fields not assigned below
	 * (wr_id among them) are 0 rather than indeterminate stack contents.
	 */
	memset(&sr, 0, sizeof(sr));
	sr.next = NULL;
	sr.sg_list = sge;
	sr.num_sge = 1;
	sr.opcode = opcode;
	sr.imm_data = imm_data;
	sr.send_flags = IBV_SEND_SIGNALED | IBV_SEND_INLINE;
	sr.wr.rdma.remote_addr = remote_conn_info->addr + IBCOM_INLINE_DATA * seq_num;
	sr.wr.rdma.rkey = remote_conn_info->rkey;

	ib_errno = ibv_post_send(qpinfo->qp, &sr, &bad_wr);
	IBCOM_ERR_CHKANDJUMP(ib_errno, -1, printf("ibv_post_send\n"));

fn_exit:
	return ibcom_errno;
fn_fail:
	goto fn_exit;
}
/*
 * Post one signaled send work request on a UD queue pair.
 *
 * The first 40 bytes of the buffer are skipped: the scatter/gather entry
 * starts at mrinfo->buf + 40 and is 40 bytes shorter than the buffer.
 * NOTE(review): 40 presumably matches the header area that the receive
 * side leaves in front of the payload - confirm.
 *
 * qpinfo            queue pair to post on
 * mrinfo            registered buffer; buf_size must be greater than 40
 * opcode            verbs opcode placed in the work request
 * remote_conn_info  supplies the remote QP number and qkey
 * ah                address handle for the destination
 *
 * Returns 0 on success, -1 if the buffer is too short, IBCOMM_ERR_CODE when
 * ibv_post_send() fails.
 */
int post_send_req_ud(qpinfo_t *qpinfo, mrinfo_t *mrinfo, int opcode, qp_conn_info_ud_t* remote_conn_info, struct ibv_ah *ah) {
	struct ibv_send_wr sr, *bad_wr = NULL;
	struct ibv_sge sge[1];
	int ibcom_errno = 0, ib_errno;

	/* Create sge */
	/* addr to addr + length - 1 will be on the payload */
	if(mrinfo->buf_size <= 40) { printf("buf_size too short\n"); ibcom_errno = -1; goto fn_fail; }
	sge[0].addr = (uintptr_t)mrinfo->buf + 40;
	sge[0].length = mrinfo->buf_size - 40;
	sge[0].lkey = mrinfo->mr->lkey;

	/*
	 * Create a SR from a zeroed structure so that fields not assigned
	 * below (imm_data among them) are 0 rather than indeterminate.
	 */
	memset(&sr, 0, sizeof(sr));
	sr.next = NULL;
	sr.wr_id = 0;
	sr.sg_list = sge;
	sr.num_sge = 1;
	sr.opcode = opcode;
	sr.send_flags = IBV_SEND_SIGNALED;
#if 0
	if(mrinfo->buf_size <= qpinfo->max_inline_data){
		sr.send_flags |= IBV_SEND_INLINE;
	}
#endif
	sr.wr.ud.ah = ah;
	sr.wr.ud.remote_qpn = remote_conn_info->qp_num;
	sr.wr.ud.remote_qkey = remote_conn_info->qkey;
	dprintf("ibv_post_send,qpn=%08x,qkey=%08x\n", sr.wr.ud.remote_qpn, sr.wr.ud.remote_qkey);

	ib_errno = ibv_post_send(qpinfo->qp, &sr, &bad_wr);
	if(ib_errno) {
		error_perror("ibv_post_send");
		printf("ib_errno=%d\n", ib_errno);
		ibcom_errno = IBCOMM_ERR_CODE;
		goto fn_fail;
	}

fn_exit:
	return ibcom_errno;
fn_fail:
	goto fn_exit;
}
/*
 * Post one receive work request whose single scatter/gather entry covers
 * the whole buffer described by mrinfo.
 *
 * qpinfo  queue pair to post on; its rr_num counter is incremented and the
 *         new value becomes the work request id
 * mrinfo  registered buffer that receives the data
 * tag     value recorded against the work request id via put_rwr_id_tag()
 *
 * The request and its scatter/gather entry live on the stack: neither is
 * referenced by this function after ibv_post_recv() returns, so no heap
 * allocation (and no allocation-failure path) is needed.
 *
 * Returns 0 on success, IBCOMM_ERR_CODE when ibv_post_recv() fails.
 */
int post_recv_req(qpinfo_t *qpinfo, mrinfo_t *mrinfo, int tag){
	struct ibv_sge sge;
	struct ibv_recv_wr rr;
	struct ibv_recv_wr *bad_wr = NULL;
	int ret;

	/* Prepare scatter/gather entry */
	memset(&sge, 0, sizeof(sge));
	sge.addr = (uintptr_t)mrinfo->buf;
	sge.length = mrinfo->buf_size;
	sge.lkey = mrinfo->mr->lkey;

	/* Create RR */
	memset(&rr, 0, sizeof(rr));
	rr.next = NULL;
	rr.wr_id = ++qpinfo->rr_num;
	rr.sg_list = &sge;
	rr.num_sge = 1;
	put_rwr_id_tag(rr.wr_id, tag);

	/* Post RR to RQ */
	ret = ibv_post_recv(qpinfo->qp, &rr, &bad_wr);
	dprintf("ibv_post_recv ret=%d\n", ret);
	if(ret){
		return IBCOMM_ERR_CODE;
	}
	return 0;
}
/*
 * Post a receive work request that carries no scatter/gather entries.
 *
 * qpinfo  queue pair to post on
 * wr_id   identifier stored in the work request
 *
 * Returns ibcom_errno, which starts at 0. NOTE(review): the failure value
 * is set by IBCOM_ERR_CHKANDJUMP, defined outside this file - presumably it
 * stores -1 and jumps to fn_fail when ib_errno is non-zero; confirm.
 */
int ibcom_irecv(qpinfo_t *qpinfo, uint64_t wr_id){
	int ibcom_errno = 0;
	int ib_errno;
	struct ibv_recv_wr *rejected;
	/* Buffer-less receive request: no next request, no SGE list */
	struct ibv_recv_wr recv_wr = {
		.wr_id = wr_id,
		.next = NULL,
		.sg_list = NULL,
		.num_sge = 0,
	};

	/* Hand the request to the receive queue */
	ib_errno = ibv_post_recv(qpinfo->qp, &recv_wr, &rejected);
	IBCOM_ERR_CHKANDJUMP(ib_errno, -1, printf("ibv_post_recv\n"));

fn_exit:
	return ibcom_errno;
fn_fail:
	goto fn_exit;
}
/*
 * Post one receive work request for a UD queue pair, using the whole buffer
 * described by mrinfo as the single scatter/gather entry.
 *
 * Per the original author's note, bytes 0..39 of the buffer are not filled
 * with payload; payload occupies offset 40 up to the end of the buffer.
 * Buffers of 40 bytes or fewer are therefore rejected.
 *
 * qpinfo  queue pair to post on
 * mrinfo  registered buffer that receives the data
 * wr_id   identifier stored in the work request
 *
 * Returns 0 on success, -1 if the buffer is too short, IBCOMM_ERR_CODE when
 * ibv_post_recv() fails.
 */
int post_recv_req_ud(qpinfo_t *qpinfo, mrinfo_t *mrinfo, uint64_t wr_id){
	struct ibv_sge sg_entry;
	struct ibv_recv_wr recv_wr;
	struct ibv_recv_wr *rejected;
	int post_rc;

	/* Refuse buffers that cannot hold anything beyond the first 40 bytes */
	if (mrinfo->buf_size <= 40) {
		printf("buf_size too short\n");
		return -1;
	}

	/* Scatter/gather entry spanning the entire buffer */
	memset(&sg_entry, 0, sizeof(sg_entry));
	sg_entry.addr = (uintptr_t)mrinfo->buf;
	sg_entry.length = mrinfo->buf_size;
	sg_entry.lkey = mrinfo->mr->lkey;

	/* Single, unchained receive request */
	memset(&recv_wr, 0, sizeof(recv_wr));
	recv_wr.next = NULL;
	recv_wr.wr_id = wr_id;
	recv_wr.sg_list = &sg_entry;
	recv_wr.num_sge = 1;

	/* Hand the request to the receive queue */
	post_rc = ibv_post_recv(qpinfo->qp, &recv_wr, &rejected);
	if (post_rc != 0) {
		printf("ibv_post_recv ib_errno=%d\n", post_rc);
		return IBCOMM_ERR_CODE;
	}
	return 0;
}
/*
 * Busy-poll one completion queue until a completion arrives, polling
 * fails, or MAX_POLL_TIME attempts have been made, then report the tag
 * recorded for the completed work request.
 *
 * qpinfo  queue pair whose scq (send) or rcq (receive) is polled
 * cq_flg  SEND_CQ_FLG or RECV_CQ_FLG; any other value polls nothing and is
 *         reported as "no wc is found"
 * tag     out: tag looked up from the completion's wr_id; written only on
 *         success
 *
 * Returns 0 on success, IBCOMM_ERR_CODE if polling failed, no completion
 * was found, or the completion status is not IBV_WC_SUCCESS.
 *
 * The attempt counter is unsigned long long because MAX_POLL_TIME does not
 * fit in an int: with an int counter the limit could never be reached and
 * the increment would eventually overflow.
 */
int poll_cq(qpinfo_t *qpinfo, int cq_flg, int *tag) {
	struct ibv_wc wc;
	struct ibv_cq *cq;
	unsigned long long attempts = 0;
	int wc_num = 0;

	memset(&wc, 0, sizeof(wc));

	/* Select the queue to poll */
	switch(cq_flg){
	case SEND_CQ_FLG:
		cq = qpinfo->scq;
		break;
	case RECV_CQ_FLG:
		cq = qpinfo->rcq;
		break;
	default:
		cq = NULL;
		break;
	}

	/* Stop on the first completion, on a polling error, or at the limit */
	if(cq != NULL){
		do{
			wc_num = ibv_poll_cq(cq, 1, &wc);
		}while(wc_num == 0 && ++attempts < MAX_POLL_TIME);
	}

	if(wc_num < 0){
		error_perror("ibv_poll_cq");
		return IBCOMM_ERR_CODE;
	}
	if(wc_num == 0){
		error_printf("no wc is found\n");
		return IBCOMM_ERR_CODE;
	}
	if(wc.status != IBV_WC_SUCCESS){
		error_printf("wrong wc state: %d, %s\n", wc.status, ibv_wc_status_str(wc.status));
		return IBCOMM_ERR_CODE;
	}

	/* Translate the completed work request id back into the caller's tag */
	switch(cq_flg){
	case SEND_CQ_FLG:
		*tag = get_swr_id_tag(wc.wr_id);
		break;
	case RECV_CQ_FLG:
		*tag = get_rwr_id_tag(wc.wr_id);
		break;
	}
	return 0;
}
/*
 * Poll one completion queue exactly once (non-blocking).
 *
 * qpinfo  queue pair whose scq (send) or rcq (receive) is polled
 * cq_flg  SEND_CQ_FLG or RECV_CQ_FLG
 * tag     out: tag looked up from the completion's wr_id; written only when
 *         a successful completion was retrieved
 * result  out: return value of ibv_poll_cq() (number of completions
 *         retrieved, or negative on failure); set to 0 for an unknown
 *         cq_flg
 *
 * Returns 0 when polling succeeded (with or without a completion), the
 * negative ibv_poll_cq() value when polling failed, and -1 when the
 * completion status is not IBV_WC_SUCCESS or cq_flg is not recognised.
 */
int poll_cq2(qpinfo_t *qpinfo, int cq_flg, int *tag, int *result) {
	struct ibv_wc cqe;

	switch(cq_flg){
	case SEND_CQ_FLG:
		*result = ibv_poll_cq(qpinfo->scq, 1, &cqe);
		break;
	case RECV_CQ_FLG:
		*result = ibv_poll_cq(qpinfo->rcq, 1, &cqe);
		break;
	default:
		/* Nothing was polled: do not let stale *result or cqe be inspected */
		error_printf("poll_cq2: unknown cq_flg=%d\n", cq_flg);
		*result = 0;
		return -1;
	}

	if(*result < 0){
		error_perror("ibv_poll_cq");
		return *result;
	}
	if(*result == 0){
		/* Queue was empty; tag is left untouched */
		return 0;
	}
	if(cqe.status != IBV_WC_SUCCESS){
		error_printf("cqe status=%08x,%s\n", cqe.status, ibv_wc_status_str(cqe.status));
		return -1;
	}

	dprintf("cqe.imm_data=%d\n", cqe.imm_data);
	/* Translate the completed work request id back into the caller's tag */
	if(cq_flg == SEND_CQ_FLG){
		*tag = get_swr_id_tag(cqe.wr_id);
	} else {
		*tag = get_rwr_id_tag(cqe.wr_id);
	}
	return 0;
}
/*
 * Poll one completion queue exactly once (non-blocking), without any tag
 * lookup. On the send path the duration of the ibv_poll_cq() call is
 * measured with rdtsc() and printed to stdout.
 *
 * qpinfo  queue pair whose scq (send) or rcq (receive) is polled
 * cq_flg  SEND_CQ_FLG or RECV_CQ_FLG
 * result  out: return value of ibv_poll_cq() (number of completions
 *         retrieved, or negative on failure); set to 0 for an unknown
 *         cq_flg
 *
 * Returns 0 when polling succeeded (with or without a completion), the
 * negative ibv_poll_cq() value when polling failed, and -1 when the
 * completion status is not IBV_WC_SUCCESS or cq_flg is not recognised.
 */
int poll_cq2_ud(qpinfo_t *qpinfo, int cq_flg, int *result) {
	struct ibv_wc cqe;

	switch(cq_flg){
	case SEND_CQ_FLG: {
		unsigned long tscs = rdtsc();
		*result = ibv_poll_cq(qpinfo->scq, 1, &cqe);
		unsigned long tsce = rdtsc();
		/* The difference is unsigned long, so it is printed with %lu */
		printf("poll_cq,send,%lu\n", tsce - tscs);
		break; }
	case RECV_CQ_FLG:
		*result = ibv_poll_cq(qpinfo->rcq, 1, &cqe);
		break;
	default:
		/* Nothing was polled: do not let stale *result or cqe be inspected */
		error_printf("poll_cq2_ud: unknown cq_flg=%d\n", cq_flg);
		*result = 0;
		return -1;
	}

	if(*result < 0){
		error_perror("ibv_poll_cq");
		return *result;
	}
	if(*result > 0 && cqe.status != IBV_WC_SUCCESS){
		error_printf("cqe status=%08x,%s\n", cqe.status, ibv_wc_status_str(cqe.status));
		return -1;
	}
	return 0;
}