35 #include "rdma_common.h"
40 #define SLEEP_IN_NANOS (10 * 1000)
80 void *server_remote_export_F =
NULL;
81 size_t server_remote_export_F_len;
99 if (cuda_err != cudaSuccess) {
100 DOCA_LOG_ERR(
"Can't CUDA memset buffer A: %d", cuda_err);
134 if (recv(
oob_sock_fd, &server_remote_export_F_len,
sizeof(
size_t), 0) < 0) {
135 DOCA_LOG_ERR(
"Failed to receive remote connection details");
140 server_remote_export_F = calloc(1, server_remote_export_F_len);
141 if (server_remote_export_F ==
NULL) {
142 DOCA_LOG_ERR(
"Failed to allocate memory for remote mmap export");
147 if (recv(
oob_sock_fd, server_remote_export_F, server_remote_export_F_len, 0) < 0) {
148 DOCA_LOG_ERR(
"Failed to receive remote connection details");
154 server_remote_export_F,
155 server_remote_export_F_len,
189 free(server_remote_export_F);
194 if (server_remote_export_F)
195 free(server_remote_export_F);
212 void *client_remote_export_A =
NULL;
213 size_t client_remote_export_A_len;
215 cudaError_t cuda_err;
234 if (cuda_err != cudaSuccess) {
235 DOCA_LOG_ERR(
"Can't CUDA memset buffer B: %d", cuda_err);
266 if (cuda_err != cudaSuccess) {
267 DOCA_LOG_ERR(
"Can't CUDA memset buffer C: %d", cuda_err);
314 if (recv(
oob_sock_fd, &client_remote_export_A_len,
sizeof(
size_t), 0) < 0) {
315 DOCA_LOG_ERR(
"Failed to receive remote connection details");
320 client_remote_export_A = calloc(1, client_remote_export_A_len);
321 if (client_remote_export_A ==
NULL) {
322 DOCA_LOG_ERR(
"Failed to allocate memory for remote mmap export");
327 if (recv(
oob_sock_fd, client_remote_export_A, client_remote_export_A_len, 0) < 0) {
328 DOCA_LOG_ERR(
"Failed to receive remote connection details");
334 client_remote_export_A,
335 client_remote_export_A_len,
417 free(client_remote_export_A);
422 if (client_remote_export_A)
423 free(client_remote_export_A);
437 for (
int conn_idx = 0; conn_idx <
NUM_CONN; conn_idx++) {
479 for (
int conn_idx = 0; conn_idx <
NUM_CONN; conn_idx++) {
556 struct doca_rdma_connection *connection =
NULL;
559 void *remote_conn_details =
NULL;
560 size_t remote_conn_details_len = 0;
561 cudaError_t cuda_ret;
563 struct timespec ts = {
579 goto destroy_resources;
585 DOCA_LOG_ERR(
"Failed to setup OOB connection with remote peer");
587 goto destroy_resources;
590 if (!
cfg->use_rdma_cm) {
597 DOCA_LOG_ERR(
"Failed to export RDMA with connection details");
598 goto close_connection;
602 DOCA_LOG_INFO(
"Send connection details to remote peer size %zd str %s",
608 goto close_connection;
614 goto close_connection;
618 if (recv(
oob_client_sock, &remote_conn_details_len,
sizeof(
size_t), 0) < 0) {
619 DOCA_LOG_ERR(
"Failed to receive remote connection details");
621 goto close_connection;
624 if (remote_conn_details_len <= 0 || remote_conn_details_len >= (
size_t)-1) {
625 DOCA_LOG_ERR(
"Received wrong remote connection details");
627 goto close_connection;
630 remote_conn_details = calloc(1, remote_conn_details_len);
631 if (remote_conn_details ==
NULL) {
632 DOCA_LOG_ERR(
"Failed to allocate memory for remote connection details");
634 goto close_connection;
637 if (recv(
oob_client_sock, remote_conn_details, remote_conn_details_len, 0) < 0) {
638 DOCA_LOG_ERR(
"Failed to receive remote connection details");
640 goto close_connection;
648 goto close_connection;
651 free(remote_conn_details);
652 remote_conn_details =
NULL;
656 DOCA_LOG_ERR(
"Server failed to call doca_rdma_start_listen_to_port: %s",
658 goto close_connection;
663 DOCA_LOG_INFO(
"Server is waiting for new connections using RDMA CM");
671 DOCA_LOG_ERR(
"Failed to connect to remote peer, connection error");
673 goto close_connection;
679 cuda_ret = cudaStreamCreateWithFlags(&
cstream, cudaStreamNonBlocking);
680 if (cuda_ret != cudaSuccess) {
681 DOCA_LOG_ERR(
"Function cudaStreamCreateWithFlags error %d", cuda_ret);
683 goto close_connection;
689 goto close_connection;
692 DOCA_LOG_INFO(
"Before launching CUDA kernel, buffer array A is:");
693 for (
int idx = 0; idx < 4; idx++) {
694 DOCA_LOG_INFO(
"Buffer %d -> offset 0: %x%x%x%x | offset %d: %x%x%x%x",
714 goto close_connection;
717 if (
cfg->use_rdma_cm) {
725 DOCA_LOG_ERR(
"Failed to connect to remote peer, connection error");
727 goto close_connection;
734 DOCA_LOG_ERR(
"Function create_memory_local_remote_server failed: %s",
736 goto close_connection;
739 DOCA_LOG_INFO(
"Server - Connection 2 memory info exchanged");
749 goto close_connection;
753 cudaStreamSynchronize(
cstream);
755 DOCA_LOG_INFO(
"After launching CUDA kernel, buffer array A is:");
756 for (
int idx = 0; idx < 4; idx++) {
757 DOCA_LOG_INFO(
"Buffer %d -> offset 0: %x%x%x%x | offset %d: %x%x%x%x",
770 if (
cfg->use_rdma_cm) {
771 DOCA_LOG_INFO(
"After launching CUDA kernel for connection 2, buffer array A is:");
772 for (
int idx = 0; idx < 4; idx++) {
773 DOCA_LOG_INFO(
"Buffer %d -> offset 0: %x%x%x%x | offset %d: %x%x%x%x",
810 if (remote_conn_details)
811 free(remote_conn_details);
824 struct doca_rdma_connection *connection =
NULL;
827 cudaError_t cuda_ret;
828 void *remote_conn_details =
NULL;
829 size_t remote_conn_details_len = 0;
832 uint32_t *cpu_exit_flag;
833 uint32_t *gpu_exit_flag;
834 struct timespec ts = {
850 goto destroy_resources;
856 DOCA_LOG_ERR(
"Failed to setup OOB connection with remote peer");
858 goto destroy_resources;
861 if (!
cfg->use_rdma_cm) {
868 DOCA_LOG_ERR(
"Failed to export RDMA with connection details");
869 goto close_connection;
874 if (recv(
oob_sock_fd, &remote_conn_details_len,
sizeof(
size_t), 0) < 0) {
875 DOCA_LOG_ERR(
"Failed to receive remote connection details");
877 goto close_connection;
880 if (remote_conn_details_len <= 0 || remote_conn_details_len >= (
size_t)-1) {
881 DOCA_LOG_ERR(
"Received wrong remote connection details");
883 goto close_connection;
886 remote_conn_details = calloc(1, remote_conn_details_len);
887 if (remote_conn_details ==
NULL) {
888 DOCA_LOG_ERR(
"Failed to allocate memory for remote connection details");
890 goto close_connection;
893 if (recv(
oob_sock_fd, remote_conn_details, remote_conn_details_len, 0) < 0) {
894 DOCA_LOG_ERR(
"Failed to receive remote connection details");
896 goto close_connection;
899 DOCA_LOG_INFO(
"Send connection details to remote peer size %zd str %s",
905 goto close_connection;
911 goto close_connection;
919 goto close_connection;
922 free(remote_conn_details);
923 remote_conn_details =
NULL;
928 goto close_connection;
934 DOCA_LOG_ERR(
"Client failed to call doca_rdma_connect_to_addr %s",
936 goto close_connection;
939 DOCA_LOG_INFO(
"Client is waiting for a connection establishment");
947 DOCA_LOG_ERR(
"Failed to connect to remote peer, connection error");
949 goto close_connection;
958 goto close_connection;
965 (
void **)&gpu_exit_flag,
966 (
void **)&cpu_exit_flag);
969 goto close_connection;
971 cpu_exit_flag[0] = 0;
973 cuda_ret = cudaStreamCreateWithFlags(&
cstream, cudaStreamNonBlocking);
974 if (cuda_ret != cudaSuccess) {
975 DOCA_LOG_ERR(
"Function cudaStreamCreateWithFlags error %d", cuda_ret);
977 goto close_connection;
991 goto close_connection;
994 if (
cfg->use_rdma_cm) {
1000 DOCA_LOG_ERR(
"Client failed to call doca_rdma_connect_to_addr %s",
1002 goto close_connection;
1005 DOCA_LOG_INFO(
"Client is waiting for a connection establishment");
1009 nanosleep(&ts, &ts);
1013 DOCA_LOG_ERR(
"Failed to connect to remote peer, connection error");
1015 goto close_connection;
1022 DOCA_LOG_ERR(
"Function create_memory_local_remote_client failed: %s",
1024 goto close_connection;
1027 DOCA_LOG_INFO(
"Client - Connection 2 memory info exchanged");
1040 goto close_connection;
1045 DOCA_GPUNETIO_VOLATILE(*cpu_exit_flag) = 1;
1046 cudaStreamSynchronize(0);
1048 if (
cfg->use_rdma_cm) {
1049 cudaStreamSynchronize(
cstream);
1076 if (remote_conn_details)
1077 free(remote_conn_details);
doca_error_t create_buf_arr_on_gpu(struct buf_arr_obj *buf_arr_obj)
doca_error_t destroy_rdma_resources(struct rdma_resources *resources)
int oob_connection_client_setup(const char *server_ip, int *oob_sock_fd)
void oob_connection_server_close(int oob_sock_fd, int oob_client_sock)
int oob_connection_server_setup(int *oob_sock_fd, int *oob_client_sock)
doca_error_t create_rdma_resources(struct rdma_config *cfg, const uint32_t rdma_permissions, struct rdma_resources *resources)
void oob_connection_client_close(int oob_sock_fd)
doca_error_t kernel_write_server(cudaStream_t stream, struct doca_gpu_dev_rdma *rdma_gpu, struct doca_gpu_buf_arr *server_local_buf_arr_A, struct doca_gpu_buf_arr *server_remote_buf_arr_F, uint32_t connection_index)
doca_error_t kernel_write_client(cudaStream_t stream, struct doca_gpu_dev_rdma *rdma_gpu, struct doca_gpu_buf_arr *client_local_buf_arr_B, struct doca_gpu_buf_arr *client_local_buf_arr_C, struct doca_gpu_buf_arr *client_local_buf_arr_F, struct doca_gpu_buf_arr *client_remote_buf_arr_A, uint32_t connection_index, uint32_t *exit_flag)
static doca_error_t create_mmap(struct doca_dev *doca_device, unsigned int mmap_permissions, void *memrange_addr, size_t memrange_len, struct doca_mmap **mmap, doca_dpa_dev_mmap_t *dpa_mmap_handle)
struct buf_arr_obj client_local_buf_arr_B[NUM_CONN]
static doca_error_t create_memory_local_remote_client(int oob_sock_fd, struct rdma_resources *resources, int conn_idx, cudaStream_t stream)
uint8_t * client_local_buf_F[NUM_CONN]
uint8_t * client_local_buf_C_cpu[NUM_CONN]
DOCA_LOG_REGISTER(GPURDMA::SAMPLE)
static void destroy_memory_local_remote_client(struct rdma_resources *resources)
struct rdma_mmap_obj client_local_mmap_obj_C[NUM_CONN]
uint8_t * server_local_buf_A_gpu[NUM_CONN]
doca_error_t rdma_write_client(struct rdma_config *cfg)
struct buf_arr_obj server_local_buf_arr_A[NUM_CONN]
struct doca_mmap * client_remote_mmap_A[NUM_CONN]
struct rdma_mmap_obj client_local_mmap_obj_B[NUM_CONN]
struct buf_arr_obj client_remote_buf_arr_A[NUM_CONN]
uint8_t * client_local_buf_B_cpu[NUM_CONN]
struct buf_arr_obj server_remote_buf_arr_F[NUM_CONN]
struct doca_mmap * server_remote_mmap_F[NUM_CONN]
static void destroy_memory_local_remote_server(struct rdma_resources *resources)
static doca_error_t create_memory_local_remote_server(int oob_sock_fd, struct rdma_resources *resources, int conn_idx, cudaStream_t stream)
uint8_t * client_local_buf_B_gpu[NUM_CONN]
doca_error_t rdma_write_server(struct rdma_config *cfg)
struct rdma_resources resources
uint8_t * server_local_buf_A_cpu[NUM_CONN]
struct buf_arr_obj client_local_buf_arr_C[NUM_CONN]
struct buf_arr_obj client_local_buf_arr_F[NUM_CONN]
const uint32_t access_params
struct rdma_mmap_obj client_local_mmap_obj_F[NUM_CONN]
struct rdma_mmap_obj server_local_mmap_obj_A[NUM_CONN]
uint8_t * client_local_buf_C_gpu[NUM_CONN]
DOCA_EXPERIMENTAL doca_error_t doca_buf_arr_destroy(struct doca_buf_arr *buf_arr)
Destroys a doca buf array instance.
enum doca_error doca_error_t
DOCA API return codes.
DOCA_STABLE const char * doca_error_get_descr(doca_error_t error)
Returns the description string of an error code.
@ DOCA_ERROR_CONNECTION_ABORTED
#define DOCA_LOG_ERR(format,...)
Generates an ERROR application log message.
#define DOCA_LOG_INFO(format,...)
Generates an INFO application log message.
DOCA_STABLE doca_error_t doca_mmap_destroy(struct doca_mmap *mmap)
Destroy DOCA Memory Map structure.
DOCA_STABLE doca_error_t doca_mmap_create_from_export(const union doca_data *user_data, const void *export_desc, size_t export_desc_len, struct doca_dev *dev, struct doca_mmap **mmap)
Creates a memory map object representing memory ranges in remote system memory space.
DOCA_STABLE uint8_t doca_pe_progress(struct doca_pe *pe)
Run the progress engine.
DOCA_EXPERIMENTAL doca_error_t doca_rdma_get_gpu_handle(struct doca_rdma *rdma, struct doca_gpu_dev_rdma **gpu_rdma)
Retrieve the handle in the gpu memory space of a doca_rdma.
DOCA_EXPERIMENTAL doca_error_t doca_rdma_connect_to_addr(struct doca_rdma *rdma, struct doca_rdma_addr *addr, union doca_data connection_user_data)
Connect to a remote doca_rdma peer listening for a connection. Can be called when the ctx is in DOCA_...
DOCA_EXPERIMENTAL doca_error_t doca_rdma_addr_create(enum doca_rdma_addr_type addr_type, const char *address, uint16_t port, struct doca_rdma_addr **addr)
Set connection address object for doca_rdma. The object can be queried using doca_rdma_connection_get...
DOCA_EXPERIMENTAL doca_error_t doca_rdma_export(struct doca_rdma *rdma, const void **local_rdma_conn_details, size_t *local_rdma_conn_details_size, struct doca_rdma_connection **rdma_connection)
Export doca_rdma connection details object The doca_rdma_conn_details are used in doca_rdma_connect()...
DOCA_EXPERIMENTAL doca_error_t doca_rdma_start_listen_to_port(struct doca_rdma *rdma, uint16_t port)
Start listening for a connection from a remote doca_rdma peer. Can be called when the ctx is in DOCA_...
DOCA_EXPERIMENTAL doca_error_t doca_rdma_connect(struct doca_rdma *rdma, const void *remote_rdma_conn_details, size_t remote_rdma_conn_details_size, struct doca_rdma_connection *rdma_connection)
Connect to remote doca_rdma peer. Can only be called when the ctx is in DOCA_CTX_STATE_STARTING state...
@ DOCA_GPU_MEM_TYPE_GPU_CPU
@ DOCA_ACCESS_FLAG_LOCAL_READ_WRITE
@ DOCA_ACCESS_FLAG_RDMA_WRITE
const struct ip_frag_config * cfg
struct doca_gpu_buf_arr * gpu_buf_arr
struct doca_buf_arr * buf_arr
struct doca_dev * doca_device
bool server_listen_active
const void * connection_details
bool connection2_established
struct doca_rdma_addr * cm_addr
struct doca_gpu_dev_rdma * gpu_rdma
struct doca_dev * doca_device
bool connection_established
Convenience type for representing opaque data.