/src/plugins/ctp/sock/ctp_sock_api.c
C | 5640 lines | 4325 code | 714 blank | 601 comment | 803 complexity | 192e2833f64ce1309bf7d36f5865392c MD5 | raw file
Possible License(s): BSD-3-Clause
Large files are truncated, but you can click here to view the full file
- /* vim: set tabstop=8:softtabstop=8:shiftwidth=8:noexpandtab */
- /*
- * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
- * Copyright © 2010-2013 UT-Battelle, LLC. All rights reserved.
- * Copyright © 2010-2013 Oak Ridge National Labs. All rights reserved.
- * Copyright © 2012 inria. All rights reserved.
- *
- * See COPYING in top-level directory
- *
- * $COPYRIGHT$
- *
- */
- #if defined(__INTEL_COMPILER)
- #pragma warning(disable:593)
- #pragma warning(disable:869)
- #pragma warning(disable:981)
- #pragma warning(disable:1338)
- #pragma warning(disable:2259)
- #endif /* __INTEL_COMPILER */
- #include "cci/private_config.h"
- #include <stdio.h>
- #include <stdlib.h>
- #include <string.h>
- #include <unistd.h>
- #include <netinet/in.h>
- #include <arpa/inet.h>
- #include <sys/socket.h>
- #include <sys/types.h>
- #include <netdb.h>
- #include <fcntl.h>
- #include <inttypes.h>
- #ifdef HAVE_IFADDRS_H
- #include <ifaddrs.h>
- #include <net/if.h>
- #endif
- #ifdef HAVE_SYS_EPOLL_H
- #include <sys/epoll.h>
- #else
- #include <poll.h>
- #endif /* HAVE_SYS_EPOLL_H */
- #include "cci.h"
- #include "cci_lib_types.h"
- #include "cci-api.h"
- #include "plugins/ctp/ctp.h"
- #include "ctp_sock_internals.h"
- #define DEBUG_RNR 0
- #if DEBUG_RNR
- #include <stdbool.h>
- bool conn_established = false;
- #endif
- sock_globals_t *sglobals = NULL;
- static int threads_running = 0;
- /*
- * Local functions
- */
- static int ctp_sock_init(cci_plugin_ctp_t *plugin,
- uint32_t abi_ver,
- uint32_t flags,
- uint32_t * caps);
- static int ctp_sock_finalize(cci_plugin_ctp_t * plugin);
- static const char *ctp_sock_strerror(cci_endpoint_t * endpoint,
- enum cci_status status);
- static int ctp_sock_create_endpoint(cci_device_t * device,
- int flags,
- cci_endpoint_t ** endpoint,
- cci_os_handle_t * fd);
- static int ctp_sock_destroy_endpoint(cci_endpoint_t * endpoint);
- static int ctp_sock_accept(cci_event_t *event, const void *context);
- static int ctp_sock_reject(cci_event_t *conn_req);
- static int ctp_sock_connect(cci_endpoint_t * endpoint,
- const char *server_uri,
- const void *data_ptr,
- uint32_t data_len,
- cci_conn_attribute_t attribute,
- const void *context,
- int flags,
- const struct timeval *timeout);
- static int ctp_sock_disconnect(cci_connection_t * connection);
- static int ctp_sock_set_opt(cci_opt_handle_t * handle,
- cci_opt_name_t name,
- const void *val);
- static int ctp_sock_get_opt(cci_opt_handle_t * handle,
- cci_opt_name_t name,
- void *val);
- static int ctp_sock_arm_os_handle(cci_endpoint_t * endpoint, int flags);
- static int ctp_sock_get_event(cci_endpoint_t * endpoint,
- cci_event_t ** const event);
- static int ctp_sock_return_event(cci_event_t * event);
- static int ctp_sock_send(cci_connection_t * connection,
- const void *msg_ptr,
- uint32_t msg_len,
- const void *context,
- int flags);
- static int ctp_sock_sendv(cci_connection_t * connection,
- const struct iovec *data,
- uint32_t iovcnt,
- const void *context,
- int flags);
- static int ctp_sock_rma_register(cci_endpoint_t * endpoint,
- void *start,
- uint64_t length,
- int flags,
- cci_rma_handle_t ** rma_handle);
- static int ctp_sock_rma_deregister(cci_endpoint_t * endpoint,
- cci_rma_handle_t * rma_handle);
- static int ctp_sock_rma(cci_connection_t * connection,
- const void *header_ptr,
- uint32_t header_len,
- cci_rma_handle_t * local_handle,
- uint64_t local_offset,
- cci_rma_handle_t * remote_handle,
- uint64_t remote_offset,
- uint64_t data_len,
- const void *context,
- int flags);
- static uint8_t sock_ip_hash(in_addr_t ip, uint16_t port);
- static void sock_progress_sends(cci__ep_t * ep);
- static void *sock_progress_thread(void *arg);
- static void *sock_recv_thread(void *arg);
- static void sock_ack_conns(cci__ep_t * ep);
- static inline int pack_piggyback_ack(cci__ep_t *ep,
- sock_conn_t *sconn, sock_tx_t *tx);
- static inline int sock_ack_sconn(sock_ep_t *sep, sock_conn_t *sconn);
- static int sock_recvfrom_ep(cci__ep_t * ep);
- int progress_recv (cci__ep_t *ep);
- /*
- * Public plugin structure.
- *
- * The name of this structure must be of the following form:
- *
- * cci_ctp_<your_plugin_name>_plugin
- *
- * This allows the symbol to be found after the plugin is dynamically
- * opened.
- *
- * Note that your_plugin_name should match the name of the directory
- * where the plugin resides.
- *
- * The initializer below is positional, so the order of the entries must
- * follow the member order of cci_plugin_ctp_t exactly.
- */
- cci_plugin_ctp_t cci_ctp_sock_plugin = {
- {
- /* Logistics: ABI/API versions, plugin name and CCI version */
- CCI_ABI_VERSION,
- CCI_CTP_API_VERSION,
- "sock",
- CCI_MAJOR_VERSION, CCI_MINOR_VERSION, CCI_RELEASE_VERSION,
- 30, /* presumably the plugin priority (see plugin->base.priority) - TODO confirm */
- /* Bootstrap function pointers */
- cci_ctp_sock_post_load,
- cci_ctp_sock_pre_unload,
- },
- /* API function pointers, all implemented in this file */
- ctp_sock_init,
- ctp_sock_finalize,
- ctp_sock_strerror,
- ctp_sock_create_endpoint,
- ctp_sock_destroy_endpoint,
- ctp_sock_accept,
- ctp_sock_reject,
- ctp_sock_connect,
- ctp_sock_disconnect,
- ctp_sock_set_opt,
- ctp_sock_get_opt,
- ctp_sock_arm_os_handle,
- ctp_sock_get_event,
- ctp_sock_return_event,
- ctp_sock_send,
- ctp_sock_sendv,
- ctp_sock_rma_register,
- ctp_sock_rma_deregister,
- ctp_sock_rma
- };
- static inline int
- sock_recv_msg (int fd,
- void *ptr,
- uint32_t len,
- int flags,
- struct sockaddr_in *sin_out)
- {
- int ret = 0;
- uint32_t recv_len = 0;
- static int count = 0;
- uint32_t offset = 0;
- struct sockaddr_in sin;
- socklen_t sin_len = sizeof(sin);
- if (len == 0)
- return ret;
- again:
- do {
- ret = recvfrom (fd, (void*) ((uintptr_t)ptr + offset), len - recv_len, flags, (struct sockaddr *)&sin, &sin_len);
- if (ret < 0) {
- if ((count++ & 0xFFFF) == 0xFFFF)
- debug (CCI_DB_EP, "%s: recvfrom() failed with %s (%u of %u bytes)", __func__, strerror(ret), recv_len, len);
- if (ret == EAGAIN)
- goto again;
- goto out;
- } else if (ret == 0) {
- debug (CCI_DB_MSG, "%s: recvfrom() failed - socket closed", __func__);
- ret = -1;
- goto out;
- }
- recv_len += ret;
- offset += recv_len;
- } while (recv_len < len);
- ret = recv_len;
- if (sin_out != NULL)
- *sin_out = sin;
- out:
- return ret;
- }
/*
 * Render an IPv4 socket address as "<dotted-quad>:<port>".
 *
 * sin    - address to print (sin_addr and sin_port are in network order)
 * buffer - destination string
 * len    - size of buffer in bytes; output is truncated by snprintf()
 */
static inline void
sock_sin_to_name(struct sockaddr_in sin, char *buffer, int len)
{
	const char *dotted = inet_ntoa(sin.sin_addr);
	int host_port = ntohs(sin.sin_port);

	snprintf(buffer, len, "%s:%d", dotted, host_port);
}
- static inline const char *sock_msg_type(sock_msg_type_t type)
- {
- switch (type) {
- case SOCK_MSG_CONN_REQUEST:
- return "conn_request";
- case SOCK_MSG_CONN_REPLY:
- return "conn_reply";
- case SOCK_MSG_CONN_ACK:
- return "conn_ack";
- case SOCK_MSG_DISCONNECT:
- return "disconnect";
- case SOCK_MSG_SEND:
- return "send";
- case SOCK_MSG_RNR:
- return "receiver not ready";
- case SOCK_MSG_KEEPALIVE:
- return "keepalive";
- case SOCK_MSG_PING:
- return "ping for RTTM";
- case SOCK_MSG_ACK_ONLY:
- return "ack_only";
- case SOCK_MSG_ACK_UP_TO:
- return "ack_up_to";
- case SOCK_MSG_SACK:
- return "selective ack";
- case SOCK_MSG_NACK:
- return "negative ack";
- case SOCK_MSG_RMA_WRITE:
- return "RMA write";
- case SOCK_MSG_RMA_WRITE_DONE:
- return "RMA write done";
- case SOCK_MSG_RMA_READ_REQUEST:
- return "RMA read request";
- case SOCK_MSG_RMA_READ_REPLY:
- return "RMA read reply";
- case SOCK_MSG_RMA_INVALID:
- return "invalid RMA handle";
- case SOCK_MSG_INVALID:
- assert(0);
- return "invalid";
- case SOCK_MSG_TYPE_MAX:
- assert(0);
- return "type_max";
- }
- return NULL;
- }
- static inline void sock_drop_msg(cci_os_handle_t sock)
- {
- char buf[4];
- struct sockaddr sa;
- socklen_t slen = sizeof(sa);
- recvfrom(sock, buf, 4, 0, &sa, &slen);
- return;
- }
- static inline int sock_create_threads (cci__ep_t *ep)
- {
- int ret;
- sock_ep_t *sep;
- assert (ep);
- sep = ep->priv;
- ret = pthread_create(&sep->recv_tid, NULL, sock_recv_thread, (void*)ep);
- if (ret)
- goto out;
- ret = pthread_create(&sep->progress_tid, NULL, sock_progress_thread, (void*)ep);
- if (ret)
- goto out;
- out:
- return ret;
- }
- static inline int sock_terminate_threads (sock_ep_t *sep)
- {
- CCI_ENTER;
- assert (sep);
- pthread_mutex_lock(&sep->progress_mutex);
- pthread_cond_signal(&sep->wait_condition);
- pthread_mutex_unlock(&sep->progress_mutex);
- pthread_join(sep->progress_tid, NULL);
- pthread_join(sep->recv_tid, NULL);
- CCI_EXIT;
- return CCI_SUCCESS;
- }
- static int ctp_sock_init(cci_plugin_ctp_t *plugin,
- uint32_t abi_ver, uint32_t flags, uint32_t * caps)
- {
- int ret;
- cci__dev_t *dev, *ndev;
- cci_device_t **devices;
- #ifdef HAVE_GETIFADDRS
- struct ifaddrs *addrs = NULL, *addr;
- #endif
- CCI_ENTER;
- /* Some unused parameters, the following avoids warnings from
- compilers */
- UNUSED_PARAM (abi_ver);
- UNUSED_PARAM (flags);
- UNUSED_PARAM (caps);
- #if DEBUG_RNR
- fprintf(stderr, "Warning, debug mode (RNR testing)!\n");
- #endif
- /* init sock globals */
- sglobals = calloc(1, sizeof(*sglobals));
- if (!sglobals) {
- CCI_EXIT;
- return CCI_ENOMEM;
- }
- srandom((unsigned int)sock_get_usecs());
- #ifdef HAVE_GETIFADDRS
- getifaddrs(&addrs);
- /* ignore errors, we've use defaults */
- #endif
- devices = calloc(CCI_MAX_DEVICES, sizeof(*sglobals->devices));
- if (!devices) {
- ret = CCI_ENOMEM;
- goto out;
- }
- if (!globals->configfile) {
- #ifdef HAVE_GETIFADDRS
- if (addrs) {
- for (addr = addrs; addr != NULL; addr = addr->ifa_next) {
- struct cci_device *device;
- sock_dev_t *sdev;
- uint32_t mtu = (uint32_t) -1;
- struct sockaddr_in *sai;
- if (!addr->ifa_addr)
- continue;
- if (addr->ifa_addr->sa_family != AF_INET)
- continue;
- if (addr->ifa_flags & IFF_LOOPBACK)
- continue;
- dev = calloc(1, sizeof(*dev));
- if (!dev) {
- ret = CCI_ENOMEM;
- goto out;
- }
- dev->priv = calloc(1, sizeof(*sdev));
- if (!dev->priv) {
- free(dev);
- ret = CCI_ENOMEM;
- goto out;
- }
- cci__init_dev(dev);
- dev->plugin = plugin;
- dev->priority = plugin->base.priority;
- /* FIXME GV: could use macro here */
- device = &dev->device;
- device->transport = strdup("sock");
- device->name = strdup(addr->ifa_name);
- sdev = dev->priv;
- sai = (struct sockaddr_in *) addr->ifa_addr;
- memcpy(&sdev->ip, &sai->sin_addr, sizeof(sai->sin_addr));
- /* default values */
- device->up = 1;
- device->rate = 0;
- device->pci.domain = -1; /* per CCI spec */
- device->pci.bus = -1; /* per CCI spec */
- device->pci.dev = -1; /* per CCI spec */
- device->pci.func = -1; /* per CCI spec */
- /* try to get the actual values */
- cci__get_dev_ifaddrs_info(dev, addr);
- mtu = device->max_send_size;
- if (mtu == (uint32_t) -1) {
- /* if no mtu, use default */
- device->max_send_size = SOCK_DEFAULT_MSS;
- } else {
- /* compute mss from mtu */
- if (mtu > SOCK_UDP_MAX)
- mtu = SOCK_UDP_MAX;
- mtu -= SOCK_MAX_HDR_SIZE;
- assert(mtu >= SOCK_MIN_MSS); /* FIXME rather ignore the device? */
- device->max_send_size = mtu;
- }
- cci__add_dev(dev);
- devices[sglobals->count] = device;
- sglobals->count++;
- threads_running = 1;
- }
- }
- #endif
- } else
- /* find devices that we own */
- TAILQ_FOREACH_SAFE(dev, &globals->configfile_devs, entry, ndev) {
- if (0 == strcmp("sock", dev->device.transport)) {
- const char * const *arg;
- const char *interface = NULL;
- struct cci_device *device;
- sock_dev_t *sdev;
- uint32_t mtu = (uint32_t) -1;
- dev->plugin = plugin;
- if (dev->priority == -1)
- dev->priority = plugin->base.priority;
- device = &dev->device;
- /* TODO determine link rate
- *
- * linux->driver->get ethtool settings->speed
- * bsd/darwin->ioctl(SIOCGIFMEDIA)->ifm_active
- * windows ?
- */
- dev->priv = calloc(1, sizeof(*sdev));
- if (!dev->priv) {
- ret = CCI_ENOMEM;
- goto out;
- }
- sdev = dev->priv;
- sdev->port = 0;
- sdev->bufsize = 0;
- /* default values */
- device->up = 1;
- device->rate = 0;
- device->pci.domain = -1; /* per CCI spec */
- device->pci.bus = -1; /* per CCI spec */
- device->pci.dev = -1; /* per CCI spec */
- device->pci.func = -1; /* per CCI spec */
- /* parse conf_argv */
- for (arg = device->conf_argv; *arg != NULL; arg++) {
- if (0 == strncmp("ip=", *arg, 3)) {
- const char *ip = *arg + 3;
- /* network order */
- sdev->ip = inet_addr(ip);
- } else if (0 == strncmp("mtu=", *arg, 4)) {
- const char *mtu_str = *arg + 4;
- mtu = strtol(mtu_str, NULL, 0);
- } else if (0 == strncmp("port=", *arg, 5)) {
- const char *s_port = *arg + 5;
- uint16_t port;
- port = atoi (s_port);
- sdev->port = htons(port);
- } else if (0 == strncmp("bufsize=", *arg, 8)) {
- const char *size_str = *arg + 8;
- sdev->bufsize = strtol(size_str,
- NULL, 0);
- } else if (0 == strncmp("interface=",
- *arg, 10))
- {
- interface = *arg + 10;
- }
- }
- if (sdev->ip != 0 || interface) {
- /* try to get the actual values now */
- #ifdef HAVE_GETIFADDRS
- if (addrs) {
- for (addr = addrs;
- addr != NULL;
- addr = addr->ifa_next)
- {
- struct sockaddr_in *sai;
- if (!addr->ifa_addr)
- continue;
- if (addr->ifa_addr->sa_family != AF_INET)
- continue;
- sai = (struct sockaddr_in *) addr->ifa_addr;
- if (!memcmp(&sdev->ip, &sai->sin_addr, sizeof(sdev->ip)))
- break;
- if (interface &&
- !strcmp(interface, addr->ifa_name)) {
- memcpy(&sdev->ip, &sai->sin_addr, sizeof(sdev->ip));
- break;
- }
- }
- if (!addr)
- /* no such device, don't initialize it */
- continue;
- cci__get_dev_ifaddrs_info(dev, addr);
- }
- #endif
- if (mtu == (uint32_t) -1) {
- /* if mtu not specified, use the ifaddr one */
- mtu = device->max_send_size;
- }
- if (mtu == (uint32_t) -1) {
- /* if still no mtu, use default */
- device->max_send_size = SOCK_DEFAULT_MSS;
- } else {
- /* compute mss from mtu */
- if (mtu > SOCK_UDP_MAX)
- mtu = SOCK_UDP_MAX;
- mtu -= SOCK_MAX_HDR_SIZE;
- assert(mtu >= SOCK_MIN_MSS); /* FIXME rather ignore the device? */
- device->max_send_size = mtu;
- }
- /* queue to the main device list now */
- TAILQ_REMOVE(&globals->configfile_devs, dev, entry);
- cci__add_dev(dev);
- devices[sglobals->count] = device;
- sglobals->count++;
- threads_running = 1;
- }
- }
- }
- devices =
- realloc(devices, (sglobals->count + 1) * sizeof(cci_device_t *));
- devices[sglobals->count] = NULL;
- *((cci_device_t ***) & sglobals->devices) = devices;
- #ifdef HAVE_GETIFADDRS
- freeifaddrs(addrs);
- #endif
- CCI_EXIT;
- return CCI_SUCCESS;
- out:
- if (devices) {
- int i = 0;
- cci_device_t *device;
- cci__dev_t *my_dev;
- while (devices[i] != NULL) {
- device = devices[i];
- my_dev = container_of(device, cci__dev_t, device);
- if (my_dev->priv)
- free(my_dev->priv);
- }
- free(devices);
- }
- if (sglobals) {
- free((void *)sglobals);
- sglobals = NULL;
- }
- #ifdef HAVE_GETIFADDRS
- if (addrs) {
- freeifaddrs(addrs);
- }
- #endif
- CCI_EXIT;
- return ret;
- }
- /* TODO */
- static const char *ctp_sock_strerror(cci_endpoint_t * endpoint,
- enum cci_status status)
- {
- CCI_ENTER;
- UNUSED_PARAM (endpoint);
- UNUSED_PARAM (status);
- CCI_EXIT;
- return NULL;
- }
- /* NOTE the CCI layer has already unbound all devices
- * and destroyed all endpoints.
- * All we need to do if free dev->priv
- */
- static int ctp_sock_finalize(cci_plugin_ctp_t * plugin)
- {
- cci__dev_t *dev = NULL;
- CCI_ENTER;
- UNUSED_PARAM (plugin);
- if (!sglobals) {
- CCI_EXIT;
- return CCI_ENODEV;
- }
- TAILQ_FOREACH(dev, &globals->devs, entry)
- if (!strcmp(dev->device.transport, "sock"))
- free(dev->priv);
- free(sglobals->devices);
- free((void *)sglobals);
- sglobals = NULL;
- CCI_EXIT;
- return CCI_SUCCESS;
- }
- static inline int
- sock_set_nonblocking(cci_os_handle_t sock, sock_fd_type_t type, void *p)
- {
- int ret, flags;
- UNUSED_PARAM (type);
- UNUSED_PARAM (p);
- flags = fcntl(sock, F_GETFL, 0);
- if (-1 == flags)
- flags = 0;
- ret = fcntl(sock, F_SETFL, flags | O_NONBLOCK);
- if (-1 == ret)
- return errno;
- return 0;
- }
- static inline void sock_close_socket(cci_os_handle_t sock)
- {
- close(sock);
- return;
- }
- static int ctp_sock_create_endpoint(cci_device_t * device,
- int flags,
- cci_endpoint_t ** endpointp,
- cci_os_handle_t * fd)
- {
- int ret;
- uint32_t i;
- sock_dev_t *sdev;
- struct sockaddr_in sin;
- socklen_t slen;
- char name[40];
- unsigned int sndbuf_size = SOCK_SNDBUF_SIZE;
- unsigned int rcvbuf_size = SOCK_RCVBUF_SIZE;
- cci__dev_t *dev = NULL;
- cci__ep_t *ep = NULL;
- sock_ep_t *sep = NULL;
- struct cci_endpoint *endpoint = (struct cci_endpoint *) *endpointp;
- CCI_ENTER;
- UNUSED_PARAM (flags);
- if (!sglobals) {
- CCI_EXIT;
- return CCI_ENODEV;
- }
- dev = container_of(device, cci__dev_t, device);
- if (0 != strcmp("sock", device->transport)) {
- ret = CCI_EINVAL;
- goto out;
- }
- ep = container_of(endpoint, cci__ep_t, endpoint);
- ep->priv = calloc(1, sizeof(*sep));
- if (!ep->priv) {
- ret = CCI_ENOMEM;
- goto out;
- }
- ep->rx_buf_cnt = SOCK_EP_RX_CNT;
- ep->tx_buf_cnt = SOCK_EP_TX_CNT;
- ep->buffer_len = dev->device.max_send_size + SOCK_MAX_HDRS;
- ep->tx_timeout = SOCK_EP_TX_TIMEOUT_SEC * 1000000;
- sep = ep->priv;
- sep->ids = calloc(SOCK_NUM_BLOCKS, sizeof(*sep->ids));
- if (!sep->ids) {
- ret = CCI_ENOMEM;
- goto out;
- }
- sep->closing = 0;
- pthread_mutex_init (&sep->progress_mutex, NULL);
- pthread_cond_init (&sep->wait_condition, NULL);
- sep->sock = socket(PF_INET, SOCK_DGRAM, 0);
- if (sep->sock == -1) {
- ret = errno;
- goto out;
- }
- sdev = dev->priv;
- if (sndbuf_size < sdev->bufsize)
- sndbuf_size = sdev->bufsize;
- if (rcvbuf_size < sdev->bufsize)
- rcvbuf_size = sdev->bufsize;
- if (sndbuf_size > 0) {
- ret = setsockopt (sep->sock, SOL_SOCKET, SO_SNDBUF,
- &sndbuf_size, sizeof (sndbuf_size));
- if (ret == -1)
- debug (CCI_DB_WARN,
- "%s: Cannot set send buffer size", __func__);
- }
- if (rcvbuf_size > 0) {
- ret = setsockopt (sep->sock, SOL_SOCKET, SO_RCVBUF,
- &rcvbuf_size, sizeof (rcvbuf_size));
- if (ret == -1)
- debug (CCI_DB_WARN, "%s: Cannot set recv buffer size",
- __func__);
- }
- #if CCI_DEBUG
- {
- socklen_t optlen;
- optlen = sizeof (sndbuf_size);
- ret = getsockopt (sep->sock, SOL_SOCKET, SO_SNDBUF,
- &sndbuf_size, &optlen);
- if (ret == -1)
- debug (CCI_DB_WARN, "%s: Cannot get send buffer size",
- __func__);
- debug (CCI_DB_CTP, "Send buffer size: %d bytes (you may also "
- "want to check the value of net.core.wmem_max using "
- "sysctl)", sndbuf_size);
- optlen = sizeof (rcvbuf_size);
- ret = getsockopt (sep->sock, SOL_SOCKET, SO_RCVBUF,
- &rcvbuf_size, &optlen);
- if (ret == -1)
- debug (CCI_DB_WARN, "%s: Cannot get recv buffer size",
- __func__);
- debug (CCI_DB_CTP, "Receive buffer size: %d bytes (you may also "
- "want to check the value of net.core.rmem_max using "
- "sysctl)", rcvbuf_size);
- }
- #endif
- /* bind socket to device */
- memset(&sin, 0, sizeof(sin));
- sin.sin_family = AF_INET;
- sin.sin_addr.s_addr = sdev->ip;
- if (sdev->port != 0)
- sin.sin_port = sdev->port;
- ret = bind(sep->sock, (const struct sockaddr *)&sin, sizeof(sin));
- if (ret) {
- ret = errno;
- goto out;
- }
- slen = sizeof(sep->sin);
- ret = getsockname(sep->sock, (struct sockaddr *)&sep->sin, &slen);
- if (ret) {
- ret = errno;
- goto out;
- }
- memset(name, 0, sizeof(name));
- sprintf(name, "sock://");
- sock_sin_to_name(sep->sin, name + (uintptr_t) 7, sizeof(name) - 7);
- ep->uri = strdup(name);
- for (i = 0; i < SOCK_EP_HASH_SIZE; i++) {
- TAILQ_INIT(&sep->conn_hash[i]);
- TAILQ_INIT(&sep->active_hash[i]);
- }
- TAILQ_INIT(&sep->idle_txs);
- TAILQ_INIT(&sep->idle_rxs);
- TAILQ_INIT(&sep->handles);
- TAILQ_INIT(&sep->rma_ops);
- TAILQ_INIT(&sep->queued);
- TAILQ_INIT(&sep->pending);
- sep->tx_buf = calloc (1, ep->tx_buf_cnt * ep->buffer_len);
- if (!sep->tx_buf) {
- ret = CCI_ENOMEM;
- goto out;
- }
- sep->txs = calloc (1, ep->tx_buf_cnt * sizeof (sock_tx_t));
- if (!sep->txs) {
- ret = CCI_ENOMEM;
- goto out;
- }
- /* alloc txs */
- for (i = 0; i < ep->tx_buf_cnt; i++) {
- sock_tx_t *tx = &sep->txs[i];
- tx->ctx = SOCK_CTX_TX;
- tx->evt.event.type = CCI_EVENT_SEND;
- tx->evt.ep = ep;
- tx->buffer = (void*)((uintptr_t)sep->tx_buf
- + (i * ep->buffer_len));
- tx->len = 0;
- TAILQ_INSERT_TAIL(&sep->idle_txs, tx, dentry);
- }
- sep->rx_buf = calloc (1, ep->rx_buf_cnt * ep->buffer_len);
- if (!sep->rx_buf) {
- ret = CCI_ENOMEM;
- goto out;
- }
- sep->rxs = calloc (1, ep->rx_buf_cnt * sizeof (sock_rx_t));
- if (!sep->rx_buf) {
- ret = CCI_ENOMEM;
- goto out;
- }
- /* alloc rxs */
- for (i = 0; i < ep->rx_buf_cnt; i++) {
- sock_rx_t *rx = &sep->rxs[i];
- rx->ctx = SOCK_CTX_RX;
- rx->evt.event.type = CCI_EVENT_RECV;
- rx->evt.ep = ep;
- rx->buffer = (void*)((uintptr_t)sep->rx_buf
- + (i * ep->buffer_len));
- rx->len = 0;
- TAILQ_INSERT_TAIL(&sep->idle_rxs, rx, entry);
- }
- ret = sock_set_nonblocking(sep->sock, SOCK_FD_EP, ep);
- if (ret)
- goto out;
- sep->event_fd = 0;
- #ifdef HAVE_SYS_EPOLL_H
- if (fd) {
- int fflags = 0;
- int rc;
- struct epoll_event ev;
- ret = epoll_create (2);
- if (ret == -1) {
- ret = errno;
- goto out;
- }
- sep->event_fd = ret;
- fflags = fcntl(sep->event_fd, F_GETFL, 0);
- if (fflags == -1) {
- ret = errno;
- goto out;
- }
- ret = fcntl(sep->event_fd, F_SETFL, fflags | O_NONBLOCK);
- if (ret == -1) {
- ret = errno;
- goto out;
- }
- ev.data.ptr = (void*)(uintptr_t)sock_recvfrom_ep;
- ev.events = EPOLLIN;
- ret = epoll_ctl (sep->event_fd, EPOLL_CTL_ADD, sep->sock, &ev);
- if (ret == -1) {
- ret = errno;
- goto out;
- }
- rc = pipe (sep->fd);
- if (rc == -1) {
- debug (CCI_DB_WARN, "%s: %s", __func__, strerror (errno));
- return CCI_ERROR;
- }
- *fd = sep->fd[0];
- }
- #else
- if (fd) {
- /* We will have poll on the receive thread so we just need to create a
- pipe so the receive and send thread can wake up the application
- thread */
- pipe (sep->fd);
- *fd = sep->fd[0];
- /* We set event_fd to value different than zero to know that we are
- in blocking mode at the application level */
- sep->event_fd = 1;
- }
- #endif /* HAVE_SYS_EPOLL_H */
- ret = sock_create_threads (ep);
- if (ret)
- goto out;
- CCI_EXIT;
- return CCI_SUCCESS;
- out:
- /* Note that there is no need to remove the ep even in the context of
- a failure because the ep is added to the list of active endpoints
- by cci_create_endpoint(), AFTER the call to this function. */
- if (sep) {
- if (sep->txs)
- free (sep->txs);
- if (sep->tx_buf)
- free (sep->tx_buf);
- if (sep->rxs)
- free (sep->rxs);
- if (sep->rx_buf)
- free (sep->rx_buf);
- if (sep->ids)
- free(sep->ids);
- if (sep->sock)
- sock_close_socket(sep->sock);
- free(sep);
- ep->priv = NULL;
- }
- if (ep) {
- free (ep->uri);
- }
- *endpointp = NULL;
- CCI_EXIT;
- return ret;
- }
- static int ctp_sock_destroy_endpoint(cci_endpoint_t * endpoint)
- {
- cci__ep_t *ep = NULL;
- cci__dev_t *dev = NULL;
- sock_ep_t *sep = NULL;
- CCI_ENTER;
- if (!sglobals) {
- CCI_EXIT;
- return CCI_ENODEV;
- }
- ep = container_of(endpoint, cci__ep_t, endpoint);
- dev = ep->dev;
- sep = ep->priv;
- pthread_mutex_lock(&dev->lock);
- pthread_mutex_lock(&ep->lock);
- if (sep) {
- int i;
- cci__conn_t *conn;
- sock_conn_t *sconn;
- sep->closing = 1;
- pthread_mutex_unlock(&dev->lock);
- pthread_mutex_unlock(&ep->lock);
- sock_terminate_threads (sep);
- pthread_mutex_lock(&dev->lock);
- pthread_mutex_lock(&ep->lock);
- if (sep->fd[0] > 0)
- close (sep->fd[0]);
- if (sep->fd[1] > 0)
- close (sep->fd[1]);
- if (sep->sock)
- sock_close_socket(sep->sock);
- for (i = 0; i < SOCK_EP_HASH_SIZE; i++) {
- while (!TAILQ_EMPTY(&sep->conn_hash[i])) {
- sconn = TAILQ_FIRST(&sep->conn_hash[i]);
- TAILQ_REMOVE(&sep->conn_hash[i], sconn, entry);
- conn = sconn->conn;
- free(conn);
- free(sconn);
- }
- while (!TAILQ_EMPTY(&sep->active_hash[i])) {
- sconn = TAILQ_FIRST(&sep->active_hash[i]);
- TAILQ_REMOVE(&sep->active_hash[i], sconn, entry);
- conn = sconn->conn;
- free(conn);
- free(sconn);
- }
- }
- free (sep->txs);
- free (sep->tx_buf);
- free (sep->rxs);
- free (sep->rx_buf);
- while (!TAILQ_EMPTY(&sep->rma_ops)) {
- sock_rma_op_t *rma_op = TAILQ_FIRST(&sep->rma_ops);
- TAILQ_REMOVE(&sep->rma_ops, rma_op, entry);
- free(rma_op);
- }
- while (!TAILQ_EMPTY(&sep->handles)) {
- sock_rma_handle_t *handle = TAILQ_FIRST(&sep->handles);
- TAILQ_REMOVE(&sep->handles, handle, entry);
- free(handle);
- }
- if (sep->ids)
- free(sep->ids);
- free(sep);
- ep->priv = NULL;
- }
- ep->priv = NULL;
- if (ep->uri)
- free((char *)ep->uri);
- pthread_mutex_unlock(&ep->lock);
- pthread_mutex_unlock(&dev->lock);
- CCI_EXIT;
- return CCI_SUCCESS;
- }
- static void sock_get_id(sock_ep_t * ep, uint32_t * id)
- {
- uint32_t n, block, offset;
- uint64_t *b;
- while (1) {
- n = random() % SOCK_NUM_BLOCKS;
- block = n / SOCK_BLOCK_SIZE;
- offset = n % SOCK_BLOCK_SIZE;
- b = &ep->ids[block];
- if ((*b & (1ULL << offset)) == 0) {
- *b |= (1ULL << offset);
- *id = (block * SOCK_BLOCK_SIZE) + offset;
- break;
- }
- }
- return;
- }
#if 0
/*
 * Release an id previously reserved with sock_get_id().
 * Currently compiled out.
 *
 * The bit mask is built with 1ULL so that offsets of 32 and above address
 * the right bit of the 64-bit word, and the assert only requires the bit
 * to be set (comparing against 1 would only hold for offset 0).
 */
static void sock_put_id(sock_ep_t * ep, uint32_t id)
{
	uint32_t block, offset;
	uint64_t *b;
	uint64_t bit;

	block = id / SOCK_BLOCK_SIZE;
	offset = id % SOCK_BLOCK_SIZE;
	b = &ep->ids[block];
	bit = 1ULL << offset;

	assert((*b & bit) != 0);
	*b &= ~bit;

	return;
}
#endif
- static inline uint32_t sock_get_new_seq(void)
- {
- return ((uint32_t) random() & SOCK_SEQ_MASK);
- }
/*
 * Hash an IPv4 address and port to an 8-bit index (0..255), used to select
 * one of the endpoint's connection lists.
 *
 * All six input bytes take part: the two 16-bit halves of ip are XORed
 * into port, then the two bytes of that 16-bit value are XORed together.
 */
static uint8_t sock_ip_hash(in_addr_t ip, uint16_t port)
{
	uint16_t low_half = (uint16_t) (ip & 0x0000FFFF);
	uint16_t high_half = (uint16_t) ((ip & 0xFFFF0000) >> 16);
	uint16_t folded = (uint16_t) (port ^ low_half ^ high_half);

	return (uint8_t) ((folded & 0x00FF) ^ ((folded & 0xFF00) >> 8));
}
- static int ctp_sock_accept(cci_event_t *event, const void *context)
- {
- uint8_t a;
- uint16_t b;
- uint32_t unused;
- uint32_t peer_seq;
- uint32_t peer_ts;
- int i;
- cci_endpoint_t *endpoint;
- cci__ep_t *ep = NULL;
- cci__conn_t *conn = NULL;
- cci__evt_t *evt = NULL;
- cci__dev_t *dev = NULL;
- sock_ep_t *sep = NULL;
- sock_conn_t *sconn = NULL;
- sock_header_r_t *hdr_r = NULL;
- sock_msg_type_t type;
- sock_tx_t *tx = NULL;
- sock_rx_t *rx = NULL;
- sock_handshake_t *hs = NULL;
- uint32_t id, ack, max_recv_buffer_count, mss = 0, ka;
- CCI_ENTER;
- if (!sglobals) {
- CCI_EXIT;
- return CCI_ENODEV;
- }
- evt = container_of(event, cci__evt_t, event);
- rx = container_of(evt, sock_rx_t, evt);
- ep = evt->ep;
- endpoint = &ep->endpoint;
- sep = ep->priv;
- dev = ep->dev;
- conn = calloc(1, sizeof(*conn));
- if (!conn) {
- CCI_EXIT;
- return CCI_ENOMEM;
- }
- conn->plugin = ep->plugin;
- conn->tx_timeout = ep->tx_timeout;
- conn->priv = calloc(1, sizeof(*sconn));
- if (!conn->priv) {
- free(conn);
- CCI_EXIT;
- return CCI_ENOMEM;
- }
- /* get a tx */
- tx = sock_get_tx (ep);
- if (!tx) {
- free(conn->priv);
- free(conn);
- CCI_EXIT;
- return CCI_ENOBUFS;
- }
- tx->rma_ptr = NULL;
- tx->rma_len = 0;
- hdr_r = rx->buffer;
- sock_parse_header(&hdr_r->header, &type, &a, &b, &unused);
- sock_parse_seq_ts(&hdr_r->seq_ts, &peer_seq, &peer_ts);
- conn->connection.attribute = (enum cci_conn_attribute)a;
- conn->connection.endpoint = endpoint;
- conn->connection.context = (void *)context;
- conn->connection.max_send_size = dev->device.max_send_size;
- hs = (sock_handshake_t *)((uintptr_t)rx->buffer +
- (uintptr_t) sizeof(sock_header_r_t));
- sock_parse_handshake(hs, &id, &ack, &max_recv_buffer_count, &mss, &ka);
- if (ka != 0UL) {
- debug(CCI_DB_CONN, "%s: keepalive timeout: %d", __func__, ka);
- conn->keepalive_timeout = ka;
- }
- if (mss < SOCK_MIN_MSS) {
- /* FIXME do what? */
- }
- if (mss < conn->connection.max_send_size)
- conn->connection.max_send_size = mss;
- sconn = conn->priv;
- TAILQ_INIT(&sconn->tx_seqs);
- TAILQ_INIT(&sconn->acks);
- TAILQ_INIT(&sconn->rmas);
- sconn->conn = conn;
- sconn->cwnd = SOCK_INITIAL_CWND;
- sconn->status = SOCK_CONN_READY; /* set ready since the app thinks it is */
- sconn->last_recvd_seq = 0;
- *((struct sockaddr_in *)&sconn->sin) = rx->sin;
- sconn->peer_id = id;
- sock_get_id(sep, &sconn->id);
- sconn->seq = sock_get_new_seq(); /* even for UU since this reply is reliable */
- sconn->seq_pending = sconn->seq - 1;
- if (cci_conn_is_reliable(conn)) {
- sconn->max_tx_cnt = max_recv_buffer_count < ep->tx_buf_cnt ?
- max_recv_buffer_count : ep->tx_buf_cnt;
- sconn->last_ack_seq = sconn->seq;
- sconn->last_ack_ts = sock_get_usecs();
- sconn->ssthresh = sconn->max_tx_cnt;
- sconn->seq_pending = sconn->seq;
- }
- /* insert in sock ep's list of conns */
- i = sock_ip_hash(sconn->sin.sin_addr.s_addr, sconn->sin.sin_port);
- pthread_mutex_lock(&ep->lock);
- TAILQ_INSERT_TAIL(&sep->conn_hash[i], sconn, entry);
- pthread_mutex_unlock(&ep->lock);
- debug_ep(ep, CCI_DB_CONN, "%s: accepting conn with hash %d",
- __func__, i);
- /* prepare conn_reply */
- tx->msg_type = SOCK_MSG_CONN_REPLY;
- tx->last_attempt_us = 0ULL;
- tx->timeout_us = 0ULL;
- tx->rma_op = NULL;
- evt = &tx->evt;
- evt->ep = ep;
- evt->conn = conn;
- evt->event.type = CCI_EVENT_ACCEPT;
- evt->event.accept.status = CCI_SUCCESS; /* for now */
- evt->event.accept.context = (void *)context;
- evt->event.accept.connection = &conn->connection;
- /* pack the msg */
- hdr_r = (sock_header_r_t *) tx->buffer;
- sock_pack_conn_reply(&hdr_r->header, CCI_SUCCESS /* FIXME */ ,
- sconn->peer_id);
- sock_pack_seq_ts(&hdr_r->seq_ts, sconn->seq,
- (uint32_t) sconn->last_ack_ts);
- hs = (sock_handshake_t *) ((uintptr_t)tx->buffer + sizeof(*hdr_r));
- sock_pack_handshake(hs, sconn->id, peer_seq,
- ep->rx_buf_cnt,
- conn->connection.max_send_size, 0);
- tx->len = sizeof(*hdr_r) + sizeof(*hs);
- tx->seq = sconn->seq;
- debug_ep(ep, CCI_DB_CONN, "%s: queuing conn_reply with seq %u ts %x",
- __func__, sconn->seq, sconn->ts);
- /* insert at tail of device's queued list */
- tx->state = SOCK_TX_QUEUED;
- pthread_mutex_lock(&ep->lock);
- TAILQ_INSERT_TAIL(&sep->queued, &tx->evt, entry);
- pthread_mutex_unlock(&ep->lock);
- /* try to progress txs */
- pthread_mutex_lock(&sep->progress_mutex);
- pthread_cond_signal(&sep->wait_condition);
- pthread_mutex_unlock(&sep->progress_mutex);
-
- CCI_EXIT;
- return CCI_SUCCESS;
- }
- /* Send reject reply to client.
- *
- * We cannot use the event's buffer since the app will most likely return the
- * event before we get an ack from the client. We will get a tx for the reply.
- */
- static int ctp_sock_reject(cci_event_t *event)
- {
- int ret = CCI_SUCCESS;
- uint8_t a;
- uint16_t b;
- uint32_t peer_id;
- uint32_t peer_seq;
- uint32_t peer_ts;
- cci__evt_t *evt = NULL;
- cci__ep_t *ep = NULL;
- sock_ep_t *sep = NULL;
- sock_header_r_t *hdr_r = NULL;
- sock_msg_type_t type;
- sock_rx_t *rx = NULL;
- sock_tx_t *tx = NULL;
- CCI_ENTER;
- if (!sglobals) {
- CCI_EXIT;
- return CCI_ENODEV;
- }
- evt = container_of(event, cci__evt_t, event);
- ep = evt->ep;
- sep = ep->priv;
- rx = container_of(evt, sock_rx_t, evt);
- hdr_r = rx->buffer;
- sock_parse_header(&hdr_r->header, &type, &a, &b, &peer_id);
- sock_parse_seq_ts(&hdr_r->seq_ts, &peer_seq, &peer_ts);
- /* get a tx */
- tx = sock_get_tx (ep);
- if (!tx) {
- ret = CCI_ENOBUFS;
- goto out;
- }
- tx->rma_ptr = NULL;
- tx->rma_len = 0;
- /* prep the tx */
- tx->msg_type = SOCK_MSG_CONN_REPLY;
- tx->evt.ep = ep;
- tx->evt.conn = NULL;
- tx->evt.event.type = CCI_EVENT_CONNECT;
- tx->evt.event.connect.status = CCI_ECONNREFUSED;
- tx->evt.event.connect.connection = NULL;
- tx->last_attempt_us = 0ULL;
- tx->timeout_us = 0ULL;
- tx->rma_op = NULL;
- tx->sin = rx->sin;
- /* prepare conn_reply */
- hdr_r = (sock_header_r_t *) tx->buffer;
- sock_pack_conn_reply(&hdr_r->header, CCI_ECONNREFUSED, peer_id);
- sock_pack_seq_ts(&hdr_r->seq_ts, peer_seq, 0);
- tx->len = sizeof(*hdr_r);
- tx->state = SOCK_TX_QUEUED;
- /* We have no connection and the request is rejected so we generate
- a new seq since the client may or not ack the conn_reply. In the
- worst case, the conn_reply associated to the reject is thrown away
- when it times out */
- tx->seq = sock_get_new_seq ();
- /* insert at tail of endpoint's queued list */
- pthread_mutex_lock(&ep->lock);
- TAILQ_INSERT_TAIL(&sep->queued, &tx->evt, entry);
- pthread_mutex_unlock(&ep->lock);
- /* try to progress txs */
- pthread_mutex_lock(&sep->progress_mutex);
- pthread_cond_signal(&sep->wait_condition);
- pthread_mutex_unlock(&sep->progress_mutex);
-
- #if CCI_DEBUG
- {
- char name[32];
- memset(name, 0, sizeof(name));
- sock_sin_to_name(rx->sin, name, sizeof(name));
- debug_ep(ep, (CCI_DB_MSG | CCI_DB_CONN),
- "%s: queued conn_reply (reject) to %s (seq %u)",
- __func__, name, tx->seq);
- }
- #endif
- out:
- CCI_EXIT;
- return ret;
- }
/*
 * Resolve a "sock://host:port" URI into an IPv4 address and port.
 *
 * uri  - NUL-terminated URI; must begin with "sock://" and contain a ':'
 *        separating the host from the service/port.
 * in   - out: sin_addr.s_addr of the first result returned by getaddrinfo()
 * port - out: sin_port of the first result returned by getaddrinfo()
 *
 * Resolution is restricted to AF_INET / SOCK_DGRAM / IPPROTO_UDP.
 *
 * Returns CCI_SUCCESS on success, CCI_EINVAL when uri is NULL or malformed,
 * CCI_ENOMEM when the working copy of the URI cannot be allocated, or the
 * non-zero getaddrinfo() error code when resolution fails (note that this
 * is an EAI_* value, not a CCI status).
 */
static int sock_getaddrinfo(const char *uri, in_addr_t * in, uint16_t * port)
{
	int ret;
	char *hostname, *svc, *colon;
	struct addrinfo *ai = NULL, hints;
	const struct sockaddr_in *resolved;

	CCI_ENTER;

	if (!uri || 0 != strncmp("sock://", uri, 7)) {
		CCI_EXIT;
		return CCI_EINVAL;
	}

	/* Work on a private copy so the host part can be NUL-terminated */
	hostname = strdup(&uri[7]);
	if (!hostname) {
		CCI_EXIT;
		return CCI_ENOMEM;
	}

	colon = strchr(hostname, ':');
	if (!colon) {
		free(hostname);
		CCI_EXIT;
		return CCI_EINVAL;
	}

	/* Split "host:svc" in place; svc points just past the colon */
	*colon = '\0';
	svc = colon + 1;

	memset(&hints, 0, sizeof(hints));
	hints.ai_family = AF_INET;
	hints.ai_socktype = SOCK_DGRAM;
	hints.ai_protocol = IPPROTO_UDP;

	ret = getaddrinfo(hostname, svc, &hints, &ai);
	free(hostname);

	if (ret) {
		if (ai)
			freeaddrinfo(ai);
		CCI_EXIT;
		return ret;
	}

	/* hints.ai_family is AF_INET, so the result is a sockaddr_in */
	resolved = (const struct sockaddr_in *)ai->ai_addr;
	*in = resolved->sin_addr.s_addr;
	*port = resolved->sin_port;
	freeaddrinfo(ai);

	CCI_EXIT;
	return CCI_SUCCESS;
}
- static sock_conn_t *sock_find_open_conn(sock_ep_t * sep, in_addr_t ip,
- uint16_t port, uint32_t id)
- {
- uint8_t i;
- struct s_conns *conn_list;
- sock_conn_t *sconn = NULL, *sc;
- CCI_ENTER;
- i = sock_ip_hash(ip, port);
- conn_list = &sep->conn_hash[i];
- TAILQ_FOREACH(sc, conn_list, entry) {
- if (sc->sin.sin_addr.s_addr == ip &&
- sc->sin.sin_port == port && sc->id == id) {
- sconn = sc;
- break;
- }
- }
- CCI_EXIT;
- return sconn;
- }
- static sock_conn_t *sock_find_active_conn(sock_ep_t * sep, in_addr_t ip,
- uint32_t id)
- {
- uint8_t i;
- struct s_active *active_list;
- sock_conn_t *sconn = NULL, *sc;
- CCI_ENTER;
- i = sock_ip_hash(ip, 0);
- active_list = &sep->active_hash[i];
- TAILQ_FOREACH(sc, active_list, entry) {
- if (sc->sin.sin_addr.s_addr == ip && sc->id == id) {
- sconn = sc;
- break;
- }
- }
- CCI_EXIT;
- return sconn;
- }
- static sock_conn_t *sock_find_conn(sock_ep_t * sep, in_addr_t ip, uint16_t port,
- uint32_t id, sock_msg_type_t type)
- {
- switch (type) {
- case SOCK_MSG_CONN_REPLY:
- return sock_find_active_conn(sep, ip, id);
- default:
- return sock_find_open_conn(sep, ip, port, id);
- }
- }
- static int ctp_sock_connect(cci_endpoint_t * endpoint,
- const char *server_uri,
- const void *data_ptr,
- uint32_t data_len,
- cci_conn_attribute_t attribute,
- const void *context,
- int flags,
- const struct timeval *timeout)
- {
- int ret;
- int i;
- cci__ep_t *ep = NULL;
- cci__dev_t *dev = NULL;
- cci__conn_t *conn = NULL;
- sock_ep_t *sep = NULL;
- sock_conn_t *sconn = NULL;
- sock_tx_t *tx = NULL;
- sock_header_r_t *hdr_r = NULL;
- cci__evt_t *evt = NULL;
- struct cci_connection *connection = NULL;
- struct sockaddr_in *sin = NULL;
- void *ptr = NULL;
- in_addr_t ip;
- uint32_t ts = 0;
- struct s_active *active_list;
- sock_handshake_t *hs = NULL;
- uint16_t port;
- uint32_t keepalive = 0ULL;
- CCI_ENTER;
- UNUSED_PARAM (flags);
- UNUSED_PARAM (timeout);
- if (!sglobals) {
- CCI_EXIT;
- return CCI_ENODEV;
- }
- /* allocate a new connection */
- conn = calloc(1, sizeof(*conn));
- if (!conn) {
- CCI_EXIT;
- return CCI_ENOMEM;
- }
- conn->priv = calloc(1, sizeof(*sconn));
- if (!conn->priv) {
- ret = CCI_ENOMEM;
- goto out;
- }
- sconn = conn->priv;
- sconn->conn = conn;
- TAILQ_INIT(&sconn->tx_seqs);
- TAILQ_INIT(&sconn->acks);
- TAILQ_INIT(&sconn->rmas);
- /* conn->tx_timeout = 0 by default */
- connection = &conn->connection;
- connection->attribute = attribute;
- connection->endpoint = endpoint;
- connection->context = (void *)context;
- /* set up sock specific info */
- sconn->status = SOCK_CONN_ACTIVE;
- sconn->cwnd = SOCK_INITIAL_CWND;
- sconn->last_recvd_seq = 0;
- sin = (struct sockaddr_in *)&sconn->sin;
- memset(sin, 0, sizeof(*sin));
- sin->sin_family = AF_INET;
- ret = sock_getaddrinfo(server_uri, &ip, &port);
- if (ret)
- goto out;
- sin->sin_addr.s_addr = ip; /* already in network order */
- sin->sin_port = port; /* already in network order */
- /* peer will assign id */
- /* get our endpoint and device */
- ep = container_of(endpoint, cci__ep_t, endpoint);
- sep = ep->priv;
- dev = ep->dev;
- connection->max_send_size = dev->device.max_send_size;
- conn->plugin = ep->plugin;
- /* Dealing with keepalive, if set, include the keepalive timeout value into
- the connection request */
- if ((((attribute & CCI_CONN_ATTR_RO) == CCI_CONN_ATTR_RO)
- || ((attribute & CCI_CONN_ATTR_RU) == CCI_CONN_ATTR_RU))
- && ep->keepalive_timeout != 0UL) {
- keepalive = ep->keepalive_timeout;
- }
- i = sock_ip_hash(ip, 0);
- active_list = &sep->active_hash[i];
- pthread_mutex_lock(&ep->lock);
- TAILQ_INSERT_TAIL(active_list, sconn, entry);
- pthread_mutex_unlock(&ep->lock);
- /* get a tx */
- tx = sock_get_tx (ep);
- if (!tx) {
- /* FIXME leak */
- CCI_EXIT;
- return CCI_ENOBUFS;
- }
- tx->rma_ptr = NULL;
- tx->rma_len = 0;
- /* prep the tx */
- tx->msg_type = SOCK_MSG_CONN_REQUEST;
- evt = &tx->evt;
- evt->ep = ep;
- evt->conn = conn;
- evt->event.type = CCI_EVENT_CONNECT; /* for now */
- evt->event.connect.status = CCI_SUCCESS;
- evt->event.connect.context = (void *)context;
- evt->event.connect.connection = connection;
- /* pack the msg */
- hdr_r = (sock_header_r_t *) tx->buffer;
- sock_get_id(sep, &sconn->id);
- sock_pack_conn_request(&hdr_r->header, attribute,
- (uint16_t) data_len, sconn->id);
- tx->len = sizeof(*hdr_r);
- /* add seq and ack */
- sconn->seq = sock_get_new_seq();
- sconn->seq_pending = sconn->seq - 1;
- sconn->last_ack_seq = sconn->seq;
- tx->seq = sconn->seq;
- sock_pack_seq_ts(&hdr_r->seq_ts, tx->seq, ts);
- /* add handshake */
- hs = (sock_handshake_t *) & hdr_r->data;
- if (keepalive != 0UL)
- conn->keepalive_timeout = keepalive;
- sock_pack_handshake(hs, sconn->id, 0,
- ep->rx_buf_cnt,
- connection->max_send_size, keepalive);
- tx->len += sizeof(*hs);
- ptr = (void*)((uintptr_t)tx->buffer + tx->len);
- debug_ep(ep,CCI_DB_CONN, "%s: queuing conn_request with seq %u ts %x",
- __func__, tx->seq, ts);
- /* zero even if unreliable */
- tx->last_attempt_us = 0ULL;
- tx->timeout_us = 0ULL;
- tx->rma_op = NULL;
- if (data_len)
- memcpy(ptr, data_ptr, data_len);
- tx->len += data_len;
- assert(tx->len <= ep->buffer_len);
- /* insert at tail of device's queued list */
- tx->state = SOCK_TX_QUEUED;
- pthread_mutex_lock(&ep->lock);
- TAILQ_INSERT_TAIL(&sep->queued, &tx->evt, entry);
- pthread_mutex_unlock(&ep->lock);
- /* try to progress txs */
- pthread_mutex_lock(&sep->progress_mutex);
- pthread_cond_signal(&sep->wait_condition);
- pthread_mutex_unlock(&sep->progress_mutex);
- CCI_EXIT;
- return CCI_SUCCESS;
- out:
- if (conn) {
- if (conn->uri)
- free((char *)conn->uri);
- if (conn->priv)
- free(conn->priv);
- free(conn);
- }
- CCI_EXIT;
- return ret;
- }
- static int ctp_sock_disconnect(cci_connection_t * connection)
- {
- int i = 0;
- cci__conn_t *conn = NULL;
- cci__ep_t *ep = NULL;
- sock_conn_t *sconn = NULL;
- sock_ep_t *sep = NULL;
- CCI_ENTER;
- if (!sglobals) {
- CCI_EXIT;
- return CCI_ENODEV;
- }
- /* need to clean up */
- /* remove conn from ep->conn_hash[i] */
- /* if sock conn uri, free it
- * free sock conn
- * free conn
- */
- conn = container_of(connection, cci__conn_t, connection);
- sconn = conn->priv;
- ep = container_of(connection->endpoint, cci__ep_t, endpoint);
- sep = ep->priv;
- if (conn->uri)
- free((char *)conn->uri);
- i = sock_ip_hash(sconn->sin.sin_addr.s_addr, sconn->sin.sin_port);
- pthread_mutex_lock(&ep->lock);
- TAILQ_REMOVE(&sep->conn_hash[i], sconn, entry);
- pthread_mutex_unlock(&ep->lock);
- free(sconn);
- free(conn);
- CCI_EXIT;
- return CCI_SUCCESS;
- }
- static int ctp_sock_set_opt(cci_opt_handle_t * handle,
- cci_opt_name_t name, const void *val)
- {
- int ret = CCI_SUCCESS;
- cci__ep_t *ep = NULL;
- cci__conn_t *conn = NULL;
- CCI_ENTER;
- if (!sglobals) {
- CCI_EXIT;
- return CCI_ENODEV;
- }
- switch (name) {
- case CCI_OPT_ENDPT_SEND_TIMEOUT:
- ep = container_of(handle, cci__ep_t, endpoint);
- ep->tx_timeout = *((uint32_t*) val);
- break;
- case CCI_OPT_ENDPT_RECV_BUF_COUNT:
- ret = CCI_ERR_NOT_IMPLEMENTED;
- break;
- case CCI_OPT_ENDPT_SEND_BUF_COUNT:
- ret = CCI_ERR_NOT_IMPLEMENTED;
- break;
- case CCI_OPT_ENDPT_KEEPALIVE_TIMEOUT:
- ep = container_of(handle, cci__ep_t, endpoint);
- ep->keepalive_timeout = *((uint32_t*) val);
- break;
- case CCI_OPT_CONN_SEND_TIMEOUT:
- conn->tx_timeout = *((uint32_t*) val);
- break;
- default:
- debug(CCI_DB_INFO, "%s: unknown option %u", __func__, name);
- ret = CCI_EINVAL;
- }
- CCI_EXIT;
- return ret;
- }
- static int ctp_sock_get_opt(cci_opt_handle_t * handle,
- cci_opt_name_t name, void *val)
- {
- int ret = CCI_SUCCESS;
- cci_endpoint_t *endpoint = NULL;
- cci__ep_t *ep = NULL;
- CCI_ENTER;
- if (!sglobals) {
- CCI_EXIT;
- return CCI_ENODEV;
- }
- endpoint = handle;
- ep = container_of(endpoint, cci__ep_t, endpoint);
- assert (ep);
-
- switch (name) {
- case CCI_OPT_ENDPT_RECV_BUF_COUNT:
- {
- uint32_t *cnt = val;
- *cnt = ep->rx_buf_cnt;
- break;
- }
- case CCI_OPT_ENDPT_SEND_BUF_COUNT:
- {
- uint32_t *cnt = val;
- *cnt = ep->tx_buf_cnt;
- break;
- }
- case CCI_OPT_ENDPT_KEEPALIVE_TIMEOUT:
- {
- uint32_t *timeout = val;
- *timeout = ep->keepalive_timeout;
- break;
- }
- default:
- /* Invalid opt name */
- ret = CCI_EINVAL;
- }
- CCI_EXIT;
- return ret;
- }
- static int ctp_sock_arm_os_handle(cci_endpoint_t * endpoint, int flags)
- {
- CCI_ENTER;
- UNUSED_PARAM (endpoint);
- UNUSED_PARAM (flags);
- if (!sglobals) {
- CCI_EXIT;
- return CCI_ENODEV;
- }
- CCI_EXIT;
- return CCI_ERR_NOT_IMPLEMENTED;
- }
- static int
- ctp_sock_get_event(cci_endpoint_t * endpoint, cci_event_t ** const event)
- {
- int ret = CCI_SUCCESS;
- cci__ep_t *ep;
- sock_ep_t *sep;
- cci__evt_t *ev = NULL, *e;
- CCI_ENTER;
- if (!sglobals) {
- CCI_EXIT;
- return CCI_ENODEV;
- }
- ep = container_of(endpoint, cci__ep_t, endpoint);
- sep = ep->priv;
- /* try to progress sends... */
- if (!sep->closing) {
- pthread_mutex_lock(&sep->progress_mutex);
- pthread_cond_signal(&sep->wait_condition);
- pthread_mutex_unlock(&sep->progress_mutex);
- }
- pthread_mutex_lock(&ep->lock);
- /* give the user the first event */
- TAILQ_FOREACH(e, &ep->evts, entry) {
- if (e->event.type == CCI_EVENT_SEND) {
- /* NOTE: if it is blocking, skip it since sock_sendv()
- * is waiting on it
- */
- sock_tx_t *tx = container_of(e, sock_tx_t, evt);
- if (tx->flags & CCI_FLAG_BLOCKING) {
- continue;
- } else {
- ev = e;
- break;
- }
- } else {
- ev = e;
- break;
- }
- }
- if (ev) {
- TAILQ_REMOVE(&ep->evts, ev, entry);
- *event = &ev->event;
- } else {
- *event = NULL;
- /* No event is available and there are no available
- receive buffers. The application must return events
- before any more messages can be received. */
- if (TAILQ_EMPTY(&sep->idle_rxs)) {
- ret = CCI_ENOBUFS;
- } else {
- ret = CCI_EAGAIN;
- }
- }
- pthread_mutex_unlock(&ep->lock);
- /* We read on the fd to block again */
- if (ev && sep->event_fd) {
- char a[1];
- int rc;
- /* We bock again only and only if there is no more
- pending events */
- if (event_queue_is_empty (ep)) {
- /* Draining events so the app thread can block */
- rc = read (sep->fd[0], a, sizeof (a));
- if (rc != sizeof (a)) {
- ret = CCI_ERROR;
- }
- }
- }
- CCI_EXIT;
- return ret;
- }
- static int ctp_sock_return_event(cci_event_t * event)
- {
- cci__ep_t *ep;
- sock_ep_t *sep;
- cci__evt_t *evt;
- sock_tx_t *tx;
- sock_rx_t *rx;
- int ret = CCI_SUCCESS;
- CCI_ENTER;
- if (!sglobals) {
- CCI_EXIT;
- return CCI_ENODEV;
- }
- if (!event) {
- CCI_EXIT;
- return CCI_SUCCESS;
- }
- evt = container_of(event, cci__evt_t, event);
- ep = evt->ep;
- sep = ep->priv;
- /* enqueue the event */
- switch (event->type) {
- case CCI_EVENT_SEND:
- case CCI_EVENT_ACCEPT:
- tx = container_of(evt, sock_tx_t, evt);
- pthread_mutex_lock(&ep->lock);
- /* insert at head to keep it in cache */
- TAILQ_INSERT_HEAD(&sep->idle_txs, tx, dentry);
- pthread_mutex_unlock(&ep->lock);
- break;
- case CCI_EVENT_RECV:
- case CCI_EVENT_CONNECT_REQUEST:
- rx = container_of(evt, sock_rx_t, evt);
- pthread_mutex_lock(&ep->lock);
- /* insert at head to keep it in cache */
- TAILQ_INSERT_HEAD(&sep->idle_rxs, rx, entry);
- pthread_mutex_unlock(&ep->lock);
- break;
- case CCI_EVENT_CONNECT:
- rx = container_of (evt, sock_rx_t, evt);
- if (rx->ctx == SOCK_CTX_RX) {
- pthread_mutex_lock(&ep->lock);
- TAILQ_INSERT_HEAD (&sep->idle_rxs, rx, entry);
- pthread_mutex_unlock(&ep->lock);
- } else {
- tx = (sock_tx_t*)rx;
- pthread_mutex_lock(&ep->lock);
- TAILQ_INSERT_HEAD (&sep->idle_txs, tx, dentry);
- pthread_mutex_unlock(&ep->lock);
- }
- break;
- default:
- debug (CCI_DB_EP,
- "%s: unhandled %s event", __func__,
- cci_event_type_str(event->type));
- ret = CCI_ERROR;
- break;
- }
- CCI_EXIT;
- return ret;
- }
- static void sock_progress_pending(cci__ep_t * ep)
- {
- int ret;
- uint64_t now;
- sock_tx_t *tx;
- cci__evt_t *evt, *tmp, *my_temp_evt;
- union cci_event *event; /* generic CCI event */
- cci__conn_t *conn;
- sock_conn_t *sconn = NULL;
- sock_ep_t *sep = ep->priv;
- TAILQ_HEAD(s_idle_txs, sock_tx) idle_txs
- = TAILQ_HEAD_INITIALIZER(idle_txs);
- TAILQ_HEAD(s_evts, cci__evt) evts = TAILQ_HEAD_INITIALIZER(evts);
- TAILQ_INIT(&idle_txs);
- TAILQ_INIT(&evts);
- CCI_ENTER;
- now = sock_get_usecs();
- /* This is only for reliable messages.
- * Do not dequeue txs, just walk the list.
- */
- pthread_mutex_lock (&ep->lock);
- TAILQ_FOREACH_SAFE(evt, &sep->pending, entry, tmp) {
- sock_tx_t *tx = container_of (evt, sock_tx_t, evt);
- conn = evt->conn;
- if (conn)
- sconn = conn->priv;
- event = &evt->event;
- assert(tx->last_attempt_us != 0ULL);
- /* has it timed out? */
- if (SOCK_U64_LT(tx->timeout_us, now)) {
- /* dequeue */
- debug_ep(ep, CCI_DB_WARN,
- "%s: timeout of %s msg (seq %u)",
- __func__, sock_msg_type(tx->msg_type),
- tx->seq);
- TAILQ_REMOVE(&sep->pending, &tx->evt, entry);
- /* set status and add to completed events */
- if (tx->msg_type == SOCK_MSG_SEND)
- sconn->pending--;
- switch (tx->msg_type) {
- case SOCK_MSG_SEND:
- event->send.status = CCI_ETIMEDOUT;
- if (tx->rnr != 0) {
- event->send.status = CCI_ERR_RNR;
- /* If a message that is already marked
- RNR times out, and if the connection
- is reliable and ordered, we mark all
- following messages as RNR */
- if (conn->connection.attribute == CCI_CONN_ATTR_RO) {
- sock_tx_t *my_temp_tx;
- TAILQ_FOREACH_SAFE(my_temp_evt,
- &sep->pending,
- entry,
- tmp)
- {
- my_temp_tx = container_of (my_temp_evt, sock_tx_t, evt);
- if (my_temp_tx->seq > tx->seq)
- my_temp_tx->rnr = 1;
- }
- }
- }
- break;
- case SOCK_MSG_RMA_READ_REQUEST:
- case SOCK_MSG_RMA_WRITE:
- pthread_mutex_lock(&ep->lock);
- tx->rma_op->pending--;
- tx->rma_op->status = CCI_ETIMEDOUT;
- pthread_mutex_unlock(&ep->lock);
- break;
- case SOCK_MSG_CONN_REQUEST: {
- int i;
- struct s_active *active_list;
- event->connect.status = CCI_ETIMEDOUT;
- event->connect.connection = NULL;
- if (conn->uri)
- free((char *)conn->uri);
- sconn->status = SOCK_CONN_CLOSING;
- i = sock_ip_hash(sconn->sin.sin_addr.s_addr,
- 0);
- active_list = &sep->active_hash[i];
- pthread_mutex_lock(&ep->lock);
- TAILQ_REMOVE(active_list, sconn, entry);
- pthread_mutex_unlock(&ep->lock);
- free(sconn);
- free(conn);
- sconn = NULL;
- conn = NULL;
- tx->evt.ep = ep;
- tx->evt.conn = NULL;
- break;
- }
- case SOCK_MSG_CONN_REPLY: {
- /* The client is not requiered to ack a
- conn_reply in the context of a reject, so
- we just ignore the timeout in that
- context */
- if (tx->evt.event.connect.status
- == CCI_ECONNREFUSED)
- {
- /* store locally until we can drop the
- dev->lock */
- debug_ep (ep, CCI_DB_CONN,
- "%s: No ACK of the reject, "
- "dropping pending msg",
- __func__);
- TAILQ_INSERT_HEAD(&idle_txs,
- tx,
- dentry);
- break;
- }
- }
- case SOCK_MSG_CONN_ACK:
- default:
- /* TODO */
- CCI_EXIT;
- return;
- }
- /* if SILENT, put idle tx */
- if (tx->flags & CCI_FLAG_SILENT &&
- (tx->msg_type == SOCK_MSG_SEND ||
- tx->msg_type == SOCK_MSG_RMA_WRITE)) {
- tx->state = SOCK_TX_IDLE;
- /* store locally until we can drop the
- dev->lock */
- TAILQ_INSERT_HEAD(…
Large files files are truncated, but you can click here to view the full file