
/src/plugins/ctp/sock/ctp_sock_api.c

https://github.com/storage-zuiwanyuan/cci
Possible License(s): BSD-3-Clause
/* vim: set tabstop=8:softtabstop=8:shiftwidth=8:noexpandtab */
/*
 * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
 * Copyright © 2010-2013 UT-Battelle, LLC. All rights reserved.
 * Copyright © 2010-2013 Oak Ridge National Labs. All rights reserved.
 * Copyright © 2012 inria. All rights reserved.
 *
 * See COPYING in top-level directory
 *
 * $COPYRIGHT$
 *
 */

#if defined(__INTEL_COMPILER)
#pragma warning(disable:593)
#pragma warning(disable:869)
#pragma warning(disable:981)
#pragma warning(disable:1338)
#pragma warning(disable:2259)
#endif /* __INTEL_COMPILER */

#include "cci/private_config.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <netdb.h>
#include <fcntl.h>
#include <inttypes.h>
#ifdef HAVE_IFADDRS_H
#include <ifaddrs.h>
#include <net/if.h>
#endif
#ifdef HAVE_SYS_EPOLL_H
#include <sys/epoll.h>
#else
#include <poll.h>
#endif /* HAVE_SYS_EPOLL_H */

#include "cci.h"
#include "cci_lib_types.h"
#include "cci-api.h"
#include "plugins/ctp/ctp.h"
#include "ctp_sock_internals.h"

#define DEBUG_RNR 0

#if DEBUG_RNR
#include <stdbool.h>
bool conn_established = false;
#endif
sock_globals_t *sglobals = NULL;
static int threads_running = 0;

/*
 * Local functions
 */
static int ctp_sock_init(cci_plugin_ctp_t *plugin,
			 uint32_t abi_ver,
			 uint32_t flags,
			 uint32_t * caps);
static int ctp_sock_finalize(cci_plugin_ctp_t * plugin);
static const char *ctp_sock_strerror(cci_endpoint_t * endpoint,
				     enum cci_status status);
static int ctp_sock_create_endpoint(cci_device_t * device,
				    int flags,
				    cci_endpoint_t ** endpoint,
				    cci_os_handle_t * fd);
static int ctp_sock_destroy_endpoint(cci_endpoint_t * endpoint);
static int ctp_sock_accept(cci_event_t *event, const void *context);
static int ctp_sock_reject(cci_event_t *conn_req);
static int ctp_sock_connect(cci_endpoint_t * endpoint,
			    const char *server_uri,
			    const void *data_ptr,
			    uint32_t data_len,
			    cci_conn_attribute_t attribute,
			    const void *context,
			    int flags,
			    const struct timeval *timeout);
static int ctp_sock_disconnect(cci_connection_t * connection);
static int ctp_sock_set_opt(cci_opt_handle_t * handle,
			    cci_opt_name_t name,
			    const void *val);
static int ctp_sock_get_opt(cci_opt_handle_t * handle,
			    cci_opt_name_t name,
			    void *val);
static int ctp_sock_arm_os_handle(cci_endpoint_t * endpoint, int flags);
static int ctp_sock_get_event(cci_endpoint_t * endpoint,
			      cci_event_t ** const event);
static int ctp_sock_return_event(cci_event_t * event);
static int ctp_sock_send(cci_connection_t * connection,
			 const void *msg_ptr,
			 uint32_t msg_len,
			 const void *context,
			 int flags);
static int ctp_sock_sendv(cci_connection_t * connection,
			  const struct iovec *data,
			  uint32_t iovcnt,
			  const void *context,
			  int flags);
static int ctp_sock_rma_register(cci_endpoint_t * endpoint,
				 void *start,
				 uint64_t length,
				 int flags,
				 cci_rma_handle_t ** rma_handle);
static int ctp_sock_rma_deregister(cci_endpoint_t * endpoint,
				   cci_rma_handle_t * rma_handle);
static int ctp_sock_rma(cci_connection_t * connection,
			const void *header_ptr,
			uint32_t header_len,
			cci_rma_handle_t * local_handle,
			uint64_t local_offset,
			cci_rma_handle_t * remote_handle,
			uint64_t remote_offset,
			uint64_t data_len,
			const void *context,
			int flags);

static uint8_t sock_ip_hash(in_addr_t ip, uint16_t port);
static void sock_progress_sends(cci__ep_t * ep);
static void *sock_progress_thread(void *arg);
static void *sock_recv_thread(void *arg);
static void sock_ack_conns(cci__ep_t * ep);
static inline int pack_piggyback_ack(cci__ep_t *ep,
				     sock_conn_t *sconn, sock_tx_t *tx);
static inline int sock_ack_sconn(sock_ep_t *sep, sock_conn_t *sconn);
static int sock_recvfrom_ep(cci__ep_t * ep);
int progress_recv (cci__ep_t *ep);
/*
 * Public plugin structure.
 *
 * The name of this structure must be of the following form:
 *
 *	cci_ctp_<your_plugin_name>_plugin
 *
 * This allows the symbol to be found after the plugin is dynamically
 * opened.
 *
 * Note that your_plugin_name should match the directory name where the
 * plugin resides.
 */
cci_plugin_ctp_t cci_ctp_sock_plugin = {
	{
		/* Logistics */
		CCI_ABI_VERSION,
		CCI_CTP_API_VERSION,
		"sock",
		CCI_MAJOR_VERSION, CCI_MINOR_VERSION, CCI_RELEASE_VERSION,
		30,

		/* Bootstrap function pointers */
		cci_ctp_sock_post_load,
		cci_ctp_sock_pre_unload,
	},

	/* API function pointers */
	ctp_sock_init,
	ctp_sock_finalize,
	ctp_sock_strerror,
	ctp_sock_create_endpoint,
	ctp_sock_destroy_endpoint,
	ctp_sock_accept,
	ctp_sock_reject,
	ctp_sock_connect,
	ctp_sock_disconnect,
	ctp_sock_set_opt,
	ctp_sock_get_opt,
	ctp_sock_arm_os_handle,
	ctp_sock_get_event,
	ctp_sock_return_event,
	ctp_sock_send,
	ctp_sock_sendv,
	ctp_sock_rma_register,
	ctp_sock_rma_deregister,
	ctp_sock_rma
};
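
/*
 * Read exactly len bytes from the socket, retrying on EAGAIN and
 * accumulating partial reads, and optionally return the sender's
 * address in sin_out. Returns the number of bytes received, 0 for a
 * zero-length request, or a negative value on error.
 */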
static inline int
sock_recv_msg (int fd,
	       void *ptr,
	       uint32_t len,
	       int flags,
	       struct sockaddr_in *sin_out)
{
	int ret = 0;
	uint32_t recv_len = 0;
	static int count = 0;
	uint32_t offset = 0;
	struct sockaddr_in sin;
	socklen_t sin_len = sizeof(sin);

	if (len == 0)
		return ret;

again:
	do {
		ret = recvfrom (fd, (void*)((uintptr_t)ptr + offset),
				len - recv_len, flags,
				(struct sockaddr *)&sin, &sin_len);
		if (ret < 0) {
			/* recvfrom() reports the error in errno, not in
			   its return value */
			if ((count++ & 0xFFFF) == 0xFFFF)
				debug (CCI_DB_EP,
				       "%s: recvfrom() failed with %s (%u of %u bytes)",
				       __func__, strerror(errno), recv_len, len);
			if (errno == EAGAIN)
				goto again;
			goto out;
		} else if (ret == 0) {
			debug (CCI_DB_MSG,
			       "%s: recvfrom() failed - socket closed",
			       __func__);
			ret = -1;
			goto out;
		}
		recv_len += ret;
		offset += ret;	/* advance by the bytes just read, not the total */
	} while (recv_len < len);
	ret = recv_len;

	if (sin_out != NULL)
		*sin_out = sin;
out:
	return ret;
}

static inline void
sock_sin_to_name(struct sockaddr_in sin, char *buffer, int len)
{
	snprintf(buffer, len, "%s:%d", inet_ntoa(sin.sin_addr),
		 ntohs(sin.sin_port));
	return;
}

static inline const char *sock_msg_type(sock_msg_type_t type)
{
	switch (type) {
	case SOCK_MSG_CONN_REQUEST:
		return "conn_request";
	case SOCK_MSG_CONN_REPLY:
		return "conn_reply";
	case SOCK_MSG_CONN_ACK:
		return "conn_ack";
	case SOCK_MSG_DISCONNECT:
		return "disconnect";
	case SOCK_MSG_SEND:
		return "send";
	case SOCK_MSG_RNR:
		return "receiver not ready";
	case SOCK_MSG_KEEPALIVE:
		return "keepalive";
	case SOCK_MSG_PING:
		return "ping for RTTM";
	case SOCK_MSG_ACK_ONLY:
		return "ack_only";
	case SOCK_MSG_ACK_UP_TO:
		return "ack_up_to";
	case SOCK_MSG_SACK:
		return "selective ack";
	case SOCK_MSG_NACK:
		return "negative ack";
	case SOCK_MSG_RMA_WRITE:
		return "RMA write";
	case SOCK_MSG_RMA_WRITE_DONE:
		return "RMA write done";
	case SOCK_MSG_RMA_READ_REQUEST:
		return "RMA read request";
	case SOCK_MSG_RMA_READ_REPLY:
		return "RMA read reply";
	case SOCK_MSG_RMA_INVALID:
		return "invalid RMA handle";
	case SOCK_MSG_INVALID:
		assert(0);
		return "invalid";
	case SOCK_MSG_TYPE_MAX:
		assert(0);
		return "type_max";
	}
	return NULL;
}

static inline void sock_drop_msg(cci_os_handle_t sock)
{
	char buf[4];
	struct sockaddr sa;
	socklen_t slen = sizeof(sa);

	recvfrom(sock, buf, 4, 0, &sa, &slen);
	return;
}

static inline int sock_create_threads (cci__ep_t *ep)
{
	int ret;
	sock_ep_t *sep;

	assert (ep);
	sep = ep->priv;

	ret = pthread_create(&sep->recv_tid, NULL, sock_recv_thread, (void*)ep);
	if (ret)
		goto out;

	ret = pthread_create(&sep->progress_tid, NULL, sock_progress_thread, (void*)ep);
	if (ret)
		goto out;

out:
	return ret;
}
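
/*
 * The caller is expected to set sep->closing beforehand (as
 * ctp_sock_destroy_endpoint() does); we wake the progress thread so it
 * notices the flag, then join both service threads.
 */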
static inline int sock_terminate_threads (sock_ep_t *sep)
{
	CCI_ENTER;

	assert (sep);

	pthread_mutex_lock(&sep->progress_mutex);
	pthread_cond_signal(&sep->wait_condition);
	pthread_mutex_unlock(&sep->progress_mutex);

	pthread_join(sep->progress_tid, NULL);
	pthread_join(sep->recv_tid, NULL);

	CCI_EXIT;
	return CCI_SUCCESS;
}
static int ctp_sock_init(cci_plugin_ctp_t *plugin,
			 uint32_t abi_ver, uint32_t flags, uint32_t * caps)
{
	int ret;
	cci__dev_t *dev, *ndev;
	cci_device_t **devices;
#ifdef HAVE_GETIFADDRS
	struct ifaddrs *addrs = NULL, *addr;
#endif

	CCI_ENTER;

	/* Mark unused parameters to avoid compiler warnings */
	UNUSED_PARAM (abi_ver);
	UNUSED_PARAM (flags);
	UNUSED_PARAM (caps);

#if DEBUG_RNR
	fprintf(stderr, "Warning, debug mode (RNR testing)!\n");
#endif

	/* init sock globals */
	sglobals = calloc(1, sizeof(*sglobals));
	if (!sglobals) {
		CCI_EXIT;
		return CCI_ENOMEM;
	}

	srandom((unsigned int)sock_get_usecs());

#ifdef HAVE_GETIFADDRS
	getifaddrs(&addrs);
	/* ignore errors, we'll use defaults */
#endif
	devices = calloc(CCI_MAX_DEVICES, sizeof(*sglobals->devices));
	if (!devices) {
		ret = CCI_ENOMEM;
		goto out;
	}

	if (!globals->configfile) {
#ifdef HAVE_GETIFADDRS
		if (addrs) {
			for (addr = addrs; addr != NULL; addr = addr->ifa_next) {
				struct cci_device *device;
				sock_dev_t *sdev;
				uint32_t mtu = (uint32_t) -1;
				struct sockaddr_in *sai;

				if (!addr->ifa_addr)
					continue;
				if (addr->ifa_addr->sa_family != AF_INET)
					continue;
				if (addr->ifa_flags & IFF_LOOPBACK)
					continue;

				dev = calloc(1, sizeof(*dev));
				if (!dev) {
					ret = CCI_ENOMEM;
					goto out;
				}
				dev->priv = calloc(1, sizeof(*sdev));
				if (!dev->priv) {
					free(dev);
					ret = CCI_ENOMEM;
					goto out;
				}

				cci__init_dev(dev);
				dev->plugin = plugin;
				dev->priority = plugin->base.priority;

				/* FIXME GV: could use macro here */
				device = &dev->device;
				device->transport = strdup("sock");
				device->name = strdup(addr->ifa_name);

				sdev = dev->priv;

				sai = (struct sockaddr_in *) addr->ifa_addr;
				memcpy(&sdev->ip, &sai->sin_addr, sizeof(sai->sin_addr));

				/* default values */
				device->up = 1;
				device->rate = 0;
				device->pci.domain = -1;	/* per CCI spec */
				device->pci.bus = -1;		/* per CCI spec */
				device->pci.dev = -1;		/* per CCI spec */
				device->pci.func = -1;		/* per CCI spec */
				/* try to get the actual values */
				cci__get_dev_ifaddrs_info(dev, addr);

				mtu = device->max_send_size;
				if (mtu == (uint32_t) -1) {
					/* if no mtu, use default */
					device->max_send_size = SOCK_DEFAULT_MSS;
				} else {
					/* compute mss from mtu */
					if (mtu > SOCK_UDP_MAX)
						mtu = SOCK_UDP_MAX;
					mtu -= SOCK_MAX_HDR_SIZE;
					assert(mtu >= SOCK_MIN_MSS); /* FIXME rather ignore the device? */
					device->max_send_size = mtu;
				}

				cci__add_dev(dev);
				devices[sglobals->count] = device;
				sglobals->count++;
				threads_running = 1;
			}
		}
#endif
	} else
	/* find devices that we own */
	TAILQ_FOREACH_SAFE(dev, &globals->configfile_devs, entry, ndev) {
		if (0 == strcmp("sock", dev->device.transport)) {
			const char * const *arg;
			const char *interface = NULL;
			struct cci_device *device;
			sock_dev_t *sdev;
			uint32_t mtu = (uint32_t) -1;

			dev->plugin = plugin;
			if (dev->priority == -1)
				dev->priority = plugin->base.priority;
			device = &dev->device;

			/* TODO determine link rate
			 *
			 * linux->driver->get ethtool settings->speed
			 * bsd/darwin->ioctl(SIOCGIFMEDIA)->ifm_active
			 * windows ?
			 */

			dev->priv = calloc(1, sizeof(*sdev));
			if (!dev->priv) {
				ret = CCI_ENOMEM;
				goto out;
			}

			sdev = dev->priv;
			sdev->port = 0;
			sdev->bufsize = 0;

			/* default values */
			device->up = 1;
			device->rate = 0;
			device->pci.domain = -1;	/* per CCI spec */
			device->pci.bus = -1;		/* per CCI spec */
			device->pci.dev = -1;		/* per CCI spec */
			device->pci.func = -1;		/* per CCI spec */

			/* parse conf_argv */
			for (arg = device->conf_argv; *arg != NULL; arg++) {
				if (0 == strncmp("ip=", *arg, 3)) {
					const char *ip = *arg + 3;

					/* network order */
					sdev->ip = inet_addr(ip);
				} else if (0 == strncmp("mtu=", *arg, 4)) {
					const char *mtu_str = *arg + 4;

					mtu = strtol(mtu_str, NULL, 0);
				} else if (0 == strncmp("port=", *arg, 5)) {
					const char *s_port = *arg + 5;
					uint16_t port;

					port = atoi (s_port);
					sdev->port = htons(port);
				} else if (0 == strncmp("bufsize=", *arg, 8)) {
					const char *size_str = *arg + 8;

					sdev->bufsize = strtol(size_str, NULL, 0);
				} else if (0 == strncmp("interface=", *arg, 10)) {
					interface = *arg + 10;
				}
			}
			if (sdev->ip != 0 || interface) {
				/* try to get the actual values now */
#ifdef HAVE_GETIFADDRS
				if (addrs) {
					for (addr = addrs;
					     addr != NULL;
					     addr = addr->ifa_next) {
						struct sockaddr_in *sai;

						if (!addr->ifa_addr)
							continue;
						if (addr->ifa_addr->sa_family != AF_INET)
							continue;
						sai = (struct sockaddr_in *) addr->ifa_addr;
						if (!memcmp(&sdev->ip, &sai->sin_addr, sizeof(sdev->ip)))
							break;
						if (interface &&
						    !strcmp(interface, addr->ifa_name)) {
							memcpy(&sdev->ip, &sai->sin_addr, sizeof(sdev->ip));
							break;
						}
					}
					if (!addr)
						/* no such device, don't initialize it */
						continue;

					cci__get_dev_ifaddrs_info(dev, addr);
				}
#endif
				if (mtu == (uint32_t) -1) {
					/* if mtu not specified, use the ifaddr one */
					mtu = device->max_send_size;
				}
				if (mtu == (uint32_t) -1) {
					/* if still no mtu, use default */
					device->max_send_size = SOCK_DEFAULT_MSS;
				} else {
					/* compute mss from mtu */
					if (mtu > SOCK_UDP_MAX)
						mtu = SOCK_UDP_MAX;
					mtu -= SOCK_MAX_HDR_SIZE;
					assert(mtu >= SOCK_MIN_MSS); /* FIXME rather ignore the device? */
					device->max_send_size = mtu;
				}

				/* queue to the main device list now */
				TAILQ_REMOVE(&globals->configfile_devs, dev, entry);
				cci__add_dev(dev);
				devices[sglobals->count] = device;
				sglobals->count++;
				threads_running = 1;
			}
		}
	}
	devices =
	    realloc(devices, (sglobals->count + 1) * sizeof(cci_device_t *));
	devices[sglobals->count] = NULL;

	*((cci_device_t ***) & sglobals->devices) = devices;

#ifdef HAVE_GETIFADDRS
	freeifaddrs(addrs);
#endif
	CCI_EXIT;
	return CCI_SUCCESS;

out:
	if (devices) {
		int i = 0;
		cci_device_t *device;
		cci__dev_t *my_dev;

		while (devices[i] != NULL) {
			device = devices[i];
			my_dev = container_of(device, cci__dev_t, device);
			if (my_dev->priv)
				free(my_dev->priv);
			i++;	/* advance, otherwise this loop never ends */
		}
		free(devices);
	}
	if (sglobals) {
		free((void *)sglobals);
		sglobals = NULL;
	}
#ifdef HAVE_GETIFADDRS
	if (addrs) {
		freeifaddrs(addrs);
	}
#endif
	CCI_EXIT;
	return ret;
}
/* TODO */
static const char *ctp_sock_strerror(cci_endpoint_t * endpoint,
				     enum cci_status status)
{
	CCI_ENTER;

	UNUSED_PARAM (endpoint);
	UNUSED_PARAM (status);

	CCI_EXIT;
	return NULL;
}
/* NOTE the CCI layer has already unbound all devices
 * and destroyed all endpoints.
 * All we need to do is free dev->priv.
 */
static int ctp_sock_finalize(cci_plugin_ctp_t * plugin)
{
	cci__dev_t *dev = NULL;

	CCI_ENTER;

	UNUSED_PARAM (plugin);

	if (!sglobals) {
		CCI_EXIT;
		return CCI_ENODEV;
	}

	TAILQ_FOREACH(dev, &globals->devs, entry)
		if (!strcmp(dev->device.transport, "sock"))
			free(dev->priv);

	free(sglobals->devices);
	free((void *)sglobals);
	sglobals = NULL;

	CCI_EXIT;
	return CCI_SUCCESS;
}
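
/*
 * Put the socket in non-blocking mode so the progress and receive
 * threads can poll it without blocking; if the current flags cannot be
 * read we fall back to setting O_NONBLOCK alone.
 */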
static inline int
sock_set_nonblocking(cci_os_handle_t sock, sock_fd_type_t type, void *p)
{
	int ret, flags;

	UNUSED_PARAM (type);
	UNUSED_PARAM (p);

	flags = fcntl(sock, F_GETFL, 0);
	if (-1 == flags)
		flags = 0;
	ret = fcntl(sock, F_SETFL, flags | O_NONBLOCK);
	if (-1 == ret)
		return errno;

	return 0;
}

static inline void sock_close_socket(cci_os_handle_t sock)
{
	close(sock);
	return;
}
static int ctp_sock_create_endpoint(cci_device_t * device,
				    int flags,
				    cci_endpoint_t ** endpointp,
				    cci_os_handle_t * fd)
{
	int ret;
	uint32_t i;
	sock_dev_t *sdev;
	struct sockaddr_in sin;
	socklen_t slen;
	char name[40];
	unsigned int sndbuf_size = SOCK_SNDBUF_SIZE;
	unsigned int rcvbuf_size = SOCK_RCVBUF_SIZE;
	cci__dev_t *dev = NULL;
	cci__ep_t *ep = NULL;
	sock_ep_t *sep = NULL;
	struct cci_endpoint *endpoint = (struct cci_endpoint *) *endpointp;

	CCI_ENTER;

	UNUSED_PARAM (flags);

	if (!sglobals) {
		CCI_EXIT;
		return CCI_ENODEV;
	}

	dev = container_of(device, cci__dev_t, device);
	if (0 != strcmp("sock", device->transport)) {
		ret = CCI_EINVAL;
		goto out;
	}

	ep = container_of(endpoint, cci__ep_t, endpoint);
	ep->priv = calloc(1, sizeof(*sep));
	if (!ep->priv) {
		ret = CCI_ENOMEM;
		goto out;
	}

	ep->rx_buf_cnt = SOCK_EP_RX_CNT;
	ep->tx_buf_cnt = SOCK_EP_TX_CNT;
	ep->buffer_len = dev->device.max_send_size + SOCK_MAX_HDRS;
	ep->tx_timeout = SOCK_EP_TX_TIMEOUT_SEC * 1000000;

	sep = ep->priv;
	sep->ids = calloc(SOCK_NUM_BLOCKS, sizeof(*sep->ids));
	if (!sep->ids) {
		ret = CCI_ENOMEM;
		goto out;
	}

	sep->closing = 0;
	pthread_mutex_init (&sep->progress_mutex, NULL);
	pthread_cond_init (&sep->wait_condition, NULL);

	sep->sock = socket(PF_INET, SOCK_DGRAM, 0);
	if (sep->sock == -1) {
		ret = errno;
		goto out;
	}

	sdev = dev->priv;
	if (sndbuf_size < sdev->bufsize)
		sndbuf_size = sdev->bufsize;
	if (rcvbuf_size < sdev->bufsize)
		rcvbuf_size = sdev->bufsize;

	if (sndbuf_size > 0) {
		ret = setsockopt (sep->sock, SOL_SOCKET, SO_SNDBUF,
				  &sndbuf_size, sizeof (sndbuf_size));
		if (ret == -1)
			debug (CCI_DB_WARN,
			       "%s: Cannot set send buffer size", __func__);
	}

	if (rcvbuf_size > 0) {
		ret = setsockopt (sep->sock, SOL_SOCKET, SO_RCVBUF,
				  &rcvbuf_size, sizeof (rcvbuf_size));
		if (ret == -1)
			debug (CCI_DB_WARN, "%s: Cannot set recv buffer size",
			       __func__);
	}
#if CCI_DEBUG
	{
		socklen_t optlen;

		optlen = sizeof (sndbuf_size);
		ret = getsockopt (sep->sock, SOL_SOCKET, SO_SNDBUF,
				  &sndbuf_size, &optlen);
		if (ret == -1)
			debug (CCI_DB_WARN, "%s: Cannot get send buffer size",
			       __func__);
		debug (CCI_DB_CTP, "Send buffer size: %d bytes (you may also "
		       "want to check the value of net.core.wmem_max using "
		       "sysctl)", sndbuf_size);

		optlen = sizeof (rcvbuf_size);
		ret = getsockopt (sep->sock, SOL_SOCKET, SO_RCVBUF,
				  &rcvbuf_size, &optlen);
		if (ret == -1)
			debug (CCI_DB_WARN, "%s: Cannot get recv buffer size",
			       __func__);
		debug (CCI_DB_CTP, "Receive buffer size: %d bytes (you may also "
		       "want to check the value of net.core.rmem_max using "
		       "sysctl)", rcvbuf_size);
	}
#endif
	/* bind socket to device */
	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_addr.s_addr = sdev->ip;
	if (sdev->port != 0)
		sin.sin_port = sdev->port;

	ret = bind(sep->sock, (const struct sockaddr *)&sin, sizeof(sin));
	if (ret) {
		ret = errno;
		goto out;
	}

	slen = sizeof(sep->sin);

	ret = getsockname(sep->sock, (struct sockaddr *)&sep->sin, &slen);
	if (ret) {
		ret = errno;
		goto out;
	}

	memset(name, 0, sizeof(name));
	sprintf(name, "sock://");
	sock_sin_to_name(sep->sin, name + (uintptr_t) 7, sizeof(name) - 7);
	ep->uri = strdup(name);

	for (i = 0; i < SOCK_EP_HASH_SIZE; i++) {
		TAILQ_INIT(&sep->conn_hash[i]);
		TAILQ_INIT(&sep->active_hash[i]);
	}

	TAILQ_INIT(&sep->idle_txs);
	TAILQ_INIT(&sep->idle_rxs);
	TAILQ_INIT(&sep->handles);
	TAILQ_INIT(&sep->rma_ops);
	TAILQ_INIT(&sep->queued);
	TAILQ_INIT(&sep->pending);

	sep->tx_buf = calloc (1, ep->tx_buf_cnt * ep->buffer_len);
	if (!sep->tx_buf) {
		ret = CCI_ENOMEM;
		goto out;
	}
	sep->txs = calloc (1, ep->tx_buf_cnt * sizeof (sock_tx_t));
	if (!sep->txs) {
		ret = CCI_ENOMEM;
		goto out;
	}

	/* alloc txs */
	for (i = 0; i < ep->tx_buf_cnt; i++) {
		sock_tx_t *tx = &sep->txs[i];

		tx->ctx = SOCK_CTX_TX;
		tx->evt.event.type = CCI_EVENT_SEND;
		tx->evt.ep = ep;
		tx->buffer = (void*)((uintptr_t)sep->tx_buf
				     + (i * ep->buffer_len));
		tx->len = 0;
		TAILQ_INSERT_TAIL(&sep->idle_txs, tx, dentry);
	}

	sep->rx_buf = calloc (1, ep->rx_buf_cnt * ep->buffer_len);
	if (!sep->rx_buf) {
		ret = CCI_ENOMEM;
		goto out;
	}
	sep->rxs = calloc (1, ep->rx_buf_cnt * sizeof (sock_rx_t));
	if (!sep->rxs) {	/* check the rxs array we just allocated */
		ret = CCI_ENOMEM;
		goto out;
	}
	/* alloc rxs */
	for (i = 0; i < ep->rx_buf_cnt; i++) {
		sock_rx_t *rx = &sep->rxs[i];

		rx->ctx = SOCK_CTX_RX;
		rx->evt.event.type = CCI_EVENT_RECV;
		rx->evt.ep = ep;
		rx->buffer = (void*)((uintptr_t)sep->rx_buf
				     + (i * ep->buffer_len));
		rx->len = 0;
		TAILQ_INSERT_TAIL(&sep->idle_rxs, rx, entry);
	}

	ret = sock_set_nonblocking(sep->sock, SOCK_FD_EP, ep);
	if (ret)
		goto out;

	sep->event_fd = 0;
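
	/* Blocking-mode support: when the caller passes an fd, we create
	   an epoll set that watches the UDP socket plus a pipe whose read
	   end is handed back to the application; the service threads
	   write a byte to the pipe to wake a blocked application
	   thread. */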
#ifdef HAVE_SYS_EPOLL_H
	if (fd) {
		int fflags = 0;
		int rc;
		struct epoll_event ev;

		ret = epoll_create (2);
		if (ret == -1) {
			ret = errno;
			goto out;
		}
		sep->event_fd = ret;

		fflags = fcntl(sep->event_fd, F_GETFL, 0);
		if (fflags == -1) {
			ret = errno;
			goto out;
		}

		ret = fcntl(sep->event_fd, F_SETFL, fflags | O_NONBLOCK);
		if (ret == -1) {
			ret = errno;
			goto out;
		}

		ev.data.ptr = (void*)(uintptr_t)sock_recvfrom_ep;
		ev.events = EPOLLIN;
		ret = epoll_ctl (sep->event_fd, EPOLL_CTL_ADD, sep->sock, &ev);
		if (ret == -1) {
			ret = errno;
			goto out;
		}

		rc = pipe (sep->fd);
		if (rc == -1) {
			debug (CCI_DB_WARN, "%s: %s", __func__, strerror (errno));
			return CCI_ERROR;
		}
		*fd = sep->fd[0];
	}
#else
	if (fd) {
		/* The receive thread will use poll(), so we only need a
		   pipe that lets the receive and progress threads wake up
		   the application thread */
		pipe (sep->fd);
		*fd = sep->fd[0];
		/* We set event_fd to a non-zero value to record that we
		   are in blocking mode at the application level */
		sep->event_fd = 1;
	}
#endif /* HAVE_SYS_EPOLL_H */
	ret = sock_create_threads (ep);
	if (ret)
		goto out;

	CCI_EXIT;
	return CCI_SUCCESS;

out:
	/* Note that there is no need to remove the ep even in the context of
	   a failure because the ep is added to the list of active endpoints
	   by cci_create_endpoint(), AFTER the call to this function. */
	if (sep) {
		if (sep->txs)
			free (sep->txs);
		if (sep->tx_buf)
			free (sep->tx_buf);
		if (sep->rxs)
			free (sep->rxs);
		if (sep->rx_buf)
			free (sep->rx_buf);
		if (sep->ids)
			free(sep->ids);
		if (sep->sock)
			sock_close_socket(sep->sock);
		free(sep);
		ep->priv = NULL;
	}
	if (ep) {
		free (ep->uri);
	}
	*endpointp = NULL;
	CCI_EXIT;
	return ret;
}
static int ctp_sock_destroy_endpoint(cci_endpoint_t * endpoint)
{
	cci__ep_t *ep = NULL;
	cci__dev_t *dev = NULL;
	sock_ep_t *sep = NULL;

	CCI_ENTER;

	if (!sglobals) {
		CCI_EXIT;
		return CCI_ENODEV;
	}

	ep = container_of(endpoint, cci__ep_t, endpoint);
	dev = ep->dev;
	sep = ep->priv;

	pthread_mutex_lock(&dev->lock);
	pthread_mutex_lock(&ep->lock);

	if (sep) {
		int i;
		cci__conn_t *conn;
		sock_conn_t *sconn;

		sep->closing = 1;

		pthread_mutex_unlock(&dev->lock);
		pthread_mutex_unlock(&ep->lock);
		sock_terminate_threads (sep);
		pthread_mutex_lock(&dev->lock);
		pthread_mutex_lock(&ep->lock);

		if (sep->fd[0] > 0)
			close (sep->fd[0]);
		if (sep->fd[1] > 0)
			close (sep->fd[1]);

		if (sep->sock)
			sock_close_socket(sep->sock);

		for (i = 0; i < SOCK_EP_HASH_SIZE; i++) {
			while (!TAILQ_EMPTY(&sep->conn_hash[i])) {
				sconn = TAILQ_FIRST(&sep->conn_hash[i]);
				TAILQ_REMOVE(&sep->conn_hash[i], sconn, entry);
				conn = sconn->conn;
				free(conn);
				free(sconn);
			}
			while (!TAILQ_EMPTY(&sep->active_hash[i])) {
				sconn = TAILQ_FIRST(&sep->active_hash[i]);
				TAILQ_REMOVE(&sep->active_hash[i], sconn, entry);
				conn = sconn->conn;
				free(conn);
				free(sconn);
			}
		}
		free (sep->txs);
		free (sep->tx_buf);
		free (sep->rxs);
		free (sep->rx_buf);

		while (!TAILQ_EMPTY(&sep->rma_ops)) {
			sock_rma_op_t *rma_op = TAILQ_FIRST(&sep->rma_ops);
			TAILQ_REMOVE(&sep->rma_ops, rma_op, entry);
			free(rma_op);
		}
		while (!TAILQ_EMPTY(&sep->handles)) {
			sock_rma_handle_t *handle = TAILQ_FIRST(&sep->handles);
			TAILQ_REMOVE(&sep->handles, handle, entry);
			free(handle);
		}
		if (sep->ids)
			free(sep->ids);
		free(sep);
		ep->priv = NULL;
	}
	ep->priv = NULL;
	if (ep->uri)
		free((char *)ep->uri);
	pthread_mutex_unlock(&ep->lock);
	pthread_mutex_unlock(&dev->lock);

	CCI_EXIT;
	return CCI_SUCCESS;
}
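
/*
 * Endpoint IDs are tracked in a bitmap (sep->ids) made of blocks of
 * SOCK_BLOCK_SIZE bits. sock_get_id() probes random positions until it
 * finds a clear bit, claims it, and returns the matching ID;
 * sock_put_id() below clears the bit again.
 */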
static void sock_get_id(sock_ep_t * ep, uint32_t * id)
{
	uint32_t n, block, offset;
	uint64_t *b;

	while (1) {
		n = random() % SOCK_NUM_BLOCKS;
		block = n / SOCK_BLOCK_SIZE;
		offset = n % SOCK_BLOCK_SIZE;
		b = &ep->ids[block];

		if ((*b & (1ULL << offset)) == 0) {
			*b |= (1ULL << offset);
			*id = (block * SOCK_BLOCK_SIZE) + offset;
			break;
		}
	}
	return;
}
#if 0
static void sock_put_id(sock_ep_t * ep, uint32_t id)
{
	uint32_t block, offset;
	uint64_t *b;

	block = id / SOCK_BLOCK_SIZE;
	offset = id % SOCK_BLOCK_SIZE;
	b = &ep->ids[block];

	/* use a 64-bit mask and test for a set bit; the masked value
	   only equals 1 when offset is 0 */
	assert((*b & (1ULL << offset)) != 0);
	*b &= ~(1ULL << offset);

	return;
}
#endif
static inline uint32_t sock_get_new_seq(void)
{
	return ((uint32_t) random() & SOCK_SEQ_MASK);
}

/* The endpoint maintains 256 lists. Hash the ip and port and return the index
 * of the list. We use all six bytes and this is endian agnostic. It evenly
 * disperses large blocks of addresses as well as large ranges of ports on the
 * same address.
 */
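/* For example (values shown in host order for readability): with
 * ip = 0x0a000001 (10.0.0.1) and port = 0x1f90 (8080), the folds give
 * port ^ 0x0001 = 0x1f91, then ^ 0x0a00 = 0x1591, and the final byte is
 * 0x91 ^ 0x15 = 0x84, i.e. bucket 132.
 */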
static uint8_t sock_ip_hash(in_addr_t ip, uint16_t port)
{
	port ^= (ip & 0x0000FFFF);
	port ^= (ip & 0xFFFF0000) >> 16;
	return (port & 0x00FF) ^ ((port & 0xFF00) >> 8);
}
static int ctp_sock_accept(cci_event_t *event, const void *context)
{
	uint8_t a;
	uint16_t b;
	uint32_t unused;
	uint32_t peer_seq;
	uint32_t peer_ts;
	int i;
	cci_endpoint_t *endpoint;
	cci__ep_t *ep = NULL;
	cci__conn_t *conn = NULL;
	cci__evt_t *evt = NULL;
	cci__dev_t *dev = NULL;
	sock_ep_t *sep = NULL;
	sock_conn_t *sconn = NULL;
	sock_header_r_t *hdr_r = NULL;
	sock_msg_type_t type;
	sock_tx_t *tx = NULL;
	sock_rx_t *rx = NULL;
	sock_handshake_t *hs = NULL;
	uint32_t id, ack, max_recv_buffer_count, mss = 0, ka;

	CCI_ENTER;

	if (!sglobals) {
		CCI_EXIT;
		return CCI_ENODEV;
	}

	evt = container_of(event, cci__evt_t, event);
	rx = container_of(evt, sock_rx_t, evt);
	ep = evt->ep;
	endpoint = &ep->endpoint;
	sep = ep->priv;
	dev = ep->dev;

	conn = calloc(1, sizeof(*conn));
	if (!conn) {
		CCI_EXIT;
		return CCI_ENOMEM;
	}

	conn->plugin = ep->plugin;
	conn->tx_timeout = ep->tx_timeout;
	conn->priv = calloc(1, sizeof(*sconn));
	if (!conn->priv) {
		free(conn);
		CCI_EXIT;
		return CCI_ENOMEM;
	}

	/* get a tx */
	tx = sock_get_tx (ep);
	if (!tx) {
		free(conn->priv);
		free(conn);
		CCI_EXIT;
		return CCI_ENOBUFS;
	}

	tx->rma_ptr = NULL;
	tx->rma_len = 0;

	hdr_r = rx->buffer;
	sock_parse_header(&hdr_r->header, &type, &a, &b, &unused);
	sock_parse_seq_ts(&hdr_r->seq_ts, &peer_seq, &peer_ts);

	conn->connection.attribute = (enum cci_conn_attribute)a;
	conn->connection.endpoint = endpoint;
	conn->connection.context = (void *)context;
	conn->connection.max_send_size = dev->device.max_send_size;

	hs = (sock_handshake_t *)((uintptr_t)rx->buffer +
				  (uintptr_t) sizeof(sock_header_r_t));
	sock_parse_handshake(hs, &id, &ack, &max_recv_buffer_count, &mss, &ka);
	if (ka != 0UL) {
		debug(CCI_DB_CONN, "%s: keepalive timeout: %d", __func__, ka);
		conn->keepalive_timeout = ka;
	}
	if (mss < SOCK_MIN_MSS) {
		/* FIXME do what? */
	}

	if (mss < conn->connection.max_send_size)
		conn->connection.max_send_size = mss;

	sconn = conn->priv;
	TAILQ_INIT(&sconn->tx_seqs);
	TAILQ_INIT(&sconn->acks);
	TAILQ_INIT(&sconn->rmas);
	sconn->conn = conn;
	sconn->cwnd = SOCK_INITIAL_CWND;
	sconn->status = SOCK_CONN_READY;	/* set ready since the app thinks it is */
	sconn->last_recvd_seq = 0;
	*((struct sockaddr_in *)&sconn->sin) = rx->sin;
	sconn->peer_id = id;
	sock_get_id(sep, &sconn->id);
	sconn->seq = sock_get_new_seq();	/* even for UU since this reply is reliable */
	sconn->seq_pending = sconn->seq - 1;
	if (cci_conn_is_reliable(conn)) {
		sconn->max_tx_cnt = max_recv_buffer_count < ep->tx_buf_cnt ?
				    max_recv_buffer_count : ep->tx_buf_cnt;
		sconn->last_ack_seq = sconn->seq;
		sconn->last_ack_ts = sock_get_usecs();
		sconn->ssthresh = sconn->max_tx_cnt;
		sconn->seq_pending = sconn->seq;
	}

	/* insert in sock ep's list of conns */
	i = sock_ip_hash(sconn->sin.sin_addr.s_addr, sconn->sin.sin_port);
	pthread_mutex_lock(&ep->lock);
	TAILQ_INSERT_TAIL(&sep->conn_hash[i], sconn, entry);
	pthread_mutex_unlock(&ep->lock);

	debug_ep(ep, CCI_DB_CONN, "%s: accepting conn with hash %d",
		 __func__, i);

	/* prepare conn_reply */
	tx->msg_type = SOCK_MSG_CONN_REPLY;
	tx->last_attempt_us = 0ULL;
	tx->timeout_us = 0ULL;
	tx->rma_op = NULL;

	evt = &tx->evt;
	evt->ep = ep;
	evt->conn = conn;
	evt->event.type = CCI_EVENT_ACCEPT;
	evt->event.accept.status = CCI_SUCCESS;	/* for now */
	evt->event.accept.context = (void *)context;
	evt->event.accept.connection = &conn->connection;

	/* pack the msg */
	hdr_r = (sock_header_r_t *) tx->buffer;
	sock_pack_conn_reply(&hdr_r->header, CCI_SUCCESS /* FIXME */ ,
			     sconn->peer_id);
	sock_pack_seq_ts(&hdr_r->seq_ts, sconn->seq,
			 (uint32_t) sconn->last_ack_ts);
	hs = (sock_handshake_t *) ((uintptr_t)tx->buffer + sizeof(*hdr_r));
	sock_pack_handshake(hs, sconn->id, peer_seq,
			    ep->rx_buf_cnt,
			    conn->connection.max_send_size, 0);

	tx->len = sizeof(*hdr_r) + sizeof(*hs);
	tx->seq = sconn->seq;

	debug_ep(ep, CCI_DB_CONN, "%s: queuing conn_reply with seq %u ts %x",
		 __func__, sconn->seq, sconn->ts);

	/* insert at tail of device's queued list */
	tx->state = SOCK_TX_QUEUED;
	pthread_mutex_lock(&ep->lock);
	TAILQ_INSERT_TAIL(&sep->queued, &tx->evt, entry);
	pthread_mutex_unlock(&ep->lock);

	/* try to progress txs */
	pthread_mutex_lock(&sep->progress_mutex);
	pthread_cond_signal(&sep->wait_condition);
	pthread_mutex_unlock(&sep->progress_mutex);

	CCI_EXIT;
	return CCI_SUCCESS;
}
/* Send reject reply to client.
 *
 * We cannot use the event's buffer since the app will most likely return the
 * event before we get an ack from the client. We will get a tx for the reply.
 */
static int ctp_sock_reject(cci_event_t *event)
{
	int ret = CCI_SUCCESS;
	uint8_t a;
	uint16_t b;
	uint32_t peer_id;
	uint32_t peer_seq;
	uint32_t peer_ts;
	cci__evt_t *evt = NULL;
	cci__ep_t *ep = NULL;
	sock_ep_t *sep = NULL;
	sock_header_r_t *hdr_r = NULL;
	sock_msg_type_t type;
	sock_rx_t *rx = NULL;
	sock_tx_t *tx = NULL;

	CCI_ENTER;

	if (!sglobals) {
		CCI_EXIT;
		return CCI_ENODEV;
	}

	evt = container_of(event, cci__evt_t, event);
	ep = evt->ep;
	sep = ep->priv;
	rx = container_of(evt, sock_rx_t, evt);
	hdr_r = rx->buffer;
	sock_parse_header(&hdr_r->header, &type, &a, &b, &peer_id);
	sock_parse_seq_ts(&hdr_r->seq_ts, &peer_seq, &peer_ts);

	/* get a tx */
	tx = sock_get_tx (ep);
	if (!tx) {
		ret = CCI_ENOBUFS;
		goto out;
	}

	tx->rma_ptr = NULL;
	tx->rma_len = 0;

	/* prep the tx */
	tx->msg_type = SOCK_MSG_CONN_REPLY;
	tx->evt.ep = ep;
	tx->evt.conn = NULL;
	tx->evt.event.type = CCI_EVENT_CONNECT;
	tx->evt.event.connect.status = CCI_ECONNREFUSED;
	tx->evt.event.connect.connection = NULL;
	tx->last_attempt_us = 0ULL;
	tx->timeout_us = 0ULL;
	tx->rma_op = NULL;
	tx->sin = rx->sin;

	/* prepare conn_reply */
	hdr_r = (sock_header_r_t *) tx->buffer;
	sock_pack_conn_reply(&hdr_r->header, CCI_ECONNREFUSED, peer_id);
	sock_pack_seq_ts(&hdr_r->seq_ts, peer_seq, 0);
	tx->len = sizeof(*hdr_r);
	tx->state = SOCK_TX_QUEUED;
	/* We have no connection and the request is rejected so we generate
	   a new seq since the client may or may not ack the conn_reply. In
	   the worst case, the conn_reply associated to the reject is thrown
	   away when it times out */
	tx->seq = sock_get_new_seq ();

	/* insert at tail of endpoint's queued list */
	pthread_mutex_lock(&ep->lock);
	TAILQ_INSERT_TAIL(&sep->queued, &tx->evt, entry);
	pthread_mutex_unlock(&ep->lock);

	/* try to progress txs */
	pthread_mutex_lock(&sep->progress_mutex);
	pthread_cond_signal(&sep->wait_condition);
	pthread_mutex_unlock(&sep->progress_mutex);

#if CCI_DEBUG
	{
		char name[32];

		memset(name, 0, sizeof(name));
		sock_sin_to_name(rx->sin, name, sizeof(name));
		debug_ep(ep, (CCI_DB_MSG | CCI_DB_CONN),
			 "%s: queued conn_reply (reject) to %s (seq %u)",
			 __func__, name, tx->seq);
	}
#endif

out:
	CCI_EXIT;
	return ret;
}
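
/*
 * Parse a "sock://host:port" URI and resolve it via getaddrinfo() to an
 * IPv4 address and port, both returned in network byte order.
 */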
static int sock_getaddrinfo(const char *uri, in_addr_t * in, uint16_t * port)
{
	int ret;
	char *hostname, *svc, *colon;
	struct addrinfo *ai = NULL, hints;

	if (0 == strncmp("sock://", uri, 7))
		hostname = strdup(&uri[7]);
	else {
		CCI_EXIT;
		return CCI_EINVAL;
	}

	colon = strchr(hostname, ':');
	if (colon) {
		*colon = '\0';
	} else {
		free(hostname);
		CCI_EXIT;
		return CCI_EINVAL;
	}

	colon++;
	svc = colon;

	memset(&hints, 0, sizeof(hints));
	hints.ai_family = AF_INET;
	hints.ai_socktype = SOCK_DGRAM;
	hints.ai_protocol = IPPROTO_UDP;

	ret = getaddrinfo(hostname, svc, &hints, &ai);
	free(hostname);

	if (ret) {
		if (ai)
			freeaddrinfo(ai);
		CCI_EXIT;
		return ret;
	}

	*in = ((struct sockaddr_in *)ai->ai_addr)->sin_addr.s_addr;
	*port = ((struct sockaddr_in *)ai->ai_addr)->sin_port;
	freeaddrinfo(ai);

	CCI_EXIT;
	return CCI_SUCCESS;
}
static sock_conn_t *sock_find_open_conn(sock_ep_t * sep, in_addr_t ip,
					uint16_t port, uint32_t id)
{
	uint8_t i;
	struct s_conns *conn_list;
	sock_conn_t *sconn = NULL, *sc;

	CCI_ENTER;

	i = sock_ip_hash(ip, port);
	conn_list = &sep->conn_hash[i];
	TAILQ_FOREACH(sc, conn_list, entry) {
		if (sc->sin.sin_addr.s_addr == ip &&
		    sc->sin.sin_port == port && sc->id == id) {
			sconn = sc;
			break;
		}
	}

	CCI_EXIT;
	return sconn;
}

static sock_conn_t *sock_find_active_conn(sock_ep_t * sep, in_addr_t ip,
					  uint32_t id)
{
	uint8_t i;
	struct s_active *active_list;
	sock_conn_t *sconn = NULL, *sc;

	CCI_ENTER;

	i = sock_ip_hash(ip, 0);
	active_list = &sep->active_hash[i];
	TAILQ_FOREACH(sc, active_list, entry) {
		if (sc->sin.sin_addr.s_addr == ip && sc->id == id) {
			sconn = sc;
			break;
		}
	}

	CCI_EXIT;
	return sconn;
}
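
/*
 * A conn_reply is matched against the active hash (connections we
 * initiated that are not yet established, hashed with port 0); every
 * other message type is looked up in the hash of open connections.
 */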
static sock_conn_t *sock_find_conn(sock_ep_t * sep, in_addr_t ip, uint16_t port,
				   uint32_t id, sock_msg_type_t type)
{
	switch (type) {
	case SOCK_MSG_CONN_REPLY:
		return sock_find_active_conn(sep, ip, id);
	default:
		return sock_find_open_conn(sep, ip, port, id);
	}
}
static int ctp_sock_connect(cci_endpoint_t * endpoint,
			    const char *server_uri,
			    const void *data_ptr,
			    uint32_t data_len,
			    cci_conn_attribute_t attribute,
			    const void *context,
			    int flags,
			    const struct timeval *timeout)
{
	int ret;
	int i;
	cci__ep_t *ep = NULL;
	cci__dev_t *dev = NULL;
	cci__conn_t *conn = NULL;
	sock_ep_t *sep = NULL;
	sock_conn_t *sconn = NULL;
	sock_tx_t *tx = NULL;
	sock_header_r_t *hdr_r = NULL;
	cci__evt_t *evt = NULL;
	struct cci_connection *connection = NULL;
	struct sockaddr_in *sin = NULL;
	void *ptr = NULL;
	in_addr_t ip;
	uint32_t ts = 0;
	struct s_active *active_list;
	sock_handshake_t *hs = NULL;
	uint16_t port;
	uint32_t keepalive = 0;

	CCI_ENTER;

	UNUSED_PARAM (flags);
	UNUSED_PARAM (timeout);

	if (!sglobals) {
		CCI_EXIT;
		return CCI_ENODEV;
	}

	/* allocate a new connection */
	conn = calloc(1, sizeof(*conn));
	if (!conn) {
		CCI_EXIT;
		return CCI_ENOMEM;
	}

	conn->priv = calloc(1, sizeof(*sconn));
	if (!conn->priv) {
		ret = CCI_ENOMEM;
		goto out;
	}

	sconn = conn->priv;
	sconn->conn = conn;
	TAILQ_INIT(&sconn->tx_seqs);
	TAILQ_INIT(&sconn->acks);
	TAILQ_INIT(&sconn->rmas);

	/* conn->tx_timeout = 0 by default */
	connection = &conn->connection;
	connection->attribute = attribute;
	connection->endpoint = endpoint;
	connection->context = (void *)context;

	/* set up sock specific info */
	sconn->status = SOCK_CONN_ACTIVE;
	sconn->cwnd = SOCK_INITIAL_CWND;
	sconn->last_recvd_seq = 0;
	sin = (struct sockaddr_in *)&sconn->sin;
	memset(sin, 0, sizeof(*sin));
	sin->sin_family = AF_INET;

	ret = sock_getaddrinfo(server_uri, &ip, &port);
	if (ret)
		goto out;
	sin->sin_addr.s_addr = ip;	/* already in network order */
	sin->sin_port = port;		/* already in network order */

	/* peer will assign id */

	/* get our endpoint and device */
	ep = container_of(endpoint, cci__ep_t, endpoint);
	sep = ep->priv;
	dev = ep->dev;

	connection->max_send_size = dev->device.max_send_size;
	conn->plugin = ep->plugin;

	/* Dealing with keepalive: if set, include the keepalive timeout value
	   in the connection request */
	if ((((attribute & CCI_CONN_ATTR_RO) == CCI_CONN_ATTR_RO)
	     || ((attribute & CCI_CONN_ATTR_RU) == CCI_CONN_ATTR_RU))
	    && ep->keepalive_timeout != 0UL) {
		keepalive = ep->keepalive_timeout;
	}

	i = sock_ip_hash(ip, 0);
	active_list = &sep->active_hash[i];
	pthread_mutex_lock(&ep->lock);
	TAILQ_INSERT_TAIL(active_list, sconn, entry);
	pthread_mutex_unlock(&ep->lock);

	/* get a tx */
	tx = sock_get_tx (ep);
	if (!tx) {
		/* FIXME leak */
		CCI_EXIT;
		return CCI_ENOBUFS;
	}

	tx->rma_ptr = NULL;
	tx->rma_len = 0;

	/* prep the tx */
	tx->msg_type = SOCK_MSG_CONN_REQUEST;

	evt = &tx->evt;
	evt->ep = ep;
	evt->conn = conn;
	evt->event.type = CCI_EVENT_CONNECT;	/* for now */
	evt->event.connect.status = CCI_SUCCESS;
	evt->event.connect.context = (void *)context;
	evt->event.connect.connection = connection;

	/* pack the msg */
	hdr_r = (sock_header_r_t *) tx->buffer;
	sock_get_id(sep, &sconn->id);
	sock_pack_conn_request(&hdr_r->header, attribute,
			       (uint16_t) data_len, sconn->id);
	tx->len = sizeof(*hdr_r);

	/* add seq and ack */
	sconn->seq = sock_get_new_seq();
	sconn->seq_pending = sconn->seq - 1;
	sconn->last_ack_seq = sconn->seq;
	tx->seq = sconn->seq;
	sock_pack_seq_ts(&hdr_r->seq_ts, tx->seq, ts);

	/* add handshake */
	hs = (sock_handshake_t *) & hdr_r->data;
	if (keepalive != 0UL)
		conn->keepalive_timeout = keepalive;
	sock_pack_handshake(hs, sconn->id, 0,
			    ep->rx_buf_cnt,
			    connection->max_send_size, keepalive);

	tx->len += sizeof(*hs);
	ptr = (void*)((uintptr_t)tx->buffer + tx->len);

	debug_ep(ep, CCI_DB_CONN, "%s: queuing conn_request with seq %u ts %x",
		 __func__, tx->seq, ts);

	/* zero even if unreliable */
	tx->last_attempt_us = 0ULL;
	tx->timeout_us = 0ULL;
	tx->rma_op = NULL;

	if (data_len)
		memcpy(ptr, data_ptr, data_len);

	tx->len += data_len;
	assert(tx->len <= ep->buffer_len);

	/* insert at tail of device's queued list */
	tx->state = SOCK_TX_QUEUED;
	pthread_mutex_lock(&ep->lock);
	TAILQ_INSERT_TAIL(&sep->queued, &tx->evt, entry);
	pthread_mutex_unlock(&ep->lock);

	/* try to progress txs */
	pthread_mutex_lock(&sep->progress_mutex);
	pthread_cond_signal(&sep->wait_condition);
	pthread_mutex_unlock(&sep->progress_mutex);

	CCI_EXIT;
	return CCI_SUCCESS;

out:
	if (conn) {
		if (conn->uri)
			free((char *)conn->uri);
		if (conn->priv)
			free(conn->priv);
		free(conn);
	}
	CCI_EXIT;
	return ret;
}
static int ctp_sock_disconnect(cci_connection_t * connection)
{
	int i = 0;
	cci__conn_t *conn = NULL;
	cci__ep_t *ep = NULL;
	sock_conn_t *sconn = NULL;
	sock_ep_t *sep = NULL;

	CCI_ENTER;

	if (!sglobals) {
		CCI_EXIT;
		return CCI_ENODEV;
	}

	/* need to clean up */

	/* remove conn from ep->conn_hash[i] */
	/* if sock conn uri, free it
	 * free sock conn
	 * free conn
	 */

	conn = container_of(connection, cci__conn_t, connection);
	sconn = conn->priv;
	ep = container_of(connection->endpoint, cci__ep_t, endpoint);
	sep = ep->priv;

	if (conn->uri)
		free((char *)conn->uri);

	i = sock_ip_hash(sconn->sin.sin_addr.s_addr, sconn->sin.sin_port);
	pthread_mutex_lock(&ep->lock);
	TAILQ_REMOVE(&sep->conn_hash[i], sconn, entry);
	pthread_mutex_unlock(&ep->lock);

	free(sconn);
	free(conn);

	CCI_EXIT;
	return CCI_SUCCESS;
}
static int ctp_sock_set_opt(cci_opt_handle_t * handle,
			    cci_opt_name_t name, const void *val)
{
	int ret = CCI_SUCCESS;
	cci__ep_t *ep = NULL;
	cci__conn_t *conn = NULL;

	CCI_ENTER;

	if (!sglobals) {
		CCI_EXIT;
		return CCI_ENODEV;
	}

	switch (name) {
	case CCI_OPT_ENDPT_SEND_TIMEOUT:
		ep = container_of(handle, cci__ep_t, endpoint);
		ep->tx_timeout = *((uint32_t*) val);
		break;
	case CCI_OPT_ENDPT_RECV_BUF_COUNT:
		ret = CCI_ERR_NOT_IMPLEMENTED;
		break;
	case CCI_OPT_ENDPT_SEND_BUF_COUNT:
		ret = CCI_ERR_NOT_IMPLEMENTED;
		break;
	case CCI_OPT_ENDPT_KEEPALIVE_TIMEOUT:
		ep = container_of(handle, cci__ep_t, endpoint);
		ep->keepalive_timeout = *((uint32_t*) val);
		break;
	case CCI_OPT_CONN_SEND_TIMEOUT:
		/* for connection-level options the handle is the connection */
		conn = container_of(handle, cci__conn_t, connection);
		conn->tx_timeout = *((uint32_t*) val);
		break;
	default:
		debug(CCI_DB_INFO, "%s: unknown option %u", __func__, name);
		ret = CCI_EINVAL;
	}

	CCI_EXIT;
	return ret;
}
static int ctp_sock_get_opt(cci_opt_handle_t * handle,
			    cci_opt_name_t name, void *val)
{
	int ret = CCI_SUCCESS;
	cci_endpoint_t *endpoint = NULL;
	cci__ep_t *ep = NULL;

	CCI_ENTER;

	if (!sglobals) {
		CCI_EXIT;
		return CCI_ENODEV;
	}

	endpoint = handle;
	ep = container_of(endpoint, cci__ep_t, endpoint);
	assert (ep);

	switch (name) {
	case CCI_OPT_ENDPT_RECV_BUF_COUNT:
		{
			uint32_t *cnt = val;
			*cnt = ep->rx_buf_cnt;
			break;
		}
	case CCI_OPT_ENDPT_SEND_BUF_COUNT:
		{
			uint32_t *cnt = val;
			*cnt = ep->tx_buf_cnt;
			break;
		}
	case CCI_OPT_ENDPT_KEEPALIVE_TIMEOUT:
		{
			uint32_t *timeout = val;
			*timeout = ep->keepalive_timeout;
			break;
		}
	default:
		/* Invalid opt name */
		ret = CCI_EINVAL;
	}

	CCI_EXIT;
	return ret;
}
static int ctp_sock_arm_os_handle(cci_endpoint_t * endpoint, int flags)
{
	CCI_ENTER;

	UNUSED_PARAM (endpoint);
	UNUSED_PARAM (flags);

	if (!sglobals) {
		CCI_EXIT;
		return CCI_ENODEV;
	}

	CCI_EXIT;
	return CCI_ERR_NOT_IMPLEMENTED;
}
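
/*
 * Return the next completed event. We first kick the progress thread,
 * then scan the endpoint's event list, skipping blocking sends (which
 * sock_sendv() is waiting on). When no event is ready, CCI_ENOBUFS is
 * returned if all receive buffers are in use, so the application knows
 * it must return events before more messages can be received.
 */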
static int
ctp_sock_get_event(cci_endpoint_t * endpoint, cci_event_t ** const event)
{
	int ret = CCI_SUCCESS;
	cci__ep_t *ep;
	sock_ep_t *sep;
	cci__evt_t *ev = NULL, *e;

	CCI_ENTER;

	if (!sglobals) {
		CCI_EXIT;
		return CCI_ENODEV;
	}

	ep = container_of(endpoint, cci__ep_t, endpoint);
	sep = ep->priv;

	/* try to progress sends... */
	if (!sep->closing) {
		pthread_mutex_lock(&sep->progress_mutex);
		pthread_cond_signal(&sep->wait_condition);
		pthread_mutex_unlock(&sep->progress_mutex);
	}

	pthread_mutex_lock(&ep->lock);

	/* give the user the first event */
	TAILQ_FOREACH(e, &ep->evts, entry) {
		if (e->event.type == CCI_EVENT_SEND) {
			/* NOTE: if it is blocking, skip it since sock_sendv()
			 * is waiting on it
			 */
			sock_tx_t *tx = container_of(e, sock_tx_t, evt);
			if (tx->flags & CCI_FLAG_BLOCKING) {
				continue;
			} else {
				ev = e;
				break;
			}
		} else {
			ev = e;
			break;
		}
	}

	if (ev) {
		TAILQ_REMOVE(&ep->evts, ev, entry);
		*event = &ev->event;
	} else {
		*event = NULL;
		/* No event is available and there are no available
		   receive buffers. The application must return events
		   before any more messages can be received. */
		if (TAILQ_EMPTY(&sep->idle_rxs)) {
			ret = CCI_ENOBUFS;
		} else {
			ret = CCI_EAGAIN;
		}
	}

	pthread_mutex_unlock(&ep->lock);

	/* We read on the fd to block again */
	if (ev && sep->event_fd) {
		char a[1];
		int rc;

		/* We block again if and only if there are no more
		   pending events */
		if (event_queue_is_empty (ep)) {
			/* Draining events so the app thread can block */
			rc = read (sep->fd[0], a, sizeof (a));
			if (rc != sizeof (a)) {
				ret = CCI_ERROR;
			}
		}
	}

	CCI_EXIT;
	return ret;
}
static int ctp_sock_return_event(cci_event_t * event)
{
	cci__ep_t *ep;
	sock_ep_t *sep;
	cci__evt_t *evt;
	sock_tx_t *tx;
	sock_rx_t *rx;
	int ret = CCI_SUCCESS;

	CCI_ENTER;

	if (!sglobals) {
		CCI_EXIT;
		return CCI_ENODEV;
	}

	if (!event) {
		CCI_EXIT;
		return CCI_SUCCESS;
	}

	evt = container_of(event, cci__evt_t, event);

	ep = evt->ep;
	sep = ep->priv;

	/* enqueue the event */
	switch (event->type) {
	case CCI_EVENT_SEND:
	case CCI_EVENT_ACCEPT:
		tx = container_of(evt, sock_tx_t, evt);
		pthread_mutex_lock(&ep->lock);
		/* insert at head to keep it in cache */
		TAILQ_INSERT_HEAD(&sep->idle_txs, tx, dentry);
		pthread_mutex_unlock(&ep->lock);
		break;
	case CCI_EVENT_RECV:
	case CCI_EVENT_CONNECT_REQUEST:
		rx = container_of(evt, sock_rx_t, evt);
		pthread_mutex_lock(&ep->lock);
		/* insert at head to keep it in cache */
		TAILQ_INSERT_HEAD(&sep->idle_rxs, rx, entry);
		pthread_mutex_unlock(&ep->lock);
		break;
	case CCI_EVENT_CONNECT:
		rx = container_of (evt, sock_rx_t, evt);
		if (rx->ctx == SOCK_CTX_RX) {
			pthread_mutex_lock(&ep->lock);
			TAILQ_INSERT_HEAD (&sep->idle_rxs, rx, entry);
			pthread_mutex_unlock(&ep->lock);
		} else {
			tx = (sock_tx_t*)rx;
			pthread_mutex_lock(&ep->lock);
			TAILQ_INSERT_HEAD (&sep->idle_txs, tx, dentry);
			pthread_mutex_unlock(&ep->lock);
		}
		break;
	default:
		debug (CCI_DB_EP,
		       "%s: unhandled %s event", __func__,
		       cci_event_type_str(event->type));
		ret = CCI_ERROR;
		break;
	}

	CCI_EXIT;
	return ret;
}
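
/*
 * Walk the endpoint's pending (reliable) txs without dequeuing them:
 * txs whose timeout has passed are completed with CCI_ETIMEDOUT (or
 * CCI_ERR_RNR), the rest are resent with exponential backoff - a tx is
 * retried once 2^send_count * SOCK_RESEND_TIME_SEC seconds have elapsed
 * since the last attempt (e.g. 2s, 4s, 8s, ... after each retry if
 * SOCK_RESEND_TIME_SEC is 1).
 */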
  1680. static void sock_progress_pending(cci__ep_t * ep)
  1681. {
  1682. int ret;
  1683. uint64_t now;
  1684. sock_tx_t *tx;
  1685. cci__evt_t *evt, *tmp, *my_temp_evt;
  1686. union cci_event *event; /* generic CCI event */
  1687. cci__conn_t *conn;
  1688. sock_conn_t *sconn = NULL;
  1689. sock_ep_t *sep = ep->priv;
  1690. TAILQ_HEAD(s_idle_txs, sock_tx) idle_txs
  1691. = TAILQ_HEAD_INITIALIZER(idle_txs);
  1692. TAILQ_HEAD(s_evts, cci__evt) evts = TAILQ_HEAD_INITIALIZER(evts);
  1693. TAILQ_INIT(&idle_txs);
  1694. TAILQ_INIT(&evts);
  1695. CCI_ENTER;
  1696. now = sock_get_usecs();
  1697. /* This is only for reliable messages.
  1698. * Do not dequeue txs, just walk the list.
  1699. */
  1700. pthread_mutex_lock (&ep->lock);
  1701. TAILQ_FOREACH_SAFE(evt, &sep->pending, entry, tmp) {
  1702. sock_tx_t *tx = container_of (evt, sock_tx_t, evt);
  1703. conn = evt->conn;
  1704. if (conn)
  1705. sconn = conn->priv;
  1706. event = &evt->event;
  1707. assert(tx->last_attempt_us != 0ULL);
  1708. /* has it timed out? */
  1709. if (SOCK_U64_LT(tx->timeout_us, now)) {
  1710. /* dequeue */
  1711. debug_ep(ep, CCI_DB_WARN,
  1712. "%s: timeout of %s msg (seq %u)",
  1713. __func__, sock_msg_type(tx->msg_type),
  1714. tx->seq);
  1715. TAILQ_REMOVE(&sep->pending, &tx->evt, entry);
  1716. /* set status and add to completed events */
  1717. if (tx->msg_type == SOCK_MSG_SEND)
  1718. sconn->pending--;
  1719. switch (tx->msg_type) {
  1720. case SOCK_MSG_SEND:
  1721. event->send.status = CCI_ETIMEDOUT;
  1722. if (tx->rnr != 0) {
  1723. event->send.status = CCI_ERR_RNR;
  1724. /* If a message that is already marked
  1725. RNR times out, and if the connection
  1726. is reliable and ordered, we mark all
  1727. following messages as RNR */
  1728. if (conn->connection.attribute == CCI_CONN_ATTR_RO) {
  1729. sock_tx_t *my_temp_tx;
  1730. TAILQ_FOREACH_SAFE(my_temp_evt,
  1731. &sep->pending,
  1732. entry,
  1733. tmp)
  1734. {
  1735. my_temp_tx = container_of (my_temp_evt, sock_tx_t, evt);
  1736. if (my_temp_tx->seq > tx->seq)
  1737. my_temp_tx->rnr = 1;
  1738. }
  1739. }
  1740. }
  1741. break;
  1742. case SOCK_MSG_RMA_READ_REQUEST:
  1743. case SOCK_MSG_RMA_WRITE:
  1744. pthread_mutex_lock(&ep->lock);
  1745. tx->rma_op->pending--;
  1746. tx->rma_op->status = CCI_ETIMEDOUT;
  1747. pthread_mutex_unlock(&ep->lock);
  1748. break;
  1749. case SOCK_MSG_CONN_REQUEST: {
  1750. int i;
  1751. struct s_active *active_list;
  1752. event->connect.status = CCI_ETIMEDOUT;
  1753. event->connect.connection = NULL;
  1754. if (conn->uri)
  1755. free((char *)conn->uri);
  1756. sconn->status = SOCK_CONN_CLOSING;
  1757. i = sock_ip_hash(sconn->sin.sin_addr.s_addr,
  1758. 0);
  1759. active_list = &sep->active_hash[i];
  1760. pthread_mutex_lock(&ep->lock);
  1761. TAILQ_REMOVE(active_list, sconn, entry);
  1762. pthread_mutex_unlock(&ep->lock);
  1763. free(sconn);
  1764. free(conn);
  1765. sconn = NULL;
  1766. conn = NULL;
  1767. tx->evt.ep = ep;
  1768. tx->evt.conn = NULL;
  1769. break;
  1770. }
  1771. case SOCK_MSG_CONN_REPLY: {
1772. /* The client is not required to ack a
1773. conn_reply in the context of a reject, so
1774. we just ignore the timeout in that
1775. context */
  1776. if (tx->evt.event.connect.status
  1777. == CCI_ECONNREFUSED)
  1778. {
1779. /* store locally until we can drop the
1780. ep->lock */
  1781. debug_ep (ep, CCI_DB_CONN,
  1782. "%s: No ACK of the reject, "
  1783. "dropping pending msg",
  1784. __func__);
  1785. TAILQ_INSERT_HEAD(&idle_txs,
  1786. tx,
  1787. dentry);
  1788. break;
  1789. }
1790. }
/* an accepted conn_reply (status != CCI_ECONNREFUSED) falls through */
1791. case SOCK_MSG_CONN_ACK:
1792. default:
1793. /* TODO */
pthread_mutex_unlock(&ep->lock); /* ep->lock was taken before this loop; do not return holding it */
1794. CCI_EXIT;
1795. return;
  1796. }
  1797. /* if SILENT, put idle tx */
  1798. if (tx->flags & CCI_FLAG_SILENT &&
  1799. (tx->msg_type == SOCK_MSG_SEND ||
  1800. tx->msg_type == SOCK_MSG_RMA_WRITE)) {
  1801. tx->state = SOCK_TX_IDLE;
1802. /* store locally until we can drop the
1803. ep->lock */
  1804. TAILQ_INSERT_HEAD(&idle_txs, tx, dentry);
  1805. } else {
  1806. tx->state = SOCK_TX_COMPLETED;
1807. /* store locally until we can drop the
1808. ep->lock */
  1809. TAILQ_INSERT_TAIL(&evts, evt, entry);
  1810. }
  1811. continue;
  1812. }
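/* The resend pacing below is a simple exponential backoff: the delay
 * between attempts is (1 << send_count) * SOCK_RESEND_TIME_SEC seconds
 * (converted to microseconds), so each retry waits twice as long as
 * the previous one until the tx finally times out. */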
  1813. /* is it time to resend? */
  1814. if ((tx->last_attempt_us +
  1815. ((1 << tx->send_count) * SOCK_RESEND_TIME_SEC * 1000000)) >
  1816. now) {
  1817. continue;
  1818. }
  1819. /* need to resend it */
  1820. #if 0
  1821. if (tx->send_count == 1 && tx->msg_type == SOCK_MSG_SEND && 0) {
  1822. debug(CCI_DB_INFO, "%s: reducing cwnd from %d to %d"
  1823. " reducing ssthresh from %d to %d",
  1824. __func__, sconn->cwnd, 2, sconn->ssthresh,
  1825. sconn->pending / 2 + 1);
1826. /* reduce the slow start threshold */
  1827. sconn->ssthresh = (sconn->pending / 2) + 1;
  1828. if (sconn->ssthresh < 2)
  1829. sconn->ssthresh = 2;
  1830. sconn->cwnd = 2;
  1831. }
  1832. #endif
  1833. tx->last_attempt_us = now;
  1834. tx->send_count++;
  1835. debug_ep(ep, CCI_DB_MSG,
  1836. "%s: re-sending %s msg seq %u count %u",
  1837. __func__, sock_msg_type(tx->msg_type), tx->seq,
  1838. tx->send_count);
  1839. pack_piggyback_ack (ep, sconn, tx);
  1840. ret = sock_sendto(sep->sock, tx->buffer, tx->len, tx->rma_ptr,
  1841. tx->rma_len, sconn->sin);
  1842. if (tx->rma_ptr == NULL && ret != tx->len) {
  1843. debug((CCI_DB_MSG | CCI_DB_INFO),
  1844. "%s: sendto() failed with %s (%d/%d)", __func__,
  1845. cci_strerror(&ep->endpoint, (enum cci_status)errno),
  1846. ret, tx->len);
  1847. continue;
  1848. }
  1849. if (tx->rma_ptr != NULL && ret != (tx->rma_len + tx->len)) {
  1850. debug((CCI_DB_MSG | CCI_DB_INFO),
  1851. "%s: sendto() failed with %s (%d/%d)", __func__,
  1852. cci_strerror(&ep->endpoint, (enum cci_status)errno),
  1853. ret, tx->rma_len);
  1854. continue;
  1855. }
  1856. }
  1857. pthread_mutex_unlock (&ep->lock);
  1858. /* transfer txs to sock ep's list */
  1859. while (!TAILQ_EMPTY(&idle_txs)) {
  1860. tx = TAILQ_FIRST(&idle_txs);
  1861. TAILQ_REMOVE(&idle_txs, tx, dentry);
  1862. ep = tx->evt.ep;
  1863. sep = ep->priv;
  1864. pthread_mutex_lock(&ep->lock);
  1865. TAILQ_INSERT_HEAD(&sep->idle_txs, tx, dentry);
  1866. pthread_mutex_unlock(&ep->lock);
  1867. }
  1868. /* transfer evts to the ep's list */
  1869. while (!TAILQ_EMPTY(&evts)) {
  1870. evt = TAILQ_FIRST(&evts);
  1871. TAILQ_REMOVE(&evts, evt, entry);
  1872. ep = evt->ep;
  1873. sock_queue_event (ep, evt);
  1874. if (sep->event_fd) {
  1875. int rc;
  1876. rc = write (sep->fd[1], "a", 1);
  1877. if (rc != 1) {
  1878. debug (CCI_DB_WARN, "%s: Write failed", __func__);
  1879. return;
  1880. }
  1881. }
  1882. }
  1883. CCI_EXIT;
  1884. return;
  1885. }
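/* Piggyback a pending ACK onto an outgoing reliable message when
 * possible: if the oldest pending ack range covers a single seq
 * (start == end) and no selective ack (SACK) is needed, that seq is
 * folded into the reliable header's pb_ack field and the queued ACK
 * entry is dropped, saving a standalone ACK packet. A pb_ack of 0
 * means "no piggybacked ack". */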
  1886. static inline int
  1887. pack_piggyback_ack (cci__ep_t *ep, sock_conn_t *sconn, sock_tx_t *tx)
  1888. {
  1889. sock_ack_t *ack = NULL;
  1890. uint64_t now = 0ULL;
  1891. UNUSED_PARAM (ep);
  1892. if (!cci_conn_is_reliable(sconn->conn))
  1893. return CCI_SUCCESS;
  1894. if (!TAILQ_EMPTY(&sconn->acks)) {
  1895. ack = TAILQ_FIRST(&sconn->acks);
  1896. if (1 == sock_need_sack(sconn)) {
  1897. /* Nothing to do */
  1898. } else if (ack != NULL && ack->start == ack->end) {
  1899. sock_header_r_t *hdr_r = tx->buffer;
  1900. hdr_r->pb_ack = ack->start;
  1901. TAILQ_REMOVE(&sconn->acks, ack, entry);
  1902. ack = TAILQ_FIRST(&sconn->acks);
  1903. /* We could get now from the caller if we wanted to */
  1904. now = sock_get_usecs();
  1905. sconn->last_ack_ts = now;
  1906. } else {
  1907. /* ACK_UP_TO, not handled at the moment */
  1908. }
  1909. } else {
  1910. sock_header_r_t *hdr_r = tx->buffer;
  1911. hdr_r->pb_ack = 0;
  1912. }
  1913. return CCI_SUCCESS;
  1914. }
  1915. static void sock_progress_queued(cci__ep_t * ep)
  1916. {
  1917. int ret, is_reliable = 0;
  1918. uint32_t timeout;
  1919. uint64_t now;
  1920. sock_tx_t *tx;
  1921. cci__evt_t *evt, *tmp;
  1922. cci__conn_t *conn;
  1923. sock_ep_t *sep = ep->priv;
  1924. sock_conn_t *sconn;
  1925. union cci_event *event = NULL; /* generic CCI event */
  1926. TAILQ_HEAD(s_idle_txs, sock_tx) idle_txs
  1927. = TAILQ_HEAD_INITIALIZER(idle_txs);
  1928. TAILQ_HEAD(s_evts, cci__evt) evts = TAILQ_HEAD_INITIALIZER(evts);
  1929. CCI_ENTER;
  1930. TAILQ_INIT(&idle_txs);
  1931. TAILQ_INIT(&evts);
  1932. if (!sep)
  1933. return;
  1934. now = sock_get_usecs();
  1935. pthread_mutex_lock(&ep->lock);
  1936. TAILQ_FOREACH_SAFE(evt, &sep->queued, entry, tmp) {
  1937. tx = container_of (evt, sock_tx_t, evt);
  1938. event = &evt->event;
  1939. /* If we deal with a CONN_REJECT, we do not have a
  1940. valid connection */
  1941. if (tx->msg_type == SOCK_MSG_CONN_REPLY
  1942. && tx->evt.event.connect.status == CCI_ECONNREFUSED) {
  1943. conn = NULL;
  1944. sconn = NULL;
  1945. } else {
  1946. conn = evt->conn;
  1947. sconn = conn->priv;
  1948. is_reliable = cci_conn_is_reliable(conn);
  1949. }
  1950. /* try to send it */
1951. /*
1952. RMA_READ_REPLY messages are a special case: they act as an
1953. ACK. For this reason, we do not handle any kind of timeout
1954. for RMA_READ_REPLY messages.
1955. SOCK_MSG_CONN_REPLY in the context of a reject is also a
1956. special case because we do not have a valid connection yet.
1957. */
  1958. if (!(tx->msg_type == SOCK_MSG_RMA_READ_REPLY ||
  1959. (tx->msg_type == SOCK_MSG_CONN_REPLY
  1960. && tx->evt.event.connect.status == CCI_ECONNREFUSED)))
  1961. {
  1962. if (tx->timeout_us == 0ULL) {
  1963. timeout =
  1964. conn->tx_timeout ? conn->tx_timeout
  1965. : ep->tx_timeout;
  1966. tx->timeout_us = now + (uint64_t) timeout;
  1967. }
  1968. if (SOCK_U64_LT(tx->timeout_us, now)) {
  1969. /* set status and add to completed events */
  1970. switch (tx->msg_type) {
  1971. case SOCK_MSG_SEND:
  1972. if (tx->rnr != 0) {
  1973. event->send.status
  1974. = CCI_ERR_RNR;
  1975. } else {
  1976. event->send.status
  1977. = CCI_ETIMEDOUT;
  1978. }
  1979. break;
  1980. case SOCK_MSG_CONN_REQUEST:
1981. /* FIXME: only CONN_REQUEST gets an
1982. * event; the other two need to
1983. * disconnect the conn */
  1984. event->connect.status = CCI_ETIMEDOUT;
  1985. event->connect.connection = NULL;
  1986. break;
  1987. case SOCK_MSG_RMA_WRITE:
  1988. tx->rma_op->pending--;
  1989. tx->rma_op->status = CCI_ETIMEDOUT;
  1990. break;
  1991. case SOCK_MSG_CONN_REPLY:
  1992. case SOCK_MSG_CONN_ACK:
  1993. default:
  1994. /* TODO */
  1995. debug(CCI_DB_WARN,
  1996. "%s: timeout of %s msg",
  1997. __func__,
  1998. sock_msg_type(tx->msg_type));
1999. pthread_mutex_unlock(&ep->lock); /* ep->lock is already held; release it before returning */
  2000. CCI_EXIT;
  2001. return;
  2002. }
  2003. TAILQ_REMOVE(&sep->queued, evt, entry);
  2004. /* if SILENT, put idle tx */
  2005. if (tx->flags & CCI_FLAG_SILENT &&
  2006. (tx->msg_type == SOCK_MSG_SEND ||
  2007. tx->msg_type == SOCK_MSG_RMA_WRITE))
  2008. {
  2009. tx->state = SOCK_TX_IDLE;
2010. /* store locally until we can drop the
2011. * ep->lock */
  2012. TAILQ_INSERT_HEAD(&idle_txs,
  2013. tx, dentry);
  2014. } else {
  2015. tx->state = SOCK_TX_COMPLETED;
2016. /* store locally until we can drop the
2017. * ep->lock */
  2018. TAILQ_INSERT_TAIL(&evts, evt, entry);
  2019. }
  2020. continue;
  2021. } /* end timeout case */
  2022. if (tx->last_attempt_us
  2023. + (SOCK_RESEND_TIME_SEC * 1000000) > now)
  2024. {
  2025. continue;
  2026. }
  2027. }
  2028. #if 0
  2029. if (sconn->pending > sconn->cwnd &&
  2030. tx->msg_type == SOCK_MSG_SEND && 0) {
  2031. continue;
  2032. }
  2033. #endif
  2034. tx->last_attempt_us = now;
  2035. tx->send_count = 1;
  2036. if (is_reliable &&
  2037. !(tx->msg_type == SOCK_MSG_CONN_REQUEST ||
  2038. tx->msg_type == SOCK_MSG_CONN_REPLY))
  2039. {
  2040. TAILQ_INSERT_TAIL(&sconn->tx_seqs, tx, tx_seq);
  2041. }
  2042. #if 0
  2043. /* if reliable and ordered, we have to check whether the tx is marked
  2044. RNR */
  2045. if (is_reliable
  2046. && conn
  2047. && conn->connection.attribute == CCI_CONN_ATTR_RO
  2048. && tx->rnr != 0)
  2049. {
  2050. event->send.status = CCI_ERR_RNR;
  2051. }
  2052. #endif
2053. /* For RMA writes and RMA read requests, we only allow a given
2054. number of messages to be in flight */
  2055. if (tx->msg_type == SOCK_MSG_RMA_WRITE ||
  2056. tx->msg_type == SOCK_MSG_RMA_READ_REQUEST)
  2057. {
  2058. if (tx->rma_op->pending >= SOCK_RMA_DEPTH) {
  2059. continue;
  2060. }
  2061. }
  2062. /* need to send it */
  2063. debug_ep(ep, CCI_DB_MSG, "%s: sending %s msg seq %u",
  2064. __func__, sock_msg_type(tx->msg_type), tx->seq);
  2065. if (tx->msg_type != SOCK_MSG_RMA_READ_REPLY &&
  2066. tx->msg_type != SOCK_MSG_CONN_REPLY)
  2067. {
  2068. pack_piggyback_ack (ep, sconn, tx);
  2069. }
  2070. /* If we deal with a CONN_REJECT, we do not have a
  2071. valid connection */
  2072. if (tx->msg_type == SOCK_MSG_CONN_REPLY
  2073. && tx->evt.event.connect.status == CCI_ECONNREFUSED) {
  2074. ret = sock_sendto(sep->sock, tx->buffer, tx->len,
  2075. tx->rma_ptr, tx->rma_len, tx->sin);
  2076. } else if (tx->msg_type == SOCK_MSG_RMA_WRITE_DONE) {
2077. /* RMA_WRITE_DONE msgs are normal messages even if
2078. associated with an RMA operation, so we make sure they
2079. cannot be put on the wire as RMA messages. */
  2080. ret = sock_sendto(sep->sock, tx->buffer, tx->len,
  2081. NULL, 0, sconn->sin);
  2082. } else {
  2083. ret = sock_sendto(sep->sock, tx->buffer, tx->len,
  2084. tx->rma_ptr, tx->rma_len,
  2085. sconn->sin);
  2086. }
  2087. if (ret == -1) {
  2088. switch (errno) {
  2089. default:
  2090. debug((CCI_DB_MSG | CCI_DB_INFO),
  2091. "%s: sendto() failed with %s\n",
  2092. __func__, strerror(errno));
  2093. /* fall through */
  2094. case EINTR:
  2095. case EAGAIN:
  2096. case ENOMEM:
  2097. case ENOBUFS:
  2098. if (is_reliable &&
  2099. !(tx->msg_type == SOCK_MSG_CONN_REQUEST ||
  2100. tx->msg_type == SOCK_MSG_CONN_REPLY))
  2101. {
  2102. TAILQ_REMOVE(&sconn->tx_seqs,
  2103. tx, tx_seq);
  2104. }
  2105. continue;
  2106. }
  2107. } else {
  2108. /* msg sent, dequeue */
  2109. TAILQ_REMOVE(&sep->queued, &tx->evt, entry);
  2110. if (tx->msg_type == SOCK_MSG_SEND)
  2111. sconn->pending++;
2112. /* If reliable or a connection setup message, add to pending,
2113. else add to idle txs. Note that if we have a
2114. conn_reply with a conn_reject, we do not have a
2115. valid connection and therefore we cannot deal with
2116. a seq. As a result, we just send the conn_reply
2117. message, but we do _NOT_ wait for an ACK (the message
2118. does not go to the pending queue). */
  2119. if (is_reliable ||
  2120. tx->msg_type == SOCK_MSG_CONN_REQUEST ||
  2121. (tx->msg_type == SOCK_MSG_CONN_REPLY &&
  2122. tx->evt.event.connect.status != CCI_ECONNREFUSED))
  2123. {
  2124. tx->state = SOCK_TX_PENDING;
  2125. TAILQ_INSERT_TAIL(&sep->pending, evt, entry);
  2126. debug((CCI_DB_CONN | CCI_DB_MSG),
  2127. "%s: moving queued %s tx to pending "
  2128. "(seq: %u)",
  2129. __func__, sock_msg_type(tx->msg_type),
  2130. tx->seq);
  2131. if (tx->msg_type == SOCK_MSG_RMA_WRITE ||
  2132. tx->msg_type == SOCK_MSG_RMA_READ_REQUEST)
  2133. tx->rma_op->pending++;
  2134. } else {
  2135. tx->state = SOCK_TX_COMPLETED;
  2136. TAILQ_INSERT_TAIL(&idle_txs, tx, dentry);
  2137. }
  2138. }
  2139. }
  2140. pthread_mutex_unlock(&ep->lock);
  2141. /* transfer txs to sock ep's list */
  2142. while (!TAILQ_EMPTY(&idle_txs)) {
  2143. tx = TAILQ_FIRST(&idle_txs);
  2144. TAILQ_REMOVE(&idle_txs, tx, dentry);
  2145. ep = tx->evt.ep;
  2146. sep = ep->priv;
  2147. pthread_mutex_lock(&ep->lock);
  2148. TAILQ_INSERT_HEAD(&sep->idle_txs, tx, dentry);
  2149. pthread_mutex_unlock(&ep->lock);
  2150. }
  2151. /* transfer evts to the ep's list */
  2152. while (!TAILQ_EMPTY(&evts)) {
  2153. evt = TAILQ_FIRST(&evts);
  2154. TAILQ_REMOVE(&evts, evt, entry);
  2155. sock_queue_event (evt->ep, evt);
  2156. if (sep->event_fd) {
  2157. int rc;
  2158. rc = write (sep->fd[1], "a", 1);
  2159. if (rc != 1) {
  2160. debug (CCI_DB_WARN, "%s: Write failed", __func__);
  2161. return;
  2162. }
  2163. }
  2164. }
  2165. CCI_EXIT;
  2166. return;
  2167. }
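/* One progress pass over an endpoint: first complete or retransmit
 * what is already on the wire (pending), then ack our peers, then
 * push newly queued sends. */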
  2168. static void sock_progress_sends(cci__ep_t * ep)
  2169. {
  2170. CCI_ENTER;
  2171. sock_progress_pending (ep);
  2172. sock_ack_conns(ep);
  2173. sock_progress_queued (ep);
  2174. CCI_EXIT;
  2175. return;
  2176. }
  2177. static int ctp_sock_send(cci_connection_t * connection,
  2178. const void *msg_ptr,
  2179. uint32_t msg_len,
  2180. const void *context,
  2181. int flags)
  2182. {
  2183. uint32_t iovcnt = 0;
  2184. struct iovec iov = { NULL, 0 };
  2185. if (msg_ptr && msg_len) {
  2186. iovcnt = 1;
  2187. iov.iov_base = (void *) msg_ptr;
  2188. iov.iov_len = msg_len;
  2189. }
  2190. return ctp_sock_sendv(connection, &iov, iovcnt, context, flags);
  2191. }
  2192. static int ctp_sock_sendv(cci_connection_t * connection,
  2193. const struct iovec *data, uint32_t iovcnt,
  2194. const void *context, int flags)
  2195. {
  2196. int ret = CCI_SUCCESS;
  2197. int is_reliable = 0;
  2198. int data_len = 0;
  2199. uint32_t i;
  2200. size_t s = 0;
  2201. cci_endpoint_t *endpoint = connection->endpoint;
  2202. cci__ep_t *ep;
  2203. cci__conn_t *conn;
  2204. sock_ep_t *sep;
  2205. sock_conn_t *sconn;
  2206. sock_tx_t *tx = NULL;
  2207. sock_header_t *hdr;
  2208. void *ptr;
  2209. cci__evt_t *evt;
  2210. union cci_event *event; /* generic CCI event */
  2211. CCI_ENTER;
  2212. if (!sglobals) {
  2213. CCI_EXIT;
  2214. return CCI_ENODEV;
  2215. }
  2216. for (i = 0; i < iovcnt; i++)
  2217. data_len += data[i].iov_len;
  2218. ep = container_of(endpoint, cci__ep_t, endpoint);
  2219. sep = ep->priv;
  2220. conn = container_of(connection, cci__conn_t, connection);
  2221. sconn = conn->priv;
  2222. is_reliable = cci_conn_is_reliable(conn);
  2223. /* get a tx */
  2224. tx = sock_get_tx (ep);
  2225. if (!tx) {
  2226. CCI_EXIT;
  2227. return CCI_ENOBUFS;
  2228. }
  2229. tx->rma_ptr = NULL;
  2230. tx->rma_len = 0;
  2231. /* tx bookkeeping */
  2232. tx->msg_type = SOCK_MSG_SEND;
  2233. tx->flags = flags;
  2234. /* zero even if unreliable */
  2235. if (!is_reliable) {
  2236. tx->last_attempt_us = 0ULL;
  2237. tx->timeout_us = 0ULL;
  2238. /* If the connection is not reliable, it cannot be a RMA operation */
  2239. tx->rma_op = NULL;
  2240. } else {
  2241. tx->last_attempt_us = 0ULL;
  2242. tx->timeout_us =
  2243. sock_get_usecs() + SOCK_EP_TX_TIMEOUT_SEC * 1000000;
  2244. }
  2245. /* setup generic CCI event */
  2246. evt = &tx->evt;
  2247. evt->ep = ep;
  2248. evt->conn = conn;
  2249. event = &evt->event;
  2250. event->type = CCI_EVENT_SEND;
  2251. event->send.connection = connection;
  2252. event->send.context = (void *)context;
  2253. event->send.status = CCI_SUCCESS; /* for now */
  2254. /* pack buffer */
  2255. hdr = (sock_header_t *) tx->buffer;
  2256. sock_pack_send(hdr, data_len, sconn->peer_id);
  2257. tx->len = sizeof(*hdr);
  2258. /* if reliable, add seq and ack */
  2259. if (is_reliable) {
  2260. sock_header_r_t *hdr_r = tx->buffer;
  2261. uint32_t ts = 0;
  2262. pthread_mutex_lock(&ep->lock);
  2263. tx->seq = ++(sconn->seq);
  2264. pthread_mutex_unlock(&ep->lock);
  2265. sock_pack_seq_ts(&hdr_r->seq_ts, tx->seq, ts);
  2266. tx->len = sizeof(*hdr_r);
  2267. }
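/* At this point tx->len covers just the wire header: sock_header_t
 * for unreliable connections, or the larger sock_header_r_t (which
 * adds the seq/ts word) for reliable ones. The user payload is
 * copied in right behind it below. */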
  2268. ptr = (void*)((uintptr_t)tx->buffer + tx->len);
  2269. /* copy user data to buffer
  2270. * NOTE: ignore CCI_FLAG_NO_COPY because we need to
  2271. send the entire packet in one shot. We could
  2272. use sendmsg() with an iovec. */
  2273. for (i = 0; i < iovcnt; i++) {
2274. if (s + data[i].iov_len > connection->max_send_size) {
2275. debug (CCI_DB_CTP,
2276. "Msg too big: %lu/%u\n",
2277. tx->len + data[i].iov_len,
2278. connection->max_send_size);
/* return the tx so it is not leaked on this error path */
pthread_mutex_lock(&ep->lock);
TAILQ_INSERT_HEAD(&sep->idle_txs, tx, dentry);
pthread_mutex_unlock(&ep->lock);
2279. CCI_EXIT;
2280. return CCI_EINVAL;
2281. }
  2282. memcpy(ptr, data[i].iov_base, data[i].iov_len);
  2283. ptr = (void*)((uintptr_t)ptr + data[i].iov_len);
  2284. tx->len += data[i].iov_len;
  2285. s += data[i].iov_len;
  2286. }
  2287. /* if unreliable, try to send */
  2288. if (!is_reliable) {
  2289. ret = sock_sendto (sep->sock,
  2290. tx->buffer,
  2291. tx->len,
  2292. tx->rma_ptr,
  2293. tx->rma_len,
  2294. sconn->sin);
  2295. if (ret == tx->len) {
2296. /* queue event on endpoint's completed queue */
  2297. tx->state = SOCK_TX_COMPLETED;
  2298. sock_queue_event (ep, evt);
  2299. debug(CCI_DB_MSG, "%s: sent UU msg with %d bytes",
  2300. __func__, tx->len - (int)sizeof(sock_header_t));
2301. /* wake up the app thread if it is blocking on an OS handle */
  2302. if (sep->event_fd) {
  2303. int rc;
  2304. rc = write (sep->fd[1], "a", 1);
  2305. if (rc != 1) {
  2306. CCI_EXIT;
  2307. return CCI_ERROR;
  2308. }
  2309. }
  2310. if (!sep->closing) {
  2311. pthread_mutex_lock(&sep->progress_mutex);
  2312. pthread_cond_signal(&sep->wait_condition);
  2313. pthread_mutex_unlock(&sep->progress_mutex);
  2314. }
  2315. CCI_EXIT;
  2316. return CCI_SUCCESS;
  2317. }
2318. /* on error, return CCI_ERROR so the application is notified
2319. that the send() could not be locally completed; a short
2320. send falls through and the tx is queued below */
2321. if (ret == -1) {
2322. /* if in debug mode, display a warning to help trace
2323. things */
  2324. debug (CCI_DB_WARN, "%s: Send failed (%s)",
  2325. __func__, strerror (errno));
  2326. CCI_EXIT;
  2327. return (CCI_ERROR);
  2328. }
  2329. }
  2330. /* insert at tail of sock device's queued list */
  2331. tx->state = SOCK_TX_QUEUED;
  2332. pthread_mutex_lock(&ep->lock);
  2333. TAILQ_INSERT_TAIL(&sep->queued, evt, entry);
  2334. pthread_mutex_unlock(&ep->lock);
  2335. /* try to progress txs */
  2336. if (!sep->closing) {
  2337. pthread_mutex_lock(&sep->progress_mutex);
  2338. pthread_cond_signal(&sep->wait_condition);
  2339. pthread_mutex_unlock(&sep->progress_mutex);
  2340. }
  2341. ret = CCI_SUCCESS;
  2342. /* if blocking, wait for completion */
  2343. if (tx->flags & CCI_FLAG_BLOCKING) {
  2344. struct timeval tv = { 0, SOCK_PROG_TIME_US / 2 };
  2345. while (tx->state != SOCK_TX_COMPLETED)
  2346. select(0, NULL, NULL, NULL, &tv);
  2347. /* get status and cleanup */
  2348. ret = event->send.status;
  2349. pthread_mutex_lock(&ep->lock);
  2350. TAILQ_REMOVE(&ep->evts, evt, entry);
  2351. pthread_mutex_unlock(&ep->lock);
2352. /* wake up the app thread if it is blocking on an OS handle */
  2353. if (sep->event_fd) {
  2354. int rc;
  2355. rc = write (sep->fd[1], "a", 1);
  2356. if (rc != 1)
  2357. ret = CCI_ERROR;
  2358. }
  2359. pthread_mutex_lock(&ep->lock);
  2360. TAILQ_INSERT_HEAD(&sep->idle_txs, tx, dentry);
  2361. pthread_mutex_unlock(&ep->lock);
  2362. }
  2363. CCI_EXIT;
  2364. return ret;
  2365. }
  2366. static int ctp_sock_rma_register(cci_endpoint_t * endpoint,
  2367. void *start, uint64_t length,
  2368. int flags, cci_rma_handle_t ** rma_handle)
  2369. {
  2370. cci__ep_t *ep = NULL;
  2371. sock_ep_t *sep = NULL;
  2372. sock_rma_handle_t *handle = NULL;
  2373. CCI_ENTER;
  2374. /* FIXME use read/write flags? */
  2375. UNUSED_PARAM (flags);
  2376. if (!sglobals) {
  2377. CCI_EXIT;
  2378. return CCI_ENODEV;
  2379. }
  2380. ep = container_of(endpoint, cci__ep_t, endpoint);
  2381. sep = ep->priv;
  2382. handle = calloc(1, sizeof(*handle));
  2383. if (!handle) {
  2384. CCI_EXIT;
  2385. return CCI_ENOMEM;
  2386. }
  2387. handle->ep = ep;
  2388. handle->length = length;
  2389. handle->start = start;
  2390. *((uint64_t *)&handle->rma_handle.stuff[0]) = (uintptr_t)handle;
  2391. handle->refcnt = 1;
  2392. pthread_mutex_lock(&ep->lock);
  2393. TAILQ_INSERT_TAIL(&sep->handles, handle, entry);
  2394. pthread_mutex_unlock(&ep->lock);
  2395. *rma_handle = &handle->rma_handle;
  2396. CCI_EXIT;
  2397. return CCI_SUCCESS;
  2398. }
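/* The opaque cci_rma_handle_t handed back to the application stores
 * the address of our sock_rma_handle_t in stuff[0]. The peer echoes
 * that value back in RMA headers, so the handle can be recovered
 * with a cast instead of a lookup (see ctp_sock_rma_deregister). */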
  2399. static int
  2400. ctp_sock_rma_deregister(cci_endpoint_t * endpoint,
  2401. cci_rma_handle_t * rma_handle)
  2402. {
  2403. int ret = CCI_EINVAL;
  2404. const struct cci_rma_handle *lh = rma_handle;
  2405. sock_rma_handle_t *handle = (void*)((uintptr_t)lh->stuff[0]);
  2406. cci__ep_t *ep = NULL;
  2407. sock_ep_t *sep = NULL;
  2408. sock_rma_handle_t *h = NULL;
  2409. sock_rma_handle_t *tmp = NULL;
  2410. CCI_ENTER;
  2411. debug (CCI_DB_INFO,
  2412. "%s: deregistering memory -- start: %p",
  2413. __func__, handle->start);
  2414. UNUSED_PARAM (endpoint);
  2415. if (!sglobals) {
  2416. CCI_EXIT;
  2417. return CCI_ENODEV;
  2418. }
  2419. ep = handle->ep;
  2420. sep = ep->priv;
  2421. pthread_mutex_lock(&ep->lock);
  2422. TAILQ_FOREACH_SAFE(h, &sep->handles, entry, tmp) {
  2423. if (h == handle) {
  2424. handle->refcnt--;
  2425. if (handle->refcnt == 0)
  2426. TAILQ_REMOVE(&sep->handles, handle, entry);
  2427. break;
  2428. }
  2429. }
  2430. pthread_mutex_unlock(&ep->lock);
  2431. if (h == handle) {
  2432. if (handle->refcnt == 0) {
  2433. memset(handle, 0, sizeof(*handle));
  2434. free(handle);
  2435. }
  2436. ret = CCI_SUCCESS;
  2437. }
  2438. CCI_EXIT;
  2439. return ret;
  2440. }
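/* Registration starts the handle refcnt at 1; RMA operations in
 * flight take extra references (see ctp_sock_rma), so deregister
 * only frees the handle once the count drops back to 0. */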
  2441. static int ctp_sock_rma(cci_connection_t * connection,
  2442. const void *msg_ptr,
  2443. uint32_t msg_len,
  2444. cci_rma_handle_t * local_handle,
  2445. uint64_t local_offset,
  2446. cci_rma_handle_t * remote_handle,
  2447. uint64_t remote_offset,
  2448. uint64_t data_len,
  2449. const void *context,
  2450. int flags)
  2451. {
  2452. int ret = CCI_ERR_NOT_IMPLEMENTED;
  2453. cci__ep_t *ep = NULL;
  2454. cci__conn_t *conn = NULL;
  2455. sock_ep_t *sep = NULL;
  2456. sock_conn_t *sconn = NULL;
  2457. sock_rma_handle_t *local = (void*)((uintptr_t)local_handle->stuff[0]);
  2458. sock_rma_handle_t *h = NULL;
  2459. sock_rma_op_t *rma_op = NULL;
  2460. size_t max_send_size;
  2461. CCI_ENTER;
  2462. if (!sglobals) {
  2463. CCI_EXIT;
  2464. return CCI_ENODEV;
  2465. }
/* validate the handle before it is dereferenced */
2466. if (!local) {
2467. debug(CCI_DB_INFO, "%s: invalid local RMA handle", __func__);
2468. CCI_EXIT;
2469. return CCI_EINVAL;
2470. }
2471. if (local->length < local_offset + data_len) {
2472. debug(CCI_DB_MSG,
2473. "%s: RMA length + offset exceeds registered length "
2474. "(%"PRIu64" + %"PRIu64" > %"PRIu64")",
2475. __func__, data_len, local_offset, local->length);
2476. CCI_EXIT;
2477. return CCI_EINVAL;
2478. }
2479. conn = container_of(connection, cci__conn_t, connection);
2480. sconn = conn->priv;
2481. ep = container_of(connection->endpoint, cci__ep_t, endpoint);
2482. sep = ep->priv;
  2483. pthread_mutex_lock(&ep->lock);
  2484. TAILQ_FOREACH(h, &sep->handles, entry) {
  2485. if (h == local) {
  2486. local->refcnt++;
  2487. break;
  2488. }
  2489. }
  2490. pthread_mutex_unlock(&ep->lock);
  2491. if (h != local) {
  2492. debug(CCI_DB_INFO, "%s: invalid endpoint for this RMA handle",
  2493. __func__);
  2494. CCI_EXIT;
  2495. return CCI_EINVAL;
  2496. }
  2497. rma_op = calloc(1, sizeof(*rma_op));
  2498. if (!rma_op) {
  2499. pthread_mutex_lock(&ep->lock);
  2500. local->refcnt--;
  2501. pthread_mutex_unlock(&ep->lock);
  2502. CCI_EXIT;
  2503. return CCI_ENOMEM;
  2504. }
  2505. rma_op->data_len = data_len;
  2506. rma_op->local_handle = local_handle;
  2507. rma_op->local_offset = local_offset;
  2508. rma_op->remote_handle = remote_handle;
  2509. rma_op->remote_offset = remote_offset;
  2510. rma_op->id = ++(sconn->rma_id);
  2511. RMA_PAYLOAD_SIZE (connection, max_send_size);
  2512. rma_op->num_msgs = data_len / max_send_size;
  2513. if (data_len % max_send_size)
  2514. rma_op->num_msgs++;
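/* num_msgs = ceil(data_len / max_send_size). For illustration
 * (numbers are hypothetical): a 1 MB transfer with an 8 KB RMA
 * payload is cut into 128 fragments, of which at most
 * SOCK_RMA_DEPTH are kept in flight at any time. */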
  2515. rma_op->completed = 0;
  2516. rma_op->status = CCI_SUCCESS; /* for now */
  2517. rma_op->context = (void *)context;
  2518. rma_op->flags = flags;
  2519. rma_op->msg_len = (uint16_t) msg_len;
  2520. rma_op->tx = NULL;
  2521. if (msg_len)
  2522. rma_op->msg_ptr = (void *) msg_ptr;
  2523. else
  2524. rma_op->msg_ptr = NULL;
  2525. {
  2526. uint32_t i, cnt;
  2527. int err = 0;
  2528. sock_tx_t **txs = NULL;
  2529. uint64_t old_seq = 0ULL;
  2530. debug(CCI_DB_MSG,
  2531. "%s: starting RMA %s (start: %p, len: %"PRIu64") ***",
  2532. __func__,
  2533. flags & CCI_FLAG_WRITE ? "Write" : "Read",
  2534. (void*)local->start, data_len);
  2535. cnt = rma_op->num_msgs < SOCK_RMA_DEPTH ?
  2536. rma_op->num_msgs : SOCK_RMA_DEPTH;
  2537. txs = calloc(cnt, sizeof(*txs));
  2538. if (!txs) {
  2539. pthread_mutex_lock(&ep->lock);
  2540. local->refcnt--;
  2541. pthread_mutex_unlock(&ep->lock);
  2542. free(rma_op);
  2543. CCI_EXIT;
  2544. return CCI_ENOMEM;
  2545. }
  2546. pthread_mutex_lock(&ep->lock);
  2547. old_seq = sconn->seq;
  2548. for (i = 0; i < cnt; i++) {
  2549. if (!TAILQ_EMPTY(&sep->idle_txs)) {
  2550. txs[i] = TAILQ_FIRST(&sep->idle_txs);
  2551. TAILQ_REMOVE(&sep->idle_txs, txs[i], dentry);
  2552. INIT_TX (txs[i]);
  2553. txs[i]->seq = ++(sconn->seq);
  2554. } else
  2555. err++;
  2556. }
  2557. if (err) {
  2558. for (i = 0; i < cnt; i++) {
  2559. if (txs[i])
  2560. TAILQ_INSERT_HEAD(&sep->idle_txs,
  2561. txs[i], dentry);
  2562. }
  2563. local->refcnt--;
  2564. sconn->seq = old_seq;
  2565. }
  2566. pthread_mutex_unlock(&ep->lock);
  2567. if (err) {
  2568. free(txs);
  2569. free(rma_op);
  2570. CCI_EXIT;
  2571. return CCI_ENOBUFS;
  2572. }
  2573. /* we have all the txs we need, pack them and queue them */
  2574. for (i = 0; i < cnt; i++) {
  2575. sock_tx_t *tx = txs[i];
  2576. uint64_t offset = (uint64_t)i * (uint64_t)max_send_size;
  2577. sock_rma_header_t *rma_hdr = (sock_rma_header_t *) tx->buffer;
  2578. rma_op->next = i + 1;
  2579. tx->flags = flags | CCI_FLAG_SILENT;
  2580. tx->state = SOCK_TX_QUEUED;
  2581. /* For RMA, the TX length only includes the header */
  2582. tx->len = sizeof(sock_rma_header_t);
  2583. tx->send_count = 0;
  2584. tx->last_attempt_us = 0ULL;
  2585. tx->timeout_us = 0ULL;
  2586. tx->rma_op = rma_op;
  2587. tx->evt.event.type = CCI_EVENT_SEND;
  2588. tx->evt.event.send.connection = connection;
  2589. tx->evt.conn = conn;
  2590. tx->evt.ep = ep;
  2591. tx->rma_ptr = NULL;
  2592. /* We calculate the amount of data we will actually need */
2593. if (i == (rma_op->num_msgs - 1)) {
2594. if (data_len % max_send_size)
2595. tx->rma_len = data_len % max_send_size;
else
/* an exact multiple leaves a full-size last fragment */
tx->rma_len = (uint16_t)max_send_size;
2596. } else {
2597. tx->rma_len = (uint16_t)max_send_size;
2598. }
  2599. if (flags & CCI_FLAG_WRITE) {
  2600. uint64_t src_offset = local_offset + offset;
  2601. uint64_t dst_offset = remote_offset + offset;
  2602. tx->msg_type = SOCK_MSG_RMA_WRITE;
  2603. tx->rma_ptr = (void*)((uintptr_t)local->start + src_offset);
  2604. sock_pack_rma_write(rma_hdr,
  2605. tx->rma_len,
  2606. sconn->peer_id,
  2607. tx->seq,
  2608. 0,
  2609. local_handle->stuff[0],
  2610. src_offset,
  2611. remote_handle->stuff[0],
  2612. dst_offset);
  2613. debug_ep (ep, CCI_DB_INFO,
  2614. "%s: Preparing RMA write -- "
  2615. "local start: %p, "
  2616. "remote: %"PRIu64", "
  2617. "local offset: %"PRIu64", "
  2618. "remote offset: %"PRIu64", "
  2619. "len: %u, seq: %u",
  2620. __func__,
  2621. local->start,
  2622. remote_handle->stuff[0],
  2623. src_offset,
  2624. dst_offset,
  2625. tx->rma_len,
  2626. tx->seq);
  2627. } else {
  2628. tx->msg_type = SOCK_MSG_RMA_READ_REQUEST;
  2629. debug (CCI_DB_MSG,
  2630. "%s: pack RMA_READ_REQUEST (seq %u)",
  2631. __func__, tx->seq);
  2632. sock_pack_rma_read_request (rma_hdr,
  2633. tx->rma_len,
  2634. sconn->peer_id,
  2635. tx->seq, 0,
  2636. local_handle->stuff[0],
  2637. local_offset + offset,
  2638. remote_handle->stuff[0],
  2639. remote_offset + offset);
  2640. }
  2641. }
  2642. pthread_mutex_lock(&ep->lock);
  2643. for (i = 0; i < cnt; i++)
  2644. TAILQ_INSERT_TAIL(&sep->queued, &(txs[i])->evt, entry);
  2645. TAILQ_INSERT_TAIL(&sconn->rmas, rma_op, rmas);
  2646. TAILQ_INSERT_TAIL(&sep->rma_ops, rma_op, entry);
  2647. pthread_mutex_unlock(&ep->lock);
  2648. /* it is no longer needed */
  2649. free(txs);
  2650. ret = CCI_SUCCESS;
  2651. }
  2652. CCI_EXIT;
  2653. return ret;
  2654. }
  2655. /*!
  2656. Handle incoming sequence number
  2657. If we have acked it
  2658. ignore it
  2659. Walk sconn->acks:
  2660. if it exists in a current entry
  2661. do nothing
  2662. if it borders a current entry
  2663. add it to the entry
2664. if it falls between two entries without bordering them
  2665. add a new entry between them
  2666. else
  2667. add a new entry at the tail
  2668. */
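/* Example: with acks = {[1-5], [8-9]}, an incoming seq 6 extends the
 * first entry to [1-6]; a following seq 7 extends it to [1-7] and,
 * since it now borders [8-9], the two entries merge into [1-9]. */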
  2669. static inline void sock_handle_seq(sock_conn_t * sconn, uint32_t seq)
  2670. {
  2671. int done = 0;
  2672. sock_ack_t *ack = NULL;
  2673. sock_ack_t *last = NULL;
  2674. sock_ack_t *tmp = NULL;
  2675. cci__conn_t *conn = sconn->conn;
  2676. cci_connection_t *connection = &conn->connection;
  2677. cci_endpoint_t *endpoint = connection->endpoint;
  2678. cci__ep_t *ep = container_of(endpoint, cci__ep_t, endpoint);
  2679. if (SOCK_SEQ_LTE(seq, sconn->acked)) {
  2680. debug(CCI_DB_MSG, "%s: ignoring seq %u (acked %u) ***",
  2681. __func__, seq, sconn->acked);
  2682. return;
  2683. }
  2684. pthread_mutex_lock(&ep->lock);
  2685. TAILQ_FOREACH_SAFE(ack, &sconn->acks, entry, tmp) {
  2686. if (SOCK_SEQ_GTE(seq, ack->start) &&
  2687. SOCK_SEQ_LTE(seq, ack->end)) {
  2688. /* seq exists in this entry,
  2689. do nothing */
  2690. debug(CCI_DB_MSG, "%s: seq %u exists between %u-%u",
  2691. __func__, seq, ack->start, ack->end);
  2692. done = 1;
  2693. break;
  2694. } else if (seq == ack->start - 1) {
  2695. /* add it to start of this entry */
  2696. ack->start = seq;
  2697. debug(CCI_DB_MSG, "%s: seq %u exists before %u-%u",
  2698. __func__, seq, ack->start, ack->end);
  2699. done = 1;
  2700. break;
  2701. } else if (seq == ack->end + 1) {
  2702. sock_ack_t *next = TAILQ_NEXT(ack, entry);
  2703. /* add it to the end of this entry */
  2704. debug(CCI_DB_MSG, "%s: seq %u exists after %u-%u",
  2705. __func__, seq, ack->start, ack->end);
  2706. ack->end = seq;
  2707. /* did we plug a hole between entries? */
2708. if (next && ack->end + 1 == next->start) {
2709. /* the ranges now touch: fold this entry into next */
2710. debug(CCI_DB_MSG,
2711. "%s: merging acks %u-%u with %u-%u",
2712. __func__, ack->start, ack->end,
2713. next->start, next->end);
2714. next->start = ack->start;
2715. TAILQ_REMOVE(&sconn->acks, ack, entry);
2716. free(ack);
ack = next; /* ack was just freed; keep a valid pointer */
2717. }
2718. /* force an ACK once the contiguous range grows large */
2719. if (ack->end - ack->start >= PENDING_ACK_THRESHOLD) {
  2720. debug(CCI_DB_MSG, "%s: Forcing ACK", __func__);
  2721. pthread_mutex_unlock(&ep->lock);
  2722. sock_ack_conns (ep);
  2723. pthread_mutex_lock(&ep->lock);
  2724. }
  2725. done = 1;
  2726. break;
  2727. } else if (last && SOCK_SEQ_GT(seq, last->end) &&
  2728. SOCK_SEQ_LT(seq, ack->start)) {
  2729. sock_ack_t *new;
  2730. /* add a new entry before this entry */
  2731. new = calloc(1, sizeof(*new));
  2732. if (new) {
  2733. debug(CCI_DB_MSG,
  2734. "%s: seq %u insert after %u-%u before %u-%u ",
  2735. __func__, seq, last->start, last->end,
  2736. ack->start, ack->end);
  2737. new->start = new->end = seq;
  2738. TAILQ_INSERT_BEFORE(ack, new, entry);
  2739. }
  2740. done = 1;
  2741. break;
  2742. }
  2743. last = ack;
  2744. }
  2745. if (!done) {
  2746. /* add new entry to tail */
  2747. ack = calloc(1, sizeof(*ack));
  2748. if (ack) {
  2749. ack->start = ack->end = seq;
  2750. TAILQ_INSERT_TAIL(&sconn->acks, ack, entry);
  2751. debug(CCI_DB_MSG, "%s: seq %u add at tail", __func__,
  2752. seq);
  2753. }
  2754. }
  2755. pthread_mutex_unlock(&ep->lock);
  2756. return;
  2757. }
  2758. static void
  2759. sock_handle_active_message(sock_conn_t * sconn,
  2760. sock_rx_t * rx,
  2761. uint16_t len,
  2762. uint32_t id)
  2763. {
  2764. cci__evt_t *evt;
  2765. cci__conn_t *conn = sconn->conn;
  2766. cci_endpoint_t *endpoint; /* generic CCI endpoint */
  2767. cci__ep_t *ep;
  2768. sock_ep_t *sep;
  2769. CCI_ENTER;
  2770. UNUSED_PARAM (id);
  2771. endpoint = (&conn->connection)->endpoint;
  2772. ep = container_of(endpoint, cci__ep_t, endpoint);
  2773. sep = ep->priv;
  2774. /* get cci__evt_t to hang on ep->events */
  2775. evt = &rx->evt;
  2776. if (!evt->conn)
  2777. evt->conn = conn;
  2778. /* set wire header so we can find user header */
  2779. if (cci_conn_is_reliable(conn)) {
  2780. sock_header_r_t *hdr_r = (sock_header_r_t *) rx->buffer;
  2781. evt->event.recv.ptr = (void *)&hdr_r->data;
  2782. } else {
  2783. sock_header_t *hdr = (sock_header_t *) rx->buffer;
  2784. evt->event.recv.ptr = (void *)&hdr->data;
  2785. }
  2786. /* setup the generic event for the application */
  2787. evt->event.type = CCI_EVENT_RECV;
  2788. evt->event.recv.len = len;
  2789. evt->event.recv.connection = &conn->connection;
  2790. /* queue event on endpoint's completed event queue */
  2791. sock_queue_event (ep, evt);
2792. /* wake up the app thread if it is blocking on an OS handle */
  2793. if (sep->event_fd) {
  2794. int rc;
  2795. rc = write (sep->fd[1], "a", 1);
  2796. if (rc != 1) {
  2797. debug (CCI_DB_WARN, "%s: Write failed", __func__);
  2798. }
  2799. }
  2800. CCI_EXIT;
  2801. return;
  2802. }
  2803. /*!
  2804. Handle incoming RNR messages
  2805. */
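/* RNR (receiver-not-ready) means the peer had no RX buffer for a
 * given seq. The matching tx is flagged so its eventual completion
 * carries CCI_ERR_RNR, and sconn->rnr remembers the first such seq;
 * on reliable-ordered connections later sends get flagged as well
 * (see sock_progress_pending). */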
  2806. static void sock_handle_rnr(sock_conn_t * sconn, uint32_t seq, uint32_t ts)
  2807. {
  2808. sock_tx_t *tx = NULL;
  2809. sock_tx_t *tmp = NULL;
  2810. int found = 0;
  2811. UNUSED_PARAM (ts);
  2812. /* Find the corresponding SEQ/TS */
  2813. TAILQ_FOREACH_SAFE(tx, &sconn->tx_seqs, tx_seq, tmp) {
  2814. if (tx->seq == seq) {
  2815. debug(CCI_DB_MSG,
  2816. "%s: Receiver not ready (seq: %u)", __func__,
  2817. seq);
  2818. tx->rnr = 1;
  2819. found = 1;
  2820. }
  2821. }
  2822. /* We also mark the conn as RNR */
  2823. if (sconn->rnr == 0)
  2824. sconn->rnr = seq;
  2825. if (found == 0)
  2826. debug (CCI_DB_INFO,
  2827. "%s: Cannot find TX corresponding to RNR", __func__);
  2828. }
  2829. /*!
  2830. Handle incoming nack
  2831. */
  2832. static void
  2833. sock_handle_nack (sock_conn_t * sconn,
  2834. cci__ep_t *ep,
  2835. sock_ep_t *sep,
  2836. uint32_t seq)
  2837. {
  2838. sock_tx_t *tx = NULL;
  2839. sock_tx_t *tmp = NULL;
  2840. debug_ep (ep,
  2841. CCI_DB_MSG,
  2842. "%s: Received NACK (seq: %u)",
  2843. __func__,
  2844. seq);
  2845. /* If the message is still in the pending queue, we resend it,
  2846. otherwise it means the message has been acked meanwhile and
  2847. therefore we can ignore the NACK */
  2848. TAILQ_FOREACH_SAFE (tx, &sconn->tx_seqs, tx_seq, tmp) {
  2849. if (tx->seq == seq) {
  2850. /* Resend and return */
  2851. debug_ep (ep,
  2852. CCI_DB_MSG,
  2853. "Resending NACKed msg (seq %u)",
  2854. seq);
  2855. sock_sendto (sep->sock,
  2856. tx->buffer,
  2857. tx->len,
  2858. tx->rma_ptr,
  2859. tx->rma_len,
  2860. sconn->sin);
  2861. return;
  2862. }
  2863. }
  2864. return;
  2865. }
  2866. /*!
  2867. Handle incoming ack
  2868. Check the device pending list for the matching tx
  2869. if found, remove it and hang it on the completion list
  2870. if not found, ignore (it is a duplicate)
  2871. */
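/* Ack variants handled here:
 * - SOCK_MSG_ACK_ONLY: acks exactly one seq (acks[0]);
 * - SOCK_MSG_ACK_UP_TO: cumulatively acks everything up to acks[0];
 * - SOCK_MSG_SACK: selective ack carrying count/2 [start,end] pairs;
 * - data messages (SEND, RMA_*): may carry one piggybacked ack in
 *   the header's pb_ack field (0 means none). */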
  2872. static void
  2873. sock_handle_ack(sock_conn_t * sconn,
  2874. sock_msg_type_t type,
  2875. sock_rx_t * rx,
  2876. uint32_t count,
  2877. uint32_t id)
  2878. {
  2879. uint32_t i = 0;
  2880. int found = 0;
  2881. cci__conn_t *conn = sconn->conn;
  2882. cci_connection_t *connection = &conn->connection;
  2883. cci_endpoint_t *endpoint = connection->endpoint;
  2884. cci__ep_t *ep = container_of(endpoint, cci__ep_t, endpoint);
  2885. cci__dev_t *dev = ep->dev;
  2886. sock_ep_t *sep = ep->priv;
  2887. sock_tx_t *tx = NULL;
  2888. sock_tx_t *tmp = NULL;
  2889. sock_header_r_t *hdr_r = rx->buffer;
  2890. uint32_t acks[SOCK_MAX_SACK * 2];
  2891. TAILQ_HEAD(s_idle_txs, sock_tx) idle_txs
  2892. = TAILQ_HEAD_INITIALIZER(idle_txs);
  2893. TAILQ_HEAD(s_evts, cci__evt) evts = TAILQ_HEAD_INITIALIZER(evts);
  2894. TAILQ_HEAD(s_queued, sock_tx) queued = TAILQ_HEAD_INITIALIZER(queued);
  2895. TAILQ_INIT(&idle_txs);
  2896. TAILQ_INIT(&evts);
  2897. TAILQ_INIT(&queued);
  2898. assert(id == sconn->id);
  2899. assert(count > 0);
  2900. if (count == 1) {
  2901. assert(type == SOCK_MSG_ACK_ONLY || type == SOCK_MSG_ACK_UP_TO
  2902. || type == SOCK_MSG_SEND || type == SOCK_MSG_RMA_WRITE
  2903. || type == SOCK_MSG_RMA_READ_REQUEST
  2904. || type == SOCK_MSG_RMA_WRITE_DONE
  2905. || type == SOCK_MSG_RMA_READ_REPLY);
  2906. } else {
  2907. assert(type == SOCK_MSG_SACK);
  2908. }
  2909. sock_parse_ack(hdr_r, type, acks, count);
  2910. if (type == SOCK_MSG_ACK_ONLY) {
  2911. if (sconn->seq_pending == acks[0] - 1)
  2912. sconn->seq_pending = acks[0];
  2913. } else if (type == SOCK_MSG_ACK_UP_TO) {
  2914. sconn->seq_pending = acks[0];
  2915. } else if (type == SOCK_MSG_SEND
  2916. || type == SOCK_MSG_RMA_WRITE
  2917. || type == SOCK_MSG_RMA_WRITE_DONE
  2918. || type == SOCK_MSG_RMA_READ_REQUEST
  2919. || type == SOCK_MSG_RMA_READ_REPLY)
  2920. {
  2921. /* Piggybacked ACK */
  2922. acks[0] = hdr_r->pb_ack;
  2923. /* Reset hdr_r->pb_ack so we cannot do this again later */
  2924. hdr_r->pb_ack = 0;
  2925. if (sconn->seq_pending == acks[0] - 1)
  2926. sconn->seq_pending = acks[0];
  2927. }
2928. /*
2929. If this is an explicit ACK message, we have extracted all the info
2930. we need and we can return the RX buffer. If it is NOT an explicit
2931. ACK (for instance a piggybacked ACK), the RX buffer is returned by
2932. the function handling the specific type of message.
2933. */
  2934. if (type == SOCK_MSG_ACK_ONLY || type == SOCK_MSG_ACK_UP_TO
  2935. || type == SOCK_MSG_SACK)
  2936. {
  2937. pthread_mutex_lock(&ep->lock);
  2938. TAILQ_INSERT_HEAD(&sep->idle_rxs, rx, entry);
  2939. pthread_mutex_unlock(&ep->lock);
  2940. }
  2941. pthread_mutex_lock(&dev->lock);
  2942. pthread_mutex_lock(&ep->lock);
  2943. TAILQ_FOREACH_SAFE(tx, &sconn->tx_seqs, tx_seq, tmp) {
  2944. /* Note that type of msgs can include a piggybacked ACK */
  2945. if (type == SOCK_MSG_ACK_ONLY
  2946. || type == SOCK_MSG_SEND
  2947. || type == SOCK_MSG_RMA_WRITE
  2948. || type == SOCK_MSG_RMA_READ_REQUEST
  2949. || type == SOCK_MSG_RMA_WRITE_DONE
  2950. || type == SOCK_MSG_RMA_READ_REPLY)
  2951. {
  2952. if (tx->seq == acks[0]) {
  2953. if (tx->state == SOCK_TX_PENDING) {
  2954. debug(CCI_DB_MSG,
  2955. "%s: acking only seq %u", __func__,
  2956. acks[0]);
  2957. TAILQ_REMOVE(&sep->pending, &tx->evt, entry);
  2958. TAILQ_REMOVE(&sconn->tx_seqs, tx, tx_seq);
  2959. if (tx->msg_type == SOCK_MSG_RMA_WRITE
  2960. || tx->msg_type == SOCK_MSG_RMA_READ_REQUEST)
  2961. tx->rma_op->pending--;
  2962. if (tx->msg_type == SOCK_MSG_SEND) {
  2963. sconn->pending--;
  2964. #if 0
  2965. if (sconn->pending <=
  2966. sconn->ssthresh) {
  2967. sconn->cwnd++;
  2968. debug(CCI_DB_INFO,
  2969. "%s increase cwnd from %d to %d",
  2970. __func__,
  2971. sconn->cwnd - 1,
  2972. sconn->cwnd);
  2973. } else {
  2974. sconn->cwnd++;
  2975. }
  2976. #endif
  2977. }
  2978. /* if SILENT, put idle tx */
  2979. if (tx->flags & CCI_FLAG_SILENT) {
  2980. tx->state = SOCK_TX_IDLE;
  2981. /* store locally until we can drop the locks */
  2982. TAILQ_INSERT_HEAD(&idle_txs, tx,
  2983. dentry);
  2984. } else {
  2985. tx->state = SOCK_TX_COMPLETED;
  2986. tx->evt.event.send.status = CCI_SUCCESS;
  2987. /* store locally until we can drop the locks */
  2988. TAILQ_INSERT_TAIL(&evts, &tx->evt, entry);
  2989. }
  2990. }
  2991. found = 1;
  2992. break;
  2993. }
  2994. } else if (type == SOCK_MSG_ACK_UP_TO) {
  2995. if (SOCK_SEQ_LTE(tx->seq, acks[0])) {
  2996. if (tx->state == SOCK_TX_PENDING) {
  2997. debug(CCI_DB_MSG,
  2998. "%s: acking tx seq %u (up to seq %u)",
  2999. __func__, tx->seq, acks[0]);
  3000. TAILQ_REMOVE(&sep->pending, &tx->evt, entry);
  3001. TAILQ_REMOVE(&sconn->tx_seqs, tx, tx_seq);
  3002. if (tx->msg_type == SOCK_MSG_RMA_WRITE)
  3003. tx->rma_op->pending--;
  3004. if (tx->msg_type == SOCK_MSG_SEND) {
  3005. sconn->pending--;
  3006. #if 0
  3007. if (sconn->pending <=
  3008. sconn->ssthresh) {
  3009. sconn->cwnd++;
  3010. debug(CCI_DB_INFO,
  3011. "%s increase cwnd from %d to %d",
  3012. __func__,
  3013. sconn->cwnd - 1,
  3014. sconn->cwnd);
  3015. } else {
  3016. sconn->cwnd++;
  3017. }
  3018. #endif
  3019. }
  3020. /* if SILENT, put idle tx */
  3021. if (tx->flags & CCI_FLAG_SILENT) {
  3022. tx->state = SOCK_TX_IDLE;
  3023. /* store locally until we can drop the locks */
  3024. TAILQ_INSERT_HEAD(&idle_txs, tx,
  3025. dentry);
  3026. } else {
  3027. tx->state = SOCK_TX_COMPLETED;
  3028. tx->evt.event.send.status = CCI_SUCCESS;
  3029. /* store locally until we can drop the locks */
  3030. TAILQ_INSERT_TAIL(&evts, &tx->evt, entry);
  3031. }
  3032. found++;
  3033. }
  3034. } else {
  3035. break;
  3036. }
  3037. } else { /* SACK */
  3038. for (i = 0; i < (uint32_t) count; i += 2) {
  3039. if (SOCK_SEQ_GTE(tx->seq, acks[i]) &&
  3040. SOCK_SEQ_LTE(tx->seq, acks[i + 1])) {
  3041. if (sconn->seq_pending == acks[i] - 1)
  3042. sconn->seq_pending =
  3043. acks[i + 1];
  3044. if (tx->state == SOCK_TX_PENDING) {
  3045. debug(CCI_DB_MSG,
  3046. "%s: sacking seq %u",
  3047. __func__, tx->seq);
  3048. found++;
  3049. TAILQ_REMOVE(&sep->pending, &tx->evt, entry);
  3050. TAILQ_REMOVE(&sconn->tx_seqs, tx, tx_seq);
  3051. if (tx->msg_type == SOCK_MSG_RMA_WRITE ||
  3052. tx->msg_type == SOCK_MSG_RMA_READ_REPLY)
  3053. {
  3054. tx->rma_op->pending--;
  3055. }
  3056. if (tx->msg_type == SOCK_MSG_SEND) {
  3057. sconn->pending--;
  3058. #if 0
  3059. if (sconn->pending <=
  3060. sconn->ssthresh) {
  3061. sconn->cwnd++;
  3062. debug
  3063. (CCI_DB_INFO,
  3064. "%s increase cwnd from %d to %d",
  3065. __func__,
  3066. sconn->cwnd
  3067. - 1,
  3068. sconn->cwnd);
  3069. } else {
  3070. sconn->cwnd++;
  3071. }
  3072. #endif
  3073. }
  3074. /* if SILENT, put idle tx */
  3075. if (tx->flags & CCI_FLAG_SILENT) {
  3076. tx->state = SOCK_TX_IDLE;
  3077. /* store locally until we can drop the dev->lock */
  3078. TAILQ_INSERT_HEAD (&idle_txs, tx, dentry);
  3079. } else {
  3080. tx->state = SOCK_TX_COMPLETED;
  3081. tx->evt.event.send.status = CCI_SUCCESS;
  3082. /* store locally until we can drop the dev->lock */
  3083. TAILQ_INSERT_TAIL(&evts, &tx->evt, entry);
  3084. }
  3085. }
  3086. }
  3087. }
  3088. }
  3089. }
  3090. pthread_mutex_unlock(&ep->lock);
  3091. pthread_mutex_unlock(&dev->lock);
  3092. debug(CCI_DB_MSG, "%s: acked %d msgs (%s %u)", __func__, found,
  3093. sock_msg_type(type), acks[0]);
  3094. pthread_mutex_lock(&ep->lock);
  3095. /* transfer txs to sock ep's list */
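/* A tx freed by an RMA ack is recycled in place: it either carries
 * the next data fragment (rma_op->next), the RMA_WRITE_DONE remote
 * completion when one was requested, or generates the local
 * completion event; only txs with no more RMA work fall through to
 * the idle list. */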
  3096. while (!TAILQ_EMPTY(&idle_txs)) {
  3097. sock_rma_op_t *rma_op = NULL;
  3098. tx = TAILQ_FIRST(&idle_txs);
  3099. TAILQ_REMOVE(&idle_txs, tx, dentry);
  3100. rma_op = tx->rma_op;
  3101. if (rma_op && rma_op->status == CCI_SUCCESS) {
  3102. sock_rma_handle_t *local = NULL;
  3103. if (rma_op->local_handle != NULL) {
  3104. local = (void*)((uintptr_t)rma_op->local_handle->stuff[0]);
  3105. }
  3106. rma_op->completed++;
  3107. /* progress RMA */
  3108. if (tx == rma_op->tx) {
  3109. int flags = rma_op->flags;
  3110. void *context = rma_op->context;
  3111. /* they acked our remote completion */
  3112. TAILQ_REMOVE(&sep->rma_ops, rma_op, entry);
  3113. TAILQ_REMOVE(&sconn->rmas, rma_op, rmas);
  3114. free(rma_op);
  3115. if (!(flags & CCI_FLAG_SILENT)) {
  3116. tx->evt.event.send.status = CCI_SUCCESS;
  3117. tx->evt.event.send.context = context;
  3118. TAILQ_INSERT_HEAD(&evts, &tx->evt,
  3119. entry);
  3120. continue;
  3121. }
  3122. }
  3123. /* they acked a data segment, do we need to send more
  3124. * or send the remote completion? */
  3125. if (rma_op->next < rma_op->num_msgs) {
  3126. sock_rma_header_t *write = (sock_rma_header_t *) tx->buffer;
  3127. uint64_t offset = 0ULL;
  3128. size_t max_send_size;
  3129. /* send more data */
  3130. i = rma_op->next;
  3131. rma_op->next++;
  3132. tx->flags = rma_op->flags | CCI_FLAG_SILENT;
  3133. tx->state = SOCK_TX_QUEUED;
  3134. /* payload size for now */
  3135. RMA_PAYLOAD_SIZE (connection, max_send_size);
  3136. tx->send_count = 0;
  3137. tx->last_attempt_us = 0ULL;
  3138. tx->timeout_us = 0ULL;
  3139. tx->rma_op = rma_op;
  3140. tx->evt.event.type = CCI_EVENT_SEND;
  3141. tx->evt.event.send.connection = connection;
  3142. tx->evt.conn = conn;
3143. if (i == (rma_op->num_msgs - 1)) {
3144. if (rma_op->data_len % max_send_size)
3145. tx->rma_len = rma_op->data_len % max_send_size;
else
/* an exact multiple leaves a full-size last fragment */
tx->rma_len = (uint16_t)max_send_size;
3146. } else {
3147. tx->rma_len = (uint16_t)max_send_size;
3148. }
  3149. tx->seq = ++(sconn->seq);
  3150. tx->len = sizeof(sock_rma_header_t);
  3151. offset = (uint64_t) i * (uint64_t) max_send_size;
  3152. if (tx->flags & CCI_FLAG_WRITE) {
  3153. uint64_t src_offset = rma_op->local_offset + offset;
  3154. uint64_t dst_offset = rma_op->remote_offset + offset;
  3155. debug_ep (ep, CCI_DB_INFO,
  3156. "%s: Prepare RMA write -- "
  3157. "start: %p, offset: %"PRIu64", "
  3158. "len: %u, seq: %u",
  3159. __func__, local->start,
  3160. src_offset, tx->rma_len, tx->seq);
  3161. tx->msg_type = SOCK_MSG_RMA_WRITE;
  3162. tx->rma_ptr = (void*)((uintptr_t)local->start + src_offset);
  3163. sock_pack_rma_write(write,
  3164. tx->rma_len,
  3165. sconn->peer_id,
  3166. tx->seq,
  3167. 0,
  3168. rma_op->local_handle->stuff[0],
  3169. src_offset,
  3170. rma_op->remote_handle->stuff[0],
  3171. dst_offset);
  3172. } else {
  3173. tx->msg_type = SOCK_MSG_RMA_READ_REQUEST;
  3174. /* FIXME: not nice to use a "write" variable here, esp since
  3175. * the code is correct, only the name is confusing */
  3176. sock_pack_rma_read_request (write, tx->rma_len,
  3177. sconn->peer_id, tx->seq, 0,
  3178. rma_op->local_handle->stuff[0],
  3179. rma_op->local_offset + offset,
  3180. rma_op->remote_handle->stuff[0],
  3181. rma_op->remote_offset + offset);
  3182. }
  3183. /* now include the header */
  3184. TAILQ_INSERT_TAIL(&queued, tx, dentry);
  3185. continue;
  3186. } else if (rma_op->completed == rma_op->num_msgs) {
  3187. /* send remote completion? */
  3188. if (rma_op->msg_len) {
  3189. sock_header_r_t *hdr_r = tx->buffer;
  3190. sock_rma_header_t *write = NULL;
  3191. void *msg_ptr = NULL;
  3192. rma_op->tx = tx;
  3193. tx->msg_type = SOCK_MSG_RMA_WRITE_DONE;
  3194. tx->flags = rma_op->flags | CCI_FLAG_SILENT;
  3195. tx->state = SOCK_TX_QUEUED;
  3196. /* payload size for now */
  3197. tx->len = (uint16_t) rma_op->msg_len;
  3198. tx->send_count = 0;
  3199. tx->last_attempt_us = 0ULL;
  3200. tx->timeout_us = 0ULL;
  3201. tx->rma_op = rma_op;
  3202. tx->seq = ++(sconn->seq);
  3203. tx->evt.event.type = CCI_EVENT_SEND;
  3204. tx->evt.event.send.connection = connection;
  3205. tx->evt.event.send.context = rma_op->context;
  3206. tx->evt.conn = conn;
  3207. tx->evt.ep = ep;
  3208. /* From here we have a valid TX buffer
  3209. that we can use to send the remote
  3210. completion. First we prepare the
  3211. header */
  3212. write = (sock_rma_header_t *) tx->buffer;
  3213. debug_ep (ep, CCI_DB_EP,
  3214. "%s: Sending msg completion; "
  3215. "msg cmpl len: %u, seq: %u",
  3216. __func__, rma_op->msg_len, tx->seq);
  3217. sock_pack_rma_write_done(write,
  3218. sizeof(uint32_t) + rma_op->msg_len,
  3219. sconn->peer_id,
  3220. tx->seq, 0);
  3221. /* Then we copy the completion data
  3222. (len + data) */
  3223. msg_ptr = (void *)(hdr_r->data);
  3224. memcpy(msg_ptr, &rma_op->msg_len,
  3225. sizeof(uint32_t));
  3226. msg_ptr = (void *)(hdr_r->data + sizeof(uint32_t));
  3227. memcpy(msg_ptr, rma_op->msg_ptr, rma_op->msg_len);
  3228. /* The total size of the RMA_WRITE_DONE
  3229. msg is the RMA header + len and data
  3230. for the remote completion msg */
  3231. tx->len = sizeof (sock_rma_header_t)
  3232. + sizeof(uint32_t)
  3233. + rma_op->msg_len;
  3234. TAILQ_INSERT_TAIL(&queued, tx, dentry);
  3235. continue;
  3236. } else {
  3237. int flags = rma_op->flags;
  3238. void *context = rma_op->context;
  3239. /* complete now */
  3240. TAILQ_REMOVE(&sep->rma_ops, rma_op, entry);
  3241. TAILQ_REMOVE(&sconn->rmas, rma_op, rmas);
  3242. local->refcnt--;
  3243. free(rma_op);
  3244. if (!(flags & CCI_FLAG_SILENT)) {
  3245. tx->evt.event.send.status =
  3246. CCI_SUCCESS;
  3247. tx->evt.event.send.context =
  3248. context;
  3249. TAILQ_INSERT_HEAD(&evts,
  3250. &tx->evt,
  3251. entry);
  3252. continue;
  3253. }
  3254. }
  3255. }
  3256. }
  3257. TAILQ_INSERT_HEAD(&sep->idle_txs, tx, dentry);
  3258. }
  3259. /* transfer evts to the ep's list */
  3260. while (!TAILQ_EMPTY(&evts)) {
  3261. cci__evt_t *evt;
  3262. evt = TAILQ_FIRST(&evts);
  3263. TAILQ_REMOVE(&evts, evt, entry);
  3264. TAILQ_INSERT_TAIL(&ep->evts, evt, entry);
3265. /* wake up the app thread if it is blocking on an OS handle */
  3266. if (sep->event_fd) {
  3267. int rc;
  3268. rc = write (sep->fd[1], "a", 1);
3269. if (rc != 1) {
3270. debug (CCI_DB_WARN, "%s: Write failed", __func__);
pthread_mutex_unlock(&ep->lock); /* still held here; release it before returning */
3271. CCI_EXIT;
3272. return;
3273. }
  3274. }
  3275. }
  3276. pthread_mutex_unlock(&ep->lock);
  3277. pthread_mutex_lock(&dev->lock);
  3278. pthread_mutex_lock(&ep->lock);
  3279. while (!TAILQ_EMPTY(&queued)) {
  3280. sock_tx_t *my_tx;
  3281. my_tx = TAILQ_FIRST(&queued);
  3282. TAILQ_REMOVE(&queued, my_tx, dentry);
  3283. TAILQ_INSERT_TAIL(&sep->queued, &my_tx->evt, entry);
  3284. }
  3285. pthread_mutex_unlock(&ep->lock);
  3286. pthread_mutex_unlock(&dev->lock);
3287. /* We received an ACK so we wake up the send thread */
  3288. if (!sep->closing) {
  3289. pthread_mutex_lock(&sep->progress_mutex);
  3290. pthread_cond_signal(&sep->wait_condition);
  3291. pthread_mutex_unlock(&sep->progress_mutex);
  3292. }
  3293. CCI_EXIT;
  3294. return;
  3295. }
  3296. static void
  3297. sock_handle_conn_request(sock_rx_t * rx,
  3298. cci_conn_attribute_t attr,
  3299. uint16_t len, struct sockaddr_in sin, cci__ep_t * ep)
  3300. {
  3301. char name[32];
  3302. sock_ep_t *sep = NULL;
  3303. CCI_ENTER;
  3304. memset(name, 0, sizeof(name));
  3305. sock_sin_to_name(sin, name, sizeof(name));
  3306. debug_ep(ep, CCI_DB_CONN, "%s: recv'd conn_req from %s",
  3307. __func__, name);
  3308. rx->evt.event.type = CCI_EVENT_CONNECT_REQUEST;
  3309. rx->evt.event.request.attribute = attr;
  3310. *((uint32_t *) & rx->evt.event.request.data_len) = len;
  3311. if (len)
  3312. *((void **)&rx->evt.event.request.data_ptr) =
  3313. (void *)((((sock_header_r_t *) rx->buffer)->data) +
  3314. (uintptr_t) sizeof(sock_handshake_t));
  3315. else
  3316. *((void **)&rx->evt.event.request.data_ptr) = NULL;
  3317. /* queue event on endpoint's completed event queue */
  3318. sock_queue_event (ep, &rx->evt);
3319. /* wake up the app thread if it is blocking on an OS handle */
  3320. sep = ep->priv;
  3321. if (sep->event_fd) {
  3322. int rc;
  3323. rc = write (sep->fd[1], "a", 1);
  3324. if (rc != 1) {
  3325. debug (CCI_DB_WARN, "%s: Write failed", __func__);
  3326. return;
  3327. }
  3328. }
  3329. CCI_EXIT;
  3330. return;
  3331. }
  3332. /**
  3333. * Possible states and what to do:
  3334. *
3335. * Recv         send      send      with    complete  switch
3336. * Success      conn_ack  reliably  seq_ts  event     lists
3337. * -------------------------------------------------------------------
3338. * No conn      Error
3339. * Active conn  Yes       Yes       Yes     Yes       Yes
3340. * Ready conn   Yes       Yes       Yes     No        No
3341. * ===================================================================
3342. * Recv         send      send      with    complete  free
3343. * Rejected     conn_ack  reliably  seq_ts  event     conn
3344. * -------------------------------------------------------------------
3345. * No conn      Yes       No        No      No        No
3346. * Active conn  Yes       No        No      Yes       Yes
3347. * Ready conn   Error
  3348. * @param[in] reply CCI_SUCCESS or CCI_ECONNREFUSED
  3349. */
  3350. static void sock_handle_conn_reply(sock_conn_t * sconn,
  3351. sock_rx_t * rx,
  3352. uint8_t reply,
  3353. uint16_t unused,
  3354. uint32_t id,
  3355. struct sockaddr_in sin,
  3356. cci__ep_t * ep)
  3357. {
  3358. int i, ret;
  3359. cci__evt_t *evt = NULL, *tmp = NULL, *e = NULL;
  3360. cci__conn_t *conn = NULL;
  3361. sock_ep_t *sep = NULL;
  3362. sock_tx_t *tx = NULL, *t = NULL;
  3363. sock_header_r_t *hdr_r; /* wire header */
  3364. union cci_event *event; /* generic CCI event */
  3365. uint32_t seq; /* peer's seq */
  3366. uint32_t ts; /* FIXME our original seq */
  3367. sock_handshake_t *hs = NULL;
  3368. struct s_active *active_list;
  3369. CCI_ENTER;
  3370. UNUSED_PARAM (unused);
  3371. sep = ep->priv;
  3372. if (!sconn) {
  3373. /*
  3374. * Either this is a dup and the conn is now ready or
  3375. * the conn is closed and we simply ack the msg
  3376. */
  3377. /* look for a conn that is ready */
  3378. sconn = sock_find_conn(sep, sin.sin_addr.s_addr, sin.sin_port,
  3379. id, SOCK_MSG_SEND);
  3380. if (!sconn) {
  3381. sock_header_r_t hdr;
  3382. int len = (int)sizeof(hdr);
  3383. char from[32];
  3384. memset(from, 0, sizeof(from));
  3385. sock_sin_to_name(sin, from, sizeof(from));
  3386. debug_ep(ep, (CCI_DB_CONN | CCI_DB_MSG),
  3387. "%s: recv'd conn_reply (%s) from %s"
  3388. " with no matching conn",
  3389. __func__,
  3390. reply == CCI_SUCCESS ? "success" : "rejected",
  3391. from);
  3392. /* simply ack this msg and cleanup */
  3393. memset(&hdr, 0, sizeof(hdr));
  3394. sock_pack_conn_ack(&hdr.header, id);
  3395. ret = sock_sendto(sep->sock, &hdr, len, NULL, 0, sin);
  3396. if (ret != len) {
  3397. debug_ep(ep, (CCI_DB_CONN | CCI_DB_MSG),
  3398. "%s: failed to send conn_ack with %s",
  3399. __func__,
  3400. cci_strerror(&ep->endpoint,
  3401. (enum cci_status)ret));
  3402. }
  3403. pthread_mutex_lock(&ep->lock);
  3404. TAILQ_INSERT_HEAD(&sep->idle_rxs, rx, entry);
  3405. pthread_mutex_unlock(&ep->lock);
3406. /* We only peeked at the header so far and got enough
3407. data to move on, so we drop the msg */
  3408. sock_drop_msg (sep->sock);
  3409. CCI_EXIT;
  3410. return;
  3411. }
  3412. /* else we have a connection and we can ack normally */
  3413. }
  3414. conn = sconn->conn;
  3415. /* set wire header so we can find user header */
  3416. hdr_r = (sock_header_r_t *) rx->buffer;
  3417. /* FIXME do something with ts */
  3418. sock_parse_seq_ts(&hdr_r->seq_ts, &seq, &ts);
  3419. /* get cci__evt_t to hang on ep->events */
  3420. evt = &rx->evt;
  3421. /* setup the generic event for the application */
  3422. event = & evt->event;
  3423. event->type = CCI_EVENT_CONNECT;
  3424. event->connect.status = reply;
  3425. event->connect.connection =
  3426. reply == CCI_SUCCESS ? &conn->connection : NULL;
  3427. event->connect.context = conn->connection.context;
  3428. i = sock_ip_hash(sin.sin_addr.s_addr, 0);
  3429. active_list = &sep->active_hash[i];
  3430. pthread_mutex_lock(&ep->lock);
  3431. TAILQ_REMOVE(active_list, sconn, entry);
  3432. pthread_mutex_unlock(&ep->lock);
  3433. if (sconn->status == SOCK_CONN_ACTIVE) {
  3434. uint32_t peer_id, ack, max_recv_buffer_count, mss, keepalive;
  3435. if (CCI_SUCCESS == reply)
  3436. {
  3437. /* Connection is accepted */
  3438. /* We finally get the entire message */
  3439. uint32_t total_size = sizeof (sock_header_r_t)
  3440. + sizeof (sock_handshake_t);
  3441. uint32_t recv_len = sock_recv_msg (sep->sock,
  3442. rx->buffer,
  3443. total_size, 0,
  3444. NULL);
  3445. debug (CCI_DB_EP, "%s: We now have %d/%u bytes",
  3446. __func__, recv_len, total_size);
  3447. #if CCI_DEBUG
  3448. assert (recv_len == total_size);
  3449. #endif
  3450. debug(CCI_DB_CONN,
  3451. "%s: transition active connection to ready",
  3452. __func__);
  3453. hs = (sock_handshake_t *) ((uintptr_t)rx->buffer
  3454. + sizeof(*hdr_r));
  3455. /* With conn_reply, we do not care about the keepalive
  3456. param */
  3457. sock_parse_handshake(hs, &peer_id, &ack,
  3458. &max_recv_buffer_count, &mss,
  3459. &keepalive);
  3460. /* get pending conn_req tx, create event, move conn to
  3461. conn_hash */
  3462. pthread_mutex_lock(&ep->lock);
  3463. TAILQ_FOREACH_SAFE(e, &sep->pending, entry, tmp) {
  3464. t = container_of (e, sock_tx_t, evt);
  3465. if (t->seq == ack) {
  3466. TAILQ_REMOVE(&sep->pending, e, entry);
  3467. tx = t;
  3468. break;
  3469. }
  3470. }
  3471. pthread_mutex_unlock(&ep->lock);
  3472. /* Since we remove the pending tx, update the
  3473. pending_seq for that given connection */
  3474. if (sconn->seq_pending == ack - 1)
  3475. sconn->seq_pending = ack;
  3476. if (!tx) {
  3477. char from[32];
  3478. memset(from, 0, sizeof(from));
  3479. sock_sin_to_name(sin, from, sizeof(from));
  3480. /* We cannot be active without a tx pending */
  3481. debug_ep(ep, CCI_DB_WARN,
  3482. "%s: recv'd conn_reply (%s) from %s "
  3483. "with an active conn and no matching "
  3484. "tx",
  3485. __func__,
  3486. reply == CCI_SUCCESS ? "success"
  3487. : "rejected",
  3488. from);
  3489. /* we can't transition to ready since we do not
  3490. have the context from the conn_request tx */
  3491. assert(0);
  3492. }
  3493. /* check mss and rx count */
  3494. if (mss < conn->connection.max_send_size)
  3495. conn->connection.max_send_size = mss;
  3496. if (cci_conn_is_reliable(conn)) {
  3497. sconn->max_tx_cnt = max_recv_buffer_count <
  3498. ep->tx_buf_cnt ?
  3499. max_recv_buffer_count :
  3500. ep->tx_buf_cnt;
  3501. sconn->ssthresh = sconn->max_tx_cnt;
  3502. }
  3503. sconn->peer_id = peer_id;
  3504. sconn->status = SOCK_CONN_READY;
  3505. *((struct sockaddr_in *)&sconn->sin) = sin;
  3506. sconn->acked = seq;
  3507. i = sock_ip_hash(sin.sin_addr.s_addr, sin.sin_port);
  3508. pthread_mutex_lock(&ep->lock);
  3509. TAILQ_INSERT_TAIL(&sep->conn_hash[i], sconn, entry);
  3510. pthread_mutex_unlock(&ep->lock);
  3511. debug(CCI_DB_CONN, "%s: conn ready on hash %d",
  3512. __func__, i);
  3513. } else {
  3514. /* Connection is rejected */
  3515. sock_header_r_t hdr;
  3516. int len = (int)sizeof(hdr);
  3517. char name[32];
  3518. /* We finally get the entire message */
  3519. uint32_t total_size = sizeof (sock_header_r_t);
  3520. uint32_t recv_len = sock_recv_msg (sep->sock,
  3521. rx->buffer,
  3522. total_size, 0,
  3523. NULL);
  3524. debug (CCI_DB_EP, "%s: We now have %d/%u bytes",
  3525. __func__, recv_len, total_size);
  3526. #if CCI_DEBUG
  3527. assert (recv_len == total_size);
  3528. #endif
  3529. free(sconn);
  3530. if (conn->uri)
  3531. free((char *)conn->uri);
  3532. free(conn);
  3533. /* send unreliable conn_ack */
  3534. memset(name, 0, sizeof(name));
  3535. sock_sin_to_name(sin, name, sizeof(name));
  3536. debug_ep(ep, (CCI_DB_CONN | CCI_DB_MSG),
  3537. "%s: recv'd conn_reply (rejected) from %s"
  3538. " - closing conn", __func__, name);
  3539. /*
  3540. * Implicit ACK of the corresponding conn_req
  3541. */
  3542. debug((CCI_DB_CONN | CCI_DB_MSG),
  3543. "%s: Implicitely ACKing conn_req %u",
  3544. __func__, seq);
  3545. /* get pending conn_req tx, create event, move conn to
  3546. conn_hash */
  3547. pthread_mutex_lock(&ep->lock);
  3548. TAILQ_FOREACH_SAFE(e, &sep->pending, entry, tmp) {
  3549. t = container_of (e, sock_tx_t, evt);
  3550. if (t->seq == seq) {
  3551. TAILQ_REMOVE(&sep->pending, e, entry);
  3552. tx = t;
  3553. break;
  3554. }
  3555. }
  3556. pthread_mutex_unlock(&ep->lock);
  3557. /* Since we remove the pending tx, update the
  3558. pending_seq for that given connection */
  3559. if (sconn->seq_pending == seq - 1)
  3560. sconn->seq_pending = seq;
  3561. /* simply ack this msg and cleanup */
  3562. memset(&hdr, 0, sizeof(hdr));
  3563. sock_pack_conn_ack(&hdr.header, sconn->peer_id);
  3564. ret = sock_sendto(sep->sock, &hdr, len, NULL, 0, sin);
  3565. if (ret != len) {
  3566. debug_ep(ep, (CCI_DB_CONN | CCI_DB_MSG),
  3567. "%s: failed to send conn_ack with %s",
  3568. __func__,
  3569. cci_strerror(&ep->endpoint,
  3570. (enum cci_status)ret));
  3571. }
  3572. }
  3573. /* add rx->evt to ep->evts */
  3574. sock_queue_event (ep, &rx->evt);
  3575. /* waking up the app thread if it is blocking on a OS handle */
  3576. if (sep->event_fd) {
  3577. int rc;
  3578. rc = write (sep->fd[1], "a", 1);
  3579. if (rc != 1) {
  3580. debug (CCI_DB_WARN, "%s: Write failed", __func__);
  3581. CCI_EXIT;
  3582. return;
  3583. }
  3584. }
  3585. if (reply != CCI_SUCCESS) {
  3586. CCI_EXIT;
  3587. return;
  3588. }
  3589. } else if (sconn->status == SOCK_CONN_READY) {
  3590. tx = sock_get_tx (ep);
  3591. if (!tx) {
  3592. char to[32];
  3593. memset(to, 0, sizeof(to));
  3594. sock_sin_to_name(sin, to, sizeof(to));
  3595. /* we can't ack, cleanup */
  3596. debug_ep(ep, (CCI_DB_CONN | CCI_DB_MSG),
  3597. "%s: no tx buff to send a conn_ack to %s",
  3598. __func__, to);
  3599. pthread_mutex_lock(&ep->lock);
  3600. TAILQ_INSERT_HEAD(&sep->idle_rxs, rx, entry);
  3601. pthread_mutex_unlock(&ep->lock);
  3602. CCI_EXIT;
  3603. return;
  3604. }
  3605. }
  3606. /* we have a tx for the conn_ack */
  3607. tx->rma_ptr = NULL;
  3608. tx->rma_len = 0;
  3609. tx->seq = ++(sconn->seq);
  3610. tx->flags = CCI_FLAG_SILENT;
  3611. tx->msg_type = SOCK_MSG_CONN_ACK;
  3612. tx->evt.event.type = CCI_EVENT_SEND;
  3613. tx->evt.event.connect.connection = &conn->connection;
  3614. tx->evt.ep = ep;
  3615. tx->evt.conn = conn;
  3616. tx->last_attempt_us = 0ULL;
  3617. tx->timeout_us = 0ULL;
  3618. tx->rma_op = NULL;
  3619. hdr_r = tx->buffer;
  3620. sock_pack_conn_ack(&hdr_r->header, sconn->peer_id);
  3621. sconn->last_ack_ts = sock_get_usecs();
  3622. /* the conn_ack acks the server's seq in the timestamp */
  3623. sock_pack_seq_ts(&hdr_r->seq_ts, tx->seq, seq);
  3624. tx->len = sizeof (sock_header_r_t);
  3625. debug(CCI_DB_CONN, "%s: queuing conn_ack with seq %u",
  3626. __func__, tx->seq);
  3627. tx->state = SOCK_TX_QUEUED;
  3628. pthread_mutex_lock(&ep->lock);
  3629. TAILQ_INSERT_TAIL(&sep->queued, &tx->evt, entry);
  3630. pthread_mutex_unlock(&ep->lock);
  3631. #if DEBUG_RNR
  3632. conn_established = true;
  3633. #endif
  3634. /* try to progress txs */
  3635. pthread_mutex_lock(&sep->progress_mutex);
  3636. pthread_cond_signal(&sep->wait_condition);
  3637. pthread_mutex_unlock(&sep->progress_mutex);
  3638. CCI_EXIT;
  3639. return;
  3640. }
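/*
 * sock_handle_rma_read_reply() consumes the reply to an earlier RMA read:
 * it validates the local RMA handle/offset/length, then scatters the UDP
 * datagram straight into the target buffer with a two-element iovec (wire
 * header first, payload second) so no intermediate copy is needed.
 *
 * A minimal sketch of the scatter-receive technique used below; "target",
 * "offset" and "payload_len" are hypothetical names, only the iovec layout
 * is the point:
 *
 *	struct iovec iov[2];
 *	struct msghdr msg;
 *	memset(&msg, 0, sizeof(msg));
 *	iov[0].iov_base = header_buf;		// wire header stays in the RX
 *	iov[0].iov_len  = sizeof(sock_rma_header_t);
 *	iov[1].iov_base = target + offset;	// payload lands in place
 *	iov[1].iov_len  = payload_len;
 *	msg.msg_iov = iov;
 *	msg.msg_iovlen = 2;
 *	recvmsg(sock, &msg, 0);
 */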
static void
sock_handle_rma_read_reply(sock_conn_t *sconn,
			sock_rx_t *rx,
			uint32_t len,
			uint32_t tx_id)
{
	int ret = 0;
	cci__conn_t *conn = sconn->conn;
	cci_endpoint_t *endpoint;
	cci__ep_t *ep;
	sock_ep_t *sep;
	sock_rma_header_t *read = rx->buffer;
	uint64_t local_handle, local_offset;
	sock_rma_handle_t *local, *h = NULL;
	sock_header_r_t *hdr_r;
	uint32_t seq, ts;
	struct msghdr msg;
	struct iovec iov[2];
	struct sockaddr_in sin;

	CCI_ENTER;

	UNUSED_PARAM (tx_id);

	/* RX already contains the header */
	hdr_r = (sock_header_r_t *) rx->buffer;
	sock_parse_seq_ts(&hdr_r->seq_ts, &seq, &ts);

	debug(CCI_DB_MSG,
		"%s: recv'ing RMA_READ_REPLY on conn %p with len %u",
		__func__, (void*)conn, len);

	sock_parse_rma_handle_offset(&read->local, &local_handle,
					&local_offset);
	local = (sock_rma_handle_t *) (uintptr_t) local_handle;
	assert (local);

	endpoint = (&conn->connection)->endpoint;
	ep = container_of (endpoint, cci__ep_t, endpoint);
	sep = ep->priv;

	pthread_mutex_lock(&ep->lock);
	TAILQ_FOREACH(h, &sep->handles, entry) {
		if (h == local) {
			break;
		}
	}
	pthread_mutex_unlock(&ep->lock);

	if (h != local) {
		/* local is no longer valid, send CCI_ERR_RMA_HANDLE */
		ret = CCI_ERR_RMA_HANDLE;
		debug(CCI_DB_WARN, "%s: local handle not valid", __func__);
		goto out;
	} else if (local_offset > local->length) {
		/* offset exceeds local handle's range, send nak */
		ret = CCI_ERR_RMA_HANDLE;
		debug(CCI_DB_WARN, "%s: local offset not valid", __func__);
		goto out;
	} else if ((local_offset + len) > local->length) {
		/* length exceeds local handle's range, send nak */
		ret = CCI_ERR_RMA_HANDLE;
		debug(CCI_DB_WARN,
			"%s: local length not valid (%"PRIu64"/%"PRIu64")",
			__func__, local_offset + len, local->length);
		goto out;
	}

	/* valid local handle, copy the data */
	debug(CCI_DB_MSG, "%s: recv'ing data into target buffer (%u bytes)",
		__func__, len);

	/* We receive the entire message using an IOVEC: the first elt of the
	   IOVEC is the header and the second one the actual data */
	memset (&msg, 0, sizeof (msg));
	msg.msg_name = (void*)&sin;
	msg.msg_namelen = sizeof(sin);
	iov[0].iov_len = sizeof (sock_rma_header_t);
	iov[0].iov_base = rx->buffer;
	iov[1].iov_len = len;
	iov[1].iov_base = (void*)((uintptr_t)h->start
				+ (uintptr_t)local_offset);
	msg.msg_iov = iov;
	msg.msg_iovlen = 2;
again:
	ret = recvmsg (sep->sock, &msg, 0);
	if (ret == -1) {
		/* TODO we need to drain the message from the fd */
		if (errno == EAGAIN)
			goto again;
		debug(CCI_DB_MSG,
			"%s: recv'ing RMA READ payload failed with %s",
			__func__, strerror(errno));
	}
#if CCI_DEBUG
	assert (ret == (int)(sizeof (sock_rma_header_t) + len));
#endif
	debug (CCI_DB_EP,
		"%s: We now have %d/%lu bytes",
		__func__, ret,
		sizeof (sock_rma_header_t) + len);

out:
	pthread_mutex_lock(&ep->lock);
	TAILQ_INSERT_HEAD(&sep->idle_rxs, rx, entry);
	pthread_mutex_unlock(&ep->lock);

	CCI_EXIT;
	return;
}
/*
 * GV [2012/01/16] Not sure that an event needs to be returned on the server
 * side, so part of the following comments may be wrong. The event related
 * code is deactivated until this is clarified.
 * First of all, remember that on the server side, we always receive a
 * conn_ack for both an accepted and a rejected connection (in the context of
 * a reliable connection). Therefore, the conn_ack follows a conn_reply that
 * was either a CCI_EVENT_CONNECT_ACCEPTED or a CCI_EVENT_CONNECT_REJECTED.
 * When receiving the conn_reply, we queue the conn_ack, we check the
 * "context" (accept or reject) and generate an event to the server
 * application.
 * Therefore, when receiving a CONN_ACK, we have to:
 * - if the connection is accepted, return an event to the server app with
 *   the ID of the remote peer; find the corresponding CONN_REPLY TX and
 *   "release" it (the TX is also used to know the context of the conn_reply,
 *   i.e., accept or reject),
 * - if the connection is rejected, return an event to the app specifying
 *   that no ID has been assigned to the remote peer.
 */
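/*
 * For reference, the reliable connection setup exchange that leads here,
 * summarized from the handlers in this file (a sketch, not a normative
 * diagram):
 *
 *	client                            server
 *	  | --- conn_request ----------->  |  (handshake + optional payload)
 *	  | <------------- conn_reply ---  |  (accept or reject)
 *	  | --- conn_ack --------------->  |  (acks the conn_reply seq in ts)
 *
 * The conn_ack below is therefore matched against the pending conn_reply TX
 * via the ts field.
 */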
static void
sock_handle_conn_ack(sock_conn_t * sconn,
			sock_rx_t * rx,
			uint8_t unused1,
			uint16_t unused2,
			uint32_t peer_id,
			struct sockaddr_in sin)
{
	cci__ep_t *ep = NULL;
	cci__conn_t *conn = NULL;
	sock_ep_t *sep = NULL;
	cci__evt_t *e = NULL;
	cci__evt_t *tmp = NULL;
	sock_tx_t *tx = NULL;
	sock_tx_t *t = NULL;
	sock_header_r_t *hdr_r;	/* wire header */
	cci_endpoint_t *endpoint;	/* generic CCI endpoint */
	uint32_t seq;
	uint32_t ts;

	CCI_ENTER;

	UNUSED_PARAM (unused1);
	UNUSED_PARAM (unused2);
	UNUSED_PARAM (sin);

	if (sconn == NULL) {
		/* Connection was rejected */
	} else {
		/* Connection was accepted */
		conn = sconn->conn;
		endpoint = (&conn->connection)->endpoint;
		ep = container_of(endpoint, cci__ep_t, endpoint);
		sep = ep->priv;

		/* we check whether the connection ack matches the id
		   associated with the connection */
		assert(peer_id == sconn->id);

		hdr_r = rx->buffer;
		sock_parse_seq_ts(&hdr_r->seq_ts, &seq, &ts);

		debug(CCI_DB_CONN, "%s: seq %u acking conn_reply %u",
			__func__, seq, ts);

		pthread_mutex_lock(&ep->lock);
		TAILQ_FOREACH_SAFE(e, &sep->pending, entry, tmp) {
			/* the conn_ack stores the ack for the conn_reply
			   in ts */
			t = container_of (e, sock_tx_t, evt);
			if (t->seq == ts) {
				TAILQ_REMOVE(&sep->pending, e, entry);
				tx = t;
				debug(CCI_DB_CONN, "%s: found conn_reply",
					__func__);
				break;
			}
		}
		pthread_mutex_unlock(&ep->lock);

		if (!tx) {
			/* FIXME do what here? */
			/* if no tx, then it timed out or this is a duplicate,
			 * but we have a sconn */
			debug((CCI_DB_MSG | CCI_DB_CONN),
				"%s: received conn_ack and no matching tx "
				"(seq %u ack %u)", __func__, seq, ts);
		} else {
			pthread_mutex_lock(&ep->lock);
			if (tx->evt.event.accept.connection) {
				debug(CCI_DB_CONN,
					"%s: Generate the connect accept event",
					__func__);
				TAILQ_INSERT_TAIL(&ep->evts, &tx->evt, entry);
				/* waking up the app thread if it is blocking
				   on an OS handle; do not return here, we
				   still hold ep->lock */
				if (sep->event_fd) {
					int rc;

					rc = write (sep->fd[1], "a", 1);
					if (rc != 1)
						debug (CCI_DB_WARN,
							"%s: Write failed",
							__func__);
				}
			} else {
				TAILQ_INSERT_HEAD(&sep->idle_txs, tx, dentry);
			}
			pthread_mutex_unlock(&ep->lock);
		}

		pthread_mutex_lock(&ep->lock);
		TAILQ_INSERT_HEAD(&sep->idle_rxs, rx, entry);
		pthread_mutex_unlock(&ep->lock);

		pthread_mutex_lock(&sep->progress_mutex);
		pthread_cond_signal(&sep->wait_condition);
		pthread_mutex_unlock(&sep->progress_mutex);
	}

	CCI_EXIT;
	return;
}
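/*
 * sock_handle_rma_read_request() services a peer's RMA read: it validates
 * the remote handle, then sends an RMA_READ_REPLY carrying the requested
 * bytes. The reply doubles as the ACK of the request (the request seq is
 * piggybacked in pb_ack), so RMA reads generate no separate ACK traffic.
 */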
static int
sock_handle_rma_read_request(sock_conn_t * sconn, sock_rx_t * rx,
				uint16_t len, uint32_t id)
{
	cci__ep_t *ep = NULL;
	cci__conn_t *conn = sconn->conn;
	sock_rma_header_t *read = rx->buffer;
	cci_connection_t *connection = NULL;
	sock_ep_t *sep = NULL;
	uint64_t local_handle;
	uint64_t local_offset;
	uint64_t remote_handle;
	uint64_t remote_offset;
	uint32_t seq, ts = 0;
	int ret = CCI_SUCCESS;
	sock_rma_header_t *rma_hdr;
	sock_rma_handle_t *remote, *h;
	sock_header_r_t *hdr_r;
	sock_tx_t *tx = NULL;

	hdr_r = (sock_header_r_t *) rx->buffer;
	sock_parse_seq_ts(&hdr_r->seq_ts, &seq, &ts);

	connection = &conn->connection;
	ep = container_of(connection->endpoint, cci__ep_t, endpoint);
	sep = ep->priv;

	/* Get a TX buffer */
	tx = sock_get_tx (ep);
	if (tx == NULL) {
		send_nack (sconn, sep, seq, ts);
		goto out;
	}

	if (hdr_r->pb_ack != 0) {
		sock_handle_ack (sconn, SOCK_MSG_RMA_READ_REQUEST, rx, 1, id);
	}

	/* Parse the RMA read request message */
	sock_parse_rma_handle_offset(&read->local, &local_handle,
					&local_offset);
	sock_parse_rma_handle_offset(&read->remote, &remote_handle,
					&remote_offset);
	remote = (sock_rma_handle_t *) (uintptr_t) remote_handle;
#if CCI_DEBUG
	assert (remote);
#endif

	pthread_mutex_lock(&ep->lock);
	TAILQ_FOREACH(h, &sep->handles, entry) {
		if (h == remote) {
			break;
		}
	}
	pthread_mutex_unlock(&ep->lock);

	if (h != remote) {
		/* remote is no longer valid, send CCI_ERR_RMA_HANDLE */
		ret = CCI_ERR_RMA_HANDLE;
		debug(CCI_DB_WARN, "%s: remote handle not valid", __func__);
		goto out;
	} else if (remote_offset > remote->length) {
		/* offset exceeds remote handle's range, send nak */
		ret = CCI_ERR_RMA_HANDLE;
		debug(CCI_DB_WARN,
			"%s: remote offset not valid (start: %p, offset: %"PRIu64", "
			"length: %"PRIu64")",
			__func__, remote->start, remote_offset, remote->length);
		goto out;
	} else if ((remote_offset + len) > remote->length) {
		/* length exceeds remote handle's range, send nak */
		ret = CCI_ERR_RMA_HANDLE;
		debug(CCI_DB_WARN,
			"%s: remote length not valid (remote offset: %"PRIu64", "
			"len: %d, length: %"PRIu64")",
			__func__, remote_offset, len, remote->length);
		goto out;
	}

	/* Prepare the TX buffer */
	tx->seq = 0;
	tx->msg_type = SOCK_MSG_RMA_READ_REPLY;
	tx->flags = CCI_FLAG_SILENT;
	tx->state = SOCK_TX_QUEUED;
	tx->len = sizeof(sock_rma_header_t);
	tx->rma_op = NULL;
	tx->rma_ptr
		= (void*)((uintptr_t)remote->start + (uintptr_t) remote_offset);
	tx->rma_len = len;
	tx->evt.event.type = CCI_EVENT_SEND;
	tx->evt.event.send.status = CCI_SUCCESS;	/* for now */
	tx->evt.event.send.context = NULL;
	tx->evt.event.send.connection = &conn->connection;
	tx->evt.conn = conn;

	rma_hdr = (sock_rma_header_t*)tx->buffer;
	sock_pack_rma_read_reply(rma_hdr, (uint16_t)len, sconn->peer_id,
				tx->seq, 0,
				local_handle, local_offset,
				remote_handle, remote_offset);
	debug (CCI_DB_MSG,
		"%s: Copying %d bytes in RMA_READ_REPLY msg",
		__func__, len);
	memcpy(rma_hdr->data, tx->rma_ptr, len);

	/* We piggyback the seq of the initial READ REQUEST so it can act as
	   an ACK */
	hdr_r = (sock_header_r_t*) tx->buffer;
	hdr_r->pb_ack = seq;

	/* Send the message: we try to send the RMA_READ_REPLY directly, like
	   an ACK */
	debug (CCI_DB_MSG,
		"%s: Send RMA_READ_REPLY, response to RMA_READ_REQUEST seq %u"
		" with %u bytes",
		__func__, seq, tx->rma_len);
	sock_sendto(sep->sock, tx->buffer, tx->len, tx->rma_ptr,
			tx->rma_len, sconn->sin);

	/* Since RMA_READ_REPLYs act like an ACK, we return the buffer
	   right away. No need to generate a SEND event, this is only a
	   fragment of the RMA READ operation */
	pthread_mutex_lock (&ep->lock);
	TAILQ_INSERT_TAIL(&sep->idle_txs, tx, dentry);
	pthread_mutex_unlock (&ep->lock);

out:
	if (ret != CCI_SUCCESS && tx != NULL) {
		/* We failed before sending the reply; return the TX to the
		   idle list so it is not leaked */
		pthread_mutex_lock (&ep->lock);
		TAILQ_INSERT_TAIL(&sep->idle_txs, tx, dentry);
		pthread_mutex_unlock (&ep->lock);
	}

	pthread_mutex_lock(&ep->lock);
	TAILQ_INSERT_HEAD(&sep->idle_rxs, rx, entry);
	pthread_mutex_unlock(&ep->lock);

	pthread_mutex_lock(&sep->progress_mutex);
	pthread_cond_signal(&sep->wait_condition);
	pthread_mutex_unlock(&sep->progress_mutex);

	return (ret);
}
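/*
 * sock_handle_rma_write() lands a peer-initiated RMA write. Only the RMA
 * header was peeked so far; once the remote handle/offset/length checks
 * pass, the full datagram is re-read with a two-element iovec so the
 * payload is written directly at remote->start + remote_offset (the same
 * scatter-receive technique sketched before sock_handle_rma_read_reply()
 * above).
 */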
static void
sock_handle_rma_write(sock_conn_t * sconn, sock_rx_t * rx, uint16_t len)
{
	cci__ep_t *ep = NULL;
	cci__conn_t *conn = sconn->conn;
	sock_ep_t *sep = NULL;
	uint64_t local_handle;
	uint64_t local_offset;
	uint64_t remote_handle;	/* our handle */
	uint64_t remote_offset;	/* our offset */
	sock_rma_handle_t *remote, *h;
	struct sockaddr_in sin;
	struct msghdr msg;
	struct iovec iov[2];
	sock_rma_header_t *rma_header;
#if CCI_DEBUG
	int ret;
#endif

	ep = container_of(conn->connection.endpoint, cci__ep_t, endpoint);
	sep = ep->priv;

	/* The header is already in the RX */
	rma_header = rx->buffer;
	sock_parse_rma_handle_offset(&(rma_header->local),
					&local_handle,
					&local_offset);
	sock_parse_rma_handle_offset(&(rma_header->remote),
					&remote_handle,
					&remote_offset);
	remote = (sock_rma_handle_t *) (uintptr_t) remote_handle;
#if CCI_DEBUG
	assert (remote);
	assert (len);
#endif

	pthread_mutex_lock(&ep->lock);
	TAILQ_FOREACH(h, &sep->handles, entry) {
		if (h == remote) {
			break;
		}
	}
	pthread_mutex_unlock(&ep->lock);

	if (h != remote) {
		/* remote is no longer valid, send nack */
		debug(CCI_DB_MSG, "%s: remote handle not valid", __func__);
		/* TODO
		   Note: we have already handled the seq for this rx
		   and we may have acked it. If it was the last
		   piece, then we lost the race. We should defer
		   the ack until we deliver the data. */
		goto out;
	}
#if CCI_DEBUG
	assert (h->start);
	assert (len);
#endif
	if (remote_offset > remote->length) {
		/* offset exceeds remote handle's range, send nak */
		debug(CCI_DB_MSG,
			"%s: remote offset not valid (start: %p, offset: %"PRIu64", "
			"length: %"PRIu64")", __func__, remote->start,
			remote_offset, remote->length);
		/* TODO
		   Note: we have already handled the seq for this rx
		   and we may have acked it. If it was the last
		   piece, then we lost the race. We should defer
		   the ack until we deliver the data. */
		goto out;
	} else if (remote_offset + len > remote->length) {
		/* length exceeds remote handle's range, send nak */
		debug(CCI_DB_MSG, "%s: remote length not valid", __func__);
		/* TODO
		   Note: we have already handled the seq for this rx
		   and we may have acked it. If it was the last
		   piece, then we lost the race. We should defer
		   the ack until we deliver the data. */
		goto out;
	}

	/* valid remote handle, copy the data */
	debug_ep (ep, CCI_DB_INFO,
		"%s: copying data into target buffer -- start: %p, "
		"offset: %"PRIu64", len: %d",
		__func__, h->start, remote_offset, len);

	/* We receive the entire message using an IOVEC: the first elt of the
	   IOVEC is the header and the second one the actual data */
	memset (&msg, 0, sizeof (msg));
	msg.msg_name = (void*)&sin;
	msg.msg_namelen = sizeof(sin);
	iov[0].iov_len = sizeof (sock_rma_header_t);
	iov[0].iov_base = rx->buffer;
	iov[1].iov_len = len;
	iov[1].iov_base = (void*)((uintptr_t)h->start
				+ (uintptr_t)remote_offset);
	msg.msg_iov = iov;
	msg.msg_iovlen = 2;
#if CCI_DEBUG
	ret = recvmsg (sep->sock, &msg, 0);
	debug (CCI_DB_EP, "%s: We now have %d/%lu bytes",
		__func__, ret, sizeof (sock_rma_header_t) + len);
	assert ((unsigned int)ret == (sizeof (sock_rma_header_t) + len));
#else
	recvmsg (sep->sock, &msg, 0);
#endif

out:
	/* We force the ACK */
	pthread_mutex_lock(&ep->lock);
	sock_ack_sconn (sep, sconn);
	TAILQ_INSERT_HEAD(&sep->idle_rxs, rx, entry);
	pthread_mutex_unlock(&ep->lock);
	return;
}
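/*
 * sock_handle_rma_write_done() handles the completion message that follows
 * the last RMA write fragment. As the receive below sizes it, the datagram
 * carries the RMA header, then a uint32_t completion-message length, then
 * that many payload bytes; the payload is handed to the application as a
 * CCI_EVENT_RECV on the target side.
 */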
static void
sock_handle_rma_write_done(sock_conn_t * sconn,
			sock_rx_t * rx,
			uint16_t len,
			uint32_t id)
{
	cci__evt_t *evt;
	cci__conn_t *conn = sconn->conn;
	union cci_event *event;	/* generic CCI event */
	cci_endpoint_t *endpoint;	/* generic CCI endpoint */
	cci__ep_t *ep;
	sock_ep_t *sep = NULL;
	/* Length of the completion msg */
	uint32_t *msg_len;
	/* Completion msg */
	void *ptr;
	sock_header_r_t *hdr_r = rx->buffer;
	uint32_t total_len;
#if CCI_DEBUG
	int ret;
#endif

#if 0
	if (hdr_r->pb_ack != 0) {
		sock_handle_ack (sconn, SOCK_MSG_RMA_WRITE_DONE, rx, 1, id);
	}
#endif

	endpoint = (&conn->connection)->endpoint;
	ep = container_of(endpoint, cci__ep_t, endpoint);
	sep = ep->priv;

	/* First we get the length of the completion message */
	msg_len = (uint32_t*)hdr_r->data;
	debug_ep (ep, CCI_DB_EP,
		"%s: msg len is %u\n", __func__, *msg_len);
	total_len = sizeof (sock_rma_header_t) + sizeof(uint32_t) + *msg_len;
#if CCI_DEBUG
	ret = sock_recv_msg (sep->sock, rx->buffer, total_len, 0, NULL);
	debug (CCI_DB_EP, "We now have %d/%d bytes\n", ret, total_len);
	assert ((unsigned int)ret == total_len);
#else
	sock_recv_msg (sep->sock, rx->buffer, total_len, 0, NULL);
#endif

	/* get cci__evt_t to hang on ep->events */
	evt = &rx->evt;

	/* setup the generic event for the application */
	event = &evt->event;
	event->type = CCI_EVENT_RECV;
	event->recv.len = *msg_len;
	ptr = hdr_r->data + sizeof (uint32_t);
	*((void **)&event->recv.ptr) = ptr;
	event->recv.connection = &conn->connection;

	/* queue event on endpoint's completed event queue */
	sock_queue_event (ep, evt);

	/* waking up the app thread if it is blocking on an OS handle */
	if (sep->event_fd) {
		int rc;

		rc = write (sep->fd[1], "a", 1);
		if (rc != 1)
			debug (CCI_DB_WARN, "%s: Write failed", __func__);
	}
}
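/*
 * sock_recvfrom_ep() is the per-datagram receive path. The general strategy
 * is peek-then-drain: we MSG_PEEK just enough bytes to learn the message
 * type and size, then issue a second receive for exactly the bytes the
 * handler needs (possibly scattered into an RMA target). If no RX buffer is
 * available, reliable connections fall into RNR mode and the datagram may
 * be dropped and NACKed instead.
 *
 * A minimal sketch of the peek-then-drain idiom on a UDP socket, assuming a
 * hypothetical fixed-size header type hdr_t and helper payload_len_from():
 *
 *	hdr_t hdr;
 *	recv(sock, &hdr, sizeof(hdr), MSG_PEEK);  // datagram stays queued
 *	size_t total = sizeof(hdr) + payload_len_from(&hdr);
 *	recv(sock, buf, total, 0);                // now consume it for real
 */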
static int sock_recvfrom_ep(cci__ep_t * ep)
{
	int ret = 0, drop_msg = 0, q_rx = 0, reply = 0, request = 0, again = 0;
	int ka = 0;
	size_t recv_len = 0;
	uint8_t a;
	uint16_t b;
	uint32_t id;
	sock_rx_t *rx = NULL;
	struct sockaddr_in sin;
	socklen_t sin_len = sizeof(sin);
	sock_conn_t *sconn = NULL;
	cci__conn_t *conn = NULL;
	sock_ep_t *sep;
	sock_msg_type_t type;
	uint32_t seq = 0;
	uint32_t ts = 0;

	CCI_ENTER;

	sep = ep->priv;
	if (!sep)
		return 0;

	pthread_mutex_lock(&ep->lock);
	if (!TAILQ_EMPTY(&sep->idle_rxs)) {
		rx = TAILQ_FIRST(&sep->idle_rxs);
		TAILQ_REMOVE(&sep->idle_rxs, rx, entry);
	}
	pthread_mutex_unlock(&ep->lock);

	/* If we run out of RXs, we fall down to a special case: we have to
	   use a special buffer to receive the message, then parse it.
	   Ultimately, we need the TS and the SEQ (so we can send the RNR
	   msg), as well as the entire header so we can know if we are in the
	   context of a reliable connection (otherwise RNR does not apply). */
#if DEBUG_RNR
	if (conn_established) {
		/* We simulate a case where we are not ready to receive 25%
		   of the time */
		int n = (int)(4.0 * rand() / (RAND_MAX + 1.0));
		if (n == 0) {
			fprintf(stderr, "Simulating lack of RX buffer...\n");
			rx = NULL;
		}
	}
#endif

	/*
	 * Two cases here:
	 * 1) Normal execution, we can get a RX buffer: then we read the msg
	 *    and just handle the message.
	 * 2) We are out of RX buffers; two cases again:
	 *    a. The connection is reliable and in this case we fall into a
	 *       RNR mode, which may lead to dropping the message and/or
	 *       creating an extra RX buffer. See the semantic of RNR for
	 *       more details.
	 *    b. The connection is unreliable, we just drop the message.
	 */
	if (!rx) {
		char tmp_buff[SOCK_UDP_MAX];
		sock_header_t *hdr = NULL;

		debug(CCI_DB_INFO,
			"%s: no rx buffers available on endpoint %d",
			__func__, sep->sock);

		/* We do the receive using a temporary buffer so we can get
		   enough data to send a RNR NACK */
		ret = recvfrom(sep->sock, (void *)tmp_buff, SOCK_UDP_MAX,
				0, (struct sockaddr *)&sin, &sin_len);
		if (ret == -1) {
			debug (CCI_DB_INFO,
				"%s: No RX buffer + cannot recv data: %s",
				__func__, strerror (errno));
			CCI_EXIT;
			return 0;
		}
		if (ret < (int)sizeof(sock_header_t)) {
			debug(CCI_DB_INFO,
				"%s: Not enough data (%d/%d) to get the header",
				__func__, ret, (int)sizeof(sock_header_t));
			CCI_EXIT;
			return 0;
		}

		/* Now we get the header and parse it so we can know if we are
		   in the context of a reliable connection */
		hdr = (sock_header_t *) tmp_buff;
		sock_parse_header(hdr, &type, &a, &b, &id);
		sconn = sock_find_conn(sep, sin.sin_addr.s_addr, sin.sin_port,
					id, type);
		if (sconn == NULL) {
			/* If the connection is not already established, we
			   just drop the message */
			debug(CCI_DB_INFO,
				"%s: Connection not established, dropping msg",
				__func__);
			CCI_EXIT;
			return 0;
		}
		conn = sconn->conn;

		/* If this is a reliable connection, we typically fall into a
		   RNR mode */
		if (cci_conn_is_reliable(conn)) {
			sock_header_r_t *header_r = NULL;

			/* We do the receive using a temporary buffer so we
			   can get enough data to send a RNR NACK */

			/* From the buffer, we get the TS and SEQ from the
			   header (this is the only thing we need to deal
			   with RNR); they will be used later on */
			header_r = (sock_header_r_t *) tmp_buff;
			sock_parse_seq_ts(&header_r->seq_ts, &seq, &ts);
			ret = update_rnr_mode (sconn, seq);
			if (ret == CCI_SOCK_RESUME_RNR) {
				/* In case we receive the message we were
				   waiting for to resume normal execution,
				   we make sure we have a proper RX buffer and
				   move on. This new buffer will be added to
				   the list of available RX buffers later on */
				rx = alloc_rx_buffer (ep);
				if (rx == NULL) {
					drop_msg = 1;
					goto out;
				}
				memcpy (rx->buffer, tmp_buff, ep->buffer_len);
			} else {
				/* Otherwise we drop the msg */
				drop_msg = 1;
				goto out;
			}
		} else {
			/* If the connection is unreliable, we simply exit */
			CCI_EXIT;
			return 0;
		}
	} else {
		ret = sock_recv_msg (sep->sock,
					rx->buffer,
					sizeof(sock_header_t),
					MSG_PEEK,
					&sin);
		if (ret < 0 || ret < (int)sizeof(sock_header_t)) {
			q_rx = 1;
			goto out;
		}
		recv_len = ret;
#if CCI_DEBUG
		assert (recv_len == sizeof (sock_header_t));
#endif
		/* Getting here means we are in a normal execution code path,
		   so we assume that if we received a message successfully,
		   another one may already be available right away, and it is
		   possible to try to receive it. */
		again = 1;
	}

	/* From here, we know we have the message in a valid RX buffer so we
	   can parse it and handle the data */

	/* lookup connection from sin and id */
	sock_parse_header(rx->buffer, &type, &a, &b, &id);
	if (SOCK_MSG_CONN_REPLY == type) {
		reply = 1;
	} else if (SOCK_MSG_CONN_REQUEST == type) {
		request = 1;
		rx->sin = sin;
	}
	if (SOCK_MSG_KEEPALIVE == type)
		ka = 1;

	if (!request) {
		sconn = sock_find_conn(sep, sin.sin_addr.s_addr, sin.sin_port,
					id, type);
	}

#if CCI_DEBUG
	{
		char name[32];

		if (CCI_DB_MSG & cci__debug) {
			memset(name, 0, sizeof(name));
			sock_sin_to_name(sin, name, sizeof(name));
			/* Note that in the context of RMA_READ_REQUEST
			   messages the length of the message is actually the
			   size of the data to send back; for CONN_REPLY, a
			   specifies whether the connection is accepted or
			   rejected and b should be equal to 0 (so the size in
			   the debug msg is not relevant) */
			debug_ep(ep, (CCI_DB_MSG),
				"%s: recv'd %s msg from %s with %d bytes",
				__func__, sock_msg_type(type), name, a + b);
		}
	}
#endif /* CCI_DEBUG */

	/* if no conn, drop msg, requeue rx */
	if (!ka && !sconn && !reply && !request) {
		debug((CCI_DB_CONN | CCI_DB_MSG),
			"%s: no sconn for incoming %s msg from %s:%d",
			__func__,
			sock_msg_type(type), inet_ntoa(sin.sin_addr),
			ntohs(sin.sin_port));
		/* If we do not have a connection and the message type is a
		   CONN_ACK, this is most certainly the ack in the context of
		   a conn_reject */
		if (SOCK_MSG_CONN_ACK == type) {
			uint32_t total_size = sizeof (sock_header_r_t);

			recv_len = sock_recv_msg (sep->sock, rx->buffer,
						total_size, 0, NULL);
			debug (CCI_DB_EP, "%s: We now have %u/%u bytes",
				__func__, (unsigned int)recv_len, total_size);
#if CCI_DEBUG
			assert (recv_len == total_size);
#endif
			/* If we get a conn_ack but the sconn is NULL, this is
			   an ack in the context of a conn_reject. We can
			   safely call sock_handle_conn_ack(); it does not
			   touch the rx, which is explicitly returned below
			   via the q_rx path (returning it here as well would
			   queue it twice) */
			sock_handle_conn_ack(NULL, rx, a, b, id, sin);
		}
		q_rx = 1;
		goto out;
	}

	/* Some actions specific to reliable connections */
	if (sconn && cci_conn_is_reliable(sconn->conn)) {
		sock_header_r_t *hdr_r;

		/* Make sure we receive the entire reliable header */
		if (recv_len < sizeof (sock_header_r_t)) {
			recv_len = sock_recv_msg (sep->sock,
						rx->buffer,
						sizeof (sock_header_r_t),
						MSG_PEEK,
						NULL);
#if CCI_DEBUG
			assert (recv_len == sizeof (sock_header_r_t));
#endif
		}
		hdr_r = rx->buffer;
		assert (recv_len >= sizeof (sock_header_t));
		sock_parse_seq_ts(&hdr_r->seq_ts, &seq, &ts);

		/* For a reliable/ordered connection, we make sure we receive
		   the expected next seq */
		if (sconn->conn->connection.attribute == CCI_CONN_ATTR_RO) {
			if (sconn->last_recvd_seq == 0)
				sconn->last_recvd_seq = seq;
			if (seq > sconn->last_recvd_seq + 1) {
				ret = send_nack (sconn,
						sep,
						seq,
						ts);
				goto out;
			}
		}

		if (type != SOCK_MSG_CONN_REPLY) {
			/* We do not want to implicitly ack RMA_READ_REQUEST
			   and RMA_READ_REPLY messages:
			   - RMA_READ_REQUEST are acked with the corresponding
			     RMA_READ_REPLY message,
			   - RMA_READ_REPLY messages are not acked since they
			     act as an ACK (no acks of acks),
			   - SOCK_MSG_RNR are not acked since they act as a
			     NACK */
			if (type != SOCK_MSG_RMA_READ_REQUEST
			    && type != SOCK_MSG_RMA_READ_REPLY
			    && type != SOCK_MSG_NACK
			    && type != SOCK_MSG_RNR)
			{
				sock_handle_seq(sconn, seq);
			}
			if (hdr_r->pb_ack != 0) {
				sock_handle_ack (sconn, type, rx, 1, id);
				/* Reset the value of pb_ack to make sure we
				   won't try to do it again */
				hdr_r->pb_ack = 0;
			}
		}
	}

	switch (type) {
	case SOCK_MSG_CONN_REQUEST: {
		uint32_t total_size = sizeof (sock_header_r_t)
			+ sizeof (sock_handshake_t) + b;
		recv_len = sock_recv_msg (sep->sock, rx->buffer,
					total_size, 0, NULL);
		debug (CCI_DB_EP,
			"%s: We now have %u/%u bytes",
			__func__,
			(unsigned int)recv_len, total_size);
#if CCI_DEBUG
		assert (recv_len == total_size);
#endif
		sock_handle_conn_request(rx, a, b, sin, ep);
		break;
	}
	case SOCK_MSG_CONN_REPLY: {
		/* We first get the header and only the header to know if we
		   are in the context of a connect accept or reject */
		uint32_t total_size = sizeof (sock_header_r_t);
		recv_len = sock_recv_msg (sep->sock, rx->buffer,
					total_size, MSG_PEEK, NULL);
#if CCI_DEBUG
		assert (recv_len == total_size);
#endif
		sock_handle_conn_reply(sconn, rx, a, b, id, sin, ep);
		break;
	}
	case SOCK_MSG_CONN_ACK: {
		uint32_t total_size = sizeof (sock_header_r_t);
		recv_len = sock_recv_msg (sep->sock, rx->buffer,
					total_size, 0, NULL);
		debug (CCI_DB_EP, "%s: We now have %u/%u bytes",
			__func__, (unsigned int)recv_len, total_size);
#if CCI_DEBUG
		assert (recv_len == total_size);
#endif
		sock_handle_conn_ack(sconn, rx, a, b, id, sin);
		break;
	}
	case SOCK_MSG_DISCONNECT:
		break;
	case SOCK_MSG_SEND: {
		uint16_t total_size = b;
		if (cci_conn_is_reliable(sconn->conn)) {
			total_size += sizeof (sock_header_r_t);
		} else {
			total_size += sizeof (sock_header_t);
		}
		/* Make sure we have the entire msg */
		recv_len = sock_recv_msg (sep->sock,
					rx->buffer,
					total_size,
					0,
					NULL);
		debug (CCI_DB_EP, "%s: We now have %u/%u bytes",
			__func__, (unsigned int)recv_len, total_size);
#if CCI_DEBUG
		assert (recv_len == total_size);
#endif
		sock_handle_active_message(sconn, rx, b, id);
		break;
	}
	case SOCK_MSG_RNR: {
		sock_header_r_t *hdr_r = rx->buffer;

		debug (CCI_DB_INFO,
			"%s: Receiver not ready", __func__);
		sock_parse_seq_ts(&hdr_r->seq_ts, &seq, &ts);
		sock_handle_rnr(sconn, seq, ts);
		/* No event is directly generated from the msg
		   so we can reuse the RX buffer */
		q_rx = 1;
		break;
	}
	case SOCK_MSG_KEEPALIVE:
		/* Nothing to do? */
		break;
	case SOCK_MSG_ACK_ONLY:
	case SOCK_MSG_ACK_UP_TO:
	case SOCK_MSG_SACK: {
		uint32_t total_size = sizeof (sock_header_r_t)
			+ a * sizeof (uint32_t);
		recv_len = sock_recv_msg (sep->sock, rx->buffer,
					total_size, 0, NULL);
		debug (CCI_DB_EP, "%s: We now have %u/%u bytes",
			__func__, (unsigned int)recv_len, total_size);
#if CCI_DEBUG
		assert (recv_len == total_size);
#endif
		sock_handle_ack(sconn, type, rx, (uint32_t)a, id);
		/* sock_handle_ack already requeues the RXs in the idle
		   list */
		break;
	}
	case SOCK_MSG_NACK: {
		uint32_t total_size = sizeof (sock_header_r_t);
		/* We just need the data from the header */
		recv_len = sock_recv_msg (sep->sock, rx->buffer,
					total_size, 0, NULL);
		debug (CCI_DB_EP, "%s: We now have %u/%u bytes",
			__func__, (unsigned int)recv_len, total_size);
#if CCI_DEBUG
		assert (recv_len == total_size);
#endif
		sock_handle_nack (sconn, ep, sep, seq);
		q_rx = 1;
		break;
	}
	case SOCK_MSG_RMA_WRITE: {
		/* At first we just need to make sure we have the header */
		recv_len = sock_recv_msg (sep->sock,
					rx->buffer,
					sizeof (sock_rma_header_t),
					MSG_PEEK,
					NULL);
#if CCI_DEBUG
		assert (recv_len == sizeof (sock_rma_header_t));
#endif
		sock_handle_rma_write(sconn, rx, b);
		break;
	}
	case SOCK_MSG_RMA_WRITE_DONE: {
		/* At first we just need to make sure we have the header
		   and the length of the completion message */
		uint32_t total_size = sizeof (sock_rma_header_t)
			+ sizeof (uint32_t);
		recv_len = sock_recv_msg (sep->sock,
					rx->buffer,
					total_size,
					MSG_PEEK,
					NULL);
#if CCI_DEBUG
		assert (recv_len == total_size);
#endif
		sock_handle_rma_write_done(sconn, rx, b, id);
		break;
	}
	case SOCK_MSG_RMA_READ_REQUEST: {
		uint32_t total_size = sizeof (sock_rma_header_t);
		recv_len = sock_recv_msg (sep->sock,
					rx->buffer,
					total_size,
					0,
					NULL);
		debug (CCI_DB_EP, "%s: We now have %u/%u bytes",
			__func__, (unsigned int)recv_len, total_size);
#if CCI_DEBUG
		assert (recv_len == total_size);
#endif
		sock_handle_rma_read_request(sconn, rx, b, id);
		break;
	}
	case SOCK_MSG_RMA_READ_REPLY: {
		/* At first we just need to make sure we have the header */
		recv_len = sock_recv_msg (sep->sock,
					rx->buffer,
					sizeof (sock_rma_header_t),
					MSG_PEEK,
					NULL);
#if CCI_DEBUG
		assert (recv_len == sizeof (sock_rma_header_t));
#endif
		sock_handle_rma_read_reply(sconn, rx, b, id);
		break;
	}
	default:
		debug(CCI_DB_MSG, "%s: unknown active message with type %u",
			__func__, (enum sock_msg_type)type);
	}

out:
	if (q_rx) {
		pthread_mutex_lock(&ep->lock);
		TAILQ_INSERT_HEAD(&sep->idle_rxs, rx, entry);
		pthread_mutex_unlock(&ep->lock);
	}

	if (drop_msg) {
		/* If we have no connection, we can be in the context of
		   a connection reject */
		if (sconn && cci_conn_is_reliable(sconn->conn)
		    && sconn->rnr == seq)
		{
			char buffer[SOCK_MAX_HDR_SIZE];
			int len = 0;
			sock_header_r_t *hdr_r = NULL;

			/*
			   Getting here, we are in the new RNR context on the
			   receiver side. Note that we already got the TS and
			   SEQ from the message header
			 */
			debug (CCI_DB_INFO, "%s: Sending RNR msg (%u)",
				__func__, sconn->rnr);

			/* Send a RNR NACK back to the sender */
			memset(buffer, 0, sizeof(buffer));
			hdr_r = (sock_header_r_t *) buffer;
			sock_pack_nack(hdr_r, SOCK_MSG_RNR, sconn->peer_id,
					seq, ts, 0);
			hdr_r->pb_ack = 0;
			len = sizeof(*hdr_r);
			ret = sock_sendto(sep->sock, buffer, len, NULL, 0,
					sconn->sin);
			if (ret == -1)
				debug (CCI_DB_INFO, "%s: Cannot send RNR",
					__func__);
		}

		/* Drop the message */
		sock_drop_msg(sep->sock);
	} else {
		if (sconn && sconn->conn &&
		    sconn->conn->connection.attribute == CCI_CONN_ATTR_RO)
			sconn->last_recvd_seq = seq;
	}

	CCI_EXIT;
	return again;
}
/*
 * Check whether a keepalive timeout expired for a given endpoint.
 */
static void sock_keepalive(cci__ep_t *ep)
{
	cci__conn_t *conn;
	uint64_t now = 0ULL;
	uint32_t ka_timeout;
	uint8_t i;
	struct s_conns *conn_list;
	sock_conn_t *sconn = NULL;
	sock_dev_t *sdev;
	cci__dev_t *dev;
	sock_ep_t *sep = NULL;

	CCI_ENTER;

	now = sock_get_usecs();
	dev = ep->dev;
	sdev = dev->priv;
	sep = ep->priv;
	i = sock_ip_hash(sdev->ip, sdev->port);
	conn_list = &sep->conn_hash[i];
	TAILQ_FOREACH(sconn, conn_list, entry) {
		conn = sconn->conn;
		if (conn->keepalive_timeout == 0ULL)
			return;

		/* The keepalive is assumed to expire if we did not hear
		   anything from the peer since the last receive + keepalive
		   timeout, i.e., once now exceeds that deadline. */
		ka_timeout = sconn->ts + conn->keepalive_timeout;
		if (SOCK_U64_LT(ka_timeout, now)) {
			int len;
			char buffer[SOCK_MAX_HDR_SIZE];
			sock_header_t *hdr = NULL;
			cci_event_keepalive_timedout_t *event = NULL;
			cci__evt_t *evt = NULL;
			cci__ep_t *kep = NULL;
			sock_ep_t *ksep = NULL;

			kep = container_of(conn->connection.endpoint,
						cci__ep_t, endpoint);
			ksep = kep->priv;

			/*
			 * We generate a keepalive event. The event is heap
			 * allocated here since it is not backed by an RX or
			 * TX buffer; this assumes the event-return path
			 * tolerates such events.
			 */
			evt = calloc(1, sizeof(*evt));
			if (evt == NULL)
				return;
			evt->ep = kep;
			event = (cci_event_keepalive_timedout_t *)
				&evt->event;
			event->type = CCI_EVENT_KEEPALIVE_TIMEDOUT;
			event->connection = &conn->connection;
			sock_queue_event (evt->ep, evt);

			/* waking up the app thread if it is blocking on an
			   OS handle */
			if (ksep->event_fd) {
				int rc;

				rc = write (ksep->fd[1], "a", 1);
				if (rc != 1) {
					debug (CCI_DB_WARN,
						"%s: Write failed", __func__);
					return;
				}
			}

			/*
			 * Finally we send a heartbeat
			 */
			/* Prepare and send the msg */
			memset(buffer, 0, sizeof(buffer));
			hdr = (sock_header_t *) buffer;
			sock_pack_keepalive(hdr, sconn->peer_id);
			len = sizeof(*hdr);
			sock_sendto(ksep->sock, buffer, len, NULL, 0,
					sconn->sin);
		}
	}

	CCI_EXIT;
	return;
}
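/*
 * sock_ack_sconn() coalesces the pending ACKs of one connection. Depending
 * on what is queued it emits one of three formats (names from this file):
 *   - SOCK_MSG_ACK_ONLY:  a single seq, acks[0] == start == end
 *   - SOCK_MSG_ACK_UP_TO: one contiguous range, acks[0] == end of range
 *   - SOCK_MSG_SACK:      up to SOCK_MAX_SACK (start, end) pairs stored
 *                         flat, i.e., acks[0]=start0, acks[1]=end0, ...
 * ACKs may be delayed (see ACK_TIMEOUT and PENDING_ACK_THRESHOLD) to give
 * piggybacking a chance. Returns the number of ACK values packed, or 0 if
 * acking was delayed or nothing was pending.
 */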
static inline int sock_ack_sconn (sock_ep_t *sep, sock_conn_t *sconn)
{
	uint64_t now = 0ULL;
	int count = 0;

	now = sock_get_usecs();

	if (!TAILQ_EMPTY(&sconn->acks)) {
		sock_header_r_t *hdr_r;
		uint32_t acks[SOCK_MAX_SACK * 2];
		sock_ack_t *ack = NULL;
		sock_msg_type_t type = SOCK_MSG_ACK_UP_TO;
		char buffer[SOCK_MAX_HDR_SIZE];
		int len = 0;
		int ret;

		count = 1;
		memset(buffer, 0, sizeof(buffer));

		if (1 == sock_need_sack(sconn)) {
			/* There is more than one element in the list of
			   pending acks */
			sock_ack_t *tmp;

			type = SOCK_MSG_SACK;
			count = 0;

			/* We first count the number of pending ACKs */
			TAILQ_FOREACH_SAFE(ack, &sconn->acks, entry, tmp) {
				count++;
			}

			/* We check whether we want to ack now or delay
			   acks */
			if (SOCK_U64_LT(now, sconn->last_ack_ts + ACK_TIMEOUT)
			    && count <= PENDING_ACK_THRESHOLD)
			{
				debug (CCI_DB_MSG,
					"%s: Delaying ACK", __func__);
				return 0;
			}

			count = 0;
			TAILQ_FOREACH_SAFE(ack, &sconn->acks, entry, tmp) {
				TAILQ_REMOVE (&sconn->acks, ack, entry);
				acks[count++] = ack->start;
				acks[count++] = ack->end;
				free(ack);
				if (count == SOCK_MAX_SACK * 2)
					break;
			}
			if (acks[0] == sconn->acked + 1) {
				sconn->acked = acks[1];
			}
		} else {
			/* There is only one element in the list of pending
			   acks */
			ack = TAILQ_FIRST(&sconn->acks);
			if (SOCK_U64_LT(now, sconn->last_ack_ts + ACK_TIMEOUT)
			    && (ack->end - ack->start < PENDING_ACK_THRESHOLD))
			{
				debug (CCI_DB_MSG,
					"%s: Delaying ACK", __func__);
				return 0;
			}
			TAILQ_REMOVE(&sconn->acks, ack, entry);
			if (ack->start == sconn->acked)
				sconn->acked = ack->end;
			acks[0] = ack->end;
			/* If we have a single pending ACK, we send a
			   SOCK_MSG_ACK_ONLY ACK, otherwise we send a
			   SOCK_MSG_ACK_UP_TO ACK */
			if (ack->start == ack->end)
				type = SOCK_MSG_ACK_ONLY;
			free(ack);
		}
		hdr_r = (sock_header_r_t *) buffer;
		sock_pack_ack(hdr_r, type, sconn->peer_id, 0, 0, acks, count);

		len = sizeof(*hdr_r) + (count * sizeof(acks[0]));
		ret = sock_sendto(sep->sock, buffer, len, NULL, 0, sconn->sin);
		if (ret == -1)
			debug (CCI_DB_WARN, "%s: ACK send failed", __func__);
		sconn->last_ack_ts = now;
	}
	return count;
}
static void sock_ack_conns(cci__ep_t * ep)
{
	int i;
	int acked = 0;
	sock_ep_t *sep = ep->priv;
	sock_conn_t *sconn = NULL;

	CCI_ENTER;

	pthread_mutex_lock(&ep->lock);
	for (i = 0; i < SOCK_EP_HASH_SIZE; i++) {
		if (!TAILQ_EMPTY(&sep->conn_hash[i])) {
			TAILQ_FOREACH(sconn, &sep->conn_hash[i], entry) {
				acked += sock_ack_sconn (sep, sconn);
			}
		}
	}
	pthread_mutex_unlock(&ep->lock);

	/* Since an ACK was issued, we try to receive more data */
	if (acked > 0)
		sock_recvfrom_ep (ep);

	CCI_EXIT;
	return;
}
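/*
 * Endpoint progress model (a summary of the code below): each endpoint runs
 * two service threads. sock_progress_thread() drives keepalives, timeouts
 * and queued sends, sleeping on wait_condition until someone signals it;
 * sock_recv_thread()/progress_recv() drive the receive side, blocking in
 * select()/epoll_wait() on the endpoint socket. Handlers signal
 * wait_condition whenever new TXs are queued so the send side wakes up.
 */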
static void *sock_progress_thread(void *arg)
{
	cci__ep_t *ep = (cci__ep_t *) arg;
	sock_ep_t *sep;
	int i;
	sock_conn_t *sconn = NULL;

	assert (ep);
	sep = ep->priv;

	pthread_mutex_lock(&ep->lock);
	while (!sep->closing) {
		pthread_mutex_unlock(&ep->lock);

		sock_keepalive (ep);
		sock_progress_sends (ep);

		/* If the endpoint is in the process of closing, we just move
		   on; otherwise, we wait for a signal to wake up and make
		   progress */
		if (!sep->closing) {
			pthread_mutex_lock(&sep->progress_mutex);
			pthread_cond_wait(&sep->wait_condition,
					&sep->progress_mutex);
			pthread_mutex_unlock(&sep->progress_mutex);
		}

		pthread_mutex_lock(&ep->lock);
	}
	pthread_mutex_unlock(&ep->lock);

	/* Because we may have delayed some ACKs for optimization,
	   we drain all pending ACKs before ending the progress thread */
	for (i = 0; i < SOCK_EP_HASH_SIZE; i++) {
		if (!TAILQ_EMPTY(&sep->conn_hash[i])) {
			TAILQ_FOREACH(sconn, &sep->conn_hash[i], entry) {
				/* We trick the timeout value to ensure the
				   ACK will be sent */
				sconn->last_ack_ts
					= sconn->last_ack_ts - 2 * ACK_TIMEOUT;
			}
		}
	}
	sock_ack_conns (ep);

	pthread_exit(NULL);
	return (NULL);	/* make pgcc happy */
}
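/*
 * Note on the epoll path in progress_recv() below: the endpoint socket is
 * registered elsewhere (presumably at endpoint creation) with a callback
 * stored in the epoll user-data field, so the event loop can dispatch
 * without a lookup table. A minimal sketch of such a registration, assuming
 * a callback with the int (*)(cci__ep_t *) signature used below (e.g.,
 * sock_recvfrom_ep):
 *
 *	struct epoll_event ev;
 *	memset(&ev, 0, sizeof(ev));
 *	ev.events = EPOLLIN;
 *	ev.data.ptr = (void *)sock_recvfrom_ep;	// dispatched in progress_recv
 *	epoll_ctl(epfd, EPOLL_CTL_ADD, sock_fd, &ev);
 */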
int progress_recv (cci__ep_t *ep)
{
	sock_ep_t *sep;
	int ret = 0;
	struct timeval tv = { 0, SOCK_PROG_TIME_US };
	fd_set fds;
	int again;

	sep = ep->priv;

	/* Note that on systems without epoll support, sep->event_fd is equal
	   to 0 */
	if (!sep->event_fd) {
		FD_ZERO(&fds);
		FD_SET (sep->sock, &fds);
		ret = select (sep->sock + 1, &fds, NULL, NULL, &tv);
		if (ret == -1) {
			switch (errno) {
			case EBADF:
				debug(CCI_DB_INFO,
					"%s: select() failed with %s",
					__func__, strerror(errno));
				break;
			default:
				break;
			}
			goto wait4signal;
		}
		do {
			again = sock_recvfrom_ep (ep);
		} while (again == 1);
	}
#ifdef HAVE_SYS_EPOLL_H
	else {
		struct epoll_event events[SOCK_EP_NUM_EVTS];

		ret = epoll_wait (sep->event_fd, events, SOCK_EP_NUM_EVTS, 0);
		if (ret > 0) {
			int count = ret;
			int i;

			debug(CCI_DB_EP,
				"%s: epoll_wait() found %d event(s)", __func__,
				count);
			for (i = 0; i < count; i++) {
				int (*func)(cci__ep_t*) = events[i].data.ptr;

				if ((events[i].events & EPOLLIN)) {
					if (func != NULL && ep != NULL) {
						do {
							again = (*func)(ep);
						} while (again == 1);
					}
				}
			}
		} else if (ret == -1) {
			debug(CCI_DB_EP, "%s: epoll_wait() returned %s",
				__func__, strerror(errno));
		}
		/* We need to avoid the case where a message is lost and we
		   do not handle a message timeout because we block */
		pthread_mutex_lock(&ep->lock);
		if (!TAILQ_EMPTY (&sep->queued)
		    || !TAILQ_EMPTY (&sep->pending)) {
			/* If the send queue is not empty, wake up the send
			   thread */
			pthread_mutex_lock(&sep->progress_mutex);
			pthread_cond_signal(&sep->wait_condition);
			pthread_mutex_unlock(&sep->progress_mutex);
		}
		pthread_mutex_unlock(&ep->lock);
	}
#else
	else {
		struct pollfd fds[1];

		fds[0].fd = sep->sock;
		fds[0].events = POLLIN;
		ret = poll (fds, 1, -1);
		if (ret > 0) {
			int i;

			for (i = 0; i < 1; i++) {
				if (fds[i].revents & POLLIN) {
					sock_recvfrom_ep (ep);
				}
			}
		}
	}
#endif /* HAVE_SYS_EPOLL_H */

wait4signal:
	/*
	pthread_mutex_lock(&sep->progress_mutex);
	pthread_cond_signal(&sep->wait_condition);
	pthread_mutex_unlock(&sep->progress_mutex);
	*/
	return CCI_SUCCESS;
}
static void *sock_recv_thread(void *arg)
{
	cci__ep_t *ep = (cci__ep_t *)arg;
	sock_ep_t *sep;

	assert (ep);
	sep = ep->priv;

	while (!sep->closing) {
		progress_recv (ep);
	}

	pthread_exit(NULL);
	return (NULL);	/* make pgcc happy */
}