/drivers/infiniband/sw/siw/siw_main.c

https://github.com/thurday/linux

// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause

/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
/* Copyright (c) 2008-2019, IBM Corporation */

#include <linux/init.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <net/net_namespace.h>
#include <linux/rtnetlink.h>
#include <linux/if_arp.h>
#include <linux/list.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/dma-mapping.h>

#include <net/addrconf.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_user_verbs.h>
#include <rdma/rdma_netlink.h>
#include <linux/kthread.h>

#include "siw.h"
#include "siw_verbs.h"

MODULE_AUTHOR("Bernard Metzler");
MODULE_DESCRIPTION("Software iWARP Driver");
MODULE_LICENSE("Dual BSD/GPL");

/* transmit from user buffer, if possible */
const bool zcopy_tx = true;

/* Restrict usage of GSO, if hardware peer iwarp is unable to process
 * large packets. try_gso = true lets siw try to use local GSO,
 * if peer agrees. Not using GSO severely limits siw maximum tx bandwidth.
 */
const bool try_gso;

/* Attach siw also with loopback devices */
const bool loopback_enabled = true;

/* We try to negotiate CRC on, if true */
const bool mpa_crc_required;

/* MPA CRC on/off enforced */
const bool mpa_crc_strict;

/* Control TCP_NODELAY socket option */
const bool siw_tcp_nagle;

/* Select MPA version to be used during connection setup */
u_char mpa_version = MPA_REVISION_2;

/* Selects MPA P2P mode (additional handshake during connection
 * setup, if true).
 */
const bool peer_to_peer;

struct task_struct *siw_tx_thread[NR_CPUS];
struct crypto_shash *siw_crypto_shash;

static int siw_device_register(struct siw_device *sdev, const char *name)
{
	struct ib_device *base_dev = &sdev->base_dev;
	static int dev_id = 1;
	int rv;

	sdev->vendor_part_id = dev_id++;

	rv = ib_register_device(base_dev, name, NULL);
	if (rv) {
		pr_warn("siw: device registration error %d\n", rv);
		return rv;
	}

	siw_dbg(base_dev, "HWaddr=%pM\n", sdev->netdev->dev_addr);
	return 0;
}

static void siw_device_cleanup(struct ib_device *base_dev)
{
	struct siw_device *sdev = to_siw_dev(base_dev);

	xa_destroy(&sdev->qp_xa);
	xa_destroy(&sdev->mem_xa);
}

static int siw_create_tx_threads(void)
{
	int cpu, assigned = 0;

	for_each_online_cpu(cpu) {
		/* Skip HT cores */
		if (cpu % cpumask_weight(topology_sibling_cpumask(cpu)))
			continue;

		siw_tx_thread[cpu] =
			kthread_run_on_cpu(siw_run_sq,
					   (unsigned long *)(long)cpu,
					   cpu, "siw_tx/%u");
		if (IS_ERR(siw_tx_thread[cpu])) {
			siw_tx_thread[cpu] = NULL;
			continue;
		}
		assigned++;
	}
	return assigned;
}

static int siw_dev_qualified(struct net_device *netdev)
{
	/*
	 * Additional hardware support can be added here
	 * (e.g. ARPHRD_FDDI, ARPHRD_ATM, ...) - see
	 * <linux/if_arp.h> for type identifiers.
	 */
	if (netdev->type == ARPHRD_ETHER || netdev->type == ARPHRD_IEEE802 ||
	    (netdev->type == ARPHRD_LOOPBACK && loopback_enabled))
		return 1;

	return 0;
}
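
/*
 * Per-CPU bookkeeping for TX thread selection: siw_use_cnt counts the
 * QPs currently assigned to each CPU's TX thread, and siw_cpu_info
 * records which CPUs are valid TX candidates on each NUMA node.
 */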
static DEFINE_PER_CPU(atomic_t, siw_use_cnt);

static struct {
	struct cpumask **tx_valid_cpus;
	int num_nodes;
} siw_cpu_info;

static int siw_init_cpulist(void)
{
	int i, num_nodes = nr_node_ids;

	memset(siw_tx_thread, 0, sizeof(siw_tx_thread));

	siw_cpu_info.num_nodes = num_nodes;

	siw_cpu_info.tx_valid_cpus =
		kcalloc(num_nodes, sizeof(struct cpumask *), GFP_KERNEL);
	if (!siw_cpu_info.tx_valid_cpus) {
		siw_cpu_info.num_nodes = 0;
		return -ENOMEM;
	}
	for (i = 0; i < siw_cpu_info.num_nodes; i++) {
		siw_cpu_info.tx_valid_cpus[i] =
			kzalloc(sizeof(struct cpumask), GFP_KERNEL);
		if (!siw_cpu_info.tx_valid_cpus[i])
			goto out_err;

		cpumask_clear(siw_cpu_info.tx_valid_cpus[i]);
	}
	for_each_possible_cpu(i)
		cpumask_set_cpu(i, siw_cpu_info.tx_valid_cpus[cpu_to_node(i)]);

	return 0;

out_err:
	siw_cpu_info.num_nodes = 0;
	while (--i >= 0)
		kfree(siw_cpu_info.tx_valid_cpus[i]);
	kfree(siw_cpu_info.tx_valid_cpus);
	siw_cpu_info.tx_valid_cpus = NULL;

	return -ENOMEM;
}

static void siw_destroy_cpulist(void)
{
	int i = 0;

	while (i < siw_cpu_info.num_nodes)
		kfree(siw_cpu_info.tx_valid_cpus[i++]);

	kfree(siw_cpu_info.tx_valid_cpus);
}

/*
 * Choose CPU with least number of active QP's from NUMA node of
 * TX interface.
 */
int siw_get_tx_cpu(struct siw_device *sdev)
{
	const struct cpumask *tx_cpumask;
	int i, num_cpus, cpu, min_use, node = sdev->numa_node, tx_cpu = -1;

	if (node < 0)
		tx_cpumask = cpu_online_mask;
	else
		tx_cpumask = siw_cpu_info.tx_valid_cpus[node];

	num_cpus = cpumask_weight(tx_cpumask);
	if (!num_cpus) {
		/* no CPU on this NUMA node */
		tx_cpumask = cpu_online_mask;
		num_cpus = cpumask_weight(tx_cpumask);
	}
	if (!num_cpus)
		goto out;

	cpu = cpumask_first(tx_cpumask);

	for (i = 0, min_use = SIW_MAX_QP; i < num_cpus;
	     i++, cpu = cpumask_next(cpu, tx_cpumask)) {
		int usage;

		/* Skip any cores which have no TX thread */
		if (!siw_tx_thread[cpu])
			continue;

		usage = atomic_read(&per_cpu(siw_use_cnt, cpu));
		if (usage <= min_use) {
			tx_cpu = cpu;
			min_use = usage;
		}
	}
	siw_dbg(&sdev->base_dev,
		"tx cpu %d, node %d, %d qp's\n", tx_cpu, node, min_use);

out:
	if (tx_cpu >= 0)
		atomic_inc(&per_cpu(siw_use_cnt, tx_cpu));
	else
		pr_warn("siw: no tx cpu found\n");

	return tx_cpu;
}
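
/* Release a CPU slot previously claimed by siw_get_tx_cpu(). */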
void siw_put_tx_cpu(int cpu)
{
	atomic_dec(&per_cpu(siw_use_cnt, cpu));
}

static struct ib_qp *siw_get_base_qp(struct ib_device *base_dev, int id)
{
	struct siw_qp *qp = siw_qp_id2obj(to_siw_dev(base_dev), id);

	if (qp) {
		/*
		 * siw_qp_id2obj() increments object reference count
		 */
		siw_qp_put(qp);
		return &qp->base_qp;
	}
	return NULL;
}

static const struct ib_device_ops siw_device_ops = {
	.owner = THIS_MODULE,
	.uverbs_abi_ver = SIW_ABI_VERSION,
	.driver_id = RDMA_DRIVER_SIW,

	.alloc_mr = siw_alloc_mr,
	.alloc_pd = siw_alloc_pd,
	.alloc_ucontext = siw_alloc_ucontext,
	.create_cq = siw_create_cq,
	.create_qp = siw_create_qp,
	.create_srq = siw_create_srq,
	.dealloc_driver = siw_device_cleanup,
	.dealloc_pd = siw_dealloc_pd,
	.dealloc_ucontext = siw_dealloc_ucontext,
	.dereg_mr = siw_dereg_mr,
	.destroy_cq = siw_destroy_cq,
	.destroy_qp = siw_destroy_qp,
	.destroy_srq = siw_destroy_srq,
	.get_dma_mr = siw_get_dma_mr,
	.get_port_immutable = siw_get_port_immutable,
	.iw_accept = siw_accept,
	.iw_add_ref = siw_qp_get_ref,
	.iw_connect = siw_connect,
	.iw_create_listen = siw_create_listen,
	.iw_destroy_listen = siw_destroy_listen,
	.iw_get_qp = siw_get_base_qp,
	.iw_reject = siw_reject,
	.iw_rem_ref = siw_qp_put_ref,
	.map_mr_sg = siw_map_mr_sg,
	.mmap = siw_mmap,
	.mmap_free = siw_mmap_free,
	.modify_qp = siw_verbs_modify_qp,
	.modify_srq = siw_modify_srq,
	.poll_cq = siw_poll_cq,
	.post_recv = siw_post_receive,
	.post_send = siw_post_send,
	.post_srq_recv = siw_post_srq_recv,
	.query_device = siw_query_device,
	.query_gid = siw_query_gid,
	.query_port = siw_query_port,
	.query_qp = siw_query_qp,
	.query_srq = siw_query_srq,
	.req_notify_cq = siw_req_notify_cq,
	.reg_user_mr = siw_reg_user_mr,

	INIT_RDMA_OBJ_SIZE(ib_cq, siw_cq, base_cq),
	INIT_RDMA_OBJ_SIZE(ib_pd, siw_pd, base_pd),
	INIT_RDMA_OBJ_SIZE(ib_qp, siw_qp, base_qp),
	INIT_RDMA_OBJ_SIZE(ib_srq, siw_srq, base_srq),
	INIT_RDMA_OBJ_SIZE(ib_ucontext, siw_ucontext, base_ucontext),
};

static struct siw_device *siw_device_create(struct net_device *netdev)
{
	struct siw_device *sdev = NULL;
	struct ib_device *base_dev;
	int rv;

	sdev = ib_alloc_device(siw_device, base_dev);
	if (!sdev)
		return NULL;

	base_dev = &sdev->base_dev;
	sdev->netdev = netdev;

	if (netdev->type != ARPHRD_LOOPBACK) {
		addrconf_addr_eui48((unsigned char *)&base_dev->node_guid,
				    netdev->dev_addr);
	} else {
		/*
		 * The loopback device does not have a HW address,
		 * but connection management lib expects gid != 0
		 */
		size_t len = min_t(size_t, strlen(base_dev->name), 6);
		char addr[6] = { };

		memcpy(addr, base_dev->name, len);
		addrconf_addr_eui48((unsigned char *)&base_dev->node_guid,
				    addr);
	}

	base_dev->uverbs_cmd_mask |= BIT_ULL(IB_USER_VERBS_CMD_POST_SEND);

	base_dev->node_type = RDMA_NODE_RNIC;
	memcpy(base_dev->node_desc, SIW_NODE_DESC_COMMON,
	       sizeof(SIW_NODE_DESC_COMMON));

	/*
	 * Current model (one-to-one device association):
	 * One Softiwarp device per net_device or, equivalently,
	 * per physical port.
	 */
	base_dev->phys_port_cnt = 1;
	base_dev->num_comp_vectors = num_possible_cpus();

	xa_init_flags(&sdev->qp_xa, XA_FLAGS_ALLOC1);
	xa_init_flags(&sdev->mem_xa, XA_FLAGS_ALLOC1);

	ib_set_device_ops(base_dev, &siw_device_ops);
	rv = ib_device_set_netdev(base_dev, netdev, 1);
	if (rv)
		goto error;

	memcpy(base_dev->iw_ifname, netdev->name,
	       sizeof(base_dev->iw_ifname));
	/* Disable TCP port mapping */
	base_dev->iw_driver_flags = IW_F_NO_PORT_MAP;

	sdev->attrs.max_qp = SIW_MAX_QP;
	sdev->attrs.max_qp_wr = SIW_MAX_QP_WR;
	sdev->attrs.max_ord = SIW_MAX_ORD_QP;
	sdev->attrs.max_ird = SIW_MAX_IRD_QP;
	sdev->attrs.max_sge = SIW_MAX_SGE;
	sdev->attrs.max_sge_rd = SIW_MAX_SGE_RD;
	sdev->attrs.max_cq = SIW_MAX_CQ;
	sdev->attrs.max_cqe = SIW_MAX_CQE;
	sdev->attrs.max_mr = SIW_MAX_MR;
	sdev->attrs.max_pd = SIW_MAX_PD;
	sdev->attrs.max_mw = SIW_MAX_MW;
	sdev->attrs.max_srq = SIW_MAX_SRQ;
	sdev->attrs.max_srq_wr = SIW_MAX_SRQ_WR;
	sdev->attrs.max_srq_sge = SIW_MAX_SGE;

	INIT_LIST_HEAD(&sdev->cep_list);
	INIT_LIST_HEAD(&sdev->qp_list);

	atomic_set(&sdev->num_ctx, 0);
	atomic_set(&sdev->num_srq, 0);
	atomic_set(&sdev->num_qp, 0);
	atomic_set(&sdev->num_cq, 0);
	atomic_set(&sdev->num_mr, 0);
	atomic_set(&sdev->num_pd, 0);

	sdev->numa_node = dev_to_node(&netdev->dev);
	spin_lock_init(&sdev->lock);

	return sdev;
error:
	ib_dealloc_device(base_dev);

	return NULL;
}

/*
 * Network link becomes unavailable. Mark all
 * affected QP's accordingly.
 */
static void siw_netdev_down(struct work_struct *work)
{
	struct siw_device *sdev =
		container_of(work, struct siw_device, netdev_down);

	struct siw_qp_attrs qp_attrs;
	struct list_head *pos, *tmp;

	memset(&qp_attrs, 0, sizeof(qp_attrs));
	qp_attrs.state = SIW_QP_STATE_ERROR;

	list_for_each_safe(pos, tmp, &sdev->qp_list) {
		struct siw_qp *qp = list_entry(pos, struct siw_qp, devq);

		down_write(&qp->state_lock);
		WARN_ON(siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE));
		up_write(&qp->state_lock);
	}
	ib_device_put(&sdev->base_dev);
}
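
/*
 * Invoked from the netdev notifier on NETDEV_GOING_DOWN: take a device
 * reference and defer QP teardown to a workqueue. The reference is
 * dropped in siw_netdev_down() after all QPs were moved to ERROR state.
 */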
static void siw_device_goes_down(struct siw_device *sdev)
{
	if (ib_device_try_get(&sdev->base_dev)) {
		INIT_WORK(&sdev->netdev_down, siw_netdev_down);
		schedule_work(&sdev->netdev_down);
	}
}

static int siw_netdev_event(struct notifier_block *nb, unsigned long event,
			    void *arg)
{
	struct net_device *netdev = netdev_notifier_info_to_dev(arg);
	struct ib_device *base_dev;
	struct siw_device *sdev;

	dev_dbg(&netdev->dev, "siw: event %lu\n", event);

	if (dev_net(netdev) != &init_net)
		return NOTIFY_OK;

	base_dev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_SIW);
	if (!base_dev)
		return NOTIFY_OK;

	sdev = to_siw_dev(base_dev);

	switch (event) {
	case NETDEV_UP:
		sdev->state = IB_PORT_ACTIVE;
		siw_port_event(sdev, 1, IB_EVENT_PORT_ACTIVE);
		break;

	case NETDEV_GOING_DOWN:
		siw_device_goes_down(sdev);
		break;

	case NETDEV_DOWN:
		sdev->state = IB_PORT_DOWN;
		siw_port_event(sdev, 1, IB_EVENT_PORT_ERR);
		break;

	case NETDEV_REGISTER:
		/*
		 * Device registration now handled only by
		 * rdma netlink commands. So it shall be impossible
		 * to end up here with a valid siw device.
		 */
		siw_dbg(base_dev, "unexpected NETDEV_REGISTER event\n");
		break;

	case NETDEV_UNREGISTER:
		ib_unregister_device_queued(&sdev->base_dev);
		break;

	case NETDEV_CHANGEADDR:
		siw_port_event(sdev, 1, IB_EVENT_LID_CHANGE);
		break;
	/*
	 * Todo: Below netdev events are currently not handled.
	 */
	case NETDEV_CHANGEMTU:
	case NETDEV_CHANGE:
		break;

	default:
		break;
	}
	ib_device_put(&sdev->base_dev);

	return NOTIFY_OK;
}

static struct notifier_block siw_netdev_nb = {
	.notifier_call = siw_netdev_event,
};

static int siw_newlink(const char *basedev_name, struct net_device *netdev)
{
	struct ib_device *base_dev;
	struct siw_device *sdev = NULL;
	int rv = -ENOMEM;

	if (!siw_dev_qualified(netdev))
		return -EINVAL;

	base_dev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_SIW);
	if (base_dev) {
		ib_device_put(base_dev);
		return -EEXIST;
	}
	sdev = siw_device_create(netdev);
	if (sdev) {
		dev_dbg(&netdev->dev, "siw: new device\n");

		if (netif_running(netdev) && netif_carrier_ok(netdev))
			sdev->state = IB_PORT_ACTIVE;
		else
			sdev->state = IB_PORT_DOWN;

		rv = siw_device_register(sdev, basedev_name);
		if (rv)
			ib_dealloc_device(&sdev->base_dev);
	}
	return rv;
}
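
/*
 * siw_newlink() is reached through the generic RDMA netlink "link add"
 * interface; the link type "siw" registered below selects this driver.
 */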
static struct rdma_link_ops siw_link_ops = {
	.type = "siw",
	.newlink = siw_newlink,
};

/*
 * siw_init_module - Initialize Softiwarp module and register with netdev
 * subsystem.
 */
static __init int siw_init_module(void)
{
	int rv;
	int nr_cpu;

	if (SENDPAGE_THRESH < SIW_MAX_INLINE) {
		pr_info("siw: sendpage threshold too small: %u\n",
			(int)SENDPAGE_THRESH);
		rv = -EINVAL;
		goto out_error;
	}
	rv = siw_init_cpulist();
	if (rv)
		goto out_error;

	rv = siw_cm_init();
	if (rv)
		goto out_error;

	if (!siw_create_tx_threads()) {
		pr_info("siw: Could not start any TX thread\n");
		rv = -ENOMEM;
		goto out_error;
	}
	/*
	 * Locate CRC32 algorithm. If unsuccessful, fail
	 * loading siw only, if CRC is required.
	 */
	siw_crypto_shash = crypto_alloc_shash("crc32c", 0, 0);
	if (IS_ERR(siw_crypto_shash)) {
		pr_info("siw: Loading CRC32c failed: %ld\n",
			PTR_ERR(siw_crypto_shash));
		siw_crypto_shash = NULL;
		if (mpa_crc_required) {
			rv = -EOPNOTSUPP;
			goto out_error;
		}
	}
	rv = register_netdevice_notifier(&siw_netdev_nb);
	if (rv)
		goto out_error;

	rdma_link_register(&siw_link_ops);

	pr_info("SoftiWARP attached\n");
	return 0;

out_error:
	for (nr_cpu = 0; nr_cpu < nr_cpu_ids; nr_cpu++) {
		if (siw_tx_thread[nr_cpu]) {
			siw_stop_tx_thread(nr_cpu);
			siw_tx_thread[nr_cpu] = NULL;
		}
	}
	if (siw_crypto_shash)
		crypto_free_shash(siw_crypto_shash);

	pr_info("SoftIWARP attach failed. Error: %d\n", rv);

	siw_cm_exit();
	siw_destroy_cpulist();

	return rv;
}

static void __exit siw_exit_module(void)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		if (siw_tx_thread[cpu]) {
			siw_stop_tx_thread(cpu);
			siw_tx_thread[cpu] = NULL;
		}
	}
	unregister_netdevice_notifier(&siw_netdev_nb);
	rdma_link_unregister(&siw_link_ops);
	ib_unregister_driver(RDMA_DRIVER_SIW);

	siw_cm_exit();

	siw_destroy_cpulist();

	if (siw_crypto_shash)
		crypto_free_shash(siw_crypto_shash);

	pr_info("SoftiWARP detached\n");
}

module_init(siw_init_module);
module_exit(siw_exit_module);

MODULE_ALIAS_RDMA_LINK("siw");
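
/*
 * Note: MODULE_ALIAS_RDMA_LINK("siw") lets an RDMA netlink "link add"
 * request for type "siw" autoload this module, so a software iWARP
 * device can typically be created from user space with the iproute2
 * rdma tool, e.g. "rdma link add siw0 type siw netdev eth0"
 * (illustrative invocation; device and interface names are examples).
 */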