
/lib/netlink-socket.c

https://github.com/noironetworks/ovs
C | 1817 lines | 1321 code | 194 blank | 302 comment | 238 complexity | MD5 b55adf649101602b9dd80fe6ae7a23a4
Possible License(s): Apache-2.0, LGPL-2.1


  1. /*
  2. * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at:
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include <config.h>
  17. #include "netlink-socket.h"
  18. #include <errno.h>
  19. #include <inttypes.h>
  20. #include <stdlib.h>
  21. #include <sys/types.h>
  22. #include <sys/uio.h>
  23. #include <unistd.h>
  24. #include "coverage.h"
  25. #include "dynamic-string.h"
  26. #include "hash.h"
  27. #include "hmap.h"
  28. #include "netlink.h"
  29. #include "netlink-protocol.h"
  30. #include "odp-netlink.h"
  31. #include "ofpbuf.h"
  32. #include "ovs-thread.h"
  33. #include "poll-loop.h"
  34. #include "seq.h"
  35. #include "socket-util.h"
  36. #include "util.h"
  37. #include "openvswitch/vlog.h"
  38. VLOG_DEFINE_THIS_MODULE(netlink_socket);
  39. COVERAGE_DEFINE(netlink_overflow);
  40. COVERAGE_DEFINE(netlink_received);
  41. COVERAGE_DEFINE(netlink_recv_jumbo);
  42. COVERAGE_DEFINE(netlink_sent);
  43. /* Linux header file confusion causes this to be undefined. */
  44. #ifndef SOL_NETLINK
  45. #define SOL_NETLINK 270
  46. #endif
  47. #ifdef _WIN32
  48. static struct ovs_mutex portid_mutex = OVS_MUTEX_INITIALIZER;
  49. static uint32_t g_last_portid = 0;
  50. /* Port IDs must be unique! */
  51. static uint32_t
  52. portid_next(void)
  53. OVS_GUARDED_BY(portid_mutex)
  54. {
  55. g_last_portid++;
  56. return g_last_portid;
  57. }
  58. #endif /* _WIN32 */
  59. /* A single (bad) Netlink message can in theory dump out many, many log
  60. * messages, so the burst size is set quite high here to avoid missing useful
  61. * information. Also, at high logging levels we log *all* Netlink messages. */
  62. static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(60, 600);
  63. static uint32_t nl_sock_allocate_seq(struct nl_sock *, unsigned int n);
  64. static void log_nlmsg(const char *function, int error,
  65. const void *message, size_t size, int protocol);
  66. #ifdef _WIN32
  67. static int get_sock_pid_from_kernel(struct nl_sock *sock);
  68. #endif
  69. /* Netlink sockets. */
  70. struct nl_sock {
  71. #ifdef _WIN32
  72. HANDLE handle;
  73. OVERLAPPED overlapped;
  74. DWORD read_ioctl;
  75. #else
  76. int fd;
  77. #endif
  78. uint32_t next_seq;
  79. uint32_t pid;
  80. int protocol;
  81. unsigned int rcvbuf; /* Receive buffer size (SO_RCVBUF). */
  82. };
  83. /* Compile-time limit on iovecs, so that we can allocate a maximum-size array
  84. * of iovecs on the stack. */
  85. #define MAX_IOVS 128
  86. /* Maximum number of iovecs that may be passed to sendmsg, capped at a
  87. * minimum of _XOPEN_IOV_MAX (16) and a maximum of MAX_IOVS.
  88. *
  89. * Initialized by nl_sock_create(). */
  90. static int max_iovs;
  91. static int nl_pool_alloc(int protocol, struct nl_sock **sockp);
  92. static void nl_pool_release(struct nl_sock *);
  93. /* Creates a new netlink socket for the given netlink 'protocol'
  94. * (NETLINK_ROUTE, NETLINK_GENERIC, ...). Returns 0 and sets '*sockp' to the
  95. * new socket if successful, otherwise returns a positive errno value. */
  96. int
  97. nl_sock_create(int protocol, struct nl_sock **sockp)
  98. {
  99. static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
  100. struct nl_sock *sock;
  101. #ifndef _WIN32
  102. struct sockaddr_nl local, remote;
  103. #endif
  104. socklen_t local_size;
  105. int rcvbuf;
  106. int retval = 0;
  107. if (ovsthread_once_start(&once)) {
  108. int save_errno = errno;
  109. errno = 0;
  110. max_iovs = sysconf(_SC_UIO_MAXIOV);
  111. if (max_iovs < _XOPEN_IOV_MAX) {
  112. if (max_iovs == -1 && errno) {
  113. VLOG_WARN("sysconf(_SC_UIO_MAXIOV): %s", ovs_strerror(errno));
  114. }
  115. max_iovs = _XOPEN_IOV_MAX;
  116. } else if (max_iovs > MAX_IOVS) {
  117. max_iovs = MAX_IOVS;
  118. }
  119. errno = save_errno;
  120. ovsthread_once_done(&once);
  121. }
  122. *sockp = NULL;
  123. sock = xmalloc(sizeof *sock);
  124. #ifdef _WIN32
  125. sock->handle = CreateFile(OVS_DEVICE_NAME_USER,
  126. GENERIC_READ | GENERIC_WRITE,
  127. FILE_SHARE_READ | FILE_SHARE_WRITE,
  128. NULL, OPEN_EXISTING,
  129. FILE_FLAG_OVERLAPPED, NULL);
  130. if (sock->handle == INVALID_HANDLE_VALUE) {
131. VLOG_ERR("CreateFile: %s", ovs_lasterror_to_string());
  132. goto error;
  133. }
  134. memset(&sock->overlapped, 0, sizeof sock->overlapped);
  135. sock->overlapped.hEvent = CreateEvent(NULL, FALSE, FALSE, NULL);
  136. if (sock->overlapped.hEvent == NULL) {
137. VLOG_ERR("CreateEvent: %s", ovs_lasterror_to_string());
  138. goto error;
  139. }
  140. /* Initialize the type/ioctl to Generic */
  141. sock->read_ioctl = OVS_IOCTL_READ;
  142. #else
  143. sock->fd = socket(AF_NETLINK, SOCK_RAW, protocol);
  144. if (sock->fd < 0) {
145. VLOG_ERR("socket: %s", ovs_strerror(errno));
  146. goto error;
  147. }
  148. #endif
  149. sock->protocol = protocol;
  150. sock->next_seq = 1;
  151. rcvbuf = 1024 * 1024;
  152. #ifdef _WIN32
  153. sock->rcvbuf = rcvbuf;
  154. retval = get_sock_pid_from_kernel(sock);
  155. if (retval != 0) {
  156. goto error;
  157. }
  158. #else
  159. if (setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUFFORCE,
  160. &rcvbuf, sizeof rcvbuf)) {
  161. /* Only root can use SO_RCVBUFFORCE. Everyone else gets EPERM.
  162. * Warn only if the failure is therefore unexpected. */
  163. if (errno != EPERM) {
  164. VLOG_WARN_RL(&rl, "setting %d-byte socket receive buffer failed "
  165. "(%s)", rcvbuf, ovs_strerror(errno));
  166. }
  167. }
  168. retval = get_socket_rcvbuf(sock->fd);
  169. if (retval < 0) {
  170. retval = -retval;
  171. goto error;
  172. }
  173. sock->rcvbuf = retval;
  174. /* Connect to kernel (pid 0) as remote address. */
  175. memset(&remote, 0, sizeof remote);
  176. remote.nl_family = AF_NETLINK;
  177. remote.nl_pid = 0;
  178. if (connect(sock->fd, (struct sockaddr *) &remote, sizeof remote) < 0) {
  179. VLOG_ERR("connect(0): %s", ovs_strerror(errno));
  180. goto error;
  181. }
  182. /* Obtain pid assigned by kernel. */
  183. local_size = sizeof local;
  184. if (getsockname(sock->fd, (struct sockaddr *) &local, &local_size) < 0) {
  185. VLOG_ERR("getsockname: %s", ovs_strerror(errno));
  186. goto error;
  187. }
  188. if (local_size < sizeof local || local.nl_family != AF_NETLINK) {
  189. VLOG_ERR("getsockname returned bad Netlink name");
  190. retval = EINVAL;
  191. goto error;
  192. }
  193. sock->pid = local.nl_pid;
  194. #endif
  195. *sockp = sock;
  196. return 0;
  197. error:
  198. if (retval == 0) {
  199. retval = errno;
  200. if (retval == 0) {
  201. retval = EINVAL;
  202. }
  203. }
  204. #ifdef _WIN32
  205. if (sock->overlapped.hEvent) {
  206. CloseHandle(sock->overlapped.hEvent);
  207. }
  208. if (sock->handle != INVALID_HANDLE_VALUE) {
  209. CloseHandle(sock->handle);
  210. }
  211. #else
  212. if (sock->fd >= 0) {
  213. close(sock->fd);
  214. }
  215. #endif
  216. free(sock);
  217. return retval;
  218. }
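/* Example usage (an illustrative sketch, not part of this file; the
 * example_* helper name is hypothetical): create a Generic Netlink socket,
 * check the positive-errno return convention, and release it again. */
static int
example_create_genl_sock(void)
{
    struct nl_sock *sock;
    int error;

    error = nl_sock_create(NETLINK_GENERIC, &sock);
    if (error) {
        VLOG_ERR("could not create Netlink socket (%s)", ovs_strerror(error));
        return error;
    }

    /* ... use 'sock' for transactions, dumps, or notifications ... */

    nl_sock_destroy(sock);
    return 0;
}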
  219. /* Creates a new netlink socket for the same protocol as 'src'. Returns 0 and
  220. * sets '*sockp' to the new socket if successful, otherwise returns a positive
  221. * errno value. */
  222. int
  223. nl_sock_clone(const struct nl_sock *src, struct nl_sock **sockp)
  224. {
  225. return nl_sock_create(src->protocol, sockp);
  226. }
  227. /* Destroys netlink socket 'sock'. */
  228. void
  229. nl_sock_destroy(struct nl_sock *sock)
  230. {
  231. if (sock) {
  232. #ifdef _WIN32
  233. if (sock->overlapped.hEvent) {
  234. CloseHandle(sock->overlapped.hEvent);
  235. }
  236. CloseHandle(sock->handle);
  237. #else
  238. close(sock->fd);
  239. #endif
  240. free(sock);
  241. }
  242. }
  243. #ifdef _WIN32
  244. /* Reads the pid for 'sock' generated in the kernel datapath. The function
  245. * follows a transaction semantic. Eventually this function should call into
  246. * nl_transact. */
  247. static int
  248. get_sock_pid_from_kernel(struct nl_sock *sock)
  249. {
  250. struct nl_transaction txn;
  251. struct ofpbuf request;
  252. uint64_t request_stub[128];
  253. struct ofpbuf reply;
  254. uint64_t reply_stub[128];
  255. struct ovs_header *ovs_header;
  256. struct nlmsghdr *nlmsg;
  257. uint32_t seq;
  258. int retval;
  259. DWORD bytes;
  260. int ovs_msg_size = sizeof (struct nlmsghdr) + sizeof (struct genlmsghdr) +
  261. sizeof (struct ovs_header);
  262. ofpbuf_use_stub(&request, request_stub, sizeof request_stub);
  263. txn.request = &request;
  264. ofpbuf_use_stub(&reply, reply_stub, sizeof reply_stub);
  265. txn.reply = &reply;
  266. seq = nl_sock_allocate_seq(sock, 1);
  267. nl_msg_put_genlmsghdr(&request, 0, OVS_WIN_NL_CTRL_FAMILY_ID, 0,
  268. OVS_CTRL_CMD_WIN_GET_PID, OVS_WIN_CONTROL_VERSION);
  269. nlmsg = nl_msg_nlmsghdr(txn.request);
  270. nlmsg->nlmsg_seq = seq;
  271. ovs_header = ofpbuf_put_uninit(&request, sizeof *ovs_header);
  272. ovs_header->dp_ifindex = 0;
  273. ovs_header = ofpbuf_put_uninit(&reply, ovs_msg_size);
  274. if (!DeviceIoControl(sock->handle, OVS_IOCTL_TRANSACT,
  275. ofpbuf_data(txn.request), ofpbuf_size(txn.request),
  276. ofpbuf_data(txn.reply), ofpbuf_size(txn.reply),
  277. &bytes, NULL)) {
  278. retval = EINVAL;
  279. goto done;
  280. } else {
  281. if (bytes < ovs_msg_size) {
  282. retval = EINVAL;
  283. goto done;
  284. }
  285. nlmsg = nl_msg_nlmsghdr(txn.reply);
  286. if (nlmsg->nlmsg_seq != seq) {
  287. retval = EINVAL;
  288. goto done;
  289. }
  290. sock->pid = nlmsg->nlmsg_pid;
  291. }
  292. retval = 0;
  293. done:
  294. ofpbuf_uninit(&request);
  295. ofpbuf_uninit(&reply);
  296. return retval;
  297. }
  298. #endif /* _WIN32 */
  299. #ifdef _WIN32
  300. static int __inline
  301. nl_sock_mcgroup(struct nl_sock *sock, unsigned int multicast_group, bool join)
  302. {
  303. struct ofpbuf request;
  304. uint64_t request_stub[128];
  305. struct ovs_header *ovs_header;
  306. struct nlmsghdr *nlmsg;
  307. int error;
  308. ofpbuf_use_stub(&request, request_stub, sizeof request_stub);
  309. nl_msg_put_genlmsghdr(&request, 0, OVS_WIN_NL_CTRL_FAMILY_ID, 0,
  310. OVS_CTRL_CMD_MC_SUBSCRIBE_REQ,
  311. OVS_WIN_CONTROL_VERSION);
  312. ovs_header = ofpbuf_put_uninit(&request, sizeof *ovs_header);
  313. ovs_header->dp_ifindex = 0;
  314. nl_msg_put_u32(&request, OVS_NL_ATTR_MCAST_GRP, multicast_group);
  315. nl_msg_put_u8(&request, OVS_NL_ATTR_MCAST_JOIN, join ? 1 : 0);
  316. error = nl_sock_send(sock, &request, true);
  317. ofpbuf_uninit(&request);
  318. return error;
  319. }
  320. #endif
  321. /* Tries to add 'sock' as a listener for 'multicast_group'. Returns 0 if
  322. * successful, otherwise a positive errno value.
  323. *
  324. * A socket that is subscribed to a multicast group that receives asynchronous
  325. * notifications must not be used for Netlink transactions or dumps, because
  326. * transactions and dumps can cause notifications to be lost.
  327. *
  328. * Multicast group numbers are always positive.
  329. *
  330. * It is not an error to attempt to join a multicast group to which a socket
  331. * already belongs. */
  332. int
  333. nl_sock_join_mcgroup(struct nl_sock *sock, unsigned int multicast_group)
  334. {
  335. #ifdef _WIN32
  336. /* Set the socket type as a "multicast" socket */
  337. sock->read_ioctl = OVS_IOCTL_READ_EVENT;
  338. int error = nl_sock_mcgroup(sock, multicast_group, true);
  339. if (error) {
  340. sock->read_ioctl = OVS_IOCTL_READ;
  341. VLOG_WARN("could not join multicast group %u (%s)",
  342. multicast_group, ovs_strerror(error));
  343. return error;
  344. }
  345. #else
  346. if (setsockopt(sock->fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP,
  347. &multicast_group, sizeof multicast_group) < 0) {
  348. VLOG_WARN("could not join multicast group %u (%s)",
  349. multicast_group, ovs_strerror(errno));
  350. return errno;
  351. }
  352. #endif
  353. return 0;
  354. }
  355. #ifdef _WIN32
  356. int
  357. nl_sock_subscribe_packets(struct nl_sock *sock)
  358. {
  359. int error;
  360. if (sock->read_ioctl != OVS_IOCTL_READ) {
  361. return EINVAL;
  362. }
  363. error = nl_sock_subscribe_packet__(sock, true);
  364. if (error) {
365. VLOG_WARN("could not subscribe to packets (%s)",
366. ovs_strerror(error));
  367. return error;
  368. }
  369. sock->read_ioctl = OVS_IOCTL_READ_PACKET;
  370. return 0;
  371. }
  372. int
  373. nl_sock_unsubscribe_packets(struct nl_sock *sock)
  374. {
  375. ovs_assert(sock->read_ioctl == OVS_IOCTL_READ_PACKET);
  376. int error = nl_sock_subscribe_packet__(sock, false);
  377. if (error) {
378. VLOG_WARN("could not unsubscribe from packets (%s)",
379. ovs_strerror(error));
  380. return error;
  381. }
  382. sock->read_ioctl = OVS_IOCTL_READ;
  383. return 0;
  384. }
  385. int
  386. nl_sock_subscribe_packet__(struct nl_sock *sock, bool subscribe)
  387. {
  388. struct ofpbuf request;
  389. uint64_t request_stub[128];
  390. struct ovs_header *ovs_header;
  391. struct nlmsghdr *nlmsg;
  392. int error;
  393. ofpbuf_use_stub(&request, request_stub, sizeof request_stub);
  394. nl_msg_put_genlmsghdr(&request, 0, OVS_WIN_NL_CTRL_FAMILY_ID, 0,
  395. OVS_CTRL_CMD_PACKET_SUBSCRIBE_REQ,
  396. OVS_WIN_CONTROL_VERSION);
  397. ovs_header = ofpbuf_put_uninit(&request, sizeof *ovs_header);
  398. ovs_header->dp_ifindex = 0;
  399. nl_msg_put_u8(&request, OVS_NL_ATTR_PACKET_SUBSCRIBE, subscribe ? 1 : 0);
  400. nl_msg_put_u32(&request, OVS_NL_ATTR_PACKET_PID, sock->pid);
  401. error = nl_sock_send(sock, &request, true);
  402. ofpbuf_uninit(&request);
  403. return error;
  404. }
  405. #endif
  406. /* Tries to make 'sock' stop listening to 'multicast_group'. Returns 0 if
  407. * successful, otherwise a positive errno value.
  408. *
  409. * Multicast group numbers are always positive.
  410. *
  411. * It is not an error to attempt to leave a multicast group to which a socket
  412. * does not belong.
  413. *
  414. * On success, reading from 'sock' will still return any messages that were
  415. * received on 'multicast_group' before the group was left. */
  416. int
  417. nl_sock_leave_mcgroup(struct nl_sock *sock, unsigned int multicast_group)
  418. {
  419. #ifdef _WIN32
  420. int error = nl_sock_mcgroup(sock, multicast_group, false);
  421. if (error) {
  422. VLOG_WARN("could not leave multicast group %u (%s)",
  423. multicast_group, ovs_strerror(error));
  424. return error;
  425. }
  426. sock->read_ioctl = OVS_IOCTL_READ;
  427. #else
  428. if (setsockopt(sock->fd, SOL_NETLINK, NETLINK_DROP_MEMBERSHIP,
  429. &multicast_group, sizeof multicast_group) < 0) {
  430. VLOG_WARN("could not leave multicast group %u (%s)",
  431. multicast_group, ovs_strerror(errno));
  432. return errno;
  433. }
  434. #endif
  435. return 0;
  436. }
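/* Example usage (an illustrative sketch; the example_* helper is
 * hypothetical, and the group number is assumed to come from
 * nl_lookup_genl_mcgroup() or similar): subscribe a dedicated socket to a
 * multicast group, block for one notification, then unsubscribe.  As the
 * comment on nl_sock_join_mcgroup() notes, such a socket should not also be
 * used for transactions or dumps. */
static int
example_listen_mcgroup(int protocol, unsigned int group)
{
    uint64_t buf_stub[4096 / 8];
    struct ofpbuf buf;
    struct nl_sock *sock;
    int error;

    error = nl_sock_create(protocol, &sock);
    if (error) {
        return error;
    }
    error = nl_sock_join_mcgroup(sock, group);
    if (error) {
        nl_sock_destroy(sock);
        return error;
    }

    ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
    error = nl_sock_recv(sock, &buf, true);   /* Block for one message. */
    ofpbuf_uninit(&buf);

    nl_sock_leave_mcgroup(sock, group);
    nl_sock_destroy(sock);
    return error;
}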
  437. static int
  438. nl_sock_send__(struct nl_sock *sock, const struct ofpbuf *msg,
  439. uint32_t nlmsg_seq, bool wait)
  440. {
  441. struct nlmsghdr *nlmsg = nl_msg_nlmsghdr(msg);
  442. int error;
  443. nlmsg->nlmsg_len = ofpbuf_size(msg);
  444. nlmsg->nlmsg_seq = nlmsg_seq;
  445. nlmsg->nlmsg_pid = sock->pid;
  446. do {
  447. int retval;
  448. #ifdef _WIN32
  449. DWORD bytes;
  450. if (!DeviceIoControl(sock->handle, OVS_IOCTL_WRITE,
  451. ofpbuf_data(msg), ofpbuf_size(msg), NULL, 0,
  452. &bytes, NULL)) {
  453. retval = -1;
  454. /* XXX: Map to a more appropriate error based on GetLastError(). */
  455. errno = EINVAL;
  456. } else {
  457. retval = ofpbuf_size(msg);
  458. }
  459. #else
  460. retval = send(sock->fd, ofpbuf_data(msg), ofpbuf_size(msg),
  461. wait ? 0 : MSG_DONTWAIT);
  462. #endif
  463. error = retval < 0 ? errno : 0;
  464. } while (error == EINTR);
  465. log_nlmsg(__func__, error, ofpbuf_data(msg), ofpbuf_size(msg), sock->protocol);
  466. if (!error) {
  467. COVERAGE_INC(netlink_sent);
  468. }
  469. return error;
  470. }
  471. /* Tries to send 'msg', which must contain a Netlink message, to the kernel on
  472. * 'sock'. nlmsg_len in 'msg' will be finalized to match ofpbuf_size(msg), nlmsg_pid
  473. * will be set to 'sock''s pid, and nlmsg_seq will be initialized to a fresh
  474. * sequence number, before the message is sent.
  475. *
  476. * Returns 0 if successful, otherwise a positive errno value. If
  477. * 'wait' is true, then the send will wait until buffer space is ready;
  478. * otherwise, returns EAGAIN if the 'sock' send buffer is full. */
  479. int
  480. nl_sock_send(struct nl_sock *sock, const struct ofpbuf *msg, bool wait)
  481. {
  482. return nl_sock_send_seq(sock, msg, nl_sock_allocate_seq(sock, 1), wait);
  483. }
  484. /* Tries to send 'msg', which must contain a Netlink message, to the kernel on
  485. * 'sock'. nlmsg_len in 'msg' will be finalized to match ofpbuf_size(msg), nlmsg_pid
  486. * will be set to 'sock''s pid, and nlmsg_seq will be initialized to
  487. * 'nlmsg_seq', before the message is sent.
  488. *
  489. * Returns 0 if successful, otherwise a positive errno value. If
  490. * 'wait' is true, then the send will wait until buffer space is ready;
  491. * otherwise, returns EAGAIN if the 'sock' send buffer is full.
  492. *
  493. * This function is suitable for sending a reply to a request that was received
  494. * with sequence number 'nlmsg_seq'. Otherwise, use nl_sock_send() instead. */
  495. int
  496. nl_sock_send_seq(struct nl_sock *sock, const struct ofpbuf *msg,
  497. uint32_t nlmsg_seq, bool wait)
  498. {
  499. return nl_sock_send__(sock, msg, nlmsg_seq, wait);
  500. }
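/* Example usage (an illustrative sketch; the example_* helper is
 * hypothetical): build a Generic Netlink request in an ofpbuf and hand it to
 * nl_sock_send(), which fills in nlmsg_len, nlmsg_pid, and a fresh
 * nlmsg_seq.  The CTRL_CMD_GETFAMILY request mirrors the family lookup path
 * used later in this file. */
static int
example_send_getfamily(struct nl_sock *sock, const char *family_name)
{
    struct ofpbuf request;
    int error;

    ofpbuf_init(&request, 0);
    nl_msg_put_genlmsghdr(&request, 0, GENL_ID_CTRL, NLM_F_REQUEST,
                          CTRL_CMD_GETFAMILY, 1);
    nl_msg_put_string(&request, CTRL_ATTR_FAMILY_NAME, family_name);
    error = nl_sock_send(sock, &request, true);  /* Wait for buffer space. */
    ofpbuf_uninit(&request);
    return error;
}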
  501. static int
  502. nl_sock_recv__(struct nl_sock *sock, struct ofpbuf *buf, bool wait)
  503. {
  504. /* We can't accurately predict the size of the data to be received. The
  505. * caller is supposed to have allocated enough space in 'buf' to handle the
  506. * "typical" case. To handle exceptions, we make available enough space in
  507. * 'tail' to allow Netlink messages to be up to 64 kB long (a reasonable
  508. * figure since that's the maximum length of a Netlink attribute). */
  509. struct nlmsghdr *nlmsghdr;
  510. uint8_t tail[65536];
  511. struct iovec iov[2];
  512. struct msghdr msg;
  513. ssize_t retval;
  514. int error;
  515. ovs_assert(buf->allocated >= sizeof *nlmsghdr);
  516. ofpbuf_clear(buf);
  517. iov[0].iov_base = ofpbuf_base(buf);
  518. iov[0].iov_len = buf->allocated;
  519. iov[1].iov_base = tail;
  520. iov[1].iov_len = sizeof tail;
  521. memset(&msg, 0, sizeof msg);
  522. msg.msg_iov = iov;
  523. msg.msg_iovlen = 2;
  524. /* Receive a Netlink message from the kernel.
  525. *
  526. * This works around a kernel bug in which the kernel returns an error code
  527. * as if it were the number of bytes read. It doesn't actually modify
  528. * anything in the receive buffer in that case, so we can initialize the
  529. * Netlink header with an impossible message length and then, upon success,
  530. * check whether it changed. */
  531. nlmsghdr = ofpbuf_base(buf);
  532. do {
  533. nlmsghdr->nlmsg_len = UINT32_MAX;
  534. #ifdef _WIN32
  535. DWORD bytes;
  536. if (!DeviceIoControl(sock->handle, sock->read_ioctl,
  537. NULL, 0, tail, sizeof tail, &bytes, NULL)) {
  538. retval = -1;
  539. errno = EINVAL;
  540. } else {
  541. retval = bytes;
  542. if (retval == 0) {
  543. retval = -1;
  544. errno = EAGAIN;
  545. } else {
  546. if (retval >= buf->allocated) {
  547. ofpbuf_reinit(buf, retval);
  548. nlmsghdr = ofpbuf_base(buf);
  549. nlmsghdr->nlmsg_len = UINT32_MAX;
  550. }
  551. memcpy(ofpbuf_data(buf), tail, retval);
  552. ofpbuf_set_size(buf, retval);
  553. }
  554. }
  555. #else
  556. retval = recvmsg(sock->fd, &msg, wait ? 0 : MSG_DONTWAIT);
  557. #endif
  558. error = (retval < 0 ? errno
  559. : retval == 0 ? ECONNRESET /* not possible? */
  560. : nlmsghdr->nlmsg_len != UINT32_MAX ? 0
  561. : retval);
  562. } while (error == EINTR);
  563. if (error) {
  564. if (error == ENOBUFS) {
  565. /* Socket receive buffer overflow dropped one or more messages that
  566. * the kernel tried to send to us. */
  567. COVERAGE_INC(netlink_overflow);
  568. }
  569. return error;
  570. }
  571. if (msg.msg_flags & MSG_TRUNC) {
  572. VLOG_ERR_RL(&rl, "truncated message (longer than %"PRIuSIZE" bytes)",
  573. sizeof tail);
  574. return E2BIG;
  575. }
  576. if (retval < sizeof *nlmsghdr
  577. || nlmsghdr->nlmsg_len < sizeof *nlmsghdr
  578. || nlmsghdr->nlmsg_len > retval) {
  579. VLOG_ERR_RL(&rl, "received invalid nlmsg (%"PRIuSIZE" bytes < %"PRIuSIZE")",
  580. retval, sizeof *nlmsghdr);
  581. return EPROTO;
  582. }
  583. #ifndef _WIN32
  584. ofpbuf_set_size(buf, MIN(retval, buf->allocated));
  585. if (retval > buf->allocated) {
  586. COVERAGE_INC(netlink_recv_jumbo);
  587. ofpbuf_put(buf, tail, retval - buf->allocated);
  588. }
  589. #endif
  590. log_nlmsg(__func__, 0, ofpbuf_data(buf), ofpbuf_size(buf), sock->protocol);
  591. COVERAGE_INC(netlink_received);
  592. return 0;
  593. }
  594. /* Tries to receive a Netlink message from the kernel on 'sock' into 'buf'. If
  595. * 'wait' is true, waits for a message to be ready. Otherwise, fails with
  596. * EAGAIN if the 'sock' receive buffer is empty.
  597. *
  598. * The caller must have initialized 'buf' with an allocation of at least
  599. * NLMSG_HDRLEN bytes. For best performance, the caller should allocate enough
  600. * space for a "typical" message.
  601. *
  602. * On success, returns 0 and replaces 'buf''s previous content by the received
  603. * message. This function expands 'buf''s allocated memory, as necessary, to
  604. * hold the actual size of the received message.
  605. *
  606. * On failure, returns a positive errno value and clears 'buf' to zero length.
  607. * 'buf' retains its previous memory allocation.
  608. *
  609. * Regardless of success or failure, this function resets 'buf''s headroom to
  610. * 0. */
  611. int
  612. nl_sock_recv(struct nl_sock *sock, struct ofpbuf *buf, bool wait)
  613. {
  614. return nl_sock_recv__(sock, buf, wait);
  615. }
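/* Example usage (an illustrative sketch; the example_* helper is
 * hypothetical): a non-blocking receive loop that uses a stack stub sized
 * for a "typical" message and lets nl_sock_recv() expand the buffer when a
 * larger message arrives.  It stops when the receive queue is empty. */
static void
example_drain_notifications(struct nl_sock *sock)
{
    uint64_t buf_stub[4096 / 8];
    struct ofpbuf buf;

    ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
    for (;;) {
        int error = nl_sock_recv(sock, &buf, false);
        if (error == EAGAIN) {
            break;                  /* Receive queue is empty. */
        } else if (error) {
            VLOG_WARN_RL(&rl, "receive failed (%s)", ovs_strerror(error));
            break;
        }
        /* ... process the Netlink message now in 'buf' ... */
    }
    ofpbuf_uninit(&buf);
}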
  616. static void
  617. nl_sock_record_errors__(struct nl_transaction **transactions, size_t n,
  618. int error)
  619. {
  620. size_t i;
  621. for (i = 0; i < n; i++) {
  622. struct nl_transaction *txn = transactions[i];
  623. txn->error = error;
  624. if (txn->reply) {
  625. ofpbuf_clear(txn->reply);
  626. }
  627. }
  628. }
  629. static int
  630. nl_sock_transact_multiple__(struct nl_sock *sock,
  631. struct nl_transaction **transactions, size_t n,
  632. size_t *done)
  633. {
  634. uint64_t tmp_reply_stub[1024 / 8];
  635. struct nl_transaction tmp_txn;
  636. struct ofpbuf tmp_reply;
  637. uint32_t base_seq;
  638. struct iovec iovs[MAX_IOVS];
  639. struct msghdr msg;
  640. int error;
  641. int i;
  642. base_seq = nl_sock_allocate_seq(sock, n);
  643. *done = 0;
  644. for (i = 0; i < n; i++) {
  645. struct nl_transaction *txn = transactions[i];
  646. struct nlmsghdr *nlmsg = nl_msg_nlmsghdr(txn->request);
  647. nlmsg->nlmsg_len = ofpbuf_size(txn->request);
  648. nlmsg->nlmsg_seq = base_seq + i;
  649. nlmsg->nlmsg_pid = sock->pid;
  650. iovs[i].iov_base = ofpbuf_data(txn->request);
  651. iovs[i].iov_len = ofpbuf_size(txn->request);
  652. }
  653. #ifndef _WIN32
  654. memset(&msg, 0, sizeof msg);
  655. msg.msg_iov = iovs;
  656. msg.msg_iovlen = n;
  657. do {
  658. error = sendmsg(sock->fd, &msg, 0) < 0 ? errno : 0;
  659. } while (error == EINTR);
  660. for (i = 0; i < n; i++) {
  661. struct nl_transaction *txn = transactions[i];
  662. log_nlmsg(__func__, error, ofpbuf_data(txn->request),
  663. ofpbuf_size(txn->request), sock->protocol);
  664. }
  665. if (!error) {
  666. COVERAGE_ADD(netlink_sent, n);
  667. }
  668. if (error) {
  669. return error;
  670. }
  671. ofpbuf_use_stub(&tmp_reply, tmp_reply_stub, sizeof tmp_reply_stub);
  672. tmp_txn.request = NULL;
  673. tmp_txn.reply = &tmp_reply;
  674. tmp_txn.error = 0;
  675. while (n > 0) {
  676. struct nl_transaction *buf_txn, *txn;
  677. uint32_t seq;
  678. /* Find a transaction whose buffer we can use for receiving a reply.
  679. * If no such transaction is left, use tmp_txn. */
  680. buf_txn = &tmp_txn;
  681. for (i = 0; i < n; i++) {
  682. if (transactions[i]->reply) {
  683. buf_txn = transactions[i];
  684. break;
  685. }
  686. }
  687. /* Receive a reply. */
  688. error = nl_sock_recv__(sock, buf_txn->reply, false);
  689. if (error) {
  690. if (error == EAGAIN) {
  691. nl_sock_record_errors__(transactions, n, 0);
  692. *done += n;
  693. error = 0;
  694. }
  695. break;
  696. }
  697. /* Match the reply up with a transaction. */
  698. seq = nl_msg_nlmsghdr(buf_txn->reply)->nlmsg_seq;
  699. if (seq < base_seq || seq >= base_seq + n) {
  700. VLOG_DBG_RL(&rl, "ignoring unexpected seq %#"PRIx32, seq);
  701. continue;
  702. }
  703. i = seq - base_seq;
  704. txn = transactions[i];
  705. /* Fill in the results for 'txn'. */
  706. if (nl_msg_nlmsgerr(buf_txn->reply, &txn->error)) {
  707. if (txn->reply) {
  708. ofpbuf_clear(txn->reply);
  709. }
  710. if (txn->error) {
711. VLOG_DBG_RL(&rl, "received NAK error=%d (%s)",
712. txn->error, ovs_strerror(txn->error));
  713. }
  714. } else {
  715. txn->error = 0;
  716. if (txn->reply && txn != buf_txn) {
  717. /* Swap buffers. */
  718. struct ofpbuf *reply = buf_txn->reply;
  719. buf_txn->reply = txn->reply;
  720. txn->reply = reply;
  721. }
  722. }
  723. /* Fill in the results for transactions before 'txn'. (We have to do
  724. * this after the results for 'txn' itself because of the buffer swap
  725. * above.) */
  726. nl_sock_record_errors__(transactions, i, 0);
  727. /* Advance. */
  728. *done += i + 1;
  729. transactions += i + 1;
  730. n -= i + 1;
  731. base_seq += i + 1;
  732. }
  733. ofpbuf_uninit(&tmp_reply);
  734. #else
  735. error = 0;
  736. uint8_t reply_buf[65536];
  737. for (i = 0; i < n; i++) {
  738. DWORD reply_len;
  739. struct nl_transaction *txn = transactions[i];
  740. struct nlmsghdr *request_nlmsg, *reply_nlmsg;
  741. if (!DeviceIoControl(sock->handle, OVS_IOCTL_TRANSACT,
  742. ofpbuf_data(txn->request),
  743. ofpbuf_size(txn->request),
  744. reply_buf, sizeof reply_buf,
  745. &reply_len, NULL)) {
  746. /* XXX: Map to a more appropriate error. */
  747. error = EINVAL;
  748. break;
  749. }
750. request_nlmsg = nl_msg_nlmsghdr(txn->request);
751. if (reply_len < sizeof *reply_nlmsg) {
752. nl_sock_record_errors__(transactions, n, 0);
753. VLOG_DBG_RL(&rl, "insufficient length of reply %#"PRIu32
754. " for seq: %#"PRIx32, reply_len, request_nlmsg->nlmsg_seq);
755. break;
756. }
757. /* Validate the sequence number in the reply. */
758. reply_nlmsg = (struct nlmsghdr *)reply_buf;
  759. if (request_nlmsg->nlmsg_seq != reply_nlmsg->nlmsg_seq) {
  760. ovs_assert(request_nlmsg->nlmsg_seq == reply_nlmsg->nlmsg_seq);
  761. VLOG_DBG_RL(&rl, "mismatched seq request %#"PRIx32
  762. ", reply %#"PRIx32, request_nlmsg->nlmsg_seq,
  763. reply_nlmsg->nlmsg_seq);
  764. break;
  765. }
  766. /* Handle errors embedded within the netlink message. */
  767. ofpbuf_use_stub(&tmp_reply, reply_buf, sizeof reply_buf);
  768. ofpbuf_set_size(&tmp_reply, sizeof reply_buf);
  769. if (nl_msg_nlmsgerr(&tmp_reply, &txn->error)) {
  770. if (txn->reply) {
  771. ofpbuf_clear(txn->reply);
  772. }
  773. if (txn->error) {
774. VLOG_DBG_RL(&rl, "received NAK error=%d (%s)",
775. txn->error, ovs_strerror(txn->error));
  776. }
  777. } else {
  778. txn->error = 0;
  779. if (txn->reply) {
  780. /* Copy the reply to the buffer specified by the caller. */
  781. if (reply_len > txn->reply->allocated) {
  782. ofpbuf_reinit(txn->reply, reply_len);
  783. }
  784. memcpy(ofpbuf_data(txn->reply), reply_buf, reply_len);
  785. ofpbuf_set_size(txn->reply, reply_len);
  786. }
  787. }
  788. ofpbuf_uninit(&tmp_reply);
  789. /* Count the number of successful transactions. */
  790. (*done)++;
  791. }
  792. if (!error) {
  793. COVERAGE_ADD(netlink_sent, n);
  794. }
  795. #endif
  796. return error;
  797. }
  798. static void
  799. nl_sock_transact_multiple(struct nl_sock *sock,
  800. struct nl_transaction **transactions, size_t n)
  801. {
  802. int max_batch_count;
  803. int error;
  804. if (!n) {
  805. return;
  806. }
  807. /* In theory, every request could have a 64 kB reply. But the default and
  808. * maximum socket rcvbuf size with typical Dom0 memory sizes both tend to
  809. * be a bit below 128 kB, so that would only allow a single message in a
  810. * "batch". So we assume that replies average (at most) 4 kB, which allows
  811. * a good deal of batching.
  812. *
  813. * In practice, most of the requests that we batch either have no reply at
  814. * all or a brief reply. */
  815. max_batch_count = MAX(sock->rcvbuf / 4096, 1);
  816. max_batch_count = MIN(max_batch_count, max_iovs);
  817. while (n > 0) {
  818. size_t count, bytes;
  819. size_t done;
  820. /* Batch up to 'max_batch_count' transactions. But cap it at about a
  821. * page of requests total because big skbuffs are expensive to
  822. * allocate in the kernel. */
  823. #if defined(PAGESIZE)
  824. enum { MAX_BATCH_BYTES = MAX(1, PAGESIZE - 512) };
  825. #else
  826. enum { MAX_BATCH_BYTES = 4096 - 512 };
  827. #endif
  828. bytes = ofpbuf_size(transactions[0]->request);
  829. for (count = 1; count < n && count < max_batch_count; count++) {
  830. if (bytes + ofpbuf_size(transactions[count]->request) > MAX_BATCH_BYTES) {
  831. break;
  832. }
  833. bytes += ofpbuf_size(transactions[count]->request);
  834. }
  835. error = nl_sock_transact_multiple__(sock, transactions, count, &done);
  836. transactions += done;
  837. n -= done;
  838. if (error == ENOBUFS) {
  839. VLOG_DBG_RL(&rl, "receive buffer overflow, resending request");
  840. } else if (error) {
  841. VLOG_ERR_RL(&rl, "transaction error (%s)", ovs_strerror(error));
  842. nl_sock_record_errors__(transactions, n, error);
  843. }
  844. }
  845. }
  846. static int
  847. nl_sock_transact(struct nl_sock *sock, const struct ofpbuf *request,
  848. struct ofpbuf **replyp)
  849. {
  850. struct nl_transaction *transactionp;
  851. struct nl_transaction transaction;
  852. transaction.request = CONST_CAST(struct ofpbuf *, request);
  853. transaction.reply = replyp ? ofpbuf_new(1024) : NULL;
  854. transactionp = &transaction;
  855. nl_sock_transact_multiple(sock, &transactionp, 1);
  856. if (replyp) {
  857. if (transaction.error) {
  858. ofpbuf_delete(transaction.reply);
  859. *replyp = NULL;
  860. } else {
  861. *replyp = transaction.reply;
  862. }
  863. }
  864. return transaction.error;
  865. }
  866. /* Drain all the messages currently in 'sock''s receive queue. */
  867. int
  868. nl_sock_drain(struct nl_sock *sock)
  869. {
  870. #ifdef _WIN32
  871. return 0;
  872. #else
  873. return drain_rcvbuf(sock->fd);
  874. #endif
  875. }
  876. /* Starts a Netlink "dump" operation, by sending 'request' to the kernel on a
  877. * Netlink socket created with the given 'protocol', and initializes 'dump' to
  878. * reflect the state of the operation.
  879. *
  880. * 'request' must contain a Netlink message. Before sending the message,
  881. * nlmsg_len will be finalized to match request->size, and nlmsg_pid will be
  882. * set to the Netlink socket's pid. NLM_F_DUMP and NLM_F_ACK will be set in
  883. * nlmsg_flags.
  884. *
  885. * The design of this Netlink socket library ensures that the dump is reliable.
  886. *
  887. * This function provides no status indication. nl_dump_done() provides an
  888. * error status for the entire dump operation.
  889. *
  890. * The caller must eventually destroy 'request'.
  891. */
  892. void
  893. nl_dump_start(struct nl_dump *dump, int protocol, const struct ofpbuf *request)
  894. {
  895. nl_msg_nlmsghdr(request)->nlmsg_flags |= NLM_F_DUMP | NLM_F_ACK;
  896. ovs_mutex_init(&dump->mutex);
  897. ovs_mutex_lock(&dump->mutex);
  898. dump->status = nl_pool_alloc(protocol, &dump->sock);
  899. if (!dump->status) {
  900. dump->status = nl_sock_send__(dump->sock, request,
  901. nl_sock_allocate_seq(dump->sock, 1),
  902. true);
  903. }
  904. dump->nl_seq = nl_msg_nlmsghdr(request)->nlmsg_seq;
  905. ovs_mutex_unlock(&dump->mutex);
  906. }
  907. static int
  908. nl_dump_refill(struct nl_dump *dump, struct ofpbuf *buffer)
  909. OVS_REQUIRES(dump->mutex)
  910. {
  911. struct nlmsghdr *nlmsghdr;
  912. int error;
  913. while (!ofpbuf_size(buffer)) {
  914. error = nl_sock_recv__(dump->sock, buffer, false);
  915. if (error) {
  916. /* The kernel never blocks providing the results of a dump, so
  917. * error == EAGAIN means that we've read the whole thing, and
  918. * therefore transform it into EOF. (The kernel always provides
  919. * NLMSG_DONE as a sentinel. Some other thread must have received
  920. * that already but not yet signaled it in 'status'.)
  921. *
  922. * Any other error is just an error. */
  923. return error == EAGAIN ? EOF : error;
  924. }
  925. nlmsghdr = nl_msg_nlmsghdr(buffer);
  926. if (dump->nl_seq != nlmsghdr->nlmsg_seq) {
  927. VLOG_DBG_RL(&rl, "ignoring seq %#"PRIx32" != expected %#"PRIx32,
  928. nlmsghdr->nlmsg_seq, dump->nl_seq);
  929. ofpbuf_clear(buffer);
  930. }
  931. }
  932. if (nl_msg_nlmsgerr(buffer, &error) && error) {
  933. VLOG_INFO_RL(&rl, "netlink dump request error (%s)",
  934. ovs_strerror(error));
  935. ofpbuf_clear(buffer);
  936. return error;
  937. }
  938. return 0;
  939. }
  940. static int
  941. nl_dump_next__(struct ofpbuf *reply, struct ofpbuf *buffer)
  942. {
  943. struct nlmsghdr *nlmsghdr = nl_msg_next(buffer, reply);
  944. if (!nlmsghdr) {
  945. VLOG_WARN_RL(&rl, "netlink dump contains message fragment");
  946. return EPROTO;
  947. } else if (nlmsghdr->nlmsg_type == NLMSG_DONE) {
  948. return EOF;
  949. } else {
  950. return 0;
  951. }
  952. }
  953. /* Attempts to retrieve another reply from 'dump' into 'buffer'. 'dump' must
  954. * have been initialized with nl_dump_start(), and 'buffer' must have been
  955. * initialized. 'buffer' should be at least NL_DUMP_BUFSIZE bytes long.
  956. *
  957. * If successful, returns true and points 'reply->data' and
  958. * 'ofpbuf_size(reply)' to the message that was retrieved. The caller must not
  959. * modify 'reply' (because it points within 'buffer', which will be used by
  960. * future calls to this function).
  961. *
  962. * On failure, returns false and sets 'reply->data' to NULL and
  963. * 'ofpbuf_size(reply)' to 0. Failure might indicate an actual error or merely
  964. * the end of replies. An error status for the entire dump operation is
  965. * provided when it is completed by calling nl_dump_done().
  966. *
  967. * Multiple threads may call this function, passing the same nl_dump, however
  968. * each must provide independent buffers. This function may cache multiple
  969. * replies in the buffer, and these will be processed before more replies are
  970. * fetched. When this function returns false, other threads may continue to
  971. * process replies in their buffers, but they will not fetch more replies.
  972. */
  973. bool
  974. nl_dump_next(struct nl_dump *dump, struct ofpbuf *reply, struct ofpbuf *buffer)
  975. {
  976. int retval = 0;
  977. /* If the buffer is empty, refill it.
  978. *
  979. * If the buffer is not empty, we don't check the dump's status.
  980. * Otherwise, we could end up skipping some of the dump results if thread A
  981. * hits EOF while thread B is in the midst of processing a batch. */
  982. if (!ofpbuf_size(buffer)) {
  983. ovs_mutex_lock(&dump->mutex);
  984. if (!dump->status) {
  985. /* Take the mutex here to avoid an in-kernel race. If two threads
  986. * try to read from a Netlink dump socket at once, then the socket
  987. * error can be set to EINVAL, which will be encountered on the
  988. * next recv on that socket, which could be anywhere due to the way
  989. * that we pool Netlink sockets. Serializing the recv calls avoids
  990. * the issue. */
  991. dump->status = nl_dump_refill(dump, buffer);
  992. }
  993. retval = dump->status;
  994. ovs_mutex_unlock(&dump->mutex);
  995. }
  996. /* Fetch the next message from the buffer. */
  997. if (!retval) {
  998. retval = nl_dump_next__(reply, buffer);
  999. if (retval) {
  1000. /* Record 'retval' as the dump status, but don't overwrite an error
  1001. * with EOF. */
  1002. ovs_mutex_lock(&dump->mutex);
  1003. if (dump->status <= 0) {
  1004. dump->status = retval;
  1005. }
  1006. ovs_mutex_unlock(&dump->mutex);
  1007. }
  1008. }
  1009. if (retval) {
  1010. ofpbuf_set_data(reply, NULL);
  1011. ofpbuf_set_size(reply, 0);
  1012. }
  1013. return !retval;
  1014. }
  1015. /* Completes Netlink dump operation 'dump', which must have been initialized
  1016. * with nl_dump_start(). Returns 0 if the dump operation was error-free,
  1017. * otherwise a positive errno value describing the problem. */
  1018. int
  1019. nl_dump_done(struct nl_dump *dump)
  1020. {
  1021. int status;
  1022. ovs_mutex_lock(&dump->mutex);
  1023. status = dump->status;
  1024. ovs_mutex_unlock(&dump->mutex);
  1025. /* Drain any remaining messages that the client didn't read. Otherwise the
  1026. * kernel will continue to queue them up and waste buffer space.
  1027. *
  1028. * XXX We could just destroy and discard the socket in this case. */
  1029. if (!status) {
  1030. uint64_t tmp_reply_stub[NL_DUMP_BUFSIZE / 8];
  1031. struct ofpbuf reply, buf;
  1032. ofpbuf_use_stub(&buf, tmp_reply_stub, sizeof tmp_reply_stub);
  1033. while (nl_dump_next(dump, &reply, &buf)) {
  1034. /* Nothing to do. */
  1035. }
  1036. ofpbuf_uninit(&buf);
  1037. ovs_mutex_lock(&dump->mutex);
  1038. status = dump->status;
  1039. ovs_mutex_unlock(&dump->mutex);
  1040. ovs_assert(status);
  1041. }
  1042. nl_pool_release(dump->sock);
  1043. ovs_mutex_destroy(&dump->mutex);
  1044. return status == EOF ? 0 : status;
  1045. }
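/* Example usage (an illustrative sketch; the example_* helper is
 * hypothetical): a complete dump of all registered Generic Netlink families
 * via the controller family.  nl_dump_start() sets NLM_F_DUMP | NLM_F_ACK on
 * the request itself, and nl_dump_done() reports the overall status. */
static int
example_dump_genl_families(void)
{
    uint64_t buf_stub[NL_DUMP_BUFSIZE / 8];
    struct ofpbuf request, reply, buf;
    struct nl_dump dump;

    ofpbuf_init(&request, 0);
    nl_msg_put_genlmsghdr(&request, 0, GENL_ID_CTRL, NLM_F_REQUEST,
                          CTRL_CMD_GETFAMILY, 1);
    nl_dump_start(&dump, NETLINK_GENERIC, &request);
    ofpbuf_uninit(&request);

    ofpbuf_use_stub(&buf, buf_stub, sizeof buf_stub);
    while (nl_dump_next(&dump, &reply, &buf)) {
        /* 'reply' points into 'buf'; parse it here without modifying it. */
    }
    ofpbuf_uninit(&buf);

    return nl_dump_done(&dump);
}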
  1046. #ifdef _WIN32
  1047. /* Pend an I/O request in the driver. The driver completes the I/O whenever
  1048. * an event or a packet is ready to be read. Once the I/O is completed
  1049. * the overlapped structure event associated with the pending I/O will be set
  1050. */
  1051. static int
  1052. pend_io_request(struct nl_sock *sock)
  1053. {
  1054. struct ofpbuf request;
  1055. uint64_t request_stub[128];
  1056. struct ovs_header *ovs_header;
  1057. struct nlmsghdr *nlmsg;
  1058. uint32_t seq;
  1059. int retval;
  1060. int error;
  1061. DWORD bytes;
  1062. OVERLAPPED *overlapped = CONST_CAST(OVERLAPPED *, &sock->overlapped);
  1063. int ovs_msg_size = sizeof (struct nlmsghdr) + sizeof (struct genlmsghdr) +
  1064. sizeof (struct ovs_header);
  1065. ofpbuf_use_stub(&request, request_stub, sizeof request_stub);
  1066. seq = nl_sock_allocate_seq(sock, 1);
  1067. nl_msg_put_genlmsghdr(&request, 0, OVS_WIN_NL_CTRL_FAMILY_ID, 0,
  1068. OVS_CTRL_CMD_WIN_PEND_REQ, OVS_WIN_CONTROL_VERSION);
  1069. nlmsg = nl_msg_nlmsghdr(&request);
  1070. nlmsg->nlmsg_seq = seq;
  1071. nlmsg->nlmsg_pid = sock->pid;
  1072. ovs_header = ofpbuf_put_uninit(&request, sizeof *ovs_header);
  1073. ovs_header->dp_ifindex = 0;
  1074. if (!DeviceIoControl(sock->handle, OVS_IOCTL_WRITE,
  1075. ofpbuf_data(&request), ofpbuf_size(&request),
  1076. NULL, 0, &bytes, overlapped)) {
  1077. error = GetLastError();
  1078. /* Check if the I/O got pended */
  1079. if (error != ERROR_IO_INCOMPLETE && error != ERROR_IO_PENDING) {
  1080. VLOG_ERR("nl_sock_wait failed - %s\n", ovs_format_message(error));
  1081. retval = EINVAL;
  1082. goto done;
  1083. }
  1084. } else {
  1085. /* The I/O was completed synchronously */
  1086. poll_immediate_wake();
  1087. }
  1088. retval = 0;
  1089. done:
  1090. ofpbuf_uninit(&request);
  1091. return retval;
  1092. }
  1093. #endif /* _WIN32 */
  1094. /* Causes poll_block() to wake up when any of the specified 'events' (which is
1095. * an OR'd combination of POLLIN, POLLOUT, etc.) occur on 'sock'.
  1096. * On Windows, 'sock' is not treated as const, and may be modified. */
  1097. void
  1098. nl_sock_wait(const struct nl_sock *sock, short int events)
  1099. {
  1100. #ifdef _WIN32
  1101. if (sock->overlapped.Internal != STATUS_PENDING) {
  1102. pend_io_request(CONST_CAST(struct nl_sock *, sock));
  1103. /* XXX: poll_wevent_wait(sock->overlapped.hEvent); */
  1104. }
  1105. poll_immediate_wake(); /* XXX: temporary. */
  1106. #else
  1107. poll_fd_wait(sock->fd, events);
  1108. #endif
  1109. }
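/* Example usage (an illustrative sketch; the example_* helper is
 * hypothetical, and 'buf' is assumed to be initialized by the caller with at
 * least NLMSG_HDRLEN bytes allocated): wait for 'sock' to become readable in
 * a poll loop, then read whatever arrived without blocking. */
static int
example_wait_and_read(struct nl_sock *sock, struct ofpbuf *buf)
{
    for (;;) {
        int error = nl_sock_recv(sock, buf, false);
        if (error != EAGAIN) {
            return error;           /* A message (error == 0) or a failure. */
        }
        nl_sock_wait(sock, POLLIN);
        poll_block();
    }
}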
  1110. /* Returns the underlying fd for 'sock', for use in "poll()"-like operations
  1111. * that can't use nl_sock_wait().
  1112. *
  1113. * It's a little tricky to use the returned fd correctly, because nl_sock does
  1114. * "copy on write" to allow a single nl_sock to be used for notifications,
  1115. * transactions, and dumps. If 'sock' is used only for notifications and
  1116. * transactions (and never for dump) then the usage is safe. */
  1117. int
  1118. nl_sock_fd(const struct nl_sock *sock)
  1119. {
  1120. #ifdef _WIN32
  1121. BUILD_ASSERT_DECL(sizeof sock->handle == sizeof(int));
  1122. return (int)sock->handle;
  1123. #else
  1124. return sock->fd;
  1125. #endif
  1126. }
  1127. /* Returns the PID associated with this socket. */
  1128. uint32_t
  1129. nl_sock_pid(const struct nl_sock *sock)
  1130. {
  1131. return sock->pid;
  1132. }
  1133. /* Miscellaneous. */
  1134. struct genl_family {
  1135. struct hmap_node hmap_node;
  1136. uint16_t id;
  1137. char *name;
  1138. };
  1139. static struct hmap genl_families = HMAP_INITIALIZER(&genl_families);
  1140. static const struct nl_policy family_policy[CTRL_ATTR_MAX + 1] = {
  1141. [CTRL_ATTR_FAMILY_ID] = {.type = NL_A_U16},
  1142. [CTRL_ATTR_MCAST_GROUPS] = {.type = NL_A_NESTED, .optional = true},
  1143. };
  1144. static struct genl_family *
  1145. find_genl_family_by_id(uint16_t id)
  1146. {
  1147. struct genl_family *family;
  1148. HMAP_FOR_EACH_IN_BUCKET (family, hmap_node, hash_int(id, 0),
  1149. &genl_families) {
  1150. if (family->id == id) {
  1151. return family;
  1152. }
  1153. }
  1154. return NULL;
  1155. }
  1156. static void
  1157. define_genl_family(uint16_t id, const char *name)
  1158. {
  1159. struct genl_family *family = find_genl_family_by_id(id);
  1160. if (family) {
  1161. if (!strcmp(family->name, name)) {
  1162. return;
  1163. }
  1164. free(family->name);
  1165. } else {
  1166. family = xmalloc(sizeof *family);
  1167. family->id = id;
  1168. hmap_insert(&genl_families, &family->hmap_node, hash_int(id, 0));
  1169. }
  1170. family->name = xstrdup(name);
  1171. }
  1172. static const char *
  1173. genl_family_to_name(uint16_t id)
  1174. {
  1175. if (id == GENL_ID_CTRL) {
  1176. return "control";
  1177. } else {
  1178. struct genl_family *family = find_genl_family_by_id(id);
  1179. return family ? family->name : "unknown";
  1180. }
  1181. }
  1182. #ifndef _WIN32
  1183. static int
  1184. do_lookup_genl_family(const char *name, struct nlattr **attrs,
  1185. struct ofpbuf **replyp)
  1186. {
  1187. struct nl_sock *sock;
  1188. struct ofpbuf request, *reply;
  1189. int error;
  1190. *replyp = NULL;
  1191. error = nl_sock_create(NETLINK_GENERIC, &sock);
  1192. if (error) {
  1193. return error;
  1194. }
  1195. ofpbuf_init(&request, 0);
  1196. nl_msg_put_genlmsghdr(&request, 0, GENL_ID_CTRL, NLM_F_REQUEST,
  1197. CTRL_CMD_GETFAMILY, 1);
  1198. nl_msg_put_string(&request, CTRL_ATTR_FAMILY_NAME, name);
  1199. error = nl_sock_transact(sock, &request, &reply);
  1200. ofpbuf_uninit(&request);
  1201. if (error) {
  1202. nl_sock_destroy(sock);
  1203. return error;
  1204. }
  1205. if (!nl_policy_parse(reply, NLMSG_HDRLEN + GENL_HDRLEN,
  1206. family_policy, attrs, ARRAY_SIZE(family_policy))
  1207. || nl_attr_get_u16(attrs[CTRL_ATTR_FAMILY_ID]) == 0) {
  1208. nl_sock_destroy(sock);
  1209. ofpbuf_delete(reply);
  1210. return EPROTO;
  1211. }
  1212. nl_sock_destroy(sock);
  1213. *replyp = reply;
  1214. return 0;
  1215. }
  1216. #else
  1217. static int
  1218. do_lookup_genl_family(const char *name, struct nlattr **attrs,
  1219. struct ofpbuf **replyp)
  1220. {
  1221. struct nlmsghdr *nlmsg;
  1222. struct ofpbuf *reply;
  1223. int error;
  1224. uint16_t family_id;
  1225. const char *family_name;
  1226. uint32_t family_version;
  1227. uint32_t family_attrmax;
  1228. uint32_t mcgrp_id = OVS_WIN_NL_INVALID_MCGRP_ID;
  1229. const char *mcgrp_name = NULL;
  1230. *replyp = NULL;
  1231. reply = ofpbuf_new(1024);
  1232. /* CTRL_ATTR_MCAST_GROUPS is supported only for VPORT family. */
  1233. if (!strcmp(name, OVS_WIN_CONTROL_FAMILY)) {
  1234. family_id = OVS_WIN_NL_CTRL_FAMILY_ID;
  1235. family_name = OVS_WIN_CONTROL_FAMILY;
  1236. family_version = OVS_WIN_CONTROL_VERSION;
  1237. family_attrmax = OVS_WIN_CONTROL_ATTR_MAX;
  1238. } else if (!strcmp(name, OVS_DATAPATH_FAMILY)) {
  1239. family_id = OVS_WIN_NL_DATAPATH_FAMILY_ID;
  1240. family_name = OVS_DATAPATH_FAMILY;
  1241. family_version = OVS_DATAPATH_VERSION;
  1242. family_attrmax = OVS_DP_ATTR_MAX;
  1243. } else if (!strcmp(name, OVS_PACKET_FAMILY)) {
  1244. family_id = OVS_WIN_NL_PACKET_FAMILY_ID;
  1245. family_name = OVS_PACKET_FAMILY;
  1246. family_version = OVS_PACKET_VERSION;
  1247. family_attrmax = OVS_PACKET_ATTR_MAX;
  1248. } else if (!strcmp(name, OVS_VPORT_FAMILY)) {
  1249. family_id = OVS_WIN_NL_VPORT_FAMILY_ID;
  1250. family_name = OVS_VPORT_FAMILY;
  1251. family_version = OVS_VPORT_VERSION;
  1252. family_attrmax = OVS_VPORT_ATTR_MAX;
  1253. mcgrp_id = OVS_WIN_NL_VPORT_MCGRP_ID;
  1254. mcgrp_name = OVS_VPORT_MCGROUP;
  1255. } else if (!strcmp(name, OVS_FLOW_FAMILY)) {
  1256. family_id = OVS_WIN_NL_FLOW_FAMILY_ID;
  1257. family_name = OVS_FLOW_FAMILY;
  1258. family_version = OVS_FLOW_VERSION;
  1259. family_attrmax = OVS_FLOW_ATTR_MAX;
  1260. } else if (!strcmp(name, OVS_WIN_NETDEV_FAMILY)) {
  1261. family_id = OVS_WIN_NL_NETDEV_FAMILY_ID;
  1262. family_name = OVS_WIN_NETDEV_FAMILY;
  1263. family_version = OVS_WIN_NETDEV_VERSION;
  1264. family_attrmax = OVS_WIN_NETDEV_ATTR_MAX;
  1265. } else {
  1266. ofpbuf_delete(reply);
  1267. return EINVAL;
  1268. }
  1269. nl_msg_put_genlmsghdr(reply, 0, GENL_ID_CTRL, 0,
  1270. CTRL_CMD_NEWFAMILY, family_version);
  1271. /* CTRL_ATTR_HDRSIZE and CTRL_ATTR_OPS are not populated, but the
  1272. * callers do not seem to need them. */
  1273. nl_msg_put_u16(reply, CTRL_ATTR_FAMILY_ID, family_id);
  1274. nl_msg_put_string(reply, CTRL_ATTR_FAMILY_NAME, family_name);
  1275. nl_msg_put_u32(reply, CTRL_ATTR_VERSION, family_version);
  1276. nl_msg_put_u32(reply, CTRL_ATTR_MAXATTR, family_attrmax);
  1277. if (mcgrp_id != OVS_WIN_NL_INVALID_MCGRP_ID) {
  1278. size_t mcgrp_ofs1 = nl_msg_start_nested(reply, CTRL_ATTR_MCAST_GROUPS);
  1279. size_t mcgrp_ofs2= nl_msg_start_nested(reply,
  1280. OVS_WIN_NL_VPORT_MCGRP_ID - OVS_WIN_NL_MCGRP_START_ID);
  1281. nl_msg_put_u32(reply, CTRL_ATTR_MCAST_GRP_ID, mcgrp_id);
  1282. ovs_assert(mcgrp_name != NULL);
  1283. nl_msg_put_string(reply, CTRL_ATTR_MCAST_GRP_NAME, mcgrp_name);
  1284. nl_msg_end_nested(reply, mcgrp_ofs2);
  1285. nl_msg_end_nested(reply, mcgrp_ofs1);
  1286. }
  1287. /* Set the total length of the netlink message. */
  1288. nlmsg = nl_msg_nlmsghdr(reply);
  1289. nlmsg->nlmsg_len = ofpbuf_size(reply);
  1290. if (!nl_policy_parse(reply, NLMSG_HDRLEN + GENL_HDRLEN,
  1291. family_policy, attrs, ARRAY_SIZE(family_policy))
  1292. || nl_attr_get_u16(attrs[CTRL_ATTR_FAMILY_ID]) == 0) {
  1293. ofpbuf_delete(reply);
  1294. return EPROTO;
  1295. }
  1296. *replyp = reply;
  1297. return 0;
  1298. }
  1299. #endif
  1300. /* Finds the multicast group called 'group_name' in genl family 'family_name'.
  1301. * When successful, writes its result to 'multicast_group' and returns 0.
  1302. * Otherwise, clears 'multicast_group' and returns a positive error code.
  1303. */
  1304. int
  1305. nl_lookup_genl_mcgroup(const char *family_name, const char *group_name,
  1306. unsigned int *multicast_group)
  1307. {
  1308. struct nlattr *family_attrs[ARRAY_SIZE(family_policy)];
  1309. const struct nlattr *mc;
  1310. struct ofpbuf *reply;
  1311. unsigned int left;
  1312. int error;
  1313. *multicast_group = 0;
  1314. error = do_lookup_genl_family(family_name, family_attrs, &reply);
  1315. if (error) {
  1316. return error;
  1317. }
  1318. if (!family_attrs[CTRL_ATTR_MCAST_GROUPS]) {
  1319. error = EPROTO;
  1320. goto exit;
  1321. }
  1322. NL_NESTED_FOR_EACH (mc, left, family_attrs[CTRL_ATTR_MCAST_GROUPS]) {
  1323. static const struct nl_policy mc_policy[] = {
  1324. [CTRL_ATTR_MCAST_GRP_ID] = {.type = NL_A_U32},
  1325. [CTRL_ATTR_MCAST_GRP_NAME] = {.type = NL_A_STRING},
  1326. };
  1327. struct nlattr *mc_attrs[ARRAY_SIZE(mc_policy)];
  1328. const char *mc_name;
  1329. if (!nl_parse_nested(mc, mc_policy, mc_attrs, ARRAY_SIZE(mc_policy))) {
  1330. error = EPROTO;
  1331. goto exit;
  1332. }
  1333. mc_name = nl_attr_get_string(mc_attrs[CTRL_ATTR_MCAST_GRP_NAME]);
  1334. if (!strcmp(group_name, mc_name)) {
  1335. *multicast_group =
  1336. nl_attr_get_u32(mc_attrs[CTRL_ATTR_MCAST_GRP_ID]);
  1337. error = 0;
  1338. goto exit;
  1339. }
  1340. }
  1341. error = EPROTO;
  1342. exit:
  1343. ofpbuf_delete(reply);
  1344. return error;
  1345. }
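/* Example usage (an illustrative sketch; the example_* helper is
 * hypothetical): look up the vport multicast group by name and subscribe a
 * socket to it, combining nl_lookup_genl_mcgroup() with
 * nl_sock_join_mcgroup() above. */
static int
example_subscribe_vport_group(struct nl_sock *sock)
{
    unsigned int mcgroup;
    int error;

    error = nl_lookup_genl_mcgroup(OVS_VPORT_FAMILY, OVS_VPORT_MCGROUP,
                                   &mcgroup);
    if (error) {
        return error;
    }
    return nl_sock_join_mcgroup(sock, mcgroup);
}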
  1346. /* If '*number' is 0, translates the given Generic Netlink family 'name' to a
  1347. * number and stores it in '*number'. If successful, returns 0 and the caller
  1348. * may use '*number' as the family number. On failure, returns a positive
  1349. * errno value and '*number' caches the errno value. */
  1350. int
  1351. nl_lookup_genl_family(const char *name, int *number)
  1352. {
  1353. if (*number == 0) {
  1354. struct nlattr *attrs[ARRAY_SIZE(family_policy)];
  1355. struct ofpbuf *reply;
  1356. int error;
  1357. error = do_lookup_genl_family(name, attrs, &reply);
  1358. if (!error) {
  1359. *number = nl_attr_get_u16(attrs[CTRL_ATTR_FAMILY_ID]);
  1360. define_genl_family(*number, name);
  1361. } else {
  1362. *number = -error;
  1363. }
  1364. ofpbuf_delete(reply);
  1365. ovs_assert(*number != 0);
  1366. }
  1367. return *number > 0 ? 0 : -*number;
  1368. }
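/* Example usage (an illustrative sketch; the example_* helper is
 * hypothetical): the usual caching pattern for family numbers.  The static
 * starts at 0, so the first call performs the lookup; afterwards the cached
 * family number (or negated errno) is reused. */
static int
example_datapath_family(int *familyp)
{
    static int ovs_datapath_family = 0;
    int error;

    error = nl_lookup_genl_family(OVS_DATAPATH_FAMILY, &ovs_datapath_family);
    if (!error) {
        *familyp = ovs_datapath_family;
    }
    return error;
}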
  1369. struct nl_pool {
  1370. struct nl_sock *socks[16];
  1371. int n;
  1372. };
  1373. static struct ovs_mutex pool_mutex = OVS_MUTEX_INITIALIZER;
  1374. static struct nl_pool pools[MAX_LINKS] OVS_GUARDED_BY(pool_mutex);
  1375. static int
  1376. nl_pool_alloc(int protocol, struct nl_sock **sockp)
  1377. {
  1378. struct nl_sock *sock = NULL;
  1379. struct nl_pool *pool;
  1380. ovs_assert(protocol >= 0 && protocol < ARRAY_SIZE(pools));
  1381. ovs_mutex_lock(&pool_mutex);
  1382. pool = &pools[protocol];
  1383. if (pool->n > 0) {
  1384. sock = pool->socks[--pool->n];
  1385. }
  1386. ovs_mutex_unlock(&pool_mutex);
  1387. if (sock) {
  1388. *sockp = sock;
  1389. return 0;
  1390. } else {
  1391. return nl_sock_create(protocol, sockp);
  1392. }
  1393. }
  1394. static void
  1395. nl_pool_release(struct nl_sock *sock)
  1396. {
  1397. if (sock) {
  1398. struct nl_pool *pool = &pools[sock->protocol];
  1399. ovs_mutex_lock(&pool_mutex);
  1400. if (pool->n < ARRAY_SIZE(pool->socks)) {
  1401. pool->socks[pool->n++] = sock;
  1402. sock = NULL;
  1403. }
  1404. ovs_mu

Large files are truncated; the remainder of this file is not shown.