
/lib/netlink-socket.h

https://github.com/noironetworks/ovs
/*
 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef NETLINK_SOCKET_H
#define NETLINK_SOCKET_H 1

/* Netlink socket definitions.
 *
 * This header file defines functions for working with Netlink sockets.  Only
 * Linux natively supports Netlink sockets, but Netlink is well suited as a
 * basis for extensible low-level protocols, so it can make sense to implement
 * a Netlink layer on other systems.  This doesn't have to be done in exactly
 * the same way as on Linux, as long as the implementation can support the
 * semantics that are important to Open vSwitch.  See "Usage concepts" below
 * for more information.
 *
 * For Netlink protocol definitions, see netlink-protocol.h.  For helper
 * functions for working with Netlink messages, see netlink.h.
 *
 *
 * Usage concepts
 * ==============
 *
 * Netlink is a datagram-based network protocol primarily for communication
 * between user processes and the kernel.  Netlink is specified in RFC 3549,
 * "Linux Netlink as an IP Services Protocol".
 *
 * Netlink is not suitable for use in physical networks of heterogeneous
 * machines because host byte order is used throughout.
 *
 * The AF_NETLINK socket namespace is subdivided into statically numbered
 * protocols, e.g. NETLINK_ROUTE, NETLINK_NETFILTER, provided as the third
 * argument to the socket() function.  Maintaining the assigned numbers became
 * a bit of a problem, so the "Generic Netlink" NETLINK_GENERIC protocol was
 * introduced to map between human-readable names and dynamically assigned
 * numbers.  All recently introduced Netlink protocol messages in Linux
 * (including all of the Open vSwitch specific messages) fall under
 * NETLINK_GENERIC.  The Netlink library provides the nl_lookup_genl_family()
 * function for translating a Generic Netlink name to a number.  On Linux,
 * this queries the kernel Generic Netlink implementation, but on other
 * systems it might be easier to statically assign each of the names used by
 * Open vSwitch and then implement this function entirely in userspace.
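 *
 * For example, a minimal sketch of looking up a family number (the
 * "ovs_datapath" family name here is an illustrative assumption, not
 * something this header defines):
 *
 *     int family;
 *     int error = nl_lookup_genl_family("ovs_datapath", &family);
 *     if (!error) {
 *         ...use 'family' as the protocol number in later requests...
 *     }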
 *
 * Each Netlink socket is distinguished by its Netlink PID, a 32-bit integer
 * that is analogous to a TCP or UDP port number.  The kernel has PID 0.
 *
 * Most Netlink messages manage a kernel table of some kind, e.g. the kernel
 * routing table, ARP table, etc.  Open vSwitch specific messages manage
 * tables of datapaths, ports within datapaths ("vports"), and flows within
 * datapaths.  Open vSwitch also has messages related to network packets
 * received on vports, which aren't really a table.
 *
 * Datagram protocols over a physical network are typically unreliable: in
 * UDP, for example, messages can be dropped, delivered more than once, or
 * delivered out of order.  In Linux, Netlink does not deliver messages out
 * of order or multiple times.  In some cases it can drop messages, but the
 * kernel indicates when a message has been dropped.  The description below
 * of each way Open vSwitch uses Netlink also explains how to work around
 * dropped messages.
 *
 * Open vSwitch uses Netlink in four characteristic ways:
 *
 *    1. Transactions.  A transaction is analogous to a system call, an
 *       ioctl, or an RPC: userspace sends a request to the kernel, which
 *       processes the request synchronously and returns a reply to
 *       userspace.  (Sometimes there is no explicit reply, but even in that
 *       case userspace will receive an immediate reply if there is an
 *       error.)
 *
 *       nl_transact() is the primary interface for transactions over
 *       Netlink.  This function doesn't take a socket as a parameter
 *       because sockets do not have any state related to transactions.
 *
 *       Netlink uses 16-bit "length" fields extensively, which effectively
 *       limits requests and replies to 64 kB.  "Dumps" (see below) are one
 *       way to work around this limit for replies.
 *
 *       In the Linux implementation of Netlink transactions, replies can
 *       sometimes be lost.  When this happens, nl_transact() automatically
 *       executes the transaction again.  This means that it is important
 *       that transactions be idempotent, or that the client be prepared to
 *       tolerate that a transaction might actually execute more than once.
 *
 *       The Linux implementation can execute several transactions at the
 *       same time more efficiently than individually.
 *       nl_transact_multiple() allows for this.  The semantics are no
 *       different from executing each of the transactions individually
 *       with nl_transact().
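 *
 *       As a minimal sketch, a transaction might look like this, building
 *       the request with helpers from netlink.h ('family', 'cmd', and
 *       'version' are placeholders the caller must supply):
 *
 *           struct ofpbuf request;
 *           struct ofpbuf *reply;
 *           int error;
 *
 *           ofpbuf_init(&request, 0);
 *           nl_msg_put_genlmsghdr(&request, 0, family, NLM_F_REQUEST,
 *                                 cmd, version);
 *           error = nl_transact(NETLINK_GENERIC, &request, &reply);
 *           ofpbuf_uninit(&request);
 *           if (!error) {
 *               ...examine 'reply'...
 *               ofpbuf_delete(reply);
 *           }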
 *
 *    2. Dumps.  A dump asks the kernel to provide all of the information in
 *       a table.  It consists of a request and a reply, where the reply
 *       consists of an arbitrary number of messages.  Each message in the
 *       reply is limited to 64 kB, as is the request, but the total size of
 *       the reply can be many times larger.
 *
 *       The reply to a dump is usually generated piece by piece, not
 *       atomically.  The reply can represent an inconsistent snapshot of
 *       the table.  This is especially likely if entries in the table were
 *       being added, deleted, or modified during the dump.
 *
 *       nl_dump_start() begins a dump based on the caller-provided request
 *       and initializes a "struct nl_dump" to identify the dump.
 *       Subsequent calls to nl_dump_next() then obtain the reply, one
 *       message at a time.  Usually, each message gives information about
 *       some entry in a table, e.g. one flow in the Open vSwitch flow
 *       table, or one route in a routing table.  nl_dump_done() ends the
 *       dump.
 *
 *       Linux implements dumps so that messages in a reply do not get lost.
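 *
 *       A minimal dump loop sketch ('protocol' and 'request' are assumed
 *       to be set up by the caller, as for a transaction):
 *
 *           struct nl_dump dump;
 *           struct ofpbuf reply, buf;
 *           uint64_t stub[NL_DUMP_BUFSIZE / 8];
 *           int error;
 *
 *           ofpbuf_use_stub(&buf, stub, sizeof stub);
 *           nl_dump_start(&dump, protocol, &request);
 *           while (nl_dump_next(&dump, &reply, &buf)) {
 *               ...parse one table entry from 'reply'...
 *           }
 *           error = nl_dump_done(&dump);
 *           ofpbuf_uninit(&buf);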
 *
 *    3. Multicast subscriptions.  Most kernel Netlink implementations allow
 *       a process to monitor changes to its table, by subscribing to a
 *       Netlink multicast group dedicated to that table.  Whenever the
 *       table's content changes (e.g. an entry is added, deleted, or
 *       modified), the Netlink implementation sends a message to all
 *       sockets that subscribe to its multicast group, notifying them of
 *       the details of the change.  (This doesn't require much extra work
 *       by the Netlink implementer because the message is generally
 *       identical to the one sent as a reply to the request that changed
 *       the table.)
 *
 *       nl_sock_join_mcgroup() subscribes a socket to a multicast group,
 *       and nl_sock_recv() reads notifications.
 *
 *       If userspace doesn't read messages from a socket subscribed to a
 *       multicast group quickly enough, then notification messages can pile
 *       up in the socket's receive buffer.  If this continues long enough,
 *       the receive buffer will fill up and notifications will be lost.  In
 *       that case, nl_sock_recv() will return ENOBUFS.  The client can then
 *       use a dump to resynchronize with the table state.  (A simple
 *       implementation of multicast groups might take advantage of this by
 *       simply returning ENOBUFS whenever a table changes, without
 *       implementing actual notifications.  This would cause lots of extra
 *       dumps, so it may not be suitable as a production implementation.)
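 *
 *       A sketch of a subscription loop, including ENOBUFS recovery (the
 *       family and group names are illustrative assumptions, and error
 *       checks on the setup calls are omitted):
 *
 *           unsigned int mcgroup;
 *           struct nl_sock *sock;
 *           struct ofpbuf buf;
 *           uint64_t stub[4096 / 8];
 *
 *           nl_lookup_genl_mcgroup("some_family", "some_group", &mcgroup);
 *           nl_sock_create(NETLINK_GENERIC, &sock);
 *           nl_sock_join_mcgroup(sock, mcgroup);
 *           ofpbuf_use_stub(&buf, stub, sizeof stub);
 *           for (;;) {
 *               int error = nl_sock_recv(sock, &buf, true);
 *               if (error == ENOBUFS) {
 *                   ...notifications were dropped: re-dump the table...
 *               } else if (!error) {
 *                   ...process the notification in 'buf'...
 *               }
 *           }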
 *
 *    4. Unicast subscriptions (Open vSwitch specific).  Userspace can
 *       assign one or more Netlink PIDs to a vport as "upcall PIDs".  When
 *       a packet received on the vport does not match any flow in its
 *       datapath's flow table, the kernel hashes some of the packet's
 *       headers, uses the hash to select one of the PIDs, and sends the
 *       packet (encapsulated in an Open vSwitch Netlink message) to the
 *       socket with the selected PID.
 *
 *       nl_sock_recv() reads notifications sent this way.
 *
 *       On the Windows platform specifically, the datapath needs to
 *       allocate a queue for packets, and it does so only when userspace
 *       subscribes to packets on that Netlink socket.  Before closing the
 *       Netlink socket, userspace needs to unsubscribe from packets on
 *       that socket.
 *
 *       nl_sock_subscribe_packets() and nl_sock_unsubscribe_packets() are
 *       Windows specific.
 *
 *       Messages received this way can overflow, just like multicast
 *       subscription messages, and they are reported the same way.
 *       Because packet notification messages do not report the state of a
 *       table, there is no way to recover the dropped packets; they are
 *       simply lost.
 *
 *       The main reason to support multiple PIDs per vport is to increase
 *       fairness, that is, to make it harder for a single high-flow-rate
 *       sender to drown out lower-rate sources.  Multiple PIDs per vport
 *       might also improve packet handling latency or flow setup rate, but
 *       that is not the main goal.
 *
 *       Old versions of the Linux kernel module supported only one PID per
 *       vport, and userspace still copes with this, so a simple or early
 *       implementation might only support one PID per vport too.
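 *
 *       As a sketch, the PID programmed into a vport is simply the PID of
 *       the socket that should receive its upcalls (the vport request
 *       itself uses datapath attributes defined outside this header):
 *
 *           uint32_t upcall_pid = nl_sock_pid(sock);
 *           ...include 'upcall_pid' as the vport's upcall PID attribute in
 *           a vport transaction, then read packets with nl_sock_recv()...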
 *
 *
 * Thread-safety
 * =============
 *
 * Most of the netlink functions are not fully thread-safe: only a single
 * thread may use a given nl_sock or nl_dump at one time.  The exceptions
 * are:
 *
 *    - nl_sock_recv() is conditionally thread-safe: it may be called from
 *      different threads with the same nl_sock, but each caller must
 *      provide an independent receive buffer.
 *
 *    - nl_dump_next() is conditionally thread-safe: it may be called from
 *      different threads with the same nl_dump, but each caller must
 *      provide independent buffers.
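 *
 * For example, several threads may drain one dump in parallel, as long as
 * each runs a loop like this with its own 'reply' and 'buf' (a sketch;
 * 'dump' is the shared nl_dump):
 *
 *     struct ofpbuf reply, buf;
 *     uint64_t stub[NL_DUMP_BUFSIZE / 8];
 *
 *     ofpbuf_use_stub(&buf, stub, sizeof stub);
 *     while (nl_dump_next(&dump, &reply, &buf)) {
 *         ...process 'reply'...
 *     }
 *     ofpbuf_uninit(&buf);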
 */

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include "ofpbuf.h"
#include "ovs-atomic.h"
#include "ovs-thread.h"

struct nl_sock;

#ifndef HAVE_NETLINK
#ifndef _WIN32
#error "netlink-socket.h is only for hosts that support Netlink sockets"
#endif
#endif

/* Netlink sockets. */

int nl_sock_create(int protocol, struct nl_sock **);
int nl_sock_clone(const struct nl_sock *, struct nl_sock **);
void nl_sock_destroy(struct nl_sock *);

int nl_sock_join_mcgroup(struct nl_sock *, unsigned int multicast_group);
int nl_sock_leave_mcgroup(struct nl_sock *, unsigned int multicast_group);

#ifdef _WIN32
int nl_sock_subscribe_packets(struct nl_sock *sock);
int nl_sock_unsubscribe_packets(struct nl_sock *sock);
#endif

int nl_sock_send(struct nl_sock *, const struct ofpbuf *, bool wait);
int nl_sock_send_seq(struct nl_sock *, const struct ofpbuf *,
                     uint32_t nlmsg_seq, bool wait);
int nl_sock_recv(struct nl_sock *, struct ofpbuf *, bool wait);

int nl_sock_drain(struct nl_sock *);

void nl_sock_wait(const struct nl_sock *, short int events);
int nl_sock_fd(const struct nl_sock *);

uint32_t nl_sock_pid(const struct nl_sock *);
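
/* Example: waiting for a Netlink socket in the OVS poll loop.  A sketch;
 * poll_block() comes from poll-loop.h, which this header does not include:
 *
 *     nl_sock_wait(sock, POLLIN);
 *     poll_block();
 *     ...a subsequent nl_sock_recv(sock, &buf, false) will not block if a
 *     message has arrived...
 */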

/* Batching transactions. */

struct nl_transaction {
    /* Filled in by client. */
    struct ofpbuf *request;     /* Request to send. */

    /* The client must initialize 'reply' to one of:
     *
     *    - NULL, if it does not care to examine the reply.
     *
     *    - Otherwise, to an ofpbuf with a memory allocation of at least
     *      NLMSG_HDRLEN bytes.
     */
    struct ofpbuf *reply;       /* Reply (empty if reply was an error code). */
    int error;                  /* Positive errno value, 0 if no error. */
};

/* Transactions without an allocated socket. */
int nl_transact(int protocol, const struct ofpbuf *request,
                struct ofpbuf **replyp);
void nl_transact_multiple(int protocol, struct nl_transaction **, size_t n);
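
/* Example: a sketch of batching two transactions (request construction and
 * error handling elided; each 'reply' is NULL because this caller does not
 * examine the replies):
 *
 *     struct nl_transaction txns[2];
 *     struct nl_transaction *txnp[2];
 *     size_t i;
 *
 *     for (i = 0; i < 2; i++) {
 *         txns[i].request = ...an ofpbuf built with netlink.h helpers...;
 *         txns[i].reply = NULL;
 *         txnp[i] = &txns[i];
 *     }
 *     nl_transact_multiple(NETLINK_GENERIC, txnp, 2);
 *     for (i = 0; i < 2; i++) {
 *         if (txns[i].error) {
 *             ...handle positive errno value...
 *         }
 *     }
 */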

/* Table dumping. */
#define NL_DUMP_BUFSIZE         4096

struct nl_dump {
    /* These members are immutable during the lifetime of the nl_dump. */
    struct nl_sock *sock;       /* Socket being dumped. */
    uint32_t nl_seq;            /* Expected nlmsg_seq for replies. */

    /* 'mutex' protects 'status' and serializes access to 'sock'. */
    struct ovs_mutex mutex;     /* Protects 'status', synchronizes recv(). */
    int status OVS_GUARDED;     /* 0: dump in progress,
                                 * positive errno: dump completed with error,
                                 * EOF: dump completed successfully. */
};

void nl_dump_start(struct nl_dump *, int protocol,
                   const struct ofpbuf *request);
bool nl_dump_next(struct nl_dump *, struct ofpbuf *reply, struct ofpbuf *buf);
int nl_dump_done(struct nl_dump *);

/* Miscellaneous */

int nl_lookup_genl_family(const char *name, int *number);
int nl_lookup_genl_mcgroup(const char *family_name, const char *group_name,
                           unsigned int *multicast_group);

#endif /* netlink-socket.h */