/drivers/net/veth.c

// SPDX-License-Identifier: GPL-2.0-only
/*
 *  drivers/net/veth.c
 *
 *  Copyright (C) 2007 OpenVZ http://openvz.org, SWsoft Inc
 *
 * Author: Pavel Emelianov <xemul@openvz.org>
 * Ethtool interface from: Eric W. Biederman <ebiederm@xmission.com>
 *
 */

#include <linux/netdevice.h>
#include <linux/slab.h>
#include <linux/ethtool.h>
#include <linux/etherdevice.h>
#include <linux/u64_stats_sync.h>

#include <net/rtnetlink.h>
#include <net/dst.h>
#include <net/xfrm.h>
#include <net/xdp.h>
#include <linux/veth.h>
#include <linux/module.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/ptr_ring.h>
#include <linux/bpf_trace.h>
#include <linux/net_tstamp.h>

#define DRV_NAME	"veth"
#define DRV_VERSION	"1.0"

#define VETH_XDP_FLAG		BIT(0)
#define VETH_RING_SIZE		256
#define VETH_XDP_HEADROOM	(XDP_PACKET_HEADROOM + NET_IP_ALIGN)

#define VETH_XDP_TX_BULK_SIZE	16

struct veth_stats {
	u64	rx_drops;
	/* xdp */
	u64	xdp_packets;
	u64	xdp_bytes;
	u64	xdp_redirect;
	u64	xdp_drops;
	u64	xdp_tx;
	u64	xdp_tx_err;
	u64	peer_tq_xdp_xmit;
	u64	peer_tq_xdp_xmit_err;
};
struct veth_rq_stats {
	struct veth_stats	vs;
	struct u64_stats_sync	syncp;
};

struct veth_rq {
	struct napi_struct	xdp_napi;
	struct net_device	*dev;
	struct bpf_prog __rcu	*xdp_prog;
	struct xdp_mem_info	xdp_mem;
	struct veth_rq_stats	stats;
	bool			rx_notify_masked;
	struct ptr_ring		xdp_ring;
	struct xdp_rxq_info	xdp_rxq;
};

struct veth_priv {
	struct net_device __rcu	*peer;
	atomic64_t		dropped;
	struct bpf_prog		*_xdp_prog;
	struct veth_rq		*rq;
	unsigned int		requested_headroom;
};

struct veth_xdp_tx_bq {
	struct xdp_frame *q[VETH_XDP_TX_BULK_SIZE];
	unsigned int count;
};

/*
 * ethtool interface
 */

struct veth_q_stat_desc {
	char	desc[ETH_GSTRING_LEN];
	size_t	offset;
};

#define VETH_RQ_STAT(m)	offsetof(struct veth_stats, m)

static const struct veth_q_stat_desc veth_rq_stats_desc[] = {
	{ "xdp_packets",	VETH_RQ_STAT(xdp_packets) },
	{ "xdp_bytes",		VETH_RQ_STAT(xdp_bytes) },
	{ "drops",		VETH_RQ_STAT(rx_drops) },
	{ "xdp_redirect",	VETH_RQ_STAT(xdp_redirect) },
	{ "xdp_drops",		VETH_RQ_STAT(xdp_drops) },
	{ "xdp_tx",		VETH_RQ_STAT(xdp_tx) },
	{ "xdp_tx_errors",	VETH_RQ_STAT(xdp_tx_err) },
};

#define VETH_RQ_STATS_LEN	ARRAY_SIZE(veth_rq_stats_desc)

static const struct veth_q_stat_desc veth_tq_stats_desc[] = {
	{ "xdp_xmit",		VETH_RQ_STAT(peer_tq_xdp_xmit) },
	{ "xdp_xmit_errors",	VETH_RQ_STAT(peer_tq_xdp_xmit_err) },
};

#define VETH_TQ_STATS_LEN	ARRAY_SIZE(veth_tq_stats_desc)

static struct {
	const char string[ETH_GSTRING_LEN];
} ethtool_stats_keys[] = {
	{ "peer_ifindex" },
};
static int veth_get_link_ksettings(struct net_device *dev,
				   struct ethtool_link_ksettings *cmd)
{
	cmd->base.speed		= SPEED_10000;
	cmd->base.duplex	= DUPLEX_FULL;
	cmd->base.port		= PORT_TP;
	cmd->base.autoneg	= AUTONEG_DISABLE;
	return 0;
}

static void veth_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
{
	strlcpy(info->driver, DRV_NAME, sizeof(info->driver));
	strlcpy(info->version, DRV_VERSION, sizeof(info->version));
}

static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf)
{
	char *p = (char *)buf;
	int i, j;

	switch(stringset) {
	case ETH_SS_STATS:
		memcpy(p, &ethtool_stats_keys, sizeof(ethtool_stats_keys));
		p += sizeof(ethtool_stats_keys);
		for (i = 0; i < dev->real_num_rx_queues; i++) {
			for (j = 0; j < VETH_RQ_STATS_LEN; j++) {
				snprintf(p, ETH_GSTRING_LEN,
					 "rx_queue_%u_%.18s",
					 i, veth_rq_stats_desc[j].desc);
				p += ETH_GSTRING_LEN;
			}
		}
		for (i = 0; i < dev->real_num_tx_queues; i++) {
			for (j = 0; j < VETH_TQ_STATS_LEN; j++) {
				snprintf(p, ETH_GSTRING_LEN,
					 "tx_queue_%u_%.18s",
					 i, veth_tq_stats_desc[j].desc);
				p += ETH_GSTRING_LEN;
			}
		}
		break;
	}
}

static int veth_get_sset_count(struct net_device *dev, int sset)
{
	switch (sset) {
	case ETH_SS_STATS:
		return ARRAY_SIZE(ethtool_stats_keys) +
		       VETH_RQ_STATS_LEN * dev->real_num_rx_queues +
		       VETH_TQ_STATS_LEN * dev->real_num_tx_queues;
	default:
		return -EOPNOTSUPP;
	}
}
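
/* ethtool stats layout: data[0] is the peer ifindex, followed by
 * VETH_RQ_STATS_LEN counters for each local rx queue and then
 * VETH_TQ_STATS_LEN xdp_xmit counters per tx queue, accumulated from the
 * peer's rx queues.
 */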
static void veth_get_ethtool_stats(struct net_device *dev,
				   struct ethtool_stats *stats, u64 *data)
{
	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
	struct net_device *peer = rtnl_dereference(priv->peer);
	int i, j, idx;

	data[0] = peer ? peer->ifindex : 0;
	idx = 1;
	for (i = 0; i < dev->real_num_rx_queues; i++) {
		const struct veth_rq_stats *rq_stats = &priv->rq[i].stats;
		const void *stats_base = (void *)&rq_stats->vs;
		unsigned int start;
		size_t offset;

		do {
			start = u64_stats_fetch_begin_irq(&rq_stats->syncp);
			for (j = 0; j < VETH_RQ_STATS_LEN; j++) {
				offset = veth_rq_stats_desc[j].offset;
				data[idx + j] = *(u64 *)(stats_base + offset);
			}
		} while (u64_stats_fetch_retry_irq(&rq_stats->syncp, start));
		idx += VETH_RQ_STATS_LEN;
	}

	if (!peer)
		return;

	rcv_priv = netdev_priv(peer);
	for (i = 0; i < peer->real_num_rx_queues; i++) {
		const struct veth_rq_stats *rq_stats = &rcv_priv->rq[i].stats;
		const void *base = (void *)&rq_stats->vs;
		unsigned int start, tx_idx = idx;
		size_t offset;

		tx_idx += (i % dev->real_num_tx_queues) * VETH_TQ_STATS_LEN;
		do {
			start = u64_stats_fetch_begin_irq(&rq_stats->syncp);
			for (j = 0; j < VETH_TQ_STATS_LEN; j++) {
				offset = veth_tq_stats_desc[j].offset;
				data[tx_idx + j] += *(u64 *)(base + offset);
			}
		} while (u64_stats_fetch_retry_irq(&rq_stats->syncp, start));
	}
}
static const struct ethtool_ops veth_ethtool_ops = {
	.get_drvinfo		= veth_get_drvinfo,
	.get_link		= ethtool_op_get_link,
	.get_strings		= veth_get_strings,
	.get_sset_count		= veth_get_sset_count,
	.get_ethtool_stats	= veth_get_ethtool_stats,
	.get_link_ksettings	= veth_get_link_ksettings,
	.get_ts_info		= ethtool_op_get_ts_info,
};

/* general routines */
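
/* Entries on the per-queue xdp_ring are either sk_buffs or xdp_frames;
 * xdp_frame pointers carry VETH_XDP_FLAG in their low bit so the consumer
 * can tell the two apart.
 */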
static bool veth_is_xdp_frame(void *ptr)
{
	return (unsigned long)ptr & VETH_XDP_FLAG;
}

static void *veth_ptr_to_xdp(void *ptr)
{
	return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG);
}

static void *veth_xdp_to_ptr(void *ptr)
{
	return (void *)((unsigned long)ptr | VETH_XDP_FLAG);
}

static void veth_ptr_free(void *ptr)
{
	if (veth_is_xdp_frame(ptr))
		xdp_return_frame(veth_ptr_to_xdp(ptr));
	else
		kfree_skb(ptr);
}

static void __veth_xdp_flush(struct veth_rq *rq)
{
	/* Write ptr_ring before reading rx_notify_masked */
	smp_mb();
	if (!rq->rx_notify_masked) {
		rq->rx_notify_masked = true;
		napi_schedule(&rq->xdp_napi);
	}
}

static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb)
{
	if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) {
		dev_kfree_skb_any(skb);
		return NET_RX_DROP;
	}

	return NET_RX_SUCCESS;
}

static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb,
			    struct veth_rq *rq, bool xdp)
{
	return __dev_forward_skb(dev, skb) ?: xdp ?
		veth_xdp_rx(rq, skb) :
		netif_rx(skb);
}
static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
	struct veth_rq *rq = NULL;
	struct net_device *rcv;
	int length = skb->len;
	bool rcv_xdp = false;
	int rxq;

	rcu_read_lock();
	rcv = rcu_dereference(priv->peer);
	if (unlikely(!rcv)) {
		kfree_skb(skb);
		goto drop;
	}

	rcv_priv = netdev_priv(rcv);
	rxq = skb_get_queue_mapping(skb);
	if (rxq < rcv->real_num_rx_queues) {
		rq = &rcv_priv->rq[rxq];
		rcv_xdp = rcu_access_pointer(rq->xdp_prog);
		if (rcv_xdp)
			skb_record_rx_queue(skb, rxq);
	}

	skb_tx_timestamp(skb);
	if (likely(veth_forward_skb(rcv, skb, rq, rcv_xdp) == NET_RX_SUCCESS)) {
		if (!rcv_xdp)
			dev_lstats_add(dev, length);
	} else {
drop:
		atomic64_inc(&priv->dropped);
	}

	if (rcv_xdp)
		__veth_xdp_flush(rq);

	rcu_read_unlock();

	return NETDEV_TX_OK;
}
static u64 veth_stats_tx(struct net_device *dev, u64 *packets, u64 *bytes)
{
	struct veth_priv *priv = netdev_priv(dev);

	dev_lstats_read(dev, packets, bytes);
	return atomic64_read(&priv->dropped);
}

static void veth_stats_rx(struct veth_stats *result, struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int i;

	result->peer_tq_xdp_xmit_err = 0;
	result->xdp_packets = 0;
	result->xdp_tx_err = 0;
	result->xdp_bytes = 0;
	result->rx_drops = 0;
	for (i = 0; i < dev->num_rx_queues; i++) {
		u64 packets, bytes, drops, xdp_tx_err, peer_tq_xdp_xmit_err;
		struct veth_rq_stats *stats = &priv->rq[i].stats;
		unsigned int start;

		do {
			start = u64_stats_fetch_begin_irq(&stats->syncp);
			peer_tq_xdp_xmit_err = stats->vs.peer_tq_xdp_xmit_err;
			xdp_tx_err = stats->vs.xdp_tx_err;
			packets = stats->vs.xdp_packets;
			bytes = stats->vs.xdp_bytes;
			drops = stats->vs.rx_drops;
		} while (u64_stats_fetch_retry_irq(&stats->syncp, start));
		result->peer_tq_xdp_xmit_err += peer_tq_xdp_xmit_err;
		result->xdp_tx_err += xdp_tx_err;
		result->xdp_packets += packets;
		result->xdp_bytes += bytes;
		result->rx_drops += drops;
	}
}

static void veth_get_stats64(struct net_device *dev,
			     struct rtnl_link_stats64 *tot)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer;
	struct veth_stats rx;
	u64 packets, bytes;

	tot->tx_dropped = veth_stats_tx(dev, &packets, &bytes);
	tot->tx_bytes = bytes;
	tot->tx_packets = packets;

	veth_stats_rx(&rx, dev);
	tot->tx_dropped += rx.xdp_tx_err;
	tot->rx_dropped = rx.rx_drops + rx.peer_tq_xdp_xmit_err;
	tot->rx_bytes = rx.xdp_bytes;
	tot->rx_packets = rx.xdp_packets;

	rcu_read_lock();
	peer = rcu_dereference(priv->peer);
	if (peer) {
		veth_stats_tx(peer, &packets, &bytes);
		tot->rx_bytes += bytes;
		tot->rx_packets += packets;

		veth_stats_rx(&rx, peer);
		tot->tx_dropped += rx.peer_tq_xdp_xmit_err;
		tot->rx_dropped += rx.xdp_tx_err;
		tot->tx_bytes += rx.xdp_bytes;
		tot->tx_packets += rx.xdp_packets;
	}
	rcu_read_unlock();
}

/* fake multicast ability */
static void veth_set_multicast_list(struct net_device *dev)
{
}
static struct sk_buff *veth_build_skb(void *head, int headroom, int len,
				      int buflen)
{
	struct sk_buff *skb;

	if (!buflen) {
		buflen = SKB_DATA_ALIGN(headroom + len) +
			 SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	}
	skb = build_skb(head, buflen);
	if (!skb)
		return NULL;

	skb_reserve(skb, headroom);
	skb_put(skb, len);

	return skb;
}

static int veth_select_rxq(struct net_device *dev)
{
	return smp_processor_id() % dev->real_num_rx_queues;
}
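
/* Push a batch of xdp_frames into the peer's per-queue ptr_ring.  This only
 * works while the peer has an XDP program attached (and therefore its
 * xdp_ring initialized); frames that are too long or do not fit in the ring
 * are returned and counted as drops.
 */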
static int veth_xdp_xmit(struct net_device *dev, int n,
			 struct xdp_frame **frames,
			 u32 flags, bool ndo_xmit)
{
	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
	int i, ret = -ENXIO, drops = 0;
	struct net_device *rcv;
	unsigned int max_len;
	struct veth_rq *rq;

	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
		return -EINVAL;

	rcu_read_lock();
	rcv = rcu_dereference(priv->peer);
	if (unlikely(!rcv))
		goto out;

	rcv_priv = netdev_priv(rcv);
	rq = &rcv_priv->rq[veth_select_rxq(rcv)];
	/* Non-NULL xdp_prog ensures that xdp_ring is initialized on receive
	 * side. This means an XDP program is loaded on the peer and the peer
	 * device is up.
	 */
	if (!rcu_access_pointer(rq->xdp_prog))
		goto out;

	max_len = rcv->mtu + rcv->hard_header_len + VLAN_HLEN;

	spin_lock(&rq->xdp_ring.producer_lock);
	for (i = 0; i < n; i++) {
		struct xdp_frame *frame = frames[i];
		void *ptr = veth_xdp_to_ptr(frame);

		if (unlikely(frame->len > max_len ||
			     __ptr_ring_produce(&rq->xdp_ring, ptr))) {
			xdp_return_frame_rx_napi(frame);
			drops++;
		}
	}
	spin_unlock(&rq->xdp_ring.producer_lock);

	if (flags & XDP_XMIT_FLUSH)
		__veth_xdp_flush(rq);

	ret = n - drops;
	if (ndo_xmit) {
		u64_stats_update_begin(&rq->stats.syncp);
		rq->stats.vs.peer_tq_xdp_xmit += n - drops;
		rq->stats.vs.peer_tq_xdp_xmit_err += drops;
		u64_stats_update_end(&rq->stats.syncp);
	}

out:
	rcu_read_unlock();
	return ret;
}

static int veth_ndo_xdp_xmit(struct net_device *dev, int n,
			     struct xdp_frame **frames, u32 flags)
{
	int err;

	err = veth_xdp_xmit(dev, n, frames, flags, true);
	if (err < 0) {
		struct veth_priv *priv = netdev_priv(dev);

		atomic64_add(n, &priv->dropped);
	}

	return err;
}
static void veth_xdp_flush_bq(struct veth_rq *rq, struct veth_xdp_tx_bq *bq)
{
	int sent, i, err = 0;

	sent = veth_xdp_xmit(rq->dev, bq->count, bq->q, 0, false);
	if (sent < 0) {
		err = sent;
		sent = 0;
		for (i = 0; i < bq->count; i++)
			xdp_return_frame(bq->q[i]);
	}
	trace_xdp_bulk_tx(rq->dev, sent, bq->count - sent, err);

	u64_stats_update_begin(&rq->stats.syncp);
	rq->stats.vs.xdp_tx += sent;
	rq->stats.vs.xdp_tx_err += bq->count - sent;
	u64_stats_update_end(&rq->stats.syncp);

	bq->count = 0;
}

static void veth_xdp_flush(struct veth_rq *rq, struct veth_xdp_tx_bq *bq)
{
	struct veth_priv *rcv_priv, *priv = netdev_priv(rq->dev);
	struct net_device *rcv;
	struct veth_rq *rcv_rq;

	rcu_read_lock();
	veth_xdp_flush_bq(rq, bq);
	rcv = rcu_dereference(priv->peer);
	if (unlikely(!rcv))
		goto out;

	rcv_priv = netdev_priv(rcv);
	rcv_rq = &rcv_priv->rq[veth_select_rxq(rcv)];
	/* xdp_ring is initialized on receive side? */
	if (unlikely(!rcu_access_pointer(rcv_rq->xdp_prog)))
		goto out;

	__veth_xdp_flush(rcv_rq);
out:
	rcu_read_unlock();
}

static int veth_xdp_tx(struct veth_rq *rq, struct xdp_buff *xdp,
		       struct veth_xdp_tx_bq *bq)
{
	struct xdp_frame *frame = convert_to_xdp_frame(xdp);

	if (unlikely(!frame))
		return -EOVERFLOW;

	if (unlikely(bq->count == VETH_XDP_TX_BULK_SIZE))
		veth_xdp_flush_bq(rq, bq);

	bq->q[bq->count++] = frame;

	return 0;
}
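
/* Handle one xdp_frame that the peer queued via ndo_xdp_xmit: run the
 * attached XDP program and, on XDP_PASS (or when no program is attached
 * anymore), rebuild an skb around the frame's data for the regular stack.
 * XDP_TX and XDP_REDIRECT pass the frame on without copying it.
 */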
static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq,
					struct xdp_frame *frame,
					struct veth_xdp_tx_bq *bq,
					struct veth_stats *stats)
{
	void *hard_start = frame->data - frame->headroom;
	void *head = hard_start - sizeof(struct xdp_frame);
	int len = frame->len, delta = 0;
	struct xdp_frame orig_frame;
	struct bpf_prog *xdp_prog;
	unsigned int headroom;
	struct sk_buff *skb;

	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (likely(xdp_prog)) {
		struct xdp_buff xdp;
		u32 act;

		xdp.data_hard_start = hard_start;
		xdp.data = frame->data;
		xdp.data_end = frame->data + frame->len;
		xdp.data_meta = frame->data - frame->metasize;
		xdp.rxq = &rq->xdp_rxq;

		act = bpf_prog_run_xdp(xdp_prog, &xdp);

		switch (act) {
		case XDP_PASS:
			delta = frame->data - xdp.data;
			len = xdp.data_end - xdp.data;
			break;
		case XDP_TX:
			orig_frame = *frame;
			xdp.data_hard_start = head;
			xdp.rxq->mem = frame->mem;
			if (unlikely(veth_xdp_tx(rq, &xdp, bq) < 0)) {
				trace_xdp_exception(rq->dev, xdp_prog, act);
				frame = &orig_frame;
				stats->rx_drops++;
				goto err_xdp;
			}
			stats->xdp_tx++;
			rcu_read_unlock();
			goto xdp_xmit;
		case XDP_REDIRECT:
			orig_frame = *frame;
			xdp.data_hard_start = head;
			xdp.rxq->mem = frame->mem;
			if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) {
				frame = &orig_frame;
				stats->rx_drops++;
				goto err_xdp;
			}
			stats->xdp_redirect++;
			rcu_read_unlock();
			goto xdp_xmit;
		default:
			bpf_warn_invalid_xdp_action(act);
			/* fall through */
		case XDP_ABORTED:
			trace_xdp_exception(rq->dev, xdp_prog, act);
			/* fall through */
		case XDP_DROP:
			stats->xdp_drops++;
			goto err_xdp;
		}
	}
	rcu_read_unlock();

	headroom = sizeof(struct xdp_frame) + frame->headroom - delta;
	skb = veth_build_skb(head, headroom, len, 0);
	if (!skb) {
		xdp_return_frame(frame);
		stats->rx_drops++;
		goto err;
	}

	xdp_release_frame(frame);
	xdp_scrub_frame(frame);
	skb->protocol = eth_type_trans(skb, rq->dev);
err:
	return skb;
err_xdp:
	rcu_read_unlock();
	xdp_return_frame(frame);
xdp_xmit:
	return NULL;
}
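
/* Run the XDP program on an skb that arrived over the normal xmit path.  If
 * the skb is shared, non-linear or lacks XDP_PACKET_HEADROOM, its data is
 * first copied into a freshly allocated page so the program can safely
 * adjust head and tail.
 */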
static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
					struct sk_buff *skb,
					struct veth_xdp_tx_bq *bq,
					struct veth_stats *stats)
{
	u32 pktlen, headroom, act, metalen;
	void *orig_data, *orig_data_end;
	struct bpf_prog *xdp_prog;
	int mac_len, delta, off;
	struct xdp_buff xdp;

	skb_orphan(skb);

	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (unlikely(!xdp_prog)) {
		rcu_read_unlock();
		goto out;
	}

	mac_len = skb->data - skb_mac_header(skb);
	pktlen = skb->len + mac_len;
	headroom = skb_headroom(skb) - mac_len;

	if (skb_shared(skb) || skb_head_is_locked(skb) ||
	    skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) {
		struct sk_buff *nskb;
		int size, head_off;
		void *head, *start;
		struct page *page;

		size = SKB_DATA_ALIGN(VETH_XDP_HEADROOM + pktlen) +
		       SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
		if (size > PAGE_SIZE)
			goto drop;

		page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
		if (!page)
			goto drop;

		head = page_address(page);
		start = head + VETH_XDP_HEADROOM;
		if (skb_copy_bits(skb, -mac_len, start, pktlen)) {
			page_frag_free(head);
			goto drop;
		}

		nskb = veth_build_skb(head,
				      VETH_XDP_HEADROOM + mac_len, skb->len,
				      PAGE_SIZE);
		if (!nskb) {
			page_frag_free(head);
			goto drop;
		}

		skb_copy_header(nskb, skb);
		head_off = skb_headroom(nskb) - skb_headroom(skb);
		skb_headers_offset_update(nskb, head_off);
		consume_skb(skb);
		skb = nskb;
	}

	xdp.data_hard_start = skb->head;
	xdp.data = skb_mac_header(skb);
	xdp.data_end = xdp.data + pktlen;
	xdp.data_meta = xdp.data;
	xdp.rxq = &rq->xdp_rxq;
	orig_data = xdp.data;
	orig_data_end = xdp.data_end;

	act = bpf_prog_run_xdp(xdp_prog, &xdp);

	switch (act) {
	case XDP_PASS:
		break;
	case XDP_TX:
		get_page(virt_to_page(xdp.data));
		consume_skb(skb);
		xdp.rxq->mem = rq->xdp_mem;
		if (unlikely(veth_xdp_tx(rq, &xdp, bq) < 0)) {
			trace_xdp_exception(rq->dev, xdp_prog, act);
			stats->rx_drops++;
			goto err_xdp;
		}
		stats->xdp_tx++;
		rcu_read_unlock();
		goto xdp_xmit;
	case XDP_REDIRECT:
		get_page(virt_to_page(xdp.data));
		consume_skb(skb);
		xdp.rxq->mem = rq->xdp_mem;
		if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) {
			stats->rx_drops++;
			goto err_xdp;
		}
		stats->xdp_redirect++;
		rcu_read_unlock();
		goto xdp_xmit;
	default:
		bpf_warn_invalid_xdp_action(act);
		/* fall through */
	case XDP_ABORTED:
		trace_xdp_exception(rq->dev, xdp_prog, act);
		/* fall through */
	case XDP_DROP:
		stats->xdp_drops++;
		goto xdp_drop;
	}
	rcu_read_unlock();

	delta = orig_data - xdp.data;
	off = mac_len + delta;
	if (off > 0)
		__skb_push(skb, off);
	else if (off < 0)
		__skb_pull(skb, -off);
	skb->mac_header -= delta;
	off = xdp.data_end - orig_data_end;
	if (off != 0)
		__skb_put(skb, off);
	skb->protocol = eth_type_trans(skb, rq->dev);

	metalen = xdp.data - xdp.data_meta;
	if (metalen)
		skb_metadata_set(skb, metalen);
out:
	return skb;
drop:
	stats->rx_drops++;
xdp_drop:
	rcu_read_unlock();
	kfree_skb(skb);
	return NULL;
err_xdp:
	rcu_read_unlock();
	page_frag_free(xdp.data);
xdp_xmit:
	return NULL;
}
static int veth_xdp_rcv(struct veth_rq *rq, int budget,
			struct veth_xdp_tx_bq *bq,
			struct veth_stats *stats)
{
	int i, done = 0;

	for (i = 0; i < budget; i++) {
		void *ptr = __ptr_ring_consume(&rq->xdp_ring);
		struct sk_buff *skb;

		if (!ptr)
			break;

		if (veth_is_xdp_frame(ptr)) {
			struct xdp_frame *frame = veth_ptr_to_xdp(ptr);

			stats->xdp_bytes += frame->len;
			skb = veth_xdp_rcv_one(rq, frame, bq, stats);
		} else {
			skb = ptr;
			stats->xdp_bytes += skb->len;
			skb = veth_xdp_rcv_skb(rq, skb, bq, stats);
		}

		if (skb)
			napi_gro_receive(&rq->xdp_napi, skb);

		done++;
	}

	u64_stats_update_begin(&rq->stats.syncp);
	rq->stats.vs.xdp_redirect += stats->xdp_redirect;
	rq->stats.vs.xdp_bytes += stats->xdp_bytes;
	rq->stats.vs.xdp_drops += stats->xdp_drops;
	rq->stats.vs.rx_drops += stats->rx_drops;
	rq->stats.vs.xdp_packets += done;
	u64_stats_update_end(&rq->stats.syncp);

	return done;
}
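
/* NAPI poll handler: drain up to @budget entries from the queue's ptr_ring,
 * then flush any bulked XDP_TX frames and pending redirects.  If the ring
 * refilled while completing, NAPI is rescheduled so no wakeup is lost.
 */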
static int veth_poll(struct napi_struct *napi, int budget)
{
	struct veth_rq *rq =
		container_of(napi, struct veth_rq, xdp_napi);
	struct veth_stats stats = {};
	struct veth_xdp_tx_bq bq;
	int done;

	bq.count = 0;

	xdp_set_return_frame_no_direct();
	done = veth_xdp_rcv(rq, budget, &bq, &stats);

	if (done < budget && napi_complete_done(napi, done)) {
		/* Write rx_notify_masked before reading ptr_ring */
		smp_store_mb(rq->rx_notify_masked, false);
		if (unlikely(!__ptr_ring_empty(&rq->xdp_ring))) {
			rq->rx_notify_masked = true;
			napi_schedule(&rq->xdp_napi);
		}
	}

	if (stats.xdp_tx > 0)
		veth_xdp_flush(rq, &bq);
	if (stats.xdp_redirect > 0)
		xdp_do_flush();
	xdp_clear_return_frame_no_direct();

	return done;
}

static int veth_napi_add(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int err, i;

	for (i = 0; i < dev->real_num_rx_queues; i++) {
		struct veth_rq *rq = &priv->rq[i];

		err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL);
		if (err)
			goto err_xdp_ring;
	}

	for (i = 0; i < dev->real_num_rx_queues; i++) {
		struct veth_rq *rq = &priv->rq[i];

		netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT);
		napi_enable(&rq->xdp_napi);
	}

	return 0;
err_xdp_ring:
	for (i--; i >= 0; i--)
		ptr_ring_cleanup(&priv->rq[i].xdp_ring, veth_ptr_free);

	return err;
}

static void veth_napi_del(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int i;

	for (i = 0; i < dev->real_num_rx_queues; i++) {
		struct veth_rq *rq = &priv->rq[i];

		napi_disable(&rq->xdp_napi);
		napi_hash_del(&rq->xdp_napi);
	}
	synchronize_net();

	for (i = 0; i < dev->real_num_rx_queues; i++) {
		struct veth_rq *rq = &priv->rq[i];

		netif_napi_del(&rq->xdp_napi);
		rq->rx_notify_masked = false;
		ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free);
	}
}
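
/* On first use, register xdp_rxq_info and the page-shared memory model and
 * start NAPI for every rx queue; then publish the current XDP program to
 * all queues.
 */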
static int veth_enable_xdp(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int err, i;

	if (!xdp_rxq_info_is_reg(&priv->rq[0].xdp_rxq)) {
		for (i = 0; i < dev->real_num_rx_queues; i++) {
			struct veth_rq *rq = &priv->rq[i];

			err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i);
			if (err < 0)
				goto err_rxq_reg;

			err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
							 MEM_TYPE_PAGE_SHARED,
							 NULL);
			if (err < 0)
				goto err_reg_mem;

			/* Save original mem info as it can be overwritten */
			rq->xdp_mem = rq->xdp_rxq.mem;
		}

		err = veth_napi_add(dev);
		if (err)
			goto err_rxq_reg;
	}

	for (i = 0; i < dev->real_num_rx_queues; i++)
		rcu_assign_pointer(priv->rq[i].xdp_prog, priv->_xdp_prog);

	return 0;
err_reg_mem:
	xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq);
err_rxq_reg:
	for (i--; i >= 0; i--)
		xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq);

	return err;
}

static void veth_disable_xdp(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int i;

	for (i = 0; i < dev->real_num_rx_queues; i++)
		rcu_assign_pointer(priv->rq[i].xdp_prog, NULL);
	veth_napi_del(dev);
	for (i = 0; i < dev->real_num_rx_queues; i++) {
		struct veth_rq *rq = &priv->rq[i];

		rq->xdp_rxq.mem = rq->xdp_mem;
		xdp_rxq_info_unreg(&rq->xdp_rxq);
	}
}
static int veth_open(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer = rtnl_dereference(priv->peer);
	int err;

	if (!peer)
		return -ENOTCONN;

	if (priv->_xdp_prog) {
		err = veth_enable_xdp(dev);
		if (err)
			return err;
	}

	if (peer->flags & IFF_UP) {
		netif_carrier_on(dev);
		netif_carrier_on(peer);
	}

	return 0;
}

static int veth_close(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer = rtnl_dereference(priv->peer);

	netif_carrier_off(dev);
	if (peer)
		netif_carrier_off(peer);

	if (priv->_xdp_prog)
		veth_disable_xdp(dev);

	return 0;
}

static int is_valid_veth_mtu(int mtu)
{
	return mtu >= ETH_MIN_MTU && mtu <= ETH_MAX_MTU;
}

static int veth_alloc_queues(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	int i;

	priv->rq = kcalloc(dev->num_rx_queues, sizeof(*priv->rq), GFP_KERNEL);
	if (!priv->rq)
		return -ENOMEM;

	for (i = 0; i < dev->num_rx_queues; i++) {
		priv->rq[i].dev = dev;
		u64_stats_init(&priv->rq[i].stats.syncp);
	}

	return 0;
}

static void veth_free_queues(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);

	kfree(priv->rq);
}

static int veth_dev_init(struct net_device *dev)
{
	int err;

	dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats);
	if (!dev->lstats)
		return -ENOMEM;

	err = veth_alloc_queues(dev);
	if (err) {
		free_percpu(dev->lstats);
		return err;
	}

	return 0;
}

static void veth_dev_free(struct net_device *dev)
{
	veth_free_queues(dev);
	free_percpu(dev->lstats);
}
#ifdef CONFIG_NET_POLL_CONTROLLER
static void veth_poll_controller(struct net_device *dev)
{
	/* veth only receives frames when its peer sends one.
	 * Since it has nothing to do with disabling irqs, we are guaranteed
	 * never to have pending data when we poll for it so
	 * there is nothing to do here.
	 *
	 * We need this though so netpoll recognizes us as an interface that
	 * supports polling, which enables bridge devices in virt setups to
	 * still use netconsole
	 */
}
#endif	/* CONFIG_NET_POLL_CONTROLLER */

static int veth_get_iflink(const struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer;
	int iflink;

	rcu_read_lock();
	peer = rcu_dereference(priv->peer);
	iflink = peer ? peer->ifindex : 0;
	rcu_read_unlock();

	return iflink;
}

static netdev_features_t veth_fix_features(struct net_device *dev,
					   netdev_features_t features)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer;

	peer = rtnl_dereference(priv->peer);
	if (peer) {
		struct veth_priv *peer_priv = netdev_priv(peer);

		if (peer_priv->_xdp_prog)
			features &= ~NETIF_F_GSO_SOFTWARE;
	}

	return features;
}

static void veth_set_rx_headroom(struct net_device *dev, int new_hr)
{
	struct veth_priv *peer_priv, *priv = netdev_priv(dev);
	struct net_device *peer;

	if (new_hr < 0)
		new_hr = 0;

	rcu_read_lock();
	peer = rcu_dereference(priv->peer);
	if (unlikely(!peer))
		goto out;

	peer_priv = netdev_priv(peer);
	priv->requested_headroom = new_hr;
	new_hr = max(priv->requested_headroom, peer_priv->requested_headroom);
	dev->needed_headroom = new_hr;
	peer->needed_headroom = new_hr;

out:
	rcu_read_unlock();
}
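
/* Attach or detach an XDP program.  Attaching requires a connected peer, a
 * peer MTU small enough that a frame plus XDP headroom and skb_shared_info
 * fits in one page, and at least as many local rx queues as the peer has tx
 * queues.  Software GSO is turned off on the peer while a program is
 * attached.
 */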
static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog,
			struct netlink_ext_ack *extack)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct bpf_prog *old_prog;
	struct net_device *peer;
	unsigned int max_mtu;
	int err;

	old_prog = priv->_xdp_prog;
	priv->_xdp_prog = prog;
	peer = rtnl_dereference(priv->peer);

	if (prog) {
		if (!peer) {
			NL_SET_ERR_MSG_MOD(extack, "Cannot set XDP when peer is detached");
			err = -ENOTCONN;
			goto err;
		}

		max_mtu = PAGE_SIZE - VETH_XDP_HEADROOM -
			  peer->hard_header_len -
			  SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
		if (peer->mtu > max_mtu) {
			NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to set XDP");
			err = -ERANGE;
			goto err;
		}

		if (dev->real_num_rx_queues < peer->real_num_tx_queues) {
			NL_SET_ERR_MSG_MOD(extack, "XDP expects number of rx queues not less than peer tx queues");
			err = -ENOSPC;
			goto err;
		}

		if (dev->flags & IFF_UP) {
			err = veth_enable_xdp(dev);
			if (err) {
				NL_SET_ERR_MSG_MOD(extack, "Setup for XDP failed");
				goto err;
			}
		}

		if (!old_prog) {
			peer->hw_features &= ~NETIF_F_GSO_SOFTWARE;
			peer->max_mtu = max_mtu;
		}
	}

	if (old_prog) {
		if (!prog) {
			if (dev->flags & IFF_UP)
				veth_disable_xdp(dev);

			if (peer) {
				peer->hw_features |= NETIF_F_GSO_SOFTWARE;
				peer->max_mtu = ETH_MAX_MTU;
			}
		}
		bpf_prog_put(old_prog);
	}

	if ((!!old_prog ^ !!prog) && peer)
		netdev_update_features(peer);

	return 0;
err:
	priv->_xdp_prog = old_prog;

	return err;
}

static u32 veth_xdp_query(struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	const struct bpf_prog *xdp_prog;

	xdp_prog = priv->_xdp_prog;
	if (xdp_prog)
		return xdp_prog->aux->id;

	return 0;
}

static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp)
{
	switch (xdp->command) {
	case XDP_SETUP_PROG:
		return veth_xdp_set(dev, xdp->prog, xdp->extack);
	case XDP_QUERY_PROG:
		xdp->prog_id = veth_xdp_query(dev);
		return 0;
	default:
		return -EINVAL;
	}
}
static const struct net_device_ops veth_netdev_ops = {
	.ndo_init		= veth_dev_init,
	.ndo_open		= veth_open,
	.ndo_stop		= veth_close,
	.ndo_start_xmit		= veth_xmit,
	.ndo_get_stats64	= veth_get_stats64,
	.ndo_set_rx_mode	= veth_set_multicast_list,
	.ndo_set_mac_address	= eth_mac_addr,
#ifdef CONFIG_NET_POLL_CONTROLLER
	.ndo_poll_controller	= veth_poll_controller,
#endif
	.ndo_get_iflink		= veth_get_iflink,
	.ndo_fix_features	= veth_fix_features,
	.ndo_features_check	= passthru_features_check,
	.ndo_set_rx_headroom	= veth_set_rx_headroom,
	.ndo_bpf		= veth_xdp,
	.ndo_xdp_xmit		= veth_ndo_xdp_xmit,
};

#define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
		       NETIF_F_RXCSUM | NETIF_F_SCTP_CRC | NETIF_F_HIGHDMA | \
		       NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | \
		       NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | \
		       NETIF_F_HW_VLAN_STAG_TX | NETIF_F_HW_VLAN_STAG_RX)

static void veth_setup(struct net_device *dev)
{
	ether_setup(dev);

	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
	dev->priv_flags |= IFF_NO_QUEUE;
	dev->priv_flags |= IFF_PHONY_HEADROOM;

	dev->netdev_ops = &veth_netdev_ops;
	dev->ethtool_ops = &veth_ethtool_ops;
	dev->features |= NETIF_F_LLTX;
	dev->features |= VETH_FEATURES;
	dev->vlan_features = dev->features &
			     ~(NETIF_F_HW_VLAN_CTAG_TX |
			       NETIF_F_HW_VLAN_STAG_TX |
			       NETIF_F_HW_VLAN_CTAG_RX |
			       NETIF_F_HW_VLAN_STAG_RX);
	dev->needs_free_netdev = true;
	dev->priv_destructor = veth_dev_free;
	dev->max_mtu = ETH_MAX_MTU;

	dev->hw_features = VETH_FEATURES;
	dev->hw_enc_features = VETH_FEATURES;
	dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE;
}
/*
 * netlink interface
 */

static int veth_validate(struct nlattr *tb[], struct nlattr *data[],
			 struct netlink_ext_ack *extack)
{
	if (tb[IFLA_ADDRESS]) {
		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
			return -EINVAL;
		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
			return -EADDRNOTAVAIL;
	}
	if (tb[IFLA_MTU]) {
		if (!is_valid_veth_mtu(nla_get_u32(tb[IFLA_MTU])))
			return -EINVAL;
	}
	return 0;
}

static struct rtnl_link_ops veth_link_ops;
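
/* RTM_NEWLINK handler: create both ends of the pair.  The peer device is
 * created and registered first (honouring any attributes nested in
 * VETH_INFO_PEER), then the requesting device, and finally the two
 * priv->peer pointers are tied to each other.
 */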
static int veth_newlink(struct net *src_net, struct net_device *dev,
			struct nlattr *tb[], struct nlattr *data[],
			struct netlink_ext_ack *extack)
{
	int err;
	struct net_device *peer;
	struct veth_priv *priv;
	char ifname[IFNAMSIZ];
	struct nlattr *peer_tb[IFLA_MAX + 1], **tbp;
	unsigned char name_assign_type;
	struct ifinfomsg *ifmp;
	struct net *net;

	/*
	 * create and register peer first
	 */
	if (data != NULL && data[VETH_INFO_PEER] != NULL) {
		struct nlattr *nla_peer;

		nla_peer = data[VETH_INFO_PEER];
		ifmp = nla_data(nla_peer);
		err = rtnl_nla_parse_ifla(peer_tb,
					  nla_data(nla_peer) + sizeof(struct ifinfomsg),
					  nla_len(nla_peer) - sizeof(struct ifinfomsg),
					  NULL);
		if (err < 0)
			return err;

		err = veth_validate(peer_tb, NULL, extack);
		if (err < 0)
			return err;

		tbp = peer_tb;
	} else {
		ifmp = NULL;
		tbp = tb;
	}

	if (ifmp && tbp[IFLA_IFNAME]) {
		nla_strlcpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ);
		name_assign_type = NET_NAME_USER;
	} else {
		snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d");
		name_assign_type = NET_NAME_ENUM;
	}

	net = rtnl_link_get_net(src_net, tbp);
	if (IS_ERR(net))
		return PTR_ERR(net);

	peer = rtnl_create_link(net, ifname, name_assign_type,
				&veth_link_ops, tbp, extack);
	if (IS_ERR(peer)) {
		put_net(net);
		return PTR_ERR(peer);
	}

	if (!ifmp || !tbp[IFLA_ADDRESS])
		eth_hw_addr_random(peer);

	if (ifmp && (dev->ifindex != 0))
		peer->ifindex = ifmp->ifi_index;

	peer->gso_max_size = dev->gso_max_size;
	peer->gso_max_segs = dev->gso_max_segs;

	err = register_netdevice(peer);
	put_net(net);
	net = NULL;
	if (err < 0)
		goto err_register_peer;

	netif_carrier_off(peer);

	err = rtnl_configure_link(peer, ifmp);
	if (err < 0)
		goto err_configure_peer;

	/*
	 * register dev last
	 *
	 * note that, since we've registered the new device, the dev's name
	 * should be re-allocated
	 */
	if (tb[IFLA_ADDRESS] == NULL)
		eth_hw_addr_random(dev);

	if (tb[IFLA_IFNAME])
		nla_strlcpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ);
	else
		snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d");

	err = register_netdevice(dev);
	if (err < 0)
		goto err_register_dev;

	netif_carrier_off(dev);

	/*
	 * tie the devices together
	 */
	priv = netdev_priv(dev);
	rcu_assign_pointer(priv->peer, peer);

	priv = netdev_priv(peer);
	rcu_assign_pointer(priv->peer, dev);

	return 0;

err_register_dev:
	/* nothing to do */
err_configure_peer:
	unregister_netdevice(peer);
	return err;

err_register_peer:
	free_netdev(peer);
	return err;
}
static void veth_dellink(struct net_device *dev, struct list_head *head)
{
	struct veth_priv *priv;
	struct net_device *peer;

	priv = netdev_priv(dev);
	peer = rtnl_dereference(priv->peer);

	/* Note : dellink() is called from default_device_exit_batch(),
	 * before a rcu_synchronize() point. The devices are guaranteed
	 * not being freed before one RCU grace period.
	 */
	RCU_INIT_POINTER(priv->peer, NULL);
	unregister_netdevice_queue(dev, head);

	if (peer) {
		priv = netdev_priv(peer);
		RCU_INIT_POINTER(priv->peer, NULL);
		unregister_netdevice_queue(peer, head);
	}
}

static const struct nla_policy veth_policy[VETH_INFO_MAX + 1] = {
	[VETH_INFO_PEER]	= { .len = sizeof(struct ifinfomsg) },
};

static struct net *veth_get_link_net(const struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct net_device *peer = rtnl_dereference(priv->peer);

	return peer ? dev_net(peer) : dev_net(dev);
}

static struct rtnl_link_ops veth_link_ops = {
	.kind		= DRV_NAME,
	.priv_size	= sizeof(struct veth_priv),
	.setup		= veth_setup,
	.validate	= veth_validate,
	.newlink	= veth_newlink,
	.dellink	= veth_dellink,
	.policy		= veth_policy,
	.maxtype	= VETH_INFO_MAX,
	.get_link_net	= veth_get_link_net,
};

/*
 * init/fini
 */

static __init int veth_init(void)
{
	return rtnl_link_register(&veth_link_ops);
}

static __exit void veth_exit(void)
{
	rtnl_link_unregister(&veth_link_ops);
}

module_init(veth_init);
module_exit(veth_exit);

MODULE_DESCRIPTION("Virtual Ethernet Tunnel");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_RTNL_LINK(DRV_NAME);