/net/netfilter/ipvs/ip_vs_xmit.c

// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * ip_vs_xmit.c: various packet transmitters for IPVS
 *
 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
 *              Julian Anastasov <ja@ssi.bg>
 *
 * Changes:
 *
 * Description of forwarding methods:
 * - all transmitters are called from LOCAL_IN (remote clients) and
 *   LOCAL_OUT (local clients) but for ICMP can be called from FORWARD
 * - not all connections have destination server, for example,
 *   connections in backup server when fwmark is used
 * - bypass connections use daddr from packet
 * - we can use dst without ref while sending in RCU section, we use
 *   ref when returning NF_ACCEPT for NAT-ed packet via loopback
 * LOCAL_OUT rules:
 * - skb->dev is NULL, skb->protocol is not set (both are set in POST_ROUTING)
 * - skb->pkt_type is not set yet
 * - the only place where we can see skb->sk != NULL
 */

#define KMSG_COMPONENT "IPVS"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/tcp.h>			/* for tcphdr */
#include <net/ip.h>
#include <net/gue.h>
#include <net/gre.h>
#include <net/tcp.h>			/* for csum_tcpudp_magic */
#include <net/udp.h>
#include <net/icmp.h>			/* for icmp_send */
#include <net/route.h>			/* for ip_route_output */
#include <net/ipv6.h>
#include <net/ip6_route.h>
#include <net/ip_tunnels.h>
#include <net/ip6_checksum.h>
#include <net/addrconf.h>
#include <linux/icmpv6.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>

#include <net/ip_vs.h>

enum {
	IP_VS_RT_MODE_LOCAL	= 1, /* Allow local dest */
	IP_VS_RT_MODE_NON_LOCAL	= 2, /* Allow non-local dest */
	IP_VS_RT_MODE_RDR	= 4, /* Allow redirect from remote daddr to
				      * local
				      */
	IP_VS_RT_MODE_CONNECT	= 8, /* Always bind route to saddr */
	IP_VS_RT_MODE_KNOWN_NH	= 16,/* Route via remote addr */
	IP_VS_RT_MODE_TUNNEL	= 32,/* Tunnel mode */
};
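
/* The transmitters below OR these flags together when requesting a route:
 * e.g. the NAT path asks for LOCAL | NON_LOCAL | RDR, while the tunnel
 * path adds CONNECT and TUNNEL (see the *_xmit functions further down).
 */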

static inline struct ip_vs_dest_dst *ip_vs_dest_dst_alloc(void)
{
	return kmalloc(sizeof(struct ip_vs_dest_dst), GFP_ATOMIC);
}

static inline void ip_vs_dest_dst_free(struct ip_vs_dest_dst *dest_dst)
{
	kfree(dest_dst);
}

/*
 *      Destination cache to speed up outgoing route lookup
 */
static inline void
__ip_vs_dst_set(struct ip_vs_dest *dest, struct ip_vs_dest_dst *dest_dst,
		struct dst_entry *dst, u32 dst_cookie)
{
	struct ip_vs_dest_dst *old;

	old = rcu_dereference_protected(dest->dest_dst,
					lockdep_is_held(&dest->dst_lock));

	if (dest_dst) {
		dest_dst->dst_cache = dst;
		dest_dst->dst_cookie = dst_cookie;
	}
	rcu_assign_pointer(dest->dest_dst, dest_dst);

	if (old)
		call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free);
}
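
/* Return the cached dest_dst only while its dst is still valid: an
 * obsolete dst must pass dst->ops->check() with the stored cookie
 * before it may be reused (called under the RCU read lock).
 */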
static inline struct ip_vs_dest_dst *
__ip_vs_dst_check(struct ip_vs_dest *dest)
{
	struct ip_vs_dest_dst *dest_dst = rcu_dereference(dest->dest_dst);
	struct dst_entry *dst;

	if (!dest_dst)
		return NULL;
	dst = dest_dst->dst_cache;
	if (dst->obsolete &&
	    dst->ops->check(dst, dest_dst->dst_cookie) == NULL)
		return NULL;
	return dest_dst;
}

static inline bool
__mtu_check_toobig_v6(const struct sk_buff *skb, u32 mtu)
{
	if (IP6CB(skb)->frag_max_size) {
		/* frag_max_size tells us that this packet has been
		 * defragmented by the netfilter IPv6 conntrack module.
		 */
		if (IP6CB(skb)->frag_max_size > mtu)
			return true;	/* largest fragment violates MTU */
	} else if (skb->len > mtu && !skb_is_gso(skb)) {
		return true;		/* packet size violates MTU */
	}
	return false;
}
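
/* A note on the retry logic below: if a bound saddr has become invalid
 * (ip_route_output_key() fails with -EINVAL), the lookup is retried once
 * with saddr cleared; conversely, in CONNECT mode a successful unbound
 * lookup is redone once with the discovered saddr so the route is bound
 * to it.
 */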

/* Get route to daddr, update *saddr, optionally bind route to saddr */
static struct rtable *do_output_route4(struct net *net, __be32 daddr,
				       int rt_mode, __be32 *saddr)
{
	struct flowi4 fl4;
	struct rtable *rt;
	bool loop = false;

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = daddr;
	fl4.flowi4_flags = (rt_mode & IP_VS_RT_MODE_KNOWN_NH) ?
			   FLOWI_FLAG_KNOWN_NH : 0;

retry:
	rt = ip_route_output_key(net, &fl4);
	if (IS_ERR(rt)) {
		/* Invalid saddr ? */
		if (PTR_ERR(rt) == -EINVAL && *saddr &&
		    rt_mode & IP_VS_RT_MODE_CONNECT && !loop) {
			*saddr = 0;
			flowi4_update_output(&fl4, 0, 0, daddr, 0);
			goto retry;
		}
		IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &daddr);
		return NULL;
	} else if (!*saddr && rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) {
		ip_rt_put(rt);
		*saddr = fl4.saddr;
		flowi4_update_output(&fl4, 0, 0, daddr, fl4.saddr);
		loop = true;
		goto retry;
	}
	*saddr = fl4.saddr;
	return rt;
}

#ifdef CONFIG_IP_VS_IPV6
static inline int __ip_vs_is_local_route6(struct rt6_info *rt)
{
	return rt->dst.dev && rt->dst.dev->flags & IFF_LOOPBACK;
}
#endif
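
/* Decide whether switching the skb to the new route would illegally
 * cross the local/non-local boundary for the given rt_mode.
 */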
static inline bool crosses_local_route_boundary(int skb_af, struct sk_buff *skb,
						int rt_mode,
						bool new_rt_is_local)
{
	bool rt_mode_allow_local = !!(rt_mode & IP_VS_RT_MODE_LOCAL);
	bool rt_mode_allow_non_local = !!(rt_mode & IP_VS_RT_MODE_NON_LOCAL);
	bool rt_mode_allow_redirect = !!(rt_mode & IP_VS_RT_MODE_RDR);
	bool source_is_loopback;
	bool old_rt_is_local;

#ifdef CONFIG_IP_VS_IPV6
	if (skb_af == AF_INET6) {
		int addr_type = ipv6_addr_type(&ipv6_hdr(skb)->saddr);

		source_is_loopback =
			(!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
			(addr_type & IPV6_ADDR_LOOPBACK);
		old_rt_is_local = __ip_vs_is_local_route6(
			(struct rt6_info *)skb_dst(skb));
	} else
#endif
	{
		source_is_loopback = ipv4_is_loopback(ip_hdr(skb)->saddr);
		old_rt_is_local = skb_rtable(skb)->rt_flags & RTCF_LOCAL;
	}

	if (unlikely(new_rt_is_local)) {
		if (!rt_mode_allow_local)
			return true;
		if (!rt_mode_allow_redirect && !old_rt_is_local)
			return true;
	} else {
		if (!rt_mode_allow_non_local)
			return true;
		if (source_is_loopback)
			return true;
	}
	return false;
}
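
/* For local clients (skb->dev is NULL in LOCAL_OUT) with a full socket,
 * propagate the discovered path MTU back to the original route.
 */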
static inline void maybe_update_pmtu(int skb_af, struct sk_buff *skb, int mtu)
{
	struct sock *sk = skb->sk;
	struct rtable *ort = skb_rtable(skb);

	if (!skb->dev && sk && sk_fullsock(sk))
		ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu, true);
}
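
/* Check the packet against the route MTU; when it does not fit, reply
 * with ICMPV6_PKT_TOOBIG or ICMP_FRAG_NEEDED as appropriate and return
 * false so the caller drops the packet.
 */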
static inline bool ensure_mtu_is_adequate(struct netns_ipvs *ipvs, int skb_af,
					  int rt_mode,
					  struct ip_vs_iphdr *ipvsh,
					  struct sk_buff *skb, int mtu)
{
#ifdef CONFIG_IP_VS_IPV6
	if (skb_af == AF_INET6) {
		struct net *net = ipvs->net;

		if (unlikely(__mtu_check_toobig_v6(skb, mtu))) {
			if (!skb->dev)
				skb->dev = net->loopback_dev;
			/* only send ICMP too big on first fragment */
			if (!ipvsh->fragoffs && !ip_vs_iph_icmp(ipvsh))
				icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			IP_VS_DBG(1, "frag needed for %pI6c\n",
				  &ipv6_hdr(skb)->saddr);
			return false;
		}
	} else
#endif
	{
		/* If we're going to tunnel the packet and pmtu discovery
		 * is disabled, we'll just fragment it anyway
		 */
		if ((rt_mode & IP_VS_RT_MODE_TUNNEL) && !sysctl_pmtu_disc(ipvs))
			return true;

		if (unlikely(ip_hdr(skb)->frag_off & htons(IP_DF) &&
			     skb->len > mtu && !skb_is_gso(skb) &&
			     !ip_vs_iph_icmp(ipvsh))) {
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
				  htonl(mtu));
			IP_VS_DBG(1, "frag needed for %pI4\n",
				  &ip_hdr(skb)->saddr);
			return false;
		}
	}
	return true;
}

static inline bool decrement_ttl(struct netns_ipvs *ipvs,
				 int skb_af,
				 struct sk_buff *skb)
{
	struct net *net = ipvs->net;

#ifdef CONFIG_IP_VS_IPV6
	if (skb_af == AF_INET6) {
		struct dst_entry *dst = skb_dst(skb);

		/* check and decrement ttl */
		if (ipv6_hdr(skb)->hop_limit <= 1) {
			struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);

			/* Force OUTPUT device used as source address */
			skb->dev = dst->dev;
			icmpv6_send(skb, ICMPV6_TIME_EXCEED,
				    ICMPV6_EXC_HOPLIMIT, 0);
			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

			return false;
		}

		/* don't propagate ttl change to cloned packets */
		if (skb_ensure_writable(skb, sizeof(struct ipv6hdr)))
			return false;

		ipv6_hdr(skb)->hop_limit--;
	} else
#endif
	{
		if (ip_hdr(skb)->ttl <= 1) {
			/* Tell the sender its packet died... */
			__IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
			icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
			return false;
		}

		/* don't propagate ttl change to cloned packets */
		if (skb_ensure_writable(skb, sizeof(struct iphdr)))
			return false;

		/* Decrease ttl */
		ip_decrease_ttl(ip_hdr(skb));
	}

	return true;
}
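
/* Both __ip_vs_get_out_rt() and its IPv6 twin below return 1 when daddr
 * resolves to a local address (the old route is preserved), 0 when the
 * skb has been re-routed to the new destination, and -1 on error.
 */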

/* Get route to destination or remote server */
static int
__ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
		   struct ip_vs_dest *dest,
		   __be32 daddr, int rt_mode, __be32 *ret_saddr,
		   struct ip_vs_iphdr *ipvsh)
{
	struct net *net = ipvs->net;
	struct ip_vs_dest_dst *dest_dst;
	struct rtable *rt;			/* Route to the other host */
	int mtu;
	int local, noref = 1;

	if (dest) {
		dest_dst = __ip_vs_dst_check(dest);
		if (likely(dest_dst))
			rt = (struct rtable *) dest_dst->dst_cache;
		else {
			dest_dst = ip_vs_dest_dst_alloc();
			spin_lock_bh(&dest->dst_lock);
			if (!dest_dst) {
				__ip_vs_dst_set(dest, NULL, NULL, 0);
				spin_unlock_bh(&dest->dst_lock);
				goto err_unreach;
			}
			rt = do_output_route4(net, dest->addr.ip, rt_mode,
					      &dest_dst->dst_saddr.ip);
			if (!rt) {
				__ip_vs_dst_set(dest, NULL, NULL, 0);
				spin_unlock_bh(&dest->dst_lock);
				ip_vs_dest_dst_free(dest_dst);
				goto err_unreach;
			}
			__ip_vs_dst_set(dest, dest_dst, &rt->dst, 0);
			spin_unlock_bh(&dest->dst_lock);
			IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n",
				  &dest->addr.ip, &dest_dst->dst_saddr.ip,
				  atomic_read(&rt->dst.__refcnt));
		}
		if (ret_saddr)
			*ret_saddr = dest_dst->dst_saddr.ip;
	} else {
		__be32 saddr = htonl(INADDR_ANY);

		noref = 0;

		/* For such unconfigured boxes avoid many route lookups
		 * for performance reasons because we do not remember saddr
		 */
		rt_mode &= ~IP_VS_RT_MODE_CONNECT;
		rt = do_output_route4(net, daddr, rt_mode, &saddr);
		if (!rt)
			goto err_unreach;
		if (ret_saddr)
			*ret_saddr = saddr;
	}

	local = (rt->rt_flags & RTCF_LOCAL) ? 1 : 0;
	if (unlikely(crosses_local_route_boundary(skb_af, skb, rt_mode,
						  local))) {
		IP_VS_DBG_RL("We are crossing local and non-local addresses"
			     " daddr=%pI4\n", &daddr);
		goto err_put;
	}

	if (unlikely(local)) {
		/* skb to local stack, preserve old route */
		if (!noref)
			ip_rt_put(rt);
		return local;
	}

	if (!decrement_ttl(ipvs, skb_af, skb))
		goto err_put;

	if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) {
		mtu = dst_mtu(&rt->dst);
	} else {
		mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
		if (!dest)
			goto err_put;
		if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
			mtu -= sizeof(struct udphdr) + sizeof(struct guehdr);
			if ((dest->tun_flags &
			     IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
			    skb->ip_summed == CHECKSUM_PARTIAL)
				mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
		} else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
			__be16 tflags = 0;

			if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
				tflags |= TUNNEL_CSUM;
			mtu -= gre_calc_hlen(tflags);
		}
		if (mtu < 68) {
			IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
			goto err_put;
		}
		maybe_update_pmtu(skb_af, skb, mtu);
	}

	if (!ensure_mtu_is_adequate(ipvs, skb_af, rt_mode, ipvsh, skb, mtu))
		goto err_put;

	skb_dst_drop(skb);
	if (noref)
		skb_dst_set_noref(skb, &rt->dst);
	else
		skb_dst_set(skb, &rt->dst);

	return local;

err_put:
	if (!noref)
		ip_rt_put(rt);
	return -1;

err_unreach:
	dst_link_failure(skb);
	return -1;
}
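
/* The IPv6 side additionally stores the rt6 cookie with the cached dst
 * so that __ip_vs_dst_check() can revalidate it later.
 */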

#ifdef CONFIG_IP_VS_IPV6
static struct dst_entry *
__ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr,
			struct in6_addr *ret_saddr, int do_xfrm, int rt_mode)
{
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.daddr = *daddr,
	};

	if (rt_mode & IP_VS_RT_MODE_KNOWN_NH)
		fl6.flowi6_flags = FLOWI_FLAG_KNOWN_NH;

	dst = ip6_route_output(net, NULL, &fl6);
	if (dst->error)
		goto out_err;
	if (!ret_saddr)
		return dst;
	if (ipv6_addr_any(&fl6.saddr) &&
	    ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev,
			       &fl6.daddr, 0, &fl6.saddr) < 0)
		goto out_err;
	if (do_xfrm) {
		dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
		if (IS_ERR(dst)) {
			dst = NULL;
			goto out_err;
		}
	}
	*ret_saddr = fl6.saddr;
	return dst;

out_err:
	dst_release(dst);
	IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", daddr);
	return NULL;
}

/*
 * Get route to destination or remote server
 */
static int
__ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
		      struct ip_vs_dest *dest,
		      struct in6_addr *daddr, struct in6_addr *ret_saddr,
		      struct ip_vs_iphdr *ipvsh, int do_xfrm, int rt_mode)
{
	struct net *net = ipvs->net;
	struct ip_vs_dest_dst *dest_dst;
	struct rt6_info *rt;			/* Route to the other host */
	struct dst_entry *dst;
	int mtu;
	int local, noref = 1;

	if (dest) {
		dest_dst = __ip_vs_dst_check(dest);
		if (likely(dest_dst))
			rt = (struct rt6_info *) dest_dst->dst_cache;
		else {
			u32 cookie;

			dest_dst = ip_vs_dest_dst_alloc();
			spin_lock_bh(&dest->dst_lock);
			if (!dest_dst) {
				__ip_vs_dst_set(dest, NULL, NULL, 0);
				spin_unlock_bh(&dest->dst_lock);
				goto err_unreach;
			}
			dst = __ip_vs_route_output_v6(net, &dest->addr.in6,
						      &dest_dst->dst_saddr.in6,
						      do_xfrm, rt_mode);
			if (!dst) {
				__ip_vs_dst_set(dest, NULL, NULL, 0);
				spin_unlock_bh(&dest->dst_lock);
				ip_vs_dest_dst_free(dest_dst);
				goto err_unreach;
			}
			rt = (struct rt6_info *) dst;
			cookie = rt6_get_cookie(rt);
			__ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie);
			spin_unlock_bh(&dest->dst_lock);
			IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
				  &dest->addr.in6, &dest_dst->dst_saddr.in6,
				  atomic_read(&rt->dst.__refcnt));
		}
		if (ret_saddr)
			*ret_saddr = dest_dst->dst_saddr.in6;
	} else {
		noref = 0;
		dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm,
					      rt_mode);
		if (!dst)
			goto err_unreach;
		rt = (struct rt6_info *) dst;
	}

	local = __ip_vs_is_local_route6(rt);

	if (unlikely(crosses_local_route_boundary(skb_af, skb, rt_mode,
						  local))) {
		IP_VS_DBG_RL("We are crossing local and non-local addresses"
			     " daddr=%pI6\n", daddr);
		goto err_put;
	}

	if (unlikely(local)) {
		/* skb to local stack, preserve old route */
		if (!noref)
			dst_release(&rt->dst);
		return local;
	}

	if (!decrement_ttl(ipvs, skb_af, skb))
		goto err_put;

	/* MTU checking */
	if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL)))
		mtu = dst_mtu(&rt->dst);
	else {
		mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
		if (!dest)
			goto err_put;
		if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
			mtu -= sizeof(struct udphdr) + sizeof(struct guehdr);
			if ((dest->tun_flags &
			     IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
			    skb->ip_summed == CHECKSUM_PARTIAL)
				mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
		} else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
			__be16 tflags = 0;

			if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
				tflags |= TUNNEL_CSUM;
			mtu -= gre_calc_hlen(tflags);
		}
		if (mtu < IPV6_MIN_MTU) {
			IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
				     IPV6_MIN_MTU);
			goto err_put;
		}
		maybe_update_pmtu(skb_af, skb, mtu);
	}

	if (!ensure_mtu_is_adequate(ipvs, skb_af, rt_mode, ipvsh, skb, mtu))
		goto err_put;

	skb_dst_drop(skb);
	if (noref)
		skb_dst_set_noref(skb, &rt->dst);
	else
		skb_dst_set(skb, &rt->dst);

	return local;

err_put:
	if (!noref)
		dst_release(&rt->dst);
	return -1;

err_unreach:
	/* The ip6_link_failure function requires the dev field to be set
	 * in order to get the net (further for the sake of fwmark
	 * reflection).
	 */
	if (!skb->dev)
		skb->dev = skb_dst(skb)->dev;

	dst_link_failure(skb);
	return -1;
}
#endif

/* return NF_ACCEPT to allow forwarding or other NF_xxx on error */
static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb,
					    struct ip_vs_conn *cp)
{
	int ret = NF_ACCEPT;

	skb->ipvs_property = 1;
	if (unlikely(cp->flags & IP_VS_CONN_F_NFCT))
		ret = ip_vs_confirm_conntrack(skb);
	if (ret == NF_ACCEPT) {
		nf_reset_ct(skb);
		skb_forward_csum(skb);
	}
	return ret;
}

/* In the event of a remote destination, it's possible that we would have
 * matches against an old socket (particularly a TIME-WAIT socket). This
 * causes havoc down the line (ip_local_out et al. expect regular sockets
 * and invalid memory accesses will happen) so simply drop the association
 * in this case.
 */
static inline void ip_vs_drop_early_demux_sk(struct sk_buff *skb)
{
	/* If dev is set, the packet came from the LOCAL_IN callback and
	 * not from a local TCP socket.
	 */
	if (skb->dev)
		skb_orphan(skb);
}
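
/* NAT flavour of the send path: unlike ip_vs_send_or_cont() below, it
 * also updates (or disables) conntrack state for the mangled packet.
 */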
/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */
static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb,
					 struct ip_vs_conn *cp, int local)
{
	int ret = NF_STOLEN;

	skb->ipvs_property = 1;
	if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
		ip_vs_notrack(skb);
	else
		ip_vs_update_conntrack(skb, cp, 1);

	/* Remove the early_demux association unless it's bound for the
	 * exact same port and address on this host after translation.
	 */
	if (!local || cp->vport != cp->dport ||
	    !ip_vs_addr_equal(cp->af, &cp->vaddr, &cp->daddr))
		ip_vs_drop_early_demux_sk(skb);

	if (!local) {
		skb_forward_csum(skb);
		NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb,
			NULL, skb_dst(skb)->dev, dst_output);
	} else
		ret = NF_ACCEPT;

	return ret;
}

/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */
static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb,
				     struct ip_vs_conn *cp, int local)
{
	int ret = NF_STOLEN;

	skb->ipvs_property = 1;
	if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
		ip_vs_notrack(skb);
	if (!local) {
		ip_vs_drop_early_demux_sk(skb);
		skb_forward_csum(skb);
		NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb,
			NULL, skb_dst(skb)->dev, dst_output);
	} else
		ret = NF_ACCEPT;
	return ret;
}

/*
 *      NULL transmitter (do nothing except return NF_ACCEPT)
 */
int
ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
		struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	/* we do not touch skb and do not need pskb ptr */
	return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
}

/*
 *      Bypass transmitter
 *      Let packets bypass the destination when the destination is not
 *      available; it should only be used in a transparent cache cluster.
 */
int
ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
		  struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	struct iphdr  *iph = ip_hdr(skb);

	EnterFunction(10);

	if (__ip_vs_get_out_rt(cp->ipvs, cp->af, skb, NULL, iph->daddr,
			       IP_VS_RT_MODE_NON_LOCAL, NULL, ipvsh) < 0)
		goto tx_error;

	ip_send_check(iph);

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);

	LeaveFunction(10);
	return NF_STOLEN;

 tx_error:
	kfree_skb(skb);
	LeaveFunction(10);
	return NF_STOLEN;
}

#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
		     struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	struct ipv6hdr *iph = ipv6_hdr(skb);

	EnterFunction(10);

	if (__ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, NULL,
				  &iph->daddr, NULL,
				  ipvsh, 0, IP_VS_RT_MODE_NON_LOCAL) < 0)
		goto tx_error;

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);

	LeaveFunction(10);
	return NF_STOLEN;

 tx_error:
	kfree_skb(skb);
	LeaveFunction(10);
	return NF_STOLEN;
}
#endif

/*
 *      NAT transmitter (only for outside-to-inside NAT forwarding)
 *      Not used for related ICMP
 */
int
ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
	       struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	struct rtable *rt;		/* Route to the other host */
	int local, rc, was_input;

	EnterFunction(10);

	/* check if it is a connection of no-client-port */
	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
		__be16 _pt, *p;

		p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt);
		if (p == NULL)
			goto tx_error;
		ip_vs_conn_fill_cport(cp, *p);
		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
	}

	was_input = rt_is_input_route(skb_rtable(skb));
	local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
				   IP_VS_RT_MODE_LOCAL |
				   IP_VS_RT_MODE_NON_LOCAL |
				   IP_VS_RT_MODE_RDR, NULL, ipvsh);
	if (local < 0)
		goto tx_error;
	rt = skb_rtable(skb);
	/*
	 * Avoid duplicate tuple in reply direction for NAT traffic
	 * to local address when connection is sync-ed
	 */
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
		enum ip_conntrack_info ctinfo;
		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);

		if (ct) {
			IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, ipvsh->off,
					 "ip_vs_nat_xmit(): "
					 "stopping DNAT to local address");
			goto tx_error;
		}
	}
#endif

	/* From world but DNAT to loopback address? */
	if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) {
		IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, ipvsh->off,
				 "ip_vs_nat_xmit(): stopping DNAT to loopback "
				 "address");
		goto tx_error;
	}

	/* copy-on-write the packet before mangling it */
	if (skb_ensure_writable(skb, sizeof(struct iphdr)))
		goto tx_error;

	if (skb_cow(skb, rt->dst.dev->hard_header_len))
		goto tx_error;

	/* mangle the packet */
	if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh))
		goto tx_error;
	ip_hdr(skb)->daddr = cp->daddr.ip;
	ip_send_check(ip_hdr(skb));

	IP_VS_DBG_PKT(10, AF_INET, pp, skb, ipvsh->off, "After DNAT");

	/* FIXME: when the application helper enlarges the packet and the
	   length is larger than the MTU of the outgoing device, there will
	   still be an MTU problem. */

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);

	LeaveFunction(10);
	return rc;

  tx_error:
	kfree_skb(skb);
	LeaveFunction(10);
	return NF_STOLEN;
}

#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
		  struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	struct rt6_info *rt;		/* Route to the other host */
	int local, rc;

	EnterFunction(10);

	/* check if it is a connection of no-client-port */
	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT && !ipvsh->fragoffs)) {
		__be16 _pt, *p;

		p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt);
		if (p == NULL)
			goto tx_error;
		ip_vs_conn_fill_cport(cp, *p);
		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
	}

	local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
				      &cp->daddr.in6,
				      NULL, ipvsh, 0,
				      IP_VS_RT_MODE_LOCAL |
				      IP_VS_RT_MODE_NON_LOCAL |
				      IP_VS_RT_MODE_RDR);
	if (local < 0)
		goto tx_error;
	rt = (struct rt6_info *) skb_dst(skb);
	/*
	 * Avoid duplicate tuple in reply direction for NAT traffic
	 * to local address when connection is sync-ed
	 */
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
		enum ip_conntrack_info ctinfo;
		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);

		if (ct) {
			IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, ipvsh->off,
					 "ip_vs_nat_xmit_v6(): "
					 "stopping DNAT to local address");
			goto tx_error;
		}
	}
#endif

	/* From world but DNAT to loopback address? */
	if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
	    ipv6_addr_type(&cp->daddr.in6) & IPV6_ADDR_LOOPBACK) {
		IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, ipvsh->off,
				 "ip_vs_nat_xmit_v6(): "
				 "stopping DNAT to loopback address");
		goto tx_error;
	}

	/* copy-on-write the packet before mangling it */
	if (skb_ensure_writable(skb, sizeof(struct ipv6hdr)))
		goto tx_error;

	if (skb_cow(skb, rt->dst.dev->hard_header_len))
		goto tx_error;

	/* mangle the packet */
	if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh))
		goto tx_error;
	ipv6_hdr(skb)->daddr = cp->daddr.in6;

	IP_VS_DBG_PKT(10, AF_INET6, pp, skb, ipvsh->off, "After DNAT");

	/* FIXME: when the application helper enlarges the packet and the
	   length is larger than the MTU of the outgoing device, there will
	   still be an MTU problem. */

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);

	LeaveFunction(10);
	return rc;

tx_error:
	LeaveFunction(10);
	kfree_skb(skb);
	return NF_STOLEN;
}
#endif

/* When forwarding a packet, we must ensure that we've got enough headroom
 * for the encapsulation packet in the skb.  This also gives us an
 * opportunity to figure out what the payload_len, dsfield, ttl, and df
 * values should be, so that we won't need to look at the old ip header
 * again
 */
static struct sk_buff *
ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af,
			   unsigned int max_headroom, __u8 *next_protocol,
			   __u32 *payload_len, __u8 *dsfield, __u8 *ttl,
			   __be16 *df)
{
	struct sk_buff *new_skb = NULL;
	struct iphdr *old_iph = NULL;
	__u8 old_dsfield;
#ifdef CONFIG_IP_VS_IPV6
	struct ipv6hdr *old_ipv6h = NULL;
#endif

	ip_vs_drop_early_demux_sk(skb);

	if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) {
		new_skb = skb_realloc_headroom(skb, max_headroom);
		if (!new_skb)
			goto error;
		if (skb->sk)
			skb_set_owner_w(new_skb, skb->sk);
		consume_skb(skb);
		skb = new_skb;
	}

#ifdef CONFIG_IP_VS_IPV6
	if (skb_af == AF_INET6) {
		old_ipv6h = ipv6_hdr(skb);
		*next_protocol = IPPROTO_IPV6;
		if (payload_len)
			*payload_len =
				ntohs(old_ipv6h->payload_len) +
				sizeof(*old_ipv6h);
		old_dsfield = ipv6_get_dsfield(old_ipv6h);
		*ttl = old_ipv6h->hop_limit;
		if (df)
			*df = 0;
	} else
#endif
	{
		old_iph = ip_hdr(skb);
		/* Copy DF, reset fragment offset and MF */
		if (df)
			*df = (old_iph->frag_off & htons(IP_DF));
		*next_protocol = IPPROTO_IPIP;

		/* fix old IP header checksum */
		ip_send_check(old_iph);
		old_dsfield = ipv4_get_dsfield(old_iph);
		*ttl = old_iph->ttl;
		if (payload_len)
			*payload_len = ntohs(old_iph->tot_len);
	}

	/* Implement full-functionality option for ECN encapsulation */
	*dsfield = INET_ECN_encapsulate(old_dsfield, old_dsfield);

	return skb;
error:
	kfree_skb(skb);
	return ERR_PTR(-ENOMEM);
}
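
/* Map the encapsulating address family to the matching GSO tunnel type. */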
static inline int __tun_gso_type_mask(int encaps_af, int orig_af)
{
	switch (encaps_af) {
	case AF_INET:
		return SKB_GSO_IPXIP4;
	case AF_INET6:
		return SKB_GSO_IPXIP6;
	default:
		return 0;
	}
}
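
/* Push a GUE header (then a UDP header) in front of the payload.  When
 * remote checksum offload is requested and the skb carries a partial
 * checksum, a private option records the checksum start/offset pair.
 */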
static int
ipvs_gue_encap(struct net *net, struct sk_buff *skb,
	       struct ip_vs_conn *cp, __u8 *next_protocol)
{
	__be16 dport;
	__be16 sport = udp_flow_src_port(net, skb, 0, 0, false);
	struct udphdr  *udph;	/* Our new UDP header */
	struct guehdr  *gueh;	/* Our new GUE header */
	size_t hdrlen, optlen = 0;
	void *data;
	bool need_priv = false;

	if ((cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
	    skb->ip_summed == CHECKSUM_PARTIAL) {
		optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
		need_priv = true;
	}

	hdrlen = sizeof(struct guehdr) + optlen;

	skb_push(skb, hdrlen);

	gueh = (struct guehdr *)skb->data;

	gueh->control = 0;
	gueh->version = 0;
	gueh->hlen = optlen >> 2;
	gueh->flags = 0;
	gueh->proto_ctype = *next_protocol;

	data = &gueh[1];

	if (need_priv) {
		__be32 *flags = data;
		u16 csum_start = skb_checksum_start_offset(skb);
		__be16 *pd;

		gueh->flags |= GUE_FLAG_PRIV;
		*flags = 0;
		data += GUE_LEN_PRIV;

		if (csum_start < hdrlen)
			return -EINVAL;

		csum_start -= hdrlen;
		pd = data;
		pd[0] = htons(csum_start);
		pd[1] = htons(csum_start + skb->csum_offset);

		if (!skb_is_gso(skb)) {
			skb->ip_summed = CHECKSUM_NONE;
			skb->encapsulation = 0;
		}

		*flags |= GUE_PFLAG_REMCSUM;
		data += GUE_PLEN_REMCSUM;
	}

	skb_push(skb, sizeof(struct udphdr));
	skb_reset_transport_header(skb);

	udph = udp_hdr(skb);

	dport = cp->dest->tun_port;
	udph->dest = dport;
	udph->source = sport;
	udph->len = htons(skb->len);
	udph->check = 0;

	*next_protocol = IPPROTO_UDP;

	return 0;
}
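
/* Push a GRE header in front of the payload, with an optional checksum
 * when IP_VS_TUNNEL_ENCAP_FLAG_CSUM is set.
 */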
static void
ipvs_gre_encap(struct net *net, struct sk_buff *skb,
	       struct ip_vs_conn *cp, __u8 *next_protocol)
{
	__be16 proto = *next_protocol == IPPROTO_IPIP ?
				htons(ETH_P_IP) : htons(ETH_P_IPV6);
	__be16 tflags = 0;
	size_t hdrlen;

	if (cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
		tflags |= TUNNEL_CSUM;

	hdrlen = gre_calc_hlen(tflags);
	gre_build_header(skb, hdrlen, tflags, proto, 0, 0);

	*next_protocol = IPPROTO_GRE;
}

/*
 *   IP Tunneling transmitter
 *
 *   This function encapsulates the packet in a new IP packet, its
 *   destination will be set to cp->daddr. Most code of this function
 *   is taken from ipip.c.
 *
 *   It is used in VS/TUN cluster. The load balancer selects a real
 *   server from a cluster based on a scheduling algorithm,
 *   encapsulates the request packet and forwards it to the selected
 *   server. For example, all real servers are configured with
 *   "ifconfig tunl0 <Virtual IP Address> up". When the server receives
 *   the encapsulated packet, it will decapsulate the packet, process
 *   the request and return the response packets directly to the client
 *   without passing through the load balancer. This can greatly increase
 *   the scalability of the virtual server.
 *
 *   Used for ANY protocol
 */
int
ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
		  struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	struct netns_ipvs *ipvs = cp->ipvs;
	struct net *net = ipvs->net;
	struct rtable *rt;			/* Route to the other host */
	__be32 saddr;				/* Source for tunnel */
	struct net_device *tdev;		/* Device to other host */
	__u8 next_protocol = 0;
	__u8 dsfield = 0;
	__u8 ttl = 0;
	__be16 df = 0;
	__be16 *dfp = NULL;
	struct iphdr  *iph;			/* Our new IP header */
	unsigned int max_headroom;		/* The extra header space needed */
	int ret, local;
	int tun_type, gso_type;
	int tun_flags;

	EnterFunction(10);

	local = __ip_vs_get_out_rt(ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
				   IP_VS_RT_MODE_LOCAL |
				   IP_VS_RT_MODE_NON_LOCAL |
				   IP_VS_RT_MODE_CONNECT |
				   IP_VS_RT_MODE_TUNNEL, &saddr, ipvsh);
	if (local < 0)
		goto tx_error;
	if (local)
		return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);

	rt = skb_rtable(skb);
	tdev = rt->dst.dev;

	/*
	 * Okay, now see if we can stuff it in the buffer as-is.
	 */
	max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);

	tun_type = cp->dest->tun_type;
	tun_flags = cp->dest->tun_flags;

	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
		size_t gue_hdrlen, gue_optlen = 0;

		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
		    skb->ip_summed == CHECKSUM_PARTIAL) {
			gue_optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
		}
		gue_hdrlen = sizeof(struct guehdr) + gue_optlen;

		max_headroom += sizeof(struct udphdr) + gue_hdrlen;
	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
		size_t gre_hdrlen;
		__be16 tflags = 0;

		if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
			tflags |= TUNNEL_CSUM;
		gre_hdrlen = gre_calc_hlen(tflags);

		max_headroom += gre_hdrlen;
	}

	/* We only care about the df field if sysctl_pmtu_disc(ipvs) is set */
	dfp = sysctl_pmtu_disc(ipvs) ? &df : NULL;
	skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom,
					 &next_protocol, NULL, &dsfield,
					 &ttl, dfp);
	if (IS_ERR(skb))
		goto tx_error;

	gso_type = __tun_gso_type_mask(AF_INET, cp->af);
	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
		    (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
			gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
		else
			gso_type |= SKB_GSO_UDP_TUNNEL;
		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
		    skb->ip_summed == CHECKSUM_PARTIAL) {
			gso_type |= SKB_GSO_TUNNEL_REMCSUM;
		}
	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
		if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
			gso_type |= SKB_GSO_GRE_CSUM;
		else
			gso_type |= SKB_GSO_GRE;
	}

	if (iptunnel_handle_offloads(skb, gso_type))
		goto tx_error;

	skb->transport_header = skb->network_header;

	skb_set_inner_ipproto(skb, next_protocol);

	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
		bool check = false;

		if (ipvs_gue_encap(net, skb, cp, &next_protocol))
			goto tx_error;

		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
		    (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
			check = true;

		udp_set_csum(!check, skb, saddr, cp->daddr.ip, skb->len);
	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE)
		ipvs_gre_encap(net, skb, cp, &next_protocol);

	skb_push(skb, sizeof(struct iphdr));
	skb_reset_network_header(skb);
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	/*
	 *	Push down and install the IPIP header.
	 */
	iph = ip_hdr(skb);
	iph->version = 4;
	iph->ihl = sizeof(struct iphdr) >> 2;
	iph->frag_off = df;
	iph->protocol = next_protocol;
	iph->tos = dsfield;
	iph->daddr = cp->daddr.ip;
	iph->saddr = saddr;
	iph->ttl = ttl;
	ip_select_ident(net, skb, NULL);

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	ret = ip_vs_tunnel_xmit_prepare(skb, cp);
	if (ret == NF_ACCEPT)
		ip_local_out(net, skb->sk, skb);
	else if (ret == NF_DROP)
		kfree_skb(skb);

	LeaveFunction(10);

	return NF_STOLEN;

  tx_error:
	if (!IS_ERR(skb))
		kfree_skb(skb);
	LeaveFunction(10);
	return NF_STOLEN;
}

#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
		     struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	struct netns_ipvs *ipvs = cp->ipvs;
	struct net *net = ipvs->net;
	struct rt6_info *rt;		/* Route to the other host */
	struct in6_addr saddr;		/* Source for tunnel */
	struct net_device *tdev;	/* Device to other host */
	__u8 next_protocol = 0;
	__u32 payload_len = 0;
	__u8 dsfield = 0;
	__u8 ttl = 0;
	struct ipv6hdr  *iph;		/* Our new IP header */
	unsigned int max_headroom;	/* The extra header space needed */
	int ret, local;
	int tun_type, gso_type;
	int tun_flags;

	EnterFunction(10);

	local = __ip_vs_get_out_rt_v6(ipvs, cp->af, skb, cp->dest,
				      &cp->daddr.in6,
				      &saddr, ipvsh, 1,
				      IP_VS_RT_MODE_LOCAL |
				      IP_VS_RT_MODE_NON_LOCAL |
				      IP_VS_RT_MODE_TUNNEL);
	if (local < 0)
		goto tx_error;
	if (local)
		return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);

	rt = (struct rt6_info *) skb_dst(skb);
	tdev = rt->dst.dev;

	/*
	 * Okay, now see if we can stuff it in the buffer as-is.
	 */
	max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr);

	tun_type = cp->dest->tun_type;
	tun_flags = cp->dest->tun_flags;

	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
		size_t gue_hdrlen, gue_optlen = 0;

		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
		    skb->ip_summed == CHECKSUM_PARTIAL) {
			gue_optlen += GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
		}
		gue_hdrlen = sizeof(struct guehdr) + gue_optlen;

		max_headroom += sizeof(struct udphdr) + gue_hdrlen;
	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
		size_t gre_hdrlen;
		__be16 tflags = 0;

		if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
			tflags |= TUNNEL_CSUM;
		gre_hdrlen = gre_calc_hlen(tflags);

		max_headroom += gre_hdrlen;
	}

	skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom,
					 &next_protocol, &payload_len,
					 &dsfield, &ttl, NULL);
	if (IS_ERR(skb))
		goto tx_error;

	gso_type = __tun_gso_type_mask(AF_INET6, cp->af);
	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
		    (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
			gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
		else
			gso_type |= SKB_GSO_UDP_TUNNEL;
		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM) &&
		    skb->ip_summed == CHECKSUM_PARTIAL) {
			gso_type |= SKB_GSO_TUNNEL_REMCSUM;
		}
	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
		if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
			gso_type |= SKB_GSO_GRE_CSUM;
		else
			gso_type |= SKB_GSO_GRE;
	}

	if (iptunnel_handle_offloads(skb, gso_type))
		goto tx_error;

	skb->transport_header = skb->network_header;

	skb_set_inner_ipproto(skb, next_protocol);

	if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
		bool check = false;

		if (ipvs_gue_encap(net, skb, cp, &next_protocol))
			goto tx_error;

		if ((tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM) ||
		    (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_REMCSUM))
			check = true;

		udp6_set_csum(!check, skb, &saddr, &cp->daddr.in6, skb->len);
	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE)
		ipvs_gre_encap(net, skb, cp, &next_protocol);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	/*
	 *	Push down and install the IPIP header.
	 */
	iph = ipv6_hdr(skb);
	iph->version = 6;
	iph->nexthdr = next_protocol;
	iph->payload_len = htons(payload_len);
	memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl));
	ipv6_change_dsfield(iph, 0, dsfield);
	iph->daddr = cp->daddr.in6;
	iph->saddr = saddr;
	iph->hop_limit = ttl;

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	ret = ip_vs_tunnel_xmit_prepare(skb, cp);
	if (ret == NF_ACCEPT)
		ip6_local_out(net, skb->sk, skb);
	else if (ret == NF_DROP)
		kfree_skb(skb);

	LeaveFunction(10);

	return NF_STOLEN;

tx_error:
	if (!IS_ERR(skb))
		kfree_skb(skb);
	LeaveFunction(10);
	return NF_STOLEN;
}
#endif

/*
 *      Direct Routing transmitter
 *      Used for ANY protocol
 */
int
ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
	      struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	int local;

	EnterFunction(10);

	local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
				   IP_VS_RT_MODE_LOCAL |
				   IP_VS_RT_MODE_NON_LOCAL |
				   IP_VS_RT_MODE_KNOWN_NH, NULL, ipvsh);
	if (local < 0)
		goto tx_error;
	if (local)
		return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);

	ip_send_check(ip_hdr(skb));

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);

	LeaveFunction(10);
	return NF_STOLEN;

  tx_error:
	kfree_skb(skb);
	LeaveFunction(10);
	return NF_STOLEN;
}

#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
		 struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
	int local;

	EnterFunction(10);

	local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
				      &cp->daddr.in6,
				      NULL, ipvsh, 0,
				      IP_VS_RT_MODE_LOCAL |
				      IP_VS_RT_MODE_NON_LOCAL |
				      IP_VS_RT_MODE_KNOWN_NH);
	if (local < 0)
		goto tx_error;
	if (local)
		return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);

	LeaveFunction(10);
	return NF_STOLEN;

tx_error:
	kfree_skb(skb);
	LeaveFunction(10);
	return NF_STOLEN;
}
#endif

/*
 *      ICMP packet transmitter
 *      called by the ip_vs_in_icmp
 */
int
ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
		struct ip_vs_protocol *pp, int offset, unsigned int hooknum,
		struct ip_vs_iphdr *iph)
{
	struct rtable	*rt;	/* Route to the other host */
	int rc;
	int local;
	int rt_mode, was_input;

	EnterFunction(10);

	/* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
	   forwarded directly here, because there is no need to
	   translate address/port back */
	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
		if (cp->packet_xmit)
			rc = cp->packet_xmit(skb, cp, pp, iph);
		else
			rc = NF_ACCEPT;
		/* do not touch skb anymore */
		atomic_inc(&cp->in_pkts);
		goto out;
	}

	/*
	 * mangle and send the packet here (only for VS/NAT)
	 */
	was_input = rt_is_input_route(skb_rtable(skb));

	/* LOCALNODE from FORWARD hook is not supported */
	rt_mode = (hooknum != NF_INET_FORWARD) ?
		  IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
		  IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
	local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip, rt_mode,
				   NULL, iph);
	if (local < 0)
		goto tx_error;
	rt = skb_rtable(skb);

	/*
	 * Avoid duplicate tuple in reply direction for NAT traffic
	 * to local address when connection is sync-ed
	 */
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
		enum ip_conntrack_info ctinfo;
		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);

		if (ct) {
			IP_VS_DBG(10, "%s(): "
				  "stopping DNAT to local address %pI4\n",
				  __func__, &cp->daddr.ip);
			goto tx_error;
		}
	}
#endif

	/* From world but DNAT to loopback address? */
	if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) {
		IP_VS_DBG(1, "%s(): "
			  "stopping DNAT to loopback %pI4\n",
			  __func__, &cp->daddr.ip);
		goto tx_error;
	}

	/* copy-on-write the packet before mangling it */
	if (skb_ensure_writable(skb, offset))
		goto tx_error;

	if (skb_cow(skb, rt->dst.dev->hard_header_len))
		goto tx_error;

	ip_vs_nat_icmp(skb, pp, cp, 0);

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
	goto out;

  tx_error:
	kfree_skb(skb);
	rc = NF_STOLEN;
  out:
	LeaveFunction(10);
	return rc;
}

#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
		   struct ip_vs_protocol *pp, int offset, unsigned int hooknum,
		   struct ip_vs_iphdr *ipvsh)
{
	struct rt6_info	*rt;	/* Route to the other host */
	int rc;
	int local;
	int rt_mode;

	EnterFunction(10);

	/* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
	   forwarded directly here, because there is no need to
	   translate address/port back */
	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
		if (cp->packet_xmit)
			rc = cp->packet_xmit(skb, cp, pp, ipvsh);
		else
			rc = NF_ACCEPT;
		/* do not touch skb anymore */
		atomic_inc(&cp->in_pkts);
		goto out;
	}

	/*
	 * mangle and send the packet here (only for VS/NAT)
	 */

	/* LOCALNODE from FORWARD hook is not supported */
	rt_mode = (hooknum != NF_INET_FORWARD) ?
		  IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
		  IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
	local = __ip_vs_get_out_rt_v6(cp->ipvs, cp->af, skb, cp->dest,
				      &cp->daddr.in6, NULL, ipvsh, 0, rt_mode);
	if (local < 0)
		goto tx_error;
	rt = (struct rt6_info *) skb_dst(skb);
	/*
	 * Avoid duplicate tuple in reply direction for NAT traffic
	 * to local address when connection is sync-ed
	 */
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	if (cp->flags & IP_VS_CONN_F_SYNC && local) {
		enum ip_conntrack_info ctinfo;
		struct nf_conn *ct = nf_ct_get(skb, &ctinfo);

		if (ct) {
			IP_VS_DBG(10, "%s(): "
				  "stopping DNAT to local address %pI6\n",
				  __func__, &cp->daddr.in6);
			goto tx_error;
		}
	}
#endif

	/* From world but DNAT to loopback address? */
	if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
	    ipv6_addr_type(&cp->daddr.in6) & IPV6_ADDR_LOOPBACK) {
		IP_VS_DBG(1, "%s(): "
			  "stopping DNAT to loopback %pI6\n",
			  __func__, &cp->daddr.in6);
		goto tx_error;
	}

	/* copy-on-write the packet before mangling it */
	if (skb_ensure_writable(skb, offset))
		goto tx_error;

	if (skb_cow(skb, rt->dst.dev->hard_header_len))
		goto tx_error;

	ip_vs_nat_icmp_v6(skb, pp, cp, 0);

	/* Another hack: avoid icmp_send in ip_fragment */
	skb->ignore_df = 1;

	rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);
	goto out;

tx_error:
	kfree_skb(skb);
	rc = NF_STOLEN;
out:
	LeaveFunction(10);
	return rc;
}
#endif