PageRenderTime 120ms CodeModel.GetById 3ms app.highlight 102ms RepoModel.GetById 1ms app.codeStats 0ms

/net/netfilter/ipvs/ip_vs_core.c

http://github.com/mirrors/linux
C | 2534 lines | 1761 code | 346 blank | 427 comment | 415 complexity | 74680fb4a34037458ed51e88a81ee483 MD5 | raw file

Large files are truncated, but you can click here to view the full file

   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * IPVS         An implementation of the IP virtual server support for the
   4 *              LINUX operating system.  IPVS is now implemented as a module
   5 *              over the Netfilter framework. IPVS can be used to build a
   6 *              high-performance and highly available server based on a
   7 *              cluster of servers.
   8 *
   9 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
  10 *              Peter Kese <peter.kese@ijs.si>
  11 *              Julian Anastasov <ja@ssi.bg>
  12 *
  13 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
  14 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
  15 * and others.
  16 *
  17 * Changes:
  18 *	Paul `Rusty' Russell		properly handle non-linear skbs
  19 *	Harald Welte			don't use nfcache
  20 */
  21
  22#define KMSG_COMPONENT "IPVS"
  23#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
  24
  25#include <linux/module.h>
  26#include <linux/kernel.h>
  27#include <linux/ip.h>
  28#include <linux/tcp.h>
  29#include <linux/sctp.h>
  30#include <linux/icmp.h>
  31#include <linux/slab.h>
  32
  33#include <net/ip.h>
  34#include <net/tcp.h>
  35#include <net/udp.h>
  36#include <net/icmp.h>                   /* for icmp_send */
  37#include <net/gue.h>
  38#include <net/gre.h>
  39#include <net/route.h>
  40#include <net/ip6_checksum.h>
  41#include <net/netns/generic.h>		/* net_generic() */
  42
  43#include <linux/netfilter.h>
  44#include <linux/netfilter_ipv4.h>
  45
  46#ifdef CONFIG_IP_VS_IPV6
  47#include <net/ipv6.h>
  48#include <linux/netfilter_ipv6.h>
  49#include <net/ip6_route.h>
  50#endif
  51
  52#include <net/ip_vs.h>
  53#include <linux/indirect_call_wrapper.h>
  54
  55
  56EXPORT_SYMBOL(register_ip_vs_scheduler);
  57EXPORT_SYMBOL(unregister_ip_vs_scheduler);
  58EXPORT_SYMBOL(ip_vs_proto_name);
  59EXPORT_SYMBOL(ip_vs_conn_new);
  60EXPORT_SYMBOL(ip_vs_conn_in_get);
  61EXPORT_SYMBOL(ip_vs_conn_out_get);
  62#ifdef CONFIG_IP_VS_PROTO_TCP
  63EXPORT_SYMBOL(ip_vs_tcp_conn_listen);
  64#endif
  65EXPORT_SYMBOL(ip_vs_conn_put);
  66#ifdef CONFIG_IP_VS_DEBUG
  67EXPORT_SYMBOL(ip_vs_get_debug_level);
  68#endif
  69EXPORT_SYMBOL(ip_vs_new_conn_out);
  70
  71#ifdef CONFIG_IP_VS_PROTO_TCP
  72INDIRECT_CALLABLE_DECLARE(int
  73	tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
  74			 struct ip_vs_conn *cp, struct ip_vs_iphdr *iph));
  75#endif
  76
  77#ifdef CONFIG_IP_VS_PROTO_UDP
  78INDIRECT_CALLABLE_DECLARE(int
  79	udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
  80			 struct ip_vs_conn *cp, struct ip_vs_iphdr *iph));
  81#endif
  82
  83#if defined(CONFIG_IP_VS_PROTO_TCP) && defined(CONFIG_IP_VS_PROTO_UDP)
  84#define SNAT_CALL(f, ...) \
  85	INDIRECT_CALL_2(f, tcp_snat_handler, udp_snat_handler, __VA_ARGS__)
  86#elif defined(CONFIG_IP_VS_PROTO_TCP)
  87#define SNAT_CALL(f, ...) INDIRECT_CALL_1(f, tcp_snat_handler, __VA_ARGS__)
  88#elif defined(CONFIG_IP_VS_PROTO_UDP)
  89#define SNAT_CALL(f, ...) INDIRECT_CALL_1(f, udp_snat_handler, __VA_ARGS__)
  90#else
  91#define SNAT_CALL(f, ...) f(__VA_ARGS__)
  92#endif
  93
  94static unsigned int ip_vs_net_id __read_mostly;
  95/* netns cnt used for uniqueness */
  96static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0);
  97
  98/* ID used in ICMP lookups */
  99#define icmp_id(icmph)          (((icmph)->un).echo.id)
 100#define icmpv6_id(icmph)        (icmph->icmp6_dataun.u_echo.identifier)
 101
 102const char *ip_vs_proto_name(unsigned int proto)
 103{
 104	static char buf[20];
 105
 106	switch (proto) {
 107	case IPPROTO_IP:
 108		return "IP";
 109	case IPPROTO_UDP:
 110		return "UDP";
 111	case IPPROTO_TCP:
 112		return "TCP";
 113	case IPPROTO_SCTP:
 114		return "SCTP";
 115	case IPPROTO_ICMP:
 116		return "ICMP";
 117#ifdef CONFIG_IP_VS_IPV6
 118	case IPPROTO_ICMPV6:
 119		return "ICMPv6";
 120#endif
 121	default:
 122		sprintf(buf, "IP_%u", proto);
 123		return buf;
 124	}
 125}
 126
 127void ip_vs_init_hash_table(struct list_head *table, int rows)
 128{
 129	while (--rows >= 0)
 130		INIT_LIST_HEAD(&table[rows]);
 131}
 132
 133static inline void
 134ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
 135{
 136	struct ip_vs_dest *dest = cp->dest;
 137	struct netns_ipvs *ipvs = cp->ipvs;
 138
 139	if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
 140		struct ip_vs_cpu_stats *s;
 141		struct ip_vs_service *svc;
 142
 143		local_bh_disable();
 144
 145		s = this_cpu_ptr(dest->stats.cpustats);
 146		u64_stats_update_begin(&s->syncp);
 147		s->cnt.inpkts++;
 148		s->cnt.inbytes += skb->len;
 149		u64_stats_update_end(&s->syncp);
 150
 151		svc = rcu_dereference(dest->svc);
 152		s = this_cpu_ptr(svc->stats.cpustats);
 153		u64_stats_update_begin(&s->syncp);
 154		s->cnt.inpkts++;
 155		s->cnt.inbytes += skb->len;
 156		u64_stats_update_end(&s->syncp);
 157
 158		s = this_cpu_ptr(ipvs->tot_stats.cpustats);
 159		u64_stats_update_begin(&s->syncp);
 160		s->cnt.inpkts++;
 161		s->cnt.inbytes += skb->len;
 162		u64_stats_update_end(&s->syncp);
 163
 164		local_bh_enable();
 165	}
 166}
 167
 168
 169static inline void
 170ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
 171{
 172	struct ip_vs_dest *dest = cp->dest;
 173	struct netns_ipvs *ipvs = cp->ipvs;
 174
 175	if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
 176		struct ip_vs_cpu_stats *s;
 177		struct ip_vs_service *svc;
 178
 179		local_bh_disable();
 180
 181		s = this_cpu_ptr(dest->stats.cpustats);
 182		u64_stats_update_begin(&s->syncp);
 183		s->cnt.outpkts++;
 184		s->cnt.outbytes += skb->len;
 185		u64_stats_update_end(&s->syncp);
 186
 187		svc = rcu_dereference(dest->svc);
 188		s = this_cpu_ptr(svc->stats.cpustats);
 189		u64_stats_update_begin(&s->syncp);
 190		s->cnt.outpkts++;
 191		s->cnt.outbytes += skb->len;
 192		u64_stats_update_end(&s->syncp);
 193
 194		s = this_cpu_ptr(ipvs->tot_stats.cpustats);
 195		u64_stats_update_begin(&s->syncp);
 196		s->cnt.outpkts++;
 197		s->cnt.outbytes += skb->len;
 198		u64_stats_update_end(&s->syncp);
 199
 200		local_bh_enable();
 201	}
 202}
 203
 204
 205static inline void
 206ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
 207{
 208	struct netns_ipvs *ipvs = svc->ipvs;
 209	struct ip_vs_cpu_stats *s;
 210
 211	local_bh_disable();
 212
 213	s = this_cpu_ptr(cp->dest->stats.cpustats);
 214	u64_stats_update_begin(&s->syncp);
 215	s->cnt.conns++;
 216	u64_stats_update_end(&s->syncp);
 217
 218	s = this_cpu_ptr(svc->stats.cpustats);
 219	u64_stats_update_begin(&s->syncp);
 220	s->cnt.conns++;
 221	u64_stats_update_end(&s->syncp);
 222
 223	s = this_cpu_ptr(ipvs->tot_stats.cpustats);
 224	u64_stats_update_begin(&s->syncp);
 225	s->cnt.conns++;
 226	u64_stats_update_end(&s->syncp);
 227
 228	local_bh_enable();
 229}
 230
 231
 232static inline void
 233ip_vs_set_state(struct ip_vs_conn *cp, int direction,
 234		const struct sk_buff *skb,
 235		struct ip_vs_proto_data *pd)
 236{
 237	if (likely(pd->pp->state_transition))
 238		pd->pp->state_transition(cp, direction, skb, pd);
 239}
 240
 241static inline int
 242ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
 243			      struct sk_buff *skb, int protocol,
 244			      const union nf_inet_addr *caddr, __be16 cport,
 245			      const union nf_inet_addr *vaddr, __be16 vport,
 246			      struct ip_vs_conn_param *p)
 247{
 248	ip_vs_conn_fill_param(svc->ipvs, svc->af, protocol, caddr, cport, vaddr,
 249			      vport, p);
 250	p->pe = rcu_dereference(svc->pe);
 251	if (p->pe && p->pe->fill_param)
 252		return p->pe->fill_param(p, skb);
 253
 254	return 0;
 255}
 256
 257/*
 258 *  IPVS persistent scheduling function
 259 *  It creates a connection entry according to its template if exists,
 260 *  or selects a server and creates a connection entry plus a template.
 261 *  Locking: we are svc user (svc->refcnt), so we hold all dests too
 262 *  Protocols supported: TCP, UDP
 263 */
 264static struct ip_vs_conn *
 265ip_vs_sched_persist(struct ip_vs_service *svc,
 266		    struct sk_buff *skb, __be16 src_port, __be16 dst_port,
 267		    int *ignored, struct ip_vs_iphdr *iph)
 268{
 269	struct ip_vs_conn *cp = NULL;
 270	struct ip_vs_dest *dest;
 271	struct ip_vs_conn *ct;
 272	__be16 dport = 0;		/* destination port to forward */
 273	unsigned int flags;
 274	struct ip_vs_conn_param param;
 275	const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) };
 276	union nf_inet_addr snet;	/* source network of the client,
 277					   after masking */
 278	const union nf_inet_addr *src_addr, *dst_addr;
 279
 280	if (likely(!ip_vs_iph_inverse(iph))) {
 281		src_addr = &iph->saddr;
 282		dst_addr = &iph->daddr;
 283	} else {
 284		src_addr = &iph->daddr;
 285		dst_addr = &iph->saddr;
 286	}
 287
 288
 289	/* Mask saddr with the netmask to adjust template granularity */
 290#ifdef CONFIG_IP_VS_IPV6
 291	if (svc->af == AF_INET6)
 292		ipv6_addr_prefix(&snet.in6, &src_addr->in6,
 293				 (__force __u32) svc->netmask);
 294	else
 295#endif
 296		snet.ip = src_addr->ip & svc->netmask;
 297
 298	IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u "
 299		      "mnet %s\n",
 300		      IP_VS_DBG_ADDR(svc->af, src_addr), ntohs(src_port),
 301		      IP_VS_DBG_ADDR(svc->af, dst_addr), ntohs(dst_port),
 302		      IP_VS_DBG_ADDR(svc->af, &snet));
 303
 304	/*
 305	 * As far as we know, FTP is a very complicated network protocol, and
 306	 * it uses control connection and data connections. For active FTP,
 307	 * FTP server initialize data connection to the client, its source port
 308	 * is often 20. For passive FTP, FTP server tells the clients the port
 309	 * that it passively listens to,  and the client issues the data
 310	 * connection. In the tunneling or direct routing mode, the load
 311	 * balancer is on the client-to-server half of connection, the port
 312	 * number is unknown to the load balancer. So, a conn template like
 313	 * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP
 314	 * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
 315	 * is created for other persistent services.
 316	 */
 317	{
 318		int protocol = iph->protocol;
 319		const union nf_inet_addr *vaddr = dst_addr;
 320		__be16 vport = 0;
 321
 322		if (dst_port == svc->port) {
 323			/* non-FTP template:
 324			 * <protocol, caddr, 0, vaddr, vport, daddr, dport>
 325			 * FTP template:
 326			 * <protocol, caddr, 0, vaddr, 0, daddr, 0>
 327			 */
 328			if (svc->port != FTPPORT)
 329				vport = dst_port;
 330		} else {
 331			/* Note: persistent fwmark-based services and
 332			 * persistent port zero service are handled here.
 333			 * fwmark template:
 334			 * <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
 335			 * port zero template:
 336			 * <protocol,caddr,0,vaddr,0,daddr,0>
 337			 */
 338			if (svc->fwmark) {
 339				protocol = IPPROTO_IP;
 340				vaddr = &fwmark;
 341			}
 342		}
 343		/* return *ignored = -1 so NF_DROP can be used */
 344		if (ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0,
 345						  vaddr, vport, &param) < 0) {
 346			*ignored = -1;
 347			return NULL;
 348		}
 349	}
 350
 351	/* Check if a template already exists */
 352	ct = ip_vs_ct_in_get(&param);
 353	if (!ct || !ip_vs_check_template(ct, NULL)) {
 354		struct ip_vs_scheduler *sched;
 355
 356		/*
 357		 * No template found or the dest of the connection
 358		 * template is not available.
 359		 * return *ignored=0 i.e. ICMP and NF_DROP
 360		 */
 361		sched = rcu_dereference(svc->scheduler);
 362		if (sched) {
 363			/* read svc->sched_data after svc->scheduler */
 364			smp_rmb();
 365			dest = sched->schedule(svc, skb, iph);
 366		} else {
 367			dest = NULL;
 368		}
 369		if (!dest) {
 370			IP_VS_DBG(1, "p-schedule: no dest found.\n");
 371			kfree(param.pe_data);
 372			*ignored = 0;
 373			return NULL;
 374		}
 375
 376		if (dst_port == svc->port && svc->port != FTPPORT)
 377			dport = dest->port;
 378
 379		/* Create a template
 380		 * This adds param.pe_data to the template,
 381		 * and thus param.pe_data will be destroyed
 382		 * when the template expires */
 383		ct = ip_vs_conn_new(&param, dest->af, &dest->addr, dport,
 384				    IP_VS_CONN_F_TEMPLATE, dest, skb->mark);
 385		if (ct == NULL) {
 386			kfree(param.pe_data);
 387			*ignored = -1;
 388			return NULL;
 389		}
 390
 391		ct->timeout = svc->timeout;
 392	} else {
 393		/* set destination with the found template */
 394		dest = ct->dest;
 395		kfree(param.pe_data);
 396	}
 397
 398	dport = dst_port;
 399	if (dport == svc->port && dest->port)
 400		dport = dest->port;
 401
 402	flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
 403		 && iph->protocol == IPPROTO_UDP) ?
 404		IP_VS_CONN_F_ONE_PACKET : 0;
 405
 406	/*
 407	 *    Create a new connection according to the template
 408	 */
 409	ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol, src_addr,
 410			      src_port, dst_addr, dst_port, &param);
 411
 412	cp = ip_vs_conn_new(&param, dest->af, &dest->addr, dport, flags, dest,
 413			    skb->mark);
 414	if (cp == NULL) {
 415		ip_vs_conn_put(ct);
 416		*ignored = -1;
 417		return NULL;
 418	}
 419
 420	/*
 421	 *    Add its control
 422	 */
 423	ip_vs_control_add(cp, ct);
 424	ip_vs_conn_put(ct);
 425
 426	ip_vs_conn_stats(cp, svc);
 427	return cp;
 428}
 429
 430
 431/*
 432 *  IPVS main scheduling function
 433 *  It selects a server according to the virtual service, and
 434 *  creates a connection entry.
 435 *  Protocols supported: TCP, UDP
 436 *
 437 *  Usage of *ignored
 438 *
 439 * 1 :   protocol tried to schedule (eg. on SYN), found svc but the
 440 *       svc/scheduler decides that this packet should be accepted with
 441 *       NF_ACCEPT because it must not be scheduled.
 442 *
 443 * 0 :   scheduler can not find destination, so try bypass or
 444 *       return ICMP and then NF_DROP (ip_vs_leave).
 445 *
 446 * -1 :  scheduler tried to schedule but fatal error occurred, eg.
 447 *       ip_vs_conn_new failure (ENOMEM) or ip_vs_sip_fill_param
 448 *       failure such as missing Call-ID, ENOMEM on skb_linearize
 449 *       or pe_data. In this case we should return NF_DROP without
 450 *       any attempts to send ICMP with ip_vs_leave.
 451 */
 452struct ip_vs_conn *
 453ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
 454	       struct ip_vs_proto_data *pd, int *ignored,
 455	       struct ip_vs_iphdr *iph)
 456{
 457	struct ip_vs_protocol *pp = pd->pp;
 458	struct ip_vs_conn *cp = NULL;
 459	struct ip_vs_scheduler *sched;
 460	struct ip_vs_dest *dest;
 461	__be16 _ports[2], *pptr, cport, vport;
 462	const void *caddr, *vaddr;
 463	unsigned int flags;
 464
 465	*ignored = 1;
 466	/*
 467	 * IPv6 frags, only the first hit here.
 468	 */
 469	pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports);
 470	if (pptr == NULL)
 471		return NULL;
 472
 473	if (likely(!ip_vs_iph_inverse(iph))) {
 474		cport = pptr[0];
 475		caddr = &iph->saddr;
 476		vport = pptr[1];
 477		vaddr = &iph->daddr;
 478	} else {
 479		cport = pptr[1];
 480		caddr = &iph->daddr;
 481		vport = pptr[0];
 482		vaddr = &iph->saddr;
 483	}
 484
 485	/*
 486	 * FTPDATA needs this check when using local real server.
 487	 * Never schedule Active FTPDATA connections from real server.
 488	 * For LVS-NAT they must be already created. For other methods
 489	 * with persistence the connection is created on SYN+ACK.
 490	 */
 491	if (cport == FTPDATA) {
 492		IP_VS_DBG_PKT(12, svc->af, pp, skb, iph->off,
 493			      "Not scheduling FTPDATA");
 494		return NULL;
 495	}
 496
 497	/*
 498	 *    Do not schedule replies from local real server.
 499	 */
 500	if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK)) {
 501		iph->hdr_flags ^= IP_VS_HDR_INVERSE;
 502		cp = INDIRECT_CALL_1(pp->conn_in_get,
 503				     ip_vs_conn_in_get_proto, svc->ipvs,
 504				     svc->af, skb, iph);
 505		iph->hdr_flags ^= IP_VS_HDR_INVERSE;
 506
 507		if (cp) {
 508			IP_VS_DBG_PKT(12, svc->af, pp, skb, iph->off,
 509				      "Not scheduling reply for existing"
 510				      " connection");
 511			__ip_vs_conn_put(cp);
 512			return NULL;
 513		}
 514	}
 515
 516	/*
 517	 *    Persistent service
 518	 */
 519	if (svc->flags & IP_VS_SVC_F_PERSISTENT)
 520		return ip_vs_sched_persist(svc, skb, cport, vport, ignored,
 521					   iph);
 522
 523	*ignored = 0;
 524
 525	/*
 526	 *    Non-persistent service
 527	 */
 528	if (!svc->fwmark && vport != svc->port) {
 529		if (!svc->port)
 530			pr_err("Schedule: port zero only supported "
 531			       "in persistent services, "
 532			       "check your ipvs configuration\n");
 533		return NULL;
 534	}
 535
 536	sched = rcu_dereference(svc->scheduler);
 537	if (sched) {
 538		/* read svc->sched_data after svc->scheduler */
 539		smp_rmb();
 540		dest = sched->schedule(svc, skb, iph);
 541	} else {
 542		dest = NULL;
 543	}
 544	if (dest == NULL) {
 545		IP_VS_DBG(1, "Schedule: no dest found.\n");
 546		return NULL;
 547	}
 548
 549	flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
 550		 && iph->protocol == IPPROTO_UDP) ?
 551		IP_VS_CONN_F_ONE_PACKET : 0;
 552
 553	/*
 554	 *    Create a connection entry.
 555	 */
 556	{
 557		struct ip_vs_conn_param p;
 558
 559		ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol,
 560				      caddr, cport, vaddr, vport, &p);
 561		cp = ip_vs_conn_new(&p, dest->af, &dest->addr,
 562				    dest->port ? dest->port : vport,
 563				    flags, dest, skb->mark);
 564		if (!cp) {
 565			*ignored = -1;
 566			return NULL;
 567		}
 568	}
 569
 570	IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
 571		      "d:%s:%u conn->flags:%X conn->refcnt:%d\n",
 572		      ip_vs_fwd_tag(cp),
 573		      IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
 574		      IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
 575		      IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport),
 576		      cp->flags, refcount_read(&cp->refcnt));
 577
 578	ip_vs_conn_stats(cp, svc);
 579	return cp;
 580}
 581
 582static inline int ip_vs_addr_is_unicast(struct net *net, int af,
 583					union nf_inet_addr *addr)
 584{
 585#ifdef CONFIG_IP_VS_IPV6
 586	if (af == AF_INET6)
 587		return ipv6_addr_type(&addr->in6) & IPV6_ADDR_UNICAST;
 588#endif
 589	return (inet_addr_type(net, addr->ip) == RTN_UNICAST);
 590}
 591
 592/*
 593 *  Pass or drop the packet.
 594 *  Called by ip_vs_in, when the virtual service is available but
 595 *  no destination is available for a new connection.
 596 */
 597int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
 598		struct ip_vs_proto_data *pd, struct ip_vs_iphdr *iph)
 599{
 600	__be16 _ports[2], *pptr, dport;
 601	struct netns_ipvs *ipvs = svc->ipvs;
 602	struct net *net = ipvs->net;
 603
 604	pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports);
 605	if (!pptr)
 606		return NF_DROP;
 607	dport = likely(!ip_vs_iph_inverse(iph)) ? pptr[1] : pptr[0];
 608
 609	/* if it is fwmark-based service, the cache_bypass sysctl is up
 610	   and the destination is a non-local unicast, then create
 611	   a cache_bypass connection entry */
 612	if (sysctl_cache_bypass(ipvs) && svc->fwmark &&
 613	    !(iph->hdr_flags & (IP_VS_HDR_INVERSE | IP_VS_HDR_ICMP)) &&
 614	    ip_vs_addr_is_unicast(net, svc->af, &iph->daddr)) {
 615		int ret;
 616		struct ip_vs_conn *cp;
 617		unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET &&
 618				      iph->protocol == IPPROTO_UDP) ?
 619				      IP_VS_CONN_F_ONE_PACKET : 0;
 620		union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } };
 621
 622		/* create a new connection entry */
 623		IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__);
 624		{
 625			struct ip_vs_conn_param p;
 626			ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol,
 627					      &iph->saddr, pptr[0],
 628					      &iph->daddr, pptr[1], &p);
 629			cp = ip_vs_conn_new(&p, svc->af, &daddr, 0,
 630					    IP_VS_CONN_F_BYPASS | flags,
 631					    NULL, skb->mark);
 632			if (!cp)
 633				return NF_DROP;
 634		}
 635
 636		/* statistics */
 637		ip_vs_in_stats(cp, skb);
 638
 639		/* set state */
 640		ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
 641
 642		/* transmit the first SYN packet */
 643		ret = cp->packet_xmit(skb, cp, pd->pp, iph);
 644		/* do not touch skb anymore */
 645
 646		if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && cp->control)
 647			atomic_inc(&cp->control->in_pkts);
 648		else
 649			atomic_inc(&cp->in_pkts);
 650		ip_vs_conn_put(cp);
 651		return ret;
 652	}
 653
 654	/*
 655	 * When the virtual ftp service is presented, packets destined
 656	 * for other services on the VIP may get here (except services
 657	 * listed in the ipvs table), pass the packets, because it is
 658	 * not ipvs job to decide to drop the packets.
 659	 */
 660	if (svc->port == FTPPORT && dport != FTPPORT)
 661		return NF_ACCEPT;
 662
 663	if (unlikely(ip_vs_iph_icmp(iph)))
 664		return NF_DROP;
 665
 666	/*
 667	 * Notify the client that the destination is unreachable, and
 668	 * release the socket buffer.
 669	 * Since it is in IP layer, the TCP socket is not actually
 670	 * created, the TCP RST packet cannot be sent, instead that
 671	 * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ
 672	 */
 673#ifdef CONFIG_IP_VS_IPV6
 674	if (svc->af == AF_INET6) {
 675		if (!skb->dev)
 676			skb->dev = net->loopback_dev;
 677		icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
 678	} else
 679#endif
 680		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
 681
 682	return NF_DROP;
 683}
 684
 685#ifdef CONFIG_SYSCTL
 686
 687static int sysctl_snat_reroute(struct netns_ipvs *ipvs)
 688{
 689	return ipvs->sysctl_snat_reroute;
 690}
 691
 692static int sysctl_nat_icmp_send(struct netns_ipvs *ipvs)
 693{
 694	return ipvs->sysctl_nat_icmp_send;
 695}
 696
 697static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs)
 698{
 699	return ipvs->sysctl_expire_nodest_conn;
 700}
 701
 702#else
 703
 704static int sysctl_snat_reroute(struct netns_ipvs *ipvs) { return 0; }
 705static int sysctl_nat_icmp_send(struct netns_ipvs *ipvs) { return 0; }
 706static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) { return 0; }
 707
 708#endif
 709
 710__sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
 711{
 712	return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
 713}
 714
 715static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum)
 716{
 717	if (NF_INET_LOCAL_IN == hooknum)
 718		return IP_DEFRAG_VS_IN;
 719	if (NF_INET_FORWARD == hooknum)
 720		return IP_DEFRAG_VS_FWD;
 721	return IP_DEFRAG_VS_OUT;
 722}
 723
 724static inline int ip_vs_gather_frags(struct netns_ipvs *ipvs,
 725				     struct sk_buff *skb, u_int32_t user)
 726{
 727	int err;
 728
 729	local_bh_disable();
 730	err = ip_defrag(ipvs->net, skb, user);
 731	local_bh_enable();
 732	if (!err)
 733		ip_send_check(ip_hdr(skb));
 734
 735	return err;
 736}
 737
 738static int ip_vs_route_me_harder(struct netns_ipvs *ipvs, int af,
 739				 struct sk_buff *skb, unsigned int hooknum)
 740{
 741	if (!sysctl_snat_reroute(ipvs))
 742		return 0;
 743	/* Reroute replies only to remote clients (FORWARD and LOCAL_OUT) */
 744	if (NF_INET_LOCAL_IN == hooknum)
 745		return 0;
 746#ifdef CONFIG_IP_VS_IPV6
 747	if (af == AF_INET6) {
 748		struct dst_entry *dst = skb_dst(skb);
 749
 750		if (dst->dev && !(dst->dev->flags & IFF_LOOPBACK) &&
 751		    ip6_route_me_harder(ipvs->net, skb) != 0)
 752			return 1;
 753	} else
 754#endif
 755		if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
 756		    ip_route_me_harder(ipvs->net, skb, RTN_LOCAL) != 0)
 757			return 1;
 758
 759	return 0;
 760}
 761
 762/*
 763 * Packet has been made sufficiently writable in caller
 764 * - inout: 1=in->out, 0=out->in
 765 */
 766void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
 767		    struct ip_vs_conn *cp, int inout)
 768{
 769	struct iphdr *iph	 = ip_hdr(skb);
 770	unsigned int icmp_offset = iph->ihl*4;
 771	struct icmphdr *icmph	 = (struct icmphdr *)(skb_network_header(skb) +
 772						      icmp_offset);
 773	struct iphdr *ciph	 = (struct iphdr *)(icmph + 1);
 774
 775	if (inout) {
 776		iph->saddr = cp->vaddr.ip;
 777		ip_send_check(iph);
 778		ciph->daddr = cp->vaddr.ip;
 779		ip_send_check(ciph);
 780	} else {
 781		iph->daddr = cp->daddr.ip;
 782		ip_send_check(iph);
 783		ciph->saddr = cp->daddr.ip;
 784		ip_send_check(ciph);
 785	}
 786
 787	/* the TCP/UDP/SCTP port */
 788	if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol ||
 789	    IPPROTO_SCTP == ciph->protocol) {
 790		__be16 *ports = (void *)ciph + ciph->ihl*4;
 791
 792		if (inout)
 793			ports[1] = cp->vport;
 794		else
 795			ports[0] = cp->dport;
 796	}
 797
 798	/* And finally the ICMP checksum */
 799	icmph->checksum = 0;
 800	icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset);
 801	skb->ip_summed = CHECKSUM_UNNECESSARY;
 802
 803	if (inout)
 804		IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph,
 805			"Forwarding altered outgoing ICMP");
 806	else
 807		IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph,
 808			"Forwarding altered incoming ICMP");
 809}
 810
 811#ifdef CONFIG_IP_VS_IPV6
 812void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
 813		    struct ip_vs_conn *cp, int inout)
 814{
 815	struct ipv6hdr *iph	 = ipv6_hdr(skb);
 816	unsigned int icmp_offset = 0;
 817	unsigned int offs	 = 0; /* header offset*/
 818	int protocol;
 819	struct icmp6hdr *icmph;
 820	struct ipv6hdr *ciph;
 821	unsigned short fragoffs;
 822
 823	ipv6_find_hdr(skb, &icmp_offset, IPPROTO_ICMPV6, &fragoffs, NULL);
 824	icmph = (struct icmp6hdr *)(skb_network_header(skb) + icmp_offset);
 825	offs = icmp_offset + sizeof(struct icmp6hdr);
 826	ciph = (struct ipv6hdr *)(skb_network_header(skb) + offs);
 827
 828	protocol = ipv6_find_hdr(skb, &offs, -1, &fragoffs, NULL);
 829
 830	if (inout) {
 831		iph->saddr = cp->vaddr.in6;
 832		ciph->daddr = cp->vaddr.in6;
 833	} else {
 834		iph->daddr = cp->daddr.in6;
 835		ciph->saddr = cp->daddr.in6;
 836	}
 837
 838	/* the TCP/UDP/SCTP port */
 839	if (!fragoffs && (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol ||
 840			  IPPROTO_SCTP == protocol)) {
 841		__be16 *ports = (void *)(skb_network_header(skb) + offs);
 842
 843		IP_VS_DBG(11, "%s() changed port %d to %d\n", __func__,
 844			      ntohs(inout ? ports[1] : ports[0]),
 845			      ntohs(inout ? cp->vport : cp->dport));
 846		if (inout)
 847			ports[1] = cp->vport;
 848		else
 849			ports[0] = cp->dport;
 850	}
 851
 852	/* And finally the ICMP checksum */
 853	icmph->icmp6_cksum = ~csum_ipv6_magic(&iph->saddr, &iph->daddr,
 854					      skb->len - icmp_offset,
 855					      IPPROTO_ICMPV6, 0);
 856	skb->csum_start = skb_network_header(skb) - skb->head + icmp_offset;
 857	skb->csum_offset = offsetof(struct icmp6hdr, icmp6_cksum);
 858	skb->ip_summed = CHECKSUM_PARTIAL;
 859
 860	if (inout)
 861		IP_VS_DBG_PKT(11, AF_INET6, pp, skb,
 862			      (void *)ciph - (void *)iph,
 863			      "Forwarding altered outgoing ICMPv6");
 864	else
 865		IP_VS_DBG_PKT(11, AF_INET6, pp, skb,
 866			      (void *)ciph - (void *)iph,
 867			      "Forwarding altered incoming ICMPv6");
 868}
 869#endif
 870
 871/* Handle relevant response ICMP messages - forward to the right
 872 * destination host.
 873 */
 874static int handle_response_icmp(int af, struct sk_buff *skb,
 875				union nf_inet_addr *snet,
 876				__u8 protocol, struct ip_vs_conn *cp,
 877				struct ip_vs_protocol *pp,
 878				unsigned int offset, unsigned int ihl,
 879				unsigned int hooknum)
 880{
 881	unsigned int verdict = NF_DROP;
 882
 883	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
 884		goto ignore_cp;
 885
 886	/* Ensure the checksum is correct */
 887	if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
 888		/* Failed checksum! */
 889		IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n",
 890			      IP_VS_DBG_ADDR(af, snet));
 891		goto out;
 892	}
 893
 894	if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol ||
 895	    IPPROTO_SCTP == protocol)
 896		offset += 2 * sizeof(__u16);
 897	if (skb_ensure_writable(skb, offset))
 898		goto out;
 899
 900#ifdef CONFIG_IP_VS_IPV6
 901	if (af == AF_INET6)
 902		ip_vs_nat_icmp_v6(skb, pp, cp, 1);
 903	else
 904#endif
 905		ip_vs_nat_icmp(skb, pp, cp, 1);
 906
 907	if (ip_vs_route_me_harder(cp->ipvs, af, skb, hooknum))
 908		goto out;
 909
 910	/* do the statistics and put it back */
 911	ip_vs_out_stats(cp, skb);
 912
 913	skb->ipvs_property = 1;
 914	if (!(cp->flags & IP_VS_CONN_F_NFCT))
 915		ip_vs_notrack(skb);
 916	else
 917		ip_vs_update_conntrack(skb, cp, 0);
 918
 919ignore_cp:
 920	verdict = NF_ACCEPT;
 921
 922out:
 923	__ip_vs_conn_put(cp);
 924
 925	return verdict;
 926}
 927
 928/*
 929 *	Handle ICMP messages in the inside-to-outside direction (outgoing).
 930 *	Find any that might be relevant, check against existing connections.
 931 *	Currently handles error types - unreachable, quench, ttl exceeded.
 932 */
 933static int ip_vs_out_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb,
 934			  int *related, unsigned int hooknum)
 935{
 936	struct iphdr *iph;
 937	struct icmphdr	_icmph, *ic;
 938	struct iphdr	_ciph, *cih;	/* The ip header contained within the ICMP */
 939	struct ip_vs_iphdr ciph;
 940	struct ip_vs_conn *cp;
 941	struct ip_vs_protocol *pp;
 942	unsigned int offset, ihl;
 943	union nf_inet_addr snet;
 944
 945	*related = 1;
 946
 947	/* reassemble IP fragments */
 948	if (ip_is_fragment(ip_hdr(skb))) {
 949		if (ip_vs_gather_frags(ipvs, skb, ip_vs_defrag_user(hooknum)))
 950			return NF_STOLEN;
 951	}
 952
 953	iph = ip_hdr(skb);
 954	offset = ihl = iph->ihl * 4;
 955	ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
 956	if (ic == NULL)
 957		return NF_DROP;
 958
 959	IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %pI4->%pI4\n",
 960		  ic->type, ntohs(icmp_id(ic)),
 961		  &iph->saddr, &iph->daddr);
 962
 963	/*
 964	 * Work through seeing if this is for us.
 965	 * These checks are supposed to be in an order that means easy
 966	 * things are checked first to speed up processing.... however
 967	 * this means that some packets will manage to get a long way
 968	 * down this stack and then be rejected, but that's life.
 969	 */
 970	if ((ic->type != ICMP_DEST_UNREACH) &&
 971	    (ic->type != ICMP_SOURCE_QUENCH) &&
 972	    (ic->type != ICMP_TIME_EXCEEDED)) {
 973		*related = 0;
 974		return NF_ACCEPT;
 975	}
 976
 977	/* Now find the contained IP header */
 978	offset += sizeof(_icmph);
 979	cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
 980	if (cih == NULL)
 981		return NF_ACCEPT; /* The packet looks wrong, ignore */
 982
 983	pp = ip_vs_proto_get(cih->protocol);
 984	if (!pp)
 985		return NF_ACCEPT;
 986
 987	/* Is the embedded protocol header present? */
 988	if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
 989		     pp->dont_defrag))
 990		return NF_ACCEPT;
 991
 992	IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
 993		      "Checking outgoing ICMP for");
 994
 995	ip_vs_fill_iph_skb_icmp(AF_INET, skb, offset, true, &ciph);
 996
 997	/* The embedded headers contain source and dest in reverse order */
 998	cp = INDIRECT_CALL_1(pp->conn_out_get, ip_vs_conn_out_get_proto,
 999			     ipvs, AF_INET, skb, &ciph);
1000	if (!cp)
1001		return NF_ACCEPT;
1002
1003	snet.ip = iph->saddr;
1004	return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp,
1005				    pp, ciph.len, ihl, hooknum);
1006}
1007
#ifdef CONFIG_IP_VS_IPV6
/*
 * IPv6 counterpart of ip_vs_out_icmp(): look up the connection that an
 * outgoing ICMPv6 error refers to and hand it to handle_response_icmp().
 *
 * *related is set to 0 only for informational ICMPv6 messages; in every
 * other case it is left at 1.
 */
static int ip_vs_out_icmp_v6(struct netns_ipvs *ipvs, struct sk_buff *skb,
			     int *related,  unsigned int hooknum,
			     struct ip_vs_iphdr *ipvsh)
{
	struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */
	struct icmp6hdr icmp_buf, *ic;
	struct ip_vs_protocol *pp;
	struct ip_vs_conn *cp;
	union nf_inet_addr snet;

	*related = 1;
	ic = frag_safe_skb_hp(skb, ipvsh->len, sizeof(icmp_buf), &icmp_buf);
	if (!ic)
		return NF_DROP;

	/*
	 * Work through seeing if this is for us.
	 * These checks are supposed to be in an order that means easy
	 * things are checked first to speed up processing.... however
	 * this means that some packets will manage to get a long way
	 * down this stack and then be rejected, but that's life.
	 */
	if (ic->icmp6_type & ICMPV6_INFOMSG_MASK) {
		*related = 0;
		return NF_ACCEPT;
	}

	/* Fragment header that is before ICMP header tells us that:
	 * it's not an error message since they can't be fragmented.
	 */
	if (ipvsh->flags & IP6_FH_F_FRAG)
		return NF_DROP;

	IP_VS_DBG(8, "Outgoing ICMPv6 (%d,%d) %pI6c->%pI6c\n",
		  ic->icmp6_type, ntohs(icmpv6_id(ic)),
		  &ipvsh->saddr, &ipvsh->daddr);

	/* parse the IPv6 header embedded right after the ICMPv6 header */
	if (!ip_vs_fill_iph_skb_icmp(AF_INET6, skb,
				     ipvsh->len + sizeof(icmp_buf),
				     true, &ciph))
		return NF_ACCEPT; /* The packet looks wrong, ignore */

	pp = ip_vs_proto_get(ciph.protocol);
	if (!pp)
		return NF_ACCEPT;

	/* The embedded headers contain source and dest in reverse order */
	cp = INDIRECT_CALL_1(pp->conn_out_get, ip_vs_conn_out_get_proto,
			     ipvs, AF_INET6, skb, &ciph);
	if (!cp)
		return NF_ACCEPT;

	snet.in6 = ciph.saddr.in6;
	return handle_response_icmp(AF_INET6, skb, &snet, ciph.protocol, cp,
				    pp, ciph.len, sizeof(struct ipv6hdr),
				    hooknum);
}
#endif
1067
1068/*
1069 * Check if sctp chunc is ABORT chunk
1070 */
1071static inline int is_sctp_abort(const struct sk_buff *skb, int nh_len)
1072{
1073	struct sctp_chunkhdr *sch, schunk;
1074	sch = skb_header_pointer(skb, nh_len + sizeof(struct sctphdr),
1075				 sizeof(schunk), &schunk);
1076	if (sch == NULL)
1077		return 0;
1078	if (sch->type == SCTP_CID_ABORT)
1079		return 1;
1080	return 0;
1081}
1082
1083static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len)
1084{
1085	struct tcphdr _tcph, *th;
1086
1087	th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph);
1088	if (th == NULL)
1089		return 0;
1090	return th->rst;
1091}
1092
1093static inline bool is_new_conn(const struct sk_buff *skb,
1094			       struct ip_vs_iphdr *iph)
1095{
1096	switch (iph->protocol) {
1097	case IPPROTO_TCP: {
1098		struct tcphdr _tcph, *th;
1099
1100		th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
1101		if (th == NULL)
1102			return false;
1103		return th->syn;
1104	}
1105	case IPPROTO_SCTP: {
1106		struct sctp_chunkhdr *sch, schunk;
1107
1108		sch = skb_header_pointer(skb, iph->len + sizeof(struct sctphdr),
1109					 sizeof(schunk), &schunk);
1110		if (sch == NULL)
1111			return false;
1112		return sch->type == SCTP_CID_INIT;
1113	}
1114	default:
1115		return false;
1116	}
1117}
1118
1119static inline bool is_new_conn_expected(const struct ip_vs_conn *cp,
1120					int conn_reuse_mode)
1121{
1122	/* Controlled (FTP DATA or persistence)? */
1123	if (cp->control)
1124		return false;
1125
1126	switch (cp->protocol) {
1127	case IPPROTO_TCP:
1128		return (cp->state == IP_VS_TCP_S_TIME_WAIT) ||
1129		       (cp->state == IP_VS_TCP_S_CLOSE) ||
1130			((conn_reuse_mode & 2) &&
1131			 (cp->state == IP_VS_TCP_S_FIN_WAIT) &&
1132			 (cp->flags & IP_VS_CONN_F_NOOUTPUT));
1133	case IPPROTO_SCTP:
1134		return cp->state == IP_VS_SCTP_S_CLOSED;
1135	default:
1136		return false;
1137	}
1138}
1139
/* Generic function to create new connections for outgoing RS packets
 *
 * Pre-requisites for successful connection creation:
 * 1) Virtual Service is NOT fwmark based:
 *    In fwmark-VS actual vaddr and vport are unknown to IPVS
 * 2) Real Server and Virtual Service were NOT configured without port:
 *    This is to allow match of different VS to the same RS ip-addr
 *
 * @svc:   virtual service; supplies vaddr/vport, flags, netmask, timeout
 * @dest:  real server the new connection (and template) is bound to
 * @skb:   outgoing packet, handed to ip_vs_conn_fill_param_persist()
 * @iph:   parsed IP header; its saddr is used as daddr, its daddr as caddr
 * @dport: real server port for the new connection
 * @cport: client port for the new connection
 *
 * Returns the new connection, or NULL when a pre-requisite is not met or
 * when filling the persistence parameters, creating the template or
 * creating the connection fails.
 */
struct ip_vs_conn *ip_vs_new_conn_out(struct ip_vs_service *svc,
				      struct ip_vs_dest *dest,
				      struct sk_buff *skb,
				      const struct ip_vs_iphdr *iph,
				      __be16 dport,
				      __be16 cport)
{
	struct ip_vs_conn_param param;
	struct ip_vs_conn *ct = NULL, *cp = NULL;
	const union nf_inet_addr *vaddr, *daddr, *caddr;
	union nf_inet_addr snet;
	__be16 vport;
	unsigned int flags;

	EnterFunction(12);
	/* The packet goes from real server to client: its source address
	 * is the real server (daddr), its destination is the client (caddr).
	 */
	vaddr = &svc->addr;
	vport = svc->port;
	daddr = &iph->saddr;
	caddr = &iph->daddr;

	/* check pre-requisites are satisfied */
	if (svc->fwmark)
		return NULL;
	if (!vport || !dport)
		return NULL;

	/* for persistent service first create connection template */
	if (svc->flags & IP_VS_SVC_F_PERSISTENT) {
		/* apply netmask the same way ingress-side does */
#ifdef CONFIG_IP_VS_IPV6
		if (svc->af == AF_INET6)
			ipv6_addr_prefix(&snet.in6, &caddr->in6,
					 (__force __u32)svc->netmask);
		else
#endif
			snet.ip = caddr->ip & svc->netmask;
		/* fill params and create template if not existent */
		if (ip_vs_conn_fill_param_persist(svc, skb, iph->protocol,
						  &snet, 0, vaddr,
						  vport, &param) < 0)
			return NULL;
		ct = ip_vs_ct_in_get(&param);
		/* check if template exists and points to the same dest */
		if (!ct || !ip_vs_check_template(ct, dest)) {
			ct = ip_vs_conn_new(&param, dest->af, daddr, dport,
					    IP_VS_CONN_F_TEMPLATE, dest, 0);
			if (!ct) {
				/* no template was created: free pe_data
				 * (presumably set up by
				 * ip_vs_conn_fill_param_persist() - confirm)
				 */
				kfree(param.pe_data);
				return NULL;
			}
			ct->timeout = svc->timeout;
		} else {
			/* existing template is reused, pe_data not needed */
			kfree(param.pe_data);
		}
	}

	/* connection flags */
	flags = ((svc->flags & IP_VS_SVC_F_ONEPACKET) &&
		 iph->protocol == IPPROTO_UDP) ? IP_VS_CONN_F_ONE_PACKET : 0;
	/* create connection */
	ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol,
			      caddr, cport, vaddr, vport, &param);
	cp = ip_vs_conn_new(&param, dest->af, daddr, dport, flags, dest, 0);
	if (!cp) {
		/* release the template obtained above, if any */
		if (ct)
			ip_vs_conn_put(ct);
		return NULL;
	}
	if (ct) {
		/* link the template to the new connection, then release it */
		ip_vs_control_add(cp, ct);
		ip_vs_conn_put(ct);
	}
	ip_vs_conn_stats(cp, svc);

	/* return connection (will be used to handle outgoing packet) */
	IP_VS_DBG_BUF(6, "New connection RS-initiated:%c c:%s:%u v:%s:%u "
		      "d:%s:%u conn->flags:%X conn->refcnt:%d\n",
		      ip_vs_fwd_tag(cp),
		      IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
		      IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
		      IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport),
		      cp->flags, refcount_read(&cp->refcnt));
	LeaveFunction(12);
	return cp;
}
1233
1234/* Handle outgoing packets which are considered requests initiated by
1235 * real servers, so that subsequent responses from external client can be
1236 * routed to the right real server.
1237 * Used also for outgoing responses in OPS mode.
1238 *
1239 * Connection management is handled by persistent-engine specific callback.
1240 */
1241static struct ip_vs_conn *__ip_vs_rs_conn_out(unsigned int hooknum,
1242					      struct netns_ipvs *ipvs,
1243					      int af, struct sk_buff *skb,
1244					      const struct ip_vs_iphdr *iph)
1245{
1246	struct ip_vs_dest *dest;
1247	struct ip_vs_conn *cp = NULL;
1248	__be16 _ports[2], *pptr;
1249
1250	if (hooknum == NF_INET_LOCAL_IN)
1251		return NULL;
1252
1253	pptr = frag_safe_skb_hp(skb, iph->len,
1254				sizeof(_ports), _ports);
1255	if (!pptr)
1256		return NULL;
1257
1258	dest = ip_vs_find_real_service(ipvs, af, iph->protocol,
1259				       &iph->saddr, pptr[0]);
1260	if (dest) {
1261		struct ip_vs_service *svc;
1262		struct ip_vs_pe *pe;
1263
1264		svc = rcu_dereference(dest->svc);
1265		if (svc) {
1266			pe = rcu_dereference(svc->pe);
1267			if (pe && pe->conn_out)
1268				cp = pe->conn_out(svc, dest, skb, iph,
1269						  pptr[0], pptr[1]);
1270		}
1271	}
1272
1273	return cp;
1274}
1275
/* Handle response packets: rewrite addresses and send away...
 *
 * @af:      address family of the packet (AF_INET or AF_INET6)
 * @skb:     response packet to mangle
 * @pd:      protocol data; pd->pp supplies the optional snat_handler
 * @cp:      connection the packet belongs to; released with
 *           ip_vs_conn_put() on every path out of this function
 * @iph:     parsed IP header information for @skb
 * @hooknum: netfilter hook number, passed to ip_vs_route_me_harder()
 *
 * Returns NF_ACCEPT after the source address was rewritten to cp->vaddr,
 * or NF_STOLEN after the skb was freed because making it writable, the
 * protocol SNAT handler or the re-routing failed.
 */
static unsigned int
handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
		struct ip_vs_conn *cp, struct ip_vs_iphdr *iph,
		unsigned int hooknum)
{
	struct ip_vs_protocol *pp = pd->pp;

	IP_VS_DBG_PKT(11, af, pp, skb, iph->off, "Outgoing packet");

	/* the IP header is rewritten below, so it must be writable */
	if (skb_ensure_writable(skb, iph->len))
		goto drop;

	/* mangle the packet */
	if (pp->snat_handler &&
	    !SNAT_CALL(pp->snat_handler, skb, pp, cp, iph))
		goto drop;

#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6)
		ipv6_hdr(skb)->saddr = cp->vaddr.in6;
	else
#endif
	{
		/* IPv4: the header checksum is recomputed after the rewrite */
		ip_hdr(skb)->saddr = cp->vaddr.ip;
		ip_send_check(ip_hdr(skb));
	}

	/*
	 * nf_iterate does not expect change in the skb->dst->dev.
	 * It looks like it is not fatal to enable this code for hooks
	 * where our handlers are at the end of the chain list and
	 * when all next handlers use skb->dst->dev and not outdev.
	 * It will definitely route properly the inout NAT traffic
	 * when multiple paths are used.
	 */

	/* For policy routing, packets originating from this
	 * machine itself may be routed differently to packets
	 * passing through.  We want this packet to be routed as
	 * if it came from this machine itself.  So re-compute
	 * the routing information.
	 */
	if (ip_vs_route_me_harder(cp->ipvs, af, skb, hooknum))
		goto drop;

	IP_VS_DBG_PKT(10, af, pp, skb, iph->off, "After SNAT");

	ip_vs_out_stats(cp, skb);
	ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd);
	/* mark the skb so that ip_vs_out() accepts it without processing */
	skb->ipvs_property = 1;
	if (!(cp->flags & IP_VS_CONN_F_NFCT))
		ip_vs_notrack(skb);
	else
		ip_vs_update_conntrack(skb, cp, 0);
	ip_vs_conn_put(cp);

	LeaveFunction(11);
	return NF_ACCEPT;

drop:
	/* the skb is freed here, hence NF_STOLEN rather than NF_DROP */
	ip_vs_conn_put(cp);
	kfree_skb(skb);
	LeaveFunction(11);
	return NF_STOLEN;
}
1343
/*
 *	Check if outgoing packet belongs to the established ip_vs_conn.
 *
 *	@ipvs:    IPVS state used for all lookups
 *	@hooknum: netfilter hook the packet was seen on
 *	@skb:     packet being processed
 *	@af:      address family (AF_INET or AF_INET6)
 *
 *	Processing order:
 *	 1) ICMP/ICMPv6 packets go to ip_vs_out_icmp()/ip_vs_out_icmp_v6();
 *	    their verdict is returned when they report the packet as related.
 *	 2) IPv4 fragments are gathered unless the protocol sets dont_defrag.
 *	 3) A packet matching an existing connection is passed to
 *	    handle_response() if the connection uses the MASQ forwarding
 *	    method, otherwise the connection is released and the packet
 *	    accepted.
 *	 4) For UDP, while conn_out_counter is non-zero, a connection may be
 *	    created via __ip_vs_rs_conn_out().
 *	 5) With sysctl_nat_icmp_send() enabled, a TCP/UDP/SCTP packet coming
 *	    from a known real service without connection entry is answered
 *	    with a port unreachable error and dropped, unless it is a TCP
 *	    RST or an SCTP ABORT.
 *
 *	Returns a netfilter verdict (NF_ACCEPT, NF_DROP or NF_STOLEN).
 */
static unsigned int
ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int af)
{
	struct ip_vs_iphdr iph;
	struct ip_vs_protocol *pp;
	struct ip_vs_proto_data *pd;
	struct ip_vs_conn *cp;
	struct sock *sk;

	EnterFunction(11);

	/* Already marked as IPVS request or reply? */
	if (skb->ipvs_property)
		return NF_ACCEPT;

	sk = skb_to_full_sk(skb);
	/* Bad... Do not break raw sockets */
	if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT &&
		     af == AF_INET)) {

		if (sk->sk_family == PF_INET && inet_sk(sk)->nodefrag)
			return NF_ACCEPT;
	}

	if (unlikely(!skb_dst(skb)))
		return NF_ACCEPT;

	if (!ipvs->enable)
		return NF_ACCEPT;

	ip_vs_fill_iph_skb(af, skb, false, &iph);
#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6) {
		if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
			int related;
			int verdict = ip_vs_out_icmp_v6(ipvs, skb, &related,
							hooknum, &iph);

			/* otherwise fall through to the generic handling */
			if (related)
				return verdict;
		}
	} else
#endif
		if (unlikely(iph.protocol == IPPROTO_ICMP)) {
			int related;
			int verdict = ip_vs_out_icmp(ipvs, skb, &related, hooknum);

			/* otherwise fall through to the generic handling */
			if (related)
				return verdict;
		}

	pd = ip_vs_proto_data_get(ipvs, iph.protocol);
	if (unlikely(!pd))
		return NF_ACCEPT;
	pp = pd->pp;

	/* reassemble IP fragments */
#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET)
#endif
		if (unlikely(ip_is_fragment(ip_hdr(skb)) && !pp->dont_defrag)) {
			if (ip_vs_gather_frags(ipvs, skb,
					       ip_vs_defrag_user(hooknum)))
				return NF_STOLEN;

			/* header information is parsed again after gathering */
			ip_vs_fill_iph_skb(AF_INET, skb, false, &iph);
		}

	/*
	 * Check if the packet belongs to an existing entry
	 */
	cp = INDIRECT_CALL_1(pp->conn_out_get, ip_vs_conn_out_get_proto,
			     ipvs, af, skb, &iph);

	if (likely(cp)) {
		/* only connections using the MASQ method are rewritten here */
		if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
			goto ignore_cp;
		return handle_response(af, skb, pd, cp, &iph, hooknum);
	}

	/* Check for real-server-started requests */
	if (atomic_read(&ipvs->conn_out_counter)) {
		/* Currently only for UDP:
		 * connection oriented protocols typically use
		 * ephemeral ports for outgoing connections, so
		 * related incoming responses would not match any VS
		 */
		if (pp->protocol == IPPROTO_UDP) {
			cp = __ip_vs_rs_conn_out(hooknum, ipvs, af, skb, &iph);
			if (likely(cp))
				return handle_response(af, skb, pd, cp, &iph,
						       hooknum);
		}
	}

	if (sysctl_nat_icmp_send(ipvs) &&
	    (pp->protocol == IPPROTO_TCP ||
	     pp->protocol == IPPROTO_UDP ||
	     pp->protocol == IPPROTO_SCTP)) {
		__be16 _ports[2], *pptr;

		pptr = frag_safe_skb_hp(skb, iph.len,
					 sizeof(_ports), _ports);
		if (pptr == NULL)
			return NF_ACCEPT;	/* Not for me */
		/* pptr[0] is the source port of the outgoing packet */
		if (ip_vs_has_real_service(ipvs, af, iph.protocol, &iph.saddr,
					   pptr[0])) {
			/*
			 * Notify the real server: there is no
			 * existing entry if it is not RST
			 * packet or not TCP packet.
			 */
			if ((iph.protocol != IPPROTO_TCP &&
			     iph.protocol != IPPROTO_SCTP)
			     || ((iph.protocol == IPPROTO_TCP
				  && !is_tcp_reset(skb, iph.len))
				 || (iph.protocol == IPPROTO_SCTP
					&& !is_sctp_abort(skb,
						iph.len)))) {
#ifdef CONFIG_IP_VS_IPV6
				if (af == AF_INET6) {
					/* use the loopback device when the
					 * skb has no device set
					 */
					if (!skb->dev)
						skb->dev = ipvs->net->loopback_dev;
					icmpv6_send(skb,
						    ICMPV6_DEST_UNREACH,
						    ICMPV6_PORT_UNREACH,
						    0);
				} else
#endif
					icmp_send(skb,
						  ICMP_DEST_UNREACH,
						  ICMP_PORT_UNREACH, 0);
				return NF_DROP;
			}
		}
	}

out:
	IP_VS_DBG_PKT(12, af, pp, skb, iph.off,
		      "ip_vs_out: packet continues traversal as normal");
	return NF_ACCEPT;

ignore_cp:
	/* release the connection found above, then accept the packet */
	__ip_vs_conn_put(cp);
	goto out;
}
1493
1494/*
1495 *	It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain,
1496 *	used only for VS/NAT.
1497 *	Check if packet is reply for established ip_vs_conn.
1498 */
1499static unsigned int
1500ip_vs_reply4(void *priv, struct sk_buff *skb,
1501	     const struct nf_hook_state *state)
1502{
1503	return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET);
1504}
1505
1506/*
1507 *	It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
1508 *	Check if packet is reply for established ip_vs_conn.
1509 */
1510static unsigned int
1511ip_vs_local_reply4(void *priv, struct sk_buff *skb,
1512		   const struct nf_hook_state *state)
1513{
1514	return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET);
1515}
1516
1517#ifdef CONFIG_IP_VS_IPV6
1518
1519/*
1520 *	It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain,
1521 *	used only for VS/NAT.
1522 *	Check if packet is reply for established ip_vs_conn.
1523 */
1524static unsigned int
1525ip_vs_reply6(void *priv, struct sk_buff *skb,
1526	     const struct nf_hook_state *state)
1527{
1528	return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET6);
1529}
1530
1531/*
1532 *	It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
1533 *	Check if packet is reply for established ip_vs_conn.
1534 */
1535static unsigned int
1536ip_vs_local_reply6(void *priv, struct sk_buff *skb,
1537		   const struct nf_hook_state *state)
1538{
1539	return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET6);
1540}
1541
1542#endif
1543
1544static unsigned int
1545ip_vs_try_to_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb,
1546		      struct ip_vs_proto_data *pd,
1547		      int *verdict, struct ip_vs_conn **cpp,
1548		      struct ip_vs_iphdr *iph)
1549{
1550	struct ip_vs_protocol *pp = pd->pp;
1551
1552	if (!iph->fragoffs) {
1553		/* No (second) fragments need to enter here, as nf_defrag_ipv6
1554		 * replayed fragment zero will already have created the cp
1555		 */
1556
1557		/* Schedule and create new connection entry into cpp */
1558		if (!pp->conn_schedule(ipvs, af, skb, pd, verdict, cpp, iph))
1559			return 0;
1560	}
1561
1562	if (unlikely(!*cpp)) {
1563		/* sorry, all this trouble for a no-hit :) */
1564		IP_VS_DBG_PKT(12, af, pp, skb, iph->off,
1565			      "ip_vs_in: packet continues traversal as normal");
1566
1567		/* Fragment couldn't be mapped to a conn entry */
1568		if (iph->fragoffs)
1569			IP_VS_DBG_PKT(7, af, pp, skb, iph->off,
1570				      "unhandled fragment");
1571
1572		*verdict = NF_ACCEPT;
1573		return 0;
1574	}
1575
1576	return 1;
1577}
1578
1579/* Check the UDP tunnel and return its header length */
1580static int ipvs_udp_decap(struct netns_ipvs *ipvs, struct sk_buff *skb,
1581			  unsigned int offset, __u16 af,
1582			  const union nf_inet_addr *daddr, __u8 *proto)
1583{
1584	struct udphdr _udph, *udph;
1585	struct ip_vs_dest *dest;
1586
1587	udph = skb_header_pointer(skb, offset, sizeof(_udph), &_udph);
1588	if (!udph)
1589		goto unk;
1590	offset += sizeof(struct udphdr);
1591	dest = ip_vs_find_tunnel(ipvs, af, daddr, udph->dest);
1592	if (!dest)
1593		goto unk;
1594	if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
1595		struct guehdr _gueh, *gueh;
1596
1597		gueh = skb_header_pointer(skb, offset, sizeof(_gueh), &_gueh);
1598		if (!gueh)
1599			goto unk;
1600		if (gueh->control != 0 || gueh->version != 0)
1601			goto unk;
1602		/* Later we can support also IPPROTO_IPV6 */
1603		if (gueh->proto_ctype != IPPROTO_IPIP)
1604			goto unk;
1605		*proto = gueh->proto_ctype;
1606		return sizeof(struct udphdr) + sizeof(struct guehdr) +
1607		       (gueh->hlen << 2);
1608	}
1609
1610unk:
1611	return 0;
1612}
1613
1614/* Check the GRE tunnel and return its header length */
1615static int ipvs_gre_decap(struct netns_ipvs *ipvs, struct sk_buff *skb,
1616			  unsigned int offset, __u16 af,
1617			  const union nf_inet_addr *daddr, __u8 *proto)
1618{
1619	struct gre_base_hdr _greh, *greh;
1620	struct ip_vs_dest *dest;
1621
1622	greh = skb_header_pointer(skb, offset, sizeof(_greh), &_greh);
1623	if (!greh)
1624		goto unk;
1625	dest = ip_vs_find_tunnel(ipvs, af, daddr, 0);
1626	if (!dest)
1627		goto unk;
1628	if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
1629		__be16 type;
1630
1631		/* Only support version 0 and C (csum) */
1632		if ((greh->flags & ~GRE_CSUM) != 0)
1633			goto unk;
1634		type = greh->protocol;
1635		/* Later we can support also IPPROTO_IPV6 */
1636		if (type != htons(ETH_P_IP))
1637			goto unk;
1638		*proto = IPPROTO_IPIP;
1639		return gre_calc_hlen(gre_flags_to_tnl_flags(greh->flags));
1640	}
1641
1642unk:
1643	return 0;
1644}
1645
1646/*
1647 *	Handle ICMP messages in the outside-to-inside direction (incoming).
1648 *	Find any that might be relevant, check against existing connections,
1649 *	forward to the right destination host if relevant.
1650 *	Currently handles error types - unreachable, quench, ttl exceeded.
1651 */
1652static int
1653ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
1654	      unsigned int hooknum)
1655{
1656	struct iphdr *iph;
1657	struct icmphdr	_icmph, *ic;
1658	struct iphdr	_ciph, *cih;	/* The ip header contained within the ICMP */
1659	struct ip_vs_iphdr ciph;
1660	struct ip_vs_conn *cp;
1661	struct ip_vs_protocol *pp;
1662	struct ip_vs_proto_data *pd;
1663	unsigned int offset, offset2, ihl, verdict;
1664	bool tunnel, new_cp = false;
1665	union nf_inet_addr *raddr;
1666	char *outer_proto = "IPIP";
1667
1668	*related = 1;
1669
1670	/* reassemble IP fragments */
1671	if (ip_is_fragment(ip_hdr(skb))) {
1672		if (ip_vs_gather_frags(ipvs, skb, ip_vs_defrag_user(hooknum)))
1673			return NF_STOLEN;
1674	}
1675
1676	iph = ip_hdr(skb);
1677	offset = ihl = iph->ihl * 4;
1678	ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
1679	if (ic == NULL)
1680		return NF_DROP;
1681
1682	IP_VS_DBG(12, "Incoming ICMP (%d,%d) %pI4->%pI4\n",
1683		  ic->type, ntohs(icmp_id(ic)),
1684		  &iph->saddr, &iph->daddr);
1685
1686	/*
1687	 * Work through seeing if this is for us.
1688	 * These checks are supposed to be in an order that means easy
1689	 * things are checked first to speed up processing.... however
1690	 * this means that some packets will manage to get a long way
1691	 * down this stack and then be rejected, but that's life.
1692	 */
1693	if ((ic->type != ICMP_DEST_UNREACH) &&
1694	    (ic->type != ICMP_SOURCE_QUENCH) &&
1695	    (ic->type != ICMP_TIME_EXCEEDED)) {
1696		*related = 0;
1697		return NF_ACCEPT;
1698	}
1699
1700	/* Now find the contained IP header */
1701	offset += sizeof(_icmph);
1702	cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
1703	if (cih == NULL)
1704		return NF_ACCEPT; /* The packet looks wrong, ignore */
1705	raddr = (union nf_inet_addr *)&cih->daddr;
1706
1707	/* Special case for errors for IPIP/UDP/GRE tunnel packets */
1708	tunnel = false;
1709	if (cih->protocol == IPPROTO_IPIP) {
1710		struct ip_vs_dest *dest;
1711
1712		if (unlikely(cih->frag_off & htons(IP_OFFSET)))
1713			return NF_ACCEPT;
1714		/* Error for our IPIP must arrive at LOCAL_IN */
1715		if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL))
1716			return NF_ACCEPT;
1717		dest = ip_vs_find_tunnel(ipvs, AF_INET, raddr, 0);
1718		/* Only for known tunnel */
1719		if (!dest || dest->tun_type != IP_VS_CONN_F_TUNNEL_TYPE_IPIP)
1720			return NF_ACCEPT;
1721		offset += cih->ihl * 4;
1722		cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
1723		if (cih == NULL)
1724			return NF_ACCEPT; /* The packet looks wrong, ignore */
1725		tunnel = true;
1726	} else if ((cih->protocol == IPPROTO_UDP ||	/* Can be UDP encap */
1727		    cih->protocol == IPPROTO_GRE) &&	/* Can be GRE encap */
1728		   /* Error for our tunnel must arrive at LOCAL_IN */
1729		   (skb_rtable(skb)->rt_flags & RTCF_LOCAL)) {
1730		__u8 iproto;
1731		int ulen;
1732
1733		/* Non-first fragment has no UDP/GRE header */
1734		if (unlikely(cih->frag_off & htons(IP_OFFSET)))
1735			return NF_ACCEPT;
1736		offset2 = offset + cih->ihl * 4;
1737		if (cih->protocol == IPPROTO_UDP) {
1738			ulen = ipvs_udp_decap(ipvs, skb, offset2, AF_INET,
1739					      raddr, &iproto);
1740			outer_proto = "UDP";
1741		} else {
1742			ulen = ipvs_gre_decap(ipvs, skb, offset2, AF_INET,
1743					      raddr, &iproto);
1744			outer_proto = "GRE";
1745		}
1746		if (ulen > 0) {
1747			/* Skip IP and UDP/GRE tunnel headers */
1748			offset = offset2 + ulen;
1749			/* Now we should be at the original IP header */
1750			cih = skb_header_pointer(skb, offset, sizeof(_ciph),
1751						 &_ciph);
1752			if (cih && cih->version == 4 && cih->ihl >= 5 &&
1753			    iproto == IPPROTO_IPIP)
1754				tunnel = true;
1755			else
1756				return NF_ACCEPT;
1757		}
1758	}
1759
1760	pd = ip_vs_proto_data_get(ipvs, cih->protocol);
1761	if (!pd)
1762		return NF_ACCEPT;
1763	pp = pd->pp;
1764
1765	/* Is the embedded protocol header present? */
1766	if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
1767		     pp->dont_defrag))
1768		return NF_ACCEPT;
1769
1770	IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
1771		      "Checking incoming ICMP for");
1772
1773	offset2 = offset;
1774	ip_vs_fill_iph_skb_icmp(AF_INET, skb, offset, !tunnel, &ciph);
1775	offset = ciph.len;
1776
1777	/* The embedded headers contain source and dest in reverse order.
1778	 * For IPIP/UDP/GRE tunnel this is error for request, not for reply.
1779	 */
1780	cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto,
1781			     ipvs, AF_INET, skb, &ciph);
1782
1783	if (!cp) {
1784		int v;
1785
1786		if (tunnel || !sysctl_schedule_icmp(ipvs))
1787			return NF_ACCEPT;
1788
1789		if (!ip_vs_try_to_schedule(ipvs, AF_INET, skb, pd, &v, &cp, &ciph))
1790			return v;
1791		new_cp = true;
1792	}
1793
1794	verdict = NF_DROP;
1795
1796	/* Ensure the checksum is correct */
1797	if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
1798		/* Failed checksum! */
1799		IP_VS_DBG(1, "Incoming ICMP: failed checksum from %pI4!\n",
1800			  &iph->saddr);
1801		goto out;
1802	}
1803
1804	if (tunnel) {
1805		__be32 info = ic->un.gateway;
1806		__u8 type = ic->type;
1807		__u8 code = ic->code;
1808
1809		/* Update the MTU */
1810		if (ic->type == ICMP_DEST_UNREACH &&
1811		    ic->code == ICMP_FRAG_NEEDED) {
1812			struct ip_vs_dest *dest = cp->dest;
1813			u32 mtu = ntohs(ic->un.frag.mtu);
1814			__be16 frag_off = cih->frag_off;
1815
1816			/* Strip outer IP and ICMP, go to IPIP/UDP/GRE header */
1817			if (pskb_pull(skb, ihl + sizeof(_icmph)) == NULL)
1818				goto ignore_tunnel;
1819			offset2 -= ihl + sizeof(_icmph);
1820			skb_reset_network_header(skb);
1821			IP_VS_DBG(12, "ICMP for %s %pI4->%pI4: mtu=%u\n",
1822				  outer_proto, &ip_hdr(skb)->saddr,
1823				  &ip_hdr(skb)->daddr, mtu);
1824			ipv4_update_pmtu(skb, ipvs->net, mtu, 0, 0);
1825			/* Client uses PMTUD? */
1826			if (!(frag_off & htons(IP_DF)))
1827				goto ignore_tunnel;
1828			/* Prefer the resulting PMTU …

Large files are truncated, but you can click here to view the full file