
/net/netfilter/ipvs/ip_vs_lblc.c

http://github.com/mirrors/linux
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * IPVS:        Locality-Based Least-Connection scheduling module
 *
 * Authors:     Wensong Zhang <wensong@gnuchina.org>
 *
 * Changes:
 *     Martin Hamilton         :    fixed the terrible locking bugs
 *                                   *lock(tbl->lock) ==> *lock(&tbl->lock)
 *     Wensong Zhang           :    fixed the uninitialized tbl->lock bug
 *     Wensong Zhang           :    added doing full expiration check to
 *                                   collect stale entries of 24+ hours when
 *                                   no partial expire check in a half hour
 *     Julian Anastasov        :    replaced del_timer call with del_timer_sync
 *                                   to avoid the possible race between timer
 *                                   handler and del_timer thread in SMP
 */

/*
 * The lblc algorithm is as follows (pseudo code):
 *
 *       if cachenode[dest_ip] is null then
 *               n, cachenode[dest_ip] <- {weighted least-conn node};
 *       else
 *               n <- cachenode[dest_ip];
 *               if (n is dead) OR
 *                  (n.conns>n.weight AND
 *                   there is a node m with m.conns<m.weight/2) then
 *                 n, cachenode[dest_ip] <- {weighted least-conn node};
 *
 *       return n;
 *
 * Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing
 * me to write this module.
 */
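/*
 * In the code below, "n is dead" maps to the IP_VS_DEST_F_AVAILABLE and
 * weight checks in ip_vs_lblc_schedule(), the overload test is implemented
 * by is_overloaded(), and the {weighted least-conn node} selection is done
 * by __ip_vs_lblc_schedule().
 */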

#define KMSG_COMPONENT "IPVS"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/ip.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/jiffies.h>
#include <linux/hash.h>

/* for sysctl */
#include <linux/fs.h>
#include <linux/sysctl.h>

#include <net/ip_vs.h>


/*
 *    It is for garbage collection of stale IPVS lblc entries,
 *    when the table is full.
 */
#define CHECK_EXPIRE_INTERVAL   (60*HZ)
#define ENTRY_TIMEOUT           (6*60*HZ)

#define DEFAULT_EXPIRATION	(24*60*60*HZ)
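/* default lblc_expiration: one full day, expressed in jiffies */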

/*
 *    It is for full expiration check.
 *    When there is no partial expiration check (garbage collection)
 *    in a half hour, do a full expiration check to collect stale
 *    entries that haven't been touched for a day.
 */
#define COUNT_FOR_FULL_EXPIRATION   30


/*
 *     for IPVS lblc entry hash table
 */
#ifndef CONFIG_IP_VS_LBLC_TAB_BITS
#define CONFIG_IP_VS_LBLC_TAB_BITS      10
#endif
#define IP_VS_LBLC_TAB_BITS     CONFIG_IP_VS_LBLC_TAB_BITS
#define IP_VS_LBLC_TAB_SIZE     (1 << IP_VS_LBLC_TAB_BITS)
#define IP_VS_LBLC_TAB_MASK     (IP_VS_LBLC_TAB_SIZE - 1)
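/*
 * With the default CONFIG_IP_VS_LBLC_TAB_BITS of 10 this gives 1024 hash
 * buckets, and ip_vs_lblc_init_svc() below sets max_size to
 * 16 * 1024 = 16384 cached entries per service before the periodic timer
 * starts trimming the table.
 */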


/*
 *      IPVS lblc entry represents an association between destination
 *      IP address and its destination server
 */
struct ip_vs_lblc_entry {
	struct hlist_node	list;
	int			af;		/* address family */
	union nf_inet_addr      addr;           /* destination IP address */
	struct ip_vs_dest	*dest;          /* real server (cache) */
	unsigned long           lastuse;        /* last used time */
	struct rcu_head		rcu_head;
};


/*
 *      IPVS lblc hash table
 */
struct ip_vs_lblc_table {
	struct rcu_head		rcu_head;
	struct hlist_head	bucket[IP_VS_LBLC_TAB_SIZE];  /* hash bucket */
	struct timer_list       periodic_timer; /* collect stale entries */
	struct ip_vs_service	*svc;		/* pointer back to service */
	atomic_t                entries;        /* number of entries */
	int                     max_size;       /* maximum size of entries */
	int                     rover;          /* rover for expire check */
	int                     counter;        /* counter for no expire */
	bool			dead;
};


/*
 *      IPVS LBLC sysctl table
 */
#ifdef CONFIG_SYSCTL
static struct ctl_table vs_vars_table[] = {
	{
		.procname	= "lblc_expiration",
		.data		= NULL,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{ }
};
#endif
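/*
 * The knob above is registered under "net/ipv4/vs" in __ip_vs_lblc_init(),
 * so it shows up as /proc/sys/net/ipv4/vs/lblc_expiration.
 * proc_dointvec_jiffies converts between seconds (userspace) and jiffies
 * (kernel), e.g. "echo 86400 > /proc/sys/net/ipv4/vs/lblc_expiration"
 * keeps the default one-day expiration.
 */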

static void ip_vs_lblc_rcu_free(struct rcu_head *head)
{
	struct ip_vs_lblc_entry *en = container_of(head,
						   struct ip_vs_lblc_entry,
						   rcu_head);

	ip_vs_dest_put_and_free(en->dest);
	kfree(en);
}

static inline void ip_vs_lblc_del(struct ip_vs_lblc_entry *en)
{
	hlist_del_rcu(&en->list);
	call_rcu(&en->rcu_head, ip_vs_lblc_rcu_free);
}

/*
 *	Returns hash value for IPVS LBLC entry
 */
static inline unsigned int
ip_vs_lblc_hashkey(int af, const union nf_inet_addr *addr)
{
	__be32 addr_fold = addr->ip;

#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6)
		addr_fold = addr->ip6[0]^addr->ip6[1]^
			    addr->ip6[2]^addr->ip6[3];
#endif
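	/* the (possibly folded) 32-bit address is hashed down to
	 * IP_VS_LBLC_TAB_BITS bits, i.e. a bucket index */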
	return hash_32(ntohl(addr_fold), IP_VS_LBLC_TAB_BITS);
}


/*
 *	Hash an entry in the ip_vs_lblc_table.
 *	Called under the service's sched_lock (via ip_vs_lblc_new).
 */
static void
ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
{
	unsigned int hash = ip_vs_lblc_hashkey(en->af, &en->addr);

	hlist_add_head_rcu(&en->list, &tbl->bucket[hash]);
	atomic_inc(&tbl->entries);
}


/* Get ip_vs_lblc_entry associated with supplied parameters. */
static inline struct ip_vs_lblc_entry *
ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl,
	       const union nf_inet_addr *addr)
{
	unsigned int hash = ip_vs_lblc_hashkey(af, addr);
	struct ip_vs_lblc_entry *en;

	hlist_for_each_entry_rcu(en, &tbl->bucket[hash], list)
		if (ip_vs_addr_equal(af, &en->addr, addr))
			return en;

	return NULL;
}


/*
 * Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP
 * address to a server. Called under spin lock.
 */
static inline struct ip_vs_lblc_entry *
ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr,
	       u16 af, struct ip_vs_dest *dest)
{
	struct ip_vs_lblc_entry *en;

	en = ip_vs_lblc_get(af, tbl, daddr);
	if (en) {
		if (en->dest == dest)
			return en;
		ip_vs_lblc_del(en);
	}
	en = kmalloc(sizeof(*en), GFP_ATOMIC);
	if (!en)
		return NULL;

	en->af = af;
	ip_vs_addr_copy(af, &en->addr, daddr);
	en->lastuse = jiffies;

	ip_vs_dest_hold(dest);
	en->dest = dest;

	ip_vs_lblc_hash(tbl, en);

	return en;
}


/*
 *      Flush all the entries of the specified table.
 */
static void ip_vs_lblc_flush(struct ip_vs_service *svc)
{
	struct ip_vs_lblc_table *tbl = svc->sched_data;
	struct ip_vs_lblc_entry *en;
	struct hlist_node *next;
	int i;

	spin_lock_bh(&svc->sched_lock);
	tbl->dead = true;
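	/* mark the table dead so ip_vs_lblc_schedule() stops adding new
	 * entries while (and after) we empty the buckets */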
	for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) {
		hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) {
			ip_vs_lblc_del(en);
			atomic_dec(&tbl->entries);
		}
	}
	spin_unlock_bh(&svc->sched_lock);
}

static int sysctl_lblc_expiration(struct ip_vs_service *svc)
{
#ifdef CONFIG_SYSCTL
	return svc->ipvs->sysctl_lblc_expiration;
#else
	return DEFAULT_EXPIRATION;
#endif
}

static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)
{
	struct ip_vs_lblc_table *tbl = svc->sched_data;
	struct ip_vs_lblc_entry *en;
	struct hlist_node *next;
	unsigned long now = jiffies;
	int i, j;

	for (i = 0, j = tbl->rover; i < IP_VS_LBLC_TAB_SIZE; i++) {
		j = (j + 1) & IP_VS_LBLC_TAB_MASK;

		spin_lock(&svc->sched_lock);
		hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
			if (time_before(now,
					en->lastuse +
					sysctl_lblc_expiration(svc)))
				continue;

			ip_vs_lblc_del(en);
			atomic_dec(&tbl->entries);
		}
		spin_unlock(&svc->sched_lock);
	}
	tbl->rover = j;
}


/*
 *      Periodic timer handler for the IPVS lblc table.
 *      It is used to collect stale entries when the number of entries
 *      exceeds the maximum size of the table.
 *
 *      Fixme: we probably need a more complicated algorithm to collect
 *             entries that have not been used for a long time even
 *             if the number of entries doesn't exceed the maximum size
 *             of the table.
 *      The full expiration check is for this purpose now.
 */
static void ip_vs_lblc_check_expire(struct timer_list *t)
{
	struct ip_vs_lblc_table *tbl = from_timer(tbl, t, periodic_timer);
	struct ip_vs_service *svc = tbl->svc;
	unsigned long now = jiffies;
	int goal;
	int i, j;
	struct ip_vs_lblc_entry *en;
	struct hlist_node *next;

	if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
		/* do full expiration check */
		ip_vs_lblc_full_check(svc);
		tbl->counter = 1;
		goto out;
	}

	if (atomic_read(&tbl->entries) <= tbl->max_size) {
		tbl->counter++;
		goto out;
	}

	goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
	if (goal > tbl->max_size/2)
		goal = tbl->max_size/2;
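	/* e.g. with max_size = 16384 and 20000 cached entries:
	 * (20000 - 16384) * 4/3 = 4821 entries to trim, well below the
	 * 16384/2 = 8192 cap, so goal stays at 4821 */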

	for (i = 0, j = tbl->rover; i < IP_VS_LBLC_TAB_SIZE; i++) {
		j = (j + 1) & IP_VS_LBLC_TAB_MASK;

		spin_lock(&svc->sched_lock);
		hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
			if (time_before(now, en->lastuse + ENTRY_TIMEOUT))
				continue;

			ip_vs_lblc_del(en);
			atomic_dec(&tbl->entries);
			goal--;
		}
		spin_unlock(&svc->sched_lock);
		if (goal <= 0)
			break;
	}
	tbl->rover = j;

  out:
	mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
}


static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
{
	int i;
	struct ip_vs_lblc_table *tbl;

	/*
	 *    Allocate the ip_vs_lblc_table for this service
	 */
	tbl = kmalloc(sizeof(*tbl), GFP_KERNEL);
	if (tbl == NULL)
		return -ENOMEM;

	svc->sched_data = tbl;
	IP_VS_DBG(6, "LBLC hash table (memory=%zdbytes) allocated for "
		  "current service\n", sizeof(*tbl));

	/*
	 *    Initialize the hash buckets
	 */
	for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) {
		INIT_HLIST_HEAD(&tbl->bucket[i]);
	}
	tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
	tbl->rover = 0;
	tbl->counter = 1;
	tbl->dead = false;
	tbl->svc = svc;
	atomic_set(&tbl->entries, 0);

	/*
	 *    Hook periodic timer for garbage collection
	 */
	timer_setup(&tbl->periodic_timer, ip_vs_lblc_check_expire, 0);
	mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);

	return 0;
}


static void ip_vs_lblc_done_svc(struct ip_vs_service *svc)
{
	struct ip_vs_lblc_table *tbl = svc->sched_data;

	/* remove periodic timer */
	del_timer_sync(&tbl->periodic_timer);

	/* got to clean up table entries here */
	ip_vs_lblc_flush(svc);

	/* release the table itself */
	kfree_rcu(tbl, rcu_head);
	IP_VS_DBG(6, "LBLC hash table (memory=%zdbytes) released\n",
		  sizeof(*tbl));
}


static inline struct ip_vs_dest *
__ip_vs_lblc_schedule(struct ip_vs_service *svc)
{
	struct ip_vs_dest *dest, *least;
	int loh, doh;

	/*
	 * We use the following formula to estimate the load:
	 *                (dest overhead) / dest->weight
	 *
	 * Remember -- no floats in kernel mode!!!
	 * The comparison of h1*w2 > h2*w1 is equivalent to that of
	 *                h1/w1 > h2/w2
	 * if every weight is larger than zero.
	 *
	 * The server with weight=0 is quiesced and will not receive any
	 * new connection.
	 */
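	/* e.g. least: overhead 10, weight 2 (load 5); dest: overhead 12,
	 * weight 3 (load 4).  Cross-multiplying, 10*3 = 30 > 12*2 = 24,
	 * so dest becomes the new least-loaded candidate below. */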
	list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
			continue;
		if (atomic_read(&dest->weight) > 0) {
			least = dest;
			loh = ip_vs_dest_conn_overhead(least);
			goto nextstage;
		}
	}
	return NULL;

	/*
	 *    Find the destination with the least load.
	 */
  nextstage:
	list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) {
		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
			continue;

		doh = ip_vs_dest_conn_overhead(dest);
		if ((__s64)loh * atomic_read(&dest->weight) >
		    (__s64)doh * atomic_read(&least->weight)) {
			least = dest;
			loh = doh;
		}
	}

	IP_VS_DBG_BUF(6, "LBLC: server %s:%d "
		      "activeconns %d refcnt %d weight %d overhead %d\n",
		      IP_VS_DBG_ADDR(least->af, &least->addr),
		      ntohs(least->port),
		      atomic_read(&least->activeconns),
		      refcount_read(&least->refcnt),
		      atomic_read(&least->weight), loh);

	return least;
}


/*
 *   If this destination server is overloaded and there is a less loaded
 *   server, then return true.
 */
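/*
 * This is the cache-invalidation test from the pseudo code above:
 * n.conns > n.weight AND there is a node m with m.conns < m.weight/2.
 */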
static inline int
is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
{
	if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
		struct ip_vs_dest *d;

		list_for_each_entry_rcu(d, &svc->destinations, n_list) {
			if (atomic_read(&d->activeconns)*2
			    < atomic_read(&d->weight)) {
				return 1;
			}
		}
	}
	return 0;
}


/*
 *    Locality-Based (weighted) Least-Connection scheduling
 */
static struct ip_vs_dest *
ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
		    struct ip_vs_iphdr *iph)
{
	struct ip_vs_lblc_table *tbl = svc->sched_data;
	struct ip_vs_dest *dest = NULL;
	struct ip_vs_lblc_entry *en;

	IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);

	/* First look in our cache */
	en = ip_vs_lblc_get(svc->af, tbl, &iph->daddr);
	if (en) {
		/* We only hold a read lock, but this is atomic */
		en->lastuse = jiffies;

		/*
		 * If the destination is not available, i.e. it's in the trash,
		 * we must ignore it, as it may be removed from under our feet,
		 * if someone drops our reference count. Our caller only makes
		 * sure that destinations, that are not in the trash, are not
		 * moved to the trash, while we are scheduling. But anyone can
		 * free up entries from the trash at any time.
		 */

		dest = en->dest;
		if ((dest->flags & IP_VS_DEST_F_AVAILABLE) &&
		    atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc))
			goto out;
	}

	/* No cache entry or it is invalid, time to schedule */
	dest = __ip_vs_lblc_schedule(svc);
	if (!dest) {
		ip_vs_scheduler_err(svc, "no destination available");
		return NULL;
	}

	/* If we fail to create a cache entry, we'll just use the valid dest */
	spin_lock_bh(&svc->sched_lock);
	if (!tbl->dead)
		ip_vs_lblc_new(tbl, &iph->daddr, svc->af, dest);
	spin_unlock_bh(&svc->sched_lock);

out:
	IP_VS_DBG_BUF(6, "LBLC: destination IP address %s --> server %s:%d\n",
		      IP_VS_DBG_ADDR(svc->af, &iph->daddr),
		      IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port));

	return dest;
}


/*
 *      IPVS LBLC Scheduler structure
 */
static struct ip_vs_scheduler ip_vs_lblc_scheduler = {
	.name =			"lblc",
	.refcnt =		ATOMIC_INIT(0),
	.module =		THIS_MODULE,
	.n_list =		LIST_HEAD_INIT(ip_vs_lblc_scheduler.n_list),
	.init_service =		ip_vs_lblc_init_svc,
	.done_service =		ip_vs_lblc_done_svc,
	.schedule =		ip_vs_lblc_schedule,
};

/*
 *  per netns init.
 */
#ifdef CONFIG_SYSCTL
static int __net_init __ip_vs_lblc_init(struct net *net)
{
	struct netns_ipvs *ipvs = net_ipvs(net);

	if (!ipvs)
		return -ENOENT;

	if (!net_eq(net, &init_net)) {
		ipvs->lblc_ctl_table = kmemdup(vs_vars_table,
						sizeof(vs_vars_table),
						GFP_KERNEL);
		if (ipvs->lblc_ctl_table == NULL)
			return -ENOMEM;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			ipvs->lblc_ctl_table[0].procname = NULL;

	} else
		ipvs->lblc_ctl_table = vs_vars_table;
	ipvs->sysctl_lblc_expiration = DEFAULT_EXPIRATION;
	ipvs->lblc_ctl_table[0].data = &ipvs->sysctl_lblc_expiration;

	ipvs->lblc_ctl_header =
		register_net_sysctl(net, "net/ipv4/vs", ipvs->lblc_ctl_table);
	if (!ipvs->lblc_ctl_header) {
		if (!net_eq(net, &init_net))
			kfree(ipvs->lblc_ctl_table);
		return -ENOMEM;
	}

	return 0;
}

static void __net_exit __ip_vs_lblc_exit(struct net *net)
{
	struct netns_ipvs *ipvs = net_ipvs(net);

	unregister_net_sysctl_table(ipvs->lblc_ctl_header);

	if (!net_eq(net, &init_net))
		kfree(ipvs->lblc_ctl_table);
}

#else

static int __net_init __ip_vs_lblc_init(struct net *net) { return 0; }
static void __net_exit __ip_vs_lblc_exit(struct net *net) { }

#endif

static struct pernet_operations ip_vs_lblc_ops = {
	.init = __ip_vs_lblc_init,
	.exit = __ip_vs_lblc_exit,
};

static int __init ip_vs_lblc_init(void)
{
	int ret;

	ret = register_pernet_subsys(&ip_vs_lblc_ops);
	if (ret)
		return ret;

	ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler);
	if (ret)
		unregister_pernet_subsys(&ip_vs_lblc_ops);
	return ret;
}

static void __exit ip_vs_lblc_cleanup(void)
{
	unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler);
	unregister_pernet_subsys(&ip_vs_lblc_ops);
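	/* wait for any in-flight ip_vs_lblc_rcu_free() callbacks queued by
	 * call_rcu() before the module text goes away */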
	rcu_barrier();
}


module_init(ip_vs_lblc_init);
module_exit(ip_vs_lblc_cleanup);
MODULE_LICENSE("GPL");