/net/netfilter/ipvs/ip_vs_lblc.c

http://github.com/mirrors/linux

// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * IPVS:        Locality-Based Least-Connection scheduling module
 *
 * Authors:     Wensong Zhang <wensong@gnuchina.org>
 *
 * Changes:
 *     Martin Hamilton         :    fixed the terrible locking bugs
 *                                  *lock(tbl->lock) ==> *lock(&tbl->lock)
 *     Wensong Zhang           :    fixed the uninitialized tbl->lock bug
 *     Wensong Zhang           :    added doing full expiration check to
 *                                   collect stale entries of 24+ hours when
 *                                   no partial expire check in a half hour
 *     Julian Anastasov        :    replaced del_timer call with del_timer_sync
 *                                   to avoid the possible race between timer
 *                                   handler and del_timer thread in SMP
 */
/*
 * The lblc algorithm is as follows (pseudo code):
 *
 *       if cachenode[dest_ip] is null then
 *               n, cachenode[dest_ip] <- {weighted least-conn node};
 *       else
 *               n <- cachenode[dest_ip];
 *               if (n is dead) OR
 *                  (n.conns>n.weight AND
 *                   there is a node m with m.conns<m.weight/2) then
 *                 n, cachenode[dest_ip] <- {weighted least-conn node};
 *
 *       return n;
 *
 * Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing
 * me to write this module.
 */
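/*
 * How the pseudo code maps onto this file: the per-service cachenode[]
 * table is struct ip_vs_lblc_table, keyed by destination IP through
 * ip_vs_lblc_hashkey(); "n is dead" corresponds to the availability and
 * weight checks in ip_vs_lblc_schedule(); the conns/weight condition is
 * implemented by is_overloaded(); and "{weighted least-conn node}" is
 * picked by __ip_vs_lblc_schedule().
 */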
#define KMSG_COMPONENT "IPVS"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/ip.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/jiffies.h>
#include <linux/hash.h>

/* for sysctl */
#include <linux/fs.h>
#include <linux/sysctl.h>

#include <net/ip_vs.h>
/*
 *    It is for garbage collection of stale IPVS lblc entries,
 *    when the table is full.
 */
#define CHECK_EXPIRE_INTERVAL   (60*HZ)
#define ENTRY_TIMEOUT           (6*60*HZ)

#define DEFAULT_EXPIRATION      (24*60*60*HZ)

/*
 *    It is for full expiration check.
 *    When there is no partial expiration check (garbage collection)
 *    in a half hour, do a full expiration check to collect stale
 *    entries that haven't been touched for a day.
 */
#define COUNT_FOR_FULL_EXPIRATION   30
/*
 *     for IPVS lblc entry hash table
 */
#ifndef CONFIG_IP_VS_LBLC_TAB_BITS
#define CONFIG_IP_VS_LBLC_TAB_BITS      10
#endif
#define IP_VS_LBLC_TAB_BITS     CONFIG_IP_VS_LBLC_TAB_BITS
#define IP_VS_LBLC_TAB_SIZE     (1 << IP_VS_LBLC_TAB_BITS)
#define IP_VS_LBLC_TAB_MASK     (IP_VS_LBLC_TAB_SIZE - 1)
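/*
 * With the default CONFIG_IP_VS_LBLC_TAB_BITS of 10 this gives
 * 1 << 10 = 1024 buckets; ip_vs_lblc_init_svc() below sets max_size to
 * 16 * IP_VS_LBLC_TAB_SIZE = 16384 entries, above which the periodic
 * timer starts trimming stale entries.
 */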
/*
 *      IPVS lblc entry represents an association between destination
 *      IP address and its destination server
 */
struct ip_vs_lblc_entry {
	struct hlist_node	list;
	int			af;		/* address family */
	union nf_inet_addr	addr;		/* destination IP address */
	struct ip_vs_dest	*dest;		/* real server (cache) */
	unsigned long		lastuse;	/* last used time */
	struct rcu_head		rcu_head;
};
/*
 *      IPVS lblc hash table
 */
struct ip_vs_lblc_table {
	struct rcu_head		rcu_head;
	struct hlist_head	bucket[IP_VS_LBLC_TAB_SIZE];  /* hash bucket */
	struct timer_list	periodic_timer;	/* collect stale entries */
	struct ip_vs_service	*svc;		/* pointer back to service */
	atomic_t		entries;	/* number of entries */
	int			max_size;	/* maximum size of entries */
	int			rover;		/* rover for expire check */
	int			counter;	/* counter for no expire */
	bool			dead;
};
/*
 *      IPVS LBLC sysctl table
 */
#ifdef CONFIG_SYSCTL
static struct ctl_table vs_vars_table[] = {
	{
		.procname	= "lblc_expiration",
		.data		= NULL,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{ }
};
#endif
static void ip_vs_lblc_rcu_free(struct rcu_head *head)
{
	struct ip_vs_lblc_entry *en = container_of(head,
						   struct ip_vs_lblc_entry,
						   rcu_head);

	ip_vs_dest_put_and_free(en->dest);
	kfree(en);
}

static inline void ip_vs_lblc_del(struct ip_vs_lblc_entry *en)
{
	hlist_del_rcu(&en->list);
	call_rcu(&en->rcu_head, ip_vs_lblc_rcu_free);
}
/*
 *	Returns hash value for IPVS LBLC entry
 */
static inline unsigned int
ip_vs_lblc_hashkey(int af, const union nf_inet_addr *addr)
{
	__be32 addr_fold = addr->ip;

#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6)
		addr_fold = addr->ip6[0]^addr->ip6[1]^
			    addr->ip6[2]^addr->ip6[3];
#endif
	return hash_32(ntohl(addr_fold), IP_VS_LBLC_TAB_BITS);
}
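/*
 * hash_32() reduces the folded 32-bit key to IP_VS_LBLC_TAB_BITS bits,
 * i.e. a bucket index in [0, IP_VS_LBLC_TAB_SIZE); for IPv6 the four
 * 32-bit words of the address are XOR-folded into one word first.
 */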
/*
 *	Hash an entry in the ip_vs_lblc_table.
 */
static void
ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
{
	unsigned int hash = ip_vs_lblc_hashkey(en->af, &en->addr);

	hlist_add_head_rcu(&en->list, &tbl->bucket[hash]);
	atomic_inc(&tbl->entries);
}
/* Get ip_vs_lblc_entry associated with supplied parameters. */
static inline struct ip_vs_lblc_entry *
ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl,
	       const union nf_inet_addr *addr)
{
	unsigned int hash = ip_vs_lblc_hashkey(af, addr);
	struct ip_vs_lblc_entry *en;

	hlist_for_each_entry_rcu(en, &tbl->bucket[hash], list)
		if (ip_vs_addr_equal(af, &en->addr, addr))
			return en;

	return NULL;
}
/*
 * Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP
 * address to a server. Called under spin lock.
 */
static inline struct ip_vs_lblc_entry *
ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr,
	       u16 af, struct ip_vs_dest *dest)
{
	struct ip_vs_lblc_entry *en;

	en = ip_vs_lblc_get(af, tbl, daddr);
	if (en) {
		if (en->dest == dest)
			return en;
		ip_vs_lblc_del(en);
	}
	en = kmalloc(sizeof(*en), GFP_ATOMIC);
	if (!en)
		return NULL;

	en->af = af;
	ip_vs_addr_copy(af, &en->addr, daddr);
	en->lastuse = jiffies;

	ip_vs_dest_hold(dest);
	en->dest = dest;

	ip_vs_lblc_hash(tbl, en);

	return en;
}
/*
 *      Flush all the entries of the specified table.
 */
static void ip_vs_lblc_flush(struct ip_vs_service *svc)
{
	struct ip_vs_lblc_table *tbl = svc->sched_data;
	struct ip_vs_lblc_entry *en;
	struct hlist_node *next;
	int i;

	spin_lock_bh(&svc->sched_lock);
	tbl->dead = true;
	for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) {
		hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) {
			ip_vs_lblc_del(en);
			atomic_dec(&tbl->entries);
		}
	}
	spin_unlock_bh(&svc->sched_lock);
}
static int sysctl_lblc_expiration(struct ip_vs_service *svc)
{
#ifdef CONFIG_SYSCTL
	return svc->ipvs->sysctl_lblc_expiration;
#else
	return DEFAULT_EXPIRATION;
#endif
}
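/*
 * The expiration value is stored in jiffies; proc_dointvec_jiffies exposes
 * it in seconds as /proc/sys/net/ipv4/vs/lblc_expiration (registered in
 * __ip_vs_lblc_init() below), and it defaults to DEFAULT_EXPIRATION,
 * i.e. 24 hours.
 */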
static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)
{
	struct ip_vs_lblc_table *tbl = svc->sched_data;
	struct ip_vs_lblc_entry *en;
	struct hlist_node *next;
	unsigned long now = jiffies;
	int i, j;

	for (i = 0, j = tbl->rover; i < IP_VS_LBLC_TAB_SIZE; i++) {
		j = (j + 1) & IP_VS_LBLC_TAB_MASK;

		spin_lock(&svc->sched_lock);
		hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
			if (time_before(now,
					en->lastuse +
					sysctl_lblc_expiration(svc)))
				continue;

			ip_vs_lblc_del(en);
			atomic_dec(&tbl->entries);
		}
		spin_unlock(&svc->sched_lock);
	}
	tbl->rover = j;
}
/*
 *      Periodical timer handler for IPVS lblc table
 *      It is used to collect stale entries when the number of entries
 *      exceeds the maximum size of the table.
 *
 *      Fixme: we probably need more complicated algorithm to collect
 *             entries that have not been used for a long time even
 *             if the number of entries doesn't exceed the maximum size
 *             of the table.
 *      The full expiration check is for this purpose now.
 */
static void ip_vs_lblc_check_expire(struct timer_list *t)
{
	struct ip_vs_lblc_table *tbl = from_timer(tbl, t, periodic_timer);
	struct ip_vs_service *svc = tbl->svc;
	unsigned long now = jiffies;
	int goal;
	int i, j;
	struct ip_vs_lblc_entry *en;
	struct hlist_node *next;

	if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
		/* do full expiration check */
		ip_vs_lblc_full_check(svc);
		tbl->counter = 1;
		goto out;
	}

	if (atomic_read(&tbl->entries) <= tbl->max_size) {
		tbl->counter++;
		goto out;
	}
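	/*
	 * Aim to free roughly a third more entries than the current
	 * overflow, but never more than half the table in one run, which
	 * bounds the work done by a single timer pass.
	 */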
	goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
	if (goal > tbl->max_size/2)
		goal = tbl->max_size/2;

	for (i = 0, j = tbl->rover; i < IP_VS_LBLC_TAB_SIZE; i++) {
		j = (j + 1) & IP_VS_LBLC_TAB_MASK;

		spin_lock(&svc->sched_lock);
		hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
			if (time_before(now, en->lastuse + ENTRY_TIMEOUT))
				continue;

			ip_vs_lblc_del(en);
			atomic_dec(&tbl->entries);
			goal--;
		}
		spin_unlock(&svc->sched_lock);
		if (goal <= 0)
			break;
	}
	tbl->rover = j;

out:
	mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
}
static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
{
	int i;
	struct ip_vs_lblc_table *tbl;

	/*
	 *    Allocate the ip_vs_lblc_table for this service
	 */
	tbl = kmalloc(sizeof(*tbl), GFP_KERNEL);
	if (tbl == NULL)
		return -ENOMEM;

	svc->sched_data = tbl;
	IP_VS_DBG(6, "LBLC hash table (memory=%zdbytes) allocated for "
		  "current service\n", sizeof(*tbl));

	/*
	 *    Initialize the hash buckets
	 */
	for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) {
		INIT_HLIST_HEAD(&tbl->bucket[i]);
	}
	tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
	tbl->rover = 0;
	tbl->counter = 1;
	tbl->dead = false;
	tbl->svc = svc;
	atomic_set(&tbl->entries, 0);

	/*
	 *    Hook periodic timer for garbage collection
	 */
	timer_setup(&tbl->periodic_timer, ip_vs_lblc_check_expire, 0);
	mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);

	return 0;
}
static void ip_vs_lblc_done_svc(struct ip_vs_service *svc)
{
	struct ip_vs_lblc_table *tbl = svc->sched_data;

	/* remove periodic timer */
	del_timer_sync(&tbl->periodic_timer);

	/* got to clean up table entries here */
	ip_vs_lblc_flush(svc);

	/* release the table itself */
	kfree_rcu(tbl, rcu_head);
	IP_VS_DBG(6, "LBLC hash table (memory=%zdbytes) released\n",
		  sizeof(*tbl));
}
static inline struct ip_vs_dest *
__ip_vs_lblc_schedule(struct ip_vs_service *svc)
{
	struct ip_vs_dest *dest, *least;
	int loh, doh;

	/*
	 * We use the following formula to estimate the load:
	 *                (dest overhead) / dest->weight
	 *
	 * Remember -- no floats in kernel mode!!!
	 * The comparison of h1*w2 > h2*w1 is equivalent to that of
	 * h1/w1 > h2/w2 if every weight is larger than zero.
	 *
	 * The server with weight=0 is quiesced and will not receive any
	 * new connection.
	 */
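	/*
	 * For example, h1 = 12, w1 = 3 (load 4.0) against h2 = 5, w2 = 1
	 * (load 5.0): h1*w2 = 12 is not greater than h2*w1 = 15, matching
	 * h1/w1 = 4.0 not being greater than h2/w2 = 5.0.
	 */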
	list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
			continue;
		if (atomic_read(&dest->weight) > 0) {
			least = dest;
			loh = ip_vs_dest_conn_overhead(least);
			goto nextstage;
		}
	}
	return NULL;

	/*
	 *    Find the destination with the least load.
	 */
nextstage:
	list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) {
		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
			continue;

		doh = ip_vs_dest_conn_overhead(dest);
		if ((__s64)loh * atomic_read(&dest->weight) >
		    (__s64)doh * atomic_read(&least->weight)) {
			least = dest;
			loh = doh;
		}
	}

	IP_VS_DBG_BUF(6, "LBLC: server %s:%d "
		      "activeconns %d refcnt %d weight %d overhead %d\n",
		      IP_VS_DBG_ADDR(least->af, &least->addr),
		      ntohs(least->port),
		      atomic_read(&least->activeconns),
		      refcount_read(&least->refcnt),
		      atomic_read(&least->weight), loh);

	return least;
}
/*
 *   If this destination server is overloaded and there is a less loaded
 *   server, then return true.
 */
static inline int
is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
{
	if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
		struct ip_vs_dest *d;

		list_for_each_entry_rcu(d, &svc->destinations, n_list) {
			if (atomic_read(&d->activeconns)*2
			    < atomic_read(&d->weight)) {
				return 1;
			}
		}
	}
	return 0;
}
/*
 *    Locality-Based (weighted) Least-Connection scheduling
 */
static struct ip_vs_dest *
ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
		    struct ip_vs_iphdr *iph)
{
	struct ip_vs_lblc_table *tbl = svc->sched_data;
	struct ip_vs_dest *dest = NULL;
	struct ip_vs_lblc_entry *en;

	IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);

	/* First look in our cache */
	en = ip_vs_lblc_get(svc->af, tbl, &iph->daddr);
	if (en) {
		/* We only hold a read lock, but this is atomic */
		en->lastuse = jiffies;

		/*
		 * If the destination is not available, i.e. it's in the trash,
		 * we must ignore it, as it may be removed from under our feet,
		 * if someone drops our reference count. Our caller only makes
		 * sure that destinations, that are not in the trash, are not
		 * moved to the trash, while we are scheduling. But anyone can
		 * free up entries from the trash at any time.
		 */

		dest = en->dest;
		if ((dest->flags & IP_VS_DEST_F_AVAILABLE) &&
		    atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc))
			goto out;
	}

	/* No cache entry or it is invalid, time to schedule */
	dest = __ip_vs_lblc_schedule(svc);
	if (!dest) {
		ip_vs_scheduler_err(svc, "no destination available");
		return NULL;
	}

	/* If we fail to create a cache entry, we'll just use the valid dest */
	spin_lock_bh(&svc->sched_lock);
	if (!tbl->dead)
		ip_vs_lblc_new(tbl, &iph->daddr, svc->af, dest);
	spin_unlock_bh(&svc->sched_lock);

out:
	IP_VS_DBG_BUF(6, "LBLC: destination IP address %s --> server %s:%d\n",
		      IP_VS_DBG_ADDR(svc->af, &iph->daddr),
		      IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port));

	return dest;
}
/*
 *      IPVS LBLC Scheduler structure
 */
static struct ip_vs_scheduler ip_vs_lblc_scheduler = {
	.name =			"lblc",
	.refcnt =		ATOMIC_INIT(0),
	.module =		THIS_MODULE,
	.n_list =		LIST_HEAD_INIT(ip_vs_lblc_scheduler.n_list),
	.init_service =		ip_vs_lblc_init_svc,
	.done_service =		ip_vs_lblc_done_svc,
	.schedule =		ip_vs_lblc_schedule,
};
/*
 *  per netns init.
 */
#ifdef CONFIG_SYSCTL
static int __net_init __ip_vs_lblc_init(struct net *net)
{
	struct netns_ipvs *ipvs = net_ipvs(net);

	if (!ipvs)
		return -ENOENT;

	if (!net_eq(net, &init_net)) {
		ipvs->lblc_ctl_table = kmemdup(vs_vars_table,
					       sizeof(vs_vars_table),
					       GFP_KERNEL);
		if (ipvs->lblc_ctl_table == NULL)
			return -ENOMEM;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			ipvs->lblc_ctl_table[0].procname = NULL;

	} else
		ipvs->lblc_ctl_table = vs_vars_table;
	ipvs->sysctl_lblc_expiration = DEFAULT_EXPIRATION;
	ipvs->lblc_ctl_table[0].data = &ipvs->sysctl_lblc_expiration;

	ipvs->lblc_ctl_header =
		register_net_sysctl(net, "net/ipv4/vs", ipvs->lblc_ctl_table);
	if (!ipvs->lblc_ctl_header) {
		if (!net_eq(net, &init_net))
			kfree(ipvs->lblc_ctl_table);
		return -ENOMEM;
	}

	return 0;
}

static void __net_exit __ip_vs_lblc_exit(struct net *net)
{
	struct netns_ipvs *ipvs = net_ipvs(net);

	unregister_net_sysctl_table(ipvs->lblc_ctl_header);

	if (!net_eq(net, &init_net))
		kfree(ipvs->lblc_ctl_table);
}

#else

static int __net_init __ip_vs_lblc_init(struct net *net) { return 0; }
static void __net_exit __ip_vs_lblc_exit(struct net *net) { }

#endif
static struct pernet_operations ip_vs_lblc_ops = {
	.init = __ip_vs_lblc_init,
	.exit = __ip_vs_lblc_exit,
};

static int __init ip_vs_lblc_init(void)
{
	int ret;

	ret = register_pernet_subsys(&ip_vs_lblc_ops);
	if (ret)
		return ret;

	ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler);
	if (ret)
		unregister_pernet_subsys(&ip_vs_lblc_ops);
	return ret;
}

static void __exit ip_vs_lblc_cleanup(void)
{
	unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler);
	unregister_pernet_subsys(&ip_vs_lblc_ops);
	rcu_barrier();
}

module_init(ip_vs_lblc_init);
module_exit(ip_vs_lblc_cleanup);
MODULE_LICENSE("GPL");