/kern_2.6.32/net/netfilter/nf_conntrack_core.c


/* Connection state tracking for netfilter.  This is separated from,
   but required by, the NAT layer; it can also be used by an iptables
   extension. */

/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
 * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/types.h>
#include <linux/netfilter.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/vmalloc.h>
#include <linux/stddef.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/err.h>
#include <linux/percpu.h>
#include <linux/moduleparam.h>
#include <linux/notifier.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/socket.h>
#include <linux/mm.h>
#include <linux/nsproxy.h>
#include <linux/rculist_nulls.h>

#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_l3proto.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_extend.h>
#include <net/netfilter/nf_conntrack_acct.h>
#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_core.h>

#define NF_CONNTRACK_VERSION	"0.5.0"

int (*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct,
				      enum nf_nat_manip_type manip,
				      const struct nlattr *attr) __read_mostly;
EXPORT_SYMBOL_GPL(nfnetlink_parse_nat_setup_hook);

DEFINE_SPINLOCK(nf_conntrack_lock);
EXPORT_SYMBOL_GPL(nf_conntrack_lock);

unsigned int nf_conntrack_htable_size __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);

unsigned int nf_conntrack_max __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_max);

struct nf_conn nf_conntrack_untracked __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_untracked);

static int nf_conntrack_hash_rnd_initted;
static unsigned int nf_conntrack_hash_rnd;
static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple,
				  unsigned int size, unsigned int rnd)
{
	unsigned int n;
	u_int32_t h;

	/* The direction must be ignored, so we hash everything up to the
	 * destination ports (which is a multiple of 4) and treat the last
	 * three bytes manually.
	 */
	n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32);
	h = jhash2((u32 *)tuple, n,
		   rnd ^ (((__force __u16)tuple->dst.u.all << 16) |
			  tuple->dst.protonum));

	return ((u64)h * size) >> 32;
}

static inline u_int32_t hash_conntrack(const struct net *net,
				       const struct nf_conntrack_tuple *tuple)
{
	return __hash_conntrack(tuple, net->ct.htable_size,
				nf_conntrack_hash_rnd);
}

bool
nf_ct_get_tuple(const struct sk_buff *skb,
		unsigned int nhoff,
		unsigned int dataoff,
		u_int16_t l3num,
		u_int8_t protonum,
		struct nf_conntrack_tuple *tuple,
		const struct nf_conntrack_l3proto *l3proto,
		const struct nf_conntrack_l4proto *l4proto)
{
	memset(tuple, 0, sizeof(*tuple));

	tuple->src.l3num = l3num;
	if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0)
		return false;

	tuple->dst.protonum = protonum;
	tuple->dst.dir = IP_CT_DIR_ORIGINAL;

	return l4proto->pkt_to_tuple(skb, dataoff, tuple);
}
EXPORT_SYMBOL_GPL(nf_ct_get_tuple);

bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
		       u_int16_t l3num, struct nf_conntrack_tuple *tuple)
{
	struct nf_conntrack_l3proto *l3proto;
	struct nf_conntrack_l4proto *l4proto;
	unsigned int protoff;
	u_int8_t protonum;
	int ret;

	rcu_read_lock();

	l3proto = __nf_ct_l3proto_find(l3num);
	ret = l3proto->get_l4proto(skb, nhoff, &protoff, &protonum);
	if (ret != NF_ACCEPT) {
		rcu_read_unlock();
		return false;
	}

	l4proto = __nf_ct_l4proto_find(l3num, protonum);

	ret = nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, tuple,
			      l3proto, l4proto);

	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr);

bool
nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
		   const struct nf_conntrack_tuple *orig,
		   const struct nf_conntrack_l3proto *l3proto,
		   const struct nf_conntrack_l4proto *l4proto)
{
	memset(inverse, 0, sizeof(*inverse));

	inverse->src.l3num = orig->src.l3num;
	if (l3proto->invert_tuple(inverse, orig) == 0)
		return false;

	inverse->dst.dir = !orig->dst.dir;

	inverse->dst.protonum = orig->dst.protonum;
	return l4proto->invert_tuple(inverse, orig);
}
EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);
static void
clean_from_lists(struct nf_conn *ct)
{
	pr_debug("clean_from_lists(%p)\n", ct);
	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode);

	/* Destroy all pending expectations */
	nf_ct_remove_expectations(ct);
}

static void
destroy_conntrack(struct nf_conntrack *nfct)
{
	struct nf_conn *ct = (struct nf_conn *)nfct;
	struct net *net = nf_ct_net(ct);
	struct nf_conntrack_l4proto *l4proto;

	pr_debug("destroy_conntrack(%p)\n", ct);
	NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
	NF_CT_ASSERT(!timer_pending(&ct->timeout));

	/* To make sure we don't get any weird locking issues here:
	 * destroy_conntrack() MUST NOT be called with a write lock
	 * to nf_conntrack_lock!!! -HW */
	rcu_read_lock();
	l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
	if (l4proto && l4proto->destroy)
		l4proto->destroy(ct);

	rcu_read_unlock();

	spin_lock_bh(&nf_conntrack_lock);
	/* Expectations will have been removed in clean_from_lists,
	 * except TFTP can create an expectation on the first packet,
	 * before connection is in the list, so we need to clean here,
	 * too. */
	nf_ct_remove_expectations(ct);

	/* We overload first tuple to link into unconfirmed list. */
	if (!nf_ct_is_confirmed(ct)) {
		BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode));
		hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
	}

	NF_CT_STAT_INC(net, delete);
	spin_unlock_bh(&nf_conntrack_lock);

	if (ct->master)
		nf_ct_put(ct->master);

	pr_debug("destroy_conntrack: returning ct=%p to slab\n", ct);
	nf_conntrack_free(ct);
}

void nf_ct_delete_from_lists(struct nf_conn *ct)
{
	struct net *net = nf_ct_net(ct);

	nf_ct_helper_destroy(ct);
	spin_lock_bh(&nf_conntrack_lock);
	/* Inside lock so preempt is disabled on module removal path.
	 * Otherwise we can get spurious warnings. */
	NF_CT_STAT_INC(net, delete_list);
	clean_from_lists(ct);
	spin_unlock_bh(&nf_conntrack_lock);
}
EXPORT_SYMBOL_GPL(nf_ct_delete_from_lists);

static void death_by_event(unsigned long ul_conntrack)
{
	struct nf_conn *ct = (void *)ul_conntrack;
	struct net *net = nf_ct_net(ct);

	if (nf_conntrack_event(IPCT_DESTROY, ct) < 0) {
		/* bad luck, let's retry again */
		ct->timeout.expires = jiffies +
			(random32() % net->ct.sysctl_events_retry_timeout);
		add_timer(&ct->timeout);
		return;
	}
	/* we've got the event delivered, now it's dying */
	set_bit(IPS_DYING_BIT, &ct->status);
	spin_lock(&nf_conntrack_lock);
	hlist_nulls_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
	spin_unlock(&nf_conntrack_lock);
	nf_ct_put(ct);
}

void nf_ct_insert_dying_list(struct nf_conn *ct)
{
	struct net *net = nf_ct_net(ct);

	/* add this conntrack to the dying list */
	spin_lock_bh(&nf_conntrack_lock);
	hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
			     &net->ct.dying);
	spin_unlock_bh(&nf_conntrack_lock);
	/* set a new timer to retry event delivery */
	setup_timer(&ct->timeout, death_by_event, (unsigned long)ct);
	ct->timeout.expires = jiffies +
		(random32() % net->ct.sysctl_events_retry_timeout);
	add_timer(&ct->timeout);
}
EXPORT_SYMBOL_GPL(nf_ct_insert_dying_list);

static void death_by_timeout(unsigned long ul_conntrack)
{
	struct nf_conn *ct = (void *)ul_conntrack;

	if (!test_bit(IPS_DYING_BIT, &ct->status) &&
	    unlikely(nf_conntrack_event(IPCT_DESTROY, ct) < 0)) {
		/* destroy event was not delivered */
		nf_ct_delete_from_lists(ct);
		nf_ct_insert_dying_list(ct);
		return;
	}
	set_bit(IPS_DYING_BIT, &ct->status);
	nf_ct_delete_from_lists(ct);
	nf_ct_put(ct);
}
/*
 * Warning :
 * - Caller must take a reference on returned object
 *   and recheck nf_ct_tuple_equal(tuple, &h->tuple)
 * OR
 * - Caller must lock nf_conntrack_lock before calling this function
 */
struct nf_conntrack_tuple_hash *
__nf_conntrack_find(struct net *net, const struct nf_conntrack_tuple *tuple)
{
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	unsigned int hash = hash_conntrack(net, tuple);

	/* Disable BHs the entire time since we normally need to disable them
	 * at least once for the stats anyway.
	 */
	local_bh_disable();
begin:
	hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnnode) {
		if (nf_ct_tuple_equal(tuple, &h->tuple)) {
			NF_CT_STAT_INC(net, found);
			local_bh_enable();
			return h;
		}
		NF_CT_STAT_INC(net, searched);
	}
	/*
	 * if the nulls value we got at the end of this lookup is
	 * not the expected one, we must restart lookup.
	 * We probably met an item that was moved to another chain.
	 */
	if (get_nulls_value(n) != hash)
		goto begin;
	local_bh_enable();

	return NULL;
}
EXPORT_SYMBOL_GPL(__nf_conntrack_find);

/* Find a connection corresponding to a tuple. */
struct nf_conntrack_tuple_hash *
nf_conntrack_find_get(struct net *net, const struct nf_conntrack_tuple *tuple)
{
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;

	rcu_read_lock();
begin:
	h = __nf_conntrack_find(net, tuple);
	if (h) {
		ct = nf_ct_tuplehash_to_ctrack(h);
		if (unlikely(nf_ct_is_dying(ct) ||
			     !atomic_inc_not_zero(&ct->ct_general.use)))
			h = NULL;
		else {
			if (unlikely(!nf_ct_tuple_equal(tuple, &h->tuple))) {
				nf_ct_put(ct);
				goto begin;
			}
		}
	}
	rcu_read_unlock();

	return h;
}
EXPORT_SYMBOL_GPL(nf_conntrack_find_get);
static void __nf_conntrack_hash_insert(struct nf_conn *ct,
				       unsigned int hash,
				       unsigned int repl_hash)
{
	struct net *net = nf_ct_net(ct);

	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
				 &net->ct.hash[hash]);
	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
				 &net->ct.hash[repl_hash]);
}

void nf_conntrack_hash_insert(struct nf_conn *ct)
{
	struct net *net = nf_ct_net(ct);
	unsigned int hash, repl_hash;

	hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
	repl_hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_REPLY].tuple);

	__nf_conntrack_hash_insert(ct, hash, repl_hash);
}
EXPORT_SYMBOL_GPL(nf_conntrack_hash_insert);

/* Confirm a connection given skb; places it in hash table */
int
__nf_conntrack_confirm(struct sk_buff *skb)
{
	unsigned int hash, repl_hash;
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;
	struct nf_conn_help *help;
	struct hlist_nulls_node *n;
	enum ip_conntrack_info ctinfo;
	struct net *net;

	ct = nf_ct_get(skb, &ctinfo);
	net = nf_ct_net(ct);

	/* ipt_REJECT uses nf_conntrack_attach to attach related
	   ICMP/TCP RST packets in other direction.  Actual packet
	   which created connection will be IP_CT_NEW or for an
	   expected connection, IP_CT_RELATED. */
	if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
		return NF_ACCEPT;

	hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
	repl_hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_REPLY].tuple);

	/* We're not in hash table, and we refuse to set up related
	   connections for unconfirmed conns.  But packet copies and
	   REJECT will give spurious warnings here. */
	/* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */

	/* No external references means no one else could have
	   confirmed us. */
	NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
	pr_debug("Confirming conntrack %p\n", ct);

	spin_lock_bh(&nf_conntrack_lock);

	/* See if there's one in the list already, including reverse:
	   NAT could have grabbed it without realizing, since we're
	   not in the hash.  If there is, we lost race. */
	hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode)
		if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
				      &h->tuple))
			goto out;
	hlist_nulls_for_each_entry(h, n, &net->ct.hash[repl_hash], hnnode)
		if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
				      &h->tuple))
			goto out;

	/* Remove from unconfirmed list */
	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);

	/* Timer relative to confirmation time, not original
	   setting time, otherwise we'd get timer wrap in
	   weird delay cases. */
	ct->timeout.expires += jiffies;
	add_timer(&ct->timeout);
	atomic_inc(&ct->ct_general.use);
	set_bit(IPS_CONFIRMED_BIT, &ct->status);

	/* Since the lookup is lockless, hash insertion must be done after
	 * starting the timer and setting the CONFIRMED bit. The RCU barriers
	 * guarantee that no other CPU can find the conntrack before the above
	 * stores are visible.
	 */
	__nf_conntrack_hash_insert(ct, hash, repl_hash);
	NF_CT_STAT_INC(net, insert);
	spin_unlock_bh(&nf_conntrack_lock);

	help = nfct_help(ct);
	if (help && help->helper)
		nf_conntrack_event_cache(IPCT_HELPER, ct);

	nf_conntrack_event_cache(master_ct(ct) ?
				 IPCT_RELATED : IPCT_NEW, ct);
	return NF_ACCEPT;

out:
	NF_CT_STAT_INC(net, insert_failed);
	spin_unlock_bh(&nf_conntrack_lock);
	return NF_DROP;
}
EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);
/* Returns true if a connection corresponds to the tuple (required
   for NAT). */
int
nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
			 const struct nf_conn *ignored_conntrack)
{
	struct net *net = nf_ct_net(ignored_conntrack);
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	unsigned int hash = hash_conntrack(net, tuple);

	/* Disable BHs the entire time since we need to disable them at
	 * least once for the stats anyway.
	 */
	rcu_read_lock_bh();
	hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnnode) {
		if (nf_ct_tuplehash_to_ctrack(h) != ignored_conntrack &&
		    nf_ct_tuple_equal(tuple, &h->tuple)) {
			NF_CT_STAT_INC(net, found);
			rcu_read_unlock_bh();
			return 1;
		}
		NF_CT_STAT_INC(net, searched);
	}
	rcu_read_unlock_bh();

	return 0;
}
EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);

#define NF_CT_EVICTION_RANGE	8

/* There's a small race here where we may free a just-assured
   connection.  Too bad: we're in trouble anyway. */
static noinline int early_drop(struct net *net, unsigned int hash)
{
	/* Use oldest entry, which is roughly LRU */
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct = NULL, *tmp;
	struct hlist_nulls_node *n;
	unsigned int i, cnt = 0;
	int dropped = 0;

	rcu_read_lock();
	for (i = 0; i < net->ct.htable_size; i++) {
		hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash],
					       hnnode) {
			tmp = nf_ct_tuplehash_to_ctrack(h);
			if (!test_bit(IPS_ASSURED_BIT, &tmp->status))
				ct = tmp;
			cnt++;
		}

		if (ct && unlikely(nf_ct_is_dying(ct) ||
				   !atomic_inc_not_zero(&ct->ct_general.use)))
			ct = NULL;

		if (ct || cnt >= NF_CT_EVICTION_RANGE)
			break;

		hash = (hash + 1) % net->ct.htable_size;
	}
	rcu_read_unlock();

	if (!ct)
		return dropped;

	if (del_timer(&ct->timeout)) {
		death_by_timeout((unsigned long)ct);
		dropped = 1;
		NF_CT_STAT_INC_ATOMIC(net, early_drop);
	}
	nf_ct_put(ct);
	return dropped;
}
struct nf_conn *nf_conntrack_alloc(struct net *net,
				   const struct nf_conntrack_tuple *orig,
				   const struct nf_conntrack_tuple *repl,
				   gfp_t gfp)
{
	struct nf_conn *ct;

	if (unlikely(!nf_conntrack_hash_rnd_initted)) {
		get_random_bytes(&nf_conntrack_hash_rnd,
				 sizeof(nf_conntrack_hash_rnd));
		nf_conntrack_hash_rnd_initted = 1;
	}

	/* We don't want any race condition at early drop stage */
	atomic_inc(&net->ct.count);

	if (nf_conntrack_max &&
	    unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) {
		unsigned int hash = hash_conntrack(net, orig);
		if (!early_drop(net, hash)) {
			atomic_dec(&net->ct.count);
			if (net_ratelimit())
				printk(KERN_WARNING
				       "nf_conntrack: table full, dropping"
				       " packet.\n");
			return ERR_PTR(-ENOMEM);
		}
	}

	/*
	 * Do not use kmem_cache_zalloc(), as this cache uses
	 * SLAB_DESTROY_BY_RCU.
	 */
	ct = kmem_cache_alloc(net->ct.nf_conntrack_cachep, gfp);
	if (ct == NULL) {
		pr_debug("nf_conntrack_alloc: Can't alloc conntrack.\n");
		atomic_dec(&net->ct.count);
		return ERR_PTR(-ENOMEM);
	}
	/*
	 * Leave ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.next
	 * and ct->tuplehash[IP_CT_DIR_REPLY].hnnode.next unchanged.
	 */
	memset(&ct->tuplehash[IP_CT_DIR_MAX], 0,
	       sizeof(*ct) - offsetof(struct nf_conn, tuplehash[IP_CT_DIR_MAX]));
	spin_lock_init(&ct->lock);
	ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
	ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
	ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
	ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev = NULL;
	/* Don't set timer yet: wait for confirmation */
	setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct);
#ifdef CONFIG_NET_NS
	ct->ct_net = net;
#endif

	/*
	 * changes to lookup keys must be done before setting refcnt to 1
	 */
	smp_wmb();
	atomic_set(&ct->ct_general.use, 1);
	return ct;
}
EXPORT_SYMBOL_GPL(nf_conntrack_alloc);

void nf_conntrack_free(struct nf_conn *ct)
{
	struct net *net = nf_ct_net(ct);

	nf_ct_ext_destroy(ct);
	atomic_dec(&net->ct.count);
	nf_ct_ext_free(ct);
	kmem_cache_free(net->ct.nf_conntrack_cachep, ct);
}
EXPORT_SYMBOL_GPL(nf_conntrack_free);
/* Allocate a new conntrack: we return -ENOMEM if classification
   failed due to stress.  Otherwise it really is unclassifiable. */
static struct nf_conntrack_tuple_hash *
init_conntrack(struct net *net,
	       const struct nf_conntrack_tuple *tuple,
	       struct nf_conntrack_l3proto *l3proto,
	       struct nf_conntrack_l4proto *l4proto,
	       struct sk_buff *skb,
	       unsigned int dataoff)
{
	struct nf_conn *ct;
	struct nf_conn_help *help;
	struct nf_conntrack_tuple repl_tuple;
	struct nf_conntrack_expect *exp;

	if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) {
		pr_debug("Can't invert tuple.\n");
		return NULL;
	}

	ct = nf_conntrack_alloc(net, tuple, &repl_tuple, GFP_ATOMIC);
	if (IS_ERR(ct)) {
		pr_debug("Can't allocate conntrack.\n");
		return (struct nf_conntrack_tuple_hash *)ct;
	}

	if (!l4proto->new(ct, skb, dataoff)) {
		nf_conntrack_free(ct);
		pr_debug("init conntrack: can't track with proto module\n");
		return NULL;
	}

	nf_ct_acct_ext_add(ct, GFP_ATOMIC);
	nf_ct_ecache_ext_add(ct, GFP_ATOMIC);

	spin_lock_bh(&nf_conntrack_lock);
	exp = nf_ct_find_expectation(net, tuple);
	if (exp) {
		pr_debug("conntrack: expectation arrives ct=%p exp=%p\n",
			 ct, exp);
		/* Welcome, Mr. Bond.  We've been expecting you... */
		__set_bit(IPS_EXPECTED_BIT, &ct->status);
		ct->master = exp->master;
		if (exp->helper) {
			help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
			if (help)
				rcu_assign_pointer(help->helper, exp->helper);
		}

#ifdef CONFIG_NF_CONNTRACK_MARK
		ct->mark = exp->master->mark;
#endif
#ifdef CONFIG_NF_CONNTRACK_SECMARK
		ct->secmark = exp->master->secmark;
#endif
		nf_conntrack_get(&ct->master->ct_general);
		NF_CT_STAT_INC(net, expect_new);
	} else {
		__nf_ct_try_assign_helper(ct, GFP_ATOMIC);
		NF_CT_STAT_INC(net, new);
	}

	/* Overload tuple linked list to put us in unconfirmed list. */
	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
				 &net->ct.unconfirmed);

	spin_unlock_bh(&nf_conntrack_lock);

	if (exp) {
		if (exp->expectfn)
			exp->expectfn(ct, exp);
		nf_ct_expect_put(exp);
	}

	return &ct->tuplehash[IP_CT_DIR_ORIGINAL];
}
/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
static inline struct nf_conn *
resolve_normal_ct(struct net *net,
		  struct sk_buff *skb,
		  unsigned int dataoff,
		  u_int16_t l3num,
		  u_int8_t protonum,
		  struct nf_conntrack_l3proto *l3proto,
		  struct nf_conntrack_l4proto *l4proto,
		  int *set_reply,
		  enum ip_conntrack_info *ctinfo)
{
	struct nf_conntrack_tuple tuple;
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;

	if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
			     dataoff, l3num, protonum, &tuple, l3proto,
			     l4proto)) {
		pr_debug("resolve_normal_ct: Can't get tuple\n");
		return NULL;
	}

	/* look for tuple match */
	h = nf_conntrack_find_get(net, &tuple);
	if (!h) {
		h = init_conntrack(net, &tuple, l3proto, l4proto, skb, dataoff);
		if (!h)
			return NULL;
		if (IS_ERR(h))
			return (void *)h;
	}
	ct = nf_ct_tuplehash_to_ctrack(h);

	/* It exists; we have (non-exclusive) reference. */
	if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) {
		*ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
		/* Please set reply bit if this packet OK */
		*set_reply = 1;
	} else {
		/* Once we've had two way comms, always ESTABLISHED. */
		if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
			pr_debug("nf_conntrack_in: normal packet for %p\n", ct);
			*ctinfo = IP_CT_ESTABLISHED;
		} else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
			pr_debug("nf_conntrack_in: related packet for %p\n",
				 ct);
			*ctinfo = IP_CT_RELATED;
		} else {
			pr_debug("nf_conntrack_in: new packet for %p\n", ct);
			*ctinfo = IP_CT_NEW;
		}
		*set_reply = 0;
	}
	skb->nfct = &ct->ct_general;
	skb->nfctinfo = *ctinfo;
	return ct;
}
unsigned int
nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
		struct sk_buff *skb)
{
	struct nf_conn *ct;
	enum ip_conntrack_info ctinfo;
	struct nf_conntrack_l3proto *l3proto;
	struct nf_conntrack_l4proto *l4proto;
	unsigned int dataoff;
	u_int8_t protonum;
	int set_reply = 0;
	int ret;

	/* Previously seen (loopback or untracked)?  Ignore. */
	if (skb->nfct) {
		NF_CT_STAT_INC_ATOMIC(net, ignore);
		return NF_ACCEPT;
	}

	/* rcu_read_lock()ed by nf_hook_slow */
	l3proto = __nf_ct_l3proto_find(pf);
	ret = l3proto->get_l4proto(skb, skb_network_offset(skb),
				   &dataoff, &protonum);
	if (ret <= 0) {
		pr_debug("not prepared to track yet or error occurred\n");
		NF_CT_STAT_INC_ATOMIC(net, error);
		NF_CT_STAT_INC_ATOMIC(net, invalid);
		return -ret;
	}

	l4proto = __nf_ct_l4proto_find(pf, protonum);

	/* It may be a special packet, error, unclean...
	 * inverse of the return code tells the netfilter
	 * core what to do with the packet. */
	if (l4proto->error != NULL) {
		ret = l4proto->error(net, skb, dataoff, &ctinfo, pf, hooknum);
		if (ret <= 0) {
			NF_CT_STAT_INC_ATOMIC(net, error);
			NF_CT_STAT_INC_ATOMIC(net, invalid);
			return -ret;
		}
	}

	ct = resolve_normal_ct(net, skb, dataoff, pf, protonum,
			       l3proto, l4proto, &set_reply, &ctinfo);
	if (!ct) {
		/* Not valid part of a connection */
		NF_CT_STAT_INC_ATOMIC(net, invalid);
		return NF_ACCEPT;
	}

	if (IS_ERR(ct)) {
		/* Too stressed to deal. */
		NF_CT_STAT_INC_ATOMIC(net, drop);
		return NF_DROP;
	}

	NF_CT_ASSERT(skb->nfct);

	ret = l4proto->packet(ct, skb, dataoff, ctinfo, pf, hooknum);
	if (ret <= 0) {
		/* Invalid: inverse of the return code tells
		 * the netfilter core what to do */
		pr_debug("nf_conntrack_in: Can't track with proto module\n");
		nf_conntrack_put(skb->nfct);
		skb->nfct = NULL;
		NF_CT_STAT_INC_ATOMIC(net, invalid);
		if (ret == -NF_DROP)
			NF_CT_STAT_INC_ATOMIC(net, drop);
		return -ret;
	}

	if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
		nf_conntrack_event_cache(IPCT_STATUS, ct);

	return ret;
}
EXPORT_SYMBOL_GPL(nf_conntrack_in);

bool nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse,
			  const struct nf_conntrack_tuple *orig)
{
	bool ret;

	rcu_read_lock();
	ret = nf_ct_invert_tuple(inverse, orig,
				 __nf_ct_l3proto_find(orig->src.l3num),
				 __nf_ct_l4proto_find(orig->src.l3num,
						      orig->dst.protonum));
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL_GPL(nf_ct_invert_tuplepr);
/* Alter reply tuple (maybe alter helper).  This is for NAT, and is
   implicitly racy: see __nf_conntrack_confirm */
void nf_conntrack_alter_reply(struct nf_conn *ct,
			      const struct nf_conntrack_tuple *newreply)
{
	struct nf_conn_help *help = nfct_help(ct);

	/* Should be unconfirmed, so not in hash table yet */
	NF_CT_ASSERT(!nf_ct_is_confirmed(ct));

	pr_debug("Altering reply tuple of %p to ", ct);
	nf_ct_dump_tuple(newreply);

	ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
	if (ct->master || (help && !hlist_empty(&help->expectations)))
		return;

	rcu_read_lock();
	__nf_ct_try_assign_helper(ct, GFP_ATOMIC);
	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply);

/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
void __nf_ct_refresh_acct(struct nf_conn *ct,
			  enum ip_conntrack_info ctinfo,
			  const struct sk_buff *skb,
			  unsigned long extra_jiffies,
			  int do_acct)
{
	NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct);
	NF_CT_ASSERT(skb);

	/* Only update if this is not a fixed timeout */
	if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
		goto acct;

	/* If not in hash table, timer will not be active yet */
	if (!nf_ct_is_confirmed(ct)) {
		ct->timeout.expires = extra_jiffies;
	} else {
		unsigned long newtime = jiffies + extra_jiffies;

		/* Only update the timeout if the new timeout is at least
		   HZ jiffies from the old timeout. Need del_timer for race
		   avoidance (may already be dying). */
		if (newtime - ct->timeout.expires >= HZ)
			mod_timer_pending(&ct->timeout, newtime);
	}

acct:
	if (do_acct) {
		struct nf_conn_counter *acct;

		acct = nf_conn_acct_find(ct);
		if (acct) {
			spin_lock_bh(&ct->lock);
			acct[CTINFO2DIR(ctinfo)].packets++;
			acct[CTINFO2DIR(ctinfo)].bytes +=
				skb->len - skb_network_offset(skb);
			spin_unlock_bh(&ct->lock);
		}
	}
}
EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);

bool __nf_ct_kill_acct(struct nf_conn *ct,
		       enum ip_conntrack_info ctinfo,
		       const struct sk_buff *skb,
		       int do_acct)
{
	if (do_acct) {
		struct nf_conn_counter *acct;

		acct = nf_conn_acct_find(ct);
		if (acct) {
			spin_lock_bh(&ct->lock);
			acct[CTINFO2DIR(ctinfo)].packets++;
			acct[CTINFO2DIR(ctinfo)].bytes +=
				skb->len - skb_network_offset(skb);
			spin_unlock_bh(&ct->lock);
		}
	}

	if (del_timer(&ct->timeout)) {
		ct->timeout.function((unsigned long)ct);
		return true;
	}
	return false;
}
EXPORT_SYMBOL_GPL(__nf_ct_kill_acct);
#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)

#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_conntrack.h>
#include <linux/mutex.h>

/* Generic function for tcp/udp/sctp/dccp and alike. This needs to be
 * in ip_conntrack_core, since we don't want the protocols to autoload
 * or depend on ctnetlink */
int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb,
			       const struct nf_conntrack_tuple *tuple)
{
	NLA_PUT_BE16(skb, CTA_PROTO_SRC_PORT, tuple->src.u.tcp.port);
	NLA_PUT_BE16(skb, CTA_PROTO_DST_PORT, tuple->dst.u.tcp.port);
	return 0;

nla_put_failure:
	return -1;
}
EXPORT_SYMBOL_GPL(nf_ct_port_tuple_to_nlattr);

const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = {
	[CTA_PROTO_SRC_PORT]  = { .type = NLA_U16 },
	[CTA_PROTO_DST_PORT]  = { .type = NLA_U16 },
};
EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy);

int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[],
			       struct nf_conntrack_tuple *t)
{
	if (!tb[CTA_PROTO_SRC_PORT] || !tb[CTA_PROTO_DST_PORT])
		return -EINVAL;

	t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]);
	t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]);

	return 0;
}
EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_to_tuple);

int nf_ct_port_nlattr_tuple_size(void)
{
	return nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1);
}
EXPORT_SYMBOL_GPL(nf_ct_port_nlattr_tuple_size);
#endif
/* Used by ipt_REJECT and ip6t_REJECT. */
static void nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
{
	struct nf_conn *ct;
	enum ip_conntrack_info ctinfo;

	/* This ICMP is in reverse direction to the packet which caused it */
	ct = nf_ct_get(skb, &ctinfo);
	if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
		ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
	else
		ctinfo = IP_CT_RELATED;

	/* Attach to new skbuff, and increment count */
	nskb->nfct = &ct->ct_general;
	nskb->nfctinfo = ctinfo;
	nf_conntrack_get(nskb->nfct);
}

/* Bring out ya dead! */
static struct nf_conn *
get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
		void *data, unsigned int *bucket)
{
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;
	struct hlist_nulls_node *n;

	spin_lock_bh(&nf_conntrack_lock);
	for (; *bucket < net->ct.htable_size; (*bucket)++) {
		hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) {
			ct = nf_ct_tuplehash_to_ctrack(h);
			if (iter(ct, data))
				goto found;
		}
	}
	hlist_nulls_for_each_entry(h, n, &net->ct.unconfirmed, hnnode) {
		ct = nf_ct_tuplehash_to_ctrack(h);
		if (iter(ct, data))
			set_bit(IPS_DYING_BIT, &ct->status);
	}
	spin_unlock_bh(&nf_conntrack_lock);
	return NULL;
found:
	atomic_inc(&ct->ct_general.use);
	spin_unlock_bh(&nf_conntrack_lock);
	return ct;
}

void nf_ct_iterate_cleanup(struct net *net,
			   int (*iter)(struct nf_conn *i, void *data),
			   void *data)
{
	struct nf_conn *ct;
	unsigned int bucket = 0;

	while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) {
		/* Time to push up daisies... */
		if (del_timer(&ct->timeout))
			death_by_timeout((unsigned long)ct);
		/* ... else the timer will get him soon. */

		nf_ct_put(ct);
	}
}
EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup);
struct __nf_ct_flush_report {
	u32 pid;
	int report;
};

static int kill_report(struct nf_conn *i, void *data)
{
	struct __nf_ct_flush_report *fr = (struct __nf_ct_flush_report *)data;

	/* If we fail to deliver the event, death_by_timeout() will retry */
	if (nf_conntrack_event_report(IPCT_DESTROY, i,
				      fr->pid, fr->report) < 0)
		return 1;

	/* Avoid the delivery of the destroy event in death_by_timeout(). */
	set_bit(IPS_DYING_BIT, &i->status);
	return 1;
}

static int kill_all(struct nf_conn *i, void *data)
{
	return 1;
}

void nf_ct_free_hashtable(void *hash, int vmalloced, unsigned int size)
{
	if (vmalloced)
		vfree(hash);
	else
		free_pages((unsigned long)hash,
			   get_order(sizeof(struct hlist_head) * size));
}
EXPORT_SYMBOL_GPL(nf_ct_free_hashtable);

void nf_conntrack_flush_report(struct net *net, u32 pid, int report)
{
	struct __nf_ct_flush_report fr = {
		.pid	= pid,
		.report = report,
	};

	nf_ct_iterate_cleanup(net, kill_report, &fr);
}
EXPORT_SYMBOL_GPL(nf_conntrack_flush_report);

static void nf_ct_release_dying_list(struct net *net)
{
	struct nf_conntrack_tuple_hash *h;
	struct nf_conn *ct;
	struct hlist_nulls_node *n;

	spin_lock_bh(&nf_conntrack_lock);
	hlist_nulls_for_each_entry(h, n, &net->ct.dying, hnnode) {
		ct = nf_ct_tuplehash_to_ctrack(h);
		/* never fails to remove them, no listeners at this point */
		nf_ct_kill(ct);
	}
	spin_unlock_bh(&nf_conntrack_lock);
}

static void nf_conntrack_cleanup_init_net(void)
{
	/* wait until all references to nf_conntrack_untracked are dropped */
	while (atomic_read(&nf_conntrack_untracked.ct_general.use) > 1)
		schedule();

	nf_conntrack_helper_fini();
	nf_conntrack_proto_fini();
}

static void nf_conntrack_cleanup_net(struct net *net)
{
 i_see_dead_people:
	nf_ct_iterate_cleanup(net, kill_all, NULL);
	nf_ct_release_dying_list(net);
	if (atomic_read(&net->ct.count) != 0) {
		schedule();
		goto i_see_dead_people;
	}

	nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc,
			     net->ct.htable_size);
	nf_conntrack_ecache_fini(net);
	nf_conntrack_acct_fini(net);
	nf_conntrack_expect_fini(net);
	kmem_cache_destroy(net->ct.nf_conntrack_cachep);
	kfree(net->ct.slabname);
	free_percpu(net->ct.stat);
}

/* Mishearing the voices in his head, our hero wonders how he's
   supposed to kill the mall. */
void nf_conntrack_cleanup(struct net *net)
{
	if (net_eq(net, &init_net))
		rcu_assign_pointer(ip_ct_attach, NULL);

	/* This makes sure all current packets have passed through
	   netfilter framework.  Roll on, two-stage module
	   delete... */
	synchronize_net();

	nf_conntrack_cleanup_net(net);

	if (net_eq(net, &init_net)) {
		rcu_assign_pointer(nf_ct_destroy, NULL);
		nf_conntrack_cleanup_init_net();
	}
}
void *nf_ct_alloc_hashtable(unsigned int *sizep, int *vmalloced, int nulls)
{
	struct hlist_nulls_head *hash;
	unsigned int nr_slots, i;
	size_t sz;

	*vmalloced = 0;

	BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head));
	nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head));
	sz = nr_slots * sizeof(struct hlist_nulls_head);
	hash = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO,
					get_order(sz));
	if (!hash) {
		*vmalloced = 1;
		printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n");
		hash = __vmalloc(sz, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
	}

	if (hash && nulls)
		for (i = 0; i < nr_slots; i++)
			INIT_HLIST_NULLS_HEAD(&hash[i], i);

	return hash;
}
EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable);

int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
{
	int i, bucket, vmalloced, old_vmalloced;
	unsigned int hashsize, old_size;
	struct hlist_nulls_head *hash, *old_hash;
	struct nf_conntrack_tuple_hash *h;

	if (current->nsproxy->net_ns != &init_net)
		return -EOPNOTSUPP;

	/* On boot, we can set this without any fancy locking. */
	if (!nf_conntrack_htable_size)
		return param_set_uint(val, kp);

	hashsize = simple_strtoul(val, NULL, 0);
	if (!hashsize)
		return -EINVAL;

	hash = nf_ct_alloc_hashtable(&hashsize, &vmalloced, 1);
	if (!hash)
		return -ENOMEM;

	/* Lookups in the old hash might happen in parallel, which means we
	 * might get false negatives during connection lookup. New connections
	 * created because of a false negative won't make it into the hash
	 * though since that required taking the lock.
	 */
	spin_lock_bh(&nf_conntrack_lock);
	for (i = 0; i < init_net.ct.htable_size; i++) {
		while (!hlist_nulls_empty(&init_net.ct.hash[i])) {
			h = hlist_nulls_entry(init_net.ct.hash[i].first,
					      struct nf_conntrack_tuple_hash, hnnode);
			hlist_nulls_del_rcu(&h->hnnode);
			bucket = __hash_conntrack(&h->tuple, hashsize,
						  nf_conntrack_hash_rnd);
			hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]);
		}
	}
	old_size = init_net.ct.htable_size;
	old_vmalloced = init_net.ct.hash_vmalloc;
	old_hash = init_net.ct.hash;

	init_net.ct.htable_size = nf_conntrack_htable_size = hashsize;
	init_net.ct.hash_vmalloc = vmalloced;
	init_net.ct.hash = hash;
	spin_unlock_bh(&nf_conntrack_lock);

	nf_ct_free_hashtable(old_hash, old_vmalloced, old_size);
	return 0;
}
EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize);

module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
		  &nf_conntrack_htable_size, 0600);
static int nf_conntrack_init_init_net(void)
{
	int max_factor = 8;
	int ret;

	/* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
	 * machine has 512 buckets. >= 1GB machines have 16384 buckets. */
	if (!nf_conntrack_htable_size) {
		nf_conntrack_htable_size
			= (((totalram_pages << PAGE_SHIFT) / 16384)
			   / sizeof(struct hlist_head));
		if (totalram_pages > (1024 * 1024 * 1024 / PAGE_SIZE))
			nf_conntrack_htable_size = 16384;
		if (nf_conntrack_htable_size < 32)
			nf_conntrack_htable_size = 32;

		/* Use a max. factor of four by default to get the same max as
		 * with the old struct list_heads. When a table size is given
		 * we use the old value of 8 to avoid reducing the max.
		 * entries. */
		max_factor = 4;
	}
	nf_conntrack_max = max_factor * nf_conntrack_htable_size;

	printk("nf_conntrack version %s (%u buckets, %d max)\n",
	       NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
	       nf_conntrack_max);

	ret = nf_conntrack_proto_init();
	if (ret < 0)
		goto err_proto;

	ret = nf_conntrack_helper_init();
	if (ret < 0)
		goto err_helper;

	/* Set up fake conntrack: to never be deleted, not in any hashes */
#ifdef CONFIG_NET_NS
	nf_conntrack_untracked.ct_net = &init_net;
#endif
	atomic_set(&nf_conntrack_untracked.ct_general.use, 1);
	/*  - and make it look like a confirmed connection */
	set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status);

	return 0;

err_helper:
	nf_conntrack_proto_fini();
err_proto:
	return ret;
}
/*
 * We need to use special "null" values, not used in hash table
 */
#define UNCONFIRMED_NULLS_VAL	((1<<30)+0)
#define DYING_NULLS_VAL		((1<<30)+1)

static int nf_conntrack_init_net(struct net *net)
{
	int ret;

	atomic_set(&net->ct.count, 0);
	INIT_HLIST_NULLS_HEAD(&net->ct.unconfirmed, UNCONFIRMED_NULLS_VAL);
	INIT_HLIST_NULLS_HEAD(&net->ct.dying, DYING_NULLS_VAL);
	net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
	if (!net->ct.stat) {
		ret = -ENOMEM;
		goto err_stat;
	}

	net->ct.slabname = kasprintf(GFP_KERNEL, "nf_conntrack_%p", net);
	if (!net->ct.slabname) {
		ret = -ENOMEM;
		goto err_slabname;
	}

	net->ct.nf_conntrack_cachep = kmem_cache_create(net->ct.slabname,
							sizeof(struct nf_conn), 0,
							SLAB_DESTROY_BY_RCU, NULL);
	if (!net->ct.nf_conntrack_cachep) {
		printk(KERN_ERR "Unable to create nf_conn slab cache\n");
		ret = -ENOMEM;
		goto err_cache;
	}

	net->ct.htable_size = nf_conntrack_htable_size;
	net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size,
					     &net->ct.hash_vmalloc, 1);
	if (!net->ct.hash) {
		ret = -ENOMEM;
		printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
		goto err_hash;
	}
	ret = nf_conntrack_expect_init(net);
	if (ret < 0)
		goto err_expect;
	ret = nf_conntrack_acct_init(net);
	if (ret < 0)
		goto err_acct;
	ret = nf_conntrack_ecache_init(net);
	if (ret < 0)
		goto err_ecache;

	return 0;

err_ecache:
	nf_conntrack_acct_fini(net);
err_acct:
	nf_conntrack_expect_fini(net);
err_expect:
	nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc,
			     net->ct.htable_size);
err_hash:
	kmem_cache_destroy(net->ct.nf_conntrack_cachep);
err_cache:
	kfree(net->ct.slabname);
err_slabname:
	free_percpu(net->ct.stat);
err_stat:
	return ret;
}
s16 (*nf_ct_nat_offset)(const struct nf_conn *ct,
			enum ip_conntrack_dir dir,
			u32 seq);
EXPORT_SYMBOL_GPL(nf_ct_nat_offset);

int nf_conntrack_init(struct net *net)
{
	int ret;

	if (net_eq(net, &init_net)) {
		ret = nf_conntrack_init_init_net();
		if (ret < 0)
			goto out_init_net;
	}
	ret = nf_conntrack_init_net(net);
	if (ret < 0)
		goto out_net;

	if (net_eq(net, &init_net)) {
		/* For use by REJECT target */
		rcu_assign_pointer(ip_ct_attach, nf_conntrack_attach);
		rcu_assign_pointer(nf_ct_destroy, destroy_conntrack);

		/* Howto get NAT offsets */
		rcu_assign_pointer(nf_ct_nat_offset, NULL);
	}
	return 0;

out_net:
	if (net_eq(net, &init_net))
		nf_conntrack_cleanup_init_net();
out_init_net:
	return ret;
}