/*
 * contrib/bind9/lib/dns/rbtdb.c
 *
 * NOTE(review): the metadata below is web-viewer chrome captured along with
 * the source and is not C code; it is preserved here inside a comment.
 * Origin: https://bitbucket.org/freebsd/freebsd-head/ - C - 9332 lines -
 * 6603 code - 1036 blank - 1693 comment - 2043 complexity -
 * 246c4f82217f43e68410e305432312b4 MD5.  This capture is truncated; the
 * full file continues beyond what is shown here.
 */
  1. /*
  2. * Copyright (C) 2004-2012 Internet Systems Consortium, Inc. ("ISC")
  3. * Copyright (C) 1999-2003 Internet Software Consortium.
  4. *
  5. * Permission to use, copy, modify, and/or distribute this software for any
  6. * purpose with or without fee is hereby granted, provided that the above
  7. * copyright notice and this permission notice appear in all copies.
  8. *
  9. * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
  10. * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
  11. * AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
  12. * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
  13. * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
  14. * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
  15. * PERFORMANCE OF THIS SOFTWARE.
  16. */
  17. /* $Id$ */
  18. /*! \file */
  19. /*
  20. * Principal Author: Bob Halley
  21. */
  22. #include <config.h>
  23. /* #define inline */
  24. #include <isc/event.h>
  25. #include <isc/heap.h>
  26. #include <isc/mem.h>
  27. #include <isc/mutex.h>
  28. #include <isc/platform.h>
  29. #include <isc/print.h>
  30. #include <isc/random.h>
  31. #include <isc/refcount.h>
  32. #include <isc/rwlock.h>
  33. #include <isc/serial.h>
  34. #include <isc/string.h>
  35. #include <isc/task.h>
  36. #include <isc/time.h>
  37. #include <isc/util.h>
  38. #include <dns/acache.h>
  39. #include <dns/db.h>
  40. #include <dns/dbiterator.h>
  41. #include <dns/events.h>
  42. #include <dns/fixedname.h>
  43. #include <dns/lib.h>
  44. #include <dns/log.h>
  45. #include <dns/masterdump.h>
  46. #include <dns/nsec.h>
  47. #include <dns/nsec3.h>
  48. #include <dns/rbt.h>
  49. #include <dns/rpz.h>
  50. #include <dns/rdata.h>
  51. #include <dns/rdataset.h>
  52. #include <dns/rdatasetiter.h>
  53. #include <dns/rdataslab.h>
  54. #include <dns/rdatastruct.h>
  55. #include <dns/result.h>
  56. #include <dns/stats.h>
  57. #include <dns/view.h>
  58. #include <dns/zone.h>
  59. #include <dns/zonekey.h>
  60. #ifdef DNS_RBTDB_VERSION64
  61. #include "rbtdb64.h"
  62. #else
  63. #include "rbtdb.h"
  64. #endif
  65. #ifdef DNS_RBTDB_VERSION64
  66. #define RBTDB_MAGIC ISC_MAGIC('R', 'B', 'D', '8')
  67. #else
  68. #define RBTDB_MAGIC ISC_MAGIC('R', 'B', 'D', '4')
  69. #endif
  70. /*%
  71. * Note that "impmagic" is not the first four bytes of the struct, so
  72. * ISC_MAGIC_VALID cannot be used.
  73. */
  74. #define VALID_RBTDB(rbtdb) ((rbtdb) != NULL && \
  75. (rbtdb)->common.impmagic == RBTDB_MAGIC)
  76. #ifdef DNS_RBTDB_VERSION64
  77. typedef isc_uint64_t rbtdb_serial_t;
  78. /*%
  79. * Make casting easier in symbolic debuggers by using different names
  80. * for the 64 bit version.
  81. */
  82. #define dns_rbtdb_t dns_rbtdb64_t
  83. #define rdatasetheader_t rdatasetheader64_t
  84. #define rbtdb_version_t rbtdb_version64_t
  85. #else
  86. typedef isc_uint32_t rbtdb_serial_t;
  87. #endif
  88. typedef isc_uint32_t rbtdb_rdatatype_t;
  89. #define RBTDB_RDATATYPE_BASE(type) ((dns_rdatatype_t)((type) & 0xFFFF))
  90. #define RBTDB_RDATATYPE_EXT(type) ((dns_rdatatype_t)((type) >> 16))
  91. #define RBTDB_RDATATYPE_VALUE(b, e) ((rbtdb_rdatatype_t)((e) << 16) | (b))
  92. #define RBTDB_RDATATYPE_SIGNSEC \
  93. RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_nsec)
  94. #define RBTDB_RDATATYPE_SIGNSEC3 \
  95. RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_nsec3)
  96. #define RBTDB_RDATATYPE_SIGNS \
  97. RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_ns)
  98. #define RBTDB_RDATATYPE_SIGCNAME \
  99. RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_cname)
  100. #define RBTDB_RDATATYPE_SIGDNAME \
  101. RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_dname)
  102. #define RBTDB_RDATATYPE_NCACHEANY \
  103. RBTDB_RDATATYPE_VALUE(0, dns_rdatatype_any)
  104. /*
  105. * We use rwlock for DB lock only when ISC_RWLOCK_USEATOMIC is non 0.
  106. * Using rwlock is effective with regard to lookup performance only when
  107. * it is implemented in an efficient way.
  108. * Otherwise, it is generally wise to stick to the simple locking since rwlock
  109. * would require more memory or can even make lookups slower due to its own
  110. * overhead (when it internally calls mutex locks).
  111. */
  112. #ifdef ISC_RWLOCK_USEATOMIC
  113. #define DNS_RBTDB_USERWLOCK 1
  114. #else
  115. #define DNS_RBTDB_USERWLOCK 0
  116. #endif
  117. #if DNS_RBTDB_USERWLOCK
  118. #define RBTDB_INITLOCK(l) isc_rwlock_init((l), 0, 0)
  119. #define RBTDB_DESTROYLOCK(l) isc_rwlock_destroy(l)
  120. #define RBTDB_LOCK(l, t) RWLOCK((l), (t))
  121. #define RBTDB_UNLOCK(l, t) RWUNLOCK((l), (t))
  122. #else
  123. #define RBTDB_INITLOCK(l) isc_mutex_init(l)
  124. #define RBTDB_DESTROYLOCK(l) DESTROYLOCK(l)
  125. #define RBTDB_LOCK(l, t) LOCK(l)
  126. #define RBTDB_UNLOCK(l, t) UNLOCK(l)
  127. #endif
  128. /*
  129. * Since node locking is sensitive to both performance and memory footprint,
  130. * we need some trick here. If we have both high-performance rwlock and
  131. * high performance and small-memory reference counters, we use rwlock for
  132. * node lock and isc_refcount for node references. In this case, we don't have
  133. * to protect the access to the counters by locks.
  134. * Otherwise, we simply use ordinary mutex lock for node locking, and use
  135. * simple integers as reference counters which is protected by the lock.
  136. * In most cases, we can simply use wrapper macros such as NODE_LOCK and
  137. * NODE_UNLOCK. In some other cases, however, we need to protect reference
  138. * counters first and then protect other parts of a node as read-only data.
  139. * Special additional macros, NODE_STRONGLOCK(), NODE_WEAKLOCK(), etc, are also
  140. * provided for these special cases. When we can use the efficient backend
  141. * routines, we should only protect the "other members" by NODE_WEAKLOCK(read).
  142. * Otherwise, we should use NODE_STRONGLOCK() to protect the entire critical
  143. * section including the access to the reference counter.
  144. * Note that we cannot use NODE_LOCK()/NODE_UNLOCK() wherever the protected
  145. * section is also protected by NODE_STRONGLOCK().
  146. */
  147. #if defined(ISC_RWLOCK_USEATOMIC) && defined(DNS_RBT_USEISCREFCOUNT)
  148. typedef isc_rwlock_t nodelock_t;
  149. #define NODE_INITLOCK(l) isc_rwlock_init((l), 0, 0)
  150. #define NODE_DESTROYLOCK(l) isc_rwlock_destroy(l)
  151. #define NODE_LOCK(l, t) RWLOCK((l), (t))
  152. #define NODE_UNLOCK(l, t) RWUNLOCK((l), (t))
  153. #define NODE_TRYUPGRADE(l) isc_rwlock_tryupgrade(l)
  154. #define NODE_STRONGLOCK(l) ((void)0)
  155. #define NODE_STRONGUNLOCK(l) ((void)0)
  156. #define NODE_WEAKLOCK(l, t) NODE_LOCK(l, t)
  157. #define NODE_WEAKUNLOCK(l, t) NODE_UNLOCK(l, t)
  158. #define NODE_WEAKDOWNGRADE(l) isc_rwlock_downgrade(l)
  159. #else
  160. typedef isc_mutex_t nodelock_t;
  161. #define NODE_INITLOCK(l) isc_mutex_init(l)
  162. #define NODE_DESTROYLOCK(l) DESTROYLOCK(l)
  163. #define NODE_LOCK(l, t) LOCK(l)
  164. #define NODE_UNLOCK(l, t) UNLOCK(l)
  165. #define NODE_TRYUPGRADE(l) ISC_R_SUCCESS
  166. #define NODE_STRONGLOCK(l) LOCK(l)
  167. #define NODE_STRONGUNLOCK(l) UNLOCK(l)
  168. #define NODE_WEAKLOCK(l, t) ((void)0)
  169. #define NODE_WEAKUNLOCK(l, t) ((void)0)
  170. #define NODE_WEAKDOWNGRADE(l) ((void)0)
  171. #endif
  172. /*%
  173. * Whether to rate-limit updating the LRU to avoid possible thread contention.
  174. * Our performance measurement has shown the cost is marginal, so it's defined
  175. * to be 0 by default either with or without threads.
  176. */
  177. #ifndef DNS_RBTDB_LIMITLRUUPDATE
  178. #define DNS_RBTDB_LIMITLRUUPDATE 0
  179. #endif
  180. /*
  181. * Allow clients with a virtual time of up to 5 minutes in the past to see
  182. * records that would have otherwise have expired.
  183. */
  184. #define RBTDB_VIRTUAL 300
  185. struct noqname {
  186. dns_name_t name;
  187. void * neg;
  188. void * negsig;
  189. dns_rdatatype_t type;
  190. };
  191. typedef struct acachectl acachectl_t;
typedef struct rdatasetheader {
	/*%
	 * Locked by the owning node's lock.
	 */
	rbtdb_serial_t serial;		/* version serial this header belongs to */
	dns_ttl_t rdh_ttl;		/* TTL; drives the cache TTL heap (see set_ttl()) */
	rbtdb_rdatatype_t type;		/* combined base/covers type (RBTDB_RDATATYPE_VALUE) */
	isc_uint16_t attributes;	/* bitmask of RDATASET_ATTR_* flags */
	dns_trust_t trust;
	struct noqname *noqname;
	struct noqname *closest;
	/*%<
	 * We don't use the LIST macros, because the LIST structure has
	 * both head and tail pointers, and is doubly linked.
	 */
	struct rdatasetheader *next;
	/*%<
	 * If this is the top header for an rdataset, 'next' points
	 * to the top header for the next rdataset (i.e., the next type).
	 * Otherwise, it points up to the header whose down pointer points
	 * at this header.
	 */
	struct rdatasetheader *down;
	/*%<
	 * Points to the header for the next older version of
	 * this rdataset.
	 */
	isc_uint32_t count;
	/*%<
	 * Monotonously increased every time this rdataset is bound so that
	 * it is used as the base of the starting point in DNS responses
	 * when the "cyclic" rrset-order is required.  Since the ordering
	 * should not be so crucial, no lock is set for the counter for
	 * performance reasons.
	 */
	acachectl_t *additional_auth;
	acachectl_t *additional_glue;
	dns_rbtnode_t *node;		/* owning tree node */
	isc_stdtime_t last_used;	/* presumably for LRU ordering; see rdatasets lists */
	ISC_LINK(struct rdatasetheader) link;
	unsigned int heap_index;	/* 1-based slot in the per-bucket heap; 0 = not in a heap */
	/*%<
	 * Used for TTL-based cache cleaning.
	 */
	isc_stdtime_t resign;		/* re-signing time; compared by resign_sooner() */
} rdatasetheader_t;
  238. typedef ISC_LIST(rdatasetheader_t) rdatasetheaderlist_t;
  239. typedef ISC_LIST(dns_rbtnode_t) rbtnodelist_t;
  240. #define RDATASET_ATTR_NONEXISTENT 0x0001
  241. #define RDATASET_ATTR_STALE 0x0002
  242. #define RDATASET_ATTR_IGNORE 0x0004
  243. #define RDATASET_ATTR_RETAIN 0x0008
  244. #define RDATASET_ATTR_NXDOMAIN 0x0010
  245. #define RDATASET_ATTR_RESIGN 0x0020
  246. #define RDATASET_ATTR_STATCOUNT 0x0040
  247. #define RDATASET_ATTR_OPTOUT 0x0080
  248. #define RDATASET_ATTR_NEGATIVE 0x0100
  249. typedef struct acache_cbarg {
  250. dns_rdatasetadditional_t type;
  251. unsigned int count;
  252. dns_db_t *db;
  253. dns_dbnode_t *node;
  254. rdatasetheader_t *header;
  255. } acache_cbarg_t;
  256. struct acachectl {
  257. dns_acacheentry_t *entry;
  258. acache_cbarg_t *cbarg;
  259. };
  260. /*
  261. * XXX
  262. * When the cache will pre-expire data (due to memory low or other
  263. * situations) before the rdataset's TTL has expired, it MUST
  264. * respect the RETAIN bit and not expire the data until its TTL is
  265. * expired.
  266. */
  267. #undef IGNORE /* WIN32 winbase.h defines this. */
  268. #define EXISTS(header) \
  269. (((header)->attributes & RDATASET_ATTR_NONEXISTENT) == 0)
  270. #define NONEXISTENT(header) \
  271. (((header)->attributes & RDATASET_ATTR_NONEXISTENT) != 0)
  272. #define IGNORE(header) \
  273. (((header)->attributes & RDATASET_ATTR_IGNORE) != 0)
  274. #define RETAIN(header) \
  275. (((header)->attributes & RDATASET_ATTR_RETAIN) != 0)
  276. #define NXDOMAIN(header) \
  277. (((header)->attributes & RDATASET_ATTR_NXDOMAIN) != 0)
  278. #define RESIGN(header) \
  279. (((header)->attributes & RDATASET_ATTR_RESIGN) != 0)
  280. #define OPTOUT(header) \
  281. (((header)->attributes & RDATASET_ATTR_OPTOUT) != 0)
  282. #define NEGATIVE(header) \
  283. (((header)->attributes & RDATASET_ATTR_NEGATIVE) != 0)
  284. #define DEFAULT_NODE_LOCK_COUNT 7 /*%< Should be prime. */
  285. /*%
  286. * Number of buckets for cache DB entries (locks, LRU lists, TTL heaps).
  287. * There is a tradeoff issue about configuring this value: if this is too
  288. * small, it may cause heavier contention between threads; if this is too large,
  289. * LRU purge algorithm won't work well (entries tend to be purged prematurely).
  290. * The default value should work well for most environments, but this can
  291. * also be configurable at compilation time via the
  292. * DNS_RBTDB_CACHE_NODE_LOCK_COUNT variable. This value must be larger than
  293. * 1 due to the assumption of overmem_purge().
  294. */
  295. #ifdef DNS_RBTDB_CACHE_NODE_LOCK_COUNT
  296. #if DNS_RBTDB_CACHE_NODE_LOCK_COUNT <= 1
  297. #error "DNS_RBTDB_CACHE_NODE_LOCK_COUNT must be larger than 1"
  298. #else
  299. #define DEFAULT_CACHE_NODE_LOCK_COUNT DNS_RBTDB_CACHE_NODE_LOCK_COUNT
  300. #endif
  301. #else
  302. #define DEFAULT_CACHE_NODE_LOCK_COUNT 16
  303. #endif /* DNS_RBTDB_CACHE_NODE_LOCK_COUNT */
  304. typedef struct {
  305. nodelock_t lock;
  306. /* Protected in the refcount routines. */
  307. isc_refcount_t references;
  308. /* Locked by lock. */
  309. isc_boolean_t exiting;
  310. } rbtdb_nodelock_t;
  311. typedef struct rbtdb_changed {
  312. dns_rbtnode_t * node;
  313. isc_boolean_t dirty;
  314. ISC_LINK(struct rbtdb_changed) link;
  315. } rbtdb_changed_t;
  316. typedef ISC_LIST(rbtdb_changed_t) rbtdb_changedlist_t;
  317. typedef enum {
  318. dns_db_insecure,
  319. dns_db_partial,
  320. dns_db_secure
  321. } dns_db_secure_t;
  322. typedef struct dns_rbtdb dns_rbtdb_t;
typedef struct rbtdb_version {
	/* Not locked */
	rbtdb_serial_t serial;		/* serial number identifying this version */
	dns_rbtdb_t *rbtdb;		/* back-pointer to the owning database */
	/*
	 * Protected in the refcount routines.
	 * XXXJT: should we change the lock policy based on the refcount
	 * performance?
	 */
	isc_refcount_t references;
	/* Locked by database lock. */
	isc_boolean_t writer;		/* NOTE(review): presumably true for the
					 * writable (future) version -- confirm */
	isc_boolean_t commit_ok;
	rbtdb_changedlist_t changed_list;	/* nodes changed in this version */
	rdatasetheaderlist_t resigned_list;	/* headers pulled off the resign heap */
	ISC_LINK(struct rbtdb_version) link;	/* linkage on rbtdb->open_versions */
	dns_db_secure_t secure;
	isc_boolean_t havensec3;
	/* NSEC3 parameters, valid when havensec3 is set */
	dns_hash_t hash;
	isc_uint8_t flags;
	isc_uint16_t iterations;
	isc_uint8_t salt_length;
	unsigned char salt[DNS_NSEC3_SALTSIZE];
} rbtdb_version_t;
  348. typedef ISC_LIST(rbtdb_version_t) rbtdb_versionlist_t;
struct dns_rbtdb {
	/* Unlocked. */
	dns_db_t common;		/* must be first; generic DB header */
	/* Locks the data in this struct */
#if DNS_RBTDB_USERWLOCK
	isc_rwlock_t lock;
#else
	isc_mutex_t lock;
#endif
	/* Locks the tree structure (prevents nodes appearing/disappearing) */
	isc_rwlock_t tree_lock;
	/* Locks for individual tree nodes */
	unsigned int node_lock_count;	/* number of buckets; > 1 for caches */
	rbtdb_nodelock_t *node_locks;
	dns_rbtnode_t *origin_node;
	dns_stats_t *rrsetstats;	/* cache DB only */
	/* Locked by lock. */
	unsigned int active;
	isc_refcount_t references;
	unsigned int attributes;	/* RBTDB_ATTR_LOADED / RBTDB_ATTR_LOADING */
	rbtdb_serial_t current_serial;
	rbtdb_serial_t least_serial;
	rbtdb_serial_t next_serial;
	rbtdb_version_t *current_version;
	rbtdb_version_t *future_version;	/* at most one; must be NULL at teardown */
	rbtdb_versionlist_t open_versions;
	isc_task_t *task;		/* used to resume incremental destruction */
	dns_dbnode_t *soanode;
	dns_dbnode_t *nsnode;
	/*
	 * This is a linked list used to implement the LRU cache.  There will
	 * be node_lock_count linked lists here.  Nodes in bucket 1 will be
	 * placed on the linked list rdatasets[1].
	 */
	rdatasetheaderlist_t *rdatasets;
	/*%
	 * Temporary storage for stale cache nodes and dynamically deleted
	 * nodes that await being cleaned up.
	 */
	rbtnodelist_t *deadnodes;
	/*
	 * Heaps.  These are used for TTL based expiry in a cache,
	 * or for zone resigning in a zone DB.  hmctx is the memory
	 * context to use for the heap (which differs from the main
	 * database memory context in the case of a cache).
	 */
	isc_mem_t *hmctx;
	isc_heap_t **heaps;		/* one heap per node-lock bucket */
	/* Locked by tree_lock. */
	dns_rbt_t *tree;		/* main name tree */
	dns_rbt_t *nsec;		/* auxiliary NSEC tree */
	dns_rbt_t *nsec3;		/* auxiliary NSEC3 tree */
	dns_rpz_cidr_t *rpz_cidr;	/* response-policy-zone CIDR data */
	/* Unlocked */
	unsigned int quantum;		/* nodes destroyed per slice in free_rbtdb() */
};
  405. #define RBTDB_ATTR_LOADED 0x01
  406. #define RBTDB_ATTR_LOADING 0x02
/*%
 * Search Context -- scratch state carried through a single lookup.
 */
typedef struct {
	dns_rbtdb_t *rbtdb;
	rbtdb_version_t *rbtversion;	/* version being searched */
	rbtdb_serial_t serial;		/* serial of that version */
	unsigned int options;		/* caller's DNS_DBFIND_* options */
	dns_rbtnodechain_t chain;	/* path taken through the tree */
	isc_boolean_t copy_name;
	isc_boolean_t need_cleanup;
	isc_boolean_t wild;		/* wildcard seen along the search path */
	dns_rbtnode_t *zonecut;		/* deepest delegation point found, if any */
	rdatasetheader_t *zonecut_rdataset;
	rdatasetheader_t *zonecut_sigrdataset;
	dns_fixedname_t zonecut_name;
	isc_stdtime_t now;		/* time of the search, for TTL checks */
} rbtdb_search_t;
  425. /*%
  426. * Load Context
  427. */
  428. typedef struct {
  429. dns_rbtdb_t * rbtdb;
  430. isc_stdtime_t now;
  431. } rbtdb_load_t;
  432. static void rdataset_disassociate(dns_rdataset_t *rdataset);
  433. static isc_result_t rdataset_first(dns_rdataset_t *rdataset);
  434. static isc_result_t rdataset_next(dns_rdataset_t *rdataset);
  435. static void rdataset_current(dns_rdataset_t *rdataset, dns_rdata_t *rdata);
  436. static void rdataset_clone(dns_rdataset_t *source, dns_rdataset_t *target);
  437. static unsigned int rdataset_count(dns_rdataset_t *rdataset);
  438. static isc_result_t rdataset_getnoqname(dns_rdataset_t *rdataset,
  439. dns_name_t *name,
  440. dns_rdataset_t *neg,
  441. dns_rdataset_t *negsig);
  442. static isc_result_t rdataset_getclosest(dns_rdataset_t *rdataset,
  443. dns_name_t *name,
  444. dns_rdataset_t *neg,
  445. dns_rdataset_t *negsig);
  446. static isc_result_t rdataset_getadditional(dns_rdataset_t *rdataset,
  447. dns_rdatasetadditional_t type,
  448. dns_rdatatype_t qtype,
  449. dns_acache_t *acache,
  450. dns_zone_t **zonep,
  451. dns_db_t **dbp,
  452. dns_dbversion_t **versionp,
  453. dns_dbnode_t **nodep,
  454. dns_name_t *fname,
  455. dns_message_t *msg,
  456. isc_stdtime_t now);
  457. static isc_result_t rdataset_setadditional(dns_rdataset_t *rdataset,
  458. dns_rdatasetadditional_t type,
  459. dns_rdatatype_t qtype,
  460. dns_acache_t *acache,
  461. dns_zone_t *zone,
  462. dns_db_t *db,
  463. dns_dbversion_t *version,
  464. dns_dbnode_t *node,
  465. dns_name_t *fname);
  466. static isc_result_t rdataset_putadditional(dns_acache_t *acache,
  467. dns_rdataset_t *rdataset,
  468. dns_rdatasetadditional_t type,
  469. dns_rdatatype_t qtype);
  470. static inline isc_boolean_t need_headerupdate(rdatasetheader_t *header,
  471. isc_stdtime_t now);
  472. static void update_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header,
  473. isc_stdtime_t now);
  474. static void expire_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header,
  475. isc_boolean_t tree_locked);
  476. static void overmem_purge(dns_rbtdb_t *rbtdb, unsigned int locknum_start,
  477. isc_stdtime_t now, isc_boolean_t tree_locked);
  478. static isc_result_t resign_insert(dns_rbtdb_t *rbtdb, int idx,
  479. rdatasetheader_t *newheader);
  480. static void prune_tree(isc_task_t *task, isc_event_t *event);
  481. static void rdataset_settrust(dns_rdataset_t *rdataset, dns_trust_t trust);
  482. static void rdataset_expire(dns_rdataset_t *rdataset);
  483. static dns_rdatasetmethods_t rdataset_methods = {
  484. rdataset_disassociate,
  485. rdataset_first,
  486. rdataset_next,
  487. rdataset_current,
  488. rdataset_clone,
  489. rdataset_count,
  490. NULL,
  491. rdataset_getnoqname,
  492. NULL,
  493. rdataset_getclosest,
  494. rdataset_getadditional,
  495. rdataset_setadditional,
  496. rdataset_putadditional,
  497. rdataset_settrust,
  498. rdataset_expire
  499. };
  500. static void rdatasetiter_destroy(dns_rdatasetiter_t **iteratorp);
  501. static isc_result_t rdatasetiter_first(dns_rdatasetiter_t *iterator);
  502. static isc_result_t rdatasetiter_next(dns_rdatasetiter_t *iterator);
  503. static void rdatasetiter_current(dns_rdatasetiter_t *iterator,
  504. dns_rdataset_t *rdataset);
  505. static dns_rdatasetitermethods_t rdatasetiter_methods = {
  506. rdatasetiter_destroy,
  507. rdatasetiter_first,
  508. rdatasetiter_next,
  509. rdatasetiter_current
  510. };
  511. typedef struct rbtdb_rdatasetiter {
  512. dns_rdatasetiter_t common;
  513. rdatasetheader_t * current;
  514. } rbtdb_rdatasetiter_t;
  515. static void dbiterator_destroy(dns_dbiterator_t **iteratorp);
  516. static isc_result_t dbiterator_first(dns_dbiterator_t *iterator);
  517. static isc_result_t dbiterator_last(dns_dbiterator_t *iterator);
  518. static isc_result_t dbiterator_seek(dns_dbiterator_t *iterator,
  519. dns_name_t *name);
  520. static isc_result_t dbiterator_prev(dns_dbiterator_t *iterator);
  521. static isc_result_t dbiterator_next(dns_dbiterator_t *iterator);
  522. static isc_result_t dbiterator_current(dns_dbiterator_t *iterator,
  523. dns_dbnode_t **nodep,
  524. dns_name_t *name);
  525. static isc_result_t dbiterator_pause(dns_dbiterator_t *iterator);
  526. static isc_result_t dbiterator_origin(dns_dbiterator_t *iterator,
  527. dns_name_t *name);
  528. static dns_dbiteratormethods_t dbiterator_methods = {
  529. dbiterator_destroy,
  530. dbiterator_first,
  531. dbiterator_last,
  532. dbiterator_seek,
  533. dbiterator_prev,
  534. dbiterator_next,
  535. dbiterator_current,
  536. dbiterator_pause,
  537. dbiterator_origin
  538. };
  539. #define DELETION_BATCH_MAX 64
/*
 * Database iterator state.
 * If 'paused' is ISC_TRUE, then the tree lock is not being held.
 */
typedef struct rbtdb_dbiterator {
	dns_dbiterator_t common;	/* must be first; generic iterator header */
	isc_boolean_t paused;
	isc_boolean_t new_origin;
	isc_rwlocktype_t tree_locked;	/* how (if at all) tree_lock is held */
	isc_result_t result;		/* sticky result of the last operation */
	dns_fixedname_t name;
	dns_fixedname_t origin;
	dns_rbtnodechain_t chain;	/* chain through the main/NSEC trees */
	dns_rbtnodechain_t nsec3chain;	/* chain through the NSEC3 tree */
	dns_rbtnodechain_t *current;	/* whichever chain is active */
	dns_rbtnode_t *node;
	dns_rbtnode_t *deletions[DELETION_BATCH_MAX];	/* nodes to detach when unpaused */
	int delete;			/* number of entries in deletions[] */
	isc_boolean_t nsec3only;
	isc_boolean_t nonsec3;
} rbtdb_dbiterator_t;
  560. #define IS_STUB(rbtdb) (((rbtdb)->common.attributes & DNS_DBATTR_STUB) != 0)
  561. #define IS_CACHE(rbtdb) (((rbtdb)->common.attributes & DNS_DBATTR_CACHE) != 0)
  562. static void free_rbtdb(dns_rbtdb_t *rbtdb, isc_boolean_t log,
  563. isc_event_t *event);
  564. static void overmem(dns_db_t *db, isc_boolean_t overmem);
  565. #ifdef BIND9
  566. static void setnsec3parameters(dns_db_t *db, rbtdb_version_t *version);
  567. #endif
  568. /*%
  569. * 'init_count' is used to initialize 'newheader->count' which inturn
  570. * is used to determine where in the cycle rrset-order cyclic starts.
  571. * We don't lock this as we don't care about simultaneous updates.
  572. *
  573. * Note:
  574. * Both init_count and header->count can be ISC_UINT32_MAX.
  575. * The count on the returned rdataset however can't be as
  576. * that indicates that the database does not implement cyclic
  577. * processing.
  578. */
  579. static unsigned int init_count;
  580. /*
  581. * Locking
  582. *
  583. * If a routine is going to lock more than one lock in this module, then
  584. * the locking must be done in the following order:
  585. *
  586. * Tree Lock
  587. *
  588. * Node Lock (Only one from the set may be locked at one time by
  589. * any caller)
  590. *
  591. * Database Lock
  592. *
  593. * Failure to follow this hierarchy can result in deadlock.
  594. */
  595. /*
  596. * Deleting Nodes
  597. *
  598. * For zone databases the node for the origin of the zone MUST NOT be deleted.
  599. */
  600. /*
  601. * DB Routines
  602. */
  603. static void
  604. attach(dns_db_t *source, dns_db_t **targetp) {
  605. dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)source;
  606. REQUIRE(VALID_RBTDB(rbtdb));
  607. isc_refcount_increment(&rbtdb->references, NULL);
  608. *targetp = source;
  609. }
  610. static void
  611. free_rbtdb_callback(isc_task_t *task, isc_event_t *event) {
  612. dns_rbtdb_t *rbtdb = event->ev_arg;
  613. UNUSED(task);
  614. free_rbtdb(rbtdb, ISC_TRUE, event);
  615. }
  616. static void
  617. update_rrsetstats(dns_rbtdb_t *rbtdb, rdatasetheader_t *header,
  618. isc_boolean_t increment)
  619. {
  620. dns_rdatastatstype_t statattributes = 0;
  621. dns_rdatastatstype_t base = 0;
  622. dns_rdatastatstype_t type;
  623. /* At the moment we count statistics only for cache DB */
  624. INSIST(IS_CACHE(rbtdb));
  625. if (NEGATIVE(header)) {
  626. if (NXDOMAIN(header))
  627. statattributes = DNS_RDATASTATSTYPE_ATTR_NXDOMAIN;
  628. else {
  629. statattributes = DNS_RDATASTATSTYPE_ATTR_NXRRSET;
  630. base = RBTDB_RDATATYPE_EXT(header->type);
  631. }
  632. } else
  633. base = RBTDB_RDATATYPE_BASE(header->type);
  634. type = DNS_RDATASTATSTYPE_VALUE(base, statattributes);
  635. if (increment)
  636. dns_rdatasetstats_increment(rbtdb->rrsetstats, type);
  637. else
  638. dns_rdatasetstats_decrement(rbtdb->rrsetstats, type);
  639. }
  640. static void
  641. set_ttl(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, dns_ttl_t newttl) {
  642. int idx;
  643. isc_heap_t *heap;
  644. dns_ttl_t oldttl;
  645. oldttl = header->rdh_ttl;
  646. header->rdh_ttl = newttl;
  647. if (!IS_CACHE(rbtdb))
  648. return;
  649. /*
  650. * It's possible the rbtdb is not a cache. If this is the case,
  651. * we will not have a heap, and we move on. If we do, though,
  652. * we might need to adjust things.
  653. */
  654. if (header->heap_index == 0 || newttl == oldttl)
  655. return;
  656. idx = header->node->locknum;
  657. if (rbtdb->heaps == NULL || rbtdb->heaps[idx] == NULL)
  658. return;
  659. heap = rbtdb->heaps[idx];
  660. if (newttl < oldttl)
  661. isc_heap_increased(heap, header->heap_index);
  662. else
  663. isc_heap_decreased(heap, header->heap_index);
  664. }
  665. /*%
  666. * These functions allow the heap code to rank the priority of each
  667. * element. It returns ISC_TRUE if v1 happens "sooner" than v2.
  668. */
  669. static isc_boolean_t
  670. ttl_sooner(void *v1, void *v2) {
  671. rdatasetheader_t *h1 = v1;
  672. rdatasetheader_t *h2 = v2;
  673. if (h1->rdh_ttl < h2->rdh_ttl)
  674. return (ISC_TRUE);
  675. return (ISC_FALSE);
  676. }
  677. static isc_boolean_t
  678. resign_sooner(void *v1, void *v2) {
  679. rdatasetheader_t *h1 = v1;
  680. rdatasetheader_t *h2 = v2;
  681. if (h1->resign < h2->resign)
  682. return (ISC_TRUE);
  683. return (ISC_FALSE);
  684. }
  685. /*%
  686. * This function sets the heap index into the header.
  687. */
  688. static void
  689. set_index(void *what, unsigned int index) {
  690. rdatasetheader_t *h = what;
  691. h->heap_index = index;
  692. }
  693. /*%
  694. * Work out how many nodes can be deleted in the time between two
  695. * requests to the nameserver. Smooth the resulting number and use it
  696. * as a estimate for the number of nodes to be deleted in the next
  697. * iteration.
  698. */
  699. static unsigned int
  700. adjust_quantum(unsigned int old, isc_time_t *start) {
  701. unsigned int pps = dns_pps; /* packets per second */
  702. unsigned int interval;
  703. isc_uint64_t usecs;
  704. isc_time_t end;
  705. unsigned int new;
  706. if (pps < 100)
  707. pps = 100;
  708. isc_time_now(&end);
  709. interval = 1000000 / pps; /* interval in usec */
  710. if (interval == 0)
  711. interval = 1;
  712. usecs = isc_time_microdiff(&end, start);
  713. if (usecs == 0) {
  714. /*
  715. * We were unable to measure the amount of time taken.
  716. * Double the nodes deleted next time.
  717. */
  718. old *= 2;
  719. if (old > 1000)
  720. old = 1000;
  721. return (old);
  722. }
  723. new = old * interval;
  724. new /= (unsigned int)usecs;
  725. if (new == 0)
  726. new = 1;
  727. else if (new > 1000)
  728. new = 1000;
  729. /* Smooth */
  730. new = (new + old * 3) / 4;
  731. isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE, DNS_LOGMODULE_CACHE,
  732. ISC_LOG_DEBUG(1), "adjust_quantum -> %d", new);
  733. return (new);
  734. }
/*
 * Destroy the database, possibly incrementally.
 *
 * The three RBTs can be huge, so they are destroyed in slices of
 * 'rbtdb->quantum' nodes; when dns_rbt_destroy2() returns ISC_R_QUOTA
 * this function re-posts itself to rbtdb->task (via free_rbtdb_callback)
 * and returns, resuming later.  'event' is non-NULL on such resumed
 * calls and is reused for the next resumption.  'log' selects whether
 * a completion message is written.
 */
static void
free_rbtdb(dns_rbtdb_t *rbtdb, isc_boolean_t log, isc_event_t *event) {
	unsigned int i;
	isc_ondestroy_t ondest;
	isc_result_t result;
	char buf[DNS_NAME_FORMATSIZE];
	dns_rbt_t **treep;
	isc_time_t start;

	/*
	 * NOTE(review): (isc_boolean_t)-1 looks like a sentinel telling
	 * overmem() this is final teardown rather than a normal
	 * ISC_TRUE/ISC_FALSE transition -- confirm against overmem().
	 */
	if (IS_CACHE(rbtdb) && rbtdb->common.rdclass == dns_rdataclass_in)
		overmem((dns_db_t *)rbtdb, (isc_boolean_t)-1);

	REQUIRE(rbtdb->current_version != NULL || EMPTY(rbtdb->open_versions));
	REQUIRE(rbtdb->future_version == NULL);

	/* Drop the database's own reference to the current version. */
	if (rbtdb->current_version != NULL) {
		unsigned int refs;

		isc_refcount_decrement(&rbtdb->current_version->references,
				       &refs);
		INSIST(refs == 0);
		UNLINK(rbtdb->open_versions, rbtdb->current_version, link);
		isc_refcount_destroy(&rbtdb->current_version->references);
		isc_mem_put(rbtdb->common.mctx, rbtdb->current_version,
			    sizeof(rbtdb_version_t));
	}

	/*
	 * We assume the number of remaining dead nodes is reasonably small;
	 * the overhead of unlinking all nodes here should be negligible.
	 */
	for (i = 0; i < rbtdb->node_lock_count; i++) {
		dns_rbtnode_t *node;

		node = ISC_LIST_HEAD(rbtdb->deadnodes[i]);
		while (node != NULL) {
			ISC_LIST_UNLINK(rbtdb->deadnodes[i], node, deadlink);
			node = ISC_LIST_HEAD(rbtdb->deadnodes[i]);
		}
	}

	/*
	 * First (non-resumed) call: destroy incrementally only if we have
	 * a task to resume on; otherwise quantum 0 means "all at once".
	 */
	if (event == NULL)
		rbtdb->quantum = (rbtdb->task != NULL) ? 100 : 0;

	for (;;) {
		/*
		 * pick the next tree to (start to) destroy
		 */
		treep = &rbtdb->tree;
		if (*treep == NULL) {
			treep = &rbtdb->nsec;
			if (*treep == NULL) {
				treep = &rbtdb->nsec3;
				/*
				 * we're finished after clear cutting
				 */
				if (*treep == NULL)
					break;
			}
		}

		isc_time_now(&start);
		result = dns_rbt_destroy2(treep, rbtdb->quantum);
		if (result == ISC_R_QUOTA) {
			/* Quantum exhausted: reschedule and return. */
			INSIST(rbtdb->task != NULL);
			if (rbtdb->quantum != 0)
				rbtdb->quantum = adjust_quantum(rbtdb->quantum,
								&start);
			if (event == NULL)
				event = isc_event_allocate(rbtdb->common.mctx,
							   NULL,
							   DNS_EVENT_FREESTORAGE,
							   free_rbtdb_callback,
							   rbtdb,
							   sizeof(isc_event_t));
			/*
			 * If allocation failed, keep destroying inline
			 * rather than giving up.
			 */
			if (event == NULL)
				continue;
			isc_task_send(rbtdb->task, &event);
			return;
		}
		INSIST(result == ISC_R_SUCCESS && *treep == NULL);
	}

	/* All trees gone; the resumption event is no longer needed. */
	if (event != NULL)
		isc_event_free(&event);
	if (log) {
		if (dns_name_dynamic(&rbtdb->common.origin))
			dns_name_format(&rbtdb->common.origin, buf,
					sizeof(buf));
		else
			strcpy(buf, "<UNKNOWN>");
		isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
			      DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
			      "done free_rbtdb(%s)", buf);
	}
	if (dns_name_dynamic(&rbtdb->common.origin))
		dns_name_free(&rbtdb->common.origin, rbtdb->common.mctx);
	for (i = 0; i < rbtdb->node_lock_count; i++) {
		isc_refcount_destroy(&rbtdb->node_locks[i].references);
		NODE_DESTROYLOCK(&rbtdb->node_locks[i].lock);
	}

	/*
	 * Clean up LRU / re-signing order lists.
	 */
	if (rbtdb->rdatasets != NULL) {
		for (i = 0; i < rbtdb->node_lock_count; i++)
			INSIST(ISC_LIST_EMPTY(rbtdb->rdatasets[i]));
		isc_mem_put(rbtdb->common.mctx, rbtdb->rdatasets,
			    rbtdb->node_lock_count *
			    sizeof(rdatasetheaderlist_t));
	}
	/*
	 * Clean up dead node buckets.
	 */
	if (rbtdb->deadnodes != NULL) {
		for (i = 0; i < rbtdb->node_lock_count; i++)
			INSIST(ISC_LIST_EMPTY(rbtdb->deadnodes[i]));
		isc_mem_put(rbtdb->common.mctx, rbtdb->deadnodes,
			    rbtdb->node_lock_count * sizeof(rbtnodelist_t));
	}
	/*
	 * Clean up heap objects.  Note the heaps live in hmctx, not the
	 * main memory context.
	 */
	if (rbtdb->heaps != NULL) {
		for (i = 0; i < rbtdb->node_lock_count; i++)
			isc_heap_destroy(&rbtdb->heaps[i]);
		isc_mem_put(rbtdb->hmctx, rbtdb->heaps,
			    rbtdb->node_lock_count * sizeof(isc_heap_t *));
	}

	if (rbtdb->rrsetstats != NULL)
		dns_stats_detach(&rbtdb->rrsetstats);

#ifdef BIND9
	if (rbtdb->rpz_cidr != NULL)
		dns_rpz_cidr_free(&rbtdb->rpz_cidr);
#endif

	isc_mem_put(rbtdb->common.mctx, rbtdb->node_locks,
		    rbtdb->node_lock_count * sizeof(rbtdb_nodelock_t));
	isc_rwlock_destroy(&rbtdb->tree_lock);
	isc_refcount_destroy(&rbtdb->references);
	if (rbtdb->task != NULL)
		isc_task_detach(&rbtdb->task);

	RBTDB_DESTROYLOCK(&rbtdb->lock);
	/* Invalidate the magic numbers before freeing the structure. */
	rbtdb->common.magic = 0;
	rbtdb->common.impmagic = 0;
	ondest = rbtdb->common.ondest;
	isc_mem_detach(&rbtdb->hmctx);
	/* Save ondest first: the struct is gone after putanddetach. */
	isc_mem_putanddetach(&rbtdb->common.mctx, rbtdb, sizeof(*rbtdb));
	isc_ondestroy_notify(&ondest, rbtdb);
}
/*
 * Called when the last external reference to the database has been
 * dropped: mark every node-lock bucket as exiting and, if no nodes are
 * in use anywhere, free the whole database immediately.  Otherwise the
 * last node release on each bucket will finish the teardown later.
 */
static inline void
maybe_free_rbtdb(dns_rbtdb_t *rbtdb) {
	isc_boolean_t want_free = ISC_FALSE;
	unsigned int i;
	unsigned int inactive = 0;

	/* XXX check for open versions here */

	/*
	 * Drop the cached SOA/NS node references so they do not keep
	 * their lock buckets active forever.
	 */
	if (rbtdb->soanode != NULL)
		dns_db_detachnode((dns_db_t *)rbtdb, &rbtdb->soanode);
	if (rbtdb->nsnode != NULL)
		dns_db_detachnode((dns_db_t *)rbtdb, &rbtdb->nsnode);

	/*
	 * Even though there are no external direct references, there still
	 * may be nodes in use.
	 */
	for (i = 0; i < rbtdb->node_lock_count; i++) {
		NODE_LOCK(&rbtdb->node_locks[i].lock, isc_rwlocktype_write);
		rbtdb->node_locks[i].exiting = ISC_TRUE;
		NODE_UNLOCK(&rbtdb->node_locks[i].lock, isc_rwlocktype_write);
		if (isc_refcount_current(&rbtdb->node_locks[i].references)
		    == 0) {
			inactive++;
		}
	}

	if (inactive != 0) {
		/*
		 * Retire the inactive buckets; if none remain active the
		 * database itself can now be destroyed.
		 */
		RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
		rbtdb->active -= inactive;
		if (rbtdb->active == 0)
			want_free = ISC_TRUE;
		RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
		if (want_free) {
			char buf[DNS_NAME_FORMATSIZE];
			if (dns_name_dynamic(&rbtdb->common.origin))
				dns_name_format(&rbtdb->common.origin, buf,
						sizeof(buf));
			else
				strcpy(buf, "<UNKNOWN>");
			isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
				      DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
				      "calling free_rbtdb(%s)", buf);
			free_rbtdb(rbtdb, ISC_TRUE, NULL);
		}
	}
}
  917. static void
  918. detach(dns_db_t **dbp) {
  919. dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)(*dbp);
  920. unsigned int refs;
  921. REQUIRE(VALID_RBTDB(rbtdb));
  922. isc_refcount_decrement(&rbtdb->references, &refs);
  923. if (refs == 0)
  924. maybe_free_rbtdb(rbtdb);
  925. *dbp = NULL;
  926. }
  927. static void
  928. currentversion(dns_db_t *db, dns_dbversion_t **versionp) {
  929. dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
  930. rbtdb_version_t *version;
  931. unsigned int refs;
  932. REQUIRE(VALID_RBTDB(rbtdb));
  933. RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
  934. version = rbtdb->current_version;
  935. isc_refcount_increment(&version->references, &refs);
  936. RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read);
  937. *versionp = (dns_dbversion_t *)version;
  938. }
  939. static inline rbtdb_version_t *
  940. allocate_version(isc_mem_t *mctx, rbtdb_serial_t serial,
  941. unsigned int references, isc_boolean_t writer)
  942. {
  943. isc_result_t result;
  944. rbtdb_version_t *version;
  945. version = isc_mem_get(mctx, sizeof(*version));
  946. if (version == NULL)
  947. return (NULL);
  948. version->serial = serial;
  949. result = isc_refcount_init(&version->references, references);
  950. if (result != ISC_R_SUCCESS) {
  951. isc_mem_put(mctx, version, sizeof(*version));
  952. return (NULL);
  953. }
  954. version->writer = writer;
  955. version->commit_ok = ISC_FALSE;
  956. ISC_LIST_INIT(version->changed_list);
  957. ISC_LIST_INIT(version->resigned_list);
  958. ISC_LINK_INIT(version, link);
  959. return (version);
  960. }
/*
 * dns_db newversion method: open a new writable (future) version of the
 * zone database.  Only one uncommitted version may exist at a time; its
 * serial number is taken from rbtdb->next_serial under the DB write lock.
 *
 * Returns ISC_R_SUCCESS or ISC_R_NOMEMORY.
 */
static isc_result_t
newversion(dns_db_t *db, dns_dbversion_t **versionp) {
	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
	rbtdb_version_t *version;

	REQUIRE(VALID_RBTDB(rbtdb));
	REQUIRE(versionp != NULL && *versionp == NULL);
	REQUIRE(rbtdb->future_version == NULL);

	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
	RUNTIME_CHECK(rbtdb->next_serial != 0);		/* XXX Error? */
	version = allocate_version(rbtdb->common.mctx, rbtdb->next_serial, 1,
				   ISC_TRUE);
	if (version != NULL) {
		version->rbtdb = rbtdb;
		version->commit_ok = ISC_TRUE;
		/* Inherit security state from the version we branch from. */
		version->secure = rbtdb->current_version->secure;
		version->havensec3 = rbtdb->current_version->havensec3;
		if (version->havensec3) {
			/* Carry over the current NSEC3 parameters. */
			version->flags = rbtdb->current_version->flags;
			version->iterations =
				rbtdb->current_version->iterations;
			version->hash = rbtdb->current_version->hash;
			version->salt_length =
				rbtdb->current_version->salt_length;
			memcpy(version->salt, rbtdb->current_version->salt,
			       version->salt_length);
		} else {
			version->flags = 0;
			version->iterations = 0;
			version->hash = 0;
			version->salt_length = 0;
			memset(version->salt, 0, sizeof(version->salt));
		}
		rbtdb->next_serial++;
		rbtdb->future_version = version;
	}
	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);

	if (version == NULL)
		return (ISC_R_NOMEMORY);

	*versionp = version;

	return (ISC_R_SUCCESS);
}
  1002. static void
  1003. attachversion(dns_db_t *db, dns_dbversion_t *source,
  1004. dns_dbversion_t **targetp)
  1005. {
  1006. dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
  1007. rbtdb_version_t *rbtversion = source;
  1008. unsigned int refs;
  1009. REQUIRE(VALID_RBTDB(rbtdb));
  1010. INSIST(rbtversion != NULL && rbtversion->rbtdb == rbtdb);
  1011. isc_refcount_increment(&rbtversion->references, &refs);
  1012. INSIST(refs > 1);
  1013. *targetp = rbtversion;
  1014. }
/*
 * Record 'node' on the writer version's changed list so the change can be
 * rolled back (or cleaned up) when the version is closed.  Returns the new
 * entry, or NULL on allocation failure, in which case the version is
 * marked as not committable.
 */
static rbtdb_changed_t *
add_changed(dns_rbtdb_t *rbtdb, rbtdb_version_t *version,
	    dns_rbtnode_t *node)
{
	rbtdb_changed_t *changed;
	unsigned int refs;

	/*
	 * Caller must be holding the node lock if its reference must be
	 * protected by the lock.
	 */

	/* Allocate before taking the DB lock to keep the region short. */
	changed = isc_mem_get(rbtdb->common.mctx, sizeof(*changed));

	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);

	REQUIRE(version->writer);

	if (changed != NULL) {
		/*
		 * The changed entry holds its own node reference until the
		 * version is closed.
		 */
		dns_rbtnode_refincrement(node, &refs);
		INSIST(refs != 0);
		changed->node = node;
		changed->dirty = ISC_FALSE;
		ISC_LIST_INITANDAPPEND(version->changed_list, changed, link);
	} else
		/*
		 * Without the record we cannot undo this change, so the
		 * whole version must not be committed.
		 */
		version->commit_ok = ISC_FALSE;

	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);

	return (changed);
}
  1039. static void
  1040. free_acachearray(isc_mem_t *mctx, rdatasetheader_t *header,
  1041. acachectl_t *array)
  1042. {
  1043. unsigned int count;
  1044. unsigned int i;
  1045. unsigned char *raw; /* RDATASLAB */
  1046. /*
  1047. * The caller must be holding the corresponding node lock.
  1048. */
  1049. if (array == NULL)
  1050. return;
  1051. raw = (unsigned char *)header + sizeof(*header);
  1052. count = raw[0] * 256 + raw[1];
  1053. /*
  1054. * Sanity check: since an additional cache entry has a reference to
  1055. * the original DB node (in the callback arg), there should be no
  1056. * acache entries when the node can be freed.
  1057. */
  1058. for (i = 0; i < count; i++)
  1059. INSIST(array[i].entry == NULL && array[i].cbarg == NULL);
  1060. isc_mem_put(mctx, array, count * sizeof(acachectl_t));
  1061. }
  1062. static inline void
  1063. free_noqname(isc_mem_t *mctx, struct noqname **noqname) {
  1064. if (dns_name_dynamic(&(*noqname)->name))
  1065. dns_name_free(&(*noqname)->name, mctx);
  1066. if ((*noqname)->neg != NULL)
  1067. isc_mem_put(mctx, (*noqname)->neg,
  1068. dns_rdataslab_size((*noqname)->neg, 0));
  1069. if ((*noqname)->negsig != NULL)
  1070. isc_mem_put(mctx, (*noqname)->negsig,
  1071. dns_rdataslab_size((*noqname)->negsig, 0));
  1072. isc_mem_put(mctx, *noqname, sizeof(**noqname));
  1073. *noqname = NULL;
  1074. }
/*
 * Put a freshly-allocated rdataset header into a known state: not linked
 * on any LRU/resign list and not present in a heap.
 */
static inline void
init_rdataset(dns_rbtdb_t *rbtdb, rdatasetheader_t *h)
{
	ISC_LINK_INIT(h, link);
	h->heap_index = 0;	/* index 0 means "not in a heap" */

#if TRACE_HEADER
	if (IS_CACHE(rbtdb) && rbtdb->common.rdclass == dns_rdataclass_in)
		fprintf(stderr, "initialized header: %p\n", h);
#else
	UNUSED(rbtdb);
#endif
}
  1087. static inline rdatasetheader_t *
  1088. new_rdataset(dns_rbtdb_t *rbtdb, isc_mem_t *mctx)
  1089. {
  1090. rdatasetheader_t *h;
  1091. h = isc_mem_get(mctx, sizeof(*h));
  1092. if (h == NULL)
  1093. return (NULL);
  1094. #if TRACE_HEADER
  1095. if (IS_CACHE(rbtdb) && rbtdb->common.rdclass == dns_rdataclass_in)
  1096. fprintf(stderr, "allocated header: %p\n", h);
  1097. #endif
  1098. init_rdataset(rbtdb, h);
  1099. return (h);
  1100. }
/*
 * Release an rdataset header together with the rdata slab that follows it
 * in memory, detaching it first from any LRU list, re-sign heap, and
 * associated noqname/closest-encloser and acache structures.
 *
 * Caller must hold the node lock for rdataset->node's bucket.
 */
static inline void
free_rdataset(dns_rbtdb_t *rbtdb, isc_mem_t *mctx, rdatasetheader_t *rdataset)
{
	unsigned int size;
	int idx;

	/* Keep the per-type RRset statistics in sync with the deletion. */
	if (EXISTS(rdataset) &&
	    (rdataset->attributes & RDATASET_ATTR_STATCOUNT) != 0) {
		update_rrsetstats(rbtdb, rdataset, ISC_FALSE);
	}

	idx = rdataset->node->locknum;
	if (ISC_LINK_LINKED(rdataset, link)) {
		/* Only cache databases keep headers on LRU lists. */
		INSIST(IS_CACHE(rbtdb));
		ISC_LIST_UNLINK(rbtdb->rdatasets[idx], rdataset, link);
	}
	/* Remove from the re-sign heap if present (0 == not in heap). */
	if (rdataset->heap_index != 0)
		isc_heap_delete(rbtdb->heaps[idx], rdataset->heap_index);
	rdataset->heap_index = 0;

	if (rdataset->noqname != NULL)
		free_noqname(mctx, &rdataset->noqname);
	if (rdataset->closest != NULL)
		free_noqname(mctx, &rdataset->closest);

	free_acachearray(mctx, rdataset, rdataset->additional_auth);
	free_acachearray(mctx, rdataset, rdataset->additional_glue);

	/*
	 * A NONEXISTENT header has no rdata slab after it, so only the
	 * header itself was allocated; otherwise the slab size includes
	 * the header prefix.
	 */
	if ((rdataset->attributes & RDATASET_ATTR_NONEXISTENT) != 0)
		size = sizeof(*rdataset);
	else
		size = dns_rdataslab_size((unsigned char *)rdataset,
					  sizeof(*rdataset));

	isc_mem_put(mctx, rdataset, size);
}
  1131. static inline void
  1132. rollback_node(dns_rbtnode_t *node, rbtdb_serial_t serial) {
  1133. rdatasetheader_t *header, *dcurrent;
  1134. isc_boolean_t make_dirty = ISC_FALSE;
  1135. /*
  1136. * Caller must hold the node lock.
  1137. */
  1138. /*
  1139. * We set the IGNORE attribute on rdatasets with serial number
  1140. * 'serial'. When the reference count goes to zero, these rdatasets
  1141. * will be cleaned up; until that time, they will be ignored.
  1142. */
  1143. for (header = node->data; header != NULL; header = header->next) {
  1144. if (header->serial == serial) {
  1145. header->attributes |= RDATASET_ATTR_IGNORE;
  1146. make_dirty = ISC_TRUE;
  1147. }
  1148. for (dcurrent = header->down;
  1149. dcurrent != NULL;
  1150. dcurrent = dcurrent->down) {
  1151. if (dcurrent->serial == serial) {
  1152. dcurrent->attributes |= RDATASET_ATTR_IGNORE;
  1153. make_dirty = ISC_TRUE;
  1154. }
  1155. }
  1156. }
  1157. if (make_dirty)
  1158. node->dirty = 1;
  1159. }
  1160. static inline void
  1161. clean_stale_headers(dns_rbtdb_t *rbtdb, isc_mem_t *mctx, rdatasetheader_t *top)
  1162. {
  1163. rdatasetheader_t *d, *down_next;
  1164. for (d = top->down; d != NULL; d = down_next) {
  1165. down_next = d->down;
  1166. free_rdataset(rbtdb, mctx, d);
  1167. }
  1168. top->down = NULL;
  1169. }
  1170. static inline void
  1171. clean_cache_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node) {
  1172. rdatasetheader_t *current, *top_prev, *top_next;
  1173. isc_mem_t *mctx = rbtdb->common.mctx;
  1174. /*
  1175. * Caller must be holding the node lock.
  1176. */
  1177. top_prev = NULL;
  1178. for (current = node->data; current != NULL; current = top_next) {
  1179. top_next = current->next;
  1180. clean_stale_headers(rbtdb, mctx, current);
  1181. /*
  1182. * If current is nonexistent or stale, we can clean it up.
  1183. */
  1184. if ((current->attributes &
  1185. (RDATASET_ATTR_NONEXISTENT|RDATASET_ATTR_STALE)) != 0) {
  1186. if (top_prev != NULL)
  1187. top_prev->next = current->next;
  1188. else
  1189. node->data = current->next;
  1190. free_rdataset(rbtdb, mctx, current);
  1191. } else
  1192. top_prev = current;
  1193. }
  1194. node->dirty = 0;
  1195. }
/*
 * Clean up a zone node by removing rdataset versions that can no longer
 * be referenced: duplicates of the same serial, versions carrying the
 * IGNORE attribute (rolled back), and any version older than
 * 'least_serial' (no open version can still see it).  The most recent
 * version is kept unless it is NONEXISTENT.  The node's dirty flag is
 * cleared only if no down-chains remain.
 *
 * Caller must be holding the node lock.
 */
static inline void
clean_zone_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
		rbtdb_serial_t least_serial)
{
	rdatasetheader_t *current, *dcurrent, *down_next, *dparent;
	rdatasetheader_t *top_prev, *top_next;
	isc_mem_t *mctx = rbtdb->common.mctx;
	isc_boolean_t still_dirty = ISC_FALSE;

	REQUIRE(least_serial != 0);

	top_prev = NULL;
	for (current = node->data; current != NULL; current = top_next) {
		top_next = current->next;

		/*
		 * First, we clean up any instances of multiple rdatasets
		 * with the same serial number, or that have the IGNORE
		 * attribute.
		 */
		dparent = current;
		for (dcurrent = current->down;
		     dcurrent != NULL;
		     dcurrent = down_next) {
			down_next = dcurrent->down;
			/* Serials must decrease going down the chain. */
			INSIST(dcurrent->serial <= dparent->serial);
			if (dcurrent->serial == dparent->serial ||
			    IGNORE(dcurrent)) {
				if (down_next != NULL)
					down_next->next = dparent;
				dparent->down = down_next;
				free_rdataset(rbtdb, mctx, dcurrent);
			} else
				dparent = dcurrent;
		}

		/*
		 * We've now eliminated all IGNORE datasets with the possible
		 * exception of current, which we now check.
		 */
		if (IGNORE(current)) {
			down_next = current->down;
			if (down_next == NULL) {
				/* No older version: unlink the top header. */
				if (top_prev != NULL)
					top_prev->next = current->next;
				else
					node->data = current->next;
				free_rdataset(rbtdb, mctx, current);
				/*
				 * current no longer exists, so we can
				 * just continue with the loop.
				 */
				continue;
			} else {
				/*
				 * Pull up current->down, making it the new
				 * current.
				 */
				if (top_prev != NULL)
					top_prev->next = down_next;
				else
					node->data = down_next;
				down_next->next = top_next;
				free_rdataset(rbtdb, mctx, current);
				current = down_next;
			}
		}

		/*
		 * We now try to find the first down node less than the
		 * least serial.
		 */
		dparent = current;
		for (dcurrent = current->down;
		     dcurrent != NULL;
		     dcurrent = down_next) {
			down_next = dcurrent->down;
			if (dcurrent->serial < least_serial)
				break;
			dparent = dcurrent;
		}

		/*
		 * If there is a such an rdataset, delete it and any older
		 * versions.
		 */
		if (dcurrent != NULL) {
			do {
				down_next = dcurrent->down;
				INSIST(dcurrent->serial <= least_serial);
				free_rdataset(rbtdb, mctx, dcurrent);
				dcurrent = down_next;
			} while (dcurrent != NULL);
			dparent->down = NULL;
		}

		/*
		 * Note.  The serial number of 'current' might be less than
		 * least_serial too, but we cannot delete it because it is
		 * the most recent version, unless it is a NONEXISTENT
		 * rdataset.
		 */
		if (current->down != NULL) {
			/* Older versions remain; the node stays dirty. */
			still_dirty = ISC_TRUE;
			top_prev = current;
		} else {
			/*
			 * If this is a NONEXISTENT rdataset, we can delete it.
			 */
			if (NONEXISTENT(current)) {
				if (top_prev != NULL)
					top_prev->next = current->next;
				else
					node->data = current->next;
				free_rdataset(rbtdb, mctx, current);
			} else
				top_prev = current;
		}
	}
	if (!still_dirty)
		node->dirty = 0;
}
  1314. static void
  1315. delete_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node)
  1316. {
  1317. dns_rbtnode_t *nsecnode;
  1318. dns_fixedname_t fname;
  1319. dns_name_t *name;
  1320. isc_result_t result = ISC_R_UNEXPECTED;
  1321. INSIST(!ISC_LINK_LINKED(node, deadlink));
  1322. switch (node->nsec) {
  1323. case DNS_RBT_NSEC_NORMAL:
  1324. #ifdef BIND9
  1325. if (rbtdb->rpz_cidr != NULL) {
  1326. dns_fixedname_init(&fname);
  1327. name = dns_fixedname_name(&fname);
  1328. dns_rbt_fullnamefromnode(node, name);
  1329. dns_rpz_cidr_deleteip(rbtdb->rpz_cidr, name);
  1330. }
  1331. #endif
  1332. result = dns_rbt_deletenode(rbtdb->tree, node, ISC_FALSE);
  1333. break;
  1334. case DNS_RBT_NSEC_HAS_NSEC:
  1335. dns_fixedname_init(&fname);
  1336. name = dns_fixedname_name(&fname);
  1337. dns_rbt_fullnamefromnode(node, name);
  1338. /*
  1339. * Delete the corresponding node from the auxiliary NSEC
  1340. * tree before deleting from the main tree.
  1341. */
  1342. nsecnode = NULL;
  1343. result = dns_rbt_findnode(rbtdb->nsec, name, NULL, &nsecnode,
  1344. NULL, DNS_RBTFIND_EMPTYDATA,
  1345. NULL, NULL);
  1346. if (result != ISC_R_SUCCESS) {
  1347. isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
  1348. DNS_LOGMODULE_CACHE, ISC_LOG_WARNING,
  1349. "delete_node: "
  1350. "dns_rbt_findnode(nsec): %s",
  1351. isc_result_totext(result));
  1352. } else {
  1353. result = dns_rbt_deletenode(rbtdb->nsec, nsecnode,
  1354. ISC_FALSE);
  1355. if (result != ISC_R_SUCCESS) {
  1356. isc_log_write(dns_lctx,
  1357. DNS_LOGCATEGORY_DATABASE,
  1358. DNS_LOGMODULE_CACHE,
  1359. ISC_LOG_WARNING,
  1360. "delete_nsecnode(): "
  1361. "dns_rbt_deletenode(nsecnode): %s",
  1362. isc_result_totext(result));
  1363. }
  1364. }
  1365. result = dns_rbt_deletenode(rbtdb->tree, node, ISC_FALSE);
  1366. #ifdef BIND9
  1367. dns_rpz_cidr_deleteip(rbtdb->rpz_cidr, name);
  1368. #endif
  1369. break;
  1370. case DNS_RBT_NSEC_NSEC:
  1371. result = dns_rbt_deletenode(rbtdb->nsec, node, ISC_FALSE);
  1372. break;
  1373. case DNS_RBT_NSEC_NSEC3:
  1374. result = dns_rbt_deletenode(rbtdb->nsec3, node, ISC_FALSE);
  1375. break;
  1376. }
  1377. if (result != ISC_R_SUCCESS) {
  1378. isc_log_write(dns_lctx,
  1379. DNS_LOGCATEGORY_DATABASE,
  1380. DNS_LOGMODULE_CACHE,
  1381. ISC_LOG_WARNING,
  1382. "delete_nsecnode(): "
  1383. "dns_rbt_deletenode: %s",
  1384. isc_result_totext(result));
  1385. }
  1386. }
  1387. /*%
  1388. * Clean up dead nodes. These are nodes which have no references, and
  1389. * have no data. They are dead but we could not or chose not to delete
  1390. * them when we deleted all the data at that node because we did not want
  1391. * to wait for the tree write lock.
  1392. *
  1393. * The caller must hold a tree write lock and bucketnum'th node (write) lock.
  1394. */
  1395. static void
  1396. cleanup_dead_nodes(dns_rbtdb_t *rbtdb, int bucketnum) {
  1397. dns_rbtnode_t *node;
  1398. int count = 10; /* XXXJT: should be adjustable */
  1399. node = ISC_LIST_HEAD(rbtdb->deadnodes[bucketnum]);
  1400. while (node != NULL && count > 0) {
  1401. ISC_LIST_UNLINK(rbtdb->deadnodes[bucketnum], node, deadlink);
  1402. /*
  1403. * Since we're holding a tree write lock, it should be
  1404. * impossible for this node to be referenced by others.
  1405. */
  1406. INSIST(dns_rbtnode_refcurrent(node) == 0 &&
  1407. node->data == NULL);
  1408. delete_node(rbtdb, node);
  1409. node = ISC_LIST_HEAD(rbtdb->deadnodes[bucketnum]);
  1410. count--;
  1411. }
  1412. }
  1413. /*
  1414. * Caller must be holding the node lock.
  1415. */
static inline void
new_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node) {
	unsigned int lockrefs, noderefs;
	isc_refcount_t *lockref;

	/* A node being referenced must not be on the dead-node list. */
	INSIST(!ISC_LINK_LINKED(node, deadlink));
	dns_rbtnode_refincrement0(node, &noderefs);
	if (noderefs == 1) {	/* this is the first reference to the node */
		/*
		 * Count this node as "in use" against its lock bucket so
		 * the bucket (and the database) cannot be torn down while
		 * the node is referenced.
		 */
		lockref = &rbtdb->node_locks[node->locknum].references;
		isc_refcount_increment0(lockref, &lockrefs);
		INSIST(lockrefs != 0);
	}
	INSIST(noderefs != 0);
}
  1429. /*
  1430. * This function is assumed to be called when a node is newly referenced
  1431. * and can be in the deadnode list. In that case the node must be retrieved
  1432. * from the list because it is going to be used. In addition, if the caller
  1433. * happens to hold a write lock on the tree, it's a good chance to purge dead
  1434. * nodes.
  1435. * Note: while a new reference is gained in multiple places, there are only very
  1436. * few cases where the node can be in the deadnode list (only empty nodes can
  1437. * have been added to the list).
  1438. */
static inline void
reactivate_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
		isc_rwlocktype_t treelocktype)
{
	isc_rwlocktype_t locktype = isc_rwlocktype_read;
	nodelock_t *nodelock = &rbtdb->node_locks[node->locknum].lock;
	isc_boolean_t maybe_cleanup = ISC_FALSE;

	POST(locktype);

	NODE_STRONGLOCK(nodelock);
	NODE_WEAKLOCK(nodelock, locktype);

	/*
	 * Check if we can possibly cleanup the dead node.  If so, upgrade
	 * the node lock below to perform the cleanup.
	 */
	if (!ISC_LIST_EMPTY(rbtdb->deadnodes[node->locknum]) &&
	    treelocktype == isc_rwlocktype_write) {
		maybe_cleanup = ISC_TRUE;
	}

	if (ISC_LINK_LINKED(node, deadlink) || maybe_cleanup) {
		/*
		 * Upgrade the lock and test if we still need to unlink.
		 * (The node may have been unlinked by another thread while
		 * the weak lock was released.)
		 */
		NODE_WEAKUNLOCK(nodelock, locktype);
		locktype = isc_rwlocktype_write;
		POST(locktype);
		NODE_WEAKLOCK(nodelock, locktype);
		if (ISC_LINK_LINKED(node, deadlink))
			ISC_LIST_UNLINK(rbtdb->deadnodes[node->locknum],
					node, deadlink);
		if (maybe_cleanup)
			cleanup_dead_nodes(rbtdb, node->locknum);
	}

	new_reference(rbtdb, node);

	NODE_WEAKUNLOCK(nodelock, locktype);
	NODE_STRONGUNLOCK(nodelock);
}
  1475. /*
  1476. * Caller must be holding the node lock; either the "strong", read or write
  1477. * lock. Note that the lock must be held even when node references are
  1478. * atomically modified; in that case the decrement operation itself does not
  1479. * have to be protected, but we must avoid a race condition where multiple
  1480. * threads are decreasing the reference to zero simultaneously and at least
  1481. * one of them is going to free the node.
  1482. * This function returns ISC_TRUE if and only if the node reference decreases
  1483. * to zero.
  1484. */
  1485. static isc_boolean_t
  1486. decrement_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
  1487. rbtdb_serial_t least_serial,
  1488. isc_rwlocktype_t nlock, isc_rwlocktype_t tlock,
  1489. isc_boolean_t pruning)
  1490. {
  1491. isc_result_t result;
  1492. isc_boolean_t write_locked;
  1493. rbtdb_nodelock_t *nodelock;
  1494. unsigned int refs, nrefs;
  1495. int bucket = node->locknum;
  1496. isc_boolean_t no_reference = ISC_TRUE;
  1497. nodelock = &rbtdb->node_locks[bucket];
  1498. /* Handle easy and typical case first. */
  1499. if (!node->dirty && (node->data != NULL || node->down != NULL)) {
  1500. dns_rbtnode_refdecrement(node, &nrefs);
  1501. INSIST((int)nrefs >= 0);
  1502. if (nrefs == 0) {
  1503. isc_refcount_decrement(&nodelock->references, &refs);
  1504. INSIST((int)refs >= 0);
  1505. }
  1506. return ((nrefs == 0) ? ISC_TRUE : ISC_FALSE);
  1507. }
  1508. /* Upgrade the lock? */
  1509. if (nlock == isc_rwlocktype_read) {
  1510. NODE_WEAKUNLOCK(&nodelock->lock, isc_rwlocktype_read);
  1511. NODE_WEAKL