PageRenderTime 131ms CodeModel.GetById 38ms app.highlight 67ms RepoModel.GetById 2ms app.codeStats 2ms

/contrib/bind9/lib/dns/rbtdb.c

https://bitbucket.org/freebsd/freebsd-head/
C | 9332 lines | 6603 code | 1036 blank | 1693 comment | 2043 complexity | 246c4f82217f43e68410e305432312b4 MD5 | raw file

Large files are truncated, but you can click here to view the full file

   1/*
   2 * Copyright (C) 2004-2012  Internet Systems Consortium, Inc. ("ISC")
   3 * Copyright (C) 1999-2003  Internet Software Consortium.
   4 *
   5 * Permission to use, copy, modify, and/or distribute this software for any
   6 * purpose with or without fee is hereby granted, provided that the above
   7 * copyright notice and this permission notice appear in all copies.
   8 *
   9 * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
  10 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
  11 * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
  12 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
  13 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
  14 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
  15 * PERFORMANCE OF THIS SOFTWARE.
  16 */
  17
  18/* $Id$ */
  19
  20/*! \file */
  21
  22/*
  23 * Principal Author: Bob Halley
  24 */
  25
  26#include <config.h>
  27
  28/* #define inline */
  29
  30#include <isc/event.h>
  31#include <isc/heap.h>
  32#include <isc/mem.h>
  33#include <isc/mutex.h>
  34#include <isc/platform.h>
  35#include <isc/print.h>
  36#include <isc/random.h>
  37#include <isc/refcount.h>
  38#include <isc/rwlock.h>
  39#include <isc/serial.h>
  40#include <isc/string.h>
  41#include <isc/task.h>
  42#include <isc/time.h>
  43#include <isc/util.h>
  44
  45#include <dns/acache.h>
  46#include <dns/db.h>
  47#include <dns/dbiterator.h>
  48#include <dns/events.h>
  49#include <dns/fixedname.h>
  50#include <dns/lib.h>
  51#include <dns/log.h>
  52#include <dns/masterdump.h>
  53#include <dns/nsec.h>
  54#include <dns/nsec3.h>
  55#include <dns/rbt.h>
  56#include <dns/rpz.h>
  57#include <dns/rdata.h>
  58#include <dns/rdataset.h>
  59#include <dns/rdatasetiter.h>
  60#include <dns/rdataslab.h>
  61#include <dns/rdatastruct.h>
  62#include <dns/result.h>
  63#include <dns/stats.h>
  64#include <dns/view.h>
  65#include <dns/zone.h>
  66#include <dns/zonekey.h>
  67
  68#ifdef DNS_RBTDB_VERSION64
  69#include "rbtdb64.h"
  70#else
  71#include "rbtdb.h"
  72#endif
  73
  74#ifdef DNS_RBTDB_VERSION64
  75#define RBTDB_MAGIC                     ISC_MAGIC('R', 'B', 'D', '8')
  76#else
  77#define RBTDB_MAGIC                     ISC_MAGIC('R', 'B', 'D', '4')
  78#endif
  79
  80/*%
  81 * Note that "impmagic" is not the first four bytes of the struct, so
  82 * ISC_MAGIC_VALID cannot be used.
  83 */
  84#define VALID_RBTDB(rbtdb)      ((rbtdb) != NULL && \
  85				 (rbtdb)->common.impmagic == RBTDB_MAGIC)
  86
  87#ifdef DNS_RBTDB_VERSION64
  88typedef isc_uint64_t                    rbtdb_serial_t;
  89/*%
  90 * Make casting easier in symbolic debuggers by using different names
  91 * for the 64 bit version.
  92 */
  93#define dns_rbtdb_t dns_rbtdb64_t
  94#define rdatasetheader_t rdatasetheader64_t
  95#define rbtdb_version_t rbtdb_version64_t
  96#else
  97typedef isc_uint32_t                    rbtdb_serial_t;
  98#endif
  99
 100typedef isc_uint32_t                    rbtdb_rdatatype_t;
 101
 102#define RBTDB_RDATATYPE_BASE(type)      ((dns_rdatatype_t)((type) & 0xFFFF))
 103#define RBTDB_RDATATYPE_EXT(type)       ((dns_rdatatype_t)((type) >> 16))
 104#define RBTDB_RDATATYPE_VALUE(b, e)     ((rbtdb_rdatatype_t)((e) << 16) | (b))
 105
 106#define RBTDB_RDATATYPE_SIGNSEC \
 107		RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_nsec)
 108#define RBTDB_RDATATYPE_SIGNSEC3 \
 109		RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_nsec3)
 110#define RBTDB_RDATATYPE_SIGNS \
 111		RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_ns)
 112#define RBTDB_RDATATYPE_SIGCNAME \
 113		RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_cname)
 114#define RBTDB_RDATATYPE_SIGDNAME \
 115		RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_dname)
 116#define RBTDB_RDATATYPE_NCACHEANY \
 117		RBTDB_RDATATYPE_VALUE(0, dns_rdatatype_any)
 118
 119/*
 120 * We use rwlock for DB lock only when ISC_RWLOCK_USEATOMIC is non 0.
 121 * Using rwlock is effective with regard to lookup performance only when
 122 * it is implemented in an efficient way.
 123 * Otherwise, it is generally wise to stick to the simple locking since rwlock
 124 * would require more memory or can even make lookups slower due to its own
 125 * overhead (when it internally calls mutex locks).
 126 */
 127#ifdef ISC_RWLOCK_USEATOMIC
 128#define DNS_RBTDB_USERWLOCK 1
 129#else
 130#define DNS_RBTDB_USERWLOCK 0
 131#endif
 132
 133#if DNS_RBTDB_USERWLOCK
 134#define RBTDB_INITLOCK(l)       isc_rwlock_init((l), 0, 0)
 135#define RBTDB_DESTROYLOCK(l)    isc_rwlock_destroy(l)
 136#define RBTDB_LOCK(l, t)        RWLOCK((l), (t))
 137#define RBTDB_UNLOCK(l, t)      RWUNLOCK((l), (t))
 138#else
 139#define RBTDB_INITLOCK(l)       isc_mutex_init(l)
 140#define RBTDB_DESTROYLOCK(l)    DESTROYLOCK(l)
 141#define RBTDB_LOCK(l, t)        LOCK(l)
 142#define RBTDB_UNLOCK(l, t)      UNLOCK(l)
 143#endif
 144
 145/*
 146 * Since node locking is sensitive to both performance and memory footprint,
 147 * we need some trick here.  If we have both high-performance rwlock and
 148 * high performance and small-memory reference counters, we use rwlock for
 149 * node lock and isc_refcount for node references.  In this case, we don't have
 150 * to protect the access to the counters by locks.
 151 * Otherwise, we simply use ordinary mutex lock for node locking, and use
 152 * simple integers as reference counters which is protected by the lock.
 153 * In most cases, we can simply use wrapper macros such as NODE_LOCK and
 154 * NODE_UNLOCK.  In some other cases, however, we need to protect reference
 155 * counters first and then protect other parts of a node as read-only data.
 156 * Special additional macros, NODE_STRONGLOCK(), NODE_WEAKLOCK(), etc, are also
 157 * provided for these special cases.  When we can use the efficient backend
 158 * routines, we should only protect the "other members" by NODE_WEAKLOCK(read).
 159 * Otherwise, we should use NODE_STRONGLOCK() to protect the entire critical
 160 * section including the access to the reference counter.
 161 * Note that we cannot use NODE_LOCK()/NODE_UNLOCK() wherever the protected
 162 * section is also protected by NODE_STRONGLOCK().
 163 */
 164#if defined(ISC_RWLOCK_USEATOMIC) && defined(DNS_RBT_USEISCREFCOUNT)
 165typedef isc_rwlock_t nodelock_t;
 166
 167#define NODE_INITLOCK(l)        isc_rwlock_init((l), 0, 0)
 168#define NODE_DESTROYLOCK(l)     isc_rwlock_destroy(l)
 169#define NODE_LOCK(l, t)         RWLOCK((l), (t))
 170#define NODE_UNLOCK(l, t)       RWUNLOCK((l), (t))
 171#define NODE_TRYUPGRADE(l)      isc_rwlock_tryupgrade(l)
 172
 173#define NODE_STRONGLOCK(l)      ((void)0)
 174#define NODE_STRONGUNLOCK(l)    ((void)0)
 175#define NODE_WEAKLOCK(l, t)     NODE_LOCK(l, t)
 176#define NODE_WEAKUNLOCK(l, t)   NODE_UNLOCK(l, t)
 177#define NODE_WEAKDOWNGRADE(l)   isc_rwlock_downgrade(l)
 178#else
 179typedef isc_mutex_t nodelock_t;
 180
 181#define NODE_INITLOCK(l)        isc_mutex_init(l)
 182#define NODE_DESTROYLOCK(l)     DESTROYLOCK(l)
 183#define NODE_LOCK(l, t)         LOCK(l)
 184#define NODE_UNLOCK(l, t)       UNLOCK(l)
 185#define NODE_TRYUPGRADE(l)      ISC_R_SUCCESS
 186
 187#define NODE_STRONGLOCK(l)      LOCK(l)
 188#define NODE_STRONGUNLOCK(l)    UNLOCK(l)
 189#define NODE_WEAKLOCK(l, t)     ((void)0)
 190#define NODE_WEAKUNLOCK(l, t)   ((void)0)
 191#define NODE_WEAKDOWNGRADE(l)   ((void)0)
 192#endif
 193
 194/*%
 195 * Whether to rate-limit updating the LRU to avoid possible thread contention.
 196 * Our performance measurement has shown the cost is marginal, so it's defined
 197 * to be 0 by default either with or without threads.
 198 */
 199#ifndef DNS_RBTDB_LIMITLRUUPDATE
 200#define DNS_RBTDB_LIMITLRUUPDATE 0
 201#endif
 202
 203/*
 204 * Allow clients with a virtual time of up to 5 minutes in the past to see
 205 * records that would have otherwise have expired.
 206 */
 207#define RBTDB_VIRTUAL 300
 208
 209struct noqname {
 210	dns_name_t 	name;
 211	void *     	neg;
 212	void *     	negsig;
 213	dns_rdatatype_t	type;
 214};
 215
 216typedef struct acachectl acachectl_t;
 217
 218typedef struct rdatasetheader {
 219	/*%
 220	 * Locked by the owning node's lock.
 221	 */
 222	rbtdb_serial_t                  serial;
 223	dns_ttl_t                       rdh_ttl;
 224	rbtdb_rdatatype_t               type;
 225	isc_uint16_t                    attributes;
 226	dns_trust_t                     trust;
 227	struct noqname                  *noqname;
 228	struct noqname                  *closest;
 229	/*%<
 230	 * We don't use the LIST macros, because the LIST structure has
 231	 * both head and tail pointers, and is doubly linked.
 232	 */
 233
 234	struct rdatasetheader           *next;
 235	/*%<
 236	 * If this is the top header for an rdataset, 'next' points
 237	 * to the top header for the next rdataset (i.e., the next type).
 238	 * Otherwise, it points up to the header whose down pointer points
 239	 * at this header.
 240	 */
 241
 242	struct rdatasetheader           *down;
 243	/*%<
 244	 * Points to the header for the next older version of
 245	 * this rdataset.
 246	 */
 247
 248	isc_uint32_t                    count;
 249	/*%<
  250	 * Monotonically increased every time this rdataset is bound so that
 251	 * it is used as the base of the starting point in DNS responses
 252	 * when the "cyclic" rrset-order is required.  Since the ordering
 253	 * should not be so crucial, no lock is set for the counter for
 254	 * performance reasons.
 255	 */
 256
 257	acachectl_t                     *additional_auth;
 258	acachectl_t                     *additional_glue;
 259
 260	dns_rbtnode_t                   *node;
 261	isc_stdtime_t                   last_used;
 262	ISC_LINK(struct rdatasetheader) link;
 263
 264	unsigned int                    heap_index;
 265	/*%<
 266	 * Used for TTL-based cache cleaning.
 267	 */
 268	isc_stdtime_t                   resign;
 269} rdatasetheader_t;
 270
 271typedef ISC_LIST(rdatasetheader_t)      rdatasetheaderlist_t;
 272typedef ISC_LIST(dns_rbtnode_t)         rbtnodelist_t;
 273
 274#define RDATASET_ATTR_NONEXISTENT       0x0001
 275#define RDATASET_ATTR_STALE             0x0002
 276#define RDATASET_ATTR_IGNORE            0x0004
 277#define RDATASET_ATTR_RETAIN            0x0008
 278#define RDATASET_ATTR_NXDOMAIN          0x0010
 279#define RDATASET_ATTR_RESIGN            0x0020
 280#define RDATASET_ATTR_STATCOUNT         0x0040
 281#define RDATASET_ATTR_OPTOUT		0x0080
 282#define RDATASET_ATTR_NEGATIVE          0x0100
 283
 284typedef struct acache_cbarg {
 285	dns_rdatasetadditional_t        type;
 286	unsigned int                    count;
 287	dns_db_t                        *db;
 288	dns_dbnode_t                    *node;
 289	rdatasetheader_t                *header;
 290} acache_cbarg_t;
 291
 292struct acachectl {
 293	dns_acacheentry_t               *entry;
 294	acache_cbarg_t                  *cbarg;
 295};
 296
 297/*
 298 * XXX
 299 * When the cache will pre-expire data (due to memory low or other
 300 * situations) before the rdataset's TTL has expired, it MUST
 301 * respect the RETAIN bit and not expire the data until its TTL is
 302 * expired.
 303 */
 304
 305#undef IGNORE                   /* WIN32 winbase.h defines this. */
 306
 307#define EXISTS(header) \
 308	(((header)->attributes & RDATASET_ATTR_NONEXISTENT) == 0)
 309#define NONEXISTENT(header) \
 310	(((header)->attributes & RDATASET_ATTR_NONEXISTENT) != 0)
 311#define IGNORE(header) \
 312	(((header)->attributes & RDATASET_ATTR_IGNORE) != 0)
 313#define RETAIN(header) \
 314	(((header)->attributes & RDATASET_ATTR_RETAIN) != 0)
 315#define NXDOMAIN(header) \
 316	(((header)->attributes & RDATASET_ATTR_NXDOMAIN) != 0)
 317#define RESIGN(header) \
 318	(((header)->attributes & RDATASET_ATTR_RESIGN) != 0)
 319#define OPTOUT(header) \
 320	(((header)->attributes & RDATASET_ATTR_OPTOUT) != 0)
 321#define NEGATIVE(header) \
 322	(((header)->attributes & RDATASET_ATTR_NEGATIVE) != 0)
 323
 324#define DEFAULT_NODE_LOCK_COUNT         7       /*%< Should be prime. */
 325
 326/*%
 327 * Number of buckets for cache DB entries (locks, LRU lists, TTL heaps).
 328 * There is a tradeoff issue about configuring this value: if this is too
 329 * small, it may cause heavier contention between threads; if this is too large,
 330 * LRU purge algorithm won't work well (entries tend to be purged prematurely).
 331 * The default value should work well for most environments, but this can
 332 * also be configurable at compilation time via the
 333 * DNS_RBTDB_CACHE_NODE_LOCK_COUNT variable.  This value must be larger than
 334 * 1 due to the assumption of overmem_purge().
 335 */
 336#ifdef DNS_RBTDB_CACHE_NODE_LOCK_COUNT
 337#if DNS_RBTDB_CACHE_NODE_LOCK_COUNT <= 1
 338#error "DNS_RBTDB_CACHE_NODE_LOCK_COUNT must be larger than 1"
 339#else
 340#define DEFAULT_CACHE_NODE_LOCK_COUNT DNS_RBTDB_CACHE_NODE_LOCK_COUNT
 341#endif
 342#else
 343#define DEFAULT_CACHE_NODE_LOCK_COUNT   16
 344#endif	/* DNS_RBTDB_CACHE_NODE_LOCK_COUNT */
 345
 346typedef struct {
 347	nodelock_t                      lock;
 348	/* Protected in the refcount routines. */
 349	isc_refcount_t                  references;
 350	/* Locked by lock. */
 351	isc_boolean_t                   exiting;
 352} rbtdb_nodelock_t;
 353
 354typedef struct rbtdb_changed {
 355	dns_rbtnode_t *                 node;
 356	isc_boolean_t                   dirty;
 357	ISC_LINK(struct rbtdb_changed)  link;
 358} rbtdb_changed_t;
 359
 360typedef ISC_LIST(rbtdb_changed_t)       rbtdb_changedlist_t;
 361
 362typedef enum {
 363	dns_db_insecure,
 364	dns_db_partial,
 365	dns_db_secure
 366} dns_db_secure_t;
 367
 368typedef struct dns_rbtdb dns_rbtdb_t;
 369
 370typedef struct rbtdb_version {
 371	/* Not locked */
 372	rbtdb_serial_t                  serial;
 373	dns_rbtdb_t *			rbtdb;
 374	/*
 375	 * Protected in the refcount routines.
 376	 * XXXJT: should we change the lock policy based on the refcount
 377	 * performance?
 378	 */
 379	isc_refcount_t                  references;
 380	/* Locked by database lock. */
 381	isc_boolean_t                   writer;
 382	isc_boolean_t                   commit_ok;
 383	rbtdb_changedlist_t             changed_list;
 384	rdatasetheaderlist_t		resigned_list;
 385	ISC_LINK(struct rbtdb_version)  link;
 386	dns_db_secure_t			secure;
 387	isc_boolean_t			havensec3;
 388	/* NSEC3 parameters */
 389	dns_hash_t			hash;
 390	isc_uint8_t			flags;
 391	isc_uint16_t			iterations;
 392	isc_uint8_t			salt_length;
 393	unsigned char			salt[DNS_NSEC3_SALTSIZE];
 394} rbtdb_version_t;
 395
 396typedef ISC_LIST(rbtdb_version_t)       rbtdb_versionlist_t;
 397
 398struct dns_rbtdb {
 399	/* Unlocked. */
 400	dns_db_t                        common;
 401	/* Locks the data in this struct */
 402#if DNS_RBTDB_USERWLOCK
 403	isc_rwlock_t                    lock;
 404#else
 405	isc_mutex_t                     lock;
 406#endif
 407	/* Locks the tree structure (prevents nodes appearing/disappearing) */
 408	isc_rwlock_t                    tree_lock;
 409	/* Locks for individual tree nodes */
 410	unsigned int                    node_lock_count;
 411	rbtdb_nodelock_t *              node_locks;
 412	dns_rbtnode_t *                 origin_node;
 413	dns_stats_t *			rrsetstats; /* cache DB only */
 414	/* Locked by lock. */
 415	unsigned int                    active;
 416	isc_refcount_t                  references;
 417	unsigned int                    attributes;
 418	rbtdb_serial_t                  current_serial;
 419	rbtdb_serial_t                  least_serial;
 420	rbtdb_serial_t                  next_serial;
 421	rbtdb_version_t *               current_version;
 422	rbtdb_version_t *               future_version;
 423	rbtdb_versionlist_t             open_versions;
 424	isc_task_t *                    task;
 425	dns_dbnode_t                    *soanode;
 426	dns_dbnode_t                    *nsnode;
 427
 428	/*
 429	 * This is a linked list used to implement the LRU cache.  There will
 430	 * be node_lock_count linked lists here.  Nodes in bucket 1 will be
 431	 * placed on the linked list rdatasets[1].
 432	 */
 433	rdatasetheaderlist_t            *rdatasets;
 434
 435	/*%
 436	 * Temporary storage for stale cache nodes and dynamically deleted
 437	 * nodes that await being cleaned up.
 438	 */
 439	rbtnodelist_t                   *deadnodes;
 440
 441	/*
 442	 * Heaps.  These are used for TTL based expiry in a cache,
 443	 * or for zone resigning in a zone DB.  hmctx is the memory
 444	 * context to use for the heap (which differs from the main
 445	 * database memory context in the case of a cache).
 446	 */
 447	isc_mem_t *			hmctx;
 448	isc_heap_t                      **heaps;
 449
 450	/* Locked by tree_lock. */
 451	dns_rbt_t *                     tree;
 452	dns_rbt_t *			nsec;
 453	dns_rbt_t *			nsec3;
 454	dns_rpz_cidr_t *		rpz_cidr;
 455
 456	/* Unlocked */
 457	unsigned int                    quantum;
 458};
 459
 460#define RBTDB_ATTR_LOADED               0x01
 461#define RBTDB_ATTR_LOADING              0x02
 462
 463/*%
 464 * Search Context
 465 */
 466typedef struct {
 467	dns_rbtdb_t *           rbtdb;
 468	rbtdb_version_t *       rbtversion;
 469	rbtdb_serial_t          serial;
 470	unsigned int            options;
 471	dns_rbtnodechain_t      chain;
 472	isc_boolean_t           copy_name;
 473	isc_boolean_t           need_cleanup;
 474	isc_boolean_t           wild;
 475	dns_rbtnode_t *         zonecut;
 476	rdatasetheader_t *      zonecut_rdataset;
 477	rdatasetheader_t *      zonecut_sigrdataset;
 478	dns_fixedname_t         zonecut_name;
 479	isc_stdtime_t           now;
 480} rbtdb_search_t;
 481
 482/*%
 483 * Load Context
 484 */
 485typedef struct {
 486	dns_rbtdb_t *           rbtdb;
 487	isc_stdtime_t           now;
 488} rbtdb_load_t;
 489
 490static void rdataset_disassociate(dns_rdataset_t *rdataset);
 491static isc_result_t rdataset_first(dns_rdataset_t *rdataset);
 492static isc_result_t rdataset_next(dns_rdataset_t *rdataset);
 493static void rdataset_current(dns_rdataset_t *rdataset, dns_rdata_t *rdata);
 494static void rdataset_clone(dns_rdataset_t *source, dns_rdataset_t *target);
 495static unsigned int rdataset_count(dns_rdataset_t *rdataset);
 496static isc_result_t rdataset_getnoqname(dns_rdataset_t *rdataset,
 497					dns_name_t *name,
 498					dns_rdataset_t *neg,
 499					dns_rdataset_t *negsig);
 500static isc_result_t rdataset_getclosest(dns_rdataset_t *rdataset,
 501					dns_name_t *name,
 502					dns_rdataset_t *neg,
 503					dns_rdataset_t *negsig);
 504static isc_result_t rdataset_getadditional(dns_rdataset_t *rdataset,
 505					   dns_rdatasetadditional_t type,
 506					   dns_rdatatype_t qtype,
 507					   dns_acache_t *acache,
 508					   dns_zone_t **zonep,
 509					   dns_db_t **dbp,
 510					   dns_dbversion_t **versionp,
 511					   dns_dbnode_t **nodep,
 512					   dns_name_t *fname,
 513					   dns_message_t *msg,
 514					   isc_stdtime_t now);
 515static isc_result_t rdataset_setadditional(dns_rdataset_t *rdataset,
 516					   dns_rdatasetadditional_t type,
 517					   dns_rdatatype_t qtype,
 518					   dns_acache_t *acache,
 519					   dns_zone_t *zone,
 520					   dns_db_t *db,
 521					   dns_dbversion_t *version,
 522					   dns_dbnode_t *node,
 523					   dns_name_t *fname);
 524static isc_result_t rdataset_putadditional(dns_acache_t *acache,
 525					   dns_rdataset_t *rdataset,
 526					   dns_rdatasetadditional_t type,
 527					   dns_rdatatype_t qtype);
 528static inline isc_boolean_t need_headerupdate(rdatasetheader_t *header,
 529					      isc_stdtime_t now);
 530static void update_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header,
 531			  isc_stdtime_t now);
 532static void expire_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header,
 533			  isc_boolean_t tree_locked);
 534static void overmem_purge(dns_rbtdb_t *rbtdb, unsigned int locknum_start,
 535			  isc_stdtime_t now, isc_boolean_t tree_locked);
 536static isc_result_t resign_insert(dns_rbtdb_t *rbtdb, int idx,
 537				  rdatasetheader_t *newheader);
 538static void prune_tree(isc_task_t *task, isc_event_t *event);
 539static void rdataset_settrust(dns_rdataset_t *rdataset, dns_trust_t trust);
 540static void rdataset_expire(dns_rdataset_t *rdataset);
 541
 542static dns_rdatasetmethods_t rdataset_methods = {
 543	rdataset_disassociate,
 544	rdataset_first,
 545	rdataset_next,
 546	rdataset_current,
 547	rdataset_clone,
 548	rdataset_count,
 549	NULL,
 550	rdataset_getnoqname,
 551	NULL,
 552	rdataset_getclosest,
 553	rdataset_getadditional,
 554	rdataset_setadditional,
 555	rdataset_putadditional,
 556	rdataset_settrust,
 557	rdataset_expire
 558};
 559
 560static void rdatasetiter_destroy(dns_rdatasetiter_t **iteratorp);
 561static isc_result_t rdatasetiter_first(dns_rdatasetiter_t *iterator);
 562static isc_result_t rdatasetiter_next(dns_rdatasetiter_t *iterator);
 563static void rdatasetiter_current(dns_rdatasetiter_t *iterator,
 564				 dns_rdataset_t *rdataset);
 565
 566static dns_rdatasetitermethods_t rdatasetiter_methods = {
 567	rdatasetiter_destroy,
 568	rdatasetiter_first,
 569	rdatasetiter_next,
 570	rdatasetiter_current
 571};
 572
 573typedef struct rbtdb_rdatasetiter {
 574	dns_rdatasetiter_t              common;
 575	rdatasetheader_t *              current;
 576} rbtdb_rdatasetiter_t;
 577
 578static void             dbiterator_destroy(dns_dbiterator_t **iteratorp);
 579static isc_result_t     dbiterator_first(dns_dbiterator_t *iterator);
 580static isc_result_t     dbiterator_last(dns_dbiterator_t *iterator);
 581static isc_result_t     dbiterator_seek(dns_dbiterator_t *iterator,
 582					dns_name_t *name);
 583static isc_result_t     dbiterator_prev(dns_dbiterator_t *iterator);
 584static isc_result_t     dbiterator_next(dns_dbiterator_t *iterator);
 585static isc_result_t     dbiterator_current(dns_dbiterator_t *iterator,
 586					   dns_dbnode_t **nodep,
 587					   dns_name_t *name);
 588static isc_result_t     dbiterator_pause(dns_dbiterator_t *iterator);
 589static isc_result_t     dbiterator_origin(dns_dbiterator_t *iterator,
 590					  dns_name_t *name);
 591
 592static dns_dbiteratormethods_t dbiterator_methods = {
 593	dbiterator_destroy,
 594	dbiterator_first,
 595	dbiterator_last,
 596	dbiterator_seek,
 597	dbiterator_prev,
 598	dbiterator_next,
 599	dbiterator_current,
 600	dbiterator_pause,
 601	dbiterator_origin
 602};
 603
 604#define DELETION_BATCH_MAX 64
 605
 606/*
 607 * If 'paused' is ISC_TRUE, then the tree lock is not being held.
 608 */
 609typedef struct rbtdb_dbiterator {
 610	dns_dbiterator_t                common;
 611	isc_boolean_t                   paused;
 612	isc_boolean_t                   new_origin;
 613	isc_rwlocktype_t                tree_locked;
 614	isc_result_t                    result;
 615	dns_fixedname_t                 name;
 616	dns_fixedname_t                 origin;
 617	dns_rbtnodechain_t              chain;
 618	dns_rbtnodechain_t		nsec3chain;
 619	dns_rbtnodechain_t		*current;
 620	dns_rbtnode_t                   *node;
 621	dns_rbtnode_t                   *deletions[DELETION_BATCH_MAX];
 622	int                             delete;
 623	isc_boolean_t			nsec3only;
 624	isc_boolean_t			nonsec3;
 625} rbtdb_dbiterator_t;
 626
 627
 628#define IS_STUB(rbtdb)  (((rbtdb)->common.attributes & DNS_DBATTR_STUB)  != 0)
 629#define IS_CACHE(rbtdb) (((rbtdb)->common.attributes & DNS_DBATTR_CACHE) != 0)
 630
 631static void free_rbtdb(dns_rbtdb_t *rbtdb, isc_boolean_t log,
 632		       isc_event_t *event);
 633static void overmem(dns_db_t *db, isc_boolean_t overmem);
 634#ifdef BIND9
 635static void setnsec3parameters(dns_db_t *db, rbtdb_version_t *version);
 636#endif
 637
 638/*%
  639 * 'init_count' is used to initialize 'newheader->count' which in turn
 640 * is used to determine where in the cycle rrset-order cyclic starts.
 641 * We don't lock this as we don't care about simultaneous updates.
 642 *
 643 * Note:
 644 *      Both init_count and header->count can be ISC_UINT32_MAX.
 645 *      The count on the returned rdataset however can't be as
 646 *      that indicates that the database does not implement cyclic
 647 *      processing.
 648 */
 649static unsigned int init_count;
 650
 651/*
 652 * Locking
 653 *
 654 * If a routine is going to lock more than one lock in this module, then
 655 * the locking must be done in the following order:
 656 *
 657 *      Tree Lock
 658 *
 659 *      Node Lock       (Only one from the set may be locked at one time by
 660 *                       any caller)
 661 *
 662 *      Database Lock
 663 *
 664 * Failure to follow this hierarchy can result in deadlock.
 665 */
 666
 667/*
 668 * Deleting Nodes
 669 *
 670 * For zone databases the node for the origin of the zone MUST NOT be deleted.
 671 */
 672
 673
 674/*
 675 * DB Routines
 676 */
 677
 678static void
 679attach(dns_db_t *source, dns_db_t **targetp) {
 680	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)source;
 681
 682	REQUIRE(VALID_RBTDB(rbtdb));
 683
 684	isc_refcount_increment(&rbtdb->references, NULL);
 685
 686	*targetp = source;
 687}
 688
 689static void
 690free_rbtdb_callback(isc_task_t *task, isc_event_t *event) {
 691	dns_rbtdb_t *rbtdb = event->ev_arg;
 692
 693	UNUSED(task);
 694
 695	free_rbtdb(rbtdb, ISC_TRUE, event);
 696}
 697
 698static void
 699update_rrsetstats(dns_rbtdb_t *rbtdb, rdatasetheader_t *header,
 700		  isc_boolean_t increment)
 701{
 702	dns_rdatastatstype_t statattributes = 0;
 703	dns_rdatastatstype_t base = 0;
 704	dns_rdatastatstype_t type;
 705
 706	/* At the moment we count statistics only for cache DB */
 707	INSIST(IS_CACHE(rbtdb));
 708
 709	if (NEGATIVE(header)) {
 710		if (NXDOMAIN(header))
 711			statattributes = DNS_RDATASTATSTYPE_ATTR_NXDOMAIN;
 712		else {
 713			statattributes = DNS_RDATASTATSTYPE_ATTR_NXRRSET;
 714			base = RBTDB_RDATATYPE_EXT(header->type);
 715		}
 716	} else
 717		base = RBTDB_RDATATYPE_BASE(header->type);
 718
 719	type = DNS_RDATASTATSTYPE_VALUE(base, statattributes);
 720	if (increment)
 721		dns_rdatasetstats_increment(rbtdb->rrsetstats, type);
 722	else
 723		dns_rdatasetstats_decrement(rbtdb->rrsetstats, type);
 724}
 725
 726static void
 727set_ttl(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, dns_ttl_t newttl) {
 728	int idx;
 729	isc_heap_t *heap;
 730	dns_ttl_t oldttl;
 731
 732	oldttl = header->rdh_ttl;
 733	header->rdh_ttl = newttl;
 734
 735	if (!IS_CACHE(rbtdb))
 736		return;
 737
 738	/*
 739	 * It's possible the rbtdb is not a cache.  If this is the case,
 740	 * we will not have a heap, and we move on.  If we do, though,
 741	 * we might need to adjust things.
 742	 */
 743	if (header->heap_index == 0 || newttl == oldttl)
 744		return;
 745	idx = header->node->locknum;
 746	if (rbtdb->heaps == NULL || rbtdb->heaps[idx] == NULL)
 747	    return;
 748	heap = rbtdb->heaps[idx];
 749
 750	if (newttl < oldttl)
 751		isc_heap_increased(heap, header->heap_index);
 752	else
 753		isc_heap_decreased(heap, header->heap_index);
 754}
 755
 756/*%
 757 * These functions allow the heap code to rank the priority of each
 758 * element.  It returns ISC_TRUE if v1 happens "sooner" than v2.
 759 */
 760static isc_boolean_t
 761ttl_sooner(void *v1, void *v2) {
 762	rdatasetheader_t *h1 = v1;
 763	rdatasetheader_t *h2 = v2;
 764
 765	if (h1->rdh_ttl < h2->rdh_ttl)
 766		return (ISC_TRUE);
 767	return (ISC_FALSE);
 768}
 769
 770static isc_boolean_t
 771resign_sooner(void *v1, void *v2) {
 772	rdatasetheader_t *h1 = v1;
 773	rdatasetheader_t *h2 = v2;
 774
 775	if (h1->resign < h2->resign)
 776		return (ISC_TRUE);
 777	return (ISC_FALSE);
 778}
 779
 780/*%
 781 * This function sets the heap index into the header.
 782 */
 783static void
 784set_index(void *what, unsigned int index) {
 785	rdatasetheader_t *h = what;
 786
 787	h->heap_index = index;
 788}
 789
 790/*%
 791 * Work out how many nodes can be deleted in the time between two
 792 * requests to the nameserver.  Smooth the resulting number and use it
 793 * as a estimate for the number of nodes to be deleted in the next
 794 * iteration.
 795 */
 796static unsigned int
 797adjust_quantum(unsigned int old, isc_time_t *start) {
 798	unsigned int pps = dns_pps;     /* packets per second */
 799	unsigned int interval;
 800	isc_uint64_t usecs;
 801	isc_time_t end;
 802	unsigned int new;
 803
 804	if (pps < 100)
 805		pps = 100;
 806	isc_time_now(&end);
 807
 808	interval = 1000000 / pps;       /* interval in usec */
 809	if (interval == 0)
 810		interval = 1;
 811	usecs = isc_time_microdiff(&end, start);
 812	if (usecs == 0) {
 813		/*
 814		 * We were unable to measure the amount of time taken.
 815		 * Double the nodes deleted next time.
 816		 */
 817		old *= 2;
 818		if (old > 1000)
 819			old = 1000;
 820		return (old);
 821	}
 822	new = old * interval;
 823	new /= (unsigned int)usecs;
 824	if (new == 0)
 825		new = 1;
 826	else if (new > 1000)
 827		new = 1000;
 828
 829	/* Smooth */
 830	new = (new + old * 3) / 4;
 831
 832	isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE, DNS_LOGMODULE_CACHE,
 833		      ISC_LOG_DEBUG(1), "adjust_quantum -> %d", new);
 834
 835	return (new);
 836}
 837
 838static void
 839free_rbtdb(dns_rbtdb_t *rbtdb, isc_boolean_t log, isc_event_t *event) {
 840	unsigned int i;
 841	isc_ondestroy_t ondest;
 842	isc_result_t result;
 843	char buf[DNS_NAME_FORMATSIZE];
 844	dns_rbt_t **treep;
 845	isc_time_t start;
 846
 847	if (IS_CACHE(rbtdb) && rbtdb->common.rdclass == dns_rdataclass_in)
 848		overmem((dns_db_t *)rbtdb, (isc_boolean_t)-1);
 849
 850	REQUIRE(rbtdb->current_version != NULL || EMPTY(rbtdb->open_versions));
 851	REQUIRE(rbtdb->future_version == NULL);
 852
 853	if (rbtdb->current_version != NULL) {
 854		unsigned int refs;
 855
 856		isc_refcount_decrement(&rbtdb->current_version->references,
 857				       &refs);
 858		INSIST(refs == 0);
 859		UNLINK(rbtdb->open_versions, rbtdb->current_version, link);
 860		isc_refcount_destroy(&rbtdb->current_version->references);
 861		isc_mem_put(rbtdb->common.mctx, rbtdb->current_version,
 862			    sizeof(rbtdb_version_t));
 863	}
 864
 865	/*
 866	 * We assume the number of remaining dead nodes is reasonably small;
 867	 * the overhead of unlinking all nodes here should be negligible.
 868	 */
 869	for (i = 0; i < rbtdb->node_lock_count; i++) {
 870		dns_rbtnode_t *node;
 871
 872		node = ISC_LIST_HEAD(rbtdb->deadnodes[i]);
 873		while (node != NULL) {
 874			ISC_LIST_UNLINK(rbtdb->deadnodes[i], node, deadlink);
 875			node = ISC_LIST_HEAD(rbtdb->deadnodes[i]);
 876		}
 877	}
 878
 879	if (event == NULL)
 880		rbtdb->quantum = (rbtdb->task != NULL) ? 100 : 0;
 881
 882	for (;;) {
 883		/*
 884		 * pick the next tree to (start to) destroy
 885		 */
 886		treep = &rbtdb->tree;
 887		if (*treep == NULL) {
 888			treep = &rbtdb->nsec;
 889			if (*treep == NULL) {
 890				treep = &rbtdb->nsec3;
 891				/*
 892				 * we're finished after clear cutting
 893				 */
 894				if (*treep == NULL)
 895					break;
 896			}
 897		}
 898
 899		isc_time_now(&start);
 900		result = dns_rbt_destroy2(treep, rbtdb->quantum);
 901		if (result == ISC_R_QUOTA) {
 902			INSIST(rbtdb->task != NULL);
 903			if (rbtdb->quantum != 0)
 904				rbtdb->quantum = adjust_quantum(rbtdb->quantum,
 905								&start);
 906			if (event == NULL)
 907				event = isc_event_allocate(rbtdb->common.mctx,
 908							   NULL,
 909							 DNS_EVENT_FREESTORAGE,
 910							   free_rbtdb_callback,
 911							   rbtdb,
 912							   sizeof(isc_event_t));
 913			if (event == NULL)
 914				continue;
 915			isc_task_send(rbtdb->task, &event);
 916			return;
 917		}
 918		INSIST(result == ISC_R_SUCCESS && *treep == NULL);
 919	}
 920
 921	if (event != NULL)
 922		isc_event_free(&event);
 923	if (log) {
 924		if (dns_name_dynamic(&rbtdb->common.origin))
 925			dns_name_format(&rbtdb->common.origin, buf,
 926					sizeof(buf));
 927		else
 928			strcpy(buf, "<UNKNOWN>");
 929		isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
 930			      DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
 931			      "done free_rbtdb(%s)", buf);
 932	}
 933	if (dns_name_dynamic(&rbtdb->common.origin))
 934		dns_name_free(&rbtdb->common.origin, rbtdb->common.mctx);
 935	for (i = 0; i < rbtdb->node_lock_count; i++) {
 936		isc_refcount_destroy(&rbtdb->node_locks[i].references);
 937		NODE_DESTROYLOCK(&rbtdb->node_locks[i].lock);
 938	}
 939
 940	/*
 941	 * Clean up LRU / re-signing order lists.
 942	 */
 943	if (rbtdb->rdatasets != NULL) {
 944		for (i = 0; i < rbtdb->node_lock_count; i++)
 945			INSIST(ISC_LIST_EMPTY(rbtdb->rdatasets[i]));
 946		isc_mem_put(rbtdb->common.mctx, rbtdb->rdatasets,
 947			    rbtdb->node_lock_count *
 948			    sizeof(rdatasetheaderlist_t));
 949	}
 950	/*
 951	 * Clean up dead node buckets.
 952	 */
 953	if (rbtdb->deadnodes != NULL) {
 954		for (i = 0; i < rbtdb->node_lock_count; i++)
 955			INSIST(ISC_LIST_EMPTY(rbtdb->deadnodes[i]));
 956		isc_mem_put(rbtdb->common.mctx, rbtdb->deadnodes,
 957		    rbtdb->node_lock_count * sizeof(rbtnodelist_t));
 958	}
 959	/*
 960	 * Clean up heap objects.
 961	 */
 962	if (rbtdb->heaps != NULL) {
 963		for (i = 0; i < rbtdb->node_lock_count; i++)
 964			isc_heap_destroy(&rbtdb->heaps[i]);
 965		isc_mem_put(rbtdb->hmctx, rbtdb->heaps,
 966			    rbtdb->node_lock_count * sizeof(isc_heap_t *));
 967	}
 968
 969	if (rbtdb->rrsetstats != NULL)
 970		dns_stats_detach(&rbtdb->rrsetstats);
 971
 972#ifdef BIND9
 973	if (rbtdb->rpz_cidr != NULL)
 974		dns_rpz_cidr_free(&rbtdb->rpz_cidr);
 975#endif
 976
 977	isc_mem_put(rbtdb->common.mctx, rbtdb->node_locks,
 978		    rbtdb->node_lock_count * sizeof(rbtdb_nodelock_t));
 979	isc_rwlock_destroy(&rbtdb->tree_lock);
 980	isc_refcount_destroy(&rbtdb->references);
 981	if (rbtdb->task != NULL)
 982		isc_task_detach(&rbtdb->task);
 983
 984	RBTDB_DESTROYLOCK(&rbtdb->lock);
 985	rbtdb->common.magic = 0;
 986	rbtdb->common.impmagic = 0;
 987	ondest = rbtdb->common.ondest;
 988	isc_mem_detach(&rbtdb->hmctx);
 989	isc_mem_putanddetach(&rbtdb->common.mctx, rbtdb, sizeof(*rbtdb));
 990	isc_ondestroy_notify(&ondest, rbtdb);
 991}
 992
 993static inline void
 994maybe_free_rbtdb(dns_rbtdb_t *rbtdb) {
 995	isc_boolean_t want_free = ISC_FALSE;
 996	unsigned int i;
 997	unsigned int inactive = 0;
 998
 999	/* XXX check for open versions here */
1000
1001	if (rbtdb->soanode != NULL)
1002		dns_db_detachnode((dns_db_t *)rbtdb, &rbtdb->soanode);
1003	if (rbtdb->nsnode != NULL)
1004		dns_db_detachnode((dns_db_t *)rbtdb, &rbtdb->nsnode);
1005
1006	/*
1007	 * Even though there are no external direct references, there still
1008	 * may be nodes in use.
1009	 */
1010	for (i = 0; i < rbtdb->node_lock_count; i++) {
1011		NODE_LOCK(&rbtdb->node_locks[i].lock, isc_rwlocktype_write);
1012		rbtdb->node_locks[i].exiting = ISC_TRUE;
1013		NODE_UNLOCK(&rbtdb->node_locks[i].lock, isc_rwlocktype_write);
1014		if (isc_refcount_current(&rbtdb->node_locks[i].references)
1015		    == 0) {
1016			inactive++;
1017		}
1018	}
1019
1020	if (inactive != 0) {
1021		RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
1022		rbtdb->active -= inactive;
1023		if (rbtdb->active == 0)
1024			want_free = ISC_TRUE;
1025		RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
1026		if (want_free) {
1027			char buf[DNS_NAME_FORMATSIZE];
1028			if (dns_name_dynamic(&rbtdb->common.origin))
1029				dns_name_format(&rbtdb->common.origin, buf,
1030						sizeof(buf));
1031			else
1032				strcpy(buf, "<UNKNOWN>");
1033			isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
1034				      DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
1035				      "calling free_rbtdb(%s)", buf);
1036			free_rbtdb(rbtdb, ISC_TRUE, NULL);
1037		}
1038	}
1039}
1040
1041static void
1042detach(dns_db_t **dbp) {
1043	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)(*dbp);
1044	unsigned int refs;
1045
1046	REQUIRE(VALID_RBTDB(rbtdb));
1047
1048	isc_refcount_decrement(&rbtdb->references, &refs);
1049
1050	if (refs == 0)
1051		maybe_free_rbtdb(rbtdb);
1052
1053	*dbp = NULL;
1054}
1055
1056static void
1057currentversion(dns_db_t *db, dns_dbversion_t **versionp) {
1058	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
1059	rbtdb_version_t *version;
1060	unsigned int refs;
1061
1062	REQUIRE(VALID_RBTDB(rbtdb));
1063
1064	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
1065	version = rbtdb->current_version;
1066	isc_refcount_increment(&version->references, &refs);
1067	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read);
1068
1069	*versionp = (dns_dbversion_t *)version;
1070}
1071
1072static inline rbtdb_version_t *
1073allocate_version(isc_mem_t *mctx, rbtdb_serial_t serial,
1074		 unsigned int references, isc_boolean_t writer)
1075{
1076	isc_result_t result;
1077	rbtdb_version_t *version;
1078
1079	version = isc_mem_get(mctx, sizeof(*version));
1080	if (version == NULL)
1081		return (NULL);
1082	version->serial = serial;
1083	result = isc_refcount_init(&version->references, references);
1084	if (result != ISC_R_SUCCESS) {
1085		isc_mem_put(mctx, version, sizeof(*version));
1086		return (NULL);
1087	}
1088	version->writer = writer;
1089	version->commit_ok = ISC_FALSE;
1090	ISC_LIST_INIT(version->changed_list);
1091	ISC_LIST_INIT(version->resigned_list);
1092	ISC_LINK_INIT(version, link);
1093
1094	return (version);
1095}
1096
1097static isc_result_t
1098newversion(dns_db_t *db, dns_dbversion_t **versionp) {
1099	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
1100	rbtdb_version_t *version;
1101
1102	REQUIRE(VALID_RBTDB(rbtdb));
1103	REQUIRE(versionp != NULL && *versionp == NULL);
1104	REQUIRE(rbtdb->future_version == NULL);
1105
1106	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
1107	RUNTIME_CHECK(rbtdb->next_serial != 0);         /* XXX Error? */
1108	version = allocate_version(rbtdb->common.mctx, rbtdb->next_serial, 1,
1109				   ISC_TRUE);
1110	if (version != NULL) {
1111		version->rbtdb = rbtdb;
1112		version->commit_ok = ISC_TRUE;
1113		version->secure = rbtdb->current_version->secure;
1114		version->havensec3 = rbtdb->current_version->havensec3;
1115		if (version->havensec3) {
1116			version->flags = rbtdb->current_version->flags;
1117			version->iterations =
1118				rbtdb->current_version->iterations;
1119			version->hash = rbtdb->current_version->hash;
1120			version->salt_length =
1121				rbtdb->current_version->salt_length;
1122			memcpy(version->salt, rbtdb->current_version->salt,
1123			       version->salt_length);
1124		} else {
1125			version->flags = 0;
1126			version->iterations = 0;
1127			version->hash = 0;
1128			version->salt_length = 0;
1129			memset(version->salt, 0, sizeof(version->salt));
1130		}
1131		rbtdb->next_serial++;
1132		rbtdb->future_version = version;
1133	}
1134	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
1135
1136	if (version == NULL)
1137		return (ISC_R_NOMEMORY);
1138
1139	*versionp = version;
1140
1141	return (ISC_R_SUCCESS);
1142}
1143
1144static void
1145attachversion(dns_db_t *db, dns_dbversion_t *source,
1146	      dns_dbversion_t **targetp)
1147{
1148	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
1149	rbtdb_version_t *rbtversion = source;
1150	unsigned int refs;
1151
1152	REQUIRE(VALID_RBTDB(rbtdb));
1153	INSIST(rbtversion != NULL && rbtversion->rbtdb == rbtdb);
1154
1155	isc_refcount_increment(&rbtversion->references, &refs);
1156	INSIST(refs > 1);
1157
1158	*targetp = rbtversion;
1159}
1160
1161static rbtdb_changed_t *
1162add_changed(dns_rbtdb_t *rbtdb, rbtdb_version_t *version,
1163	    dns_rbtnode_t *node)
1164{
1165	rbtdb_changed_t *changed;
1166	unsigned int refs;
1167
1168	/*
1169	 * Caller must be holding the node lock if its reference must be
1170	 * protected by the lock.
1171	 */
1172
1173	changed = isc_mem_get(rbtdb->common.mctx, sizeof(*changed));
1174
1175	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
1176
1177	REQUIRE(version->writer);
1178
1179	if (changed != NULL) {
1180		dns_rbtnode_refincrement(node, &refs);
1181		INSIST(refs != 0);
1182		changed->node = node;
1183		changed->dirty = ISC_FALSE;
1184		ISC_LIST_INITANDAPPEND(version->changed_list, changed, link);
1185	} else
1186		version->commit_ok = ISC_FALSE;
1187
1188	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
1189
1190	return (changed);
1191}
1192
1193static void
1194free_acachearray(isc_mem_t *mctx, rdatasetheader_t *header,
1195		 acachectl_t *array)
1196{
1197	unsigned int count;
1198	unsigned int i;
1199	unsigned char *raw;     /* RDATASLAB */
1200
1201	/*
1202	 * The caller must be holding the corresponding node lock.
1203	 */
1204
1205	if (array == NULL)
1206		return;
1207
1208	raw = (unsigned char *)header + sizeof(*header);
1209	count = raw[0] * 256 + raw[1];
1210
1211	/*
1212	 * Sanity check: since an additional cache entry has a reference to
1213	 * the original DB node (in the callback arg), there should be no
1214	 * acache entries when the node can be freed.
1215	 */
1216	for (i = 0; i < count; i++)
1217		INSIST(array[i].entry == NULL && array[i].cbarg == NULL);
1218
1219	isc_mem_put(mctx, array, count * sizeof(acachectl_t));
1220}
1221
1222static inline void
1223free_noqname(isc_mem_t *mctx, struct noqname **noqname) {
1224
1225	if (dns_name_dynamic(&(*noqname)->name))
1226		dns_name_free(&(*noqname)->name, mctx);
1227	if ((*noqname)->neg != NULL)
1228		isc_mem_put(mctx, (*noqname)->neg,
1229			    dns_rdataslab_size((*noqname)->neg, 0));
1230	if ((*noqname)->negsig != NULL)
1231		isc_mem_put(mctx, (*noqname)->negsig,
1232			    dns_rdataslab_size((*noqname)->negsig, 0));
1233	isc_mem_put(mctx, *noqname, sizeof(**noqname));
1234	*noqname = NULL;
1235}
1236
1237static inline void
1238init_rdataset(dns_rbtdb_t *rbtdb, rdatasetheader_t *h)
1239{
1240	ISC_LINK_INIT(h, link);
1241	h->heap_index = 0;
1242
1243#if TRACE_HEADER
1244	if (IS_CACHE(rbtdb) && rbtdb->common.rdclass == dns_rdataclass_in)
1245		fprintf(stderr, "initialized header: %p\n", h);
1246#else
1247	UNUSED(rbtdb);
1248#endif
1249}
1250
1251static inline rdatasetheader_t *
1252new_rdataset(dns_rbtdb_t *rbtdb, isc_mem_t *mctx)
1253{
1254	rdatasetheader_t *h;
1255
1256	h = isc_mem_get(mctx, sizeof(*h));
1257	if (h == NULL)
1258		return (NULL);
1259
1260#if TRACE_HEADER
1261	if (IS_CACHE(rbtdb) && rbtdb->common.rdclass == dns_rdataclass_in)
1262		fprintf(stderr, "allocated header: %p\n", h);
1263#endif
1264	init_rdataset(rbtdb, h);
1265	return (h);
1266}
1267
1268static inline void
1269free_rdataset(dns_rbtdb_t *rbtdb, isc_mem_t *mctx, rdatasetheader_t *rdataset)
1270{
1271	unsigned int size;
1272	int idx;
1273
1274	if (EXISTS(rdataset) &&
1275	    (rdataset->attributes & RDATASET_ATTR_STATCOUNT) != 0) {
1276		update_rrsetstats(rbtdb, rdataset, ISC_FALSE);
1277	}
1278
1279	idx = rdataset->node->locknum;
1280	if (ISC_LINK_LINKED(rdataset, link)) {
1281		INSIST(IS_CACHE(rbtdb));
1282		ISC_LIST_UNLINK(rbtdb->rdatasets[idx], rdataset, link);
1283	}
1284	if (rdataset->heap_index != 0)
1285		isc_heap_delete(rbtdb->heaps[idx], rdataset->heap_index);
1286	rdataset->heap_index = 0;
1287
1288	if (rdataset->noqname != NULL)
1289		free_noqname(mctx, &rdataset->noqname);
1290	if (rdataset->closest != NULL)
1291		free_noqname(mctx, &rdataset->closest);
1292
1293	free_acachearray(mctx, rdataset, rdataset->additional_auth);
1294	free_acachearray(mctx, rdataset, rdataset->additional_glue);
1295
1296	if ((rdataset->attributes & RDATASET_ATTR_NONEXISTENT) != 0)
1297		size = sizeof(*rdataset);
1298	else
1299		size = dns_rdataslab_size((unsigned char *)rdataset,
1300					  sizeof(*rdataset));
1301	isc_mem_put(mctx, rdataset, size);
1302}
1303
1304static inline void
1305rollback_node(dns_rbtnode_t *node, rbtdb_serial_t serial) {
1306	rdatasetheader_t *header, *dcurrent;
1307	isc_boolean_t make_dirty = ISC_FALSE;
1308
1309	/*
1310	 * Caller must hold the node lock.
1311	 */
1312
1313	/*
1314	 * We set the IGNORE attribute on rdatasets with serial number
1315	 * 'serial'.  When the reference count goes to zero, these rdatasets
1316	 * will be cleaned up; until that time, they will be ignored.
1317	 */
1318	for (header = node->data; header != NULL; header = header->next) {
1319		if (header->serial == serial) {
1320			header->attributes |= RDATASET_ATTR_IGNORE;
1321			make_dirty = ISC_TRUE;
1322		}
1323		for (dcurrent = header->down;
1324		     dcurrent != NULL;
1325		     dcurrent = dcurrent->down) {
1326			if (dcurrent->serial == serial) {
1327				dcurrent->attributes |= RDATASET_ATTR_IGNORE;
1328				make_dirty = ISC_TRUE;
1329			}
1330		}
1331	}
1332	if (make_dirty)
1333		node->dirty = 1;
1334}
1335
1336static inline void
1337clean_stale_headers(dns_rbtdb_t *rbtdb, isc_mem_t *mctx, rdatasetheader_t *top)
1338{
1339	rdatasetheader_t *d, *down_next;
1340
1341	for (d = top->down; d != NULL; d = down_next) {
1342		down_next = d->down;
1343		free_rdataset(rbtdb, mctx, d);
1344	}
1345	top->down = NULL;
1346}
1347
1348static inline void
1349clean_cache_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node) {
1350	rdatasetheader_t *current, *top_prev, *top_next;
1351	isc_mem_t *mctx = rbtdb->common.mctx;
1352
1353	/*
1354	 * Caller must be holding the node lock.
1355	 */
1356
1357	top_prev = NULL;
1358	for (current = node->data; current != NULL; current = top_next) {
1359		top_next = current->next;
1360		clean_stale_headers(rbtdb, mctx, current);
1361		/*
1362		 * If current is nonexistent or stale, we can clean it up.
1363		 */
1364		if ((current->attributes &
1365		     (RDATASET_ATTR_NONEXISTENT|RDATASET_ATTR_STALE)) != 0) {
1366			if (top_prev != NULL)
1367				top_prev->next = current->next;
1368			else
1369				node->data = current->next;
1370			free_rdataset(rbtdb, mctx, current);
1371		} else
1372			top_prev = current;
1373	}
1374	node->dirty = 0;
1375}
1376
1377static inline void
1378clean_zone_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
1379		rbtdb_serial_t least_serial)
1380{
1381	rdatasetheader_t *current, *dcurrent, *down_next, *dparent;
1382	rdatasetheader_t *top_prev, *top_next;
1383	isc_mem_t *mctx = rbtdb->common.mctx;
1384	isc_boolean_t still_dirty = ISC_FALSE;
1385
1386	/*
1387	 * Caller must be holding the node lock.
1388	 */
1389	REQUIRE(least_serial != 0);
1390
1391	top_prev = NULL;
1392	for (current = node->data; current != NULL; current = top_next) {
1393		top_next = current->next;
1394
1395		/*
1396		 * First, we clean up any instances of multiple rdatasets
1397		 * with the same serial number, or that have the IGNORE
1398		 * attribute.
1399		 */
1400		dparent = current;
1401		for (dcurrent = current->down;
1402		     dcurrent != NULL;
1403		     dcurrent = down_next) {
1404			down_next = dcurrent->down;
1405			INSIST(dcurrent->serial <= dparent->serial);
1406			if (dcurrent->serial == dparent->serial ||
1407			    IGNORE(dcurrent)) {
1408				if (down_next != NULL)
1409					down_next->next = dparent;
1410				dparent->down = down_next;
1411				free_rdataset(rbtdb, mctx, dcurrent);
1412			} else
1413				dparent = dcurrent;
1414		}
1415
1416		/*
1417		 * We've now eliminated all IGNORE datasets with the possible
1418		 * exception of current, which we now check.
1419		 */
1420		if (IGNORE(current)) {
1421			down_next = current->down;
1422			if (down_next == NULL) {
1423				if (top_prev != NULL)
1424					top_prev->next = current->next;
1425				else
1426					node->data = current->next;
1427				free_rdataset(rbtdb, mctx, current);
1428				/*
1429				 * current no longer exists, so we can
1430				 * just continue with the loop.
1431				 */
1432				continue;
1433			} else {
1434				/*
1435				 * Pull up current->down, making it the new
1436				 * current.
1437				 */
1438				if (top_prev != NULL)
1439					top_prev->next = down_next;
1440				else
1441					node->data = down_next;
1442				down_next->next = top_next;
1443				free_rdataset(rbtdb, mctx, current);
1444				current = down_next;
1445			}
1446		}
1447
1448		/*
1449		 * We now try to find the first down node less than the
1450		 * least serial.
1451		 */
1452		dparent = current;
1453		for (dcurrent = current->down;
1454		     dcurrent != NULL;
1455		     dcurrent = down_next) {
1456			down_next = dcurrent->down;
1457			if (dcurrent->serial < least_serial)
1458				break;
1459			dparent = dcurrent;
1460		}
1461
1462		/*
1463		 * If there is a such an rdataset, delete it and any older
1464		 * versions.
1465		 */
1466		if (dcurrent != NULL) {
1467			do {
1468				down_next = dcurrent->down;
1469				INSIST(dcurrent->serial <= least_serial);
1470				free_rdataset(rbtdb, mctx, dcurrent);
1471				dcurrent = down_next;
1472			} while (dcurrent != NULL);
1473			dparent->down = NULL;
1474		}
1475
1476		/*
1477		 * Note.  The serial number of 'current' might be less than
1478		 * least_serial too, but we cannot delete it because it is
1479		 * the most recent version, unless it is a NONEXISTENT
1480		 * rdataset.
1481		 */
1482		if (current->down != NULL) {
1483			still_dirty = ISC_TRUE;
1484			top_prev = current;
1485		} else {
1486			/*
1487			 * If this is a NONEXISTENT rdataset, we can delete it.
1488			 */
1489			if (NONEXISTENT(current)) {
1490				if (top_prev != NULL)
1491					top_prev->next = current->next;
1492				else
1493					node->data = current->next;
1494				free_rdataset(rbtdb, mctx, current);
1495			} else
1496				top_prev = current;
1497		}
1498	}
1499	if (!still_dirty)
1500		node->dirty = 0;
1501}
1502
1503static void
1504delete_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node)
1505{
1506	dns_rbtnode_t *nsecnode;
1507	dns_fixedname_t fname;
1508	dns_name_t *name;
1509	isc_result_t result = ISC_R_UNEXPECTED;
1510
1511	INSIST(!ISC_LINK_LINKED(node, deadlink));
1512
1513	switch (node->nsec) {
1514	case DNS_RBT_NSEC_NORMAL:
1515#ifdef BIND9
1516		if (rbtdb->rpz_cidr != NULL) {
1517			dns_fixedname_init(&fname);
1518			name = dns_fixedname_name(&fname);
1519			dns_rbt_fullnamefromnode(node, name);
1520			dns_rpz_cidr_deleteip(rbtdb->rpz_cidr, name);
1521		}
1522#endif
1523		result = dns_rbt_deletenode(rbtdb->tree, node, ISC_FALSE);
1524		break;
1525	case DNS_RBT_NSEC_HAS_NSEC:
1526		dns_fixedname_init(&fname);
1527		name = dns_fixedname_name(&fname);
1528		dns_rbt_fullnamefromnode(node, name);
1529		/*
1530		 * Delete the corresponding node from the auxiliary NSEC
1531		 * tree before deleting from the main tree.
1532		 */
1533		nsecnode = NULL;
1534		result = dns_rbt_findnode(rbtdb->nsec, name, NULL, &nsecnode,
1535					  NULL, DNS_RBTFIND_EMPTYDATA,
1536					  NULL, NULL);
1537		if (result != ISC_R_SUCCESS) {
1538			isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
1539				      DNS_LOGMODULE_CACHE, ISC_LOG_WARNING,
1540				      "delete_node: "
1541				      "dns_rbt_findnode(nsec): %s",
1542				      isc_result_totext(result));
1543		} else {
1544			result = dns_rbt_deletenode(rbtdb->nsec, nsecnode,
1545						    ISC_FALSE);
1546			if (result != ISC_R_SUCCESS) {
1547				isc_log_write(dns_lctx,
1548					      DNS_LOGCATEGORY_DATABASE,
1549					      DNS_LOGMODULE_CACHE,
1550					      ISC_LOG_WARNING,
1551					      "delete_nsecnode(): "
1552					      "dns_rbt_deletenode(nsecnode): %s",
1553					      isc_result_totext(result));
1554			}
1555		}
1556		result = dns_rbt_deletenode(rbtdb->tree, node, ISC_FALSE);
1557#ifdef BIND9
1558		dns_rpz_cidr_deleteip(rbtdb->rpz_cidr, name);
1559#endif
1560		break;
1561	case DNS_RBT_NSEC_NSEC:
1562		result = dns_rbt_deletenode(rbtdb->nsec, node, ISC_FALSE);
1563		break;
1564	case DNS_RBT_NSEC_NSEC3:
1565		result = dns_rbt_deletenode(rbtdb->nsec3, node, ISC_FALSE);
1566		break;
1567	}
1568	if (result != ISC_R_SUCCESS) {
1569		isc_log_write(dns_lctx,
1570			      DNS_LOGCATEGORY_DATABASE,
1571			      DNS_LOGMODULE_CACHE,
1572			      ISC_LOG_WARNING,
1573			      "delete_nsecnode(): "
1574			      "dns_rbt_deletenode: %s",
1575			      isc_result_totext(result));
1576	}
1577}
1578
1579/*%
1580 * Clean up dead nodes.  These are nodes which have no references, and
1581 * have no data.  They are dead but we could not or chose not to delete
1582 * them when we deleted all the data at that node because we did not want
1583 * to wait for the tree write lock.
1584 *
1585 * The caller must hold a tree write lock and bucketnum'th node (write) lock.
1586 */
1587static void
1588cleanup_dead_nodes(dns_rbtdb_t *rbtdb, int bucketnum) {
1589	dns_rbtnode_t *node;
1590	int count = 10;         /* XXXJT: should be adjustable */
1591
1592	node = ISC_LIST_HEAD(rbtdb->deadnodes[bucketnum]);
1593	while (node != NULL && count > 0) {
1594		ISC_LIST_UNLINK(rbtdb->deadnodes[bucketnum], node, deadlink);
1595
1596		/*
1597		 * Since we're holding a tree write lock, it should be
1598		 * impossible for this node to be referenced by others.
1599		 */
1600		INSIST(dns_rbtnode_refcurrent(node) == 0 &&
1601		       node->data == NULL);
1602
1603		delete_node(rbtdb, node);
1604
1605		node = ISC_LIST_HEAD(rbtdb->deadnodes[bucketnum]);
1606		count--;
1607	}
1608}
1609
1610/*
1611 * Caller must be holding the node lock.
1612 */
1613static inline void
1614new_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node) {
1615	unsigned int lockrefs, noderefs;
1616	isc_refcount_t *lockref;
1617
1618	INSIST(!ISC_LINK_LINKED(node, deadlink));
1619	dns_rbtnode_refincrement0(node, &noderefs);
1620	if (noderefs == 1) {    /* this is the first reference to the node */
1621		lockref = &rbtdb->node_locks[node->locknum].references;
1622		isc_refcount_increment0(lockref, &lockrefs);
1623		INSIST(lockrefs != 0);
1624	}
1625	INSIST(noderefs != 0);
1626}
1627
1628/*
1629 * This function is assumed to be called when a node is newly referenced
1630 * and can be in the deadnode list.  In that case the node must be retrieved
1631 * from the list because it is going to be used.  In addition, if the caller
1632 * happens to hold a write lock on the tree, it's a good chance to purge dead
1633 * nodes.
1634 * Note: while a new reference is gained in multiple places, there are only very
1635 * few cases where the node can be in the deadnode list (only empty nodes can
1636 * have been added to the list).
1637 */
1638static inline void
1639reactivate_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
1640		isc_rwlocktype_t treelocktype)
1641{
1642	isc_rwlocktype_t locktype = isc_rwlocktype_read;
1643	nodelock_t *nodelock = &rbtdb->node_locks[node->locknum].lock;
1644	isc_boolean_t maybe_cleanup = ISC_FALSE;
1645
1646	POST(locktype);
1647
1648	NODE_STRONGLOCK(nodelock);
1649	NODE_WEAKLOCK(nodelock, locktype);
1650
1651	/*
1652	 * Check if we can possibly cleanup the dead node.  If so, upgrade
1653	 * the node lock below to perform the cleanup.
1654	 */
1655	if (!ISC_LIST_EMPTY(rbtdb->deadnodes[node->locknum]) &&
1656	    treelocktype == isc_rwlocktype_write) {
1657		maybe_cleanup = ISC_TRUE;
1658	}
1659
1660	if (ISC_LINK_LINKED(node, deadlink) || maybe_cleanup) {
1661		/*
1662		 * Upgrade the lock and test if we still need to unlink.
1663		 */
1664		NODE_WEAKUNLOCK(nodelock, locktype);
1665		locktype = isc_rwlocktype_write;
1666		POST(locktype);
1667		NODE_WEAKLOCK(nodelock, locktype);
1668		if (ISC_LINK_LINKED(node, deadlink))
1669			ISC_LIST_UNLINK(rbtdb->deadnodes[node->locknum],
1670					node, deadlink);
1671		if (maybe_cleanup)
1672			cleanup_dead_nodes(rbtdb, node->locknum);
1673	}
1674
1675	new_reference(rbtdb, node);
1676
1677	NODE_WEAKUNLOCK(nodelock, locktype);
1678	NODE_STRONGUNLOCK(nodelock);
1679}
1680
1681/*
1682 * Caller must be holding the node lock; either the "strong", read or write
1683 * lock.  Note that the lock must be held even when node references are
1684 * atomically modified; in that case the decrement operation itself does not
1685 * have to be protected, but we must avoid a race condition where multiple
1686 * threads are decreasing the reference to zero simultaneously and at least
1687 * one of them is going to free the node.
1688 * This function returns ISC_TRUE if and only if the node reference decreases
1689 * to zero.
1690 */
1691static isc_boolean_t
1692decrement_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
1693		    rbtdb_serial_t least_serial,
1694		    isc_rwlocktype_t nlock, isc_rwlocktype_t tlock,
1695		    isc_boolean_t pruning)
1696{
1697	isc_result_t result;
1698	isc_boolean_t write_locked;
1699	rbtdb_nodelock_t *nodelock;
1700	unsigned int refs, nrefs;
1701	int bucket = node->locknum;
1702	isc_boolean_t no_reference = ISC_TRUE;
1703
1704	nodelock = &rbtdb->node_locks[bucket];
1705
1706	/* Handle easy and typical case first. */
1707	if (!node->dirty && (node->data != NULL || node->down != NULL)) {
1708		dns_rbtnode_refdecrement(node, &nrefs);
1709		INSIST((int)nrefs >= 0);
1710		if (nrefs == 0) {
1711			isc_refcount_decrement(&nodelock->references, &refs);
1712			INSIST((int)refs >= 0);
1713		}
1714		return ((nrefs == 0) ? ISC_TRUE : ISC_FALSE);
1715	}
1716
1717	/* Upgrade the lock? */
1718	if (nlock == isc_rwlocktype_read) {
1719		NODE_WEAKUNLOCK(&nodelock->lock, isc_rwlocktype_read);
1720		NODE_WEAKL

Large files are truncated, but you can click here to view the full file