/contrib/bind9/lib/dns/rbtdb.c
C | 9332 lines | 6603 code | 1036 blank | 1693 comment | 2043 complexity | 246c4f82217f43e68410e305432312b4 MD5 | raw file
Large files are truncated; click here to view the full file.
1/* 2 * Copyright (C) 2004-2012 Internet Systems Consortium, Inc. ("ISC") 3 * Copyright (C) 1999-2003 Internet Software Consortium. 4 * 5 * Permission to use, copy, modify, and/or distribute this software for any 6 * purpose with or without fee is hereby granted, provided that the above 7 * copyright notice and this permission notice appear in all copies. 8 * 9 * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH 10 * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 11 * AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT, 12 * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 13 * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE 14 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 15 * PERFORMANCE OF THIS SOFTWARE. 16 */ 17 18/* $Id$ */ 19 20/*! \file */ 21 22/* 23 * Principal Author: Bob Halley 24 */ 25 26#include <config.h> 27 28/* #define inline */ 29 30#include <isc/event.h> 31#include <isc/heap.h> 32#include <isc/mem.h> 33#include <isc/mutex.h> 34#include <isc/platform.h> 35#include <isc/print.h> 36#include <isc/random.h> 37#include <isc/refcount.h> 38#include <isc/rwlock.h> 39#include <isc/serial.h> 40#include <isc/string.h> 41#include <isc/task.h> 42#include <isc/time.h> 43#include <isc/util.h> 44 45#include <dns/acache.h> 46#include <dns/db.h> 47#include <dns/dbiterator.h> 48#include <dns/events.h> 49#include <dns/fixedname.h> 50#include <dns/lib.h> 51#include <dns/log.h> 52#include <dns/masterdump.h> 53#include <dns/nsec.h> 54#include <dns/nsec3.h> 55#include <dns/rbt.h> 56#include <dns/rpz.h> 57#include <dns/rdata.h> 58#include <dns/rdataset.h> 59#include <dns/rdatasetiter.h> 60#include <dns/rdataslab.h> 61#include <dns/rdatastruct.h> 62#include <dns/result.h> 63#include <dns/stats.h> 64#include <dns/view.h> 65#include <dns/zone.h> 66#include <dns/zonekey.h> 67 68#ifdef DNS_RBTDB_VERSION64 69#include "rbtdb64.h" 
70#else 71#include "rbtdb.h" 72#endif 73 74#ifdef DNS_RBTDB_VERSION64 75#define RBTDB_MAGIC ISC_MAGIC('R', 'B', 'D', '8') 76#else 77#define RBTDB_MAGIC ISC_MAGIC('R', 'B', 'D', '4') 78#endif 79 80/*% 81 * Note that "impmagic" is not the first four bytes of the struct, so 82 * ISC_MAGIC_VALID cannot be used. 83 */ 84#define VALID_RBTDB(rbtdb) ((rbtdb) != NULL && \ 85 (rbtdb)->common.impmagic == RBTDB_MAGIC) 86 87#ifdef DNS_RBTDB_VERSION64 88typedef isc_uint64_t rbtdb_serial_t; 89/*% 90 * Make casting easier in symbolic debuggers by using different names 91 * for the 64 bit version. 92 */ 93#define dns_rbtdb_t dns_rbtdb64_t 94#define rdatasetheader_t rdatasetheader64_t 95#define rbtdb_version_t rbtdb_version64_t 96#else 97typedef isc_uint32_t rbtdb_serial_t; 98#endif 99 100typedef isc_uint32_t rbtdb_rdatatype_t; 101 102#define RBTDB_RDATATYPE_BASE(type) ((dns_rdatatype_t)((type) & 0xFFFF)) 103#define RBTDB_RDATATYPE_EXT(type) ((dns_rdatatype_t)((type) >> 16)) 104#define RBTDB_RDATATYPE_VALUE(b, e) ((rbtdb_rdatatype_t)((e) << 16) | (b)) 105 106#define RBTDB_RDATATYPE_SIGNSEC \ 107 RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_nsec) 108#define RBTDB_RDATATYPE_SIGNSEC3 \ 109 RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_nsec3) 110#define RBTDB_RDATATYPE_SIGNS \ 111 RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_ns) 112#define RBTDB_RDATATYPE_SIGCNAME \ 113 RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_cname) 114#define RBTDB_RDATATYPE_SIGDNAME \ 115 RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_dname) 116#define RBTDB_RDATATYPE_NCACHEANY \ 117 RBTDB_RDATATYPE_VALUE(0, dns_rdatatype_any) 118 119/* 120 * We use rwlock for DB lock only when ISC_RWLOCK_USEATOMIC is non 0. 121 * Using rwlock is effective with regard to lookup performance only when 122 * it is implemented in an efficient way. 
123 * Otherwise, it is generally wise to stick to the simple locking since rwlock 124 * would require more memory or can even make lookups slower due to its own 125 * overhead (when it internally calls mutex locks). 126 */ 127#ifdef ISC_RWLOCK_USEATOMIC 128#define DNS_RBTDB_USERWLOCK 1 129#else 130#define DNS_RBTDB_USERWLOCK 0 131#endif 132 133#if DNS_RBTDB_USERWLOCK 134#define RBTDB_INITLOCK(l) isc_rwlock_init((l), 0, 0) 135#define RBTDB_DESTROYLOCK(l) isc_rwlock_destroy(l) 136#define RBTDB_LOCK(l, t) RWLOCK((l), (t)) 137#define RBTDB_UNLOCK(l, t) RWUNLOCK((l), (t)) 138#else 139#define RBTDB_INITLOCK(l) isc_mutex_init(l) 140#define RBTDB_DESTROYLOCK(l) DESTROYLOCK(l) 141#define RBTDB_LOCK(l, t) LOCK(l) 142#define RBTDB_UNLOCK(l, t) UNLOCK(l) 143#endif 144 145/* 146 * Since node locking is sensitive to both performance and memory footprint, 147 * we need some trick here. If we have both high-performance rwlock and 148 * high performance and small-memory reference counters, we use rwlock for 149 * node lock and isc_refcount for node references. In this case, we don't have 150 * to protect the access to the counters by locks. 151 * Otherwise, we simply use ordinary mutex lock for node locking, and use 152 * simple integers as reference counters which is protected by the lock. 153 * In most cases, we can simply use wrapper macros such as NODE_LOCK and 154 * NODE_UNLOCK. In some other cases, however, we need to protect reference 155 * counters first and then protect other parts of a node as read-only data. 156 * Special additional macros, NODE_STRONGLOCK(), NODE_WEAKLOCK(), etc, are also 157 * provided for these special cases. When we can use the efficient backend 158 * routines, we should only protect the "other members" by NODE_WEAKLOCK(read). 159 * Otherwise, we should use NODE_STRONGLOCK() to protect the entire critical 160 * section including the access to the reference counter. 
161 * Note that we cannot use NODE_LOCK()/NODE_UNLOCK() wherever the protected 162 * section is also protected by NODE_STRONGLOCK(). 163 */ 164#if defined(ISC_RWLOCK_USEATOMIC) && defined(DNS_RBT_USEISCREFCOUNT) 165typedef isc_rwlock_t nodelock_t; 166 167#define NODE_INITLOCK(l) isc_rwlock_init((l), 0, 0) 168#define NODE_DESTROYLOCK(l) isc_rwlock_destroy(l) 169#define NODE_LOCK(l, t) RWLOCK((l), (t)) 170#define NODE_UNLOCK(l, t) RWUNLOCK((l), (t)) 171#define NODE_TRYUPGRADE(l) isc_rwlock_tryupgrade(l) 172 173#define NODE_STRONGLOCK(l) ((void)0) 174#define NODE_STRONGUNLOCK(l) ((void)0) 175#define NODE_WEAKLOCK(l, t) NODE_LOCK(l, t) 176#define NODE_WEAKUNLOCK(l, t) NODE_UNLOCK(l, t) 177#define NODE_WEAKDOWNGRADE(l) isc_rwlock_downgrade(l) 178#else 179typedef isc_mutex_t nodelock_t; 180 181#define NODE_INITLOCK(l) isc_mutex_init(l) 182#define NODE_DESTROYLOCK(l) DESTROYLOCK(l) 183#define NODE_LOCK(l, t) LOCK(l) 184#define NODE_UNLOCK(l, t) UNLOCK(l) 185#define NODE_TRYUPGRADE(l) ISC_R_SUCCESS 186 187#define NODE_STRONGLOCK(l) LOCK(l) 188#define NODE_STRONGUNLOCK(l) UNLOCK(l) 189#define NODE_WEAKLOCK(l, t) ((void)0) 190#define NODE_WEAKUNLOCK(l, t) ((void)0) 191#define NODE_WEAKDOWNGRADE(l) ((void)0) 192#endif 193 194/*% 195 * Whether to rate-limit updating the LRU to avoid possible thread contention. 196 * Our performance measurement has shown the cost is marginal, so it's defined 197 * to be 0 by default either with or without threads. 198 */ 199#ifndef DNS_RBTDB_LIMITLRUUPDATE 200#define DNS_RBTDB_LIMITLRUUPDATE 0 201#endif 202 203/* 204 * Allow clients with a virtual time of up to 5 minutes in the past to see 205 * records that would have otherwise have expired. 206 */ 207#define RBTDB_VIRTUAL 300 208 209struct noqname { 210 dns_name_t name; 211 void * neg; 212 void * negsig; 213 dns_rdatatype_t type; 214}; 215 216typedef struct acachectl acachectl_t; 217 218typedef struct rdatasetheader { 219 /*% 220 * Locked by the owning node's lock. 
221 */ 222 rbtdb_serial_t serial; 223 dns_ttl_t rdh_ttl; 224 rbtdb_rdatatype_t type; 225 isc_uint16_t attributes; 226 dns_trust_t trust; 227 struct noqname *noqname; 228 struct noqname *closest; 229 /*%< 230 * We don't use the LIST macros, because the LIST structure has 231 * both head and tail pointers, and is doubly linked. 232 */ 233 234 struct rdatasetheader *next; 235 /*%< 236 * If this is the top header for an rdataset, 'next' points 237 * to the top header for the next rdataset (i.e., the next type). 238 * Otherwise, it points up to the header whose down pointer points 239 * at this header. 240 */ 241 242 struct rdatasetheader *down; 243 /*%< 244 * Points to the header for the next older version of 245 * this rdataset. 246 */ 247 248 isc_uint32_t count; 249 /*%< 250 * Monotonously increased every time this rdataset is bound so that 251 * it is used as the base of the starting point in DNS responses 252 * when the "cyclic" rrset-order is required. Since the ordering 253 * should not be so crucial, no lock is set for the counter for 254 * performance reasons. 255 */ 256 257 acachectl_t *additional_auth; 258 acachectl_t *additional_glue; 259 260 dns_rbtnode_t *node; 261 isc_stdtime_t last_used; 262 ISC_LINK(struct rdatasetheader) link; 263 264 unsigned int heap_index; 265 /*%< 266 * Used for TTL-based cache cleaning. 
267 */ 268 isc_stdtime_t resign; 269} rdatasetheader_t; 270 271typedef ISC_LIST(rdatasetheader_t) rdatasetheaderlist_t; 272typedef ISC_LIST(dns_rbtnode_t) rbtnodelist_t; 273 274#define RDATASET_ATTR_NONEXISTENT 0x0001 275#define RDATASET_ATTR_STALE 0x0002 276#define RDATASET_ATTR_IGNORE 0x0004 277#define RDATASET_ATTR_RETAIN 0x0008 278#define RDATASET_ATTR_NXDOMAIN 0x0010 279#define RDATASET_ATTR_RESIGN 0x0020 280#define RDATASET_ATTR_STATCOUNT 0x0040 281#define RDATASET_ATTR_OPTOUT 0x0080 282#define RDATASET_ATTR_NEGATIVE 0x0100 283 284typedef struct acache_cbarg { 285 dns_rdatasetadditional_t type; 286 unsigned int count; 287 dns_db_t *db; 288 dns_dbnode_t *node; 289 rdatasetheader_t *header; 290} acache_cbarg_t; 291 292struct acachectl { 293 dns_acacheentry_t *entry; 294 acache_cbarg_t *cbarg; 295}; 296 297/* 298 * XXX 299 * When the cache will pre-expire data (due to memory low or other 300 * situations) before the rdataset's TTL has expired, it MUST 301 * respect the RETAIN bit and not expire the data until its TTL is 302 * expired. 303 */ 304 305#undef IGNORE /* WIN32 winbase.h defines this. */ 306 307#define EXISTS(header) \ 308 (((header)->attributes & RDATASET_ATTR_NONEXISTENT) == 0) 309#define NONEXISTENT(header) \ 310 (((header)->attributes & RDATASET_ATTR_NONEXISTENT) != 0) 311#define IGNORE(header) \ 312 (((header)->attributes & RDATASET_ATTR_IGNORE) != 0) 313#define RETAIN(header) \ 314 (((header)->attributes & RDATASET_ATTR_RETAIN) != 0) 315#define NXDOMAIN(header) \ 316 (((header)->attributes & RDATASET_ATTR_NXDOMAIN) != 0) 317#define RESIGN(header) \ 318 (((header)->attributes & RDATASET_ATTR_RESIGN) != 0) 319#define OPTOUT(header) \ 320 (((header)->attributes & RDATASET_ATTR_OPTOUT) != 0) 321#define NEGATIVE(header) \ 322 (((header)->attributes & RDATASET_ATTR_NEGATIVE) != 0) 323 324#define DEFAULT_NODE_LOCK_COUNT 7 /*%< Should be prime. */ 325 326/*% 327 * Number of buckets for cache DB entries (locks, LRU lists, TTL heaps). 
328 * There is a tradeoff issue about configuring this value: if this is too 329 * small, it may cause heavier contention between threads; if this is too large, 330 * LRU purge algorithm won't work well (entries tend to be purged prematurely). 331 * The default value should work well for most environments, but this can 332 * also be configurable at compilation time via the 333 * DNS_RBTDB_CACHE_NODE_LOCK_COUNT variable. This value must be larger than 334 * 1 due to the assumption of overmem_purge(). 335 */ 336#ifdef DNS_RBTDB_CACHE_NODE_LOCK_COUNT 337#if DNS_RBTDB_CACHE_NODE_LOCK_COUNT <= 1 338#error "DNS_RBTDB_CACHE_NODE_LOCK_COUNT must be larger than 1" 339#else 340#define DEFAULT_CACHE_NODE_LOCK_COUNT DNS_RBTDB_CACHE_NODE_LOCK_COUNT 341#endif 342#else 343#define DEFAULT_CACHE_NODE_LOCK_COUNT 16 344#endif /* DNS_RBTDB_CACHE_NODE_LOCK_COUNT */ 345 346typedef struct { 347 nodelock_t lock; 348 /* Protected in the refcount routines. */ 349 isc_refcount_t references; 350 /* Locked by lock. */ 351 isc_boolean_t exiting; 352} rbtdb_nodelock_t; 353 354typedef struct rbtdb_changed { 355 dns_rbtnode_t * node; 356 isc_boolean_t dirty; 357 ISC_LINK(struct rbtdb_changed) link; 358} rbtdb_changed_t; 359 360typedef ISC_LIST(rbtdb_changed_t) rbtdb_changedlist_t; 361 362typedef enum { 363 dns_db_insecure, 364 dns_db_partial, 365 dns_db_secure 366} dns_db_secure_t; 367 368typedef struct dns_rbtdb dns_rbtdb_t; 369 370typedef struct rbtdb_version { 371 /* Not locked */ 372 rbtdb_serial_t serial; 373 dns_rbtdb_t * rbtdb; 374 /* 375 * Protected in the refcount routines. 376 * XXXJT: should we change the lock policy based on the refcount 377 * performance? 378 */ 379 isc_refcount_t references; 380 /* Locked by database lock. 
*/ 381 isc_boolean_t writer; 382 isc_boolean_t commit_ok; 383 rbtdb_changedlist_t changed_list; 384 rdatasetheaderlist_t resigned_list; 385 ISC_LINK(struct rbtdb_version) link; 386 dns_db_secure_t secure; 387 isc_boolean_t havensec3; 388 /* NSEC3 parameters */ 389 dns_hash_t hash; 390 isc_uint8_t flags; 391 isc_uint16_t iterations; 392 isc_uint8_t salt_length; 393 unsigned char salt[DNS_NSEC3_SALTSIZE]; 394} rbtdb_version_t; 395 396typedef ISC_LIST(rbtdb_version_t) rbtdb_versionlist_t; 397 398struct dns_rbtdb { 399 /* Unlocked. */ 400 dns_db_t common; 401 /* Locks the data in this struct */ 402#if DNS_RBTDB_USERWLOCK 403 isc_rwlock_t lock; 404#else 405 isc_mutex_t lock; 406#endif 407 /* Locks the tree structure (prevents nodes appearing/disappearing) */ 408 isc_rwlock_t tree_lock; 409 /* Locks for individual tree nodes */ 410 unsigned int node_lock_count; 411 rbtdb_nodelock_t * node_locks; 412 dns_rbtnode_t * origin_node; 413 dns_stats_t * rrsetstats; /* cache DB only */ 414 /* Locked by lock. */ 415 unsigned int active; 416 isc_refcount_t references; 417 unsigned int attributes; 418 rbtdb_serial_t current_serial; 419 rbtdb_serial_t least_serial; 420 rbtdb_serial_t next_serial; 421 rbtdb_version_t * current_version; 422 rbtdb_version_t * future_version; 423 rbtdb_versionlist_t open_versions; 424 isc_task_t * task; 425 dns_dbnode_t *soanode; 426 dns_dbnode_t *nsnode; 427 428 /* 429 * This is a linked list used to implement the LRU cache. There will 430 * be node_lock_count linked lists here. Nodes in bucket 1 will be 431 * placed on the linked list rdatasets[1]. 432 */ 433 rdatasetheaderlist_t *rdatasets; 434 435 /*% 436 * Temporary storage for stale cache nodes and dynamically deleted 437 * nodes that await being cleaned up. 438 */ 439 rbtnodelist_t *deadnodes; 440 441 /* 442 * Heaps. These are used for TTL based expiry in a cache, 443 * or for zone resigning in a zone DB. 
hmctx is the memory 444 * context to use for the heap (which differs from the main 445 * database memory context in the case of a cache). 446 */ 447 isc_mem_t * hmctx; 448 isc_heap_t **heaps; 449 450 /* Locked by tree_lock. */ 451 dns_rbt_t * tree; 452 dns_rbt_t * nsec; 453 dns_rbt_t * nsec3; 454 dns_rpz_cidr_t * rpz_cidr; 455 456 /* Unlocked */ 457 unsigned int quantum; 458}; 459 460#define RBTDB_ATTR_LOADED 0x01 461#define RBTDB_ATTR_LOADING 0x02 462 463/*% 464 * Search Context 465 */ 466typedef struct { 467 dns_rbtdb_t * rbtdb; 468 rbtdb_version_t * rbtversion; 469 rbtdb_serial_t serial; 470 unsigned int options; 471 dns_rbtnodechain_t chain; 472 isc_boolean_t copy_name; 473 isc_boolean_t need_cleanup; 474 isc_boolean_t wild; 475 dns_rbtnode_t * zonecut; 476 rdatasetheader_t * zonecut_rdataset; 477 rdatasetheader_t * zonecut_sigrdataset; 478 dns_fixedname_t zonecut_name; 479 isc_stdtime_t now; 480} rbtdb_search_t; 481 482/*% 483 * Load Context 484 */ 485typedef struct { 486 dns_rbtdb_t * rbtdb; 487 isc_stdtime_t now; 488} rbtdb_load_t; 489 490static void rdataset_disassociate(dns_rdataset_t *rdataset); 491static isc_result_t rdataset_first(dns_rdataset_t *rdataset); 492static isc_result_t rdataset_next(dns_rdataset_t *rdataset); 493static void rdataset_current(dns_rdataset_t *rdataset, dns_rdata_t *rdata); 494static void rdataset_clone(dns_rdataset_t *source, dns_rdataset_t *target); 495static unsigned int rdataset_count(dns_rdataset_t *rdataset); 496static isc_result_t rdataset_getnoqname(dns_rdataset_t *rdataset, 497 dns_name_t *name, 498 dns_rdataset_t *neg, 499 dns_rdataset_t *negsig); 500static isc_result_t rdataset_getclosest(dns_rdataset_t *rdataset, 501 dns_name_t *name, 502 dns_rdataset_t *neg, 503 dns_rdataset_t *negsig); 504static isc_result_t rdataset_getadditional(dns_rdataset_t *rdataset, 505 dns_rdatasetadditional_t type, 506 dns_rdatatype_t qtype, 507 dns_acache_t *acache, 508 dns_zone_t **zonep, 509 dns_db_t **dbp, 510 dns_dbversion_t 
**versionp, 511 dns_dbnode_t **nodep, 512 dns_name_t *fname, 513 dns_message_t *msg, 514 isc_stdtime_t now); 515static isc_result_t rdataset_setadditional(dns_rdataset_t *rdataset, 516 dns_rdatasetadditional_t type, 517 dns_rdatatype_t qtype, 518 dns_acache_t *acache, 519 dns_zone_t *zone, 520 dns_db_t *db, 521 dns_dbversion_t *version, 522 dns_dbnode_t *node, 523 dns_name_t *fname); 524static isc_result_t rdataset_putadditional(dns_acache_t *acache, 525 dns_rdataset_t *rdataset, 526 dns_rdatasetadditional_t type, 527 dns_rdatatype_t qtype); 528static inline isc_boolean_t need_headerupdate(rdatasetheader_t *header, 529 isc_stdtime_t now); 530static void update_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, 531 isc_stdtime_t now); 532static void expire_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, 533 isc_boolean_t tree_locked); 534static void overmem_purge(dns_rbtdb_t *rbtdb, unsigned int locknum_start, 535 isc_stdtime_t now, isc_boolean_t tree_locked); 536static isc_result_t resign_insert(dns_rbtdb_t *rbtdb, int idx, 537 rdatasetheader_t *newheader); 538static void prune_tree(isc_task_t *task, isc_event_t *event); 539static void rdataset_settrust(dns_rdataset_t *rdataset, dns_trust_t trust); 540static void rdataset_expire(dns_rdataset_t *rdataset); 541 542static dns_rdatasetmethods_t rdataset_methods = { 543 rdataset_disassociate, 544 rdataset_first, 545 rdataset_next, 546 rdataset_current, 547 rdataset_clone, 548 rdataset_count, 549 NULL, 550 rdataset_getnoqname, 551 NULL, 552 rdataset_getclosest, 553 rdataset_getadditional, 554 rdataset_setadditional, 555 rdataset_putadditional, 556 rdataset_settrust, 557 rdataset_expire 558}; 559 560static void rdatasetiter_destroy(dns_rdatasetiter_t **iteratorp); 561static isc_result_t rdatasetiter_first(dns_rdatasetiter_t *iterator); 562static isc_result_t rdatasetiter_next(dns_rdatasetiter_t *iterator); 563static void rdatasetiter_current(dns_rdatasetiter_t *iterator, 564 dns_rdataset_t *rdataset); 565 566static 
dns_rdatasetitermethods_t rdatasetiter_methods = { 567 rdatasetiter_destroy, 568 rdatasetiter_first, 569 rdatasetiter_next, 570 rdatasetiter_current 571}; 572 573typedef struct rbtdb_rdatasetiter { 574 dns_rdatasetiter_t common; 575 rdatasetheader_t * current; 576} rbtdb_rdatasetiter_t; 577 578static void dbiterator_destroy(dns_dbiterator_t **iteratorp); 579static isc_result_t dbiterator_first(dns_dbiterator_t *iterator); 580static isc_result_t dbiterator_last(dns_dbiterator_t *iterator); 581static isc_result_t dbiterator_seek(dns_dbiterator_t *iterator, 582 dns_name_t *name); 583static isc_result_t dbiterator_prev(dns_dbiterator_t *iterator); 584static isc_result_t dbiterator_next(dns_dbiterator_t *iterator); 585static isc_result_t dbiterator_current(dns_dbiterator_t *iterator, 586 dns_dbnode_t **nodep, 587 dns_name_t *name); 588static isc_result_t dbiterator_pause(dns_dbiterator_t *iterator); 589static isc_result_t dbiterator_origin(dns_dbiterator_t *iterator, 590 dns_name_t *name); 591 592static dns_dbiteratormethods_t dbiterator_methods = { 593 dbiterator_destroy, 594 dbiterator_first, 595 dbiterator_last, 596 dbiterator_seek, 597 dbiterator_prev, 598 dbiterator_next, 599 dbiterator_current, 600 dbiterator_pause, 601 dbiterator_origin 602}; 603 604#define DELETION_BATCH_MAX 64 605 606/* 607 * If 'paused' is ISC_TRUE, then the tree lock is not being held. 
608 */ 609typedef struct rbtdb_dbiterator { 610 dns_dbiterator_t common; 611 isc_boolean_t paused; 612 isc_boolean_t new_origin; 613 isc_rwlocktype_t tree_locked; 614 isc_result_t result; 615 dns_fixedname_t name; 616 dns_fixedname_t origin; 617 dns_rbtnodechain_t chain; 618 dns_rbtnodechain_t nsec3chain; 619 dns_rbtnodechain_t *current; 620 dns_rbtnode_t *node; 621 dns_rbtnode_t *deletions[DELETION_BATCH_MAX]; 622 int delete; 623 isc_boolean_t nsec3only; 624 isc_boolean_t nonsec3; 625} rbtdb_dbiterator_t; 626 627 628#define IS_STUB(rbtdb) (((rbtdb)->common.attributes & DNS_DBATTR_STUB) != 0) 629#define IS_CACHE(rbtdb) (((rbtdb)->common.attributes & DNS_DBATTR_CACHE) != 0) 630 631static void free_rbtdb(dns_rbtdb_t *rbtdb, isc_boolean_t log, 632 isc_event_t *event); 633static void overmem(dns_db_t *db, isc_boolean_t overmem); 634#ifdef BIND9 635static void setnsec3parameters(dns_db_t *db, rbtdb_version_t *version); 636#endif 637 638/*% 639 * 'init_count' is used to initialize 'newheader->count' which inturn 640 * is used to determine where in the cycle rrset-order cyclic starts. 641 * We don't lock this as we don't care about simultaneous updates. 642 * 643 * Note: 644 * Both init_count and header->count can be ISC_UINT32_MAX. 645 * The count on the returned rdataset however can't be as 646 * that indicates that the database does not implement cyclic 647 * processing. 648 */ 649static unsigned int init_count; 650 651/* 652 * Locking 653 * 654 * If a routine is going to lock more than one lock in this module, then 655 * the locking must be done in the following order: 656 * 657 * Tree Lock 658 * 659 * Node Lock (Only one from the set may be locked at one time by 660 * any caller) 661 * 662 * Database Lock 663 * 664 * Failure to follow this hierarchy can result in deadlock. 665 */ 666 667/* 668 * Deleting Nodes 669 * 670 * For zone databases the node for the origin of the zone MUST NOT be deleted. 
671 */ 672 673 674/* 675 * DB Routines 676 */ 677 678static void 679attach(dns_db_t *source, dns_db_t **targetp) { 680 dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)source; 681 682 REQUIRE(VALID_RBTDB(rbtdb)); 683 684 isc_refcount_increment(&rbtdb->references, NULL); 685 686 *targetp = source; 687} 688 689static void 690free_rbtdb_callback(isc_task_t *task, isc_event_t *event) { 691 dns_rbtdb_t *rbtdb = event->ev_arg; 692 693 UNUSED(task); 694 695 free_rbtdb(rbtdb, ISC_TRUE, event); 696} 697 698static void 699update_rrsetstats(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, 700 isc_boolean_t increment) 701{ 702 dns_rdatastatstype_t statattributes = 0; 703 dns_rdatastatstype_t base = 0; 704 dns_rdatastatstype_t type; 705 706 /* At the moment we count statistics only for cache DB */ 707 INSIST(IS_CACHE(rbtdb)); 708 709 if (NEGATIVE(header)) { 710 if (NXDOMAIN(header)) 711 statattributes = DNS_RDATASTATSTYPE_ATTR_NXDOMAIN; 712 else { 713 statattributes = DNS_RDATASTATSTYPE_ATTR_NXRRSET; 714 base = RBTDB_RDATATYPE_EXT(header->type); 715 } 716 } else 717 base = RBTDB_RDATATYPE_BASE(header->type); 718 719 type = DNS_RDATASTATSTYPE_VALUE(base, statattributes); 720 if (increment) 721 dns_rdatasetstats_increment(rbtdb->rrsetstats, type); 722 else 723 dns_rdatasetstats_decrement(rbtdb->rrsetstats, type); 724} 725 726static void 727set_ttl(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, dns_ttl_t newttl) { 728 int idx; 729 isc_heap_t *heap; 730 dns_ttl_t oldttl; 731 732 oldttl = header->rdh_ttl; 733 header->rdh_ttl = newttl; 734 735 if (!IS_CACHE(rbtdb)) 736 return; 737 738 /* 739 * It's possible the rbtdb is not a cache. If this is the case, 740 * we will not have a heap, and we move on. If we do, though, 741 * we might need to adjust things. 
742 */ 743 if (header->heap_index == 0 || newttl == oldttl) 744 return; 745 idx = header->node->locknum; 746 if (rbtdb->heaps == NULL || rbtdb->heaps[idx] == NULL) 747 return; 748 heap = rbtdb->heaps[idx]; 749 750 if (newttl < oldttl) 751 isc_heap_increased(heap, header->heap_index); 752 else 753 isc_heap_decreased(heap, header->heap_index); 754} 755 756/*% 757 * These functions allow the heap code to rank the priority of each 758 * element. It returns ISC_TRUE if v1 happens "sooner" than v2. 759 */ 760static isc_boolean_t 761ttl_sooner(void *v1, void *v2) { 762 rdatasetheader_t *h1 = v1; 763 rdatasetheader_t *h2 = v2; 764 765 if (h1->rdh_ttl < h2->rdh_ttl) 766 return (ISC_TRUE); 767 return (ISC_FALSE); 768} 769 770static isc_boolean_t 771resign_sooner(void *v1, void *v2) { 772 rdatasetheader_t *h1 = v1; 773 rdatasetheader_t *h2 = v2; 774 775 if (h1->resign < h2->resign) 776 return (ISC_TRUE); 777 return (ISC_FALSE); 778} 779 780/*% 781 * This function sets the heap index into the header. 782 */ 783static void 784set_index(void *what, unsigned int index) { 785 rdatasetheader_t *h = what; 786 787 h->heap_index = index; 788} 789 790/*% 791 * Work out how many nodes can be deleted in the time between two 792 * requests to the nameserver. Smooth the resulting number and use it 793 * as a estimate for the number of nodes to be deleted in the next 794 * iteration. 795 */ 796static unsigned int 797adjust_quantum(unsigned int old, isc_time_t *start) { 798 unsigned int pps = dns_pps; /* packets per second */ 799 unsigned int interval; 800 isc_uint64_t usecs; 801 isc_time_t end; 802 unsigned int new; 803 804 if (pps < 100) 805 pps = 100; 806 isc_time_now(&end); 807 808 interval = 1000000 / pps; /* interval in usec */ 809 if (interval == 0) 810 interval = 1; 811 usecs = isc_time_microdiff(&end, start); 812 if (usecs == 0) { 813 /* 814 * We were unable to measure the amount of time taken. 815 * Double the nodes deleted next time. 
816 */ 817 old *= 2; 818 if (old > 1000) 819 old = 1000; 820 return (old); 821 } 822 new = old * interval; 823 new /= (unsigned int)usecs; 824 if (new == 0) 825 new = 1; 826 else if (new > 1000) 827 new = 1000; 828 829 /* Smooth */ 830 new = (new + old * 3) / 4; 831 832 isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE, DNS_LOGMODULE_CACHE, 833 ISC_LOG_DEBUG(1), "adjust_quantum -> %d", new); 834 835 return (new); 836} 837 838static void 839free_rbtdb(dns_rbtdb_t *rbtdb, isc_boolean_t log, isc_event_t *event) { 840 unsigned int i; 841 isc_ondestroy_t ondest; 842 isc_result_t result; 843 char buf[DNS_NAME_FORMATSIZE]; 844 dns_rbt_t **treep; 845 isc_time_t start; 846 847 if (IS_CACHE(rbtdb) && rbtdb->common.rdclass == dns_rdataclass_in) 848 overmem((dns_db_t *)rbtdb, (isc_boolean_t)-1); 849 850 REQUIRE(rbtdb->current_version != NULL || EMPTY(rbtdb->open_versions)); 851 REQUIRE(rbtdb->future_version == NULL); 852 853 if (rbtdb->current_version != NULL) { 854 unsigned int refs; 855 856 isc_refcount_decrement(&rbtdb->current_version->references, 857 &refs); 858 INSIST(refs == 0); 859 UNLINK(rbtdb->open_versions, rbtdb->current_version, link); 860 isc_refcount_destroy(&rbtdb->current_version->references); 861 isc_mem_put(rbtdb->common.mctx, rbtdb->current_version, 862 sizeof(rbtdb_version_t)); 863 } 864 865 /* 866 * We assume the number of remaining dead nodes is reasonably small; 867 * the overhead of unlinking all nodes here should be negligible. 868 */ 869 for (i = 0; i < rbtdb->node_lock_count; i++) { 870 dns_rbtnode_t *node; 871 872 node = ISC_LIST_HEAD(rbtdb->deadnodes[i]); 873 while (node != NULL) { 874 ISC_LIST_UNLINK(rbtdb->deadnodes[i], node, deadlink); 875 node = ISC_LIST_HEAD(rbtdb->deadnodes[i]); 876 } 877 } 878 879 if (event == NULL) 880 rbtdb->quantum = (rbtdb->task != NULL) ? 
100 : 0; 881 882 for (;;) { 883 /* 884 * pick the next tree to (start to) destroy 885 */ 886 treep = &rbtdb->tree; 887 if (*treep == NULL) { 888 treep = &rbtdb->nsec; 889 if (*treep == NULL) { 890 treep = &rbtdb->nsec3; 891 /* 892 * we're finished after clear cutting 893 */ 894 if (*treep == NULL) 895 break; 896 } 897 } 898 899 isc_time_now(&start); 900 result = dns_rbt_destroy2(treep, rbtdb->quantum); 901 if (result == ISC_R_QUOTA) { 902 INSIST(rbtdb->task != NULL); 903 if (rbtdb->quantum != 0) 904 rbtdb->quantum = adjust_quantum(rbtdb->quantum, 905 &start); 906 if (event == NULL) 907 event = isc_event_allocate(rbtdb->common.mctx, 908 NULL, 909 DNS_EVENT_FREESTORAGE, 910 free_rbtdb_callback, 911 rbtdb, 912 sizeof(isc_event_t)); 913 if (event == NULL) 914 continue; 915 isc_task_send(rbtdb->task, &event); 916 return; 917 } 918 INSIST(result == ISC_R_SUCCESS && *treep == NULL); 919 } 920 921 if (event != NULL) 922 isc_event_free(&event); 923 if (log) { 924 if (dns_name_dynamic(&rbtdb->common.origin)) 925 dns_name_format(&rbtdb->common.origin, buf, 926 sizeof(buf)); 927 else 928 strcpy(buf, "<UNKNOWN>"); 929 isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE, 930 DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1), 931 "done free_rbtdb(%s)", buf); 932 } 933 if (dns_name_dynamic(&rbtdb->common.origin)) 934 dns_name_free(&rbtdb->common.origin, rbtdb->common.mctx); 935 for (i = 0; i < rbtdb->node_lock_count; i++) { 936 isc_refcount_destroy(&rbtdb->node_locks[i].references); 937 NODE_DESTROYLOCK(&rbtdb->node_locks[i].lock); 938 } 939 940 /* 941 * Clean up LRU / re-signing order lists. 942 */ 943 if (rbtdb->rdatasets != NULL) { 944 for (i = 0; i < rbtdb->node_lock_count; i++) 945 INSIST(ISC_LIST_EMPTY(rbtdb->rdatasets[i])); 946 isc_mem_put(rbtdb->common.mctx, rbtdb->rdatasets, 947 rbtdb->node_lock_count * 948 sizeof(rdatasetheaderlist_t)); 949 } 950 /* 951 * Clean up dead node buckets. 
952 */ 953 if (rbtdb->deadnodes != NULL) { 954 for (i = 0; i < rbtdb->node_lock_count; i++) 955 INSIST(ISC_LIST_EMPTY(rbtdb->deadnodes[i])); 956 isc_mem_put(rbtdb->common.mctx, rbtdb->deadnodes, 957 rbtdb->node_lock_count * sizeof(rbtnodelist_t)); 958 } 959 /* 960 * Clean up heap objects. 961 */ 962 if (rbtdb->heaps != NULL) { 963 for (i = 0; i < rbtdb->node_lock_count; i++) 964 isc_heap_destroy(&rbtdb->heaps[i]); 965 isc_mem_put(rbtdb->hmctx, rbtdb->heaps, 966 rbtdb->node_lock_count * sizeof(isc_heap_t *)); 967 } 968 969 if (rbtdb->rrsetstats != NULL) 970 dns_stats_detach(&rbtdb->rrsetstats); 971 972#ifdef BIND9 973 if (rbtdb->rpz_cidr != NULL) 974 dns_rpz_cidr_free(&rbtdb->rpz_cidr); 975#endif 976 977 isc_mem_put(rbtdb->common.mctx, rbtdb->node_locks, 978 rbtdb->node_lock_count * sizeof(rbtdb_nodelock_t)); 979 isc_rwlock_destroy(&rbtdb->tree_lock); 980 isc_refcount_destroy(&rbtdb->references); 981 if (rbtdb->task != NULL) 982 isc_task_detach(&rbtdb->task); 983 984 RBTDB_DESTROYLOCK(&rbtdb->lock); 985 rbtdb->common.magic = 0; 986 rbtdb->common.impmagic = 0; 987 ondest = rbtdb->common.ondest; 988 isc_mem_detach(&rbtdb->hmctx); 989 isc_mem_putanddetach(&rbtdb->common.mctx, rbtdb, sizeof(*rbtdb)); 990 isc_ondestroy_notify(&ondest, rbtdb); 991} 992 993static inline void 994maybe_free_rbtdb(dns_rbtdb_t *rbtdb) { 995 isc_boolean_t want_free = ISC_FALSE; 996 unsigned int i; 997 unsigned int inactive = 0; 998 999 /* XXX check for open versions here */ 1000 1001 if (rbtdb->soanode != NULL) 1002 dns_db_detachnode((dns_db_t *)rbtdb, &rbtdb->soanode); 1003 if (rbtdb->nsnode != NULL) 1004 dns_db_detachnode((dns_db_t *)rbtdb, &rbtdb->nsnode); 1005 1006 /* 1007 * Even though there are no external direct references, there still 1008 * may be nodes in use. 
 */
	/*
	 * Mark every node-lock bucket as exiting so no new node
	 * references will keep the bucket alive, and count the buckets
	 * that are already inactive (no outstanding node references).
	 */
	for (i = 0; i < rbtdb->node_lock_count; i++) {
		NODE_LOCK(&rbtdb->node_locks[i].lock, isc_rwlocktype_write);
		rbtdb->node_locks[i].exiting = ISC_TRUE;
		NODE_UNLOCK(&rbtdb->node_locks[i].lock, isc_rwlocktype_write);
		if (isc_refcount_current(&rbtdb->node_locks[i].references)
		    == 0) {
			inactive++;
		}
	}

	/*
	 * If any buckets went inactive, debit them from the active count;
	 * when the count reaches zero the whole database can be freed.
	 */
	if (inactive != 0) {
		RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
		rbtdb->active -= inactive;
		if (rbtdb->active == 0)
			want_free = ISC_TRUE;
		RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
		if (want_free) {
			char buf[DNS_NAME_FORMATSIZE];
			if (dns_name_dynamic(&rbtdb->common.origin))
				dns_name_format(&rbtdb->common.origin, buf,
						sizeof(buf));
			else
				strcpy(buf, "<UNKNOWN>");
			isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
				      DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
				      "calling free_rbtdb(%s)", buf);
			free_rbtdb(rbtdb, ISC_TRUE, NULL);
		}
	}
}

/*
 * dns_db detach method: drop one reference to the database and
 * trigger teardown when the last reference goes away.  '*dbp' is
 * always set to NULL on return.
 */
static void
detach(dns_db_t **dbp) {
	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)(*dbp);
	unsigned int refs;

	REQUIRE(VALID_RBTDB(rbtdb));

	isc_refcount_decrement(&rbtdb->references, &refs);

	if (refs == 0)
		maybe_free_rbtdb(rbtdb);

	*dbp = NULL;
}

/*
 * Return (via '*versionp') a new reference to the database's current
 * committed version.  The DB read lock protects the read of
 * 'current_version' against a concurrent closeversion/commit.
 */
static void
currentversion(dns_db_t *db, dns_dbversion_t **versionp) {
	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
	rbtdb_version_t *version;
	unsigned int refs;

	REQUIRE(VALID_RBTDB(rbtdb));

	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read);
	version = rbtdb->current_version;
	isc_refcount_increment(&version->references, &refs);
	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read);

	*versionp = (dns_dbversion_t *)version;
}

/*
 * Allocate and minimally initialize a version structure with the given
 * serial number, starting reference count and writer flag.  Returns
 * NULL on allocation (or refcount-init) failure; commit_ok starts out
 * FALSE and must be set by the caller once the version is usable.
 */
static inline rbtdb_version_t *
allocate_version(isc_mem_t *mctx, rbtdb_serial_t serial,
		 unsigned int references, isc_boolean_t writer)
{
	isc_result_t result;
	rbtdb_version_t *version;

	version = isc_mem_get(mctx, sizeof(*version));
	if (version == NULL)
		return (NULL);
	version->serial = serial;
	result = isc_refcount_init(&version->references, references);
	if (result != ISC_R_SUCCESS) {
		isc_mem_put(mctx, version, sizeof(*version));
		return (NULL);
	}
	version->writer = writer;
	version->commit_ok = ISC_FALSE;
	ISC_LIST_INIT(version->changed_list);
	ISC_LIST_INIT(version->resigned_list);
	ISC_LINK_INIT(version, link);

	return (version);
}

/*
 * dns_db newversion method: create the (single) future writable
 * version of the database, inheriting security/NSEC3 parameters from
 * the current version.  Only one future version may exist at a time
 * (REQUIRE below).
 */
static isc_result_t
newversion(dns_db_t *db, dns_dbversion_t **versionp) {
	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
	rbtdb_version_t *version;

	REQUIRE(VALID_RBTDB(rbtdb));
	REQUIRE(versionp != NULL && *versionp == NULL);
	REQUIRE(rbtdb->future_version == NULL);

	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);
	RUNTIME_CHECK(rbtdb->next_serial != 0);		/* XXX Error? */
	version = allocate_version(rbtdb->common.mctx, rbtdb->next_serial, 1,
				   ISC_TRUE);
	if (version != NULL) {
		version->rbtdb = rbtdb;
		version->commit_ok = ISC_TRUE;
		version->secure = rbtdb->current_version->secure;
		version->havensec3 = rbtdb->current_version->havensec3;
		if (version->havensec3) {
			/* Copy the NSEC3 parameters from the current
			 * version. */
			version->flags = rbtdb->current_version->flags;
			version->iterations =
				rbtdb->current_version->iterations;
			version->hash = rbtdb->current_version->hash;
			version->salt_length =
				rbtdb->current_version->salt_length;
			memcpy(version->salt, rbtdb->current_version->salt,
			       version->salt_length);
		} else {
			version->flags = 0;
			version->iterations = 0;
			version->hash = 0;
			version->salt_length = 0;
			memset(version->salt, 0, sizeof(version->salt));
		}
		rbtdb->next_serial++;
		rbtdb->future_version = version;
	}
	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);

	if (version == NULL)
		return (ISC_R_NOMEMORY);

	*versionp = version;

	return (ISC_R_SUCCESS);
}

/*
 * dns_db attachversion method: take an additional reference to
 * 'source' and return it via '*targetp'.
 */
static void
attachversion(dns_db_t *db, dns_dbversion_t *source,
	      dns_dbversion_t **targetp)
{
	dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
	rbtdb_version_t *rbtversion = source;
	unsigned int refs;

	REQUIRE(VALID_RBTDB(rbtdb));
	INSIST(rbtversion != NULL && rbtversion->rbtdb == rbtdb);

	isc_refcount_increment(&rbtversion->references, &refs);
	INSIST(refs > 1);	/* the caller already held a reference */

	*targetp = rbtversion;
}

/*
 * Record 'node' on the writer version's changed list, taking a node
 * reference that is released when the version is closed.  On
 * allocation failure the version is marked not commit_ok so the
 * transaction will be rolled back.  Returns the new changed entry, or
 * NULL on failure.
 */
static rbtdb_changed_t *
add_changed(dns_rbtdb_t *rbtdb, rbtdb_version_t *version,
	    dns_rbtnode_t *node)
{
	rbtdb_changed_t *changed;
	unsigned int refs;

	/*
	 * Caller must be holding the node lock if its reference must be
	 * protected by the lock.
	 */

	changed = isc_mem_get(rbtdb->common.mctx, sizeof(*changed));

	RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write);

	REQUIRE(version->writer);

	if (changed != NULL) {
		dns_rbtnode_refincrement(node, &refs);
		INSIST(refs != 0);
		changed->node = node;
		changed->dirty = ISC_FALSE;
		ISC_LIST_INITANDAPPEND(version->changed_list, changed, link);
	} else
		version->commit_ok = ISC_FALSE;

	RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);

	return (changed);
}

/*
 * Free an additional-section cache (acache) control array that hangs
 * off a rdataset header.  The element count is taken from the first
 * two bytes of the rdataslab that follows the header (big-endian
 * record count).
 */
static void
free_acachearray(isc_mem_t *mctx, rdatasetheader_t *header,
		 acachectl_t *array)
{
	unsigned int count;
	unsigned int i;
	unsigned char *raw;	/* RDATASLAB */

	/*
	 * The caller must be holding the corresponding node lock.
	 */

	if (array == NULL)
		return;

	raw = (unsigned char *)header + sizeof(*header);
	count = raw[0] * 256 + raw[1];

	/*
	 * Sanity check: since an additional cache entry has a reference to
	 * the original DB node (in the callback arg), there should be no
	 * acache entries when the node can be freed.
	 */
	for (i = 0; i < count; i++)
		INSIST(array[i].entry == NULL && array[i].cbarg == NULL);

	isc_mem_put(mctx, array, count * sizeof(acachectl_t));
}

/*
 * Free a noqname/closest-encloser proof structure and NULL out the
 * caller's pointer.
 */
static inline void
free_noqname(isc_mem_t *mctx, struct noqname **noqname) {

	if (dns_name_dynamic(&(*noqname)->name))
		dns_name_free(&(*noqname)->name, mctx);
	if ((*noqname)->neg != NULL)
		isc_mem_put(mctx, (*noqname)->neg,
			    dns_rdataslab_size((*noqname)->neg, 0));
	if ((*noqname)->negsig != NULL)
		isc_mem_put(mctx, (*noqname)->negsig,
			    dns_rdataslab_size((*noqname)->negsig, 0));
	isc_mem_put(mctx, *noqname, sizeof(**noqname));
	*noqname = NULL;
}

/*
 * Initialize the LRU link and heap index of a freshly allocated
 * rdataset header.
 */
static inline void
init_rdataset(dns_rbtdb_t *rbtdb, rdatasetheader_t *h)
{
	ISC_LINK_INIT(h, link);
	h->heap_index = 0;	/* 0 == not in a TTL heap */

#if TRACE_HEADER
	if (IS_CACHE(rbtdb) && rbtdb->common.rdclass == dns_rdataclass_in)
		fprintf(stderr, "initialized header: %p\n", h);
#else
	UNUSED(rbtdb);
#endif
}

/*
 * Allocate and initialize a rdataset header.  Returns NULL on
 * allocation failure.
 */
static inline rdatasetheader_t *
new_rdataset(dns_rbtdb_t *rbtdb, isc_mem_t *mctx)
{
	rdatasetheader_t *h;

	h = isc_mem_get(mctx, sizeof(*h));
	if (h == NULL)
		return (NULL);

#if TRACE_HEADER
	if (IS_CACHE(rbtdb) && rbtdb->common.rdclass == dns_rdataclass_in)
		fprintf(stderr, "allocated header: %p\n", h);
#endif
	init_rdataset(rbtdb, h);
	return (h);
}

/*
 * Fully release a rdataset header: update rrset statistics, unlink it
 * from the cache LRU list and TTL heap, free any attached proofs and
 * acache arrays, and return its memory (header only for NONEXISTENT
 * entries, header plus rdataslab otherwise).
 */
static inline void
free_rdataset(dns_rbtdb_t *rbtdb, isc_mem_t *mctx, rdatasetheader_t *rdataset)
{
	unsigned int size;
	int idx;

	if (EXISTS(rdataset) &&
	    (rdataset->attributes & RDATASET_ATTR_STATCOUNT) != 0) {
		update_rrsetstats(rbtdb, rdataset, ISC_FALSE);
	}

	idx = rdataset->node->locknum;
	if (ISC_LINK_LINKED(rdataset, link)) {
		/* Only cache databases keep headers on LRU lists. */
		INSIST(IS_CACHE(rbtdb));
		ISC_LIST_UNLINK(rbtdb->rdatasets[idx], rdataset, link);
	}
	if (rdataset->heap_index != 0)
		isc_heap_delete(rbtdb->heaps[idx], rdataset->heap_index);
	rdataset->heap_index = 0;

	if (rdataset->noqname != NULL)
		free_noqname(mctx, &rdataset->noqname);
	if (rdataset->closest != NULL)
		free_noqname(mctx, &rdataset->closest);

	free_acachearray(mctx, rdataset, rdataset->additional_auth);
	free_acachearray(mctx, rdataset, rdataset->additional_glue);

	if ((rdataset->attributes & RDATASET_ATTR_NONEXISTENT) != 0)
		size = sizeof(*rdataset);
	else
		size = dns_rdataslab_size((unsigned char *)rdataset,
					  sizeof(*rdataset));
	isc_mem_put(mctx, rdataset, size);
}

/*
 * Undo all changes made to 'node' under serial number 'serial' by
 * marking the matching rdataset headers IGNORE; the node is flagged
 * dirty so the ignored headers are reclaimed later.
 */
static inline void
rollback_node(dns_rbtnode_t *node, rbtdb_serial_t serial) {
	rdatasetheader_t *header, *dcurrent;
	isc_boolean_t make_dirty = ISC_FALSE;

	/*
	 * Caller must hold the node lock.
	 */

	/*
	 * We set the IGNORE attribute on rdatasets with serial number
	 * 'serial'.  When the reference count goes to zero, these rdatasets
	 * will be cleaned up; until that time, they will be ignored.
	 */
	for (header = node->data; header != NULL; header = header->next) {
		if (header->serial == serial) {
			header->attributes |= RDATASET_ATTR_IGNORE;
			make_dirty = ISC_TRUE;
		}
		/* Older versions hang off the 'down' pointer. */
		for (dcurrent = header->down;
		     dcurrent != NULL;
		     dcurrent = dcurrent->down) {
			if (dcurrent->serial == serial) {
				dcurrent->attributes |= RDATASET_ATTR_IGNORE;
				make_dirty = ISC_TRUE;
			}
		}
	}
	if (make_dirty)
		node->dirty = 1;
}

/*
 * Free every stale (older-version) header hanging off 'top' via its
 * 'down' chain, leaving only 'top' itself.
 */
static inline void
clean_stale_headers(dns_rbtdb_t *rbtdb, isc_mem_t *mctx, rdatasetheader_t *top)
{
	rdatasetheader_t *d, *down_next;

	for (d = top->down; d != NULL; d = down_next) {
		down_next = d->down;
		free_rdataset(rbtdb, mctx, d);
	}
	top->down = NULL;
}

/*
 * Remove all stale or nonexistent rdataset headers from a cache node
 * and clear its dirty flag.
 */
static inline void
clean_cache_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node) {
	rdatasetheader_t *current, *top_prev, *top_next;
	isc_mem_t *mctx = rbtdb->common.mctx;

	/*
	 * Caller must be holding the node lock.
	 */

	top_prev = NULL;
	for (current = node->data; current != NULL; current = top_next) {
		top_next = current->next;
		clean_stale_headers(rbtdb, mctx, current);
		/*
		 * If current is nonexistent or stale, we can clean it up.
		 */
		if ((current->attributes &
		     (RDATASET_ATTR_NONEXISTENT|RDATASET_ATTR_STALE)) != 0) {
			/* Unlink 'current' from the node's header list. */
			if (top_prev != NULL)
				top_prev->next = current->next;
			else
				node->data = current->next;
			free_rdataset(rbtdb, mctx, current);
		} else
			top_prev = current;
	}
	node->dirty = 0;
}

/*
 * Clean up a zone node after a commit or rollback: collapse duplicate
 * and IGNOREd versions of each rdataset and discard versions older
 * than 'least_serial' (the oldest serial any open version can still
 * see).  The node's dirty flag is cleared unless some headers still
 * carry older versions that must be kept.
 */
static inline void
clean_zone_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
		rbtdb_serial_t least_serial)
{
	rdatasetheader_t *current, *dcurrent, *down_next, *dparent;
	rdatasetheader_t *top_prev, *top_next;
	isc_mem_t *mctx = rbtdb->common.mctx;
	isc_boolean_t still_dirty = ISC_FALSE;

	/*
	 * Caller must be holding the node lock.
	 */
	REQUIRE(least_serial != 0);

	top_prev = NULL;
	for (current = node->data; current != NULL; current = top_next) {
		top_next = current->next;

		/*
		 * First, we clean up any instances of multiple rdatasets
		 * with the same serial number, or that have the IGNORE
		 * attribute.
		 */
		dparent = current;
		for (dcurrent = current->down;
		     dcurrent != NULL;
		     dcurrent = down_next) {
			down_next = dcurrent->down;
			INSIST(dcurrent->serial <= dparent->serial);
			if (dcurrent->serial == dparent->serial ||
			    IGNORE(dcurrent)) {
				/* NOTE: 'next' of a down-chain entry points
				 * back up to its parent header. */
				if (down_next != NULL)
					down_next->next = dparent;
				dparent->down = down_next;
				free_rdataset(rbtdb, mctx, dcurrent);
			} else
				dparent = dcurrent;
		}

		/*
		 * We've now eliminated all IGNORE datasets with the possible
		 * exception of current, which we now check.
		 */
		if (IGNORE(current)) {
			down_next = current->down;
			if (down_next == NULL) {
				if (top_prev != NULL)
					top_prev->next = current->next;
				else
					node->data = current->next;
				free_rdataset(rbtdb, mctx, current);
				/*
				 * current no longer exists, so we can
				 * just continue with the loop.
				 */
				continue;
			} else {
				/*
				 * Pull up current->down, making it the new
				 * current.
				 */
				if (top_prev != NULL)
					top_prev->next = down_next;
				else
					node->data = down_next;
				down_next->next = top_next;
				free_rdataset(rbtdb, mctx, current);
				current = down_next;
			}
		}

		/*
		 * We now try to find the first down node less than the
		 * least serial.
		 */
		dparent = current;
		for (dcurrent = current->down;
		     dcurrent != NULL;
		     dcurrent = down_next) {
			down_next = dcurrent->down;
			if (dcurrent->serial < least_serial)
				break;
			dparent = dcurrent;
		}

		/*
		 * If there is a such an rdataset, delete it and any older
		 * versions.
		 */
		if (dcurrent != NULL) {
			do {
				down_next = dcurrent->down;
				INSIST(dcurrent->serial <= least_serial);
				free_rdataset(rbtdb, mctx, dcurrent);
				dcurrent = down_next;
			} while (dcurrent != NULL);
			dparent->down = NULL;
		}

		/*
		 * Note.  The serial number of 'current' might be less than
		 * least_serial too, but we cannot delete it because it is
		 * the most recent version, unless it is a NONEXISTENT
		 * rdataset.
		 */
		if (current->down != NULL) {
			still_dirty = ISC_TRUE;
			top_prev = current;
		} else {
			/*
			 * If this is a NONEXISTENT rdataset, we can delete it.
1488 */ 1489 if (NONEXISTENT(current)) { 1490 if (top_prev != NULL) 1491 top_prev->next = current->next; 1492 else 1493 node->data = current->next; 1494 free_rdataset(rbtdb, mctx, current); 1495 } else 1496 top_prev = current; 1497 } 1498 } 1499 if (!still_dirty) 1500 node->dirty = 0; 1501} 1502 1503static void 1504delete_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node) 1505{ 1506 dns_rbtnode_t *nsecnode; 1507 dns_fixedname_t fname; 1508 dns_name_t *name; 1509 isc_result_t result = ISC_R_UNEXPECTED; 1510 1511 INSIST(!ISC_LINK_LINKED(node, deadlink)); 1512 1513 switch (node->nsec) { 1514 case DNS_RBT_NSEC_NORMAL: 1515#ifdef BIND9 1516 if (rbtdb->rpz_cidr != NULL) { 1517 dns_fixedname_init(&fname); 1518 name = dns_fixedname_name(&fname); 1519 dns_rbt_fullnamefromnode(node, name); 1520 dns_rpz_cidr_deleteip(rbtdb->rpz_cidr, name); 1521 } 1522#endif 1523 result = dns_rbt_deletenode(rbtdb->tree, node, ISC_FALSE); 1524 break; 1525 case DNS_RBT_NSEC_HAS_NSEC: 1526 dns_fixedname_init(&fname); 1527 name = dns_fixedname_name(&fname); 1528 dns_rbt_fullnamefromnode(node, name); 1529 /* 1530 * Delete the corresponding node from the auxiliary NSEC 1531 * tree before deleting from the main tree. 
1532 */ 1533 nsecnode = NULL; 1534 result = dns_rbt_findnode(rbtdb->nsec, name, NULL, &nsecnode, 1535 NULL, DNS_RBTFIND_EMPTYDATA, 1536 NULL, NULL); 1537 if (result != ISC_R_SUCCESS) { 1538 isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE, 1539 DNS_LOGMODULE_CACHE, ISC_LOG_WARNING, 1540 "delete_node: " 1541 "dns_rbt_findnode(nsec): %s", 1542 isc_result_totext(result)); 1543 } else { 1544 result = dns_rbt_deletenode(rbtdb->nsec, nsecnode, 1545 ISC_FALSE); 1546 if (result != ISC_R_SUCCESS) { 1547 isc_log_write(dns_lctx, 1548 DNS_LOGCATEGORY_DATABASE, 1549 DNS_LOGMODULE_CACHE, 1550 ISC_LOG_WARNING, 1551 "delete_nsecnode(): " 1552 "dns_rbt_deletenode(nsecnode): %s", 1553 isc_result_totext(result)); 1554 } 1555 } 1556 result = dns_rbt_deletenode(rbtdb->tree, node, ISC_FALSE); 1557#ifdef BIND9 1558 dns_rpz_cidr_deleteip(rbtdb->rpz_cidr, name); 1559#endif 1560 break; 1561 case DNS_RBT_NSEC_NSEC: 1562 result = dns_rbt_deletenode(rbtdb->nsec, node, ISC_FALSE); 1563 break; 1564 case DNS_RBT_NSEC_NSEC3: 1565 result = dns_rbt_deletenode(rbtdb->nsec3, node, ISC_FALSE); 1566 break; 1567 } 1568 if (result != ISC_R_SUCCESS) { 1569 isc_log_write(dns_lctx, 1570 DNS_LOGCATEGORY_DATABASE, 1571 DNS_LOGMODULE_CACHE, 1572 ISC_LOG_WARNING, 1573 "delete_nsecnode(): " 1574 "dns_rbt_deletenode: %s", 1575 isc_result_totext(result)); 1576 } 1577} 1578 1579/*% 1580 * Clean up dead nodes. These are nodes which have no references, and 1581 * have no data. They are dead but we could not or chose not to delete 1582 * them when we deleted all the data at that node because we did not want 1583 * to wait for the tree write lock. 1584 * 1585 * The caller must hold a tree write lock and bucketnum'th node (write) lock. 
 */
static void
cleanup_dead_nodes(dns_rbtdb_t *rbtdb, int bucketnum) {
	dns_rbtnode_t *node;
	int count = 10;		/* XXXJT: should be adjustable */

	/* Delete at most 'count' nodes per call to bound the time spent
	 * while holding the tree write lock. */
	node = ISC_LIST_HEAD(rbtdb->deadnodes[bucketnum]);
	while (node != NULL && count > 0) {
		ISC_LIST_UNLINK(rbtdb->deadnodes[bucketnum], node, deadlink);

		/*
		 * Since we're holding a tree write lock, it should be
		 * impossible for this node to be referenced by others.
		 */
		INSIST(dns_rbtnode_refcurrent(node) == 0 &&
		       node->data == NULL);

		delete_node(rbtdb, node);

		node = ISC_LIST_HEAD(rbtdb->deadnodes[bucketnum]);
		count--;
	}
}

/*
 * Caller must be holding the node lock.
 *
 * Add a reference to 'node'; the first reference to a node also takes
 * a reference on its node-lock bucket, which keeps the bucket (and
 * database) alive.
 */
static inline void
new_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node) {
	unsigned int lockrefs, noderefs;
	isc_refcount_t *lockref;

	INSIST(!ISC_LINK_LINKED(node, deadlink));
	dns_rbtnode_refincrement0(node, &noderefs);
	if (noderefs == 1) {	/* this is the first reference to the node */
		lockref = &rbtdb->node_locks[node->locknum].references;
		isc_refcount_increment0(lockref, &lockrefs);
		INSIST(lockrefs != 0);
	}
	INSIST(noderefs != 0);
}

/*
 * This function is assumed to be called when a node is newly referenced
 * and can be in the deadnode list.  In that case the node must be retrieved
 * from the list because it is going to be used.  In addition, if the caller
 * happens to hold a write lock on the tree, it's a good chance to purge dead
 * nodes.
 * Note: while a new reference is gained in multiple places, there are only very
 * few cases where the node can be in the deadnode list (only empty nodes can
 * have been added to the list).
 */
static inline void
reactivate_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
		isc_rwlocktype_t treelocktype)
{
	isc_rwlocktype_t locktype = isc_rwlocktype_read;
	nodelock_t *nodelock = &rbtdb->node_locks[node->locknum].lock;
	isc_boolean_t maybe_cleanup = ISC_FALSE;

	POST(locktype);

	NODE_STRONGLOCK(nodelock);
	NODE_WEAKLOCK(nodelock, locktype);

	/*
	 * Check if we can possibly cleanup the dead node.  If so, upgrade
	 * the node lock below to perform the cleanup.
	 */
	if (!ISC_LIST_EMPTY(rbtdb->deadnodes[node->locknum]) &&
	    treelocktype == isc_rwlocktype_write) {
		maybe_cleanup = ISC_TRUE;
	}

	if (ISC_LINK_LINKED(node, deadlink) || maybe_cleanup) {
		/*
		 * Upgrade the lock and test if we still need to unlink.
		 * (Another thread may have unlinked the node while the
		 * weak lock was released.)
		 */
		NODE_WEAKUNLOCK(nodelock, locktype);
		locktype = isc_rwlocktype_write;
		POST(locktype);
		NODE_WEAKLOCK(nodelock, locktype);
		if (ISC_LINK_LINKED(node, deadlink))
			ISC_LIST_UNLINK(rbtdb->deadnodes[node->locknum],
					node, deadlink);
		if (maybe_cleanup)
			cleanup_dead_nodes(rbtdb, node->locknum);
	}

	new_reference(rbtdb, node);

	NODE_WEAKUNLOCK(nodelock, locktype);
	NODE_STRONGUNLOCK(nodelock);
}

/*
 * Caller must be holding the node lock; either the "strong", read or write
 * lock.  Note that the lock must be held even when node references are
 * atomically modified; in that case the decrement operation itself does not
 * have to be protected, but we must avoid a race condition where multiple
 * threads are decreasing the reference to zero simultaneously and at least
 * one of them is going to free the node.
 * This function returns ISC_TRUE if and only if the node reference decreases
 * to zero.
1690 */ 1691static isc_boolean_t 1692decrement_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, 1693 rbtdb_serial_t least_serial, 1694 isc_rwlocktype_t nlock, isc_rwlocktype_t tlock, 1695 isc_boolean_t pruning) 1696{ 1697 isc_result_t result; 1698 isc_boolean_t write_locked; 1699 rbtdb_nodelock_t *nodelock; 1700 unsigned int refs, nrefs; 1701 int bucket = node->locknum; 1702 isc_boolean_t no_reference = ISC_TRUE; 1703 1704 nodelock = &rbtdb->node_locks[bucket]; 1705 1706 /* Handle easy and typical case first. */ 1707 if (!node->dirty && (node->data != NULL || node->down != NULL)) { 1708 dns_rbtnode_refdecrement(node, &nrefs); 1709 INSIST((int)nrefs >= 0); 1710 if (nrefs == 0) { 1711 isc_refcount_decrement(&nodelock->references, &refs); 1712 INSIST((int)refs >= 0); 1713 } 1714 return ((nrefs == 0) ? ISC_TRUE : ISC_FALSE); 1715 } 1716 1717 /* Upgrade the lock? */ 1718 if (nlock == isc_rwlocktype_read) { 1719 NODE_WEAKUNLOCK(&nodelock->lock, isc_rwlocktype_read); 1720 NODE_WEAKL…
Large files files are truncated, but you can click here to view the full file