/drivers/md/raid5.c
C | 6022 lines | 4565 code | 661 blank | 796 comment | 1110 complexity | 55ed2e6a439dbba1eab15e8ff1006358 MD5 | raw file
Possible License(s): GPL-2.0, LGPL-2.0, AGPL-1.0
Large files are truncated, but you can click here to view the full file
/*
 * raid5.c : Multiple Devices driver for Linux
 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
 * Copyright (C) 1999, 2000 Ingo Molnar
 * Copyright (C) 2002, 2003 H. Peter Anvin
 *
 * RAID-4/5/6 management functions.
 * Thanks to Penguin Computing for making the RAID-6 development possible
 * by donating a test server!
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * BITMAP UNPLUGGING:
 *
 * The sequencing for updating the bitmap reliably is a little
 * subtle (and I got it wrong the first time) so it deserves some
 * explanation.
 *
 * We group bitmap updates into batches.  Each batch has a number.
 * We may write out several batches at once, but that isn't very important.
 * conf->seq_write is the number of the last batch successfully written.
 * conf->seq_flush is the number of the last batch that was closed to
 *    new additions.
 * When we discover that we will need to write to any block in a stripe
 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
 * the number of the batch it will be in. This is seq_flush+1.
 * When we are ready to do a write, if that batch hasn't been written yet,
 *   we plug the array and queue the stripe for later.
 * When an unplug happens, we increment bm_flush, thus closing the current
 *   batch.
 * When we notice that bm_flush > bm_write, we write out all pending updates
 * to the bitmap, and advance bm_write to where bm_flush was.
 * This may occasionally write a bit out twice, but is sure never to
 * miss any bits.
 */

#include <linux/blkdev.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/async_tx.h>
#include <linux/async.h>
#include <linux/seq_file.h>
#include <linux/cpu.h>
#include <linux/slab.h>
#include "md.h"
#include "raid5.h"
#include "raid0.h"
#include "bitmap.h"

/*
 * Stripe cache
 */

#define NR_STRIPES		256
#define STRIPE_SIZE		PAGE_SIZE
#define STRIPE_SHIFT		(PAGE_SHIFT - 9)
#define STRIPE_SECTORS		(STRIPE_SIZE>>9)
#define IO_THRESHOLD		1
#define BYPASS_THRESHOLD	1
#define NR_HASH			(PAGE_SIZE / sizeof(struct hlist_head))
#define HASH_MASK		(NR_HASH - 1)

/* Hash a stripe's starting sector into the per-array stripe hash table. */
#define stripe_hash(conf, sect)	(&((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK]))

/* bio's attached to a stripe+device for I/O are linked together in bi_sector
 * order without overlap.  There may be several bio's per stripe+device, and
 * a bio could span several devices.
 * When walking this list for a particular stripe+device, we must never proceed
 * beyond a bio that extends past this device, as the next bio might no longer
 * be valid.
 * This macro is used to determine the 'next' bio in the list, given the sector
 * of the current stripe+device
 */
#define r5_next_bio(bio, sect) ( ( (bio)->bi_sector + ((bio)->bi_size>>9) < sect + STRIPE_SECTORS) ? (bio)->bi_next : NULL)
/*
 * The following can be used to debug the driver
 */
#define RAID5_PARANOIA	1
#if RAID5_PARANOIA && defined(CONFIG_SMP)
# define CHECK_DEVLOCK() assert_spin_locked(&conf->device_lock)
#else
# define CHECK_DEVLOCK()
#endif

/* Under DEBUG, define 'inline' away so every function is out-of-line and
 * visible to the debugger. */
#ifdef DEBUG
#define inline
#define __inline__
#endif

/* Rate-limited printk: only emits when printk_ratelimit() allows it. */
#define printk_rl(args...) ((void) (printk_ratelimit() && printk(args)))
/*
 * We maintain a biased count of active stripes in the bottom 16 bits of
 * bi_phys_segments, and a count of processed stripes in the upper 16 bits
 */
static inline int raid5_bi_phys_segments(struct bio *bio)
{
	/* low 16 bits: biased count of active stripes referencing this bio */
	return bio->bi_phys_segments & 0xffff;
}

static inline int raid5_bi_hw_segments(struct bio *bio)
{
	/* high 16 bits: count of processed stripes */
	return (bio->bi_phys_segments >> 16) & 0xffff;
}

static inline int raid5_dec_bi_phys_segments(struct bio *bio)
{
	/* decrement low half only; returns the remaining active count */
	--bio->bi_phys_segments;
	return raid5_bi_phys_segments(bio);
}

static inline int raid5_dec_bi_hw_segments(struct bio *bio)
{
	unsigned short val = raid5_bi_hw_segments(bio);

	--val;
	bio->bi_phys_segments = (val << 16) | raid5_bi_phys_segments(bio);
	return val;
}

static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt)
{
	/* overwrite the high half with cnt, preserving the low half */
	bio->bi_phys_segments = raid5_bi_phys_segments(bio) | (cnt << 16);
}

/* Find first data disk in a raid6 stripe */
static inline int raid6_d0(struct stripe_head *sh)
{
	if (sh->ddf_layout)
		/* ddf always start from first device */
		return 0;
	/* md starts just after Q block */
	if (sh->qd_idx == sh->disks - 1)
		return 0;
	else
		return sh->qd_idx + 1;
}

/* Advance to the next disk index, wrapping around at raid_disks. */
static inline int raid6_next_disk(int disk, int raid_disks)
{
	disk++;
	return (disk < raid_disks) ? disk : 0;
}

/* When walking through the disks in a raid5, starting at raid6_d0,
 * We need to map each disk to a 'slot', where the data disks are slot
 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
 * is raid_disks-1. This help does that mapping.
 */
static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
			     int *count, int syndrome_disks)
{
	int slot = *count;

	/* in ddf layout P and Q occupy real slots, so count them too */
	if (sh->ddf_layout)
		(*count)++;
	if (idx == sh->pd_idx)
		return syndrome_disks;
	if (idx == sh->qd_idx)
		return syndrome_disks + 1;
	if (!sh->ddf_layout)
		(*count)++;
	return slot;
}

/* Complete a chain of bios: walk the bi_next list, zero each bio's
 * residual size and signal successful completion. */
static void return_io(struct bio *return_bi)
{
	struct bio *bi = return_bi;
	while (bi) {

		return_bi = bi->bi_next;
		bi->bi_next = NULL;
		bi->bi_size = 0;
		bio_endio(bi, 0);
		bi = return_bi;
	}
}

static void print_raid5_conf (raid5_conf_t *conf);

/* True while any asynchronous stripe operation (check, reconstruct,
 * biofill or compute) is still in flight for this stripe. */
static int stripe_operations_active(struct stripe_head *sh)
{
	return sh->check_state || sh->reconstruct_state ||
	       test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
	       test_bit(STRIPE_COMPUTE_RUN, &sh->state);
}

/* Drop a reference on a stripe; on the last reference, route it to the
 * appropriate list (delayed, bitmap, handle or inactive).
 * Caller holds conf->device_lock (see release_stripe). */
static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
{
	if (atomic_dec_and_test(&sh->count)) {
		BUG_ON(!list_empty(&sh->lru));
		BUG_ON(atomic_read(&conf->active_stripes)==0);
		if (test_bit(STRIPE_HANDLE, &sh->state)) {
			if (test_bit(STRIPE_DELAYED, &sh->state) &&
			    !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
				list_add_tail(&sh->lru, &conf->delayed_list);
			else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
				 sh->bm_seq - conf->seq_write > 0)
				/* stripe's bitmap batch not written yet */
				list_add_tail(&sh->lru, &conf->bitmap_list);
			else {
				clear_bit(STRIPE_DELAYED, &sh->state);
				clear_bit(STRIPE_BIT_DELAY, &sh->state);
				list_add_tail(&sh->lru, &conf->handle_list);
			}
			md_wakeup_thread(conf->mddev->thread);
		} else {
			BUG_ON(stripe_operations_active(sh));
			if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
				atomic_dec(&conf->preread_active_stripes);
				if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
					md_wakeup_thread(conf->mddev->thread);
			}
			atomic_dec(&conf->active_stripes);
			if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
				list_add_tail(&sh->lru, &conf->inactive_list);
				wake_up(&conf->wait_for_stripe);
				if (conf->retry_read_aligned)
					md_wakeup_thread(conf->mddev->thread);
			}
		}
	}
}

/* Locked wrapper around __release_stripe(). */
static void release_stripe(struct stripe_head *sh)
{
	raid5_conf_t *conf = sh->raid_conf;
	unsigned long flags;

	spin_lock_irqsave(&conf->device_lock, flags);
	__release_stripe(conf, sh);
	spin_unlock_irqrestore(&conf->device_lock, flags);
}

/* Unhash a stripe (safe if already unhashed, via hlist_del_init). */
static inline void remove_hash(struct stripe_head *sh)
{
	pr_debug("remove_hash(), stripe %llu\n",
		(unsigned long long)sh->sector);

	hlist_del_init(&sh->hash);
}

/* Insert a stripe into the hash table; requires device_lock held. */
static inline void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
{
	struct hlist_head *hp = stripe_hash(conf, sh->sector);

	pr_debug("insert_hash(), stripe %llu\n",
		(unsigned long long)sh->sector);

	CHECK_DEVLOCK();
	hlist_add_head(&sh->hash, hp);
}
/* find an idle stripe, make sure it is unhashed, and return it. */
static struct stripe_head *get_free_stripe(raid5_conf_t *conf)
{
	struct stripe_head *sh = NULL;
	struct list_head *first;

	CHECK_DEVLOCK();
	if (list_empty(&conf->inactive_list))
		goto out;
	first = conf->inactive_list.next;
	sh = list_entry(first, struct stripe_head, lru);
	list_del_init(first);
	remove_hash(sh);
	atomic_inc(&conf->active_stripes);
out:
	return sh;
}

/* Free the per-device pages of a stripe. */
static void shrink_buffers(struct stripe_head *sh)
{
	struct page *p;
	int i;
	int num = sh->raid_conf->pool_size;

	for (i = 0; i < num ; i++) {
		p = sh->dev[i].page;
		if (!p)
			continue;
		sh->dev[i].page = NULL;
		put_page(p);
	}
}

/* Allocate one page per device for a stripe.
 * Returns 0 on success, 1 on allocation failure (pages already
 * allocated are left in place for the caller to release). */
static int grow_buffers(struct stripe_head *sh)
{
	int i;
	int num = sh->raid_conf->pool_size;

	for (i = 0; i < num; i++) {
		struct page *page;

		if (!(page = alloc_page(GFP_KERNEL))) {
			return 1;
		}
		sh->dev[i].page = page;
	}
	return 0;
}

static void raid5_build_block(struct stripe_head *sh, int i, int previous);
static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
			    struct stripe_head *sh);

/* (Re)initialise an idle stripe for the given sector.
 * 'previous' selects the pre-reshape geometry. Requires device_lock
 * held and an unreferenced, idle stripe (enforced by the BUG_ONs). */
static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
{
	raid5_conf_t *conf = sh->raid_conf;
	int i;

	BUG_ON(atomic_read(&sh->count) != 0);
	BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
	BUG_ON(stripe_operations_active(sh));

	CHECK_DEVLOCK();
	pr_debug("init_stripe called, stripe %llu\n",
		(unsigned long long)sh->sector);

	remove_hash(sh);

	sh->generation = conf->generation - previous;
	sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
	sh->sector = sector;
	stripe_set_idx(sector, conf, previous, sh);
	sh->state = 0;


	for (i = sh->disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];

		/* a recycled stripe must carry no leftover I/O state */
		if (dev->toread || dev->read || dev->towrite || dev->written ||
		    test_bit(R5_LOCKED, &dev->flags)) {
			printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n",
			       (unsigned long long)sh->sector, i, dev->toread,
			       dev->read, dev->towrite, dev->written,
			       test_bit(R5_LOCKED, &dev->flags));
			BUG();
		}
		dev->flags = 0;
		raid5_build_block(sh, i, previous);
	}
	insert_hash(conf, sh);
}

/* Look up a stripe by sector and generation in the hash table.
 * Requires device_lock held. Returns NULL when not cached. */
static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector,
					 short generation)
{
	struct stripe_head *sh;
	struct hlist_node *hn;

	CHECK_DEVLOCK();
	pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
	hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash)
		if (sh->sector == sector && sh->generation == generation)
			return sh;
	pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
	return NULL;
}

/*
 * Need to check if array has failed when deciding whether to:
 *  - start an array
 *  - remove non-faulty devices
 *  - add a spare
 *  - allow a reshape
 * This determination is simple when no reshape is happening.
 * However if there is a reshape, we need to carefully check
 * both the before and after sections.
 * This is because some failed devices may only affect one
 * of the two sections, and some non-in_sync devices may
 * be insync in the section most affected by failed devices.
 */
static int has_failed(raid5_conf_t *conf)
{
	int degraded;
	int i;
	if (conf->mddev->reshape_position == MaxSector)
		return conf->mddev->degraded > conf->max_degraded;

	rcu_read_lock();
	degraded = 0;
	for (i = 0; i < conf->previous_raid_disks; i++) {
		mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
		if (!rdev || test_bit(Faulty, &rdev->flags))
			degraded++;
		else if (test_bit(In_sync, &rdev->flags))
			;
		else
			/* not in-sync or faulty.
			 * If the reshape increases the number of devices,
			 * this is being recovered by the reshape, so
			 * this 'previous' section is not in_sync.
			 * If the number of devices is being reduced however,
			 * the device can only be part of the array if
			 * we are reverting a reshape, so this section will
			 * be in-sync.
			 */
			if (conf->raid_disks >= conf->previous_raid_disks)
				degraded++;
	}
	rcu_read_unlock();
	if (degraded > conf->max_degraded)
		return 1;
	rcu_read_lock();
	degraded = 0;
	for (i = 0; i < conf->raid_disks; i++) {
		mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
		if (!rdev || test_bit(Faulty, &rdev->flags))
			degraded++;
		else if (test_bit(In_sync, &rdev->flags))
			;
		else
			/* not in-sync or faulty.
			 * If reshape increases the number of devices, this
			 * section has already been recovered, else it
			 * almost certainly hasn't.
			 */
			if (conf->raid_disks <= conf->previous_raid_disks)
				degraded++;
	}
	rcu_read_unlock();
	if (degraded > conf->max_degraded)
		return 1;
	return 0;
}

/* Get a reference on the stripe covering 'sector', finding it in the
 * cache or initialising a free one.  May block waiting for a free
 * stripe unless 'noblock' is set (then returns NULL).  'noquiesce'
 * skips the wait for conf->quiesce. */
static struct stripe_head *
get_active_stripe(raid5_conf_t *conf, sector_t sector,
		  int previous, int noblock, int noquiesce)
{
	struct stripe_head *sh;

	pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);

	spin_lock_irq(&conf->device_lock);

	do {
		wait_event_lock_irq(conf->wait_for_stripe,
				    conf->quiesce == 0 || noquiesce,
				    conf->device_lock, /* nothing */);
		sh = __find_stripe(conf, sector, conf->generation - previous);
		if (!sh) {
			if (!conf->inactive_blocked)
				sh = get_free_stripe(conf);
			if (noblock && sh == NULL)
				break;
			if (!sh) {
				conf->inactive_blocked = 1;
				wait_event_lock_irq(conf->wait_for_stripe,
						    !list_empty(&conf->inactive_list) &&
						    (atomic_read(&conf->active_stripes)
						     < (conf->max_nr_stripes *3/4)
						     || !conf->inactive_blocked),
						    conf->device_lock,
						    );
				conf->inactive_blocked = 0;
			} else
				init_stripe(sh, sector, previous);
		} else {
			if (atomic_read(&sh->count)) {
				BUG_ON(!list_empty(&sh->lru)
				       && !test_bit(STRIPE_EXPANDING, &sh->state));
			} else {
				/* first reference to a cached idle stripe:
				 * take it off whichever list it is on */
				if (!test_bit(STRIPE_HANDLE, &sh->state))
					atomic_inc(&conf->active_stripes);
				if (list_empty(&sh->lru) &&
				    !test_bit(STRIPE_EXPANDING, &sh->state))
					BUG();
				list_del_init(&sh->lru);
			}
		}
	} while (sh == NULL);

	if (sh)
		atomic_inc(&sh->count);

	spin_unlock_irq(&conf->device_lock);
	return sh;
}

static void
raid5_end_read_request(struct bio *bi, int error);
static void
raid5_end_write_request(struct bio *bi, int error);

/* Issue the reads/writes requested via R5_Wantread/R5_Wantwrite flags
 * on each device of the stripe.  Devices whose rdev is missing or
 * Faulty are skipped (writes mark the stripe degraded). */
static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
{
	raid5_conf_t *conf = sh->raid_conf;
	int i, disks = sh->disks;

	might_sleep();

	for (i = disks; i--; ) {
		int rw;
		struct bio *bi;
		mdk_rdev_t *rdev;
		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
			if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
				rw = WRITE_FUA;
			else
				rw = WRITE;
		} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
			rw = READ;
		else
			continue;

		bi = &sh->dev[i].req;

		bi->bi_rw = rw;
		if (rw & WRITE)
			bi->bi_end_io = raid5_end_write_request;
		else
			bi->bi_end_io = raid5_end_read_request;

		rcu_read_lock();
		rdev = rcu_dereference(conf->disks[i].rdev);
		if (rdev && test_bit(Faulty, &rdev->flags))
			rdev = NULL;
		if (rdev)
			atomic_inc(&rdev->nr_pending);
		rcu_read_unlock();

		if (rdev) {
			if (s->syncing || s->expanding || s->expanded)
				md_sync_acct(rdev->bdev, STRIPE_SECTORS);

			set_bit(STRIPE_IO_STARTED, &sh->state);

			bi->bi_bdev = rdev->bdev;
			pr_debug("%s: for %llu schedule op %ld on disc %d\n",
				__func__, (unsigned long long)sh->sector,
				bi->bi_rw, i);
			atomic_inc(&sh->count);
			bi->bi_sector = sh->sector + rdev->data_offset;
			bi->bi_flags = 1 << BIO_UPTODATE;
			bi->bi_vcnt = 1;
			bi->bi_max_vecs = 1;
			bi->bi_idx = 0;
			bi->bi_io_vec = &sh->dev[i].vec;
			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
			bi->bi_io_vec[0].bv_offset = 0;
			bi->bi_size = STRIPE_SIZE;
			bi->bi_next = NULL;
			if ((rw & WRITE) &&
			    test_bit(R5_ReWrite, &sh->dev[i].flags))
				atomic_add(STRIPE_SECTORS,
					&rdev->corrected_errors);
			generic_make_request(bi);
		} else {
			if (rw & WRITE)
				set_bit(STRIPE_DEGRADED, &sh->state);
			pr_debug("skip op %ld on disc %d for sector %llu\n",
				bi->bi_rw, i, (unsigned long long)sh->sector);
			clear_bit(R5_LOCKED, &sh->dev[i].flags);
			set_bit(STRIPE_HANDLE, &sh->state);
		}
	}
}
/* Copy data between a bio and a stripe page using the async_tx API.
 * 'frombio' selects the direction (bio -> page when set).  Copies only
 * the portion of the bio overlapping [sector, sector + STRIPE_SECTORS).
 * Returns the last descriptor so the caller can chain further ops. */
static struct dma_async_tx_descriptor *
async_copy_data(int frombio, struct bio *bio, struct page *page,
	sector_t sector, struct dma_async_tx_descriptor *tx)
{
	struct bio_vec *bvl;
	struct page *bio_page;
	int i;
	int page_offset;
	struct async_submit_ctl submit;
	enum async_tx_flags flags = 0;

	if (bio->bi_sector >= sector)
		page_offset = (signed)(bio->bi_sector - sector) * 512;
	else
		page_offset = (signed)(sector - bio->bi_sector) * -512;

	if (frombio)
		flags |= ASYNC_TX_FENCE;
	init_async_submit(&submit, flags, tx, NULL, NULL, NULL);

	bio_for_each_segment(bvl, bio, i) {
		int len = bvl->bv_len;
		int clen;
		int b_offset = 0;

		if (page_offset < 0) {
			/* bio starts before this stripe: skip the prefix */
			b_offset = -page_offset;
			page_offset += b_offset;
			len -= b_offset;
		}

		if (len > 0 && page_offset + len > STRIPE_SIZE)
			clen = STRIPE_SIZE - page_offset;
		else
			clen = len;

		if (clen > 0) {
			b_offset += bvl->bv_offset;
			bio_page = bvl->bv_page;
			if (frombio)
				tx = async_memcpy(page, bio_page, page_offset,
						  b_offset, clen, &submit);
			else
				tx = async_memcpy(bio_page, page, b_offset,
						  page_offset, clen, &submit);
		}
		/* chain the operations */
		submit.depend_tx = tx;

		if (clen < len) /* hit end of page */
			break;
		page_offset += len;
	}

	return tx;
}

/* Completion callback for a biofill run: hand finished read bios back
 * to their submitters and re-queue the stripe for handling. */
static void ops_complete_biofill(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;
	struct bio *return_bi = NULL;
	raid5_conf_t *conf = sh->raid_conf;
	int i;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	/* clear completed biofills */
	spin_lock_irq(&conf->device_lock);
	for (i = sh->disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];

		/* acknowledge completion of a biofill operation */
		/* and check if we need to reply to a read request,
		 * new R5_Wantfill requests are held off until
		 * !STRIPE_BIOFILL_RUN
		 */
		if (test_and_clear_bit(R5_Wantfill, &dev->flags)) {
			struct bio *rbi, *rbi2;

			BUG_ON(!dev->read);
			rbi = dev->read;
			dev->read = NULL;
			while (rbi && rbi->bi_sector <
				dev->sector + STRIPE_SECTORS) {
				rbi2 = r5_next_bio(rbi, dev->sector);
				if (!raid5_dec_bi_phys_segments(rbi)) {
					rbi->bi_next = return_bi;
					return_bi = rbi;
				}
				rbi = rbi2;
			}
		}
	}
	spin_unlock_irq(&conf->device_lock);
	clear_bit(STRIPE_BIOFILL_RUN, &sh->state);

	return_io(return_bi);

	set_bit(STRIPE_HANDLE, &sh->state);
	release_stripe(sh);
}

/* Start async copies of stripe data into every pending read bio
 * (devices flagged R5_Wantfill), completing in ops_complete_biofill. */
static void ops_run_biofill(struct stripe_head *sh)
{
	struct dma_async_tx_descriptor *tx = NULL;
	raid5_conf_t *conf = sh->raid_conf;
	struct async_submit_ctl submit;
	int i;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	for (i = sh->disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];
		if (test_bit(R5_Wantfill, &dev->flags)) {
			struct bio *rbi;
			spin_lock_irq(&conf->device_lock);
			dev->read = rbi = dev->toread;
			dev->toread = NULL;
			spin_unlock_irq(&conf->device_lock);
			while (rbi && rbi->bi_sector <
				dev->sector + STRIPE_SECTORS) {
				tx = async_copy_data(0, rbi, dev->page,
						     dev->sector, tx);
				rbi = r5_next_bio(rbi, dev->sector);
			}
		}
	}

	atomic_inc(&sh->count);
	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
	async_trigger_callback(&submit);
}

/* Mark a compute target uptodate; no-op for an unused target (-1). */
static void mark_target_uptodate(struct stripe_head *sh, int target)
{
	struct r5dev *tgt;

	if (target < 0)
		return;

	tgt = &sh->dev[target];
	set_bit(R5_UPTODATE, &tgt->flags);
	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
	clear_bit(R5_Wantcompute, &tgt->flags);
}

/* Completion callback for a compute run (one or two targets). */
static void ops_complete_compute(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	/* mark the computed target(s) as uptodate */
	mark_target_uptodate(sh, sh->ops.target);
	mark_target_uptodate(sh, sh->ops.target2);

	clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
	if (sh->check_state == check_state_compute_run)
		sh->check_state = check_state_compute_result;
	set_bit(STRIPE_HANDLE, &sh->state);
	release_stripe(sh);
}
/* return a pointer to the address conversion region of the scribble buffer */
static addr_conv_t *to_addr_conv(struct stripe_head *sh,
				 struct raid5_percpu *percpu)
{
	/* the scribble buffer holds disks+2 page pointers, then addr_conv */
	return percpu->scribble + sizeof(struct page *) * (sh->disks + 2);
}

/* RAID5: recompute a single block (sh->ops.target) by xor-ing all the
 * other blocks of the stripe. */
static struct dma_async_tx_descriptor *
ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
{
	int disks = sh->disks;
	struct page **xor_srcs = percpu->scribble;
	int target = sh->ops.target;
	struct r5dev *tgt = &sh->dev[target];
	struct page *xor_dest = tgt->page;
	int count = 0;
	struct dma_async_tx_descriptor *tx;
	struct async_submit_ctl submit;
	int i;

	pr_debug("%s: stripe %llu block: %d\n",
		__func__, (unsigned long long)sh->sector, target);
	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));

	for (i = disks; i--; )
		if (i != target)
			xor_srcs[count++] = sh->dev[i].page;

	atomic_inc(&sh->count);

	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
			  ops_complete_compute, sh, to_addr_conv(sh, percpu));
	if (unlikely(count == 1))
		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
	else
		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);

	return tx;
}

/* set_syndrome_sources - populate source buffers for gen_syndrome
 * @srcs - (struct page *) array of size sh->disks
 * @sh - stripe_head to parse
 *
 * Populates srcs in proper layout order for the stripe and returns the
 * 'count' of sources to be used in a call to async_gen_syndrome.  The P
 * destination buffer is recorded in srcs[count] and the Q destination
 * is recorded in srcs[count+1]].
 */
static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
{
	int disks = sh->disks;
	int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
	int d0_idx = raid6_d0(sh);
	int count;
	int i;

	for (i = 0; i < disks; i++)
		srcs[i] = NULL;

	count = 0;
	i = d0_idx;
	do {
		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);

		srcs[slot] = sh->dev[i].page;
		i = raid6_next_disk(i, disks);
	} while (i != d0_idx);

	return syndrome_disks;
}

/* RAID6: recompute exactly one missing block, which may be a data
 * block, P (both via xor) or Q (via gen_syndrome). */
static struct dma_async_tx_descriptor *
ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
{
	int disks = sh->disks;
	struct page **blocks = percpu->scribble;
	int target;
	int qd_idx = sh->qd_idx;
	struct dma_async_tx_descriptor *tx;
	struct async_submit_ctl submit;
	struct r5dev *tgt;
	struct page *dest;
	int i;
	int count;

	if (sh->ops.target < 0)
		target = sh->ops.target2;
	else if (sh->ops.target2 < 0)
		target = sh->ops.target;
	else
		/* we should only have one valid target */
		BUG();
	BUG_ON(target < 0);
	pr_debug("%s: stripe %llu block: %d\n",
		__func__, (unsigned long long)sh->sector, target);

	tgt = &sh->dev[target];
	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
	dest = tgt->page;

	atomic_inc(&sh->count);

	if (target == qd_idx) {
		count = set_syndrome_sources(blocks, sh);
		blocks[count] = NULL; /* regenerating p is not necessary */
		BUG_ON(blocks[count+1] != dest); /* q should already be set */
		init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
				  ops_complete_compute, sh,
				  to_addr_conv(sh, percpu));
		tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
	} else {
		/* Compute any data- or p-drive using XOR */
		count = 0;
		for (i = disks; i-- ; ) {
			if (i == target || i == qd_idx)
				continue;
			blocks[count++] = sh->dev[i].page;
		}

		init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
				  NULL, ops_complete_compute, sh,
				  to_addr_conv(sh, percpu));
		tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
	}

	return tx;
}

/* RAID6: recompute two missing blocks (any combination of D, P, Q)
 * using the lib/raid6 recovery routines where needed. */
static struct dma_async_tx_descriptor *
ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
{
	int i, count, disks = sh->disks;
	int syndrome_disks = sh->ddf_layout ? disks : disks-2;
	int d0_idx = raid6_d0(sh);
	int faila = -1, failb = -1;
	int target = sh->ops.target;
	int target2 = sh->ops.target2;
	struct r5dev *tgt = &sh->dev[target];
	struct r5dev *tgt2 = &sh->dev[target2];
	struct dma_async_tx_descriptor *tx;
	struct page **blocks = percpu->scribble;
	struct async_submit_ctl submit;

	pr_debug("%s: stripe %llu block1: %d block2: %d\n",
		 __func__, (unsigned long long)sh->sector, target, target2);
	BUG_ON(target < 0 || target2 < 0);
	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
	BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));

	/* we need to open-code set_syndrome_sources to handle the
	 * slot number conversion for 'faila' and 'failb'
	 */
	for (i = 0; i < disks ; i++)
		blocks[i] = NULL;
	count = 0;
	i = d0_idx;
	do {
		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);

		blocks[slot] = sh->dev[i].page;

		if (i == target)
			faila = slot;
		if (i == target2)
			failb = slot;
		i = raid6_next_disk(i, disks);
	} while (i != d0_idx);

	BUG_ON(faila == failb);
	if (failb < faila)
		swap(faila, failb);
	pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
		 __func__, (unsigned long long)sh->sector, faila, failb);

	atomic_inc(&sh->count);

	if (failb == syndrome_disks+1) {
		/* Q disk is one of the missing disks */
		if (faila == syndrome_disks) {
			/* Missing P+Q, just recompute */
			init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
					  ops_complete_compute, sh,
					  to_addr_conv(sh, percpu));
			return async_gen_syndrome(blocks, 0, syndrome_disks+2,
						  STRIPE_SIZE, &submit);
		} else {
			struct page *dest;
			int data_target;
			int qd_idx = sh->qd_idx;

			/* Missing D+Q: recompute D from P, then recompute Q */
			if (target == qd_idx)
				data_target = target2;
			else
				data_target = target;

			count = 0;
			for (i = disks; i-- ; ) {
				if (i == data_target || i == qd_idx)
					continue;
				blocks[count++] = sh->dev[i].page;
			}
			dest = sh->dev[data_target].page;
			init_async_submit(&submit,
					  ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
					  NULL, NULL, NULL,
					  to_addr_conv(sh, percpu));
			tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
				       &submit);

			count = set_syndrome_sources(blocks, sh);
			init_async_submit(&submit, ASYNC_TX_FENCE, tx,
					  ops_complete_compute, sh,
					  to_addr_conv(sh, percpu));
			return async_gen_syndrome(blocks, 0, count+2,
						  STRIPE_SIZE, &submit);
		}
	} else {
		init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
				  ops_complete_compute, sh,
				  to_addr_conv(sh, percpu));
		if (failb == syndrome_disks) {
			/* We're missing D+P. */
			return async_raid6_datap_recov(syndrome_disks+2,
						       STRIPE_SIZE, faila,
						       blocks, &submit);
		} else {
			/* We're missing D+D. */
			return async_raid6_2data_recov(syndrome_disks+2,
						       STRIPE_SIZE, faila, failb,
						       blocks, &submit);
		}
	}
}


static void ops_complete_prexor(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);
}

/* Subtract (xor out) the blocks about to be overwritten from the
 * existing parity, as the first step of a read-modify-write. */
static struct dma_async_tx_descriptor *
ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
	       struct dma_async_tx_descriptor *tx)
{
	int disks = sh->disks;
	struct page **xor_srcs = percpu->scribble;
	int count = 0, pd_idx = sh->pd_idx, i;
	struct async_submit_ctl submit;

	/* existing parity data subtracted */
	struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	for (i = disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];
		/* Only process blocks that are known to be uptodate */
		if (test_bit(R5_Wantdrain, &dev->flags))
			xor_srcs[count++] = dev->page;
	}

	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
			  ops_complete_prexor, sh, to_addr_conv(sh, percpu));
	tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);

	return tx;
}

/* Drain queued write bios into the stripe pages (devices flagged
 * R5_Wantdrain), moving each device's towrite list to written. */
static struct dma_async_tx_descriptor *
ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
{
	int disks = sh->disks;
	int i;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	for (i = disks; i--; ) {
		struct r5dev *dev = &sh->dev[i];
		struct bio *chosen;

		if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
			struct bio *wbi;

			spin_lock(&sh->lock);
			chosen = dev->towrite;
			dev->towrite = NULL;
			BUG_ON(dev->written);
			wbi = dev->written = chosen;
			spin_unlock(&sh->lock);

			while (wbi && wbi->bi_sector <
				dev->sector + STRIPE_SECTORS) {
				if (wbi->bi_rw & REQ_FUA)
					set_bit(R5_WantFUA, &dev->flags);
				tx = async_copy_data(1, wbi, dev->page,
						     dev->sector, tx);
				wbi = r5_next_bio(wbi, dev->sector);
			}
		}
	}

	return tx;
}
STRIPE_SECTORS) { 1034 if (wbi->bi_rw & REQ_FUA) 1035 set_bit(R5_WantFUA, &dev->flags); 1036 tx = async_copy_data(1, wbi, dev->page, 1037 dev->sector, tx); 1038 wbi = r5_next_bio(wbi, dev->sector); 1039 } 1040 } 1041 } 1042 1043 return tx; 1044} 1045 1046static void ops_complete_reconstruct(void *stripe_head_ref) 1047{ 1048 struct stripe_head *sh = stripe_head_ref; 1049 int disks = sh->disks; 1050 int pd_idx = sh->pd_idx; 1051 int qd_idx = sh->qd_idx; 1052 int i; 1053 bool fua = false; 1054 1055 pr_debug("%s: stripe %llu\n", __func__, 1056 (unsigned long long)sh->sector); 1057 1058 for (i = disks; i--; ) 1059 fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); 1060 1061 for (i = disks; i--; ) { 1062 struct r5dev *dev = &sh->dev[i]; 1063 1064 if (dev->written || i == pd_idx || i == qd_idx) { 1065 set_bit(R5_UPTODATE, &dev->flags); 1066 if (fua) 1067 set_bit(R5_WantFUA, &dev->flags); 1068 } 1069 } 1070 1071 if (sh->reconstruct_state == reconstruct_state_drain_run) 1072 sh->reconstruct_state = reconstruct_state_drain_result; 1073 else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) 1074 sh->reconstruct_state = reconstruct_state_prexor_drain_result; 1075 else { 1076 BUG_ON(sh->reconstruct_state != reconstruct_state_run); 1077 sh->reconstruct_state = reconstruct_state_result; 1078 } 1079 1080 set_bit(STRIPE_HANDLE, &sh->state); 1081 release_stripe(sh); 1082} 1083 1084static void 1085ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, 1086 struct dma_async_tx_descriptor *tx) 1087{ 1088 int disks = sh->disks; 1089 struct page **xor_srcs = percpu->scribble; 1090 struct async_submit_ctl submit; 1091 int count = 0, pd_idx = sh->pd_idx, i; 1092 struct page *xor_dest; 1093 int prexor = 0; 1094 unsigned long flags; 1095 1096 pr_debug("%s: stripe %llu\n", __func__, 1097 (unsigned long long)sh->sector); 1098 1099 /* check if prexor is active which means only process blocks 1100 * that are part of a read-modify-write (written) 1101 */ 1102 if 
(sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
		prexor = 1;
		xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
		for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];
			if (dev->written)
				xor_srcs[count++] = dev->page;
		}
	} else {
		/* reconstruct-write: xor all data blocks into the parity */
		xor_dest = sh->dev[pd_idx].page;
		for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];
			if (i != pd_idx)
				xor_srcs[count++] = dev->page;
		}
	}

	/* 1/ if we prexor'd then the dest is reused as a source
	 * 2/ if we did not prexor then we are redoing the parity
	 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
	 * for the synchronous xor case
	 */
	flags = ASYNC_TX_ACK |
		(prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);

	atomic_inc(&sh->count);

	init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh,
			  to_addr_conv(sh, percpu));
	if (unlikely(count == 1))
		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
	else
		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
}

/* Regenerate both P and Q for a RAID6 stripe from the data blocks. */
static void
ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
		     struct dma_async_tx_descriptor *tx)
{
	struct async_submit_ctl submit;
	struct page **blocks = percpu->scribble;
	int count;

	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);

	count = set_syndrome_sources(blocks, sh);

	atomic_inc(&sh->count);

	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
			  sh, to_addr_conv(sh, percpu));
	async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
}

/* Completion for a parity-check op: hand the zero-sum result back to
 * the stripe state machine for evaluation.
 */
static void ops_complete_check(void *stripe_head_ref)
{
	struct stripe_head *sh = stripe_head_ref;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	sh->check_state = check_state_check_result;
	set_bit(STRIPE_HANDLE, &sh->state);
	release_stripe(sh);
}

/* Verify RAID4/5 parity: xor the parity block with all data blocks and
 * report whether the result is zero via sh->ops.zero_sum_result.
 */
static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
{
	int disks = sh->disks;
	int pd_idx = sh->pd_idx;
	int qd_idx = sh->qd_idx;
	struct page *xor_dest;
	struct page **xor_srcs = percpu->scribble;
	struct dma_async_tx_descriptor *tx;
	struct async_submit_ctl submit;
	int count;
	int i;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	count = 0;
	xor_dest = sh->dev[pd_idx].page;
	xor_srcs[count++] = xor_dest;
	for (i = disks; i--; ) {
		if (i == pd_idx || i == qd_idx)
			continue;
		xor_srcs[count++] = sh->dev[i].page;
	}

	init_async_submit(&submit, 0, NULL, NULL, NULL,
			  to_addr_conv(sh, percpu));
	tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
			   &sh->ops.zero_sum_result, &submit);

	atomic_inc(&sh->count);
	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL);
	tx = async_trigger_callback(&submit);
}

/* Verify the RAID6 syndrome; 'checkp' selects whether P is validated in
 * addition to Q (a NULL in the P slot skips the P check).
 */
static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
{
	struct page **srcs = percpu->scribble;
	struct async_submit_ctl submit;
	int count;

	pr_debug("%s: stripe %llu checkp: %d\n", __func__,
		(unsigned long long)sh->sector, checkp);

	count = set_syndrome_sources(srcs, sh);
	if (!checkp)
		srcs[count] = NULL;

	atomic_inc(&sh->count);
	init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
			  sh, to_addr_conv(sh, percpu));
	async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
			   &sh->ops.zero_sum_result, percpu->spare_page, &submit);
}

/* Dispatch the per-stripe async operations requested in 'ops_request',
 * chaining them into a single async_tx dependency chain.
 */
static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
{
	int overlap_clear = 0, i, disks = sh->disks;
	struct dma_async_tx_descriptor *tx = NULL;
	raid5_conf_t *conf =
sh->raid_conf;
	int level = conf->level;
	struct raid5_percpu *percpu;
	unsigned long cpu;

	/* pin this cpu so the percpu scribble buffer stays ours */
	cpu = get_cpu();
	percpu = per_cpu_ptr(conf->percpu, cpu);
	if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
		ops_run_biofill(sh);
		overlap_clear++;
	}

	if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
		if (level < 6)
			tx = ops_run_compute5(sh, percpu);
		else {
			if (sh->ops.target2 < 0 || sh->ops.target < 0)
				tx = ops_run_compute6_1(sh, percpu);
			else
				tx = ops_run_compute6_2(sh, percpu);
		}
		/* terminate the chain if reconstruct is not set to be run */
		if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
			async_tx_ack(tx);
	}

	if (test_bit(STRIPE_OP_PREXOR, &ops_request))
		tx = ops_run_prexor(sh, percpu, tx);

	if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
		tx = ops_run_biodrain(sh, tx);
		overlap_clear++;
	}

	if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
		if (level < 6)
			ops_run_reconstruct5(sh, percpu, tx);
		else
			ops_run_reconstruct6(sh, percpu, tx);
	}

	if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
		if (sh->check_state == check_state_run)
			ops_run_check_p(sh, percpu);
		else if (sh->check_state == check_state_run_q)
			ops_run_check_pq(sh, percpu, 0);
		else if (sh->check_state == check_state_run_pq)
			ops_run_check_pq(sh, percpu, 1);
		else
			BUG();
	}

	/* biofill/biodrain consumed bios, so pending overlaps may clear */
	if (overlap_clear)
		for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];
			if (test_and_clear_bit(R5_Overlap, &dev->flags))
				wake_up(&sh->raid_conf->wait_for_overlap);
		}
	put_cpu();
}

#ifdef CONFIG_MULTICORE_RAID456
/* Worker invoked via async_schedule(): run the staged ops_request and
 * drop the stripe reference taken by raid_run_ops().
 */
static void async_run_ops(void *param, async_cookie_t cookie)
{
	struct stripe_head *sh = param;
	unsigned long ops_request = sh->ops.request;

	/* de-stage the request before waking any waiting submitter */
	clear_bit_unlock(STRIPE_OPS_REQ_PENDING, &sh->state);
	wake_up(&sh->ops.wait_for_ops);

	__raid_run_ops(sh, ops_request);
	release_stripe(sh);
}

static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
{
	/* since handle_stripe can be called outside of raid5d context
	 * we need to ensure sh->ops.request is de-staged before another
	 * request arrives
	 */
	wait_event(sh->ops.wait_for_ops,
		   !test_and_set_bit_lock(STRIPE_OPS_REQ_PENDING, &sh->state));
	sh->ops.request = ops_request;

	/* reference dropped in async_run_ops() */
	atomic_inc(&sh->count);
	async_schedule(async_run_ops, sh);
}
#else
#define raid_run_ops __raid_run_ops
#endif

/* Allocate and initialise one stripe_head (plus its per-device buffer
 * pages) and release it onto the inactive list.
 * Returns 1 on success, 0 on allocation failure.
 */
static int grow_one_stripe(raid5_conf_t *conf)
{
	struct stripe_head *sh;
	sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL);
	if (!sh)
		return 0;
	memset(sh, 0, sizeof(*sh) + (conf->pool_size-1)*sizeof(struct r5dev));
	sh->raid_conf = conf;
	spin_lock_init(&sh->lock);
	#ifdef CONFIG_MULTICORE_RAID456
	init_waitqueue_head(&sh->ops.wait_for_ops);
	#endif

	if (grow_buffers(sh)) {
		shrink_buffers(sh);
		kmem_cache_free(conf->slab_cache, sh);
		return 0;
	}
	/* we just created an active stripe so...
 */
	atomic_set(&sh->count, 1);
	atomic_inc(&conf->active_stripes);
	INIT_LIST_HEAD(&sh->lru);
	release_stripe(sh);
	return 1;
}

/* Create the stripe_head slab cache (named after the array) and grow
 * 'num' stripes into it.  Returns 0 on success, 1 on failure.
 */
static int grow_stripes(raid5_conf_t *conf, int num)
{
	struct kmem_cache *sc;
	int devs = max(conf->raid_disks, conf->previous_raid_disks);

	if (conf->mddev->gendisk)
		sprintf(conf->cache_name[0],
			"raid%d-%s", conf->level, mdname(conf->mddev));
	else
		sprintf(conf->cache_name[0],
			"raid%d-%p", conf->level, conf->mddev);
	sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]);

	conf->active_name = 0;
	sc = kmem_cache_create(conf->cache_name[conf->active_name],
			       sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
			       0, 0, NULL);
	if (!sc)
		return 1;
	conf->slab_cache = sc;
	conf->pool_size = devs;
	while (num--)
		if (!grow_one_stripe(conf))
			return 1;
	return 0;
}

/**
 * scribble_len - return the required size of the scribble region
 * @num - total number of disks in the array
 *
 * The size must be enough to contain:
 * 1/ a struct page pointer for each device in the array +2
 * 2/ room to convert each entry in (1) to its corresponding dma
 *    (dma_map_page()) or page (page_address()) address.
 *
 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we
 * calculate over all devices (not just the data blocks), using zeros in place
 * of the P and Q blocks.
 */
static size_t scribble_len(int num)
{
	size_t len;

	len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);

	return len;
}

static int resize_stripes(raid5_conf_t *conf, int newsize)
{
	/* Make all the stripes able to hold 'newsize' devices.
	 * New slots in each stripe get 'page' set to a new page.
	 *
	 * This happens in stages:
	 * 1/ create a new kmem_cache and allocate the required number of
	 *    stripe_heads.
	 * 2/ gather all the old stripe_heads and transfer the pages across
	 *    to the new stripe_heads.  This will have the side effect of
	 *    freezing the array as once all stripe_heads have been collected,
	 *    no IO will be possible.  Old stripe heads are freed once their
	 *    pages have been transferred over, and the old kmem_cache is
	 *    freed when all stripes are done.
	 * 3/ reallocate conf->disks to be suitably bigger.  If this fails,
	 *    we simply return a failure status - no need to clean anything up.
	 * 4/ allocate new pages for the new slots in the new stripe_heads.
	 *    If this fails, we don't bother trying to shrink the
	 *    stripe_heads down again, we just leave them as they are.
	 *    As each stripe_head is processed the new one is released into
	 *    active service.
	 *
	 * Once step2 is started, we cannot afford to wait for a write,
	 * so we use GFP_NOIO allocations.
	 */
	struct stripe_head *osh, *nsh;
	LIST_HEAD(newstripes);
	struct disk_info *ndisks;
	unsigned long cpu;
	int err;
	struct kmem_cache *sc;
	int i;

	if (newsize <= conf->pool_size)
		return 0; /* never bother to shrink */

	err = md_allow_write(conf->mddev);
	if (err)
		return err;

	/* Step 1 */
	sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
			       sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
			       0, 0, NULL);
	if (!sc)
		return -ENOMEM;

	for (i = conf->max_nr_stripes; i; i--) {
		nsh = kmem_cache_alloc(sc, GFP_KERNEL);
		if (!nsh)
			break;

		memset(nsh, 0, sizeof(*nsh) + (newsize-1)*sizeof(struct r5dev));

		nsh->raid_conf = conf;
		spin_lock_init(&nsh->lock);
		#ifdef CONFIG_MULTICORE_RAID456
		init_waitqueue_head(&nsh->ops.wait_for_ops);
		#endif

		list_add(&nsh->lru, &newstripes);
	}
	if (i) {
		/* didn't get enough, give up */
		while (!list_empty(&newstripes)) {
			nsh = list_entry(newstripes.next, struct stripe_head, lru);
			list_del(&nsh->lru);
			kmem_cache_free(sc, nsh);
		}
		kmem_cache_destroy(sc);
		return -ENOMEM;
	}
	/* Step 2 - Must use GFP_NOIO now.
	 * OK, we have enough stripes, start collecting inactive
	 * stripes and copying them over
	 */
	list_for_each_entry(nsh, &newstripes, lru) {
		spin_lock_irq(&conf->device_lock);
		wait_event_lock_irq(conf->wait_for_stripe,
				    !list_empty(&conf->inactive_list),
				    conf->device_lock,
				    );
		osh = get_free_stripe(conf);
		spin_unlock_irq(&conf->device_lock);
		atomic_set(&nsh->count, 1);
		for(i=0; i<conf->pool_size; i++)
			nsh->dev[i].page = osh->dev[i].page;
		for( ; i<newsize; i++)
			nsh->dev[i].page = NULL;
		kmem_cache_free(conf->slab_cache, osh);
	}
	kmem_cache_destroy(conf->slab_cache);

	/* Step 3.
	 * At this point, we are holding all the stripes so the array
	 * is completely stalled, so now is a good time to resize
	 * conf->disks and the scribble region
	 */
	ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
	if (ndisks) {
		for (i=0; i<conf->raid_disks; i++)
			ndisks[i] = conf->disks[i];
		kfree(conf->disks);
		conf->disks = ndisks;
	} else
		err = -ENOMEM;

	/* re-size the per-cpu scribble regions for the new disk count */
	get_online_cpus();
	conf->scribble_len = scribble_len(newsize);
	for_each_present_cpu(cpu) {
		struct raid5_percpu *percpu;
		void *scribble;

		percpu = per_cpu_ptr(conf->percpu, cpu);
		scribble = kmalloc(conf->scribble_len, GFP_NOIO);

		if (scribble) {
			kfree(percpu->scribble);
			percpu->scribble = scribble;
		} else {
			err = -ENOMEM;
			break;
		}
	}
	put_online_cpus();

	/* Step 4, return new stripes to service */
	while(!list_empty(&newstripes)) {
		nsh = list_entry(newstripes.next, struct stripe_head, lru);
		list_del_init(&nsh->lru);

		for (i=conf->raid_disks; i < newsize; i++)
			if (nsh->dev[i].page == NULL) {
				struct page *p = alloc_page(GFP_NOIO);
				nsh->dev[i].page = p;
				if (!p)
					err = -ENOMEM;
			}
		release_stripe(nsh);
	}
	/* critical section pass, GFP_NOIO no longer needed */

	conf->slab_cache = sc;
	conf->active_name = 1-conf->active_name;
	conf->pool_size = newsize;
	return err;
}

/* Remove one inactive stripe_head and free its buffers.
 * Returns 1 if a stripe was dropped, 0 if the inactive list was empty.
 */
static int drop_one_stripe(raid5_conf_t *conf)
{
	struct stripe_head *sh;

	spin_lock_irq(&conf->device_lock);
	sh = get_free_stripe(conf);
	spin_unlock_irq(&conf->device_lock);
	if (!sh)
		return 0;
	BUG_ON(atomic_read(&sh->count));
	shrink_buffers(sh);
	kmem_cache_free(conf->slab_cache, sh);
	atomic_dec(&conf->active_stripes);
	return 1;
}

/* Free every stripe_head and then the slab cache itself. */
static void shrink_stripes(raid5_conf_t *conf)
{
	while (drop_one_stripe(conf))
		;

	if (conf->slab_cache)
		kmem_cache_destroy(conf->slab_cache);
	conf->slab_cache = NULL;
}

/* bio completion callback for a per-device stripe-cache read. */
static void raid5_end_read_request(struct bio * bi, int error)
{
	struct stripe_head *sh = bi->bi_private;
	raid5_conf_t *conf = sh->raid_conf;
	int disks = sh->disks, i;
	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
	char b[BDEVNAME_SIZE];
	mdk_rdev_t *rdev;


	/* find which embedded per-device request this bio is */
	for (i=0 ; i<disks; i++)
		if (bi == &sh->dev[i].req)
			break;

	pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n",
		(unsigned long long)sh->sector, i, atomic_read(&sh->count),
		uptodate);
	if (i == disks) {
		BUG();
		return;
	}

	if (uptodate) {
		set_bit(R5_UPTODATE, &sh->dev[i].flags);
		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
			/* a prior read error has been corrected by rewrite */
			rdev = conf->disks[i].rdev;
			printk_rl(KERN_INFO "md/raid:%s: read error corrected"
				  " (%lu sectors at %llu on %s)\n",
				  mdname(conf->mddev), STRIPE_SECTORS,
				  (unsigned long long)(sh->sector
						       + rdev->data_offset),
				  bdevname(rdev->bdev, b));
			clear_bit(R5_ReadError, &sh->dev[i].flags);
			clear_bit(R5_ReWrite, &sh->dev[i].flags);
		}
		if (atomic_read(&conf->disks[i].rdev->read_errors))
			atomic_set(&conf->disks[i].rdev->read_errors, 0);
	} else {
		const char *bdn = bdevname(conf->disks[i].rdev->bdev, b);
		int retry = 0;
		rdev = conf->disks[i].rdev;

		clear_bit(R5_UPTODATE, &sh->dev[i].flags);
		atomic_inc(&rdev->read_errors);
		if (conf->mddev->degraded >= conf->max_degraded)
			printk_rl(KERN_WARNING
				  "md/raid:%s: read error not correctable "
				  "(sector %llu on %s).\n",
				  mdname(conf->mddev),
				  (unsigned long long)(sh->sector
						       + rdev->data_offset),
				  bdn);
		else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
			/* Oh, no!!! */
			printk_rl(KERN_WARNING
				  "md/raid:%s: read error NOT corrected!! "
				  "(sector %llu on %s).\n",
				  mdname(conf->mddev),
				  (unsigned long long)(sh->sector
						       + rdev->data_offset),
				  bdn);
		else if (atomic_read(&rdev->read_errors)
			 > conf->max_nr_stripes)
			printk(KERN_WARNING
			       "md/raid:%s: Too many read errors, failing device %s.\n",
			       mdname(conf->mddev), bdn);
		else
			retry = 1;
		if (retry)
			set_bit(R5_ReadError, &sh->dev[i].flags);
		else {
			/* retries exhausted or hopeless: fail the device */
			clear_bit(R5_ReadError, &sh->dev[i].flags);
			clear_bit(R5_ReWrite, &sh->dev[i].flags);
			md_error(conf->mddev, rdev);
		}
	}
	rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
	clear_bit(R5_LOCKED, &sh->dev[i].flags);
	set_bit(STRIPE_HANDLE, &sh->state);
	release_stripe(sh);
}

/* bio completion callback for a per-device stripe-cache write. */
static void raid5_end_write_request(struct bio *bi, int error)
{
	struct stripe_head *sh = bi->bi_private;
	raid5_conf_t *conf = sh->raid_conf;
	int disks = sh->disks, i;
	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);

	/* find which embedded per-device request this bio is */
	for (i=0 ; i<disks; i++)
		if (bi == &sh->dev[i].req)
			break;

	pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n",
		(unsigned long long)sh->sector, i, atomic_read(&sh->count),
		uptodate);
	if (i == disks) {
		BUG();
		return;
	}

	if (!uptodate)
		md_error(conf->mddev, conf->disks[i].rdev);

	rdev_dec_pending(conf->disks[i].rdev, conf->mddev);

	clear_bit(R5_LOCKED, &sh->dev[i].flags);
	set_bit(STRIPE_HANDLE, &sh->state);
	release_stripe(sh);
}


static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);

/* (Re)initialise the embedded bio and single bio_vec of sh->dev[i] so
 * it can be submitted against the device's cache page.
 */
static void raid5_build_block(struct stripe_head *sh, int i, int previous)
{
	struct r5dev *dev = &sh->dev[i];

	bio_init(&dev->req);
	dev->req.bi_io_vec = &dev->vec;
	dev->req.bi_vcnt++;
	dev->req.bi_max_vecs++;
	dev->vec.bv_page = dev->page;
	dev->vec.bv_len = STRIPE_SIZE;
	dev->vec.bv_offset = 0;

	dev->req.bi_sector = sh->sector;
	dev->req.bi_private = sh;

	dev->flags = 0;
	dev->sector = compute_blocknr(sh, i, previous);
}

/* Mark @rdev Faulty (clearing In_sync), bump the degraded count and
 * abort any recovery in progress.
 */
static void error(mddev_t *mddev, mdk_rdev_t *rdev)
{
	char b[BDEVNAME_SIZE];
	raid5_conf_t *conf = mddev->private;
	pr_debug("raid456: error called\n");

	if (test_and_clear_bit(In_sync, &rdev->flags)) {
		unsigned long flags;
		spin_lock_irqsave(&conf->device_lock, flags);
		mddev->degraded++;
		spin_unlock_irqrestore(&conf->device_lock, flags);
		/*
		 * if recovery was running, make sure it aborts.
		 */
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
	}
	set_bit(Faulty, &rdev->flags);
	set_bit(MD_CHANGE_DEVS, &mddev->flags);
	printk(KERN_ALERT
	       "md/raid:%s: Disk failure on %s, disabling device.\n"
	       "md/raid:%s: Operation continuing on %d devices.\n",
	       mdname(mddev),
	       bdevname(rdev->bdev, b),
	       mdname(mddev),
	       conf->raid_disks - mddev->degraded);
}

/*
 * Input: a 'big' sector number,
 * Output: index of the data and parity disk, and the sector # in them.
 */
static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
				     int previous, int *dd_idx,
				     struct stripe_head *sh)
{
	sector_t stripe, stripe2;
	sector_t chunk_number;
	unsigned int chunk_offset;
	int pd_idx, qd_idx;
	int ddf_layout = 0;
	sector_t new_sector;
	/* 'previous' selects the pre-reshape geometry */
	int algorithm = previous ? conf->prev_algo
				 : conf->algorithm;
	int sectors_per_chunk = previous ? conf->prev_chunk_sectors
					 : conf->chunk_sectors;
	int raid_disks = previous ?
conf->previous_raid_disks 1745 : conf->raid_disks; 1746 int data_disks = raid_disks - conf->max_degraded; 1747 1748 /* First compute the information on this sector */ 1749 1750 /* 1751 * Compute the chunk number and the sector offset inside the chunk 1752 */ 1753 chunk_offset = sector_div(r_sector, sectors_per_chunk); 1754 chunk_number = r_sector; 1755 1756 /* 1757 * Compute the stripe number 1758 */ 1759 stripe = chunk_number; 1760 *dd_idx = sector_div(stripe, data_disks); 1761 stripe2 = stripe; 1762 /* 1763 * Select the parity disk based on the user selected algorithm. 1764 */ 1765 pd_idx = qd_idx = ~0; 1766 switch(conf->level) { 1767 case 4: 1768 pd_idx = data_disks; 1769 break; 1770 case 5: 1771 switch (algorithm) { 1772 case ALGORITHM_LEFT_ASYMMETRIC: 1773 pd_idx = data_disks - sector_div(stripe2, raid_disks); 1774 if (*dd_idx >= pd_idx) 1775 (*dd_idx)++; 1776 break; 1777 case ALGORITHM_RIGHT_ASYMMETRIC: 1778 pd_idx = sector_div(stripe2, raid_disks); 1779 if (*dd_idx >= pd_idx) 1780 (*dd_idx)++; 1781 break; 1782 case ALGORITHM_LEFT_SYMMETRIC: 1783 pd_idx = data_disks - sector_div(stripe2, raid_disks); 1784 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 1785 break; 1786 case ALGORITHM_RIGHT_SYMMETRIC: 1787 pd_idx = sector_div(stripe2, raid_disks); 1788 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 1789 break; 1790 case ALGORITHM_PARITY_0: 1791 pd_idx = 0; 1792 (*dd_idx)++; 1793 break; 1794 case ALGORITHM_PARITY_N: 1795 pd_idx = data_disks; 1796 break; 1797 default: 1798 BUG(); 1799 } 1800 break; 1801 case 6: 1802 1803 switch (algorithm) { 1804 case ALGORITHM_LEFT_ASYMMETRIC: 1805 pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks); 1806 qd_idx = pd_idx + 1; 1807 if (pd_idx == raid_disks-1) { 1808 (*dd_idx)++; /* Q D D D P */ 1809 qd_idx = 0; 1810 } else if (*dd_idx >= pd_idx) 1811 (*dd_idx) += 2; /* D D P Q D */ 1812 break; 1813 case ALGORITHM_RIGHT_ASYMMETRIC: 1814 pd_idx = sector_div(stripe2, raid_disks); 1815 qd_idx = pd_idx + 1; 1816 if (pd_idx == 
raid_disks-1) { 1817 (*dd_idx)++; /* Q D D D P */ 1818 qd_idx = 0; 1819 } else if (*dd_idx >= pd_idx) 1820 (*dd_idx) += 2; /* D D P Q D */ 1821 break; 1822 case ALGORITHM_LEFT_SYMMETRIC: 1823 pd_…
Large files files are truncated, but you can click here to view the full file