
/drivers/md/raid5.c

https://bitbucket.org/ndreys/linux-sunxi
Possible License(s): GPL-2.0, LGPL-2.0, AGPL-1.0


   1/*
   2 * raid5.c : Multiple Devices driver for Linux
   3 *	   Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
   4 *	   Copyright (C) 1999, 2000 Ingo Molnar
   5 *	   Copyright (C) 2002, 2003 H. Peter Anvin
   6 *
   7 * RAID-4/5/6 management functions.
   8 * Thanks to Penguin Computing for making the RAID-6 development possible
   9 * by donating a test server!
  10 *
  11 * This program is free software; you can redistribute it and/or modify
  12 * it under the terms of the GNU General Public License as published by
  13 * the Free Software Foundation; either version 2, or (at your option)
  14 * any later version.
  15 *
  16 * You should have received a copy of the GNU General Public License
  17 * (for example /usr/src/linux/COPYING); if not, write to the Free
  18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  19 */
  20
  21/*
  22 * BITMAP UNPLUGGING:
  23 *
  24 * The sequencing for updating the bitmap reliably is a little
  25 * subtle (and I got it wrong the first time) so it deserves some
  26 * explanation.
  27 *
  28 * We group bitmap updates into batches.  Each batch has a number.
  29 * We may write out several batches at once, but that isn't very important.
  30 * conf->seq_write is the number of the last batch successfully written.
  31 * conf->seq_flush is the number of the last batch that was closed to
  32 *    new additions.
  33 * When we discover that we will need to write to any block in a stripe
  34 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
  35 * the number of the batch it will be in. This is seq_flush+1.
  36 * When we are ready to do a write, if that batch hasn't been written yet,
  37 *   we plug the array and queue the stripe for later.
  38 * When an unplug happens, we increment seq_flush, thus closing the current
  39 *   batch.
  40 * When we notice that seq_flush > seq_write, we write out all pending updates
  41 * to the bitmap, and advance seq_write to where seq_flush was.
  42 * This may occasionally write a bit out twice, but is sure never to
  43 * miss any bits.
  44 */
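/*
 * Illustrative sketch (not part of the driver; not compiled, and the
 * example_* helper name is hypothetical): how the batch numbers described
 * above are consumed.  The fields referenced (sh->bm_seq, conf->seq_flush,
 * conf->seq_write, conf->bitmap_list) are the ones used later in this file.
 */
#if 0
static void example_bitmap_batching(raid5_conf_t *conf, struct stripe_head *sh)
{
	/* when a write first touches the stripe (add_stripe_bio):
	 * remember the still-open batch this bitmap update belongs to
	 */
	sh->bm_seq = conf->seq_flush + 1;
	set_bit(STRIPE_BIT_DELAY, &sh->state);

	/* when the stripe is released (__release_stripe): if that batch
	 * has not reached the bitmap yet, park the stripe on bitmap_list
	 * instead of handling it now
	 */
	if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
	    sh->bm_seq - conf->seq_write > 0)
		list_add_tail(&sh->lru, &conf->bitmap_list);
}
#endif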
  45
  46#include <linux/blkdev.h>
  47#include <linux/kthread.h>
  48#include <linux/raid/pq.h>
  49#include <linux/async_tx.h>
  50#include <linux/async.h>
  51#include <linux/seq_file.h>
  52#include <linux/cpu.h>
  53#include <linux/slab.h>
  54#include "md.h"
  55#include "raid5.h"
  56#include "raid0.h"
  57#include "bitmap.h"
  58
  59/*
  60 * Stripe cache
  61 */
  62
  63#define NR_STRIPES		256
  64#define STRIPE_SIZE		PAGE_SIZE
  65#define STRIPE_SHIFT		(PAGE_SHIFT - 9)
  66#define STRIPE_SECTORS		(STRIPE_SIZE>>9)
  67#define	IO_THRESHOLD		1
  68#define BYPASS_THRESHOLD	1
  69#define NR_HASH			(PAGE_SIZE / sizeof(struct hlist_head))
  70#define HASH_MASK		(NR_HASH - 1)
  71
  72#define stripe_hash(conf, sect)	(&((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK]))
  73
  74/* bio's attached to a stripe+device for I/O are linked together in bi_sector
  75 * order without overlap.  There may be several bio's per stripe+device, and
  76 * a bio could span several devices.
  77 * When walking this list for a particular stripe+device, we must never proceed
  78 * beyond a bio that extends past this device, as the next bio might no longer
  79 * be valid.
  80 * This macro is used to determine the 'next' bio in the list, given the sector
  81 * of the current stripe+device
  82 */
  83#define r5_next_bio(bio, sect) ( ( (bio)->bi_sector + ((bio)->bi_size>>9) < sect + STRIPE_SECTORS) ? (bio)->bi_next : NULL)
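/*
 * Illustrative sketch (not part of the driver; not compiled, hypothetical
 * example_* helper): the usual pattern for walking a per-device bio list
 * with r5_next_bio(), as used in e.g. ops_run_biofill() and
 * ops_complete_biofill() below.
 */
#if 0
static void example_walk_dev_bios(struct r5dev *dev)
{
	struct bio *rbi = dev->toread;

	while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
		/* process the portion of rbi covering this stripe+device */
		rbi = r5_next_bio(rbi, dev->sector);
	}
}
#endif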
  84/*
  85 * The following can be used to debug the driver
  86 */
  87#define RAID5_PARANOIA	1
  88#if RAID5_PARANOIA && defined(CONFIG_SMP)
  89# define CHECK_DEVLOCK() assert_spin_locked(&conf->device_lock)
  90#else
  91# define CHECK_DEVLOCK()
  92#endif
  93
  94#ifdef DEBUG
  95#define inline
  96#define __inline__
  97#endif
  98
  99#define printk_rl(args...) ((void) (printk_ratelimit() && printk(args)))
 100
 101/*
 102 * We maintain a biased count of active stripes in the bottom 16 bits of
 103 * bi_phys_segments, and a count of processed stripes in the upper 16 bits
 104 */
 105static inline int raid5_bi_phys_segments(struct bio *bio)
 106{
 107	return bio->bi_phys_segments & 0xffff;
 108}
 109
 110static inline int raid5_bi_hw_segments(struct bio *bio)
 111{
 112	return (bio->bi_phys_segments >> 16) & 0xffff;
 113}
 114
 115static inline int raid5_dec_bi_phys_segments(struct bio *bio)
 116{
 117	--bio->bi_phys_segments;
 118	return raid5_bi_phys_segments(bio);
 119}
 120
 121static inline int raid5_dec_bi_hw_segments(struct bio *bio)
 122{
 123	unsigned short val = raid5_bi_hw_segments(bio);
 124
 125	--val;
 126	bio->bi_phys_segments = (val << 16) | raid5_bi_phys_segments(bio);
 127	return val;
 128}
 129
 130static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt)
 131{
 132	bio->bi_phys_segments = raid5_bi_phys_segments(bio) | (cnt << 16);
 133}
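/*
 * Illustrative sketch (not part of the driver; not compiled, hypothetical
 * example_* helper): the two 16-bit counters packed into
 * bio->bi_phys_segments by the helpers above.  The low half counts stripes
 * still referencing the bio; the high half counts stripes already processed.
 */
#if 0
static void example_bi_segments(struct bio *bio)
{
	bio->bi_phys_segments = 3;		/* three active stripe refs */
	raid5_set_bi_hw_segments(bio, 0);	/* none processed yet */

	if (!raid5_dec_bi_phys_segments(bio))
		; /* last active reference dropped: bio may be completed */
}
#endif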
 134
 135/* Find first data disk in a raid6 stripe */
 136static inline int raid6_d0(struct stripe_head *sh)
 137{
 138	if (sh->ddf_layout)
  139		/* ddf always starts from the first device */
 140		return 0;
 141	/* md starts just after Q block */
 142	if (sh->qd_idx == sh->disks - 1)
 143		return 0;
 144	else
 145		return sh->qd_idx + 1;
 146}
 147static inline int raid6_next_disk(int disk, int raid_disks)
 148{
 149	disk++;
 150	return (disk < raid_disks) ? disk : 0;
 151}
 152
  153/* When walking through the disks in a raid6 stripe, starting at raid6_d0,
  154 * we need to map each disk to a 'slot', where the data disks are slot
  155 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
  156 * is raid_disks-1.  This helper does that mapping.
  157 */
 158static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
 159			     int *count, int syndrome_disks)
 160{
 161	int slot = *count;
 162
 163	if (sh->ddf_layout)
 164		(*count)++;
 165	if (idx == sh->pd_idx)
 166		return syndrome_disks;
 167	if (idx == sh->qd_idx)
 168		return syndrome_disks + 1;
 169	if (!sh->ddf_layout)
 170		(*count)++;
 171	return slot;
 172}
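/*
 * Worked example (illustrative only): with a non-ddf layout, disks = 5,
 * pd_idx = 3 and qd_idx = 4, raid6_d0() returns 0 and the walk maps
 * idx 0,1,2 to data slots 0,1,2, idx 3 (P) to slot syndrome_disks == 3,
 * and idx 4 (Q) to slot syndrome_disks + 1 == 4.
 */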
 173
 174static void return_io(struct bio *return_bi)
 175{
 176	struct bio *bi = return_bi;
 177	while (bi) {
 178
 179		return_bi = bi->bi_next;
 180		bi->bi_next = NULL;
 181		bi->bi_size = 0;
 182		bio_endio(bi, 0);
 183		bi = return_bi;
 184	}
 185}
 186
 187static void print_raid5_conf (raid5_conf_t *conf);
 188
 189static int stripe_operations_active(struct stripe_head *sh)
 190{
 191	return sh->check_state || sh->reconstruct_state ||
 192	       test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
 193	       test_bit(STRIPE_COMPUTE_RUN, &sh->state);
 194}
 195
 196static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
 197{
 198	if (atomic_dec_and_test(&sh->count)) {
 199		BUG_ON(!list_empty(&sh->lru));
 200		BUG_ON(atomic_read(&conf->active_stripes)==0);
 201		if (test_bit(STRIPE_HANDLE, &sh->state)) {
 202			if (test_bit(STRIPE_DELAYED, &sh->state) &&
 203			    !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
 204				list_add_tail(&sh->lru, &conf->delayed_list);
 205			else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
 206				   sh->bm_seq - conf->seq_write > 0)
 207				list_add_tail(&sh->lru, &conf->bitmap_list);
 208			else {
 209				clear_bit(STRIPE_DELAYED, &sh->state);
 210				clear_bit(STRIPE_BIT_DELAY, &sh->state);
 211				list_add_tail(&sh->lru, &conf->handle_list);
 212			}
 213			md_wakeup_thread(conf->mddev->thread);
 214		} else {
 215			BUG_ON(stripe_operations_active(sh));
 216			if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
 217				atomic_dec(&conf->preread_active_stripes);
 218				if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
 219					md_wakeup_thread(conf->mddev->thread);
 220			}
 221			atomic_dec(&conf->active_stripes);
 222			if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
 223				list_add_tail(&sh->lru, &conf->inactive_list);
 224				wake_up(&conf->wait_for_stripe);
 225				if (conf->retry_read_aligned)
 226					md_wakeup_thread(conf->mddev->thread);
 227			}
 228		}
 229	}
 230}
 231
 232static void release_stripe(struct stripe_head *sh)
 233{
 234	raid5_conf_t *conf = sh->raid_conf;
 235	unsigned long flags;
 236
 237	spin_lock_irqsave(&conf->device_lock, flags);
 238	__release_stripe(conf, sh);
 239	spin_unlock_irqrestore(&conf->device_lock, flags);
 240}
 241
 242static inline void remove_hash(struct stripe_head *sh)
 243{
 244	pr_debug("remove_hash(), stripe %llu\n",
 245		(unsigned long long)sh->sector);
 246
 247	hlist_del_init(&sh->hash);
 248}
 249
 250static inline void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
 251{
 252	struct hlist_head *hp = stripe_hash(conf, sh->sector);
 253
 254	pr_debug("insert_hash(), stripe %llu\n",
 255		(unsigned long long)sh->sector);
 256
 257	CHECK_DEVLOCK();
 258	hlist_add_head(&sh->hash, hp);
 259}
 260
 261
 262/* find an idle stripe, make sure it is unhashed, and return it. */
 263static struct stripe_head *get_free_stripe(raid5_conf_t *conf)
 264{
 265	struct stripe_head *sh = NULL;
 266	struct list_head *first;
 267
 268	CHECK_DEVLOCK();
 269	if (list_empty(&conf->inactive_list))
 270		goto out;
 271	first = conf->inactive_list.next;
 272	sh = list_entry(first, struct stripe_head, lru);
 273	list_del_init(first);
 274	remove_hash(sh);
 275	atomic_inc(&conf->active_stripes);
 276out:
 277	return sh;
 278}
 279
 280static void shrink_buffers(struct stripe_head *sh)
 281{
 282	struct page *p;
 283	int i;
 284	int num = sh->raid_conf->pool_size;
 285
 286	for (i = 0; i < num ; i++) {
 287		p = sh->dev[i].page;
 288		if (!p)
 289			continue;
 290		sh->dev[i].page = NULL;
 291		put_page(p);
 292	}
 293}
 294
 295static int grow_buffers(struct stripe_head *sh)
 296{
 297	int i;
 298	int num = sh->raid_conf->pool_size;
 299
 300	for (i = 0; i < num; i++) {
 301		struct page *page;
 302
 303		if (!(page = alloc_page(GFP_KERNEL))) {
 304			return 1;
 305		}
 306		sh->dev[i].page = page;
 307	}
 308	return 0;
 309}
 310
 311static void raid5_build_block(struct stripe_head *sh, int i, int previous);
 312static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
 313			    struct stripe_head *sh);
 314
 315static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
 316{
 317	raid5_conf_t *conf = sh->raid_conf;
 318	int i;
 319
 320	BUG_ON(atomic_read(&sh->count) != 0);
 321	BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
 322	BUG_ON(stripe_operations_active(sh));
 323
 324	CHECK_DEVLOCK();
 325	pr_debug("init_stripe called, stripe %llu\n",
 326		(unsigned long long)sh->sector);
 327
 328	remove_hash(sh);
 329
 330	sh->generation = conf->generation - previous;
 331	sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
 332	sh->sector = sector;
 333	stripe_set_idx(sector, conf, previous, sh);
 334	sh->state = 0;
 335
 336
 337	for (i = sh->disks; i--; ) {
 338		struct r5dev *dev = &sh->dev[i];
 339
 340		if (dev->toread || dev->read || dev->towrite || dev->written ||
 341		    test_bit(R5_LOCKED, &dev->flags)) {
 342			printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n",
 343			       (unsigned long long)sh->sector, i, dev->toread,
 344			       dev->read, dev->towrite, dev->written,
 345			       test_bit(R5_LOCKED, &dev->flags));
 346			BUG();
 347		}
 348		dev->flags = 0;
 349		raid5_build_block(sh, i, previous);
 350	}
 351	insert_hash(conf, sh);
 352}
 353
 354static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector,
 355					 short generation)
 356{
 357	struct stripe_head *sh;
 358	struct hlist_node *hn;
 359
 360	CHECK_DEVLOCK();
 361	pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
 362	hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash)
 363		if (sh->sector == sector && sh->generation == generation)
 364			return sh;
 365	pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
 366	return NULL;
 367}
 368
 369/*
 370 * Need to check if array has failed when deciding whether to:
 371 *  - start an array
 372 *  - remove non-faulty devices
 373 *  - add a spare
 374 *  - allow a reshape
 375 * This determination is simple when no reshape is happening.
  376 * However, if there is a reshape, we need to carefully check
 377 * both the before and after sections.
 378 * This is because some failed devices may only affect one
 379 * of the two sections, and some non-in_sync devices may
  380 * be in_sync in the section most affected by failed devices.
 381 */
 382static int has_failed(raid5_conf_t *conf)
 383{
 384	int degraded;
 385	int i;
 386	if (conf->mddev->reshape_position == MaxSector)
 387		return conf->mddev->degraded > conf->max_degraded;
 388
 389	rcu_read_lock();
 390	degraded = 0;
 391	for (i = 0; i < conf->previous_raid_disks; i++) {
 392		mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
 393		if (!rdev || test_bit(Faulty, &rdev->flags))
 394			degraded++;
 395		else if (test_bit(In_sync, &rdev->flags))
 396			;
 397		else
 398			/* not in-sync or faulty.
 399			 * If the reshape increases the number of devices,
 400			 * this is being recovered by the reshape, so
 401			 * this 'previous' section is not in_sync.
 402			 * If the number of devices is being reduced however,
 403			 * the device can only be part of the array if
 404			 * we are reverting a reshape, so this section will
 405			 * be in-sync.
 406			 */
 407			if (conf->raid_disks >= conf->previous_raid_disks)
 408				degraded++;
 409	}
 410	rcu_read_unlock();
 411	if (degraded > conf->max_degraded)
 412		return 1;
 413	rcu_read_lock();
 414	degraded = 0;
 415	for (i = 0; i < conf->raid_disks; i++) {
 416		mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
 417		if (!rdev || test_bit(Faulty, &rdev->flags))
 418			degraded++;
 419		else if (test_bit(In_sync, &rdev->flags))
 420			;
 421		else
 422			/* not in-sync or faulty.
 423			 * If reshape increases the number of devices, this
 424			 * section has already been recovered, else it
 425			 * almost certainly hasn't.
 426			 */
 427			if (conf->raid_disks <= conf->previous_raid_disks)
 428				degraded++;
 429	}
 430	rcu_read_unlock();
 431	if (degraded > conf->max_degraded)
 432		return 1;
 433	return 0;
 434}
 435
 436static struct stripe_head *
 437get_active_stripe(raid5_conf_t *conf, sector_t sector,
 438		  int previous, int noblock, int noquiesce)
 439{
 440	struct stripe_head *sh;
 441
 442	pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
 443
 444	spin_lock_irq(&conf->device_lock);
 445
 446	do {
 447		wait_event_lock_irq(conf->wait_for_stripe,
 448				    conf->quiesce == 0 || noquiesce,
 449				    conf->device_lock, /* nothing */);
 450		sh = __find_stripe(conf, sector, conf->generation - previous);
 451		if (!sh) {
 452			if (!conf->inactive_blocked)
 453				sh = get_free_stripe(conf);
 454			if (noblock && sh == NULL)
 455				break;
 456			if (!sh) {
 457				conf->inactive_blocked = 1;
 458				wait_event_lock_irq(conf->wait_for_stripe,
 459						    !list_empty(&conf->inactive_list) &&
 460						    (atomic_read(&conf->active_stripes)
 461						     < (conf->max_nr_stripes *3/4)
 462						     || !conf->inactive_blocked),
 463						    conf->device_lock,
 464						    );
 465				conf->inactive_blocked = 0;
 466			} else
 467				init_stripe(sh, sector, previous);
 468		} else {
 469			if (atomic_read(&sh->count)) {
 470				BUG_ON(!list_empty(&sh->lru)
 471				    && !test_bit(STRIPE_EXPANDING, &sh->state));
 472			} else {
 473				if (!test_bit(STRIPE_HANDLE, &sh->state))
 474					atomic_inc(&conf->active_stripes);
 475				if (list_empty(&sh->lru) &&
 476				    !test_bit(STRIPE_EXPANDING, &sh->state))
 477					BUG();
 478				list_del_init(&sh->lru);
 479			}
 480		}
 481	} while (sh == NULL);
 482
 483	if (sh)
 484		atomic_inc(&sh->count);
 485
 486	spin_unlock_irq(&conf->device_lock);
 487	return sh;
 488}
 489
 490static void
 491raid5_end_read_request(struct bio *bi, int error);
 492static void
 493raid5_end_write_request(struct bio *bi, int error);
 494
 495static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 496{
 497	raid5_conf_t *conf = sh->raid_conf;
 498	int i, disks = sh->disks;
 499
 500	might_sleep();
 501
 502	for (i = disks; i--; ) {
 503		int rw;
 504		struct bio *bi;
 505		mdk_rdev_t *rdev;
 506		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
 507			if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
 508				rw = WRITE_FUA;
 509			else
 510				rw = WRITE;
 511		} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
 512			rw = READ;
 513		else
 514			continue;
 515
 516		bi = &sh->dev[i].req;
 517
 518		bi->bi_rw = rw;
 519		if (rw & WRITE)
 520			bi->bi_end_io = raid5_end_write_request;
 521		else
 522			bi->bi_end_io = raid5_end_read_request;
 523
 524		rcu_read_lock();
 525		rdev = rcu_dereference(conf->disks[i].rdev);
 526		if (rdev && test_bit(Faulty, &rdev->flags))
 527			rdev = NULL;
 528		if (rdev)
 529			atomic_inc(&rdev->nr_pending);
 530		rcu_read_unlock();
 531
 532		if (rdev) {
 533			if (s->syncing || s->expanding || s->expanded)
 534				md_sync_acct(rdev->bdev, STRIPE_SECTORS);
 535
 536			set_bit(STRIPE_IO_STARTED, &sh->state);
 537
 538			bi->bi_bdev = rdev->bdev;
 539			pr_debug("%s: for %llu schedule op %ld on disc %d\n",
 540				__func__, (unsigned long long)sh->sector,
 541				bi->bi_rw, i);
 542			atomic_inc(&sh->count);
 543			bi->bi_sector = sh->sector + rdev->data_offset;
 544			bi->bi_flags = 1 << BIO_UPTODATE;
 545			bi->bi_vcnt = 1;
 546			bi->bi_max_vecs = 1;
 547			bi->bi_idx = 0;
 548			bi->bi_io_vec = &sh->dev[i].vec;
 549			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
 550			bi->bi_io_vec[0].bv_offset = 0;
 551			bi->bi_size = STRIPE_SIZE;
 552			bi->bi_next = NULL;
 553			if ((rw & WRITE) &&
 554			    test_bit(R5_ReWrite, &sh->dev[i].flags))
 555				atomic_add(STRIPE_SECTORS,
 556					&rdev->corrected_errors);
 557			generic_make_request(bi);
 558		} else {
 559			if (rw & WRITE)
 560				set_bit(STRIPE_DEGRADED, &sh->state);
 561			pr_debug("skip op %ld on disc %d for sector %llu\n",
 562				bi->bi_rw, i, (unsigned long long)sh->sector);
 563			clear_bit(R5_LOCKED, &sh->dev[i].flags);
 564			set_bit(STRIPE_HANDLE, &sh->state);
 565		}
 566	}
 567}
 568
 569static struct dma_async_tx_descriptor *
 570async_copy_data(int frombio, struct bio *bio, struct page *page,
 571	sector_t sector, struct dma_async_tx_descriptor *tx)
 572{
 573	struct bio_vec *bvl;
 574	struct page *bio_page;
 575	int i;
 576	int page_offset;
 577	struct async_submit_ctl submit;
 578	enum async_tx_flags flags = 0;
 579
 580	if (bio->bi_sector >= sector)
 581		page_offset = (signed)(bio->bi_sector - sector) * 512;
 582	else
 583		page_offset = (signed)(sector - bio->bi_sector) * -512;
 584
 585	if (frombio)
 586		flags |= ASYNC_TX_FENCE;
 587	init_async_submit(&submit, flags, tx, NULL, NULL, NULL);
 588
 589	bio_for_each_segment(bvl, bio, i) {
 590		int len = bvl->bv_len;
 591		int clen;
 592		int b_offset = 0;
 593
 594		if (page_offset < 0) {
 595			b_offset = -page_offset;
 596			page_offset += b_offset;
 597			len -= b_offset;
 598		}
 599
 600		if (len > 0 && page_offset + len > STRIPE_SIZE)
 601			clen = STRIPE_SIZE - page_offset;
 602		else
 603			clen = len;
 604
 605		if (clen > 0) {
 606			b_offset += bvl->bv_offset;
 607			bio_page = bvl->bv_page;
 608			if (frombio)
 609				tx = async_memcpy(page, bio_page, page_offset,
 610						  b_offset, clen, &submit);
 611			else
 612				tx = async_memcpy(bio_page, page, b_offset,
 613						  page_offset, clen, &submit);
 614		}
 615		/* chain the operations */
 616		submit.depend_tx = tx;
 617
 618		if (clen < len) /* hit end of page */
 619			break;
 620		page_offset +=  len;
 621	}
 622
 623	return tx;
 624}
 625
 626static void ops_complete_biofill(void *stripe_head_ref)
 627{
 628	struct stripe_head *sh = stripe_head_ref;
 629	struct bio *return_bi = NULL;
 630	raid5_conf_t *conf = sh->raid_conf;
 631	int i;
 632
 633	pr_debug("%s: stripe %llu\n", __func__,
 634		(unsigned long long)sh->sector);
 635
 636	/* clear completed biofills */
 637	spin_lock_irq(&conf->device_lock);
 638	for (i = sh->disks; i--; ) {
 639		struct r5dev *dev = &sh->dev[i];
 640
 641		/* acknowledge completion of a biofill operation */
 642		/* and check if we need to reply to a read request,
 643		 * new R5_Wantfill requests are held off until
 644		 * !STRIPE_BIOFILL_RUN
 645		 */
 646		if (test_and_clear_bit(R5_Wantfill, &dev->flags)) {
 647			struct bio *rbi, *rbi2;
 648
 649			BUG_ON(!dev->read);
 650			rbi = dev->read;
 651			dev->read = NULL;
 652			while (rbi && rbi->bi_sector <
 653				dev->sector + STRIPE_SECTORS) {
 654				rbi2 = r5_next_bio(rbi, dev->sector);
 655				if (!raid5_dec_bi_phys_segments(rbi)) {
 656					rbi->bi_next = return_bi;
 657					return_bi = rbi;
 658				}
 659				rbi = rbi2;
 660			}
 661		}
 662	}
 663	spin_unlock_irq(&conf->device_lock);
 664	clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
 665
 666	return_io(return_bi);
 667
 668	set_bit(STRIPE_HANDLE, &sh->state);
 669	release_stripe(sh);
 670}
 671
 672static void ops_run_biofill(struct stripe_head *sh)
 673{
 674	struct dma_async_tx_descriptor *tx = NULL;
 675	raid5_conf_t *conf = sh->raid_conf;
 676	struct async_submit_ctl submit;
 677	int i;
 678
 679	pr_debug("%s: stripe %llu\n", __func__,
 680		(unsigned long long)sh->sector);
 681
 682	for (i = sh->disks; i--; ) {
 683		struct r5dev *dev = &sh->dev[i];
 684		if (test_bit(R5_Wantfill, &dev->flags)) {
 685			struct bio *rbi;
 686			spin_lock_irq(&conf->device_lock);
 687			dev->read = rbi = dev->toread;
 688			dev->toread = NULL;
 689			spin_unlock_irq(&conf->device_lock);
 690			while (rbi && rbi->bi_sector <
 691				dev->sector + STRIPE_SECTORS) {
 692				tx = async_copy_data(0, rbi, dev->page,
 693					dev->sector, tx);
 694				rbi = r5_next_bio(rbi, dev->sector);
 695			}
 696		}
 697	}
 698
 699	atomic_inc(&sh->count);
 700	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
 701	async_trigger_callback(&submit);
 702}
 703
 704static void mark_target_uptodate(struct stripe_head *sh, int target)
 705{
 706	struct r5dev *tgt;
 707
 708	if (target < 0)
 709		return;
 710
 711	tgt = &sh->dev[target];
 712	set_bit(R5_UPTODATE, &tgt->flags);
 713	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
 714	clear_bit(R5_Wantcompute, &tgt->flags);
 715}
 716
 717static void ops_complete_compute(void *stripe_head_ref)
 718{
 719	struct stripe_head *sh = stripe_head_ref;
 720
 721	pr_debug("%s: stripe %llu\n", __func__,
 722		(unsigned long long)sh->sector);
 723
 724	/* mark the computed target(s) as uptodate */
 725	mark_target_uptodate(sh, sh->ops.target);
 726	mark_target_uptodate(sh, sh->ops.target2);
 727
 728	clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
 729	if (sh->check_state == check_state_compute_run)
 730		sh->check_state = check_state_compute_result;
 731	set_bit(STRIPE_HANDLE, &sh->state);
 732	release_stripe(sh);
 733}
 734
 735/* return a pointer to the address conversion region of the scribble buffer */
 736static addr_conv_t *to_addr_conv(struct stripe_head *sh,
 737				 struct raid5_percpu *percpu)
 738{
 739	return percpu->scribble + sizeof(struct page *) * (sh->disks + 2);
 740}
 741
 742static struct dma_async_tx_descriptor *
 743ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
 744{
 745	int disks = sh->disks;
 746	struct page **xor_srcs = percpu->scribble;
 747	int target = sh->ops.target;
 748	struct r5dev *tgt = &sh->dev[target];
 749	struct page *xor_dest = tgt->page;
 750	int count = 0;
 751	struct dma_async_tx_descriptor *tx;
 752	struct async_submit_ctl submit;
 753	int i;
 754
 755	pr_debug("%s: stripe %llu block: %d\n",
 756		__func__, (unsigned long long)sh->sector, target);
 757	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
 758
 759	for (i = disks; i--; )
 760		if (i != target)
 761			xor_srcs[count++] = sh->dev[i].page;
 762
 763	atomic_inc(&sh->count);
 764
 765	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
 766			  ops_complete_compute, sh, to_addr_conv(sh, percpu));
 767	if (unlikely(count == 1))
 768		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
 769	else
 770		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
 771
 772	return tx;
 773}
 774
 775/* set_syndrome_sources - populate source buffers for gen_syndrome
 776 * @srcs - (struct page *) array of size sh->disks
 777 * @sh - stripe_head to parse
 778 *
 779 * Populates srcs in proper layout order for the stripe and returns the
 780 * 'count' of sources to be used in a call to async_gen_syndrome.  The P
 781 * destination buffer is recorded in srcs[count] and the Q destination
  782 * is recorded in srcs[count+1].
 783 */
 784static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
 785{
 786	int disks = sh->disks;
 787	int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
 788	int d0_idx = raid6_d0(sh);
 789	int count;
 790	int i;
 791
 792	for (i = 0; i < disks; i++)
 793		srcs[i] = NULL;
 794
 795	count = 0;
 796	i = d0_idx;
 797	do {
 798		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
 799
 800		srcs[slot] = sh->dev[i].page;
 801		i = raid6_next_disk(i, disks);
 802	} while (i != d0_idx);
 803
 804	return syndrome_disks;
 805}
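/*
 * Illustrative sketch (not part of the driver; not compiled, hypothetical
 * example_* helper): callers pass count+2 blocks to async_gen_syndrome()
 * so that the P and Q destinations recorded at srcs[count] and
 * srcs[count+1] are included, as done in ops_run_reconstruct6() below.
 */
#if 0
static void example_gen_syndrome(struct stripe_head *sh,
				 struct raid5_percpu *percpu,
				 struct async_submit_ctl *submit)
{
	struct page **blocks = percpu->scribble;
	int count = set_syndrome_sources(blocks, sh);

	async_gen_syndrome(blocks, 0, count + 2, STRIPE_SIZE, submit);
}
#endif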
 806
 807static struct dma_async_tx_descriptor *
 808ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
 809{
 810	int disks = sh->disks;
 811	struct page **blocks = percpu->scribble;
 812	int target;
 813	int qd_idx = sh->qd_idx;
 814	struct dma_async_tx_descriptor *tx;
 815	struct async_submit_ctl submit;
 816	struct r5dev *tgt;
 817	struct page *dest;
 818	int i;
 819	int count;
 820
 821	if (sh->ops.target < 0)
 822		target = sh->ops.target2;
 823	else if (sh->ops.target2 < 0)
 824		target = sh->ops.target;
 825	else
 826		/* we should only have one valid target */
 827		BUG();
 828	BUG_ON(target < 0);
 829	pr_debug("%s: stripe %llu block: %d\n",
 830		__func__, (unsigned long long)sh->sector, target);
 831
 832	tgt = &sh->dev[target];
 833	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
 834	dest = tgt->page;
 835
 836	atomic_inc(&sh->count);
 837
 838	if (target == qd_idx) {
 839		count = set_syndrome_sources(blocks, sh);
 840		blocks[count] = NULL; /* regenerating p is not necessary */
 841		BUG_ON(blocks[count+1] != dest); /* q should already be set */
 842		init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
 843				  ops_complete_compute, sh,
 844				  to_addr_conv(sh, percpu));
 845		tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
 846	} else {
 847		/* Compute any data- or p-drive using XOR */
 848		count = 0;
 849		for (i = disks; i-- ; ) {
 850			if (i == target || i == qd_idx)
 851				continue;
 852			blocks[count++] = sh->dev[i].page;
 853		}
 854
 855		init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
 856				  NULL, ops_complete_compute, sh,
 857				  to_addr_conv(sh, percpu));
 858		tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
 859	}
 860
 861	return tx;
 862}
 863
 864static struct dma_async_tx_descriptor *
 865ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
 866{
 867	int i, count, disks = sh->disks;
 868	int syndrome_disks = sh->ddf_layout ? disks : disks-2;
 869	int d0_idx = raid6_d0(sh);
 870	int faila = -1, failb = -1;
 871	int target = sh->ops.target;
 872	int target2 = sh->ops.target2;
 873	struct r5dev *tgt = &sh->dev[target];
 874	struct r5dev *tgt2 = &sh->dev[target2];
 875	struct dma_async_tx_descriptor *tx;
 876	struct page **blocks = percpu->scribble;
 877	struct async_submit_ctl submit;
 878
 879	pr_debug("%s: stripe %llu block1: %d block2: %d\n",
 880		 __func__, (unsigned long long)sh->sector, target, target2);
 881	BUG_ON(target < 0 || target2 < 0);
 882	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
 883	BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));
 884
 885	/* we need to open-code set_syndrome_sources to handle the
 886	 * slot number conversion for 'faila' and 'failb'
 887	 */
 888	for (i = 0; i < disks ; i++)
 889		blocks[i] = NULL;
 890	count = 0;
 891	i = d0_idx;
 892	do {
 893		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
 894
 895		blocks[slot] = sh->dev[i].page;
 896
 897		if (i == target)
 898			faila = slot;
 899		if (i == target2)
 900			failb = slot;
 901		i = raid6_next_disk(i, disks);
 902	} while (i != d0_idx);
 903
 904	BUG_ON(faila == failb);
 905	if (failb < faila)
 906		swap(faila, failb);
 907	pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
 908		 __func__, (unsigned long long)sh->sector, faila, failb);
 909
 910	atomic_inc(&sh->count);
 911
 912	if (failb == syndrome_disks+1) {
 913		/* Q disk is one of the missing disks */
 914		if (faila == syndrome_disks) {
 915			/* Missing P+Q, just recompute */
 916			init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
 917					  ops_complete_compute, sh,
 918					  to_addr_conv(sh, percpu));
 919			return async_gen_syndrome(blocks, 0, syndrome_disks+2,
 920						  STRIPE_SIZE, &submit);
 921		} else {
 922			struct page *dest;
 923			int data_target;
 924			int qd_idx = sh->qd_idx;
 925
 926			/* Missing D+Q: recompute D from P, then recompute Q */
 927			if (target == qd_idx)
 928				data_target = target2;
 929			else
 930				data_target = target;
 931
 932			count = 0;
 933			for (i = disks; i-- ; ) {
 934				if (i == data_target || i == qd_idx)
 935					continue;
 936				blocks[count++] = sh->dev[i].page;
 937			}
 938			dest = sh->dev[data_target].page;
 939			init_async_submit(&submit,
 940					  ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
 941					  NULL, NULL, NULL,
 942					  to_addr_conv(sh, percpu));
 943			tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
 944				       &submit);
 945
 946			count = set_syndrome_sources(blocks, sh);
 947			init_async_submit(&submit, ASYNC_TX_FENCE, tx,
 948					  ops_complete_compute, sh,
 949					  to_addr_conv(sh, percpu));
 950			return async_gen_syndrome(blocks, 0, count+2,
 951						  STRIPE_SIZE, &submit);
 952		}
 953	} else {
 954		init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
 955				  ops_complete_compute, sh,
 956				  to_addr_conv(sh, percpu));
 957		if (failb == syndrome_disks) {
 958			/* We're missing D+P. */
 959			return async_raid6_datap_recov(syndrome_disks+2,
 960						       STRIPE_SIZE, faila,
 961						       blocks, &submit);
 962		} else {
 963			/* We're missing D+D. */
 964			return async_raid6_2data_recov(syndrome_disks+2,
 965						       STRIPE_SIZE, faila, failb,
 966						       blocks, &submit);
 967		}
 968	}
 969}
 970
 971
 972static void ops_complete_prexor(void *stripe_head_ref)
 973{
 974	struct stripe_head *sh = stripe_head_ref;
 975
 976	pr_debug("%s: stripe %llu\n", __func__,
 977		(unsigned long long)sh->sector);
 978}
 979
 980static struct dma_async_tx_descriptor *
 981ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
 982	       struct dma_async_tx_descriptor *tx)
 983{
 984	int disks = sh->disks;
 985	struct page **xor_srcs = percpu->scribble;
 986	int count = 0, pd_idx = sh->pd_idx, i;
 987	struct async_submit_ctl submit;
 988
 989	/* existing parity data subtracted */
 990	struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
 991
 992	pr_debug("%s: stripe %llu\n", __func__,
 993		(unsigned long long)sh->sector);
 994
 995	for (i = disks; i--; ) {
 996		struct r5dev *dev = &sh->dev[i];
 997		/* Only process blocks that are known to be uptodate */
 998		if (test_bit(R5_Wantdrain, &dev->flags))
 999			xor_srcs[count++] = dev->page;
1000	}
1001
1002	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
1003			  ops_complete_prexor, sh, to_addr_conv(sh, percpu));
1004	tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
1005
1006	return tx;
1007}
1008
1009static struct dma_async_tx_descriptor *
1010ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
1011{
1012	int disks = sh->disks;
1013	int i;
1014
1015	pr_debug("%s: stripe %llu\n", __func__,
1016		(unsigned long long)sh->sector);
1017
1018	for (i = disks; i--; ) {
1019		struct r5dev *dev = &sh->dev[i];
1020		struct bio *chosen;
1021
1022		if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
1023			struct bio *wbi;
1024
1025			spin_lock(&sh->lock);
1026			chosen = dev->towrite;
1027			dev->towrite = NULL;
1028			BUG_ON(dev->written);
1029			wbi = dev->written = chosen;
1030			spin_unlock(&sh->lock);
1031
1032			while (wbi && wbi->bi_sector <
1033				dev->sector + STRIPE_SECTORS) {
1034				if (wbi->bi_rw & REQ_FUA)
1035					set_bit(R5_WantFUA, &dev->flags);
1036				tx = async_copy_data(1, wbi, dev->page,
1037					dev->sector, tx);
1038				wbi = r5_next_bio(wbi, dev->sector);
1039			}
1040		}
1041	}
1042
1043	return tx;
1044}
1045
1046static void ops_complete_reconstruct(void *stripe_head_ref)
1047{
1048	struct stripe_head *sh = stripe_head_ref;
1049	int disks = sh->disks;
1050	int pd_idx = sh->pd_idx;
1051	int qd_idx = sh->qd_idx;
1052	int i;
1053	bool fua = false;
1054
1055	pr_debug("%s: stripe %llu\n", __func__,
1056		(unsigned long long)sh->sector);
1057
1058	for (i = disks; i--; )
1059		fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
1060
1061	for (i = disks; i--; ) {
1062		struct r5dev *dev = &sh->dev[i];
1063
1064		if (dev->written || i == pd_idx || i == qd_idx) {
1065			set_bit(R5_UPTODATE, &dev->flags);
1066			if (fua)
1067				set_bit(R5_WantFUA, &dev->flags);
1068		}
1069	}
1070
1071	if (sh->reconstruct_state == reconstruct_state_drain_run)
1072		sh->reconstruct_state = reconstruct_state_drain_result;
1073	else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
1074		sh->reconstruct_state = reconstruct_state_prexor_drain_result;
1075	else {
1076		BUG_ON(sh->reconstruct_state != reconstruct_state_run);
1077		sh->reconstruct_state = reconstruct_state_result;
1078	}
1079
1080	set_bit(STRIPE_HANDLE, &sh->state);
1081	release_stripe(sh);
1082}
1083
1084static void
1085ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
1086		     struct dma_async_tx_descriptor *tx)
1087{
1088	int disks = sh->disks;
1089	struct page **xor_srcs = percpu->scribble;
1090	struct async_submit_ctl submit;
1091	int count = 0, pd_idx = sh->pd_idx, i;
1092	struct page *xor_dest;
1093	int prexor = 0;
1094	unsigned long flags;
1095
1096	pr_debug("%s: stripe %llu\n", __func__,
1097		(unsigned long long)sh->sector);
1098
1099	/* check if prexor is active which means only process blocks
1100	 * that are part of a read-modify-write (written)
1101	 */
1102	if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
1103		prexor = 1;
1104		xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
1105		for (i = disks; i--; ) {
1106			struct r5dev *dev = &sh->dev[i];
1107			if (dev->written)
1108				xor_srcs[count++] = dev->page;
1109		}
1110	} else {
1111		xor_dest = sh->dev[pd_idx].page;
1112		for (i = disks; i--; ) {
1113			struct r5dev *dev = &sh->dev[i];
1114			if (i != pd_idx)
1115				xor_srcs[count++] = dev->page;
1116		}
1117	}
1118
 1119	/* 1/ if we prexor'd then the dest is reused as a source
 1120	 * 2/ if we did not prexor then we are redoing the parity;
 1121	 * set ASYNC_TX_XOR_DROP_DST or ASYNC_TX_XOR_ZERO_DST respectively
 1122	 * for the synchronous xor case
 1123	 */
1124	flags = ASYNC_TX_ACK |
1125		(prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
1126
1127	atomic_inc(&sh->count);
1128
1129	init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh,
1130			  to_addr_conv(sh, percpu));
1131	if (unlikely(count == 1))
1132		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
1133	else
1134		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
1135}
1136
1137static void
1138ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
1139		     struct dma_async_tx_descriptor *tx)
1140{
1141	struct async_submit_ctl submit;
1142	struct page **blocks = percpu->scribble;
1143	int count;
1144
1145	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
1146
1147	count = set_syndrome_sources(blocks, sh);
1148
1149	atomic_inc(&sh->count);
1150
1151	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
1152			  sh, to_addr_conv(sh, percpu));
1153	async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE,  &submit);
1154}
1155
1156static void ops_complete_check(void *stripe_head_ref)
1157{
1158	struct stripe_head *sh = stripe_head_ref;
1159
1160	pr_debug("%s: stripe %llu\n", __func__,
1161		(unsigned long long)sh->sector);
1162
1163	sh->check_state = check_state_check_result;
1164	set_bit(STRIPE_HANDLE, &sh->state);
1165	release_stripe(sh);
1166}
1167
1168static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
1169{
1170	int disks = sh->disks;
1171	int pd_idx = sh->pd_idx;
1172	int qd_idx = sh->qd_idx;
1173	struct page *xor_dest;
1174	struct page **xor_srcs = percpu->scribble;
1175	struct dma_async_tx_descriptor *tx;
1176	struct async_submit_ctl submit;
1177	int count;
1178	int i;
1179
1180	pr_debug("%s: stripe %llu\n", __func__,
1181		(unsigned long long)sh->sector);
1182
1183	count = 0;
1184	xor_dest = sh->dev[pd_idx].page;
1185	xor_srcs[count++] = xor_dest;
1186	for (i = disks; i--; ) {
1187		if (i == pd_idx || i == qd_idx)
1188			continue;
1189		xor_srcs[count++] = sh->dev[i].page;
1190	}
1191
1192	init_async_submit(&submit, 0, NULL, NULL, NULL,
1193			  to_addr_conv(sh, percpu));
1194	tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
1195			   &sh->ops.zero_sum_result, &submit);
1196
1197	atomic_inc(&sh->count);
1198	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL);
1199	tx = async_trigger_callback(&submit);
1200}
1201
1202static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
1203{
1204	struct page **srcs = percpu->scribble;
1205	struct async_submit_ctl submit;
1206	int count;
1207
1208	pr_debug("%s: stripe %llu checkp: %d\n", __func__,
1209		(unsigned long long)sh->sector, checkp);
1210
1211	count = set_syndrome_sources(srcs, sh);
1212	if (!checkp)
1213		srcs[count] = NULL;
1214
1215	atomic_inc(&sh->count);
1216	init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
1217			  sh, to_addr_conv(sh, percpu));
1218	async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
1219			   &sh->ops.zero_sum_result, percpu->spare_page, &submit);
1220}
1221
1222static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
1223{
1224	int overlap_clear = 0, i, disks = sh->disks;
1225	struct dma_async_tx_descriptor *tx = NULL;
1226	raid5_conf_t *conf = sh->raid_conf;
1227	int level = conf->level;
1228	struct raid5_percpu *percpu;
1229	unsigned long cpu;
1230
1231	cpu = get_cpu();
1232	percpu = per_cpu_ptr(conf->percpu, cpu);
1233	if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
1234		ops_run_biofill(sh);
1235		overlap_clear++;
1236	}
1237
1238	if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
1239		if (level < 6)
1240			tx = ops_run_compute5(sh, percpu);
1241		else {
1242			if (sh->ops.target2 < 0 || sh->ops.target < 0)
1243				tx = ops_run_compute6_1(sh, percpu);
1244			else
1245				tx = ops_run_compute6_2(sh, percpu);
1246		}
1247		/* terminate the chain if reconstruct is not set to be run */
1248		if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
1249			async_tx_ack(tx);
1250	}
1251
1252	if (test_bit(STRIPE_OP_PREXOR, &ops_request))
1253		tx = ops_run_prexor(sh, percpu, tx);
1254
1255	if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
1256		tx = ops_run_biodrain(sh, tx);
1257		overlap_clear++;
1258	}
1259
1260	if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
1261		if (level < 6)
1262			ops_run_reconstruct5(sh, percpu, tx);
1263		else
1264			ops_run_reconstruct6(sh, percpu, tx);
1265	}
1266
1267	if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
1268		if (sh->check_state == check_state_run)
1269			ops_run_check_p(sh, percpu);
1270		else if (sh->check_state == check_state_run_q)
1271			ops_run_check_pq(sh, percpu, 0);
1272		else if (sh->check_state == check_state_run_pq)
1273			ops_run_check_pq(sh, percpu, 1);
1274		else
1275			BUG();
1276	}
1277
1278	if (overlap_clear)
1279		for (i = disks; i--; ) {
1280			struct r5dev *dev = &sh->dev[i];
1281			if (test_and_clear_bit(R5_Overlap, &dev->flags))
1282				wake_up(&sh->raid_conf->wait_for_overlap);
1283		}
1284	put_cpu();
1285}
1286
1287#ifdef CONFIG_MULTICORE_RAID456
1288static void async_run_ops(void *param, async_cookie_t cookie)
1289{
1290	struct stripe_head *sh = param;
1291	unsigned long ops_request = sh->ops.request;
1292
1293	clear_bit_unlock(STRIPE_OPS_REQ_PENDING, &sh->state);
1294	wake_up(&sh->ops.wait_for_ops);
1295
1296	__raid_run_ops(sh, ops_request);
1297	release_stripe(sh);
1298}
1299
1300static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
1301{
1302	/* since handle_stripe can be called outside of raid5d context
1303	 * we need to ensure sh->ops.request is de-staged before another
1304	 * request arrives
1305	 */
1306	wait_event(sh->ops.wait_for_ops,
1307		   !test_and_set_bit_lock(STRIPE_OPS_REQ_PENDING, &sh->state));
1308	sh->ops.request = ops_request;
1309
1310	atomic_inc(&sh->count);
1311	async_schedule(async_run_ops, sh);
1312}
1313#else
1314#define raid_run_ops __raid_run_ops
1315#endif
1316
1317static int grow_one_stripe(raid5_conf_t *conf)
1318{
1319	struct stripe_head *sh;
1320	sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL);
1321	if (!sh)
1322		return 0;
1323	memset(sh, 0, sizeof(*sh) + (conf->pool_size-1)*sizeof(struct r5dev));
1324	sh->raid_conf = conf;
1325	spin_lock_init(&sh->lock);
1326	#ifdef CONFIG_MULTICORE_RAID456
1327	init_waitqueue_head(&sh->ops.wait_for_ops);
1328	#endif
1329
1330	if (grow_buffers(sh)) {
1331		shrink_buffers(sh);
1332		kmem_cache_free(conf->slab_cache, sh);
1333		return 0;
1334	}
1335	/* we just created an active stripe so... */
1336	atomic_set(&sh->count, 1);
1337	atomic_inc(&conf->active_stripes);
1338	INIT_LIST_HEAD(&sh->lru);
1339	release_stripe(sh);
1340	return 1;
1341}
1342
1343static int grow_stripes(raid5_conf_t *conf, int num)
1344{
1345	struct kmem_cache *sc;
1346	int devs = max(conf->raid_disks, conf->previous_raid_disks);
1347
1348	if (conf->mddev->gendisk)
1349		sprintf(conf->cache_name[0],
1350			"raid%d-%s", conf->level, mdname(conf->mddev));
1351	else
1352		sprintf(conf->cache_name[0],
1353			"raid%d-%p", conf->level, conf->mddev);
1354	sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]);
1355
1356	conf->active_name = 0;
1357	sc = kmem_cache_create(conf->cache_name[conf->active_name],
1358			       sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
1359			       0, 0, NULL);
1360	if (!sc)
1361		return 1;
1362	conf->slab_cache = sc;
1363	conf->pool_size = devs;
1364	while (num--)
1365		if (!grow_one_stripe(conf))
1366			return 1;
1367	return 0;
1368}
1369
1370/**
1371 * scribble_len - return the required size of the scribble region
1372 * @num - total number of disks in the array
1373 *
1374 * The size must be enough to contain:
1375 * 1/ a struct page pointer for each device in the array +2
1376 * 2/ room to convert each entry in (1) to its corresponding dma
1377 *    (dma_map_page()) or page (page_address()) address.
1378 *
1379 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we
1380 * calculate over all devices (not just the data blocks), using zeros in place
1381 * of the P and Q blocks.
1382 */
1383static size_t scribble_len(int num)
1384{
1385	size_t len;
1386
1387	len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);
1388
1389	return len;
1390}
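/*
 * Illustrative sketch (not part of the driver; not compiled, hypothetical
 * example_* helper): the layout of a scribble region of scribble_len(num)
 * bytes.  The first num+2 slots hold the (struct page *) list handed to
 * the async_tx routines; to_addr_conv() points just past them, at the
 * addr_conv_t area.
 */
#if 0
static void example_scribble_layout(struct raid5_percpu *percpu, int num)
{
	struct page **srcs = percpu->scribble;		     /* num+2 page pointers */
	addr_conv_t *conv = (addr_conv_t *)(srcs + num + 2); /* num+2 conv entries */
}
#endif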
1391
1392static int resize_stripes(raid5_conf_t *conf, int newsize)
1393{
1394	/* Make all the stripes able to hold 'newsize' devices.
1395	 * New slots in each stripe get 'page' set to a new page.
1396	 *
1397	 * This happens in stages:
1398	 * 1/ create a new kmem_cache and allocate the required number of
1399	 *    stripe_heads.
 1400	 * 2/ gather all the old stripe_heads and transfer the pages across
1401	 *    to the new stripe_heads.  This will have the side effect of
1402	 *    freezing the array as once all stripe_heads have been collected,
1403	 *    no IO will be possible.  Old stripe heads are freed once their
1404	 *    pages have been transferred over, and the old kmem_cache is
1405	 *    freed when all stripes are done.
 1406	 * 3/ reallocate conf->disks to be suitably bigger.  If this fails,
 1407	 *    we simply return a failure status - no need to clean anything up.
1408	 * 4/ allocate new pages for the new slots in the new stripe_heads.
 1409	 *    If this fails, we don't bother trying to shrink the
1410	 *    stripe_heads down again, we just leave them as they are.
1411	 *    As each stripe_head is processed the new one is released into
1412	 *    active service.
1413	 *
1414	 * Once step2 is started, we cannot afford to wait for a write,
1415	 * so we use GFP_NOIO allocations.
1416	 */
1417	struct stripe_head *osh, *nsh;
1418	LIST_HEAD(newstripes);
1419	struct disk_info *ndisks;
1420	unsigned long cpu;
1421	int err;
1422	struct kmem_cache *sc;
1423	int i;
1424
1425	if (newsize <= conf->pool_size)
1426		return 0; /* never bother to shrink */
1427
1428	err = md_allow_write(conf->mddev);
1429	if (err)
1430		return err;
1431
1432	/* Step 1 */
1433	sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
1434			       sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
1435			       0, 0, NULL);
1436	if (!sc)
1437		return -ENOMEM;
1438
1439	for (i = conf->max_nr_stripes; i; i--) {
1440		nsh = kmem_cache_alloc(sc, GFP_KERNEL);
1441		if (!nsh)
1442			break;
1443
1444		memset(nsh, 0, sizeof(*nsh) + (newsize-1)*sizeof(struct r5dev));
1445
1446		nsh->raid_conf = conf;
1447		spin_lock_init(&nsh->lock);
1448		#ifdef CONFIG_MULTICORE_RAID456
1449		init_waitqueue_head(&nsh->ops.wait_for_ops);
1450		#endif
1451
1452		list_add(&nsh->lru, &newstripes);
1453	}
1454	if (i) {
1455		/* didn't get enough, give up */
1456		while (!list_empty(&newstripes)) {
1457			nsh = list_entry(newstripes.next, struct stripe_head, lru);
1458			list_del(&nsh->lru);
1459			kmem_cache_free(sc, nsh);
1460		}
1461		kmem_cache_destroy(sc);
1462		return -ENOMEM;
1463	}
1464	/* Step 2 - Must use GFP_NOIO now.
1465	 * OK, we have enough stripes, start collecting inactive
1466	 * stripes and copying them over
1467	 */
1468	list_for_each_entry(nsh, &newstripes, lru) {
1469		spin_lock_irq(&conf->device_lock);
1470		wait_event_lock_irq(conf->wait_for_stripe,
1471				    !list_empty(&conf->inactive_list),
1472				    conf->device_lock,
1473				    );
1474		osh = get_free_stripe(conf);
1475		spin_unlock_irq(&conf->device_lock);
1476		atomic_set(&nsh->count, 1);
1477		for(i=0; i<conf->pool_size; i++)
1478			nsh->dev[i].page = osh->dev[i].page;
1479		for( ; i<newsize; i++)
1480			nsh->dev[i].page = NULL;
1481		kmem_cache_free(conf->slab_cache, osh);
1482	}
1483	kmem_cache_destroy(conf->slab_cache);
1484
1485	/* Step 3.
1486	 * At this point, we are holding all the stripes so the array
1487	 * is completely stalled, so now is a good time to resize
1488	 * conf->disks and the scribble region
1489	 */
1490	ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
1491	if (ndisks) {
1492		for (i=0; i<conf->raid_disks; i++)
1493			ndisks[i] = conf->disks[i];
1494		kfree(conf->disks);
1495		conf->disks = ndisks;
1496	} else
1497		err = -ENOMEM;
1498
1499	get_online_cpus();
1500	conf->scribble_len = scribble_len(newsize);
1501	for_each_present_cpu(cpu) {
1502		struct raid5_percpu *percpu;
1503		void *scribble;
1504
1505		percpu = per_cpu_ptr(conf->percpu, cpu);
1506		scribble = kmalloc(conf->scribble_len, GFP_NOIO);
1507
1508		if (scribble) {
1509			kfree(percpu->scribble);
1510			percpu->scribble = scribble;
1511		} else {
1512			err = -ENOMEM;
1513			break;
1514		}
1515	}
1516	put_online_cpus();
1517
1518	/* Step 4, return new stripes to service */
1519	while(!list_empty(&newstripes)) {
1520		nsh = list_entry(newstripes.next, struct stripe_head, lru);
1521		list_del_init(&nsh->lru);
1522
1523		for (i=conf->raid_disks; i < newsize; i++)
1524			if (nsh->dev[i].page == NULL) {
1525				struct page *p = alloc_page(GFP_NOIO);
1526				nsh->dev[i].page = p;
1527				if (!p)
1528					err = -ENOMEM;
1529			}
1530		release_stripe(nsh);
1531	}
 1532	/* critical section passed, GFP_NOIO no longer needed */
1533
1534	conf->slab_cache = sc;
1535	conf->active_name = 1-conf->active_name;
1536	conf->pool_size = newsize;
1537	return err;
1538}
1539
1540static int drop_one_stripe(raid5_conf_t *conf)
1541{
1542	struct stripe_head *sh;
1543
1544	spin_lock_irq(&conf->device_lock);
1545	sh = get_free_stripe(conf);
1546	spin_unlock_irq(&conf->device_lock);
1547	if (!sh)
1548		return 0;
1549	BUG_ON(atomic_read(&sh->count));
1550	shrink_buffers(sh);
1551	kmem_cache_free(conf->slab_cache, sh);
1552	atomic_dec(&conf->active_stripes);
1553	return 1;
1554}
1555
1556static void shrink_stripes(raid5_conf_t *conf)
1557{
1558	while (drop_one_stripe(conf))
1559		;
1560
1561	if (conf->slab_cache)
1562		kmem_cache_destroy(conf->slab_cache);
1563	conf->slab_cache = NULL;
1564}
1565
1566static void raid5_end_read_request(struct bio * bi, int error)
1567{
1568	struct stripe_head *sh = bi->bi_private;
1569	raid5_conf_t *conf = sh->raid_conf;
1570	int disks = sh->disks, i;
1571	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
1572	char b[BDEVNAME_SIZE];
1573	mdk_rdev_t *rdev;
1574
1575
1576	for (i=0 ; i<disks; i++)
1577		if (bi == &sh->dev[i].req)
1578			break;
1579
1580	pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n",
1581		(unsigned long long)sh->sector, i, atomic_read(&sh->count),
1582		uptodate);
1583	if (i == disks) {
1584		BUG();
1585		return;
1586	}
1587
1588	if (uptodate) {
1589		set_bit(R5_UPTODATE, &sh->dev[i].flags);
1590		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
1591			rdev = conf->disks[i].rdev;
1592			printk_rl(KERN_INFO "md/raid:%s: read error corrected"
1593				  " (%lu sectors at %llu on %s)\n",
1594				  mdname(conf->mddev), STRIPE_SECTORS,
1595				  (unsigned long long)(sh->sector
1596						       + rdev->data_offset),
1597				  bdevname(rdev->bdev, b));
1598			clear_bit(R5_ReadError, &sh->dev[i].flags);
1599			clear_bit(R5_ReWrite, &sh->dev[i].flags);
1600		}
1601		if (atomic_read(&conf->disks[i].rdev->read_errors))
1602			atomic_set(&conf->disks[i].rdev->read_errors, 0);
1603	} else {
1604		const char *bdn = bdevname(conf->disks[i].rdev->bdev, b);
1605		int retry = 0;
1606		rdev = conf->disks[i].rdev;
1607
1608		clear_bit(R5_UPTODATE, &sh->dev[i].flags);
1609		atomic_inc(&rdev->read_errors);
1610		if (conf->mddev->degraded >= conf->max_degraded)
1611			printk_rl(KERN_WARNING
1612				  "md/raid:%s: read error not correctable "
1613				  "(sector %llu on %s).\n",
1614				  mdname(conf->mddev),
1615				  (unsigned long long)(sh->sector
1616						       + rdev->data_offset),
1617				  bdn);
1618		else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
1619			/* Oh, no!!! */
1620			printk_rl(KERN_WARNING
1621				  "md/raid:%s: read error NOT corrected!! "
1622				  "(sector %llu on %s).\n",
1623				  mdname(conf->mddev),
1624				  (unsigned long long)(sh->sector
1625						       + rdev->data_offset),
1626				  bdn);
1627		else if (atomic_read(&rdev->read_errors)
1628			 > conf->max_nr_stripes)
1629			printk(KERN_WARNING
1630			       "md/raid:%s: Too many read errors, failing device %s.\n",
1631			       mdname(conf->mddev), bdn);
1632		else
1633			retry = 1;
1634		if (retry)
1635			set_bit(R5_ReadError, &sh->dev[i].flags);
1636		else {
1637			clear_bit(R5_ReadError, &sh->dev[i].flags);
1638			clear_bit(R5_ReWrite, &sh->dev[i].flags);
1639			md_error(conf->mddev, rdev);
1640		}
1641	}
1642	rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
1643	clear_bit(R5_LOCKED, &sh->dev[i].flags);
1644	set_bit(STRIPE_HANDLE, &sh->state);
1645	release_stripe(sh);
1646}
1647
1648static void raid5_end_write_request(struct bio *bi, int error)
1649{
1650	struct stripe_head *sh = bi->bi_private;
1651	raid5_conf_t *conf = sh->raid_conf;
1652	int disks = sh->disks, i;
1653	int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
1654
1655	for (i=0 ; i<disks; i++)
1656		if (bi == &sh->dev[i].req)
1657			break;
1658
1659	pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n",
1660		(unsigned long long)sh->sector, i, atomic_read(&sh->count),
1661		uptodate);
1662	if (i == disks) {
1663		BUG();
1664		return;
1665	}
1666
1667	if (!uptodate)
1668		md_error(conf->mddev, conf->disks[i].rdev);
1669
1670	rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
1671	
1672	clear_bit(R5_LOCKED, &sh->dev[i].flags);
1673	set_bit(STRIPE_HANDLE, &sh->state);
1674	release_stripe(sh);
1675}
1676
1677
1678static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
1679	
1680static void raid5_build_block(struct stripe_head *sh, int i, int previous)
1681{
1682	struct r5dev *dev = &sh->dev[i];
1683
1684	bio_init(&dev->req);
1685	dev->req.bi_io_vec = &dev->vec;
1686	dev->req.bi_vcnt++;
1687	dev->req.bi_max_vecs++;
1688	dev->vec.bv_page = dev->page;
1689	dev->vec.bv_len = STRIPE_SIZE;
1690	dev->vec.bv_offset = 0;
1691
1692	dev->req.bi_sector = sh->sector;
1693	dev->req.bi_private = sh;
1694
1695	dev->flags = 0;
1696	dev->sector = compute_blocknr(sh, i, previous);
1697}
1698
1699static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1700{
1701	char b[BDEVNAME_SIZE];
1702	raid5_conf_t *conf = mddev->private;
1703	pr_debug("raid456: error called\n");
1704
1705	if (test_and_clear_bit(In_sync, &rdev->flags)) {
1706		unsigned long flags;
1707		spin_lock_irqsave(&conf->device_lock, flags);
1708		mddev->degraded++;
1709		spin_unlock_irqrestore(&conf->device_lock, flags);
1710		/*
1711		 * if recovery was running, make sure it aborts.
1712		 */
1713		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1714	}
1715	set_bit(Faulty, &rdev->flags);
1716	set_bit(MD_CHANGE_DEVS, &mddev->flags);
1717	printk(KERN_ALERT
1718	       "md/raid:%s: Disk failure on %s, disabling device.\n"
1719	       "md/raid:%s: Operation continuing on %d devices.\n",
1720	       mdname(mddev),
1721	       bdevname(rdev->bdev, b),
1722	       mdname(mddev),
1723	       conf->raid_disks - mddev->degraded);
1724}
1725
1726/*
1727 * Input: a 'big' sector number,
1728 * Output: index of the data and parity disk, and the sector # in them.
1729 */
1730static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
1731				     int previous, int *dd_idx,
1732				     struct stripe_head *sh)
1733{
1734	sector_t stripe, stripe2;
1735	sector_t chunk_number;
1736	unsigned int chunk_offset;
1737	int pd_idx, qd_idx;
1738	int ddf_layout = 0;
1739	sector_t new_sector;
1740	int algorithm = previous ? conf->prev_algo
1741				 : conf->algorithm;
1742	int sectors_per_chunk = previous ? conf->prev_chunk_sectors
1743					 : conf->chunk_sectors;
1744	int raid_disks = previous ? conf->previous_raid_disks
1745				  : conf->raid_disks;
1746	int data_disks = raid_disks - conf->max_degraded;
1747
1748	/* First compute the information on this sector */
1749
1750	/*
1751	 * Compute the chunk number and the sector offset inside the chunk
1752	 */
1753	chunk_offset = sector_div(r_sector, sectors_per_chunk);
1754	chunk_number = r_sector;
1755
1756	/*
1757	 * Compute the stripe number
1758	 */
1759	stripe = chunk_number;
1760	*dd_idx = sector_div(stripe, data_disks);
1761	stripe2 = stripe;
1762	/*
1763	 * Select the parity disk based on the user selected algorithm.
1764	 */
1765	pd_idx = qd_idx = ~0;
1766	switch(conf->level) {
1767	case 4:
1768		pd_idx = data_disks;
1769		break;
1770	case 5:
1771		switch (algorithm) {
1772		case ALGORITHM_LEFT_ASYMMETRIC:
1773			pd_idx = data_disks - sector_div(stripe2, raid_disks);
1774			if (*dd_idx >= pd_idx)
1775				(*dd_idx)++;
1776			break;
1777		case ALGORITHM_RIGHT_ASYMMETRIC:
1778			pd_idx = sector_div(stripe2, raid_disks);
1779			if (*dd_idx >= pd_idx)
1780				(*dd_idx)++;
1781			break;
1782		case ALGORITHM_LEFT_SYMMETRIC:
1783			pd_idx = data_disks - sector_div(stripe2, raid_disks);
1784			*dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
1785			break;
1786		case ALGORITHM_RIGHT_SYMMETRIC:
1787			pd_idx = sector_div(stripe2, raid_disks);
1788			*dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
1789			break;
1790		case ALGORITHM_PARITY_0:
1791			pd_idx = 0;
1792			(*dd_idx)++;
1793			break;
1794		case ALGORITHM_PARITY_N:
1795			pd_idx = data_disks;
1796			break;
1797		default:
1798			BUG();
1799		}
1800		break;
1801	case 6:
1802
1803		switch (algorithm) {
1804		case ALGORITHM_LEFT_ASYMMETRIC:
1805			pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
1806			qd_idx = pd_idx + 1;
1807			if (pd_idx == raid_disks-1) {
1808				(*dd_idx)++;	/* Q D D D P */
1809				qd_idx = 0;
1810			} else if (*dd_idx >= pd_idx)
1811				(*dd_idx) += 2; /* D D P Q D */
1812			break;
1813		case ALGORITHM_RIGHT_ASYMMETRIC:
1814			pd_idx = sector_div(stripe2, raid_disks);
1815			qd_idx = pd_idx + 1;
1816			if (pd_idx == raid_disks-1) {
1817				(*dd_idx)++;	/* Q D D D P */
1818				qd_idx = 0;
1819			} else if (*dd_idx >= pd_idx)
1820				(*dd_idx) += 2; /* D D P Q D */
1821			break;
1822		case ALGORITHM_LEFT_SYMMETRIC:
1823			pd_

Note: large files are truncated; the remainder of raid5.c is not shown in this listing.