/drivers/md/raid5.c
https://bitbucket.org/ndreys/linux-sunxi · C · 6022 lines · 4565 code · 661 blank · 796 comment · 1114 complexity · 55ed2e6a439dbba1eab15e8ff1006358 MD5 · raw file
Large files are truncated click here to view the full file
- /*
- * raid5.c : Multiple Devices driver for Linux
- * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
- * Copyright (C) 1999, 2000 Ingo Molnar
- * Copyright (C) 2002, 2003 H. Peter Anvin
- *
- * RAID-4/5/6 management functions.
- * Thanks to Penguin Computing for making the RAID-6 development possible
- * by donating a test server!
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * You should have received a copy of the GNU General Public License
- * (for example /usr/src/linux/COPYING); if not, write to the Free
- * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
- /*
- * BITMAP UNPLUGGING:
- *
- * The sequencing for updating the bitmap reliably is a little
- * subtle (and I got it wrong the first time) so it deserves some
- * explanation.
- *
- * We group bitmap updates into batches. Each batch has a number.
- * We may write out several batches at once, but that isn't very important.
- * conf->seq_write is the number of the last batch successfully written.
- * conf->seq_flush is the number of the last batch that was closed to
- * new additions.
- * When we discover that we will need to write to any block in a stripe
- * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
- * the number of the batch it will be in. This is seq_flush+1.
- * When we are ready to do a write, if that batch hasn't been written yet,
- * we plug the array and queue the stripe for later.
- * When an unplug happens, we increment seq_flush, thus closing the current
- * batch.
- * When we notice that seq_flush > seq_write, we write out all pending updates
- * to the bitmap, and advance seq_write to where seq_flush was.
- * This may occasionally write a bit out twice, but is sure never to
- * miss any bits.
- */
- #include <linux/blkdev.h>
- #include <linux/kthread.h>
- #include <linux/raid/pq.h>
- #include <linux/async_tx.h>
- #include <linux/async.h>
- #include <linux/seq_file.h>
- #include <linux/cpu.h>
- #include <linux/slab.h>
- #include "md.h"
- #include "raid5.h"
- #include "raid0.h"
- #include "bitmap.h"
/*
 * Stripe cache: sizing constants and the hash used to look stripes up
 * by their starting sector.
 */

#define NR_STRIPES		256
#define STRIPE_SIZE		PAGE_SIZE
/* shift that turns a sector number (1 << 9 bytes each) into a stripe number */
#define STRIPE_SHIFT		(PAGE_SHIFT - 9)
#define STRIPE_SECTORS		(STRIPE_SIZE >> 9)
#define IO_THRESHOLD		1
#define BYPASS_THRESHOLD	1
/* as many hash heads as fit in PAGE_SIZE bytes */
#define NR_HASH			(PAGE_SIZE / sizeof(struct hlist_head))
#define HASH_MASK		(NR_HASH - 1)

/* address of the hash bucket that holds the stripe containing 'sect' */
#define stripe_hash(conf, sect) \
	(&((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK]))
/* bio's attached to a stripe+device for I/O are linked together in bi_sector
 * order without overlap.  There may be several bio's per stripe+device, and
 * a bio could span several devices.
 * When walking this list for a particular stripe+device, we must never proceed
 * beyond a bio that extends past this device, as the next bio might no longer
 * be valid.
 * This macro is used to determine the 'next' bio in the list, given the sector
 * of the current stripe+device: it yields bio->bi_next while the bio ends
 * before sect + STRIPE_SECTORS, and NULL otherwise.
 *
 * Both arguments are fully parenthesized so that an expression argument
 * (e.g. "s & ~7") is evaluated as a unit.  'bio' is evaluated more than once.
 */
#define r5_next_bio(bio, sect) \
	((((bio)->bi_sector + ((bio)->bi_size >> 9)) < ((sect) + STRIPE_SECTORS)) \
	 ? (bio)->bi_next : NULL)
/*
 * Debugging aids for the driver
 */
#define RAID5_PARANOIA	1

#if defined(CONFIG_SMP) && RAID5_PARANOIA
/* relies on a variable named 'conf' being in scope at the call site */
# define CHECK_DEVLOCK()	assert_spin_locked(&conf->device_lock)
#else
# define CHECK_DEVLOCK()
#endif

/* in DEBUG builds the inline keywords expand to nothing */
#ifdef DEBUG
#define inline
#define __inline__
#endif

/* printk() that is only issued when printk_ratelimit() allows it */
#define printk_rl(...) ((void) (printk_ratelimit() && printk(__VA_ARGS__)))
- /*
- * We maintain a biased count of active stripes in the bottom 16 bits of
- * bi_phys_segments, and a count of processed stripes in the upper 16 bits
- */
- static inline int raid5_bi_phys_segments(struct bio *bio)
- {
- return bio->bi_phys_segments & 0xffff;
- }
- static inline int raid5_bi_hw_segments(struct bio *bio)
- {
- return (bio->bi_phys_segments >> 16) & 0xffff;
- }
- static inline int raid5_dec_bi_phys_segments(struct bio *bio)
- {
- --bio->bi_phys_segments;
- return raid5_bi_phys_segments(bio);
- }
- static inline int raid5_dec_bi_hw_segments(struct bio *bio)
- {
- unsigned short val = raid5_bi_hw_segments(bio);
- --val;
- bio->bi_phys_segments = (val << 16) | raid5_bi_phys_segments(bio);
- return val;
- }
- static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt)
- {
- bio->bi_phys_segments = raid5_bi_phys_segments(bio) | (cnt << 16);
- }
- /* Find first data disk in a raid6 stripe */
- static inline int raid6_d0(struct stripe_head *sh)
- {
- if (sh->ddf_layout)
- /* ddf always start from first device */
- return 0;
- /* md starts just after Q block */
- if (sh->qd_idx == sh->disks - 1)
- return 0;
- else
- return sh->qd_idx + 1;
- }
/* Return the disk index following 'disk', wrapping to 0 at raid_disks. */
static inline int raid6_next_disk(int disk, int raid_disks)
{
	int next = disk + 1;

	if (next >= raid_disks)
		next = 0;
	return next;
}
- /* When walking through the disks in a raid5, starting at raid6_d0,
- * We need to map each disk to a 'slot', where the data disks are slot
- * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
- * is raid_disks-1. This help does that mapping.
- */
- static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
- int *count, int syndrome_disks)
- {
- int slot = *count;
- if (sh->ddf_layout)
- (*count)++;
- if (idx == sh->pd_idx)
- return syndrome_disks;
- if (idx == sh->qd_idx)
- return syndrome_disks + 1;
- if (!sh->ddf_layout)
- (*count)++;
- return slot;
- }
- static void return_io(struct bio *return_bi)
- {
- struct bio *bi = return_bi;
- while (bi) {
- return_bi = bi->bi_next;
- bi->bi_next = NULL;
- bi->bi_size = 0;
- bio_endio(bi, 0);
- bi = return_bi;
- }
- }
- static void print_raid5_conf (raid5_conf_t *conf);
- static int stripe_operations_active(struct stripe_head *sh)
- {
- return sh->check_state || sh->reconstruct_state ||
- test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
- test_bit(STRIPE_COMPUTE_RUN, &sh->state);
- }
- static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
- {
- if (atomic_dec_and_test(&sh->count)) {
- BUG_ON(!list_empty(&sh->lru));
- BUG_ON(atomic_read(&conf->active_stripes)==0);
- if (test_bit(STRIPE_HANDLE, &sh->state)) {
- if (test_bit(STRIPE_DELAYED, &sh->state) &&
- !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
- list_add_tail(&sh->lru, &conf->delayed_list);
- else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
- sh->bm_seq - conf->seq_write > 0)
- list_add_tail(&sh->lru, &conf->bitmap_list);
- else {
- clear_bit(STRIPE_DELAYED, &sh->state);
- clear_bit(STRIPE_BIT_DELAY, &sh->state);
- list_add_tail(&sh->lru, &conf->handle_list);
- }
- md_wakeup_thread(conf->mddev->thread);
- } else {
- BUG_ON(stripe_operations_active(sh));
- if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
- atomic_dec(&conf->preread_active_stripes);
- if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
- md_wakeup_thread(conf->mddev->thread);
- }
- atomic_dec(&conf->active_stripes);
- if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
- list_add_tail(&sh->lru, &conf->inactive_list);
- wake_up(&conf->wait_for_stripe);
- if (conf->retry_read_aligned)
- md_wakeup_thread(conf->mddev->thread);
- }
- }
- }
- }
- static void release_stripe(struct stripe_head *sh)
- {
- raid5_conf_t *conf = sh->raid_conf;
- unsigned long flags;
- spin_lock_irqsave(&conf->device_lock, flags);
- __release_stripe(conf, sh);
- spin_unlock_irqrestore(&conf->device_lock, flags);
- }
- static inline void remove_hash(struct stripe_head *sh)
- {
- pr_debug("remove_hash(), stripe %llu\n",
- (unsigned long long)sh->sector);
- hlist_del_init(&sh->hash);
- }
- static inline void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
- {
- struct hlist_head *hp = stripe_hash(conf, sh->sector);
- pr_debug("insert_hash(), stripe %llu\n",
- (unsigned long long)sh->sector);
- CHECK_DEVLOCK();
- hlist_add_head(&sh->hash, hp);
- }
- /* find an idle stripe, make sure it is unhashed, and return it. */
- static struct stripe_head *get_free_stripe(raid5_conf_t *conf)
- {
- struct stripe_head *sh = NULL;
- struct list_head *first;
- CHECK_DEVLOCK();
- if (list_empty(&conf->inactive_list))
- goto out;
- first = conf->inactive_list.next;
- sh = list_entry(first, struct stripe_head, lru);
- list_del_init(first);
- remove_hash(sh);
- atomic_inc(&conf->active_stripes);
- out:
- return sh;
- }
- static void shrink_buffers(struct stripe_head *sh)
- {
- struct page *p;
- int i;
- int num = sh->raid_conf->pool_size;
- for (i = 0; i < num ; i++) {
- p = sh->dev[i].page;
- if (!p)
- continue;
- sh->dev[i].page = NULL;
- put_page(p);
- }
- }
- static int grow_buffers(struct stripe_head *sh)
- {
- int i;
- int num = sh->raid_conf->pool_size;
- for (i = 0; i < num; i++) {
- struct page *page;
- if (!(page = alloc_page(GFP_KERNEL))) {
- return 1;
- }
- sh->dev[i].page = page;
- }
- return 0;
- }
- static void raid5_build_block(struct stripe_head *sh, int i, int previous);
- static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
- struct stripe_head *sh);
- static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
- {
- raid5_conf_t *conf = sh->raid_conf;
- int i;
- BUG_ON(atomic_read(&sh->count) != 0);
- BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
- BUG_ON(stripe_operations_active(sh));
- CHECK_DEVLOCK();
- pr_debug("init_stripe called, stripe %llu\n",
- (unsigned long long)sh->sector);
- remove_hash(sh);
- sh->generation = conf->generation - previous;
- sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
- sh->sector = sector;
- stripe_set_idx(sector, conf, previous, sh);
- sh->state = 0;
- for (i = sh->disks; i--; ) {
- struct r5dev *dev = &sh->dev[i];
- if (dev->toread || dev->read || dev->towrite || dev->written ||
- test_bit(R5_LOCKED, &dev->flags)) {
- printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n",
- (unsigned long long)sh->sector, i, dev->toread,
- dev->read, dev->towrite, dev->written,
- test_bit(R5_LOCKED, &dev->flags));
- BUG();
- }
- dev->flags = 0;
- raid5_build_block(sh, i, previous);
- }
- insert_hash(conf, sh);
- }
- static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector,
- short generation)
- {
- struct stripe_head *sh;
- struct hlist_node *hn;
- CHECK_DEVLOCK();
- pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
- hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash)
- if (sh->sector == sector && sh->generation == generation)
- return sh;
- pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
- return NULL;
- }
- /*
- * Need to check if array has failed when deciding whether to:
- * - start an array
- * - remove non-faulty devices
- * - add a spare
- * - allow a reshape
- * This determination is simple when no reshape is happening.
- * However if there is a reshape, we need to carefully check
- * both the before and after sections.
- * This is because some failed devices may only affect one
- * of the two sections, and some non-in_sync devices may
- * be insync in the section most affected by failed devices.
- */
- static int has_failed(raid5_conf_t *conf)
- {
- int degraded;
- int i;
- if (conf->mddev->reshape_position == MaxSector)
- return conf->mddev->degraded > conf->max_degraded;
- rcu_read_lock();
- degraded = 0;
- for (i = 0; i < conf->previous_raid_disks; i++) {
- mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
- if (!rdev || test_bit(Faulty, &rdev->flags))
- degraded++;
- else if (test_bit(In_sync, &rdev->flags))
- ;
- else
- /* not in-sync or faulty.
- * If the reshape increases the number of devices,
- * this is being recovered by the reshape, so
- * this 'previous' section is not in_sync.
- * If the number of devices is being reduced however,
- * the device can only be part of the array if
- * we are reverting a reshape, so this section will
- * be in-sync.
- */
- if (conf->raid_disks >= conf->previous_raid_disks)
- degraded++;
- }
- rcu_read_unlock();
- if (degraded > conf->max_degraded)
- return 1;
- rcu_read_lock();
- degraded = 0;
- for (i = 0; i < conf->raid_disks; i++) {
- mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
- if (!rdev || test_bit(Faulty, &rdev->flags))
- degraded++;
- else if (test_bit(In_sync, &rdev->flags))
- ;
- else
- /* not in-sync or faulty.
- * If reshape increases the number of devices, this
- * section has already been recovered, else it
- * almost certainly hasn't.
- */
- if (conf->raid_disks <= conf->previous_raid_disks)
- degraded++;
- }
- rcu_read_unlock();
- if (degraded > conf->max_degraded)
- return 1;
- return 0;
- }
- static struct stripe_head *
- get_active_stripe(raid5_conf_t *conf, sector_t sector,
- int previous, int noblock, int noquiesce)
- {
- struct stripe_head *sh;
- pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
- spin_lock_irq(&conf->device_lock);
- do {
- wait_event_lock_irq(conf->wait_for_stripe,
- conf->quiesce == 0 || noquiesce,
- conf->device_lock, /* nothing */);
- sh = __find_stripe(conf, sector, conf->generation - previous);
- if (!sh) {
- if (!conf->inactive_blocked)
- sh = get_free_stripe(conf);
- if (noblock && sh == NULL)
- break;
- if (!sh) {
- conf->inactive_blocked = 1;
- wait_event_lock_irq(conf->wait_for_stripe,
- !list_empty(&conf->inactive_list) &&
- (atomic_read(&conf->active_stripes)
- < (conf->max_nr_stripes *3/4)
- || !conf->inactive_blocked),
- conf->device_lock,
- );
- conf->inactive_blocked = 0;
- } else
- init_stripe(sh, sector, previous);
- } else {
- if (atomic_read(&sh->count)) {
- BUG_ON(!list_empty(&sh->lru)
- && !test_bit(STRIPE_EXPANDING, &sh->state));
- } else {
- if (!test_bit(STRIPE_HANDLE, &sh->state))
- atomic_inc(&conf->active_stripes);
- if (list_empty(&sh->lru) &&
- !test_bit(STRIPE_EXPANDING, &sh->state))
- BUG();
- list_del_init(&sh->lru);
- }
- }
- } while (sh == NULL);
- if (sh)
- atomic_inc(&sh->count);
- spin_unlock_irq(&conf->device_lock);
- return sh;
- }
- static void
- raid5_end_read_request(struct bio *bi, int error);
- static void
- raid5_end_write_request(struct bio *bi, int error);
- static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
- {
- raid5_conf_t *conf = sh->raid_conf;
- int i, disks = sh->disks;
- might_sleep();
- for (i = disks; i--; ) {
- int rw;
- struct bio *bi;
- mdk_rdev_t *rdev;
- if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
- if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
- rw = WRITE_FUA;
- else
- rw = WRITE;
- } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
- rw = READ;
- else
- continue;
- bi = &sh->dev[i].req;
- bi->bi_rw = rw;
- if (rw & WRITE)
- bi->bi_end_io = raid5_end_write_request;
- else
- bi->bi_end_io = raid5_end_read_request;
- rcu_read_lock();
- rdev = rcu_dereference(conf->disks[i].rdev);
- if (rdev && test_bit(Faulty, &rdev->flags))
- rdev = NULL;
- if (rdev)
- atomic_inc(&rdev->nr_pending);
- rcu_read_unlock();
- if (rdev) {
- if (s->syncing || s->expanding || s->expanded)
- md_sync_acct(rdev->bdev, STRIPE_SECTORS);
- set_bit(STRIPE_IO_STARTED, &sh->state);
- bi->bi_bdev = rdev->bdev;
- pr_debug("%s: for %llu schedule op %ld on disc %d\n",
- __func__, (unsigned long long)sh->sector,
- bi->bi_rw, i);
- atomic_inc(&sh->count);
- bi->bi_sector = sh->sector + rdev->data_offset;
- bi->bi_flags = 1 << BIO_UPTODATE;
- bi->bi_vcnt = 1;
- bi->bi_max_vecs = 1;
- bi->bi_idx = 0;
- bi->bi_io_vec = &sh->dev[i].vec;
- bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
- bi->bi_io_vec[0].bv_offset = 0;
- bi->bi_size = STRIPE_SIZE;
- bi->bi_next = NULL;
- if ((rw & WRITE) &&
- test_bit(R5_ReWrite, &sh->dev[i].flags))
- atomic_add(STRIPE_SECTORS,
- &rdev->corrected_errors);
- generic_make_request(bi);
- } else {
- if (rw & WRITE)
- set_bit(STRIPE_DEGRADED, &sh->state);
- pr_debug("skip op %ld on disc %d for sector %llu\n",
- bi->bi_rw, i, (unsigned long long)sh->sector);
- clear_bit(R5_LOCKED, &sh->dev[i].flags);
- set_bit(STRIPE_HANDLE, &sh->state);
- }
- }
- }
- static struct dma_async_tx_descriptor *
- async_copy_data(int frombio, struct bio *bio, struct page *page,
- sector_t sector, struct dma_async_tx_descriptor *tx)
- {
- struct bio_vec *bvl;
- struct page *bio_page;
- int i;
- int page_offset;
- struct async_submit_ctl submit;
- enum async_tx_flags flags = 0;
- if (bio->bi_sector >= sector)
- page_offset = (signed)(bio->bi_sector - sector) * 512;
- else
- page_offset = (signed)(sector - bio->bi_sector) * -512;
- if (frombio)
- flags |= ASYNC_TX_FENCE;
- init_async_submit(&submit, flags, tx, NULL, NULL, NULL);
- bio_for_each_segment(bvl, bio, i) {
- int len = bvl->bv_len;
- int clen;
- int b_offset = 0;
- if (page_offset < 0) {
- b_offset = -page_offset;
- page_offset += b_offset;
- len -= b_offset;
- }
- if (len > 0 && page_offset + len > STRIPE_SIZE)
- clen = STRIPE_SIZE - page_offset;
- else
- clen = len;
- if (clen > 0) {
- b_offset += bvl->bv_offset;
- bio_page = bvl->bv_page;
- if (frombio)
- tx = async_memcpy(page, bio_page, page_offset,
- b_offset, clen, &submit);
- else
- tx = async_memcpy(bio_page, page, b_offset,
- page_offset, clen, &submit);
- }
- /* chain the operations */
- submit.depend_tx = tx;
- if (clen < len) /* hit end of page */
- break;
- page_offset += len;
- }
- return tx;
- }
- static void ops_complete_biofill(void *stripe_head_ref)
- {
- struct stripe_head *sh = stripe_head_ref;
- struct bio *return_bi = NULL;
- raid5_conf_t *conf = sh->raid_conf;
- int i;
- pr_debug("%s: stripe %llu\n", __func__,
- (unsigned long long)sh->sector);
- /* clear completed biofills */
- spin_lock_irq(&conf->device_lock);
- for (i = sh->disks; i--; ) {
- struct r5dev *dev = &sh->dev[i];
- /* acknowledge completion of a biofill operation */
- /* and check if we need to reply to a read request,
- * new R5_Wantfill requests are held off until
- * !STRIPE_BIOFILL_RUN
- */
- if (test_and_clear_bit(R5_Wantfill, &dev->flags)) {
- struct bio *rbi, *rbi2;
- BUG_ON(!dev->read);
- rbi = dev->read;
- dev->read = NULL;
- while (rbi && rbi->bi_sector <
- dev->sector + STRIPE_SECTORS) {
- rbi2 = r5_next_bio(rbi, dev->sector);
- if (!raid5_dec_bi_phys_segments(rbi)) {
- rbi->bi_next = return_bi;
- return_bi = rbi;
- }
- rbi = rbi2;
- }
- }
- }
- spin_unlock_irq(&conf->device_lock);
- clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
- return_io(return_bi);
- set_bit(STRIPE_HANDLE, &sh->state);
- release_stripe(sh);
- }
- static void ops_run_biofill(struct stripe_head *sh)
- {
- struct dma_async_tx_descriptor *tx = NULL;
- raid5_conf_t *conf = sh->raid_conf;
- struct async_submit_ctl submit;
- int i;
- pr_debug("%s: stripe %llu\n", __func__,
- (unsigned long long)sh->sector);
- for (i = sh->disks; i--; ) {
- struct r5dev *dev = &sh->dev[i];
- if (test_bit(R5_Wantfill, &dev->flags)) {
- struct bio *rbi;
- spin_lock_irq(&conf->device_lock);
- dev->read = rbi = dev->toread;
- dev->toread = NULL;
- spin_unlock_irq(&conf->device_lock);
- while (rbi && rbi->bi_sector <
- dev->sector + STRIPE_SECTORS) {
- tx = async_copy_data(0, rbi, dev->page,
- dev->sector, tx);
- rbi = r5_next_bio(rbi, dev->sector);
- }
- }
- }
- atomic_inc(&sh->count);
- init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
- async_trigger_callback(&submit);
- }
- static void mark_target_uptodate(struct stripe_head *sh, int target)
- {
- struct r5dev *tgt;
- if (target < 0)
- return;
- tgt = &sh->dev[target];
- set_bit(R5_UPTODATE, &tgt->flags);
- BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
- clear_bit(R5_Wantcompute, &tgt->flags);
- }
- static void ops_complete_compute(void *stripe_head_ref)
- {
- struct stripe_head *sh = stripe_head_ref;
- pr_debug("%s: stripe %llu\n", __func__,
- (unsigned long long)sh->sector);
- /* mark the computed target(s) as uptodate */
- mark_target_uptodate(sh, sh->ops.target);
- mark_target_uptodate(sh, sh->ops.target2);
- clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
- if (sh->check_state == check_state_compute_run)
- sh->check_state = check_state_compute_result;
- set_bit(STRIPE_HANDLE, &sh->state);
- release_stripe(sh);
- }
- /* return a pointer to the address conversion region of the scribble buffer */
- static addr_conv_t *to_addr_conv(struct stripe_head *sh,
- struct raid5_percpu *percpu)
- {
- return percpu->scribble + sizeof(struct page *) * (sh->disks + 2);
- }
- static struct dma_async_tx_descriptor *
- ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
- {
- int disks = sh->disks;
- struct page **xor_srcs = percpu->scribble;
- int target = sh->ops.target;
- struct r5dev *tgt = &sh->dev[target];
- struct page *xor_dest = tgt->page;
- int count = 0;
- struct dma_async_tx_descriptor *tx;
- struct async_submit_ctl submit;
- int i;
- pr_debug("%s: stripe %llu block: %d\n",
- __func__, (unsigned long long)sh->sector, target);
- BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
- for (i = disks; i--; )
- if (i != target)
- xor_srcs[count++] = sh->dev[i].page;
- atomic_inc(&sh->count);
- init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
- ops_complete_compute, sh, to_addr_conv(sh, percpu));
- if (unlikely(count == 1))
- tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
- else
- tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
- return tx;
- }
- /* set_syndrome_sources - populate source buffers for gen_syndrome
- * @srcs - (struct page *) array of size sh->disks
- * @sh - stripe_head to parse
- *
- * Populates srcs in proper layout order for the stripe and returns the
- * 'count' of sources to be used in a call to async_gen_syndrome. The P
- * destination buffer is recorded in srcs[count] and the Q destination
- * is recorded in srcs[count+1]].
- */
- static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
- {
- int disks = sh->disks;
- int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
- int d0_idx = raid6_d0(sh);
- int count;
- int i;
- for (i = 0; i < disks; i++)
- srcs[i] = NULL;
- count = 0;
- i = d0_idx;
- do {
- int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
- srcs[slot] = sh->dev[i].page;
- i = raid6_next_disk(i, disks);
- } while (i != d0_idx);
- return syndrome_disks;
- }
- static struct dma_async_tx_descriptor *
- ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
- {
- int disks = sh->disks;
- struct page **blocks = percpu->scribble;
- int target;
- int qd_idx = sh->qd_idx;
- struct dma_async_tx_descriptor *tx;
- struct async_submit_ctl submit;
- struct r5dev *tgt;
- struct page *dest;
- int i;
- int count;
- if (sh->ops.target < 0)
- target = sh->ops.target2;
- else if (sh->ops.target2 < 0)
- target = sh->ops.target;
- else
- /* we should only have one valid target */
- BUG();
- BUG_ON(target < 0);
- pr_debug("%s: stripe %llu block: %d\n",
- __func__, (unsigned long long)sh->sector, target);
- tgt = &sh->dev[target];
- BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
- dest = tgt->page;
- atomic_inc(&sh->count);
- if (target == qd_idx) {
- count = set_syndrome_sources(blocks, sh);
- blocks[count] = NULL; /* regenerating p is not necessary */
- BUG_ON(blocks[count+1] != dest); /* q should already be set */
- init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
- ops_complete_compute, sh,
- to_addr_conv(sh, percpu));
- tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
- } else {
- /* Compute any data- or p-drive using XOR */
- count = 0;
- for (i = disks; i-- ; ) {
- if (i == target || i == qd_idx)
- continue;
- blocks[count++] = sh->dev[i].page;
- }
- init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
- NULL, ops_complete_compute, sh,
- to_addr_conv(sh, percpu));
- tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
- }
- return tx;
- }
- static struct dma_async_tx_descriptor *
- ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
- {
- int i, count, disks = sh->disks;
- int syndrome_disks = sh->ddf_layout ? disks : disks-2;
- int d0_idx = raid6_d0(sh);
- int faila = -1, failb = -1;
- int target = sh->ops.target;
- int target2 = sh->ops.target2;
- struct r5dev *tgt = &sh->dev[target];
- struct r5dev *tgt2 = &sh->dev[target2];
- struct dma_async_tx_descriptor *tx;
- struct page **blocks = percpu->scribble;
- struct async_submit_ctl submit;
- pr_debug("%s: stripe %llu block1: %d block2: %d\n",
- __func__, (unsigned long long)sh->sector, target, target2);
- BUG_ON(target < 0 || target2 < 0);
- BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
- BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));
- /* we need to open-code set_syndrome_sources to handle the
- * slot number conversion for 'faila' and 'failb'
- */
- for (i = 0; i < disks ; i++)
- blocks[i] = NULL;
- count = 0;
- i = d0_idx;
- do {
- int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
- blocks[slot] = sh->dev[i].page;
- if (i == target)
- faila = slot;
- if (i == target2)
- failb = slot;
- i = raid6_next_disk(i, disks);
- } while (i != d0_idx);
- BUG_ON(faila == failb);
- if (failb < faila)
- swap(faila, failb);
- pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
- __func__, (unsigned long long)sh->sector, faila, failb);
- atomic_inc(&sh->count);
- if (failb == syndrome_disks+1) {
- /* Q disk is one of the missing disks */
- if (faila == syndrome_disks) {
- /* Missing P+Q, just recompute */
- init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
- ops_complete_compute, sh,
- to_addr_conv(sh, percpu));
- return async_gen_syndrome(blocks, 0, syndrome_disks+2,
- STRIPE_SIZE, &submit);
- } else {
- struct page *dest;
- int data_target;
- int qd_idx = sh->qd_idx;
- /* Missing D+Q: recompute D from P, then recompute Q */
- if (target == qd_idx)
- data_target = target2;
- else
- data_target = target;
- count = 0;
- for (i = disks; i-- ; ) {
- if (i == data_target || i == qd_idx)
- continue;
- blocks[count++] = sh->dev[i].page;
- }
- dest = sh->dev[data_target].page;
- init_async_submit(&submit,
- ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
- NULL, NULL, NULL,
- to_addr_conv(sh, percpu));
- tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
- &submit);
- count = set_syndrome_sources(blocks, sh);
- init_async_submit(&submit, ASYNC_TX_FENCE, tx,
- ops_complete_compute, sh,
- to_addr_conv(sh, percpu));
- return async_gen_syndrome(blocks, 0, count+2,
- STRIPE_SIZE, &submit);
- }
- } else {
- init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
- ops_complete_compute, sh,
- to_addr_conv(sh, percpu));
- if (failb == syndrome_disks) {
- /* We're missing D+P. */
- return async_raid6_datap_recov(syndrome_disks+2,
- STRIPE_SIZE, faila,
- blocks, &submit);
- } else {
- /* We're missing D+D. */
- return async_raid6_2data_recov(syndrome_disks+2,
- STRIPE_SIZE, faila, failb,
- blocks, &submit);
- }
- }
- }
- static void ops_complete_prexor(void *stripe_head_ref)
- {
- struct stripe_head *sh = stripe_head_ref;
- pr_debug("%s: stripe %llu\n", __func__,
- (unsigned long long)sh->sector);
- }
- static struct dma_async_tx_descriptor *
- ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
- struct dma_async_tx_descriptor *tx)
- {
- int disks = sh->disks;
- struct page **xor_srcs = percpu->scribble;
- int count = 0, pd_idx = sh->pd_idx, i;
- struct async_submit_ctl submit;
- /* existing parity data subtracted */
- struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
- pr_debug("%s: stripe %llu\n", __func__,
- (unsigned long long)sh->sector);
- for (i = disks; i--; ) {
- struct r5dev *dev = &sh->dev[i];
- /* Only process blocks that are known to be uptodate */
- if (test_bit(R5_Wantdrain, &dev->flags))
- xor_srcs[count++] = dev->page;
- }
- init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
- ops_complete_prexor, sh, to_addr_conv(sh, percpu));
- tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
- return tx;
- }
- static struct dma_async_tx_descriptor *
- ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
- {
- int disks = sh->disks;
- int i;
- pr_debug("%s: stripe %llu\n", __func__,
- (unsigned long long)sh->sector);
- for (i = disks; i--; ) {
- struct r5dev *dev = &sh->dev[i];
- struct bio *chosen;
- if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
- struct bio *wbi;
- spin_lock(&sh->lock);
- chosen = dev->towrite;
- dev->towrite = NULL;
- BUG_ON(dev->written);
- wbi = dev->written = chosen;
- spin_unlock(&sh->lock);
- while (wbi && wbi->bi_sector <
- dev->sector + STRIPE_SECTORS) {
- if (wbi->bi_rw & REQ_FUA)
- set_bit(R5_WantFUA, &dev->flags);
- tx = async_copy_data(1, wbi, dev->page,
- dev->sector, tx);
- wbi = r5_next_bio(wbi, dev->sector);
- }
- }
- }
- return tx;
- }
- static void ops_complete_reconstruct(void *stripe_head_ref)
- {
- struct stripe_head *sh = stripe_head_ref;
- int disks = sh->disks;
- int pd_idx = sh->pd_idx;
- int qd_idx = sh->qd_idx;
- int i;
- bool fua = false;
- pr_debug("%s: stripe %llu\n", __func__,
- (unsigned long long)sh->sector);
- for (i = disks; i--; )
- fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
- for (i = disks; i--; ) {
- struct r5dev *dev = &sh->dev[i];
- if (dev->written || i == pd_idx || i == qd_idx) {
- set_bit(R5_UPTODATE, &dev->flags);
- if (fua)
- set_bit(R5_WantFUA, &dev->flags);
- }
- }
- if (sh->reconstruct_state == reconstruct_state_drain_run)
- sh->reconstruct_state = reconstruct_state_drain_result;
- else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
- sh->reconstruct_state = reconstruct_state_prexor_drain_result;
- else {
- BUG_ON(sh->reconstruct_state != reconstruct_state_run);
- sh->reconstruct_state = reconstruct_state_result;
- }
- set_bit(STRIPE_HANDLE, &sh->state);
- release_stripe(sh);
- }
- static void
- ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
- struct dma_async_tx_descriptor *tx)
- {
- int disks = sh->disks;
- struct page **xor_srcs = percpu->scribble;
- struct async_submit_ctl submit;
- int count = 0, pd_idx = sh->pd_idx, i;
- struct page *xor_dest;
- int prexor = 0;
- unsigned long flags;
- pr_debug("%s: stripe %llu\n", __func__,
- (unsigned long long)sh->sector);
- /* check if prexor is active which means only process blocks
- * that are part of a read-modify-write (written)
- */
- if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
- prexor = 1;
- xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
- for (i = disks; i--; ) {
- struct r5dev *dev = &sh->dev[i];
- if (dev->written)
- xor_srcs[count++] = dev->page;
- }
- } else {
- xor_dest = sh->dev[pd_idx].page;
- for (i = disks; i--; ) {
- struct r5dev *dev = &sh->dev[i];
- if (i != pd_idx)
- xor_srcs[count++] = dev->page;
- }
- }
- /* 1/ if we prexor'd then the dest is reused as a source
- * 2/ if we did not prexor then we are redoing the parity
- * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
- * for the synchronous xor case
- */
- flags = ASYNC_TX_ACK |
- (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
- atomic_inc(&sh->count);
- init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh,
- to_addr_conv(sh, percpu));
- if (unlikely(count == 1))
- tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
- else
- tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
- }
- static void
- ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
- struct dma_async_tx_descriptor *tx)
- {
- struct async_submit_ctl submit;
- struct page **blocks = percpu->scribble;
- int count;
- pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
- count = set_syndrome_sources(blocks, sh);
- atomic_inc(&sh->count);
- init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
- sh, to_addr_conv(sh, percpu));
- async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
- }
- static void ops_complete_check(void *stripe_head_ref)
- {
- struct stripe_head *sh = stripe_head_ref;
- pr_debug("%s: stripe %llu\n", __func__,
- (unsigned long long)sh->sector);
- sh->check_state = check_state_check_result;
- set_bit(STRIPE_HANDLE, &sh->state);
- release_stripe(sh);
- }
- static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
- {
- int disks = sh->disks;
- int pd_idx = sh->pd_idx;
- int qd_idx = sh->qd_idx;
- struct page *xor_dest;
- struct page **xor_srcs = percpu->scribble;
- struct dma_async_tx_descriptor *tx;
- struct async_submit_ctl submit;
- int count;
- int i;
- pr_debug("%s: stripe %llu\n", __func__,
- (unsigned long long)sh->sector);
- count = 0;
- xor_dest = sh->dev[pd_idx].page;
- xor_srcs[count++] = xor_dest;
- for (i = disks; i--; ) {
- if (i == pd_idx || i == qd_idx)
- continue;
- xor_srcs[count++] = sh->dev[i].page;
- }
- init_async_submit(&submit, 0, NULL, NULL, NULL,
- to_addr_conv(sh, percpu));
- tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
- &sh->ops.zero_sum_result, &submit);
- atomic_inc(&sh->count);
- init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL);
- tx = async_trigger_callback(&submit);
- }
- static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
- {
- struct page **srcs = percpu->scribble;
- struct async_submit_ctl submit;
- int count;
- pr_debug("%s: stripe %llu checkp: %d\n", __func__,
- (unsigned long long)sh->sector, checkp);
- count = set_syndrome_sources(srcs, sh);
- if (!checkp)
- srcs[count] = NULL;
- atomic_inc(&sh->count);
- init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
- sh, to_addr_conv(sh, percpu));
- async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
- &sh->ops.zero_sum_result, percpu->spare_page, &submit);
- }
- static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
- {
- int overlap_clear = 0, i, disks = sh->disks;
- struct dma_async_tx_descriptor *tx = NULL;
- raid5_conf_t *conf = sh->raid_conf;
- int level = conf->level;
- struct raid5_percpu *percpu;
- unsigned long cpu;
- cpu = get_cpu();
- percpu = per_cpu_ptr(conf->percpu, cpu);
- if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
- ops_run_biofill(sh);
- overlap_clear++;
- }
- if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
- if (level < 6)
- tx = ops_run_compute5(sh, percpu);
- else {
- if (sh->ops.target2 < 0 || sh->ops.target < 0)
- tx = ops_run_compute6_1(sh, percpu);
- else
- tx = ops_run_compute6_2(sh, percpu);
- }
- /* terminate the chain if reconstruct is not set to be run */
- if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
- async_tx_ack(tx);
- }
- if (test_bit(STRIPE_OP_PREXOR, &ops_request))
- tx = ops_run_prexor(sh, percpu, tx);
- if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
- tx = ops_run_biodrain(sh, tx);
- overlap_clear++;
- }
- if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
- if (level < 6)
- ops_run_reconstruct5(sh, percpu, tx);
- else
- ops_run_reconstruct6(sh, percpu, tx);
- }
- if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
- if (sh->check_state == check_state_run)
- ops_run_check_p(sh, percpu);
- else if (sh->check_state == check_state_run_q)
- ops_run_check_pq(sh, percpu, 0);
- else if (sh->check_state == check_state_run_pq)
- ops_run_check_pq(sh, percpu, 1);
- else
- BUG();
- }
- if (overlap_clear)
- for (i = disks; i--; ) {
- struct r5dev *dev = &sh->dev[i];
- if (test_and_clear_bit(R5_Overlap, &dev->flags))
- wake_up(&sh->raid_conf->wait_for_overlap);
- }
- put_cpu();
- }
#ifdef CONFIG_MULTICORE_RAID456
/*
 * async_run_ops - run a staged operations request for a stripe
 * @param: the struct stripe_head whose sh->ops.request was staged
 * @cookie: unused here
 *
 * The request is copied out of the stripe before STRIPE_OPS_REQ_PENDING
 * is released, so a new request may be staged while this one runs.  The
 * stripe is released once the operations have been issued.
 */
static void async_run_ops(void *param, async_cookie_t cookie)
{
	struct stripe_head *stripe = param;
	unsigned long staged = stripe->ops.request;

	clear_bit_unlock(STRIPE_OPS_REQ_PENDING, &stripe->state);
	wake_up(&stripe->ops.wait_for_ops);

	__raid_run_ops(stripe, staged);
	release_stripe(stripe);
}

/*
 * raid_run_ops - stage an operations request and schedule it asynchronously
 * @sh: stripe to operate on
 * @ops_request: bitmask of STRIPE_OP_* bits to run
 *
 * Waits until STRIPE_OPS_REQ_PENDING can be taken, stores the request in
 * the stripe, takes a stripe reference and schedules async_run_ops().
 */
static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
{
	/* handle_stripe can be called outside of raid5d context, so the
	 * previously staged sh->ops.request must have been picked up
	 * before a new one is written
	 */
	wait_event(sh->ops.wait_for_ops,
		   !test_and_set_bit_lock(STRIPE_OPS_REQ_PENDING, &sh->state));
	sh->ops.request = ops_request;

	atomic_inc(&sh->count);
	async_schedule(async_run_ops, sh);
}
#else
/* without multicore support the operations are run directly */
#define raid_run_ops __raid_run_ops
#endif
- static int grow_one_stripe(raid5_conf_t *conf)
- {
- struct stripe_head *sh;
- sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL);
- if (!sh)
- return 0;
- memset(sh, 0, sizeof(*sh) + (conf->pool_size-1)*sizeof(struct r5dev));
- sh->raid_conf = conf;
- spin_lock_init(&sh->lock);
- #ifdef CONFIG_MULTICORE_RAID456
- init_waitqueue_head(&sh->ops.wait_for_ops);
- #endif
- if (grow_buffers(sh)) {
- shrink_buffers(sh);
- kmem_cache_free(conf->slab_cache, sh);
- return 0;
- }
- /* we just created an active stripe so... */
- atomic_set(&sh->count, 1);
- atomic_inc(&conf->active_stripes);
- INIT_LIST_HEAD(&sh->lru);
- release_stripe(sh);
- return 1;
- }
- static int grow_stripes(raid5_conf_t *conf, int num)
- {
- struct kmem_cache *sc;
- int devs = max(conf->raid_disks, conf->previous_raid_disks);
- if (conf->mddev->gendisk)
- sprintf(conf->cache_name[0],
- "raid%d-%s", conf->level, mdname(conf->mddev));
- else
- sprintf(conf->cache_name[0],
- "raid%d-%p", conf->level, conf->mddev);
- sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]);
- conf->active_name = 0;
- sc = kmem_cache_create(conf->cache_name[conf->active_name],
- sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
- 0, 0, NULL);
- if (!sc)
- return 1;
- conf->slab_cache = sc;
- conf->pool_size = devs;
- while (num--)
- if (!grow_one_stripe(conf))
- return 1;
- return 0;
- }
- /**
- * scribble_len - return the required size of the scribble region
- * @num - total number of disks in the array
- *
- * The size must be enough to contain:
- * 1/ a struct page pointer for each device in the array +2
- * 2/ room to convert each entry in (1) to its corresponding dma
- * (dma_map_page()) or page (page_address()) address.
- *
- * Note: the +2 is for the destination buffers of the ddf/raid6 case where we
- * calculate over all devices (not just the data blocks), using zeros in place
- * of the P and Q blocks.
- */
- static size_t scribble_len(int num)
- {
- size_t len;
- len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);
- return len;
- }
- static int resize_stripes(raid5_conf_t *conf, int newsize)
- {
- /* Make all the stripes able to hold 'newsize' devices.
- * New slots in each stripe get 'page' set to a new page.
- *
- * This happens in stages:
- * 1/ create a new kmem_cache and allocate the required number of
- * stripe_heads.
- * 2/ gather all the old stripe_heads and tranfer the pages across
- * to the new stripe_heads. This will have the side effect of
- * freezing the array as once all stripe_heads have been collected,
- * no IO will be possible. Old stripe heads are freed once their
- * pages have been transferred over, and the old kmem_cache is
- * freed when all stripes are done.
- * 3/ reallocate conf->disks to be suitable bigger. If this fails,
- * we simple return a failre status - no need to clean anything up.
- * 4/ allocate new pages for the new slots in the new stripe_heads.
- * If this fails, we don't bother trying the shrink the
- * stripe_heads down again, we just leave them as they are.
- * As each stripe_head is processed the new one is released into
- * active service.
- *
- * Once step2 is started, we cannot afford to wait for a write,
- * so we use GFP_NOIO allocations.
- */
- struct stripe_head *osh, *nsh;
- LIST_HEAD(newstripes);
- struct disk_info *ndisks;
- unsigned long cpu;
- int err;
- struct kmem_cache *sc;
- int i;
- if (newsize <= conf->pool_size)
- return 0; /* never bother to shrink */
- err = md_allow_write(conf->mddev);
- if (err)
- return err;
- /* Step 1 */
- sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
- sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
- 0, 0, NULL);
- if (!sc)
- return -ENOMEM;
- for (i = conf->max_nr_stripes; i; i--) {
- nsh = kmem_cache_alloc(sc, GFP_KERNEL);
- if (!nsh)
- break;
- memset(nsh, 0, sizeof(*nsh) + (newsize-1)*sizeof(struct r5dev));
- nsh->raid_conf = conf;
- spin_lock_init(&nsh->lock);
- #ifdef CONFIG_MULTICORE_RAID456
- init_waitqueue_head(&nsh->ops.wait_for_ops);
- #endif
- list_add(&nsh->lru, &newstripes);
- }
- if (i) {
- /* didn't get enough, give up */
- while (!list_empty(&newstripes)) {
- nsh = list_entry(newstripes.next, struct stripe_head, lru);
- list_del(&nsh->lru);
- kmem_cache_free(sc, nsh);
- }
- kmem_cache_destroy(sc);
- return -ENOMEM;
- }
- /* Step 2 - Must use GFP_NOIO now.
- * OK, we have enough stripes, start collecting inactive
- * stripes and copying them over
- */
- list_for_each_entry(nsh, &newstripes, lru) {
- spin_lock_irq(&conf->device_lock);
- wait_event_lock_irq(conf->wait_for_stripe,
- !list_empty(&conf->inactive_list),
- conf->device_lock,
- );
- osh = get_free_stripe(conf);
- spin_unlock_irq(&conf->device_lock);
- atomic_set(&nsh->count, 1);
- for(i=0; i<conf->pool_size; i++)
- nsh->dev[i].page = osh->dev[i].page;
- for( ; i<newsize; i++)
- nsh->dev[i].page = NULL;
- kmem_cache_free(conf->slab_cache, osh);
- }
- kmem_cache_destroy(conf->slab_cache);
- /* Step 3.
- * At this point, we are holding all the stripes so the array
- * is completely stalled, so now is a good time to resize
- * conf->disks and the scribble region
- */
- ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
- if (ndisks) {
- for (i=0; i<conf->raid_disks; i++)
- ndisks[i] = conf->disks[i];
- kfree(conf->disks);
- conf->disks = ndisks;
- } else
- err = -ENOMEM;
- get_online_cpus();
- conf->scribble_len = scribble_len(newsize);
- for_each_present_cpu(cpu) {
- struct raid5_percpu *percpu;
- void *scribble;
- percpu = per_cpu_ptr(conf->percpu, cpu);
- scribble = kmalloc(conf->scribble_len, GFP_NOIO);
- if (scribble) {
- kfree(percpu->scribble);
- percpu->scribble = scribble;
- } else {
- err = -ENOMEM;
- break;
- }
- }
- put_online_cpus();
- /* Step 4, return new stripes to service */
- while(!list_empty(&newstripes)) {
- nsh = list_entry(newstripes.next, struct stripe_head, lru);
- list_del_init(&nsh->lru);
- for (i=conf->raid_disks; i < newsize; i++)
- if (nsh->dev[i].page == NULL) {
- struct page *p = alloc_page(GFP_NOIO);
- nsh->dev[i].page = p;
- if (!p)
- err = -ENOMEM;
- }
- release_stripe(nsh);
- }
- /* critical section pass, GFP_NOIO no longer needed */
- conf->slab_cache = sc;
- conf->active_name = 1-conf->active_name;
- conf->pool_size = newsize;
- return err;
- }
- static int drop_one_stripe(raid5_conf_t *conf)
- {
- struct stripe_head *sh;
- spin_lock_irq(&conf->device_lock);
- sh = get_free_stripe(conf);
- spin_unlock_irq(&conf->device_lock);
- if (!sh)
- return 0;
- BUG_ON(atomic_read(&sh->count));
- shrink_buffers(sh);
- kmem_cache_free(conf->slab_cache, sh);
- atomic_dec(&conf->active_stripes);
- return 1;
- }
- static void shrink_stripes(raid5_conf_t *conf)
- {
- while (drop_one_stripe(conf))
- ;
- if (conf->slab_cache)
- kmem_cache_destroy(conf->slab_cache);
- conf->slab_cache = NULL;
- }
- static void raid5_end_read_request(struct bio * bi, int error)
- {
- struct stripe_head *sh = bi->bi_private;
- raid5_conf_t *conf = sh->raid_conf;
- int disks = sh->disks, i;
- int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
- char b[BDEVNAME_SIZE];
- mdk_rdev_t *rdev;
- for (i=0 ; i<disks; i++)
- if (bi == &sh->dev[i].req)
- break;
- pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n",
- (unsigned long long)sh->sector, i, atomic_read(&sh->count),
- uptodate);
- if (i == disks) {
- BUG();
- return;
- }
- if (uptodate) {
- set_bit(R5_UPTODATE, &sh->dev[i].flags);
- if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
- rdev = conf->disks[i].rdev;
- printk_rl(KERN_INFO "md/raid:%s: read error corrected"
- " (%lu sectors at %llu on %s)\n",
- mdname(conf->mddev), STRIPE_SECTORS,
- (unsigned long long)(sh->sector
- + rdev->data_offset),
- bdevname(rdev->bdev, b));
- clear_bit(R5_ReadError, &sh->dev[i].flags);
- clear_bit(R5_ReWrite, &sh->dev[i].flags);
- }
- if (atomic_read(&conf->disks[i].rdev->read_errors))
- atomic_set(&conf->disks[i].rdev->read_errors, 0);
- } else {
- const char *bdn = bdevname(conf->disks[i].rdev->bdev, b);
- int retry = 0;
- rdev = conf->disks[i].rdev;
- clear_bit(R5_UPTODATE, &sh->dev[i].flags);
- atomic_inc(&rdev->read_errors);
- if (conf->mddev->degraded >= conf->max_degraded)
- printk_rl(KERN_WARNING
- "md/raid:%s: read error not correctable "
- "(sector %llu on %s).\n",
- mdname(conf->mddev),
- (unsigned long long)(sh->sector
- + rdev->data_offset),
- bdn);
- else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
- /* Oh, no!!! */
- printk_rl(KERN_WARNING
- "md/raid:%s: read error NOT corrected!! "
- "(sector %llu on %s).\n",
- mdname(conf->mddev),
- (unsigned long long)(sh->sector
- + rdev->data_offset),
- bdn);
- else if (atomic_read(&rdev->read_errors)
- > conf->max_nr_stripes)
- printk(KERN_WARNING
- "md/raid:%s: Too many read errors, failing device %s.\n",
- mdname(conf->mddev), bdn);
- else
- retry = 1;
- if (retry)
- set_bit(R5_ReadError, &sh->dev[i].flags);
- else {
- clear_bit(R5_ReadError, &sh->dev[i].flags);
- clear_bit(R5_ReWrite, &sh->dev[i].flags);
- md_error(conf->mddev, rdev);
- }
- }
- rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
- clear_bit(R5_LOCKED, &sh->dev[i].flags);
- set_bit(STRIPE_HANDLE, &sh->state);
- release_stripe(sh);
- }
- static void raid5_end_write_request(struct bio *bi, int error)
- {
- struct stripe_head *sh = bi->bi_private;
- raid5_conf_t *conf = sh->raid_conf;
- int disks = sh->disks, i;
- int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
- for (i=0 ; i<disks; i++)
- if (bi == &sh->dev[i].req)
- break;
- pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n",
- (unsigned long long)sh->sector, i, atomic_read(&sh->count),
- uptodate);
- if (i == disks) {
- BUG();
- return;
- }
- if (!uptodate)
- md_error(conf->mddev, conf->disks[i].rdev);
- rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
-
- clear_bit(R5_LOCKED, &sh->dev[i].flags);
- set_bit(STRIPE_HANDLE, &sh->state);
- release_stripe(sh);
- }
- static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
-
- static void raid5_build_block(struct stripe_head *sh, int i, int previous)
- {
- struct r5dev *dev = &sh->dev[i];
- bio_init(&dev->req);
- dev->req.bi_io_vec = &dev->vec;
- dev->req.bi_vcnt++;
- dev->req.bi_max_vecs++;
- dev->vec.bv_page = dev->page;
- dev->vec.bv_len = STRIPE_SIZE;
- dev->vec.bv_offset = 0;
- dev->req.bi_sector = sh->sector;
- dev->req.bi_private = sh;
- dev->flags = 0;
- dev->sector = compute_blocknr(sh, i, previous);
- }
- static void error(mddev_t *mddev, mdk_rdev_t *rdev)
- {
- char b[BDEVNAME_SIZE];
- raid5_conf_t *conf = mddev->private;
- pr_debug("raid456: error called\n");
- if (test_and_clear_bit(In_sync, &rdev->flags)) {
- unsigned long flags;
- spin_lock_irqsave(&conf->device_lock, flags);
- mddev->degraded++;
- spin_unlock_irqrestore(&conf->device_lock, flags);
- /*
- * if recovery was running, make sure it aborts.
- */
- set_bit(MD_RECOVERY_INTR, &mddev->recovery);
- }
- set_bit(Faulty, &rdev->flags);
- set_bit(MD_CHANGE_DEVS, &mddev->flags);
- printk(KERN_ALERT
- "md/raid:%s: Disk failure on %s, disabling device.\n"
- "md/raid:%s: Operation continuing on %d devices.\n",
- mdname(mddev),
- bdevname(rdev->bdev, b),
- mdname(mddev),
- conf->raid_disks - mddev->degraded);
- }
- /*
- * Input: a 'big' sector number,
- * Output: index of the data and parity disk, and the sector # in them.
- */
- static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
- int previous, int *dd_idx,
- struct stripe_head *sh)
- {
- sector_t stripe, stripe2;
- sector_t chunk_number;
- unsigned int chunk_offset;
- int pd_idx, qd_idx;
- int ddf_layout = 0;
- sector_t new_sector;
- int algorithm = previous ? conf->prev_algo
- : conf->algorithm;
- int sectors_per_chunk = previous ? conf->prev_chunk_sectors
- : conf->chunk_sectors;
- int raid_disks = previous ? conf->previous_raid_disks
- : conf->raid_disks;
- int data_disks = raid_disks - conf->max_degraded;
- /* First compute the information on this sector */
- /*
- * Compute the chunk number and the sector offset inside the chunk
- */
- chunk_offset = sector_div(r_sector, sectors_per_chunk);
- chunk_number = r_sector;
- /*
- * Compute the stripe number
- */
- stripe = chunk_number;
- *dd_idx = sector_div(stripe, data_disks);
- stripe2 = stripe;
- /*
- * Select the parity disk based on the user selected algorithm.
- */
- pd_idx = qd_idx = ~0;
- switch(conf->level) {
- case 4:
- pd_idx = data_disks;
- break;
- case 5:
- switch (algorithm) {
- case ALGORITHM_LEFT_ASYMMETRIC:
- pd_idx = data_disks - sector_div(stripe2, raid_disks);
- if (*dd_idx >= pd_idx)
- (*dd_idx)++;
- break;
- case ALGORITHM_RIGHT_ASYMMETRIC:
- pd_idx = sector_div(stripe2, raid_disks);
- if (*dd_idx >= pd_idx)
- (*dd_idx)++;
- break;
- case ALGORITHM_LEFT_SYMMETRIC:
- pd_idx = data_disks - sector_div(stripe2, raid_disks);
- *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
- break;
- case ALGORITHM_RIGHT_SYMMETRIC:
- pd_idx = sector_div(stripe2, raid_disks);
- *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
- break;
- case ALGORITHM_PARITY_0:
- pd_idx = 0;
- (*dd_idx)++;
- break;
- case ALGORITHM_PARITY_N:
- pd_idx = data_disks;
- break;
- default:
- BUG();
- }
- break;
- case 6:
- switch (algorithm) {
- case ALGORITHM_LEFT_ASYMMETRIC:
- pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
- qd_idx = pd_idx + 1;
- if (pd_idx == raid_disks-1) {
- (*dd_idx)++; /* Q D D D P */
- qd_idx = 0;
- } else if (*dd_idx >= pd_idx)
- (*dd_idx) += 2; /* D D P Q D */
- break;
- case ALGORITHM_RIGHT_ASYMMETRIC:
- pd_idx = sector_div(stripe2, raid_disks);
- qd_idx = pd_idx + 1;
- if (pd_idx == raid_disks-1) {
- (*dd_idx)++; /* Q D D D P */
- qd_idx = 0;
- } else if (*dd_idx >= pd_idx)
- (*dd_idx) += 2; /* D D P Q D */
- break;
- case ALGORITHM_LEFT_SYMMETRIC:
- pd_…