/drivers/md/raid5.c

https://bitbucket.org/ndreys/linux-sunxi · C · 6022 lines · 4565 code · 661 blank · 796 comment · 1114 complexity · 55ed2e6a439dbba1eab15e8ff1006358 MD5 · raw file

Large files are truncated click here to view the full file

  1. /*
  2. * raid5.c : Multiple Devices driver for Linux
  3. * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
  4. * Copyright (C) 1999, 2000 Ingo Molnar
  5. * Copyright (C) 2002, 2003 H. Peter Anvin
  6. *
  7. * RAID-4/5/6 management functions.
  8. * Thanks to Penguin Computing for making the RAID-6 development possible
  9. * by donating a test server!
  10. *
  11. * This program is free software; you can redistribute it and/or modify
  12. * it under the terms of the GNU General Public License as published by
  13. * the Free Software Foundation; either version 2, or (at your option)
  14. * any later version.
  15. *
  16. * You should have received a copy of the GNU General Public License
  17. * (for example /usr/src/linux/COPYING); if not, write to the Free
  18. * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  19. */
  20. /*
  21. * BITMAP UNPLUGGING:
  22. *
  23. * The sequencing for updating the bitmap reliably is a little
  24. * subtle (and I got it wrong the first time) so it deserves some
  25. * explanation.
  26. *
  27. * We group bitmap updates into batches. Each batch has a number.
  28. * We may write out several batches at once, but that isn't very important.
  29. * conf->seq_write is the number of the last batch successfully written.
  30. * conf->seq_flush is the number of the last batch that was closed to
  31. * new additions.
  32. * When we discover that we will need to write to any block in a stripe
  33. * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
  34. * the number of the batch it will be in. This is seq_flush+1.
  35. * When we are ready to do a write, if that batch hasn't been written yet,
  36. * we plug the array and queue the stripe for later.
  37. * When an unplug happens, we increment seq_flush, thus closing the current
  38. * batch.
  39. * When we notice that seq_flush > seq_write, we write out all pending updates
  40. * to the bitmap, and advance seq_write to where seq_flush was.
  41. * This may occasionally write a bit out twice, but is sure never to
  42. * miss any bits.
  43. */
  44. #include <linux/blkdev.h>
  45. #include <linux/kthread.h>
  46. #include <linux/raid/pq.h>
  47. #include <linux/async_tx.h>
  48. #include <linux/async.h>
  49. #include <linux/seq_file.h>
  50. #include <linux/cpu.h>
  51. #include <linux/slab.h>
  52. #include "md.h"
  53. #include "raid5.h"
  54. #include "raid0.h"
  55. #include "bitmap.h"
  56. /*
  57. * Stripe cache
  58. */
  /* NOTE(review): presumably the default stripe-cache size; not referenced in this part of the file -- confirm. */
  59. #define NR_STRIPES 256
  /* Size in bytes of the per-device buffer of a stripe_head: one page. */
  60. #define STRIPE_SIZE PAGE_SIZE
  /* Shift that turns a 512-byte sector number into a STRIPE_SIZE unit number. */
  61. #define STRIPE_SHIFT (PAGE_SHIFT - 9)
  /* STRIPE_SIZE expressed in 512-byte sectors. */
  62. #define STRIPE_SECTORS (STRIPE_SIZE>>9)
  /* __release_stripe() wakes the md thread when preread_active_stripes drops below this. */
  63. #define IO_THRESHOLD 1
  /* NOTE(review): not referenced in this part of the file; meaning to be confirmed. */
  64. #define BYPASS_THRESHOLD 1
  /* Number of hash buckets: as many hlist_heads as fit in one page. */
  65. #define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head))
  /* NOTE(review): masking is only a valid modulo if NR_HASH is a power of two -- confirm. */
  66. #define HASH_MASK (NR_HASH - 1)
  /* Address of the conf->stripe_hashtbl bucket that sector 'sect' hashes to. */
  67. #define stripe_hash(conf, sect) (&((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK]))
  /*
   * bios attached to a stripe+device for I/O are linked together in
   * bi_sector order without overlap.  There may be several bios per
   * stripe+device, and a bio could span several devices.
   * When walking this list for a particular stripe+device, we must never
   * proceed beyond a bio that extends past this device, as the next bio
   * might no longer be valid.
   *
   * r5_next_bio(bio, sect) yields the 'next' bio in the list, given the
   * first sector 'sect' of the current stripe+device: bio->bi_next while
   * 'bio' ends before sect + STRIPE_SECTORS, NULL otherwise.
   *
   * Both arguments are fully parenthesised so that a compound expression
   * (for example "cond ? a : b") can be passed as 'sect' without changing
   * how the comparison is parsed.
   */
  #define r5_next_bio(bio, sect) \
      (((bio)->bi_sector + ((bio)->bi_size >> 9) < (sect) + STRIPE_SECTORS) ? \
       (bio)->bi_next : NULL)
  78. /*
  79. * The following can be used to debug the driver
  80. */
  /* Non-zero enables the CHECK_DEVLOCK() assertion below (SMP builds only). */
  81. #define RAID5_PARANOIA 1
  82. #if RAID5_PARANOIA && defined(CONFIG_SMP)
  /* Asserts that conf->device_lock is held; expects a variable named 'conf' in scope at the call site. */
  83. # define CHECK_DEVLOCK() assert_spin_locked(&conf->device_lock)
  84. #else
  85. # define CHECK_DEVLOCK()
  86. #endif
  /* With DEBUG defined, the inline keywords expand to nothing. */
  87. #ifdef DEBUG
  88. #define inline
  89. #define __inline__
  90. #endif
  /* printk() that is only issued when printk_ratelimit() returns non-zero; the result is discarded. */
  91. #define printk_rl(args...) ((void) (printk_ratelimit() && printk(args)))
  92. /*
  93. * We maintain a biased count of active stripes in the bottom 16 bits of
  94. * bi_phys_segments, and a count of processed stripes in the upper 16 bits
  95. */
  96. static inline int raid5_bi_phys_segments(struct bio *bio)
  97. {
  98. return bio->bi_phys_segments & 0xffff;
  99. }
  100. static inline int raid5_bi_hw_segments(struct bio *bio)
  101. {
  102. return (bio->bi_phys_segments >> 16) & 0xffff;
  103. }
  104. static inline int raid5_dec_bi_phys_segments(struct bio *bio)
  105. {
  106. --bio->bi_phys_segments;
  107. return raid5_bi_phys_segments(bio);
  108. }
  109. static inline int raid5_dec_bi_hw_segments(struct bio *bio)
  110. {
  111. unsigned short val = raid5_bi_hw_segments(bio);
  112. --val;
  113. bio->bi_phys_segments = (val << 16) | raid5_bi_phys_segments(bio);
  114. return val;
  115. }
  116. static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt)
  117. {
  118. bio->bi_phys_segments = raid5_bi_phys_segments(bio) | (cnt << 16);
  119. }
  120. /* Find first data disk in a raid6 stripe */
  121. static inline int raid6_d0(struct stripe_head *sh)
  122. {
  123. if (sh->ddf_layout)
  124. /* ddf always start from first device */
  125. return 0;
  126. /* md starts just after Q block */
  127. if (sh->qd_idx == sh->disks - 1)
  128. return 0;
  129. else
  130. return sh->qd_idx + 1;
  131. }
  /* Return the disk index following @disk, wrapping to 0 at @raid_disks. */
  static inline int raid6_next_disk(int disk, int raid_disks)
  {
      int next = disk + 1;

      if (next < raid_disks)
          return next;
      return 0;
  }
  137. /* When walking through the disks in a raid5, starting at raid6_d0,
  138. * We need to map each disk to a 'slot', where the data disks are slot
  139. * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
  140. * is raid_disks-1. This help does that mapping.
  141. */
  142. static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
  143. int *count, int syndrome_disks)
  144. {
  145. int slot = *count;
  146. if (sh->ddf_layout)
  147. (*count)++;
  148. if (idx == sh->pd_idx)
  149. return syndrome_disks;
  150. if (idx == sh->qd_idx)
  151. return syndrome_disks + 1;
  152. if (!sh->ddf_layout)
  153. (*count)++;
  154. return slot;
  155. }
  156. static void return_io(struct bio *return_bi)
  157. {
  158. struct bio *bi = return_bi;
  159. while (bi) {
  160. return_bi = bi->bi_next;
  161. bi->bi_next = NULL;
  162. bi->bi_size = 0;
  163. bio_endio(bi, 0);
  164. bi = return_bi;
  165. }
  166. }
  167. static void print_raid5_conf (raid5_conf_t *conf);
  168. static int stripe_operations_active(struct stripe_head *sh)
  169. {
  170. return sh->check_state || sh->reconstruct_state ||
  171. test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
  172. test_bit(STRIPE_COMPUTE_RUN, &sh->state);
  173. }
  /*
   * Drop one reference on @sh.  Nothing further happens unless this was the
   * last reference (sh->count reaches zero), in which case the stripe is put
   * on one of @conf's lists:
   *  - STRIPE_HANDLE set: delayed_list, bitmap_list or handle_list, and the
   *    md thread is woken;
   *  - otherwise: the stripe stops counting as active and, unless it is
   *    STRIPE_EXPANDING, is added to inactive_list.
   * release_stripe() calls this with conf->device_lock held.
   */
  174. static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
  175. {
  176. if (atomic_dec_and_test(&sh->count)) {
  177. BUG_ON(!list_empty(&sh->lru));
  178. BUG_ON(atomic_read(&conf->active_stripes)==0);
  179. if (test_bit(STRIPE_HANDLE, &sh->state)) {
  /* delayed and not yet preread-active: park on delayed_list */
  180. if (test_bit(STRIPE_DELAYED, &sh->state) &&
  181. !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
  182. list_add_tail(&sh->lru, &conf->delayed_list);
  /* the stripe's bitmap batch (bm_seq) is still ahead of seq_write */
  183. else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
  184. sh->bm_seq - conf->seq_write > 0)
  185. list_add_tail(&sh->lru, &conf->bitmap_list);
  186. else {
  187. clear_bit(STRIPE_DELAYED, &sh->state);
  188. clear_bit(STRIPE_BIT_DELAY, &sh->state);
  189. list_add_tail(&sh->lru, &conf->handle_list);
  190. }
  191. md_wakeup_thread(conf->mddev->thread);
  192. } else {
  193. BUG_ON(stripe_operations_active(sh));
  /* one fewer preread-active stripe; wake the md thread once below IO_THRESHOLD */
  194. if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
  195. atomic_dec(&conf->preread_active_stripes);
  196. if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
  197. md_wakeup_thread(conf->mddev->thread);
  198. }
  199. atomic_dec(&conf->active_stripes);
  200. if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
  201. list_add_tail(&sh->lru, &conf->inactive_list);
  /* wake anyone sleeping on wait_for_stripe (see get_active_stripe()) */
  202. wake_up(&conf->wait_for_stripe);
  203. if (conf->retry_read_aligned)
  204. md_wakeup_thread(conf->mddev->thread);
  205. }
  206. }
  207. }
  208. }
  209. static void release_stripe(struct stripe_head *sh)
  210. {
  211. raid5_conf_t *conf = sh->raid_conf;
  212. unsigned long flags;
  213. spin_lock_irqsave(&conf->device_lock, flags);
  214. __release_stripe(conf, sh);
  215. spin_unlock_irqrestore(&conf->device_lock, flags);
  216. }
  217. static inline void remove_hash(struct stripe_head *sh)
  218. {
  219. pr_debug("remove_hash(), stripe %llu\n",
  220. (unsigned long long)sh->sector);
  221. hlist_del_init(&sh->hash);
  222. }
  223. static inline void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
  224. {
  225. struct hlist_head *hp = stripe_hash(conf, sh->sector);
  226. pr_debug("insert_hash(), stripe %llu\n",
  227. (unsigned long long)sh->sector);
  228. CHECK_DEVLOCK();
  229. hlist_add_head(&sh->hash, hp);
  230. }
  231. /* find an idle stripe, make sure it is unhashed, and return it. */
  232. static struct stripe_head *get_free_stripe(raid5_conf_t *conf)
  233. {
  234. struct stripe_head *sh = NULL;
  235. struct list_head *first;
  236. CHECK_DEVLOCK();
  237. if (list_empty(&conf->inactive_list))
  238. goto out;
  239. first = conf->inactive_list.next;
  240. sh = list_entry(first, struct stripe_head, lru);
  241. list_del_init(first);
  242. remove_hash(sh);
  243. atomic_inc(&conf->active_stripes);
  244. out:
  245. return sh;
  246. }
  247. static void shrink_buffers(struct stripe_head *sh)
  248. {
  249. struct page *p;
  250. int i;
  251. int num = sh->raid_conf->pool_size;
  252. for (i = 0; i < num ; i++) {
  253. p = sh->dev[i].page;
  254. if (!p)
  255. continue;
  256. sh->dev[i].page = NULL;
  257. put_page(p);
  258. }
  259. }
  260. static int grow_buffers(struct stripe_head *sh)
  261. {
  262. int i;
  263. int num = sh->raid_conf->pool_size;
  264. for (i = 0; i < num; i++) {
  265. struct page *page;
  266. if (!(page = alloc_page(GFP_KERNEL))) {
  267. return 1;
  268. }
  269. sh->dev[i].page = page;
  270. }
  271. return 0;
  272. }
  273. static void raid5_build_block(struct stripe_head *sh, int i, int previous);
  274. static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
  275. struct stripe_head *sh);
  /*
   * (Re)initialise the idle stripe_head @sh for @sector and insert it into
   * the hash table.
   * @previous: non-zero selects conf->previous_raid_disks as the disk count;
   *            it is also subtracted from conf->generation to form
   *            sh->generation.
   * BUGs if @sh is still referenced, flagged STRIPE_HANDLE, has operations
   * active, or if any device still has bios attached or is R5_LOCKED.
   */
  276. static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
  277. {
  278. raid5_conf_t *conf = sh->raid_conf;
  279. int i;
  280. BUG_ON(atomic_read(&sh->count) != 0);
  281. BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
  282. BUG_ON(stripe_operations_active(sh));
  283. CHECK_DEVLOCK();
  /* note: sh->sector still holds the stripe's old sector at this point */
  284. pr_debug("init_stripe called, stripe %llu\n",
  285. (unsigned long long)sh->sector);
  286. remove_hash(sh);
  287. sh->generation = conf->generation - previous;
  288. sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
  289. sh->sector = sector;
  290. stripe_set_idx(sector, conf, previous, sh);
  291. sh->state = 0;
  292. for (i = sh->disks; i--; ) {
  293. struct r5dev *dev = &sh->dev[i];
  /* an idle stripe must have no pending bios and no locked device */
  294. if (dev->toread || dev->read || dev->towrite || dev->written ||
  295. test_bit(R5_LOCKED, &dev->flags)) {
  296. printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n",
  297. (unsigned long long)sh->sector, i, dev->toread,
  298. dev->read, dev->towrite, dev->written,
  299. test_bit(R5_LOCKED, &dev->flags));
  300. BUG();
  301. }
  302. dev->flags = 0;
  303. raid5_build_block(sh, i, previous);
  304. }
  305. insert_hash(conf, sh);
  306. }
  307. static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector,
  308. short generation)
  309. {
  310. struct stripe_head *sh;
  311. struct hlist_node *hn;
  312. CHECK_DEVLOCK();
  313. pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
  314. hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash)
  315. if (sh->sector == sector && sh->generation == generation)
  316. return sh;
  317. pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
  318. return NULL;
  319. }
  320. /*
  321. * Need to check if array has failed when deciding whether to:
  322. * - start an array
  323. * - remove non-faulty devices
  324. * - add a spare
  325. * - allow a reshape
  326. * This determination is simple when no reshape is happening.
  327. * However if there is a reshape, we need to carefully check
  328. * both the before and after sections.
  329. * This is because some failed devices may only affect one
  330. * of the two sections, and some non-in_sync devices may
  331. * be insync in the section most affected by failed devices.
  332. */
  333. static int has_failed(raid5_conf_t *conf)
  334. {
  335. int degraded;
  336. int i;
  337. if (conf->mddev->reshape_position == MaxSector)
  338. return conf->mddev->degraded > conf->max_degraded;
  339. rcu_read_lock();
  340. degraded = 0;
  341. for (i = 0; i < conf->previous_raid_disks; i++) {
  342. mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
  343. if (!rdev || test_bit(Faulty, &rdev->flags))
  344. degraded++;
  345. else if (test_bit(In_sync, &rdev->flags))
  346. ;
  347. else
  348. /* not in-sync or faulty.
  349. * If the reshape increases the number of devices,
  350. * this is being recovered by the reshape, so
  351. * this 'previous' section is not in_sync.
  352. * If the number of devices is being reduced however,
  353. * the device can only be part of the array if
  354. * we are reverting a reshape, so this section will
  355. * be in-sync.
  356. */
  357. if (conf->raid_disks >= conf->previous_raid_disks)
  358. degraded++;
  359. }
  360. rcu_read_unlock();
  361. if (degraded > conf->max_degraded)
  362. return 1;
  363. rcu_read_lock();
  364. degraded = 0;
  365. for (i = 0; i < conf->raid_disks; i++) {
  366. mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
  367. if (!rdev || test_bit(Faulty, &rdev->flags))
  368. degraded++;
  369. else if (test_bit(In_sync, &rdev->flags))
  370. ;
  371. else
  372. /* not in-sync or faulty.
  373. * If reshape increases the number of devices, this
  374. * section has already been recovered, else it
  375. * almost certainly hasn't.
  376. */
  377. if (conf->raid_disks <= conf->previous_raid_disks)
  378. degraded++;
  379. }
  380. rcu_read_unlock();
  381. if (degraded > conf->max_degraded)
  382. return 1;
  383. return 0;
  384. }
  /*
   * Find the cached stripe_head for @sector, or set one up from the inactive
   * list, and return it with sh->count incremented.
   * @previous:  subtracted from conf->generation for the lookup and passed
   *             on to init_stripe()
   * @noblock:   if no stripe is cached and none is free, return NULL instead
   *             of sleeping
   * @noquiesce: do not wait for conf->quiesce to become zero
   * conf->device_lock is taken with spin_lock_irq() for the whole call (the
   * wait_event_lock_irq() calls are handed the lock).
   */
  385. static struct stripe_head *
  386. get_active_stripe(raid5_conf_t *conf, sector_t sector,
  387. int previous, int noblock, int noquiesce)
  388. {
  389. struct stripe_head *sh;
  390. pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
  391. spin_lock_irq(&conf->device_lock);
  392. do {
  393. wait_event_lock_irq(conf->wait_for_stripe,
  394. conf->quiesce == 0 || noquiesce,
  395. conf->device_lock, /* nothing */);
  396. sh = __find_stripe(conf, sector, conf->generation - previous);
  397. if (!sh) {
  /* not cached: try to recycle an inactive stripe, unless allocation is currently blocked */
  398. if (!conf->inactive_blocked)
  399. sh = get_free_stripe(conf);
  400. if (noblock && sh == NULL)
  401. break;
  402. if (!sh) {
  /*
   * Nothing free: sleep until the inactive list is non-empty and either
   * fewer than 3/4 of max_nr_stripes are active or inactive_blocked has
   * been cleared; then go round the loop again.
   */
  403. conf->inactive_blocked = 1;
  404. wait_event_lock_irq(conf->wait_for_stripe,
  405. !list_empty(&conf->inactive_list) &&
  406. (atomic_read(&conf->active_stripes)
  407. < (conf->max_nr_stripes *3/4)
  408. || !conf->inactive_blocked),
  409. conf->device_lock,
  410. );
  411. conf->inactive_blocked = 0;
  412. } else
  413. init_stripe(sh, sector, previous);
  414. } else {
  415. if (atomic_read(&sh->count)) {
  /* already referenced: it may only sit on a list while STRIPE_EXPANDING */
  416. BUG_ON(!list_empty(&sh->lru)
  417. && !test_bit(STRIPE_EXPANDING, &sh->state));
  418. } else {
  /* cached but unreferenced: count it as active (unless STRIPE_HANDLE is set) and take it off its list */
  419. if (!test_bit(STRIPE_HANDLE, &sh->state))
  420. atomic_inc(&conf->active_stripes);
  421. if (list_empty(&sh->lru) &&
  422. !test_bit(STRIPE_EXPANDING, &sh->state))
  423. BUG();
  424. list_del_init(&sh->lru);
  425. }
  426. }
  427. } while (sh == NULL);
  /* sh is NULL only via the noblock break above */
  428. if (sh)
  429. atomic_inc(&sh->count);
  430. spin_unlock_irq(&conf->device_lock);
  431. return sh;
  432. }
  433. static void
  434. raid5_end_read_request(struct bio *bi, int error);
  435. static void
  436. raid5_end_write_request(struct bio *bi, int error);
  /*
   * Submit the embedded per-device bio (dev[i].req) of every device of @sh
   * flagged R5_Wantwrite or R5_Wantread; both flags are cleared as tested.
   * A device whose rdev is missing or Faulty is skipped: its R5_LOCKED is
   * cleared and STRIPE_HANDLE is set, and for a write the stripe is marked
   * STRIPE_DEGRADED.
   * @s: only s->syncing, s->expanding and s->expanded are read, to decide
   *     whether the I/O is accounted through md_sync_acct().
   */
  437. static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
  438. {
  439. raid5_conf_t *conf = sh->raid_conf;
  440. int i, disks = sh->disks;
  441. might_sleep();
  442. for (i = disks; i--; ) {
  443. int rw;
  444. struct bio *bi;
  445. mdk_rdev_t *rdev;
  /* pick the direction; R5_WantFUA upgrades a write to WRITE_FUA */
  446. if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
  447. if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
  448. rw = WRITE_FUA;
  449. else
  450. rw = WRITE;
  451. } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
  452. rw = READ;
  453. else
  454. continue;
  455. bi = &sh->dev[i].req;
  456. bi->bi_rw = rw;
  457. if (rw & WRITE)
  458. bi->bi_end_io = raid5_end_write_request;
  459. else
  460. bi->bi_end_io = raid5_end_read_request;
  /* look up the rdev under RCU and take an nr_pending reference before leaving the read-side section */
  461. rcu_read_lock();
  462. rdev = rcu_dereference(conf->disks[i].rdev);
  463. if (rdev && test_bit(Faulty, &rdev->flags))
  464. rdev = NULL;
  465. if (rdev)
  466. atomic_inc(&rdev->nr_pending);
  467. rcu_read_unlock();
  468. if (rdev) {
  469. if (s->syncing || s->expanding || s->expanded)
  470. md_sync_acct(rdev->bdev, STRIPE_SECTORS);
  471. set_bit(STRIPE_IO_STARTED, &sh->state);
  472. bi->bi_bdev = rdev->bdev;
  473. pr_debug("%s: for %llu schedule op %ld on disc %d\n",
  474. __func__, (unsigned long long)sh->sector,
  475. bi->bi_rw, i);
  /* stripe reference taken for the in-flight bio */
  476. atomic_inc(&sh->count);
  /* set the bio up as a single STRIPE_SIZE segment using dev[i].vec */
  477. bi->bi_sector = sh->sector + rdev->data_offset;
  478. bi->bi_flags = 1 << BIO_UPTODATE;
  479. bi->bi_vcnt = 1;
  480. bi->bi_max_vecs = 1;
  481. bi->bi_idx = 0;
  482. bi->bi_io_vec = &sh->dev[i].vec;
  483. bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
  484. bi->bi_io_vec[0].bv_offset = 0;
  485. bi->bi_size = STRIPE_SIZE;
  486. bi->bi_next = NULL;
  /* a write flagged R5_ReWrite is added to the device's corrected_errors count */
  487. if ((rw & WRITE) &&
  488. test_bit(R5_ReWrite, &sh->dev[i].flags))
  489. atomic_add(STRIPE_SECTORS,
  490. &rdev->corrected_errors);
  491. generic_make_request(bi);
  492. } else {
  493. if (rw & WRITE)
  494. set_bit(STRIPE_DEGRADED, &sh->state);
  495. pr_debug("skip op %ld on disc %d for sector %llu\n",
  496. bi->bi_rw, i, (unsigned long long)sh->sector);
  497. clear_bit(R5_LOCKED, &sh->dev[i].flags);
  498. set_bit(STRIPE_HANDLE, &sh->state);
  499. }
  500. }
  501. }
  /*
   * Copy the part of @bio that overlaps the STRIPE_SIZE buffer @page, whose
   * first sector is @sector, using async_memcpy().
   * @frombio: non-zero copies bio -> page (and requests ASYNC_TX_FENCE);
   *           zero copies page -> bio.
   * @tx:      descriptor the first copy depends on; each later copy is
   *           chained to the one before it.
   * Returns the descriptor of the last copy issued, or @tx unchanged when
   * nothing was copied.
   */
  502. static struct dma_async_tx_descriptor *
  503. async_copy_data(int frombio, struct bio *bio, struct page *page,
  504. sector_t sector, struct dma_async_tx_descriptor *tx)
  505. {
  506. struct bio_vec *bvl;
  507. struct page *bio_page;
  508. int i;
  509. int page_offset;
  510. struct async_submit_ctl submit;
  511. enum async_tx_flags flags = 0;
  /* byte offset of the bio's start relative to the page; negative when the bio begins before @sector */
  512. if (bio->bi_sector >= sector)
  513. page_offset = (signed)(bio->bi_sector - sector) * 512;
  514. else
  515. page_offset = (signed)(sector - bio->bi_sector) * -512;
  516. if (frombio)
  517. flags |= ASYNC_TX_FENCE;
  518. init_async_submit(&submit, flags, tx, NULL, NULL, NULL);
  519. bio_for_each_segment(bvl, bio, i) {
  520. int len = bvl->bv_len;
  521. int clen;
  522. int b_offset = 0;
  /* segment starts before the page: skip the leading bytes that fall outside it */
  523. if (page_offset < 0) {
  524. b_offset = -page_offset;
  525. page_offset += b_offset;
  526. len -= b_offset;
  527. }
  /* clamp the copy length to what still fits in the page */
  528. if (len > 0 && page_offset + len > STRIPE_SIZE)
  529. clen = STRIPE_SIZE - page_offset;
  530. else
  531. clen = len;
  532. if (clen > 0) {
  533. b_offset += bvl->bv_offset;
  534. bio_page = bvl->bv_page;
  535. if (frombio)
  536. tx = async_memcpy(page, bio_page, page_offset,
  537. b_offset, clen, &submit);
  538. else
  539. tx = async_memcpy(bio_page, page, b_offset,
  540. page_offset, clen, &submit);
  541. }
  542. /* chain the operations */
  543. submit.depend_tx = tx;
  544. if (clen < len) /* hit end of page */
  545. break;
  546. page_offset += len;
  547. }
  548. return tx;
  549. }
  /*
   * Completion callback scheduled by ops_run_biofill().
   * For every device flagged R5_Wantfill it detaches the dev->read chain and
   * decrements each bio's active-stripe count; bios whose count reaches zero
   * are collected and completed through return_io().  It then clears
   * STRIPE_BIOFILL_RUN, sets STRIPE_HANDLE and releases the stripe reference
   * that ops_run_biofill() took.
   * @stripe_head_ref: the struct stripe_head the biofill was run for.
   */
  550. static void ops_complete_biofill(void *stripe_head_ref)
  551. {
  552. struct stripe_head *sh = stripe_head_ref;
  553. struct bio *return_bi = NULL;
  554. raid5_conf_t *conf = sh->raid_conf;
  555. int i;
  556. pr_debug("%s: stripe %llu\n", __func__,
  557. (unsigned long long)sh->sector);
  558. /* clear completed biofills */
  559. spin_lock_irq(&conf->device_lock);
  560. for (i = sh->disks; i--; ) {
  561. struct r5dev *dev = &sh->dev[i];
  562. /* acknowledge completion of a biofill operation */
  563. /* and check if we need to reply to a read request,
  564. * new R5_Wantfill requests are held off until
  565. * !STRIPE_BIOFILL_RUN
  566. */
  567. if (test_and_clear_bit(R5_Wantfill, &dev->flags)) {
  568. struct bio *rbi, *rbi2;
  569. BUG_ON(!dev->read);
  570. rbi = dev->read;
  571. dev->read = NULL;
  572. while (rbi && rbi->bi_sector <
  573. dev->sector + STRIPE_SECTORS) {
  /* fetch the successor before rbi->bi_next is reused to build return_bi */
  574. rbi2 = r5_next_bio(rbi, dev->sector);
  575. if (!raid5_dec_bi_phys_segments(rbi)) {
  576. rbi->bi_next = return_bi;
  577. return_bi = rbi;
  578. }
  579. rbi = rbi2;
  580. }
  581. }
  582. }
  583. spin_unlock_irq(&conf->device_lock);
  584. clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
  /* bios are completed after device_lock has been dropped */
  585. return_io(return_bi);
  586. set_bit(STRIPE_HANDLE, &sh->state);
  587. release_stripe(sh);
  588. }
  /*
   * Satisfy pending reads from the stripe's buffers.
   * For every device flagged R5_Wantfill, dev->toread is moved to dev->read
   * (under conf->device_lock) and the stripe page is copied into each of
   * those bios with async_copy_data().  ops_complete_biofill() is scheduled
   * to run after the last copy; a stripe reference is taken for it.
   */
  589. static void ops_run_biofill(struct stripe_head *sh)
  590. {
  591. struct dma_async_tx_descriptor *tx = NULL;
  592. raid5_conf_t *conf = sh->raid_conf;
  593. struct async_submit_ctl submit;
  594. int i;
  595. pr_debug("%s: stripe %llu\n", __func__,
  596. (unsigned long long)sh->sector);
  597. for (i = sh->disks; i--; ) {
  598. struct r5dev *dev = &sh->dev[i];
  599. if (test_bit(R5_Wantfill, &dev->flags)) {
  600. struct bio *rbi;
  601. spin_lock_irq(&conf->device_lock);
  602. dev->read = rbi = dev->toread;
  603. dev->toread = NULL;
  604. spin_unlock_irq(&conf->device_lock);
  /* frombio == 0: data flows from the stripe page into the bio */
  605. while (rbi && rbi->bi_sector <
  606. dev->sector + STRIPE_SECTORS) {
  607. tx = async_copy_data(0, rbi, dev->page,
  608. dev->sector, tx);
  609. rbi = r5_next_bio(rbi, dev->sector);
  610. }
  611. }
  612. }
  /* reference released by ops_complete_biofill() */
  613. atomic_inc(&sh->count);
  614. init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
  615. async_trigger_callback(&submit);
  616. }
  617. static void mark_target_uptodate(struct stripe_head *sh, int target)
  618. {
  619. struct r5dev *tgt;
  620. if (target < 0)
  621. return;
  622. tgt = &sh->dev[target];
  623. set_bit(R5_UPTODATE, &tgt->flags);
  624. BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
  625. clear_bit(R5_Wantcompute, &tgt->flags);
  626. }
  627. static void ops_complete_compute(void *stripe_head_ref)
  628. {
  629. struct stripe_head *sh = stripe_head_ref;
  630. pr_debug("%s: stripe %llu\n", __func__,
  631. (unsigned long long)sh->sector);
  632. /* mark the computed target(s) as uptodate */
  633. mark_target_uptodate(sh, sh->ops.target);
  634. mark_target_uptodate(sh, sh->ops.target2);
  635. clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
  636. if (sh->check_state == check_state_compute_run)
  637. sh->check_state = check_state_compute_result;
  638. set_bit(STRIPE_HANDLE, &sh->state);
  639. release_stripe(sh);
  640. }
  641. /* return a pointer to the address conversion region of the scribble buffer */
  642. static addr_conv_t *to_addr_conv(struct stripe_head *sh,
  643. struct raid5_percpu *percpu)
  644. {
  645. return percpu->scribble + sizeof(struct page *) * (sh->disks + 2);
  646. }
  647. static struct dma_async_tx_descriptor *
  648. ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
  649. {
  650. int disks = sh->disks;
  651. struct page **xor_srcs = percpu->scribble;
  652. int target = sh->ops.target;
  653. struct r5dev *tgt = &sh->dev[target];
  654. struct page *xor_dest = tgt->page;
  655. int count = 0;
  656. struct dma_async_tx_descriptor *tx;
  657. struct async_submit_ctl submit;
  658. int i;
  659. pr_debug("%s: stripe %llu block: %d\n",
  660. __func__, (unsigned long long)sh->sector, target);
  661. BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
  662. for (i = disks; i--; )
  663. if (i != target)
  664. xor_srcs[count++] = sh->dev[i].page;
  665. atomic_inc(&sh->count);
  666. init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
  667. ops_complete_compute, sh, to_addr_conv(sh, percpu));
  668. if (unlikely(count == 1))
  669. tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
  670. else
  671. tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
  672. return tx;
  673. }
  674. /* set_syndrome_sources - populate source buffers for gen_syndrome
  675. * @srcs - (struct page *) array of size sh->disks
  676. * @sh - stripe_head to parse
  677. *
  678. * Populates srcs in proper layout order for the stripe and returns the
  679. * 'count' of sources to be used in a call to async_gen_syndrome. The P
  680. * destination buffer is recorded in srcs[count] and the Q destination
  681. * is recorded in srcs[count+1]].
  682. */
  683. static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
  684. {
  685. int disks = sh->disks;
  686. int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
  687. int d0_idx = raid6_d0(sh);
  688. int count;
  689. int i;
  690. for (i = 0; i < disks; i++)
  691. srcs[i] = NULL;
  692. count = 0;
  693. i = d0_idx;
  694. do {
  695. int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
  696. srcs[slot] = sh->dev[i].page;
  697. i = raid6_next_disk(i, disks);
  698. } while (i != d0_idx);
  699. return syndrome_disks;
  700. }
  /*
   * Recompute a single block of a RAID-6 stripe.
   * Exactly one of sh->ops.target / sh->ops.target2 must be non-negative.
   * If the target is the Q block it is regenerated with
   * async_gen_syndrome(); any other block is rebuilt by XOR over all blocks
   * except the target and Q.
   * Takes a stripe reference, schedules ops_complete_compute() and returns
   * the descriptor of the submitted operation.
   */
  701. static struct dma_async_tx_descriptor *
  702. ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
  703. {
  704. int disks = sh->disks;
  705. struct page **blocks = percpu->scribble;
  706. int target;
  707. int qd_idx = sh->qd_idx;
  708. struct dma_async_tx_descriptor *tx;
  709. struct async_submit_ctl submit;
  710. struct r5dev *tgt;
  711. struct page *dest;
  712. int i;
  713. int count;
  714. if (sh->ops.target < 0)
  715. target = sh->ops.target2;
  716. else if (sh->ops.target2 < 0)
  717. target = sh->ops.target;
  718. else
  719. /* we should only have one valid target */
  720. BUG();
  721. BUG_ON(target < 0);
  722. pr_debug("%s: stripe %llu block: %d\n",
  723. __func__, (unsigned long long)sh->sector, target);
  724. tgt = &sh->dev[target];
  725. BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
  726. dest = tgt->page;
  /* reference for the ops_complete_compute() callback, which ends in release_stripe() */
  727. atomic_inc(&sh->count);
  728. if (target == qd_idx) {
  729. count = set_syndrome_sources(blocks, sh);
  730. blocks[count] = NULL; /* regenerating p is not necessary */
  731. BUG_ON(blocks[count+1] != dest); /* q should already be set */
  732. init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
  733. ops_complete_compute, sh,
  734. to_addr_conv(sh, percpu));
  735. tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
  736. } else {
  737. /* Compute any data- or p-drive using XOR */
  738. count = 0;
  739. for (i = disks; i-- ; ) {
  740. if (i == target || i == qd_idx)
  741. continue;
  742. blocks[count++] = sh->dev[i].page;
  743. }
  744. init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
  745. NULL, ops_complete_compute, sh,
  746. to_addr_conv(sh, percpu));
  747. tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
  748. }
  749. return tx;
  750. }
  /*
   * Recompute two blocks of a RAID-6 stripe (sh->ops.target and
   * sh->ops.target2, both of which must be valid and flagged
   * R5_Wantcompute).
   * The two targets are translated to syndrome slots (faila < failb) and the
   * recovery method is chosen from which slots are missing:
   *   P+Q -> regenerate both with async_gen_syndrome()
   *   D+Q -> XOR-rebuild the data block, then regenerate the syndrome
   *   D+P -> async_raid6_datap_recov()
   *   D+D -> async_raid6_2data_recov()
   * Takes a stripe reference, schedules ops_complete_compute() on the final
   * operation and returns that operation's descriptor.
   */
  751. static struct dma_async_tx_descriptor *
  752. ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
  753. {
  754. int i, count, disks = sh->disks;
  755. int syndrome_disks = sh->ddf_layout ? disks : disks-2;
  756. int d0_idx = raid6_d0(sh);
  757. int faila = -1, failb = -1;
  758. int target = sh->ops.target;
  759. int target2 = sh->ops.target2;
  760. struct r5dev *tgt = &sh->dev[target];
  761. struct r5dev *tgt2 = &sh->dev[target2];
  762. struct dma_async_tx_descriptor *tx;
  763. struct page **blocks = percpu->scribble;
  764. struct async_submit_ctl submit;
  765. pr_debug("%s: stripe %llu block1: %d block2: %d\n",
  766. __func__, (unsigned long long)sh->sector, target, target2);
  767. BUG_ON(target < 0 || target2 < 0);
  768. BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
  769. BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));
  770. /* we need to open-code set_syndrome_sources to handle the
  771. * slot number conversion for 'faila' and 'failb'
  772. */
  773. for (i = 0; i < disks ; i++)
  774. blocks[i] = NULL;
  775. count = 0;
  776. i = d0_idx;
  777. do {
  778. int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
  779. blocks[slot] = sh->dev[i].page;
  780. if (i == target)
  781. faila = slot;
  782. if (i == target2)
  783. failb = slot;
  784. i = raid6_next_disk(i, disks);
  785. } while (i != d0_idx);
  786. BUG_ON(faila == failb);
  /* order the failed slots so that faila < failb */
  787. if (failb < faila)
  788. swap(faila, failb);
  789. pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
  790. __func__, (unsigned long long)sh->sector, faila, failb);
  /* reference for the ops_complete_compute() callback */
  791. atomic_inc(&sh->count);
  792. if (failb == syndrome_disks+1) {
  793. /* Q disk is one of the missing disks */
  794. if (faila == syndrome_disks) {
  795. /* Missing P+Q, just recompute */
  796. init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
  797. ops_complete_compute, sh,
  798. to_addr_conv(sh, percpu));
  799. return async_gen_syndrome(blocks, 0, syndrome_disks+2,
  800. STRIPE_SIZE, &submit);
  801. } else {
  802. struct page *dest;
  803. int data_target;
  804. int qd_idx = sh->qd_idx;
  805. /* Missing D+Q: recompute D from P, then recompute Q */
  806. if (target == qd_idx)
  807. data_target = target2;
  808. else
  809. data_target = target;
  /* 'blocks' is reused here as the XOR source list: every page except the data target and Q */
  810. count = 0;
  811. for (i = disks; i-- ; ) {
  812. if (i == data_target || i == qd_idx)
  813. continue;
  814. blocks[count++] = sh->dev[i].page;
  815. }
  816. dest = sh->dev[data_target].page;
  /* no callback on the XOR step; the syndrome step below depends on it and carries the completion */
  817. init_async_submit(&submit,
  818. ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
  819. NULL, NULL, NULL,
  820. to_addr_conv(sh, percpu));
  821. tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
  822. &submit);
  823. count = set_syndrome_sources(blocks, sh);
  824. init_async_submit(&submit, ASYNC_TX_FENCE, tx,
  825. ops_complete_compute, sh,
  826. to_addr_conv(sh, percpu));
  827. return async_gen_syndrome(blocks, 0, count+2,
  828. STRIPE_SIZE, &submit);
  829. }
  830. } else {
  831. init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
  832. ops_complete_compute, sh,
  833. to_addr_conv(sh, percpu));
  834. if (failb == syndrome_disks) {
  835. /* We're missing D+P. */
  836. return async_raid6_datap_recov(syndrome_disks+2,
  837. STRIPE_SIZE, faila,
  838. blocks, &submit);
  839. } else {
  840. /* We're missing D+D. */
  841. return async_raid6_2data_recov(syndrome_disks+2,
  842. STRIPE_SIZE, faila, failb,
  843. blocks, &submit);
  844. }
  845. }
  846. }
  847. static void ops_complete_prexor(void *stripe_head_ref)
  848. {
  849. struct stripe_head *sh = stripe_head_ref;
  850. pr_debug("%s: stripe %llu\n", __func__,
  851. (unsigned long long)sh->sector);
  852. }
  853. static struct dma_async_tx_descriptor *
  854. ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
  855. struct dma_async_tx_descriptor *tx)
  856. {
  857. int disks = sh->disks;
  858. struct page **xor_srcs = percpu->scribble;
  859. int count = 0, pd_idx = sh->pd_idx, i;
  860. struct async_submit_ctl submit;
  861. /* existing parity data subtracted */
  862. struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
  863. pr_debug("%s: stripe %llu\n", __func__,
  864. (unsigned long long)sh->sector);
  865. for (i = disks; i--; ) {
  866. struct r5dev *dev = &sh->dev[i];
  867. /* Only process blocks that are known to be uptodate */
  868. if (test_bit(R5_Wantdrain, &dev->flags))
  869. xor_srcs[count++] = dev->page;
  870. }
  871. init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
  872. ops_complete_prexor, sh, to_addr_conv(sh, percpu));
  873. tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
  874. return tx;
  875. }
  876. static struct dma_async_tx_descriptor *
  877. ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
  878. {
  879. int disks = sh->disks;
  880. int i;
  881. pr_debug("%s: stripe %llu\n", __func__,
  882. (unsigned long long)sh->sector);
  883. for (i = disks; i--; ) {
  884. struct r5dev *dev = &sh->dev[i];
  885. struct bio *chosen;
  886. if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
  887. struct bio *wbi;
  888. spin_lock(&sh->lock);
  889. chosen = dev->towrite;
  890. dev->towrite = NULL;
  891. BUG_ON(dev->written);
  892. wbi = dev->written = chosen;
  893. spin_unlock(&sh->lock);
  894. while (wbi && wbi->bi_sector <
  895. dev->sector + STRIPE_SECTORS) {
  896. if (wbi->bi_rw & REQ_FUA)
  897. set_bit(R5_WantFUA, &dev->flags);
  898. tx = async_copy_data(1, wbi, dev->page,
  899. dev->sector, tx);
  900. wbi = r5_next_bio(wbi, dev->sector);
  901. }
  902. }
  903. }
  904. return tx;
  905. }
  906. static void ops_complete_reconstruct(void *stripe_head_ref)
  907. {
  908. struct stripe_head *sh = stripe_head_ref;
  909. int disks = sh->disks;
  910. int pd_idx = sh->pd_idx;
  911. int qd_idx = sh->qd_idx;
  912. int i;
  913. bool fua = false;
  914. pr_debug("%s: stripe %llu\n", __func__,
  915. (unsigned long long)sh->sector);
  916. for (i = disks; i--; )
  917. fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
  918. for (i = disks; i--; ) {
  919. struct r5dev *dev = &sh->dev[i];
  920. if (dev->written || i == pd_idx || i == qd_idx) {
  921. set_bit(R5_UPTODATE, &dev->flags);
  922. if (fua)
  923. set_bit(R5_WantFUA, &dev->flags);
  924. }
  925. }
  926. if (sh->reconstruct_state == reconstruct_state_drain_run)
  927. sh->reconstruct_state = reconstruct_state_drain_result;
  928. else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
  929. sh->reconstruct_state = reconstruct_state_prexor_drain_result;
  930. else {
  931. BUG_ON(sh->reconstruct_state != reconstruct_state_run);
  932. sh->reconstruct_state = reconstruct_state_result;
  933. }
  934. set_bit(STRIPE_HANDLE, &sh->state);
  935. release_stripe(sh);
  936. }
  937. static void
  938. ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
  939. struct dma_async_tx_descriptor *tx)
  940. {
  941. int disks = sh->disks;
  942. struct page **xor_srcs = percpu->scribble;
  943. struct async_submit_ctl submit;
  944. int count = 0, pd_idx = sh->pd_idx, i;
  945. struct page *xor_dest;
  946. int prexor = 0;
  947. unsigned long flags;
  948. pr_debug("%s: stripe %llu\n", __func__,
  949. (unsigned long long)sh->sector);
  950. /* check if prexor is active which means only process blocks
  951. * that are part of a read-modify-write (written)
  952. */
  953. if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
  954. prexor = 1;
  955. xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
  956. for (i = disks; i--; ) {
  957. struct r5dev *dev = &sh->dev[i];
  958. if (dev->written)
  959. xor_srcs[count++] = dev->page;
  960. }
  961. } else {
  962. xor_dest = sh->dev[pd_idx].page;
  963. for (i = disks; i--; ) {
  964. struct r5dev *dev = &sh->dev[i];
  965. if (i != pd_idx)
  966. xor_srcs[count++] = dev->page;
  967. }
  968. }
  969. /* 1/ if we prexor'd then the dest is reused as a source
  970. * 2/ if we did not prexor then we are redoing the parity
  971. * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
  972. * for the synchronous xor case
  973. */
  974. flags = ASYNC_TX_ACK |
  975. (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
  976. atomic_inc(&sh->count);
  977. init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh,
  978. to_addr_conv(sh, percpu));
  979. if (unlikely(count == 1))
  980. tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
  981. else
  982. tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
  983. }
  984. static void
  985. ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
  986. struct dma_async_tx_descriptor *tx)
  987. {
  988. struct async_submit_ctl submit;
  989. struct page **blocks = percpu->scribble;
  990. int count;
  991. pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
  992. count = set_syndrome_sources(blocks, sh);
  993. atomic_inc(&sh->count);
  994. init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
  995. sh, to_addr_conv(sh, percpu));
  996. async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
  997. }
  998. static void ops_complete_check(void *stripe_head_ref)
  999. {
  1000. struct stripe_head *sh = stripe_head_ref;
  1001. pr_debug("%s: stripe %llu\n", __func__,
  1002. (unsigned long long)sh->sector);
  1003. sh->check_state = check_state_check_result;
  1004. set_bit(STRIPE_HANDLE, &sh->state);
  1005. release_stripe(sh);
  1006. }
  1007. static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
  1008. {
  1009. int disks = sh->disks;
  1010. int pd_idx = sh->pd_idx;
  1011. int qd_idx = sh->qd_idx;
  1012. struct page *xor_dest;
  1013. struct page **xor_srcs = percpu->scribble;
  1014. struct dma_async_tx_descriptor *tx;
  1015. struct async_submit_ctl submit;
  1016. int count;
  1017. int i;
  1018. pr_debug("%s: stripe %llu\n", __func__,
  1019. (unsigned long long)sh->sector);
  1020. count = 0;
  1021. xor_dest = sh->dev[pd_idx].page;
  1022. xor_srcs[count++] = xor_dest;
  1023. for (i = disks; i--; ) {
  1024. if (i == pd_idx || i == qd_idx)
  1025. continue;
  1026. xor_srcs[count++] = sh->dev[i].page;
  1027. }
  1028. init_async_submit(&submit, 0, NULL, NULL, NULL,
  1029. to_addr_conv(sh, percpu));
  1030. tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
  1031. &sh->ops.zero_sum_result, &submit);
  1032. atomic_inc(&sh->count);
  1033. init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL);
  1034. tx = async_trigger_callback(&submit);
  1035. }
  1036. static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
  1037. {
  1038. struct page **srcs = percpu->scribble;
  1039. struct async_submit_ctl submit;
  1040. int count;
  1041. pr_debug("%s: stripe %llu checkp: %d\n", __func__,
  1042. (unsigned long long)sh->sector, checkp);
  1043. count = set_syndrome_sources(srcs, sh);
  1044. if (!checkp)
  1045. srcs[count] = NULL;
  1046. atomic_inc(&sh->count);
  1047. init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
  1048. sh, to_addr_conv(sh, percpu));
  1049. async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
  1050. &sh->ops.zero_sum_result, percpu->spare_page, &submit);
  1051. }
  1052. static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
  1053. {
  1054. int overlap_clear = 0, i, disks = sh->disks;
  1055. struct dma_async_tx_descriptor *tx = NULL;
  1056. raid5_conf_t *conf = sh->raid_conf;
  1057. int level = conf->level;
  1058. struct raid5_percpu *percpu;
  1059. unsigned long cpu;
  1060. cpu = get_cpu();
  1061. percpu = per_cpu_ptr(conf->percpu, cpu);
  1062. if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
  1063. ops_run_biofill(sh);
  1064. overlap_clear++;
  1065. }
  1066. if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
  1067. if (level < 6)
  1068. tx = ops_run_compute5(sh, percpu);
  1069. else {
  1070. if (sh->ops.target2 < 0 || sh->ops.target < 0)
  1071. tx = ops_run_compute6_1(sh, percpu);
  1072. else
  1073. tx = ops_run_compute6_2(sh, percpu);
  1074. }
  1075. /* terminate the chain if reconstruct is not set to be run */
  1076. if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
  1077. async_tx_ack(tx);
  1078. }
  1079. if (test_bit(STRIPE_OP_PREXOR, &ops_request))
  1080. tx = ops_run_prexor(sh, percpu, tx);
  1081. if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
  1082. tx = ops_run_biodrain(sh, tx);
  1083. overlap_clear++;
  1084. }
  1085. if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
  1086. if (level < 6)
  1087. ops_run_reconstruct5(sh, percpu, tx);
  1088. else
  1089. ops_run_reconstruct6(sh, percpu, tx);
  1090. }
  1091. if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
  1092. if (sh->check_state == check_state_run)
  1093. ops_run_check_p(sh, percpu);
  1094. else if (sh->check_state == check_state_run_q)
  1095. ops_run_check_pq(sh, percpu, 0);
  1096. else if (sh->check_state == check_state_run_pq)
  1097. ops_run_check_pq(sh, percpu, 1);
  1098. else
  1099. BUG();
  1100. }
  1101. if (overlap_clear)
  1102. for (i = disks; i--; ) {
  1103. struct r5dev *dev = &sh->dev[i];
  1104. if (test_and_clear_bit(R5_Overlap, &dev->flags))
  1105. wake_up(&sh->raid_conf->wait_for_overlap);
  1106. }
  1107. put_cpu();
  1108. }
  1109. #ifdef CONFIG_MULTICORE_RAID456
  1110. static void async_run_ops(void *param, async_cookie_t cookie)
  1111. {
  1112. struct stripe_head *sh = param;
  1113. unsigned long ops_request = sh->ops.request;
  1114. clear_bit_unlock(STRIPE_OPS_REQ_PENDING, &sh->state);
  1115. wake_up(&sh->ops.wait_for_ops);
  1116. __raid_run_ops(sh, ops_request);
  1117. release_stripe(sh);
  1118. }
  1119. static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
  1120. {
  1121. /* since handle_stripe can be called outside of raid5d context
  1122. * we need to ensure sh->ops.request is de-staged before another
  1123. * request arrives
  1124. */
  1125. wait_event(sh->ops.wait_for_ops,
  1126. !test_and_set_bit_lock(STRIPE_OPS_REQ_PENDING, &sh->state));
  1127. sh->ops.request = ops_request;
  1128. atomic_inc(&sh->count);
  1129. async_schedule(async_run_ops, sh);
  1130. }
  1131. #else
  1132. #define raid_run_ops __raid_run_ops
  1133. #endif
  1134. static int grow_one_stripe(raid5_conf_t *conf)
  1135. {
  1136. struct stripe_head *sh;
  1137. sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL);
  1138. if (!sh)
  1139. return 0;
  1140. memset(sh, 0, sizeof(*sh) + (conf->pool_size-1)*sizeof(struct r5dev));
  1141. sh->raid_conf = conf;
  1142. spin_lock_init(&sh->lock);
  1143. #ifdef CONFIG_MULTICORE_RAID456
  1144. init_waitqueue_head(&sh->ops.wait_for_ops);
  1145. #endif
  1146. if (grow_buffers(sh)) {
  1147. shrink_buffers(sh);
  1148. kmem_cache_free(conf->slab_cache, sh);
  1149. return 0;
  1150. }
  1151. /* we just created an active stripe so... */
  1152. atomic_set(&sh->count, 1);
  1153. atomic_inc(&conf->active_stripes);
  1154. INIT_LIST_HEAD(&sh->lru);
  1155. release_stripe(sh);
  1156. return 1;
  1157. }
  1158. static int grow_stripes(raid5_conf_t *conf, int num)
  1159. {
  1160. struct kmem_cache *sc;
  1161. int devs = max(conf->raid_disks, conf->previous_raid_disks);
  1162. if (conf->mddev->gendisk)
  1163. sprintf(conf->cache_name[0],
  1164. "raid%d-%s", conf->level, mdname(conf->mddev));
  1165. else
  1166. sprintf(conf->cache_name[0],
  1167. "raid%d-%p", conf->level, conf->mddev);
  1168. sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]);
  1169. conf->active_name = 0;
  1170. sc = kmem_cache_create(conf->cache_name[conf->active_name],
  1171. sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
  1172. 0, 0, NULL);
  1173. if (!sc)
  1174. return 1;
  1175. conf->slab_cache = sc;
  1176. conf->pool_size = devs;
  1177. while (num--)
  1178. if (!grow_one_stripe(conf))
  1179. return 1;
  1180. return 0;
  1181. }
  1182. /**
  1183. * scribble_len - return the required size of the scribble region
  1184. * @num - total number of disks in the array
  1185. *
  1186. * The size must be enough to contain:
  1187. * 1/ a struct page pointer for each device in the array +2
  1188. * 2/ room to convert each entry in (1) to its corresponding dma
  1189. * (dma_map_page()) or page (page_address()) address.
  1190. *
  1191. * Note: the +2 is for the destination buffers of the ddf/raid6 case where we
  1192. * calculate over all devices (not just the data blocks), using zeros in place
  1193. * of the P and Q blocks.
  1194. */
  1195. static size_t scribble_len(int num)
  1196. {
  1197. size_t len;
  1198. len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);
  1199. return len;
  1200. }
  1201. static int resize_stripes(raid5_conf_t *conf, int newsize)
  1202. {
  1203. /* Make all the stripes able to hold 'newsize' devices.
  1204. * New slots in each stripe get 'page' set to a new page.
  1205. *
  1206. * This happens in stages:
  1207. * 1/ create a new kmem_cache and allocate the required number of
  1208. * stripe_heads.
  1209. * 2/ gather all the old stripe_heads and tranfer the pages across
  1210. * to the new stripe_heads. This will have the side effect of
  1211. * freezing the array as once all stripe_heads have been collected,
  1212. * no IO will be possible. Old stripe heads are freed once their
  1213. * pages have been transferred over, and the old kmem_cache is
  1214. * freed when all stripes are done.
  1215. * 3/ reallocate conf->disks to be suitable bigger. If this fails,
  1216. * we simple return a failre status - no need to clean anything up.
  1217. * 4/ allocate new pages for the new slots in the new stripe_heads.
  1218. * If this fails, we don't bother trying the shrink the
  1219. * stripe_heads down again, we just leave them as they are.
  1220. * As each stripe_head is processed the new one is released into
  1221. * active service.
  1222. *
  1223. * Once step2 is started, we cannot afford to wait for a write,
  1224. * so we use GFP_NOIO allocations.
  1225. */
  1226. struct stripe_head *osh, *nsh;
  1227. LIST_HEAD(newstripes);
  1228. struct disk_info *ndisks;
  1229. unsigned long cpu;
  1230. int err;
  1231. struct kmem_cache *sc;
  1232. int i;
  1233. if (newsize <= conf->pool_size)
  1234. return 0; /* never bother to shrink */
  1235. err = md_allow_write(conf->mddev);
  1236. if (err)
  1237. return err;
  1238. /* Step 1 */
  1239. sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
  1240. sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
  1241. 0, 0, NULL);
  1242. if (!sc)
  1243. return -ENOMEM;
  1244. for (i = conf->max_nr_stripes; i; i--) {
  1245. nsh = kmem_cache_alloc(sc, GFP_KERNEL);
  1246. if (!nsh)
  1247. break;
  1248. memset(nsh, 0, sizeof(*nsh) + (newsize-1)*sizeof(struct r5dev));
  1249. nsh->raid_conf = conf;
  1250. spin_lock_init(&nsh->lock);
  1251. #ifdef CONFIG_MULTICORE_RAID456
  1252. init_waitqueue_head(&nsh->ops.wait_for_ops);
  1253. #endif
  1254. list_add(&nsh->lru, &newstripes);
  1255. }
  1256. if (i) {
  1257. /* didn't get enough, give up */
  1258. while (!list_empty(&newstripes)) {
  1259. nsh = list_entry(newstripes.next, struct stripe_head, lru);
  1260. list_del(&nsh->lru);
  1261. kmem_cache_free(sc, nsh);
  1262. }
  1263. kmem_cache_destroy(sc);
  1264. return -ENOMEM;
  1265. }
  1266. /* Step 2 - Must use GFP_NOIO now.
  1267. * OK, we have enough stripes, start collecting inactive
  1268. * stripes and copying them over
  1269. */
  1270. list_for_each_entry(nsh, &newstripes, lru) {
  1271. spin_lock_irq(&conf->device_lock);
  1272. wait_event_lock_irq(conf->wait_for_stripe,
  1273. !list_empty(&conf->inactive_list),
  1274. conf->device_lock,
  1275. );
  1276. osh = get_free_stripe(conf);
  1277. spin_unlock_irq(&conf->device_lock);
  1278. atomic_set(&nsh->count, 1);
  1279. for(i=0; i<conf->pool_size; i++)
  1280. nsh->dev[i].page = osh->dev[i].page;
  1281. for( ; i<newsize; i++)
  1282. nsh->dev[i].page = NULL;
  1283. kmem_cache_free(conf->slab_cache, osh);
  1284. }
  1285. kmem_cache_destroy(conf->slab_cache);
  1286. /* Step 3.
  1287. * At this point, we are holding all the stripes so the array
  1288. * is completely stalled, so now is a good time to resize
  1289. * conf->disks and the scribble region
  1290. */
  1291. ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
  1292. if (ndisks) {
  1293. for (i=0; i<conf->raid_disks; i++)
  1294. ndisks[i] = conf->disks[i];
  1295. kfree(conf->disks);
  1296. conf->disks = ndisks;
  1297. } else
  1298. err = -ENOMEM;
  1299. get_online_cpus();
  1300. conf->scribble_len = scribble_len(newsize);
  1301. for_each_present_cpu(cpu) {
  1302. struct raid5_percpu *percpu;
  1303. void *scribble;
  1304. percpu = per_cpu_ptr(conf->percpu, cpu);
  1305. scribble = kmalloc(conf->scribble_len, GFP_NOIO);
  1306. if (scribble) {
  1307. kfree(percpu->scribble);
  1308. percpu->scribble = scribble;
  1309. } else {
  1310. err = -ENOMEM;
  1311. break;
  1312. }
  1313. }
  1314. put_online_cpus();
  1315. /* Step 4, return new stripes to service */
  1316. while(!list_empty(&newstripes)) {
  1317. nsh = list_entry(newstripes.next, struct stripe_head, lru);
  1318. list_del_init(&nsh->lru);
  1319. for (i=conf->raid_disks; i < newsize; i++)
  1320. if (nsh->dev[i].page == NULL) {
  1321. struct page *p = alloc_page(GFP_NOIO);
  1322. nsh->dev[i].page = p;
  1323. if (!p)
  1324. err = -ENOMEM;
  1325. }
  1326. release_stripe(nsh);
  1327. }
  1328. /* critical section pass, GFP_NOIO no longer needed */
  1329. conf->slab_cache = sc;
  1330. conf->active_name = 1-conf->active_name;
  1331. conf->pool_size = newsize;
  1332. return err;
  1333. }
  1334. static int drop_one_stripe(raid5_conf_t *conf)
  1335. {
  1336. struct stripe_head *sh;
  1337. spin_lock_irq(&conf->device_lock);
  1338. sh = get_free_stripe(conf);
  1339. spin_unlock_irq(&conf->device_lock);
  1340. if (!sh)
  1341. return 0;
  1342. BUG_ON(atomic_read(&sh->count));
  1343. shrink_buffers(sh);
  1344. kmem_cache_free(conf->slab_cache, sh);
  1345. atomic_dec(&conf->active_stripes);
  1346. return 1;
  1347. }
  1348. static void shrink_stripes(raid5_conf_t *conf)
  1349. {
  1350. while (drop_one_stripe(conf))
  1351. ;
  1352. if (conf->slab_cache)
  1353. kmem_cache_destroy(conf->slab_cache);
  1354. conf->slab_cache = NULL;
  1355. }
  1356. static void raid5_end_read_request(struct bio * bi, int error)
  1357. {
  1358. struct stripe_head *sh = bi->bi_private;
  1359. raid5_conf_t *conf = sh->raid_conf;
  1360. int disks = sh->disks, i;
  1361. int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
  1362. char b[BDEVNAME_SIZE];
  1363. mdk_rdev_t *rdev;
  1364. for (i=0 ; i<disks; i++)
  1365. if (bi == &sh->dev[i].req)
  1366. break;
  1367. pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n",
  1368. (unsigned long long)sh->sector, i, atomic_read(&sh->count),
  1369. uptodate);
  1370. if (i == disks) {
  1371. BUG();
  1372. return;
  1373. }
  1374. if (uptodate) {
  1375. set_bit(R5_UPTODATE, &sh->dev[i].flags);
  1376. if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
  1377. rdev = conf->disks[i].rdev;
  1378. printk_rl(KERN_INFO "md/raid:%s: read error corrected"
  1379. " (%lu sectors at %llu on %s)\n",
  1380. mdname(conf->mddev), STRIPE_SECTORS,
  1381. (unsigned long long)(sh->sector
  1382. + rdev->data_offset),
  1383. bdevname(rdev->bdev, b));
  1384. clear_bit(R5_ReadError, &sh->dev[i].flags);
  1385. clear_bit(R5_ReWrite, &sh->dev[i].flags);
  1386. }
  1387. if (atomic_read(&conf->disks[i].rdev->read_errors))
  1388. atomic_set(&conf->disks[i].rdev->read_errors, 0);
  1389. } else {
  1390. const char *bdn = bdevname(conf->disks[i].rdev->bdev, b);
  1391. int retry = 0;
  1392. rdev = conf->disks[i].rdev;
  1393. clear_bit(R5_UPTODATE, &sh->dev[i].flags);
  1394. atomic_inc(&rdev->read_errors);
  1395. if (conf->mddev->degraded >= conf->max_degraded)
  1396. printk_rl(KERN_WARNING
  1397. "md/raid:%s: read error not correctable "
  1398. "(sector %llu on %s).\n",
  1399. mdname(conf->mddev),
  1400. (unsigned long long)(sh->sector
  1401. + rdev->data_offset),
  1402. bdn);
  1403. else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
  1404. /* Oh, no!!! */
  1405. printk_rl(KERN_WARNING
  1406. "md/raid:%s: read error NOT corrected!! "
  1407. "(sector %llu on %s).\n",
  1408. mdname(conf->mddev),
  1409. (unsigned long long)(sh->sector
  1410. + rdev->data_offset),
  1411. bdn);
  1412. else if (atomic_read(&rdev->read_errors)
  1413. > conf->max_nr_stripes)
  1414. printk(KERN_WARNING
  1415. "md/raid:%s: Too many read errors, failing device %s.\n",
  1416. mdname(conf->mddev), bdn);
  1417. else
  1418. retry = 1;
  1419. if (retry)
  1420. set_bit(R5_ReadError, &sh->dev[i].flags);
  1421. else {
  1422. clear_bit(R5_ReadError, &sh->dev[i].flags);
  1423. clear_bit(R5_ReWrite, &sh->dev[i].flags);
  1424. md_error(conf->mddev, rdev);
  1425. }
  1426. }
  1427. rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
  1428. clear_bit(R5_LOCKED, &sh->dev[i].flags);
  1429. set_bit(STRIPE_HANDLE, &sh->state);
  1430. release_stripe(sh);
  1431. }
  1432. static void raid5_end_write_request(struct bio *bi, int error)
  1433. {
  1434. struct stripe_head *sh = bi->bi_private;
  1435. raid5_conf_t *conf = sh->raid_conf;
  1436. int disks = sh->disks, i;
  1437. int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
  1438. for (i=0 ; i<disks; i++)
  1439. if (bi == &sh->dev[i].req)
  1440. break;
  1441. pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n",
  1442. (unsigned long long)sh->sector, i, atomic_read(&sh->count),
  1443. uptodate);
  1444. if (i == disks) {
  1445. BUG();
  1446. return;
  1447. }
  1448. if (!uptodate)
  1449. md_error(conf->mddev, conf->disks[i].rdev);
  1450. rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
  1451. clear_bit(R5_LOCKED, &sh->dev[i].flags);
  1452. set_bit(STRIPE_HANDLE, &sh->state);
  1453. release_stripe(sh);
  1454. }
  1455. static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
  1456. static void raid5_build_block(struct stripe_head *sh, int i, int previous)
  1457. {
  1458. struct r5dev *dev = &sh->dev[i];
  1459. bio_init(&dev->req);
  1460. dev->req.bi_io_vec = &dev->vec;
  1461. dev->req.bi_vcnt++;
  1462. dev->req.bi_max_vecs++;
  1463. dev->vec.bv_page = dev->page;
  1464. dev->vec.bv_len = STRIPE_SIZE;
  1465. dev->vec.bv_offset = 0;
  1466. dev->req.bi_sector = sh->sector;
  1467. dev->req.bi_private = sh;
  1468. dev->flags = 0;
  1469. dev->sector = compute_blocknr(sh, i, previous);
  1470. }
  1471. static void error(mddev_t *mddev, mdk_rdev_t *rdev)
  1472. {
  1473. char b[BDEVNAME_SIZE];
  1474. raid5_conf_t *conf = mddev->private;
  1475. pr_debug("raid456: error called\n");
  1476. if (test_and_clear_bit(In_sync, &rdev->flags)) {
  1477. unsigned long flags;
  1478. spin_lock_irqsave(&conf->device_lock, flags);
  1479. mddev->degraded++;
  1480. spin_unlock_irqrestore(&conf->device_lock, flags);
  1481. /*
  1482. * if recovery was running, make sure it aborts.
  1483. */
  1484. set_bit(MD_RECOVERY_INTR, &mddev->recovery);
  1485. }
  1486. set_bit(Faulty, &rdev->flags);
  1487. set_bit(MD_CHANGE_DEVS, &mddev->flags);
  1488. printk(KERN_ALERT
  1489. "md/raid:%s: Disk failure on %s, disabling device.\n"
  1490. "md/raid:%s: Operation continuing on %d devices.\n",
  1491. mdname(mddev),
  1492. bdevname(rdev->bdev, b),
  1493. mdname(mddev),
  1494. conf->raid_disks - mddev->degraded);
  1495. }
  1496. /*
  1497. * Input: a 'big' sector number,
  1498. * Output: index of the data and parity disk, and the sector # in them.
  1499. */
  1500. static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
  1501. int previous, int *dd_idx,
  1502. struct stripe_head *sh)
  1503. {
  1504. sector_t stripe, stripe2;
  1505. sector_t chunk_number;
  1506. unsigned int chunk_offset;
  1507. int pd_idx, qd_idx;
  1508. int ddf_layout = 0;
  1509. sector_t new_sector;
  1510. int algorithm = previous ? conf->prev_algo
  1511. : conf->algorithm;
  1512. int sectors_per_chunk = previous ? conf->prev_chunk_sectors
  1513. : conf->chunk_sectors;
  1514. int raid_disks = previous ? conf->previous_raid_disks
  1515. : conf->raid_disks;
  1516. int data_disks = raid_disks - conf->max_degraded;
  1517. /* First compute the information on this sector */
  1518. /*
  1519. * Compute the chunk number and the sector offset inside the chunk
  1520. */
  1521. chunk_offset = sector_div(r_sector, sectors_per_chunk);
  1522. chunk_number = r_sector;
  1523. /*
  1524. * Compute the stripe number
  1525. */
  1526. stripe = chunk_number;
  1527. *dd_idx = sector_div(stripe, data_disks);
  1528. stripe2 = stripe;
  1529. /*
  1530. * Select the parity disk based on the user selected algorithm.
  1531. */
  1532. pd_idx = qd_idx = ~0;
  1533. switch(conf->level) {
  1534. case 4:
  1535. pd_idx = data_disks;
  1536. break;
  1537. case 5:
  1538. switch (algorithm) {
  1539. case ALGORITHM_LEFT_ASYMMETRIC:
  1540. pd_idx = data_disks - sector_div(stripe2, raid_disks);
  1541. if (*dd_idx >= pd_idx)
  1542. (*dd_idx)++;
  1543. break;
  1544. case ALGORITHM_RIGHT_ASYMMETRIC:
  1545. pd_idx = sector_div(stripe2, raid_disks);
  1546. if (*dd_idx >= pd_idx)
  1547. (*dd_idx)++;
  1548. break;
  1549. case ALGORITHM_LEFT_SYMMETRIC:
  1550. pd_idx = data_disks - sector_div(stripe2, raid_disks);
  1551. *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
  1552. break;
  1553. case ALGORITHM_RIGHT_SYMMETRIC:
  1554. pd_idx = sector_div(stripe2, raid_disks);
  1555. *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
  1556. break;
  1557. case ALGORITHM_PARITY_0:
  1558. pd_idx = 0;
  1559. (*dd_idx)++;
  1560. break;
  1561. case ALGORITHM_PARITY_N:
  1562. pd_idx = data_disks;
  1563. break;
  1564. default:
  1565. BUG();
  1566. }
  1567. break;
  1568. case 6:
  1569. switch (algorithm) {
  1570. case ALGORITHM_LEFT_ASYMMETRIC:
  1571. pd_idx = raid_disks - 1 - sector_div(stripe2, raid_disks);
  1572. qd_idx = pd_idx + 1;
  1573. if (pd_idx == raid_disks-1) {
  1574. (*dd_idx)++; /* Q D D D P */
  1575. qd_idx = 0;
  1576. } else if (*dd_idx >= pd_idx)
  1577. (*dd_idx) += 2; /* D D P Q D */
  1578. break;
  1579. case ALGORITHM_RIGHT_ASYMMETRIC:
  1580. pd_idx = sector_div(stripe2, raid_disks);
  1581. qd_idx = pd_idx + 1;
  1582. if (pd_idx == raid_disks-1) {
  1583. (*dd_idx)++; /* Q D D D P */
  1584. qd_idx = 0;
  1585. } else if (*dd_idx >= pd_idx)
  1586. (*dd_idx) += 2; /* D D P Q D */
  1587. break;
  1588. case ALGORITHM_LEFT_SYMMETRIC:
  1589. pd_