
/fs/btrfs/volumes.c

https://bitbucket.org/slukk/jb-tsm-kernel-4.2
C | 3718 lines | 2949 code | 513 blank | 256 comment | 540 complexity
Possible License(s): GPL-2.0, LGPL-2.0, AGPL-1.0
  1. /*
  2. * Copyright (C) 2007 Oracle. All rights reserved.
  3. *
  4. * This program is free software; you can redistribute it and/or
  5. * modify it under the terms of the GNU General Public
  6. * License v2 as published by the Free Software Foundation.
  7. *
  8. * This program is distributed in the hope that it will be useful,
  9. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  11. * General Public License for more details.
  12. *
  13. * You should have received a copy of the GNU General Public
  14. * License along with this program; if not, write to the
  15. * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  16. * Boston, MA 021110-1307, USA.
  17. */
  18. #include <linux/sched.h>
  19. #include <linux/bio.h>
  20. #include <linux/slab.h>
  21. #include <linux/buffer_head.h>
  22. #include <linux/blkdev.h>
  23. #include <linux/random.h>
  24. #include <linux/iocontext.h>
  25. #include <linux/capability.h>
  26. #include <asm/div64.h>
  27. #include "compat.h"
  28. #include "ctree.h"
  29. #include "extent_map.h"
  30. #include "disk-io.h"
  31. #include "transaction.h"
  32. #include "print-tree.h"
  33. #include "volumes.h"
  34. #include "async-thread.h"
  35. static int init_first_rw_device(struct btrfs_trans_handle *trans,
  36. struct btrfs_root *root,
  37. struct btrfs_device *device);
  38. static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
  39. static DEFINE_MUTEX(uuid_mutex);
  40. static LIST_HEAD(fs_uuids);
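/*
 * lock_chunks/unlock_chunks: take and release fs_info->chunk_mutex, which
 * serializes chunk allocation and removal for this filesystem.
 */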
  41. static void lock_chunks(struct btrfs_root *root)
  42. {
  43. mutex_lock(&root->fs_info->chunk_mutex);
  44. }
  45. static void unlock_chunks(struct btrfs_root *root)
  46. {
  47. mutex_unlock(&root->fs_info->chunk_mutex);
  48. }
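/*
 * free every btrfs_device on the list and then the fs_devices struct
 * itself; the caller must make sure the devices are no longer opened.
 */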
  49. static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
  50. {
  51. struct btrfs_device *device;
  52. WARN_ON(fs_devices->opened);
  53. while (!list_empty(&fs_devices->devices)) {
  54. device = list_entry(fs_devices->devices.next,
  55. struct btrfs_device, dev_list);
  56. list_del(&device->dev_list);
  57. kfree(device->name);
  58. kfree(device);
  59. }
  60. kfree(fs_devices);
  61. }
  62. int btrfs_cleanup_fs_uuids(void)
  63. {
  64. struct btrfs_fs_devices *fs_devices;
  65. while (!list_empty(&fs_uuids)) {
  66. fs_devices = list_entry(fs_uuids.next,
  67. struct btrfs_fs_devices, list);
  68. list_del(&fs_devices->list);
  69. free_fs_devices(fs_devices);
  70. }
  71. return 0;
  72. }
  73. static noinline struct btrfs_device *__find_device(struct list_head *head,
  74. u64 devid, u8 *uuid)
  75. {
  76. struct btrfs_device *dev;
  77. list_for_each_entry(dev, head, dev_list) {
  78. if (dev->devid == devid &&
  79. (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
  80. return dev;
  81. }
  82. }
  83. return NULL;
  84. }
  85. static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
  86. {
  87. struct btrfs_fs_devices *fs_devices;
  88. list_for_each_entry(fs_devices, &fs_uuids, list) {
  89. if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
  90. return fs_devices;
  91. }
  92. return NULL;
  93. }
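/* splice a chain of bios (head..tail) back onto the front of a pending list */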
  94. static void requeue_list(struct btrfs_pending_bios *pending_bios,
  95. struct bio *head, struct bio *tail)
  96. {
  97. struct bio *old_head;
  98. old_head = pending_bios->head;
  99. pending_bios->head = head;
  100. if (pending_bios->tail)
  101. tail->bi_next = old_head;
  102. else
  103. pending_bios->tail = tail;
  104. }
  105. /*
  106. * we try to collect pending bios for a device so we don't get a large
  107. * number of procs sending bios down to the same device. This greatly
  108. * improves the scheduler's ability to collect and merge the bios.
  109. *
  110. * But, it also turns into a long list of bios to process and that is sure
  111. * to eventually make the worker thread block. The solution here is to
  112. * make some progress and then put this work struct back at the end of
  113. * the list if the block device is congested. This way, multiple devices
  114. * can make progress from a single worker thread.
  115. */
  116. static noinline int run_scheduled_bios(struct btrfs_device *device)
  117. {
  118. struct bio *pending;
  119. struct backing_dev_info *bdi;
  120. struct btrfs_fs_info *fs_info;
  121. struct btrfs_pending_bios *pending_bios;
  122. struct bio *tail;
  123. struct bio *cur;
  124. int again = 0;
  125. unsigned long num_run;
  126. unsigned long batch_run = 0;
  127. unsigned long limit;
  128. unsigned long last_waited = 0;
  129. int force_reg = 0;
  130. struct blk_plug plug;
  131. /*
  132. * this function runs all the bios we've collected for
  133. * a particular device. We don't want to wander off to
  134. * another device without first sending all of these down.
  135. * So, set up a plug here and finish it off before we return
  136. */
  137. blk_start_plug(&plug);
  138. bdi = blk_get_backing_dev_info(device->bdev);
  139. fs_info = device->dev_root->fs_info;
  140. limit = btrfs_async_submit_limit(fs_info);
  141. limit = limit * 2 / 3;
  142. loop:
  143. spin_lock(&device->io_lock);
  144. loop_lock:
  145. num_run = 0;
  146. /* take all the bios off the list at once and process them
  147. * later on (without the lock held). But, remember the
  148. * tail and other pointers so the bios can be properly reinserted
  149. * into the list if we hit congestion
  150. */
  151. if (!force_reg && device->pending_sync_bios.head) {
  152. pending_bios = &device->pending_sync_bios;
  153. force_reg = 1;
  154. } else {
  155. pending_bios = &device->pending_bios;
  156. force_reg = 0;
  157. }
  158. pending = pending_bios->head;
  159. tail = pending_bios->tail;
  160. WARN_ON(pending && !tail);
  161. /*
  162. * if pending was null this time around, no bios need processing
  163. * at all and we can stop. Otherwise it'll loop back up again
  164. * and do an additional check so no bios are missed.
  165. *
  166. * device->running_pending is used to synchronize with the
  167. * schedule_bio code.
  168. */
  169. if (device->pending_sync_bios.head == NULL &&
  170. device->pending_bios.head == NULL) {
  171. again = 0;
  172. device->running_pending = 0;
  173. } else {
  174. again = 1;
  175. device->running_pending = 1;
  176. }
  177. pending_bios->head = NULL;
  178. pending_bios->tail = NULL;
  179. spin_unlock(&device->io_lock);
  180. while (pending) {
  181. rmb();
  182. /* we want to work on both lists, but do more bios on the
  183. * sync list than the regular list
  184. */
  185. if ((num_run > 32 &&
  186. pending_bios != &device->pending_sync_bios &&
  187. device->pending_sync_bios.head) ||
  188. (num_run > 64 && pending_bios == &device->pending_sync_bios &&
  189. device->pending_bios.head)) {
  190. spin_lock(&device->io_lock);
  191. requeue_list(pending_bios, pending, tail);
  192. goto loop_lock;
  193. }
  194. cur = pending;
  195. pending = pending->bi_next;
  196. cur->bi_next = NULL;
  197. atomic_dec(&fs_info->nr_async_bios);
  198. if (atomic_read(&fs_info->nr_async_bios) < limit &&
  199. waitqueue_active(&fs_info->async_submit_wait))
  200. wake_up(&fs_info->async_submit_wait);
  201. BUG_ON(atomic_read(&cur->bi_cnt) == 0);
  202. submit_bio(cur->bi_rw, cur);
  203. num_run++;
  204. batch_run++;
  205. if (need_resched())
  206. cond_resched();
  207. /*
  208. * we made progress, there is more work to do and the bdi
  209. * is now congested. Back off and let other work structs
  210. * run instead
  211. */
  212. if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
  213. fs_info->fs_devices->open_devices > 1) {
  214. struct io_context *ioc;
  215. ioc = current->io_context;
  216. /*
  217. * the main goal here is that we don't want to
  218. * block if we're going to be able to submit
  219. * more requests without blocking.
  220. *
  221. * This code does two great things, it pokes into
  222. * the elevator code from a filesystem _and_
  223. * it makes assumptions about how batching works.
  224. */
  225. if (ioc && ioc->nr_batch_requests > 0 &&
  226. time_before(jiffies, ioc->last_waited + HZ/50UL) &&
  227. (last_waited == 0 ||
  228. ioc->last_waited == last_waited)) {
  229. /*
  230. * we want to go through our batch of
  231. * requests and stop. So, we copy out
  232. * the ioc->last_waited time and test
  233. * against it before looping
  234. */
  235. last_waited = ioc->last_waited;
  236. if (need_resched())
  237. cond_resched();
  238. continue;
  239. }
  240. spin_lock(&device->io_lock);
  241. requeue_list(pending_bios, pending, tail);
  242. device->running_pending = 1;
  243. spin_unlock(&device->io_lock);
  244. btrfs_requeue_work(&device->work);
  245. goto done;
  246. }
  247. }
  248. cond_resched();
  249. if (again)
  250. goto loop;
  251. spin_lock(&device->io_lock);
  252. if (device->pending_bios.head || device->pending_sync_bios.head)
  253. goto loop_lock;
  254. spin_unlock(&device->io_lock);
  255. done:
  256. blk_finish_plug(&plug);
  257. return 0;
  258. }
  259. static void pending_bios_fn(struct btrfs_work *work)
  260. {
  261. struct btrfs_device *device;
  262. device = container_of(work, struct btrfs_device, work);
  263. run_scheduled_bios(device);
  264. }
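/*
 * device_list_add: record a scanned device in the in-memory list. A new
 * btrfs_fs_devices is allocated for the first device seen for a given
 * fsid; otherwise the existing entry is updated (new path, missing flag
 * cleared) and the latest devid/generation are tracked.
 */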
  265. static noinline int device_list_add(const char *path,
  266. struct btrfs_super_block *disk_super,
  267. u64 devid, struct btrfs_fs_devices **fs_devices_ret)
  268. {
  269. struct btrfs_device *device;
  270. struct btrfs_fs_devices *fs_devices;
  271. u64 found_transid = btrfs_super_generation(disk_super);
  272. char *name;
  273. fs_devices = find_fsid(disk_super->fsid);
  274. if (!fs_devices) {
  275. fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
  276. if (!fs_devices)
  277. return -ENOMEM;
  278. INIT_LIST_HEAD(&fs_devices->devices);
  279. INIT_LIST_HEAD(&fs_devices->alloc_list);
  280. list_add(&fs_devices->list, &fs_uuids);
  281. memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
  282. fs_devices->latest_devid = devid;
  283. fs_devices->latest_trans = found_transid;
  284. mutex_init(&fs_devices->device_list_mutex);
  285. device = NULL;
  286. } else {
  287. device = __find_device(&fs_devices->devices, devid,
  288. disk_super->dev_item.uuid);
  289. }
  290. if (!device) {
  291. if (fs_devices->opened)
  292. return -EBUSY;
  293. device = kzalloc(sizeof(*device), GFP_NOFS);
  294. if (!device) {
  295. /* we can safely leave the fs_devices entry around */
  296. return -ENOMEM;
  297. }
  298. device->devid = devid;
  299. device->work.func = pending_bios_fn;
  300. memcpy(device->uuid, disk_super->dev_item.uuid,
  301. BTRFS_UUID_SIZE);
  302. spin_lock_init(&device->io_lock);
  303. device->name = kstrdup(path, GFP_NOFS);
  304. if (!device->name) {
  305. kfree(device);
  306. return -ENOMEM;
  307. }
  308. INIT_LIST_HEAD(&device->dev_alloc_list);
  309. mutex_lock(&fs_devices->device_list_mutex);
  310. list_add_rcu(&device->dev_list, &fs_devices->devices);
  311. mutex_unlock(&fs_devices->device_list_mutex);
  312. device->fs_devices = fs_devices;
  313. fs_devices->num_devices++;
  314. } else if (!device->name || strcmp(device->name, path)) {
  315. name = kstrdup(path, GFP_NOFS);
  316. if (!name)
  317. return -ENOMEM;
  318. kfree(device->name);
  319. device->name = name;
  320. if (device->missing) {
  321. fs_devices->missing_devices--;
  322. device->missing = 0;
  323. }
  324. }
  325. if (found_transid > fs_devices->latest_trans) {
  326. fs_devices->latest_devid = devid;
  327. fs_devices->latest_trans = found_transid;
  328. }
  329. *fs_devices_ret = fs_devices;
  330. return 0;
  331. }
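/*
 * clone_fs_devices: allocate a new btrfs_fs_devices and copy each device's
 * devid, uuid and name into it, without opening anything.
 */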
  332. static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
  333. {
  334. struct btrfs_fs_devices *fs_devices;
  335. struct btrfs_device *device;
  336. struct btrfs_device *orig_dev;
  337. fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
  338. if (!fs_devices)
  339. return ERR_PTR(-ENOMEM);
  340. INIT_LIST_HEAD(&fs_devices->devices);
  341. INIT_LIST_HEAD(&fs_devices->alloc_list);
  342. INIT_LIST_HEAD(&fs_devices->list);
  343. mutex_init(&fs_devices->device_list_mutex);
  344. fs_devices->latest_devid = orig->latest_devid;
  345. fs_devices->latest_trans = orig->latest_trans;
  346. memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));
  347. /* We hold the volume lock, so it is safe to copy the devices. */
  348. list_for_each_entry(orig_dev, &orig->devices, dev_list) {
  349. device = kzalloc(sizeof(*device), GFP_NOFS);
  350. if (!device)
  351. goto error;
  352. device->name = kstrdup(orig_dev->name, GFP_NOFS);
  353. if (!device->name) {
  354. kfree(device);
  355. goto error;
  356. }
  357. device->devid = orig_dev->devid;
  358. device->work.func = pending_bios_fn;
  359. memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
  360. spin_lock_init(&device->io_lock);
  361. INIT_LIST_HEAD(&device->dev_list);
  362. INIT_LIST_HEAD(&device->dev_alloc_list);
  363. list_add(&device->dev_list, &fs_devices->devices);
  364. device->fs_devices = fs_devices;
  365. fs_devices->num_devices++;
  366. }
  367. return fs_devices;
  368. error:
  369. free_fs_devices(fs_devices);
  370. return ERR_PTR(-ENOMEM);
  371. }
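/*
 * drop any device that is not referenced by the fs metadata: close its
 * block device and remove it from the device and alloc lists, repeating
 * for any seed filesystems as well.
 */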
  372. int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
  373. {
  374. struct btrfs_device *device, *next;
  375. mutex_lock(&uuid_mutex);
  376. again:
  377. /* This is the initialization path, so it is safe to release the devices. */
  378. list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
  379. if (device->in_fs_metadata)
  380. continue;
  381. if (device->bdev) {
  382. blkdev_put(device->bdev, device->mode);
  383. device->bdev = NULL;
  384. fs_devices->open_devices--;
  385. }
  386. if (device->writeable) {
  387. list_del_init(&device->dev_alloc_list);
  388. device->writeable = 0;
  389. fs_devices->rw_devices--;
  390. }
  391. list_del_init(&device->dev_list);
  392. fs_devices->num_devices--;
  393. kfree(device->name);
  394. kfree(device);
  395. }
  396. if (fs_devices->seed) {
  397. fs_devices = fs_devices->seed;
  398. goto again;
  399. }
  400. mutex_unlock(&uuid_mutex);
  401. return 0;
  402. }
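/*
 * freeing a btrfs_device is done in two steps: free_device() is the RCU
 * callback, and it only schedules __free_device() on a workqueue because
 * blkdev_put() may sleep, which is not allowed in RCU callback context.
 */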
  403. static void __free_device(struct work_struct *work)
  404. {
  405. struct btrfs_device *device;
  406. device = container_of(work, struct btrfs_device, rcu_work);
  407. if (device->bdev)
  408. blkdev_put(device->bdev, device->mode);
  409. kfree(device->name);
  410. kfree(device);
  411. }
  412. static void free_device(struct rcu_head *head)
  413. {
  414. struct btrfs_device *device;
  415. device = container_of(head, struct btrfs_device, rcu);
  416. INIT_WORK(&device->rcu_work, __free_device);
  417. schedule_work(&device->rcu_work);
  418. }
  419. static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
  420. {
  421. struct btrfs_device *device;
  422. if (--fs_devices->opened > 0)
  423. return 0;
  424. mutex_lock(&fs_devices->device_list_mutex);
  425. list_for_each_entry(device, &fs_devices->devices, dev_list) {
  426. struct btrfs_device *new_device;
  427. if (device->bdev)
  428. fs_devices->open_devices--;
  429. if (device->writeable) {
  430. list_del_init(&device->dev_alloc_list);
  431. fs_devices->rw_devices--;
  432. }
  433. if (device->can_discard)
  434. fs_devices->num_can_discard--;
  435. new_device = kmalloc(sizeof(*new_device), GFP_NOFS);
  436. BUG_ON(!new_device);
  437. memcpy(new_device, device, sizeof(*new_device));
  438. new_device->name = kstrdup(device->name, GFP_NOFS);
  439. BUG_ON(device->name && !new_device->name);
  440. new_device->bdev = NULL;
  441. new_device->writeable = 0;
  442. new_device->in_fs_metadata = 0;
  443. new_device->can_discard = 0;
  444. list_replace_rcu(&device->dev_list, &new_device->dev_list);
  445. call_rcu(&device->rcu, free_device);
  446. }
  447. mutex_unlock(&fs_devices->device_list_mutex);
  448. WARN_ON(fs_devices->open_devices);
  449. WARN_ON(fs_devices->rw_devices);
  450. fs_devices->opened = 0;
  451. fs_devices->seeding = 0;
  452. return 0;
  453. }
  454. int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
  455. {
  456. struct btrfs_fs_devices *seed_devices = NULL;
  457. int ret;
  458. mutex_lock(&uuid_mutex);
  459. ret = __btrfs_close_devices(fs_devices);
  460. if (!fs_devices->opened) {
  461. seed_devices = fs_devices->seed;
  462. fs_devices->seed = NULL;
  463. }
  464. mutex_unlock(&uuid_mutex);
  465. while (seed_devices) {
  466. fs_devices = seed_devices;
  467. seed_devices = fs_devices->seed;
  468. __btrfs_close_devices(fs_devices);
  469. free_fs_devices(fs_devices);
  470. }
  471. return ret;
  472. }
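/*
 * open every device on the list, validate its superblock against the
 * expected devid/uuid, remember the device with the newest generation as
 * latest_bdev, and mark the fs as seeding only if all members are seeds.
 */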
  473. static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
  474. fmode_t flags, void *holder)
  475. {
  476. struct request_queue *q;
  477. struct block_device *bdev;
  478. struct list_head *head = &fs_devices->devices;
  479. struct btrfs_device *device;
  480. struct block_device *latest_bdev = NULL;
  481. struct buffer_head *bh;
  482. struct btrfs_super_block *disk_super;
  483. u64 latest_devid = 0;
  484. u64 latest_transid = 0;
  485. u64 devid;
  486. int seeding = 1;
  487. int ret = 0;
  488. flags |= FMODE_EXCL;
  489. list_for_each_entry(device, head, dev_list) {
  490. if (device->bdev)
  491. continue;
  492. if (!device->name)
  493. continue;
  494. bdev = blkdev_get_by_path(device->name, flags, holder);
  495. if (IS_ERR(bdev)) {
  496. printk(KERN_INFO "open %s failed\n", device->name);
  497. goto error;
  498. }
  499. set_blocksize(bdev, 4096);
  500. bh = btrfs_read_dev_super(bdev);
  501. if (!bh) {
  502. ret = -EINVAL;
  503. goto error_close;
  504. }
  505. disk_super = (struct btrfs_super_block *)bh->b_data;
  506. devid = btrfs_stack_device_id(&disk_super->dev_item);
  507. if (devid != device->devid)
  508. goto error_brelse;
  509. if (memcmp(device->uuid, disk_super->dev_item.uuid,
  510. BTRFS_UUID_SIZE))
  511. goto error_brelse;
  512. device->generation = btrfs_super_generation(disk_super);
  513. if (!latest_transid || device->generation > latest_transid) {
  514. latest_devid = devid;
  515. latest_transid = device->generation;
  516. latest_bdev = bdev;
  517. }
  518. if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
  519. device->writeable = 0;
  520. } else {
  521. device->writeable = !bdev_read_only(bdev);
  522. seeding = 0;
  523. }
  524. q = bdev_get_queue(bdev);
  525. if (blk_queue_discard(q)) {
  526. device->can_discard = 1;
  527. fs_devices->num_can_discard++;
  528. }
  529. device->bdev = bdev;
  530. device->in_fs_metadata = 0;
  531. device->mode = flags;
  532. if (!blk_queue_nonrot(bdev_get_queue(bdev)))
  533. fs_devices->rotating = 1;
  534. fs_devices->open_devices++;
  535. if (device->writeable) {
  536. fs_devices->rw_devices++;
  537. list_add(&device->dev_alloc_list,
  538. &fs_devices->alloc_list);
  539. }
  540. brelse(bh);
  541. continue;
  542. error_brelse:
  543. brelse(bh);
  544. error_close:
  545. blkdev_put(bdev, flags);
  546. error:
  547. continue;
  548. }
  549. if (fs_devices->open_devices == 0) {
  550. ret = -EIO;
  551. goto out;
  552. }
  553. fs_devices->seeding = seeding;
  554. fs_devices->opened = 1;
  555. fs_devices->latest_bdev = latest_bdev;
  556. fs_devices->latest_devid = latest_devid;
  557. fs_devices->latest_trans = latest_transid;
  558. fs_devices->total_rw_bytes = 0;
  559. out:
  560. return ret;
  561. }
  562. int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
  563. fmode_t flags, void *holder)
  564. {
  565. int ret;
  566. mutex_lock(&uuid_mutex);
  567. if (fs_devices->opened) {
  568. fs_devices->opened++;
  569. ret = 0;
  570. } else {
  571. ret = __btrfs_open_devices(fs_devices, flags, holder);
  572. }
  573. mutex_unlock(&uuid_mutex);
  574. return ret;
  575. }
  576. int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
  577. struct btrfs_fs_devices **fs_devices_ret)
  578. {
  579. struct btrfs_super_block *disk_super;
  580. struct block_device *bdev;
  581. struct buffer_head *bh;
  582. int ret;
  583. u64 devid;
  584. u64 transid;
  585. mutex_lock(&uuid_mutex);
  586. flags |= FMODE_EXCL;
  587. bdev = blkdev_get_by_path(path, flags, holder);
  588. if (IS_ERR(bdev)) {
  589. ret = PTR_ERR(bdev);
  590. goto error;
  591. }
  592. ret = set_blocksize(bdev, 4096);
  593. if (ret)
  594. goto error_close;
  595. bh = btrfs_read_dev_super(bdev);
  596. if (!bh) {
  597. ret = -EINVAL;
  598. goto error_close;
  599. }
  600. disk_super = (struct btrfs_super_block *)bh->b_data;
  601. devid = btrfs_stack_device_id(&disk_super->dev_item);
  602. transid = btrfs_super_generation(disk_super);
  603. if (disk_super->label[0])
  604. printk(KERN_INFO "device label %s ", disk_super->label);
  605. else
  606. printk(KERN_INFO "device fsid %pU ", disk_super->fsid);
  607. printk(KERN_CONT "devid %llu transid %llu %s\n",
  608. (unsigned long long)devid, (unsigned long long)transid, path);
  609. ret = device_list_add(path, disk_super, devid, fs_devices_ret);
  610. brelse(bh);
  611. error_close:
  612. blkdev_put(bdev, flags);
  613. error:
  614. mutex_unlock(&uuid_mutex);
  615. return ret;
  616. }
  617. /* helper to account the used device space in the range */
  618. int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
  619. u64 end, u64 *length)
  620. {
  621. struct btrfs_key key;
  622. struct btrfs_root *root = device->dev_root;
  623. struct btrfs_dev_extent *dev_extent;
  624. struct btrfs_path *path;
  625. u64 extent_end;
  626. int ret;
  627. int slot;
  628. struct extent_buffer *l;
  629. *length = 0;
  630. if (start >= device->total_bytes)
  631. return 0;
  632. path = btrfs_alloc_path();
  633. if (!path)
  634. return -ENOMEM;
  635. path->reada = 2;
  636. key.objectid = device->devid;
  637. key.offset = start;
  638. key.type = BTRFS_DEV_EXTENT_KEY;
  639. ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
  640. if (ret < 0)
  641. goto out;
  642. if (ret > 0) {
  643. ret = btrfs_previous_item(root, path, key.objectid, key.type);
  644. if (ret < 0)
  645. goto out;
  646. }
  647. while (1) {
  648. l = path->nodes[0];
  649. slot = path->slots[0];
  650. if (slot >= btrfs_header_nritems(l)) {
  651. ret = btrfs_next_leaf(root, path);
  652. if (ret == 0)
  653. continue;
  654. if (ret < 0)
  655. goto out;
  656. break;
  657. }
  658. btrfs_item_key_to_cpu(l, &key, slot);
  659. if (key.objectid < device->devid)
  660. goto next;
  661. if (key.objectid > device->devid)
  662. break;
  663. if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
  664. goto next;
  665. dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
  666. extent_end = key.offset + btrfs_dev_extent_length(l,
  667. dev_extent);
  668. if (key.offset <= start && extent_end > end) {
  669. *length = end - start + 1;
  670. break;
  671. } else if (key.offset <= start && extent_end > start)
  672. *length += extent_end - start;
  673. else if (key.offset > start && extent_end <= end)
  674. *length += extent_end - key.offset;
  675. else if (key.offset > start && key.offset <= end) {
  676. *length += end - key.offset + 1;
  677. break;
  678. } else if (key.offset > end)
  679. break;
  680. next:
  681. path->slots[0]++;
  682. }
  683. ret = 0;
  684. out:
  685. btrfs_free_path(path);
  686. return ret;
  687. }
  688. /*
  689. * find_free_dev_extent - find free space in the specified device
  690. * @trans: transaction handle
  691. * @device: the device which we search the free space in
  692. * @num_bytes: the size of the free space that we need
  693. * @start: store the start of the free space.
  694. * @len: the size of the free space that we find, or the size of the max
  695. * free space if we don't find suitable free space
  696. *
  697. * this uses a pretty simple search, the expectation is that it is
  698. * called very infrequently and that a given device has a small number
  699. * of extents
  700. *
  701. * @start is used to store the start of the free space if we find it. But if we
  702. * don't find suitable free space, it will be used to store the start position
  703. * of the max free space.
  704. *
  705. * @len is used to store the size of the free space that we find.
  706. * But if we don't find suitable free space, it is used to store the size of
  707. * the max free space.
  708. */
  709. int find_free_dev_extent(struct btrfs_trans_handle *trans,
  710. struct btrfs_device *device, u64 num_bytes,
  711. u64 *start, u64 *len)
  712. {
  713. struct btrfs_key key;
  714. struct btrfs_root *root = device->dev_root;
  715. struct btrfs_dev_extent *dev_extent;
  716. struct btrfs_path *path;
  717. u64 hole_size;
  718. u64 max_hole_start;
  719. u64 max_hole_size;
  720. u64 extent_end;
  721. u64 search_start;
  722. u64 search_end = device->total_bytes;
  723. int ret;
  724. int slot;
  725. struct extent_buffer *l;
  726. /* FIXME use last free of some kind */
  727. /* we don't want to overwrite the superblock on the drive,
  728. * so we make sure to start at an offset of at least 1MB
  729. */
  730. search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
  731. max_hole_start = search_start;
  732. max_hole_size = 0;
  733. if (search_start >= search_end) {
  734. ret = -ENOSPC;
  735. goto error;
  736. }
  737. path = btrfs_alloc_path();
  738. if (!path) {
  739. ret = -ENOMEM;
  740. goto error;
  741. }
  742. path->reada = 2;
  743. key.objectid = device->devid;
  744. key.offset = search_start;
  745. key.type = BTRFS_DEV_EXTENT_KEY;
  746. ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
  747. if (ret < 0)
  748. goto out;
  749. if (ret > 0) {
  750. ret = btrfs_previous_item(root, path, key.objectid, key.type);
  751. if (ret < 0)
  752. goto out;
  753. }
  754. while (1) {
  755. l = path->nodes[0];
  756. slot = path->slots[0];
  757. if (slot >= btrfs_header_nritems(l)) {
  758. ret = btrfs_next_leaf(root, path);
  759. if (ret == 0)
  760. continue;
  761. if (ret < 0)
  762. goto out;
  763. break;
  764. }
  765. btrfs_item_key_to_cpu(l, &key, slot);
  766. if (key.objectid < device->devid)
  767. goto next;
  768. if (key.objectid > device->devid)
  769. break;
  770. if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
  771. goto next;
  772. if (key.offset > search_start) {
  773. hole_size = key.offset - search_start;
  774. if (hole_size > max_hole_size) {
  775. max_hole_start = search_start;
  776. max_hole_size = hole_size;
  777. }
  778. /*
  779. * If this free space is greater than what we need,
  780. * it must be the max free space that we have found
  781. * until now, so max_hole_start must point to the start
  782. * of this free space and the length of this free space
  783. * is stored in max_hole_size. Thus, we return
  784. * max_hole_start and max_hole_size and go back to the
  785. * caller.
  786. */
  787. if (hole_size >= num_bytes) {
  788. ret = 0;
  789. goto out;
  790. }
  791. }
  792. dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
  793. extent_end = key.offset + btrfs_dev_extent_length(l,
  794. dev_extent);
  795. if (extent_end > search_start)
  796. search_start = extent_end;
  797. next:
  798. path->slots[0]++;
  799. cond_resched();
  800. }
  801. hole_size = search_end - search_start;
  802. if (hole_size > max_hole_size) {
  803. max_hole_start = search_start;
  804. max_hole_size = hole_size;
  805. }
  806. /* See above. */
  807. if (hole_size < num_bytes)
  808. ret = -ENOSPC;
  809. else
  810. ret = 0;
  811. out:
  812. btrfs_free_path(path);
  813. error:
  814. *start = max_hole_start;
  815. if (len)
  816. *len = max_hole_size;
  817. return ret;
  818. }
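/*
 * remove the dev extent at (or covering) @start on @device and subtract
 * its length from device->bytes_used.
 */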
  819. static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
  820. struct btrfs_device *device,
  821. u64 start)
  822. {
  823. int ret;
  824. struct btrfs_path *path;
  825. struct btrfs_root *root = device->dev_root;
  826. struct btrfs_key key;
  827. struct btrfs_key found_key;
  828. struct extent_buffer *leaf = NULL;
  829. struct btrfs_dev_extent *extent = NULL;
  830. path = btrfs_alloc_path();
  831. if (!path)
  832. return -ENOMEM;
  833. key.objectid = device->devid;
  834. key.offset = start;
  835. key.type = BTRFS_DEV_EXTENT_KEY;
  836. ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
  837. if (ret > 0) {
  838. ret = btrfs_previous_item(root, path, key.objectid,
  839. BTRFS_DEV_EXTENT_KEY);
  840. if (ret)
  841. goto out;
  842. leaf = path->nodes[0];
  843. btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
  844. extent = btrfs_item_ptr(leaf, path->slots[0],
  845. struct btrfs_dev_extent);
  846. BUG_ON(found_key.offset > start || found_key.offset +
  847. btrfs_dev_extent_length(leaf, extent) < start);
  848. } else if (ret == 0) {
  849. leaf = path->nodes[0];
  850. extent = btrfs_item_ptr(leaf, path->slots[0],
  851. struct btrfs_dev_extent);
  852. }
  853. BUG_ON(ret);
  854. if (device->bytes_used > 0)
  855. device->bytes_used -= btrfs_dev_extent_length(leaf, extent);
  856. ret = btrfs_del_item(trans, root, path);
  857. out:
  858. btrfs_free_path(path);
  859. return ret;
  860. }
  861. int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
  862. struct btrfs_device *device,
  863. u64 chunk_tree, u64 chunk_objectid,
  864. u64 chunk_offset, u64 start, u64 num_bytes)
  865. {
  866. int ret;
  867. struct btrfs_path *path;
  868. struct btrfs_root *root = device->dev_root;
  869. struct btrfs_dev_extent *extent;
  870. struct extent_buffer *leaf;
  871. struct btrfs_key key;
  872. WARN_ON(!device->in_fs_metadata);
  873. path = btrfs_alloc_path();
  874. if (!path)
  875. return -ENOMEM;
  876. key.objectid = device->devid;
  877. key.offset = start;
  878. key.type = BTRFS_DEV_EXTENT_KEY;
  879. ret = btrfs_insert_empty_item(trans, root, path, &key,
  880. sizeof(*extent));
  881. BUG_ON(ret);
  882. leaf = path->nodes[0];
  883. extent = btrfs_item_ptr(leaf, path->slots[0],
  884. struct btrfs_dev_extent);
  885. btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
  886. btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
  887. btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
  888. write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
  889. (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent),
  890. BTRFS_UUID_SIZE);
  891. btrfs_set_dev_extent_length(leaf, extent, num_bytes);
  892. btrfs_mark_buffer_dirty(leaf);
  893. btrfs_free_path(path);
  894. return ret;
  895. }
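/*
 * find the logical offset just past the last chunk owned by @objectid, so
 * the caller knows where the next chunk may be placed (0 if none exist).
 */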
  896. static noinline int find_next_chunk(struct btrfs_root *root,
  897. u64 objectid, u64 *offset)
  898. {
  899. struct btrfs_path *path;
  900. int ret;
  901. struct btrfs_key key;
  902. struct btrfs_chunk *chunk;
  903. struct btrfs_key found_key;
  904. path = btrfs_alloc_path();
  905. BUG_ON(!path);
  906. key.objectid = objectid;
  907. key.offset = (u64)-1;
  908. key.type = BTRFS_CHUNK_ITEM_KEY;
  909. ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
  910. if (ret < 0)
  911. goto error;
  912. BUG_ON(ret == 0);
  913. ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
  914. if (ret) {
  915. *offset = 0;
  916. } else {
  917. btrfs_item_key_to_cpu(path->nodes[0], &found_key,
  918. path->slots[0]);
  919. if (found_key.objectid != objectid)
  920. *offset = 0;
  921. else {
  922. chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
  923. struct btrfs_chunk);
  924. *offset = found_key.offset +
  925. btrfs_chunk_length(path->nodes[0], chunk);
  926. }
  927. }
  928. ret = 0;
  929. error:
  930. btrfs_free_path(path);
  931. return ret;
  932. }
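/* return the next unused device id: one past the last DEV_ITEM, or 1 */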
  933. static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid)
  934. {
  935. int ret;
  936. struct btrfs_key key;
  937. struct btrfs_key found_key;
  938. struct btrfs_path *path;
  939. root = root->fs_info->chunk_root;
  940. path = btrfs_alloc_path();
  941. if (!path)
  942. return -ENOMEM;
  943. key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
  944. key.type = BTRFS_DEV_ITEM_KEY;
  945. key.offset = (u64)-1;
  946. ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
  947. if (ret < 0)
  948. goto error;
  949. BUG_ON(ret == 0);
  950. ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID,
  951. BTRFS_DEV_ITEM_KEY);
  952. if (ret) {
  953. *objectid = 1;
  954. } else {
  955. btrfs_item_key_to_cpu(path->nodes[0], &found_key,
  956. path->slots[0]);
  957. *objectid = found_key.offset + 1;
  958. }
  959. ret = 0;
  960. error:
  961. btrfs_free_path(path);
  962. return ret;
  963. }
  964. /*
  965. * the device information is stored in the chunk root;
  966. * the btrfs_device struct should be fully filled in
  967. */
  968. int btrfs_add_device(struct btrfs_trans_handle *trans,
  969. struct btrfs_root *root,
  970. struct btrfs_device *device)
  971. {
  972. int ret;
  973. struct btrfs_path *path;
  974. struct btrfs_dev_item *dev_item;
  975. struct extent_buffer *leaf;
  976. struct btrfs_key key;
  977. unsigned long ptr;
  978. root = root->fs_info->chunk_root;
  979. path = btrfs_alloc_path();
  980. if (!path)
  981. return -ENOMEM;
  982. key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
  983. key.type = BTRFS_DEV_ITEM_KEY;
  984. key.offset = device->devid;
  985. ret = btrfs_insert_empty_item(trans, root, path, &key,
  986. sizeof(*dev_item));
  987. if (ret)
  988. goto out;
  989. leaf = path->nodes[0];
  990. dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
  991. btrfs_set_device_id(leaf, dev_item, device->devid);
  992. btrfs_set_device_generation(leaf, dev_item, 0);
  993. btrfs_set_device_type(leaf, dev_item, device->type);
  994. btrfs_set_device_io_align(leaf, dev_item, device->io_align);
  995. btrfs_set_device_io_width(leaf, dev_item, device->io_width);
  996. btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
  997. btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
  998. btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
  999. btrfs_set_device_group(leaf, dev_item, 0);
  1000. btrfs_set_device_seek_speed(leaf, dev_item, 0);
  1001. btrfs_set_device_bandwidth(leaf, dev_item, 0);
  1002. btrfs_set_device_start_offset(leaf, dev_item, 0);
  1003. ptr = (unsigned long)btrfs_device_uuid(dev_item);
  1004. write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
  1005. ptr = (unsigned long)btrfs_device_fsid(dev_item);
  1006. write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE);
  1007. btrfs_mark_buffer_dirty(leaf);
  1008. ret = 0;
  1009. out:
  1010. btrfs_free_path(path);
  1011. return ret;
  1012. }
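/*
 * delete the DEV_ITEM for @device from the chunk tree inside its own
 * transaction, with the chunk mutex held.
 */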
  1013. static int btrfs_rm_dev_item(struct btrfs_root *root,
  1014. struct btrfs_device *device)
  1015. {
  1016. int ret;
  1017. struct btrfs_path *path;
  1018. struct btrfs_key key;
  1019. struct btrfs_trans_handle *trans;
  1020. root = root->fs_info->chunk_root;
  1021. path = btrfs_alloc_path();
  1022. if (!path)
  1023. return -ENOMEM;
  1024. trans = btrfs_start_transaction(root, 0);
  1025. if (IS_ERR(trans)) {
  1026. btrfs_free_path(path);
  1027. return PTR_ERR(trans);
  1028. }
  1029. key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
  1030. key.type = BTRFS_DEV_ITEM_KEY;
  1031. key.offset = device->devid;
  1032. lock_chunks(root);
  1033. ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
  1034. if (ret < 0)
  1035. goto out;
  1036. if (ret > 0) {
  1037. ret = -ENOENT;
  1038. goto out;
  1039. }
  1040. ret = btrfs_del_item(trans, root, path);
  1041. if (ret)
  1042. goto out;
  1043. out:
  1044. btrfs_free_path(path);
  1045. unlock_chunks(root);
  1046. btrfs_commit_transaction(trans, root);
  1047. return ret;
  1048. }
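/*
 * remove a device from a mounted filesystem. The path may be the literal
 * string "missing" to pick a device that is in the metadata but has no
 * block device. The device is shrunk to zero, its DEV_ITEM deleted, its
 * in-memory state torn down and its on-disk superblock magic wiped.
 */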
  1049. int btrfs_rm_device(struct btrfs_root *root, char *device_path)
  1050. {
  1051. struct btrfs_device *device;
  1052. struct btrfs_device *next_device;
  1053. struct block_device *bdev;
  1054. struct buffer_head *bh = NULL;
  1055. struct btrfs_super_block *disk_super;
  1056. struct btrfs_fs_devices *cur_devices;
  1057. u64 all_avail;
  1058. u64 devid;
  1059. u64 num_devices;
  1060. u8 *dev_uuid;
  1061. int ret = 0;
  1062. bool clear_super = false;
  1063. mutex_lock(&uuid_mutex);
  1064. mutex_lock(&root->fs_info->volume_mutex);
  1065. all_avail = root->fs_info->avail_data_alloc_bits |
  1066. root->fs_info->avail_system_alloc_bits |
  1067. root->fs_info->avail_metadata_alloc_bits;
  1068. if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
  1069. root->fs_info->fs_devices->num_devices <= 4) {
  1070. printk(KERN_ERR "btrfs: unable to go below four devices "
  1071. "on raid10\n");
  1072. ret = -EINVAL;
  1073. goto out;
  1074. }
  1075. if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
  1076. root->fs_info->fs_devices->num_devices <= 2) {
  1077. printk(KERN_ERR "btrfs: unable to go below two "
  1078. "devices on raid1\n");
  1079. ret = -EINVAL;
  1080. goto out;
  1081. }
  1082. if (strcmp(device_path, "missing") == 0) {
  1083. struct list_head *devices;
  1084. struct btrfs_device *tmp;
  1085. device = NULL;
  1086. devices = &root->fs_info->fs_devices->devices;
  1087. /*
  1088. * It is safe to read the devices since the volume_mutex
  1089. * is held.
  1090. */
  1091. list_for_each_entry(tmp, devices, dev_list) {
  1092. if (tmp->in_fs_metadata && !tmp->bdev) {
  1093. device = tmp;
  1094. break;
  1095. }
  1096. }
  1097. bdev = NULL;
  1098. bh = NULL;
  1099. disk_super = NULL;
  1100. if (!device) {
  1101. printk(KERN_ERR "btrfs: no missing devices found to "
  1102. "remove\n");
  1103. goto out;
  1104. }
  1105. } else {
  1106. bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL,
  1107. root->fs_info->bdev_holder);
  1108. if (IS_ERR(bdev)) {
  1109. ret = PTR_ERR(bdev);
  1110. goto out;
  1111. }
  1112. set_blocksize(bdev, 4096);
  1113. bh = btrfs_read_dev_super(bdev);
  1114. if (!bh) {
  1115. ret = -EINVAL;
  1116. goto error_close;
  1117. }
  1118. disk_super = (struct btrfs_super_block *)bh->b_data;
  1119. devid = btrfs_stack_device_id(&disk_super->dev_item);
  1120. dev_uuid = disk_super->dev_item.uuid;
  1121. device = btrfs_find_device(root, devid, dev_uuid,
  1122. disk_super->fsid);
  1123. if (!device) {
  1124. ret = -ENOENT;
  1125. goto error_brelse;
  1126. }
  1127. }
  1128. if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
  1129. printk(KERN_ERR "btrfs: unable to remove the only writeable "
  1130. "device\n");
  1131. ret = -EINVAL;
  1132. goto error_brelse;
  1133. }
  1134. if (device->writeable) {
  1135. lock_chunks(root);
  1136. list_del_init(&device->dev_alloc_list);
  1137. unlock_chunks(root);
  1138. root->fs_info->fs_devices->rw_devices--;
  1139. clear_super = true;
  1140. }
  1141. ret = btrfs_shrink_device(device, 0);
  1142. if (ret)
  1143. goto error_undo;
  1144. ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
  1145. if (ret)
  1146. goto error_undo;
  1147. device->in_fs_metadata = 0;
  1148. btrfs_scrub_cancel_dev(root, device);
  1149. /*
  1150. * the device list mutex makes sure that we don't change
  1151. * the device list while someone else is writing out all
  1152. * the device supers.
  1153. */
  1154. cur_devices = device->fs_devices;
  1155. mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
  1156. list_del_rcu(&device->dev_list);
  1157. device->fs_devices->num_devices--;
  1158. if (device->missing)
  1159. root->fs_info->fs_devices->missing_devices--;
  1160. next_device = list_entry(root->fs_info->fs_devices->devices.next,
  1161. struct btrfs_device, dev_list);
  1162. if (device->bdev == root->fs_info->sb->s_bdev)
  1163. root->fs_info->sb->s_bdev = next_device->bdev;
  1164. if (device->bdev == root->fs_info->fs_devices->latest_bdev)
  1165. root->fs_info->fs_devices->latest_bdev = next_device->bdev;
  1166. if (device->bdev)
  1167. device->fs_devices->open_devices--;
  1168. call_rcu(&device->rcu, free_device);
  1169. mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
  1170. num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
  1171. btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices);
  1172. if (cur_devices->open_devices == 0) {
  1173. struct btrfs_fs_devices *fs_devices;
  1174. fs_devices = root->fs_info->fs_devices;
  1175. while (fs_devices) {
  1176. if (fs_devices->seed == cur_devices)
  1177. break;
  1178. fs_devices = fs_devices->seed;
  1179. }
  1180. fs_devices->seed = cur_devices->seed;
  1181. cur_devices->seed = NULL;
  1182. lock_chunks(root);
  1183. __btrfs_close_devices(cur_devices);
  1184. unlock_chunks(root);
  1185. free_fs_devices(cur_devices);
  1186. }
  1187. /*
  1188. * at this point, the device is zero sized. We want to
  1189. * remove it from the devices list and zero out the old super
  1190. */
  1191. if (clear_super) {
  1192. /* make sure this device isn't detected as part of
  1193. * the FS anymore
  1194. */
  1195. memset(&disk_super->magic, 0, sizeof(disk_super->magic));
  1196. set_buffer_dirty(bh);
  1197. sync_dirty_buffer(bh);
  1198. }
  1199. ret = 0;
  1200. error_brelse:
  1201. brelse(bh);
  1202. error_close:
  1203. if (bdev)
  1204. blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
  1205. out:
  1206. mutex_unlock(&root->fs_info->volume_mutex);
  1207. mutex_unlock(&uuid_mutex);
  1208. return ret;
  1209. error_undo:
  1210. if (device->writeable) {
  1211. lock_chunks(root);
  1212. list_add(&device->dev_alloc_list,
  1213. &root->fs_info->fs_devices->alloc_list);
  1214. unlock_chunks(root);
  1215. root->fs_info->fs_devices->rw_devices++;
  1216. }
  1217. goto error_brelse;
  1218. }
  1219. /*
  1220. * does all the dirty work required for changing the file system's UUID.
  1221. */
  1222. static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
  1223. struct btrfs_root *root)
  1224. {
  1225. struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
  1226. struct btrfs_fs_devices *old_devices;
  1227. struct btrfs_fs_devices *seed_devices;
  1228. struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
  1229. struct btrfs_device *device;
  1230. u64 super_flags;
  1231. BUG_ON(!mutex_is_locked(&uuid_mutex));
  1232. if (!fs_devices->seeding)
  1233. return -EINVAL;
  1234. seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
  1235. if (!seed_devices)
  1236. return -ENOMEM;
  1237. old_devices = clone_fs_devices(fs_devices);
  1238. if (IS_ERR(old_devices)) {
  1239. kfree(seed_devices);
  1240. return PTR_ERR(old_devices);
  1241. }
  1242. list_add(&old_devices->list, &fs_uuids);
  1243. memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
  1244. seed_devices->opened = 1;
  1245. INIT_LIST_HEAD(&seed_devices->devices);
  1246. INIT_LIST_HEAD(&seed_devices->alloc_list);
  1247. mutex_init(&seed_devices->device_list_mutex);
  1248. mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
  1249. list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
  1250. synchronize_rcu);
  1251. mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
  1252. list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
  1253. list_for_each_entry(device, &seed_devices->devices, dev_list) {
  1254. device->fs_devices = seed_devices;
  1255. }
  1256. fs_devices->seeding = 0;
  1257. fs_devices->num_devices = 0;
  1258. fs_devices->open_devices = 0;
  1259. fs_devices->seed = seed_devices;
  1260. generate_random_uuid(fs_devices->fsid);
  1261. memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
  1262. memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
  1263. super_flags = btrfs_super_flags(disk_super) &
  1264. ~BTRFS_SUPER_FLAG_SEEDING;
  1265. btrfs_set_super_flags(disk_super, super_flags);
  1266. return 0;
  1267. }
  1268. /*
  1269. * store the expected generation for seed devices in device items.
  1270. */
  1271. static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
  1272. struct btrfs_root *root)
  1273. {
  1274. struct btrfs_path *path;
  1275. struct extent_buffer *leaf;
  1276. struct btrfs_dev_item *dev_item;
  1277. struct btrfs_device *device;
  1278. struct btrfs_key key;
  1279. u8 fs_uuid[BTRFS_UUID_SIZE];
  1280. u8 dev_uuid[BTRFS_UUID_SIZE];
  1281. u64 devid;
  1282. int ret;
  1283. path = btrfs_alloc_path();
  1284. if (!path)
  1285. return -ENOMEM;
  1286. root = root->fs_info->chunk_root;
  1287. key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
  1288. key.offset = 0;
  1289. key.type = BTRFS_DEV_ITEM_KEY;
  1290. while (1) {
  1291. ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
  1292. if (ret < 0)
  1293. goto error;
  1294. leaf = path->nodes[0];
  1295. next_slot:
  1296. if (path->slots[0] >= btrfs_header_nritems(leaf)) {
  1297. ret = btrfs_next_leaf(root, path);
  1298. if (ret > 0)
  1299. break;
  1300. if (ret < 0)
  1301. goto error;
  1302. leaf = path->nodes[0];
  1303. btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
  1304. btrfs_release_path(path);
  1305. continue;
  1306. }
  1307. btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
  1308. if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
  1309. key.type != BTRFS_DEV_ITEM_KEY)
  1310. break;
  1311. dev_item = btrfs_item_ptr(leaf, path->slots[0],
  1312. struct btrfs_dev_item);
  1313. devid = btrfs_device_id(leaf, dev_item);
  1314. read_extent_buffer(leaf, dev_uuid,
  1315. (unsigned long)btrfs_device_uuid(dev_item),
  1316. BTRFS_UUID_SIZE);
  1317. read_extent_buffer(leaf, fs_uuid,
  1318. (unsigned long)btrfs_device_fsid(dev_item),
  1319. BTRFS_UUID_SIZE);
  1320. device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
  1321. BUG_ON(!device);
  1322. if (device->fs_devices->seeding) {
  1323. btrfs_set_device_generation(leaf, dev_item,
  1324. device->generation);
  1325. btrfs_mark_buffer_dirty(leaf);
  1326. }
  1327. path->slots[0]++;
  1328. goto next_slot;
  1329. }
  1330. ret = 0;
  1331. error:
  1332. btrfs_free_path(path);
  1333. return ret;
  1334. }
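/*
 * add a new device to a mounted filesystem. If the fs is currently a
 * seed, a new fsid is sprouted first (btrfs_prepare_sprout) and the
 * SYSTEM chunks are relocated once the transaction commits.
 */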
  1335. int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
  1336. {
  1337. struct request_queue *q;
  1338. struct btrfs_trans_handle *trans;
  1339. struct btrfs_device *device;
  1340. struct block_device *bdev;
  1341. struct list_head *devices;
  1342. struct super_block *sb = root->fs_info->sb;
  1343. u64 total_bytes;
  1344. int seeding_dev = 0;
  1345. int ret = 0;
  1346. if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
  1347. return -EINVAL;
  1348. bdev = blkdev_get_by_path(device_path, FMODE_EXCL,
  1349. root->fs_info->bdev_holder);
  1350. if (IS_ERR(bdev))
  1351. return PTR_ERR(bdev);
  1352. if (root->fs_info->fs_devices->seeding) {
  1353. seeding_dev = 1;
  1354. down_write(&sb->s_umount);
  1355. mutex_lock(&uuid_mutex);
  1356. }
  1357. filemap_write_and_wait(bdev->bd_inode->i_mapping);
  1358. mutex_lock(&root->fs_info->volume_mutex);
  1359. devices = &root->fs_info->fs_devices->devices;
  1360. /*
  1361. * we have the volume lock, so we don't need the extra
  1362. * device list mutex while reading the list here.
  1363. */
  1364. list_for_each_entry(device, devices, dev_list) {
  1365. if (device->bdev == bdev) {
  1366. ret = -EEXIST;
  1367. goto error;
  1368. }
  1369. }
  1370. device = kzalloc(sizeof(*device), GFP_NOFS);
  1371. if (!device) {
  1372. /* we can safely leave the fs_devices entry around */
  1373. ret = -ENOMEM;
  1374. goto error;
  1375. }
  1376. device->name = kstrdup(device_path, GFP_NOFS);
  1377. if (!device->name) {
  1378. kfree(device);
  1379. ret = -ENOMEM;
  1380. goto error;
  1381. }
  1382. ret = find_next_devid(root, &device->devid);
  1383. if (ret) {
  1384. kfree(device->name);
  1385. kfree(device);
  1386. goto error;
  1387. }
  1388. trans = btrfs_start_transaction(root, 0);
  1389. if (IS_ERR(trans)) {
  1390. kfree(device->name);
  1391. kfree(device);
  1392. ret = PTR_ERR(trans);
  1393. goto error;
  1394. }
  1395. lock_chunks(root);
  1396. q = bdev_get_queue(bdev);
  1397. if (blk_queue_discard(q))
  1398. device->can_discard = 1;
  1399. device->writeable = 1;
  1400. device->work.func = pending_bios_fn;
  1401. generate_random_uuid(device->uuid);
  1402. spin_lock_init(&device->io_lock);
  1403. device->generation = trans->transid;
  1404. device->io_width = root->sectorsize;
  1405. device->io_align = root->sectorsize;
  1406. device->sector_size = root->sectorsize;
  1407. device->total_bytes = i_size_read(bdev->bd_inode);
  1408. device->disk_total_bytes = device->total_bytes;
  1409. device->dev_root = root->fs_info->dev_root;
  1410. device->bdev = bdev;
  1411. device->in_fs_metadata = 1;
  1412. device->mode = FMODE_EXCL;
  1413. set_blocksize(device->bdev, 4096);
  1414. if (seeding_dev) {
  1415. sb->s_flags &= ~MS_RDONLY;
  1416. ret = btrfs_prepare_sprout(trans, root);
  1417. BUG_ON(ret);
  1418. }
  1419. device->fs_devices = root->fs_info->fs_devices;
  1420. /*
  1421. * we don't want write_supers to jump in here with our device
  1422. * half setup
  1423. */
  1424. mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
  1425. list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices);
  1426. list_add(&device->dev_alloc_list,
  1427. &root->fs_info->fs_devices->alloc_list);
  1428. root->fs_info->fs_devices->num_devices++;
  1429. root->fs_info->fs_devices->open_devices++;
  1430. root->fs_info->fs_devices->rw_devices++;
  1431. if (device->can_discard)
  1432. root->fs_info->fs_devices->num_can_discard++;
  1433. root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
  1434. if (!blk_queue_nonrot(bdev_get_queue(bdev)))
  1435. root->fs_info->fs_devices->rotating = 1;
  1436. total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
  1437. btrfs_set_super_total_bytes(&root->fs_info->super_copy,
  1438. total_bytes + device->total_bytes);
  1439. total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
  1440. btrfs_set_super_num_devices(&root->fs_info->super_copy,
  1441. total_bytes + 1);
  1442. mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
  1443. if (seeding_dev) {
  1444. ret = init_first_rw_device(trans, root, device);
  1445. BUG_ON(ret);
  1446. ret = btrfs_finish_sprout(trans, root);
  1447. BUG_ON(ret);
  1448. } else {
  1449. ret = btrfs_add_device(trans, root, device);
  1450. }
  1451. /*
  1452. * we've got more storage, clear any full flags on the space
  1453. * infos
  1454. */
  1455. btrfs_clear_space_info_full(root->fs_info);
  1456. unlock_chunks(root);
  1457. btrfs_commit_transaction(trans, root);
  1458. if (seeding_dev) {
  1459. mutex_unlock(&uuid_mutex);
  1460. up_write(&sb->s_umount);
  1461. ret = btrfs_relocate_sys_chunks(root);
  1462. BUG_ON(ret);
  1463. }
  1464. out:
  1465. mutex_unlock(&root->fs_info->volume_mutex);
  1466. return ret;
  1467. error:
  1468. blkdev_put(bdev, FMODE_EXCL);
  1469. if (seeding_dev) {
  1470. mutex_unlock(&uuid_mutex);
  1471. up_write(&sb->s_umount);
  1472. }
  1473. goto out;
  1474. }
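/*
 * write the in-memory fields of @device (type, alignment, sizes, bytes
 * used) back into its DEV_ITEM in the chunk tree.
 */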
  1475. static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
  1476. struct btrfs_device *device)
  1477. {
  1478. int ret;
  1479. struct btrfs_path *path;
  1480. struct btrfs_root *root;
  1481. struct btrfs_dev_item *dev_item;
  1482. struct extent_buffer *leaf;
  1483. struct btrfs_key key;
  1484. root = device->dev_root->fs_info->chunk_root;
  1485. path = btrfs_alloc_path();
  1486. if (!path)
  1487. return -ENOMEM;
  1488. key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
  1489. key.type = BTRFS_DEV_ITEM_KEY;
  1490. key.offset = device->devid;
  1491. ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
  1492. if (ret < 0)
  1493. goto out;
  1494. if (ret > 0) {
  1495. ret = -ENOENT;
  1496. goto out;
  1497. }
  1498. leaf = path->nodes[0];
  1499. dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
  1500. btrfs_set_device_id(leaf, dev_item, device->devid);
  1501. btrfs_set_device_type(leaf, dev_item, device->type);
  1502. btrfs_set_device_io_align(leaf, dev_item, device->io_align);
  1503. btrfs_set_device_io_width(leaf, dev_item, device->io_width);
  1504. btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
  1505. btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes);
  1506. btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
  1507. btrfs_mark_buffer_dirty(leaf);
  1508. out:
  1509. btrfs_free_path(path);
  1510. return ret;
  1511. }
  1512. static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
  1513. struct btrfs_device *device, u64 new_size)
  1514. {
  1515. struct btrfs_super_block *super_copy =
  1516. &device->dev_root->fs_info->super_copy;
  1517. u64 old_total = btrfs_super_total_bytes(super_copy);
  1518. u64 diff = new_size - device->total_bytes;
  1519. if (!device->writeable)
  1520. return -EACCES;
  1521. if (new_size <= device->total_bytes)
  1522. return -EINVAL;
  1523. btrfs_set_super_total_bytes(super_copy, old_total + diff);
  1524. device->fs_devices->total_rw_bytes += diff;
  1525. device->total_bytes = new_size;
  1526. device->disk_total_bytes = new_size;
  1527. btrfs_clear_space_info_full(device->dev_root->fs_info);
  1528. return btrfs_update_device(trans, device);
  1529. }
  1530. int btrfs_grow_device(struct btrfs_trans_handle *trans,
  1531. struct btrfs_device *device, u64 new_size)
  1532. {
  1533. int ret;
  1534. lock_chunks(device->dev_root);
  1535. ret = __btrfs_grow_device(trans, device, new_size);
  1536. unlock_chunks(device->dev_root);
  1537. return ret;
  1538. }
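/* delete the CHUNK_ITEM at (chunk_objectid, chunk_offset) from the chunk tree */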
  1539. static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
  1540. struct btrfs_root *root,
  1541. u64 chunk_tree, u64 chunk_objectid,
  1542. u64 chunk_offset)
  1543. {
  1544. int ret;
  1545. struct btrfs_path *path;
  1546. struct btrfs_key key;
  1547. root = root->fs_info->chunk_root;
  1548. path = btrfs_alloc_path();
  1549. if (!path)
  1550. return -ENOMEM;
  1551. key.objectid = chunk_objectid;
  1552. key.offset = chunk_offset;
  1553. key.type = BTRFS_CHUNK_ITEM_KEY;
  1554. ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
  1555. BUG_ON(ret);
  1556. ret = btrfs_del_item(trans, root, path);
  1557. btrfs_free_path(path);
  1558. return ret;
  1559. }
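/*
 * remove the matching entry from the superblock's sys_chunk_array by
 * shifting the rest of the array down over it.
 */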
  1560. static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
  1561. chunk_offset)
  1562. {
  1563. struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
  1564. struct btrfs_disk_key *disk_key;
  1565. struct btrfs_chunk *chunk;
  1566. u8 *ptr;
  1567. int ret = 0;
  1568. u32 num_stripes;
  1569. u32 array_size;
  1570. u32 len = 0;
  1571. u32 cur;
  1572. struct btrfs_key key;
  1573. array_size = btrfs_super_sys_array_size(super_copy);
  1574. ptr = super_copy->sys_chunk_array;
  1575. cur = 0;
  1576. while (cur < array_size) {
  1577. disk_key = (struct btrfs_disk_key *)ptr;
  1578. btrfs_disk_key_to_cpu(&key, disk_key);
  1579. len = sizeof(*disk_key);
  1580. if (key.type == BTRFS_CHUNK_ITEM_KEY) {
  1581. chunk = (struct btrfs_chunk *)(ptr + len);
  1582. num_stripes = btrfs_stack_chunk_num_stripes(chunk);
  1583. len += btrfs_chunk_item_size(num_stripes);
  1584. } else {
  1585. ret = -EIO;
  1586. break;
  1587. }
  1588. if (key.objectid == chunk_objectid &&
  1589. key.offset == chunk_offset) {
  1590. memmove(ptr, ptr + len, array_size - (cur + len));
  1591. array_size -= len;
  1592. btrfs_set_super_sys_array_size(super_copy, array_size);
  1593. } else {
  1594. ptr += len;
  1595. cur += len;
  1596. }
  1597. }
  1598. return ret;
  1599. }
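/*
 * relocate everything out of a chunk and then delete it: the dev extents,
 * the chunk item (and sys_chunk_array entry for SYSTEM chunks), the block
 * group and finally the extent mapping.
 */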
  1600. static int btrfs_relocate_chunk(struct btrfs_root *root,
  1601. u64 chunk_tree, u64 chunk_objectid,
  1602. u64 chunk_offset)
  1603. {
  1604. struct extent_map_tree *em_tree;
  1605. struct btrfs_root *extent_root;
  1606. struct btrfs_trans_handle *trans;
  1607. struct extent_map *em;
  1608. struct map_lookup *map;
  1609. int ret;
  1610. int i;
  1611. root = root->fs_info->chunk_root;
  1612. extent_root = root->fs_info->extent_root;
  1613. em_tree = &root->fs_info->mapping_tree.map_tree;
  1614. ret = btrfs_can_relocate(extent_root, chunk_offset);
  1615. if (ret)
  1616. return -ENOSPC;
  1617. /* step one, relocate all the extents inside this chunk */
  1618. ret = btrfs_relocate_block_group(extent_root, chunk_offset);
  1619. if (ret)
  1620. return ret;
  1621. trans = btrfs_start_transaction(root, 0);
  1622. BUG_ON(IS_ERR(trans));
  1623. lock_chunks(root);
  1624. /*
  1625. * step two, delete the device extents and the
  1626. * chunk tree entries
  1627. */
  1628. read_lock(&em_tree->lock);
  1629. em = lookup_extent_mapping(em_tree, chunk_offset, 1);
  1630. read_unlock(&em_tree->lock);
  1631. BUG_ON(em->start > chunk_offset ||
  1632. em->start + em->len < chunk_offset);
  1633. map = (struct map_lookup *)em->bdev;
  1634. for (i = 0; i < map->num_stripes; i++) {
  1635. ret = btrfs_free_dev_extent(trans, map->stripes[i].dev,
  1636. map->stripes[i].physical);
  1637. BUG_ON(ret);
  1638. if (map->stripes[i].dev) {
  1639. ret = btrfs_update_device(trans, map->stripes[i].dev);
  1640. BUG_ON(ret);
  1641. }
  1642. }
  1643. ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
  1644. chunk_offset);
  1645. BUG_ON(ret);
  1646. trace_btrfs_chunk_free(root, map, chunk_offset, em->len);
  1647. if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
  1648. ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
  1649. BUG_ON(ret);
  1650. }
  1651. ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
  1652. BUG_ON(ret);
  1653. write_lock(&em_tree->lock);
  1654. remove_extent_mapping(em_tree, em);
  1655. write_unlock(&em_tree->lock);
  1656. kfree(map);
  1657. em->bdev = NULL;
  1658. /* once for the tree */
  1659. free_extent_map(em);
  1660. /* once for us */
  1661. free_extent_map(em);
  1662. unlock_chunks(root);
  1663. btrfs_end_transaction(trans, root);
  1664. return 0;
  1665. }
  1666. static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
  1667. {
  1668. struct btrfs_root *chunk_root = root->fs_info->chunk_root;
  1669. struct btrfs_path *path;
  1670. struct extent_buffer *leaf;
  1671. struct btrfs_chunk *chunk;
  1672. struct btrfs_key key;
  1673. struct btrfs_key found_key;
  1674. u64 chunk_tree = chunk_root->root_key.objectid;
  1675. u64 chunk_type;
  1676. bool retried = false;
  1677. int failed = 0;
  1678. int ret;
  1679. path = btrfs_alloc_path();
  1680. if (!path)
  1681. return -ENOMEM;
  1682. again:
  1683. key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
  1684. key.offset = (u64)-1;
  1685. key.type = BTRFS_CHUNK_ITEM_KEY;
  1686. while (1) {
  1687. ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
  1688. if (ret < 0)
  1689. goto error;
  1690. BUG_ON(ret == 0);
  1691. ret = btrfs_previous_item(chunk_root, path, key.objectid,
  1692. key.type);
  1693. if (ret < 0)
  1694. goto error;
  1695. if (ret > 0)
  1696. break;
  1697. leaf = path->nodes[0];
  1698. btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
  1699. chunk = btrfs_item_ptr(leaf, path->slots[0],
  1700. struct btrfs_chunk);
  1701. chunk_type = btrfs_chunk_type(leaf, chunk);
  1702. btrfs_release_path(path);
  1703. if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
  1704. ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
  1705. found_key.objectid,
  1706. found_key.offset);
  1707. if (ret == -ENOSPC)
  1708. failed++;
  1709. else if (ret)
  1710. BUG();
  1711. }
  1712. if (found_key.offset == 0)
  1713. break;
  1714. key.offset = found_key.offset - 1;
  1715. }
  1716. ret = 0;
  1717. if (failed && !retried) {
  1718. failed = 0;
  1719. retried = true;
  1720. goto again;
  1721. } else if (failed && retried) {
  1722. WARN_ON(1);
  1723. ret = -ENOSPC;
  1724. }
  1725. error:
  1726. btrfs_free_path(path);
  1727. return ret;
  1728. }
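/*
 * div_factor(num, factor) returns (num * factor) / 10, i.e. "factor
 * tenths" of num; factor == 10 returns num unchanged.  For example,
 * div_factor(x, 1) is roughly 10% of x, which is how the callers below
 * use it.
 */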
  1729. static u64 div_factor(u64 num, int factor)
  1730. {
  1731. if (factor == 10)
  1732. return num;
  1733. num *= factor;
  1734. do_div(num, 10);
  1735. return num;
  1736. }
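/*
 * Balancing here simply rewrites everything: step one makes a little
 * room (at most 1MB) on each writeable device that is almost fully
 * allocated, by shrinking it and immediately growing it back; step two
 * walks the chunk tree from the highest key downwards and relocates
 * every chunk it finds (chunk zero is skipped).
 */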
  1737. int btrfs_balance(struct btrfs_root *dev_root)
  1738. {
  1739. int ret;
  1740. struct list_head *devices = &dev_root->fs_info->fs_devices->devices;
  1741. struct btrfs_device *device;
  1742. u64 old_size;
  1743. u64 size_to_free;
  1744. struct btrfs_path *path;
  1745. struct btrfs_key key;
  1746. struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
  1747. struct btrfs_trans_handle *trans;
  1748. struct btrfs_key found_key;
  1749. if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
  1750. return -EROFS;
  1751. if (!capable(CAP_SYS_ADMIN))
  1752. return -EPERM;
  1753. mutex_lock(&dev_root->fs_info->volume_mutex);
  1754. dev_root = dev_root->fs_info->dev_root;
  1755. /* step one, make some room on all the devices */
  1756. list_for_each_entry(device, devices, dev_list) {
  1757. old_size = device->total_bytes;
  1758. size_to_free = div_factor(old_size, 1);
  1759. size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
  1760. if (!device->writeable ||
  1761. device->total_bytes - device->bytes_used > size_to_free)
  1762. continue;
  1763. ret = btrfs_shrink_device(device, old_size - size_to_free);
  1764. if (ret == -ENOSPC)
  1765. break;
  1766. BUG_ON(ret);
  1767. trans = btrfs_start_transaction(dev_root, 0);
  1768. BUG_ON(IS_ERR(trans));
  1769. ret = btrfs_grow_device(trans, device, old_size);
  1770. BUG_ON(ret);
  1771. btrfs_end_transaction(trans, dev_root);
  1772. }
  1773. /* step two, relocate all the chunks */
  1774. path = btrfs_alloc_path();
  1775. BUG_ON(!path);
  1776. key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
  1777. key.offset = (u64)-1;
  1778. key.type = BTRFS_CHUNK_ITEM_KEY;
  1779. while (1) {
  1780. ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
  1781. if (ret < 0)
  1782. goto error;
  1783. /*
  1784. * this shouldn't happen, it means the last relocate
  1785. * failed
  1786. */
  1787. if (ret == 0)
  1788. break;
  1789. ret = btrfs_previous_item(chunk_root, path, 0,
  1790. BTRFS_CHUNK_ITEM_KEY);
  1791. if (ret)
  1792. break;
  1793. btrfs_item_key_to_cpu(path->nodes[0], &found_key,
  1794. path->slots[0]);
  1795. if (found_key.objectid != key.objectid)
  1796. break;
  1797. /* chunk zero is special */
  1798. if (found_key.offset == 0)
  1799. break;
  1800. btrfs_release_path(path);
  1801. ret = btrfs_relocate_chunk(chunk_root,
  1802. chunk_root->root_key.objectid,
  1803. found_key.objectid,
  1804. found_key.offset);
  1805. if (ret && ret != -ENOSPC)
  1806. goto error;
  1807. key.offset = found_key.offset - 1;
  1808. }
  1809. ret = 0;
  1810. error:
  1811. btrfs_free_path(path);
  1812. mutex_unlock(&dev_root->fs_info->volume_mutex);
  1813. return ret;
  1814. }
  1815. /*
  1816. * shrinking a device means finding all of the device extents past
  1817. * the new size, and then following the back refs to the chunks.
  1818. * The chunk relocation code actually frees the device extent
  1819. */
  1820. int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
  1821. {
  1822. struct btrfs_trans_handle *trans;
  1823. struct btrfs_root *root = device->dev_root;
  1824. struct btrfs_dev_extent *dev_extent = NULL;
  1825. struct btrfs_path *path;
  1826. u64 length;
  1827. u64 chunk_tree;
  1828. u64 chunk_objectid;
  1829. u64 chunk_offset;
  1830. int ret;
  1831. int slot;
  1832. int failed = 0;
  1833. bool retried = false;
  1834. struct extent_buffer *l;
  1835. struct btrfs_key key;
  1836. struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
  1837. u64 old_total = btrfs_super_total_bytes(super_copy);
  1838. u64 old_size = device->total_bytes;
  1839. u64 diff = device->total_bytes - new_size;
  1840. if (new_size >= device->total_bytes)
  1841. return -EINVAL;
  1842. path = btrfs_alloc_path();
  1843. if (!path)
  1844. return -ENOMEM;
  1845. path->reada = 2;
  1846. lock_chunks(root);
  1847. device->total_bytes = new_size;
  1848. if (device->writeable)
  1849. device->fs_devices->total_rw_bytes -= diff;
  1850. unlock_chunks(root);
  1851. again:
  1852. key.objectid = device->devid;
  1853. key.offset = (u64)-1;
  1854. key.type = BTRFS_DEV_EXTENT_KEY;
  1855. while (1) {
  1856. ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
  1857. if (ret < 0)
  1858. goto done;
  1859. ret = btrfs_previous_item(root, path, 0, key.type);
  1860. if (ret < 0)
  1861. goto done;
  1862. if (ret) {
  1863. ret = 0;
  1864. btrfs_release_path(path);
  1865. break;
  1866. }
  1867. l = path->nodes[0];
  1868. slot = path->slots[0];
  1869. btrfs_item_key_to_cpu(l, &key, path->slots[0]);
  1870. if (key.objectid != device->devid) {
  1871. btrfs_release_path(path);
  1872. break;
  1873. }
  1874. dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
  1875. length = btrfs_dev_extent_length(l, dev_extent);
  1876. if (key.offset + length <= new_size) {
  1877. btrfs_release_path(path);
  1878. break;
  1879. }
  1880. chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
  1881. chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
  1882. chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
  1883. btrfs_release_path(path);
  1884. ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
  1885. chunk_offset);
  1886. if (ret && ret != -ENOSPC)
  1887. goto done;
  1888. if (ret == -ENOSPC)
  1889. failed++;
  1890. key.offset -= 1;
  1891. }
  1892. if (failed && !retried) {
  1893. failed = 0;
  1894. retried = true;
  1895. goto again;
  1896. } else if (failed && retried) {
  1897. ret = -ENOSPC;
  1898. lock_chunks(root);
  1899. device->total_bytes = old_size;
  1900. if (device->writeable)
  1901. device->fs_devices->total_rw_bytes += diff;
  1902. unlock_chunks(root);
  1903. goto done;
  1904. }
  1905. /* Shrinking succeeded, else we would be at "done". */
  1906. trans = btrfs_start_transaction(root, 0);
  1907. if (IS_ERR(trans)) {
  1908. ret = PTR_ERR(trans);
  1909. goto done;
  1910. }
  1911. lock_chunks(root);
  1912. device->disk_total_bytes = new_size;
  1913. /* Now btrfs_update_device() will change the on-disk size. */
  1914. ret = btrfs_update_device(trans, device);
  1915. if (ret) {
  1916. unlock_chunks(root);
  1917. btrfs_end_transaction(trans, root);
  1918. goto done;
  1919. }
  1920. WARN_ON(diff > old_total);
  1921. btrfs_set_super_total_bytes(super_copy, old_total - diff);
  1922. unlock_chunks(root);
  1923. btrfs_end_transaction(trans, root);
  1924. done:
  1925. btrfs_free_path(path);
  1926. return ret;
  1927. }
  1928. static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
  1929. struct btrfs_root *root,
  1930. struct btrfs_key *key,
  1931. struct btrfs_chunk *chunk, int item_size)
  1932. {
  1933. struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
  1934. struct btrfs_disk_key disk_key;
  1935. u32 array_size;
  1936. u8 *ptr;
  1937. array_size = btrfs_super_sys_array_size(super_copy);
  1938. if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
  1939. return -EFBIG;
  1940. ptr = super_copy->sys_chunk_array + array_size;
  1941. btrfs_cpu_key_to_disk(&disk_key, key);
  1942. memcpy(ptr, &disk_key, sizeof(disk_key));
  1943. ptr += sizeof(disk_key);
  1944. memcpy(ptr, chunk, item_size);
  1945. item_size += sizeof(disk_key);
  1946. btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
  1947. return 0;
  1948. }
  1949. /*
  1950. * sort the devices in descending order by max_avail, total_avail
  1951. */
  1952. static int btrfs_cmp_device_info(const void *a, const void *b)
  1953. {
  1954. const struct btrfs_device_info *di_a = a;
  1955. const struct btrfs_device_info *di_b = b;
  1956. if (di_a->max_avail > di_b->max_avail)
  1957. return -1;
  1958. if (di_a->max_avail < di_b->max_avail)
  1959. return 1;
  1960. if (di_a->total_avail > di_b->total_avail)
  1961. return -1;
  1962. if (di_a->total_avail < di_b->total_avail)
  1963. return 1;
  1964. return 0;
  1965. }
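/*
 * In outline, __btrfs_alloc_chunk() gathers the largest free hole and
 * the total unallocated space of every writeable device, sorts the
 * devices with btrfs_cmp_device_info(), rounds the usable device count
 * down to a multiple of devs_increment, and then sizes the stripes from
 * the smallest max_avail among the chosen devices, capped so the chunk
 * stays below max_chunk_size and aligned down to BTRFS_STRIPE_LEN.
 */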
  1966. static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
  1967. struct btrfs_root *extent_root,
  1968. struct map_lookup **map_ret,
  1969. u64 *num_bytes_out, u64 *stripe_size_out,
  1970. u64 start, u64 type)
  1971. {
  1972. struct btrfs_fs_info *info = extent_root->fs_info;
  1973. struct btrfs_fs_devices *fs_devices = info->fs_devices;
  1974. struct list_head *cur;
  1975. struct map_lookup *map = NULL;
  1976. struct extent_map_tree *em_tree;
  1977. struct extent_map *em;
  1978. struct btrfs_device_info *devices_info = NULL;
  1979. u64 total_avail;
  1980. int num_stripes; /* total number of stripes to allocate */
  1981. int sub_stripes; /* sub_stripes info for map */
  1982. int dev_stripes; /* stripes per dev */
  1983. int devs_max; /* max devs to use */
  1984. int devs_min; /* min devs needed */
  1985. int devs_increment; /* ndevs has to be a multiple of this */
  1986. int ncopies; /* how many copies of the data there are */
  1987. int ret;
  1988. u64 max_stripe_size;
  1989. u64 max_chunk_size;
  1990. u64 stripe_size;
  1991. u64 num_bytes;
  1992. int ndevs;
  1993. int i;
  1994. int j;
  1995. if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
  1996. (type & BTRFS_BLOCK_GROUP_DUP)) {
  1997. WARN_ON(1);
  1998. type &= ~BTRFS_BLOCK_GROUP_DUP;
  1999. }
  2000. if (list_empty(&fs_devices->alloc_list))
  2001. return -ENOSPC;
  2002. sub_stripes = 1;
  2003. dev_stripes = 1;
  2004. devs_increment = 1;
  2005. ncopies = 1;
  2006. devs_max = 0; /* 0 == as many as possible */
  2007. devs_min = 1;
  2008. /*
  2009. * define the properties of each RAID type.
  2010. * FIXME: move this to a global table and use it in all RAID
  2011. * calculation code
  2012. */
  2013. if (type & (BTRFS_BLOCK_GROUP_DUP)) {
  2014. dev_stripes = 2;
  2015. ncopies = 2;
  2016. devs_max = 1;
  2017. } else if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
  2018. devs_min = 2;
  2019. } else if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
  2020. devs_increment = 2;
  2021. ncopies = 2;
  2022. devs_max = 2;
  2023. devs_min = 2;
  2024. } else if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
  2025. sub_stripes = 2;
  2026. devs_increment = 2;
  2027. ncopies = 2;
  2028. devs_min = 4;
  2029. } else {
  2030. devs_max = 1;
  2031. }
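/*
 * The branches above boil down to (one profile bit assumed set;
 * devs_max == 0 means "as many as possible"):
 *
 *	profile  dev_stripes  devs_max  devs_min  devs_increment  sub_stripes  ncopies
 *	single        1           1         1           1              1           1
 *	DUP           2           1         1           1              1           2
 *	RAID0         1           0         2           1              1           1
 *	RAID1         1           2         2           2              1           2
 *	RAID10        1           0         4           2              2           2
 */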
  2032. if (type & BTRFS_BLOCK_GROUP_DATA) {
  2033. max_stripe_size = 1024 * 1024 * 1024;
  2034. max_chunk_size = 10 * max_stripe_size;
  2035. } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
  2036. max_stripe_size = 256 * 1024 * 1024;
  2037. max_chunk_size = max_stripe_size;
  2038. } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
  2039. max_stripe_size = 8 * 1024 * 1024;
  2040. max_chunk_size = 2 * max_stripe_size;
  2041. } else {
  2042. printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n",
  2043. type);
  2044. BUG_ON(1);
  2045. }
  2046. /* we don't want a chunk larger than 10% of writeable space */
  2047. max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
  2048. max_chunk_size);
  2049. devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
  2050. GFP_NOFS);
  2051. if (!devices_info)
  2052. return -ENOMEM;
  2053. cur = fs_devices->alloc_list.next;
  2054. /*
  2055. * in the first pass through the devices list, we gather information
  2056. * about the available holes on each device.
  2057. */
  2058. ndevs = 0;
  2059. while (cur != &fs_devices->alloc_list) {
  2060. struct btrfs_device *device;
  2061. u64 max_avail;
  2062. u64 dev_offset;
  2063. device = list_entry(cur, struct btrfs_device, dev_alloc_list);
  2064. cur = cur->next;
  2065. if (!device->writeable) {
  2066. printk(KERN_ERR
  2067. "btrfs: read-only device in alloc_list\n");
  2068. WARN_ON(1);
  2069. continue;
  2070. }
  2071. if (!device->in_fs_metadata)
  2072. continue;
  2073. if (device->total_bytes > device->bytes_used)
  2074. total_avail = device->total_bytes - device->bytes_used;
  2075. else
  2076. total_avail = 0;
  2077. /* avail is off by max(alloc_start, 1MB), but that is the same
  2078. * for all devices, so it doesn't hurt the sorting later on
  2079. */
  2080. ret = find_free_dev_extent(trans, device,
  2081. max_stripe_size * dev_stripes,
  2082. &dev_offset, &max_avail);
  2083. if (ret && ret != -ENOSPC)
  2084. goto error;
  2085. if (ret == 0)
  2086. max_avail = max_stripe_size * dev_stripes;
  2087. if (max_avail < BTRFS_STRIPE_LEN * dev_stripes)
  2088. continue;
  2089. devices_info[ndevs].dev_offset = dev_offset;
  2090. devices_info[ndevs].max_avail = max_avail;
  2091. devices_info[ndevs].total_avail = total_avail;
  2092. devices_info[ndevs].dev = device;
  2093. ++ndevs;
  2094. }
  2095. /*
  2096. * now sort the devices by hole size / available space
  2097. */
  2098. sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
  2099. btrfs_cmp_device_info, NULL);
  2100. /* round down to number of usable stripes */
  2101. ndevs -= ndevs % devs_increment;
  2102. if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) {
  2103. ret = -ENOSPC;
  2104. goto error;
  2105. }
  2106. if (devs_max && ndevs > devs_max)
  2107. ndevs = devs_max;
  2108. /*
  2109. * the primary goal is to maximize the number of stripes, so use as many
  2110. * devices as possible, even if the stripes are not maximum sized.
  2111. */
  2112. stripe_size = devices_info[ndevs-1].max_avail;
  2113. num_stripes = ndevs * dev_stripes;
  2114. if (stripe_size * num_stripes > max_chunk_size * ncopies) {
  2115. stripe_size = max_chunk_size * ncopies;
  2116. do_div(stripe_size, num_stripes);
  2117. }
  2118. do_div(stripe_size, dev_stripes);
  2119. do_div(stripe_size, BTRFS_STRIPE_LEN);
  2120. stripe_size *= BTRFS_STRIPE_LEN;
  2121. map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
  2122. if (!map) {
  2123. ret = -ENOMEM;
  2124. goto error;
  2125. }
  2126. map->num_stripes = num_stripes;
  2127. for (i = 0; i < ndevs; ++i) {
  2128. for (j = 0; j < dev_stripes; ++j) {
  2129. int s = i * dev_stripes + j;
  2130. map->stripes[s].dev = devices_info[i].dev;
  2131. map->stripes[s].physical = devices_info[i].dev_offset +
  2132. j * stripe_size;
  2133. }
  2134. }
  2135. map->sector_size = extent_root->sectorsize;
  2136. map->stripe_len = BTRFS_STRIPE_LEN;
  2137. map->io_align = BTRFS_STRIPE_LEN;
  2138. map->io_width = BTRFS_STRIPE_LEN;
  2139. map->type = type;
  2140. map->sub_stripes = sub_stripes;
  2141. *map_ret = map;
  2142. num_bytes = stripe_size * (num_stripes / ncopies);
  2143. *stripe_size_out = stripe_size;
  2144. *num_bytes_out = num_bytes;
  2145. trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes);
  2146. em = alloc_extent_map();
  2147. if (!em) {
  2148. ret = -ENOMEM;
  2149. goto error;
  2150. }
  2151. em->bdev = (struct block_device *)map;
  2152. em->start = start;
  2153. em->len = num_bytes;
  2154. em->block_start = 0;
  2155. em->block_len = em->len;
  2156. em_tree = &extent_root->fs_info->mapping_tree.map_tree;
  2157. write_lock(&em_tree->lock);
  2158. ret = add_extent_mapping(em_tree, em);
  2159. write_unlock(&em_tree->lock);
  2160. BUG_ON(ret);
  2161. free_extent_map(em);
  2162. ret = btrfs_make_block_group(trans, extent_root, 0, type,
  2163. BTRFS_FIRST_CHUNK_TREE_OBJECTID,
  2164. start, num_bytes);
  2165. BUG_ON(ret);
  2166. for (i = 0; i < map->num_stripes; ++i) {
  2167. struct btrfs_device *device;
  2168. u64 dev_offset;
  2169. device = map->stripes[i].dev;
  2170. dev_offset = map->stripes[i].physical;
  2171. ret = btrfs_alloc_dev_extent(trans, device,
  2172. info->chunk_root->root_key.objectid,
  2173. BTRFS_FIRST_CHUNK_TREE_OBJECTID,
  2174. start, dev_offset, stripe_size);
  2175. BUG_ON(ret);
  2176. }
  2177. kfree(devices_info);
  2178. return 0;
  2179. error:
  2180. kfree(map);
  2181. kfree(devices_info);
  2182. return ret;
  2183. }
  2184. static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
  2185. struct btrfs_root *extent_root,
  2186. struct map_lookup *map, u64 chunk_offset,
  2187. u64 chunk_size, u64 stripe_size)
  2188. {
  2189. u64 dev_offset;
  2190. struct btrfs_key key;
  2191. struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
  2192. struct btrfs_device *device;
  2193. struct btrfs_chunk *chunk;
  2194. struct btrfs_stripe *stripe;
  2195. size_t item_size = btrfs_chunk_item_size(map->num_stripes);
  2196. int index = 0;
  2197. int ret;
  2198. chunk = kzalloc(item_size, GFP_NOFS);
  2199. if (!chunk)
  2200. return -ENOMEM;
  2201. index = 0;
  2202. while (index < map->num_stripes) {
  2203. device = map->stripes[index].dev;
  2204. device->bytes_used += stripe_size;
  2205. ret = btrfs_update_device(trans, device);
  2206. BUG_ON(ret);
  2207. index++;
  2208. }
  2209. index = 0;
  2210. stripe = &chunk->stripe;
  2211. while (index < map->num_stripes) {
  2212. device = map->stripes[index].dev;
  2213. dev_offset = map->stripes[index].physical;
  2214. btrfs_set_stack_stripe_devid(stripe, device->devid);
  2215. btrfs_set_stack_stripe_offset(stripe, dev_offset);
  2216. memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
  2217. stripe++;
  2218. index++;
  2219. }
  2220. btrfs_set_stack_chunk_length(chunk, chunk_size);
  2221. btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
  2222. btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
  2223. btrfs_set_stack_chunk_type(chunk, map->type);
  2224. btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
  2225. btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
  2226. btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
  2227. btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
  2228. btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
  2229. key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
  2230. key.type = BTRFS_CHUNK_ITEM_KEY;
  2231. key.offset = chunk_offset;
  2232. ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
  2233. BUG_ON(ret);
  2234. if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
  2235. ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk,
  2236. item_size);
  2237. BUG_ON(ret);
  2238. }
  2239. kfree(chunk);
  2240. return 0;
  2241. }
  2242. /*
  2243. * Chunk allocation falls into two parts. The first part does the work
  2244. * that makes the newly allocated chunk usable, but does not touch the
  2245. * chunk tree. The second part does the work that requires modifying
  2246. * the chunk tree. This division is important for the bootstrap process
  2247. * of adding storage to a seed btrfs.
  2248. */
  2249. int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
  2250. struct btrfs_root *extent_root, u64 type)
  2251. {
  2252. u64 chunk_offset;
  2253. u64 chunk_size;
  2254. u64 stripe_size;
  2255. struct map_lookup *map;
  2256. struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
  2257. int ret;
  2258. ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
  2259. &chunk_offset);
  2260. if (ret)
  2261. return ret;
  2262. ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
  2263. &stripe_size, chunk_offset, type);
  2264. if (ret)
  2265. return ret;
  2266. ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
  2267. chunk_size, stripe_size);
  2268. BUG_ON(ret);
  2269. return 0;
  2270. }
  2271. static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
  2272. struct btrfs_root *root,
  2273. struct btrfs_device *device)
  2274. {
  2275. u64 chunk_offset;
  2276. u64 sys_chunk_offset;
  2277. u64 chunk_size;
  2278. u64 sys_chunk_size;
  2279. u64 stripe_size;
  2280. u64 sys_stripe_size;
  2281. u64 alloc_profile;
  2282. struct map_lookup *map;
  2283. struct map_lookup *sys_map;
  2284. struct btrfs_fs_info *fs_info = root->fs_info;
  2285. struct btrfs_root *extent_root = fs_info->extent_root;
  2286. int ret;
  2287. ret = find_next_chunk(fs_info->chunk_root,
  2288. BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
  2289. BUG_ON(ret);
  2290. alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
  2291. (fs_info->metadata_alloc_profile &
  2292. fs_info->avail_metadata_alloc_bits);
  2293. alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
  2294. ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
  2295. &stripe_size, chunk_offset, alloc_profile);
  2296. BUG_ON(ret);
  2297. sys_chunk_offset = chunk_offset + chunk_size;
  2298. alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
  2299. (fs_info->system_alloc_profile &
  2300. fs_info->avail_system_alloc_bits);
  2301. alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
  2302. ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
  2303. &sys_chunk_size, &sys_stripe_size,
  2304. sys_chunk_offset, alloc_profile);
  2305. BUG_ON(ret);
  2306. ret = btrfs_add_device(trans, fs_info->chunk_root, device);
  2307. BUG_ON(ret);
  2308. /*
  2309. * Modifying the chunk tree requires allocating new blocks from both
  2310. * the system block group and the metadata block group, so operations
  2311. * that modify the chunk tree can only be done after both block
  2312. * groups have been created.
  2313. */
  2314. ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
  2315. chunk_size, stripe_size);
  2316. BUG_ON(ret);
  2317. ret = __finish_chunk_alloc(trans, extent_root, sys_map,
  2318. sys_chunk_offset, sys_chunk_size,
  2319. sys_stripe_size);
  2320. BUG_ON(ret);
  2321. return 0;
  2322. }
  2323. int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
  2324. {
  2325. struct extent_map *em;
  2326. struct map_lookup *map;
  2327. struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
  2328. int readonly = 0;
  2329. int i;
  2330. read_lock(&map_tree->map_tree.lock);
  2331. em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
  2332. read_unlock(&map_tree->map_tree.lock);
  2333. if (!em)
  2334. return 1;
  2335. if (btrfs_test_opt(root, DEGRADED)) {
  2336. free_extent_map(em);
  2337. return 0;
  2338. }
  2339. map = (struct map_lookup *)em->bdev;
  2340. for (i = 0; i < map->num_stripes; i++) {
  2341. if (!map->stripes[i].dev->writeable) {
  2342. readonly = 1;
  2343. break;
  2344. }
  2345. }
  2346. free_extent_map(em);
  2347. return readonly;
  2348. }
  2349. void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
  2350. {
  2351. extent_map_tree_init(&tree->map_tree);
  2352. }
  2353. void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
  2354. {
  2355. struct extent_map *em;
  2356. while (1) {
  2357. write_lock(&tree->map_tree.lock);
  2358. em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
  2359. if (em)
  2360. remove_extent_mapping(&tree->map_tree, em);
  2361. write_unlock(&tree->map_tree.lock);
  2362. if (!em)
  2363. break;
  2364. kfree(em->bdev);
  2365. /* once for us */
  2366. free_extent_map(em);
  2367. /* once for the tree */
  2368. free_extent_map(em);
  2369. }
  2370. }
  2371. int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
  2372. {
  2373. struct extent_map *em;
  2374. struct map_lookup *map;
  2375. struct extent_map_tree *em_tree = &map_tree->map_tree;
  2376. int ret;
  2377. read_lock(&em_tree->lock);
  2378. em = lookup_extent_mapping(em_tree, logical, len);
  2379. read_unlock(&em_tree->lock);
  2380. BUG_ON(!em);
  2381. BUG_ON(em->start > logical || em->start + em->len < logical);
  2382. map = (struct map_lookup *)em->bdev;
  2383. if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
  2384. ret = map->num_stripes;
  2385. else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
  2386. ret = map->sub_stripes;
  2387. else
  2388. ret = 1;
  2389. free_extent_map(em);
  2390. return ret;
  2391. }
  2392. static int find_live_mirror(struct map_lookup *map, int first, int num,
  2393. int optimal)
  2394. {
  2395. int i;
  2396. if (map->stripes[optimal].dev->bdev)
  2397. return optimal;
  2398. for (i = first; i < first + num; i++) {
  2399. if (map->stripes[i].dev->bdev)
  2400. return i;
  2401. }
  2402. /* we couldn't find one that doesn't fail. Just return something
  2403. * and the io error handling code will clean up eventually
  2404. */
  2405. return optimal;
  2406. }
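/*
 * __btrfs_map_block() translates a logical byte range into the physical
 * (device, offset) stripes backing it.  Reads pick a single copy (the
 * requested mirror_num or a live mirror), writes fan out to every copy
 * (all stripes for RAID1/DUP, sub_stripes for RAID10), and for
 * RAID0/1/10/DUP chunks *length is clamped to what fits in one stripe;
 * REQ_DISCARD instead covers the whole range and records a per-stripe
 * length.
 */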
  2407. static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
  2408. u64 logical, u64 *length,
  2409. struct btrfs_multi_bio **multi_ret,
  2410. int mirror_num)
  2411. {
  2412. struct extent_map *em;
  2413. struct map_lookup *map;
  2414. struct extent_map_tree *em_tree = &map_tree->map_tree;
  2415. u64 offset;
  2416. u64 stripe_offset;
  2417. u64 stripe_end_offset;
  2418. u64 stripe_nr;
  2419. u64 stripe_nr_orig;
  2420. u64 stripe_nr_end;
  2421. int stripes_allocated = 8;
  2422. int stripes_required = 1;
  2423. int stripe_index;
  2424. int i;
  2425. int num_stripes;
  2426. int max_errors = 0;
  2427. struct btrfs_multi_bio *multi = NULL;
  2428. if (multi_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
  2429. stripes_allocated = 1;
  2430. again:
  2431. if (multi_ret) {
  2432. multi = kzalloc(btrfs_multi_bio_size(stripes_allocated),
  2433. GFP_NOFS);
  2434. if (!multi)
  2435. return -ENOMEM;
  2436. atomic_set(&multi->error, 0);
  2437. }
  2438. read_lock(&em_tree->lock);
  2439. em = lookup_extent_mapping(em_tree, logical, *length);
  2440. read_unlock(&em_tree->lock);
  2441. if (!em) {
  2442. printk(KERN_CRIT "unable to find logical %llu len %llu\n",
  2443. (unsigned long long)logical,
  2444. (unsigned long long)*length);
  2445. BUG();
  2446. }
  2447. BUG_ON(em->start > logical || em->start + em->len < logical);
  2448. map = (struct map_lookup *)em->bdev;
  2449. offset = logical - em->start;
  2450. if (mirror_num > map->num_stripes)
  2451. mirror_num = 0;
  2452. /* if our multi bio struct is too small, back off and try again */
  2453. if (rw & REQ_WRITE) {
  2454. if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
  2455. BTRFS_BLOCK_GROUP_DUP)) {
  2456. stripes_required = map->num_stripes;
  2457. max_errors = 1;
  2458. } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
  2459. stripes_required = map->sub_stripes;
  2460. max_errors = 1;
  2461. }
  2462. }
  2463. if (rw & REQ_DISCARD) {
  2464. if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
  2465. BTRFS_BLOCK_GROUP_RAID1 |
  2466. BTRFS_BLOCK_GROUP_DUP |
  2467. BTRFS_BLOCK_GROUP_RAID10)) {
  2468. stripes_required = map->num_stripes;
  2469. }
  2470. }
  2471. if (multi_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
  2472. stripes_allocated < stripes_required) {
  2473. stripes_allocated = map->num_stripes;
  2474. free_extent_map(em);
  2475. kfree(multi);
  2476. goto again;
  2477. }
  2478. stripe_nr = offset;
  2479. /*
  2480. * stripe_nr counts the total number of stripes we have to stride
  2481. * to get to this block
  2482. */
  2483. do_div(stripe_nr, map->stripe_len);
  2484. stripe_offset = stripe_nr * map->stripe_len;
  2485. BUG_ON(offset < stripe_offset);
  2486. /* stripe_offset is the offset of this block in its stripe */
  2487. stripe_offset = offset - stripe_offset;
  2488. if (rw & REQ_DISCARD)
  2489. *length = min_t(u64, em->len - offset, *length);
  2490. else if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
  2491. BTRFS_BLOCK_GROUP_RAID1 |
  2492. BTRFS_BLOCK_GROUP_RAID10 |
  2493. BTRFS_BLOCK_GROUP_DUP)) {
  2494. /* we limit the length of each bio to what fits in a stripe */
  2495. *length = min_t(u64, em->len - offset,
  2496. map->stripe_len - stripe_offset);
  2497. } else {
  2498. *length = em->len - offset;
  2499. }
  2500. if (!multi_ret)
  2501. goto out;
  2502. num_stripes = 1;
  2503. stripe_index = 0;
  2504. stripe_nr_orig = stripe_nr;
  2505. stripe_nr_end = (offset + *length + map->stripe_len - 1) &
  2506. (~(map->stripe_len - 1));
  2507. do_div(stripe_nr_end, map->stripe_len);
  2508. stripe_end_offset = stripe_nr_end * map->stripe_len -
  2509. (offset + *length);
  2510. if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
  2511. if (rw & REQ_DISCARD)
  2512. num_stripes = min_t(u64, map->num_stripes,
  2513. stripe_nr_end - stripe_nr_orig);
  2514. stripe_index = do_div(stripe_nr, map->num_stripes);
  2515. } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
  2516. if (rw & (REQ_WRITE | REQ_DISCARD))
  2517. num_stripes = map->num_stripes;
  2518. else if (mirror_num)
  2519. stripe_index = mirror_num - 1;
  2520. else {
  2521. stripe_index = find_live_mirror(map, 0,
  2522. map->num_stripes,
  2523. current->pid % map->num_stripes);
  2524. }
  2525. } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
  2526. if (rw & (REQ_WRITE | REQ_DISCARD))
  2527. num_stripes = map->num_stripes;
  2528. else if (mirror_num)
  2529. stripe_index = mirror_num - 1;
  2530. } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
  2531. int factor = map->num_stripes / map->sub_stripes;
  2532. stripe_index = do_div(stripe_nr, factor);
  2533. stripe_index *= map->sub_stripes;
  2534. if (rw & REQ_WRITE)
  2535. num_stripes = map->sub_stripes;
  2536. else if (rw & REQ_DISCARD)
  2537. num_stripes = min_t(u64, map->sub_stripes *
  2538. (stripe_nr_end - stripe_nr_orig),
  2539. map->num_stripes);
  2540. else if (mirror_num)
  2541. stripe_index += mirror_num - 1;
  2542. else {
  2543. stripe_index = find_live_mirror(map, stripe_index,
  2544. map->sub_stripes, stripe_index +
  2545. current->pid % map->sub_stripes);
  2546. }
  2547. } else {
  2548. /*
  2549. * after this do_div call, stripe_nr is the number of stripes
  2550. * on this device we have to walk to find the data, and
  2551. * stripe_index is the number of our device in the stripe array
  2552. */
  2553. stripe_index = do_div(stripe_nr, map->num_stripes);
  2554. }
  2555. BUG_ON(stripe_index >= map->num_stripes);
  2556. if (rw & REQ_DISCARD) {
  2557. for (i = 0; i < num_stripes; i++) {
  2558. multi->stripes[i].physical =
  2559. map->stripes[stripe_index].physical +
  2560. stripe_offset + stripe_nr * map->stripe_len;
  2561. multi->stripes[i].dev = map->stripes[stripe_index].dev;
  2562. if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
  2563. u64 stripes;
  2564. u32 last_stripe = 0;
  2565. int j;
  2566. div_u64_rem(stripe_nr_end - 1,
  2567. map->num_stripes,
  2568. &last_stripe);
  2569. for (j = 0; j < map->num_stripes; j++) {
  2570. u32 test;
  2571. div_u64_rem(stripe_nr_end - 1 - j,
  2572. map->num_stripes, &test);
  2573. if (test == stripe_index)
  2574. break;
  2575. }
  2576. stripes = stripe_nr_end - 1 - j;
  2577. do_div(stripes, map->num_stripes);
  2578. multi->stripes[i].length = map->stripe_len *
  2579. (stripes - stripe_nr + 1);
  2580. if (i == 0) {
  2581. multi->stripes[i].length -=
  2582. stripe_offset;
  2583. stripe_offset = 0;
  2584. }
  2585. if (stripe_index == last_stripe)
  2586. multi->stripes[i].length -=
  2587. stripe_end_offset;
  2588. } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
  2589. u64 stripes;
  2590. int j;
  2591. int factor = map->num_stripes /
  2592. map->sub_stripes;
  2593. u32 last_stripe = 0;
  2594. div_u64_rem(stripe_nr_end - 1,
  2595. factor, &last_stripe);
  2596. last_stripe *= map->sub_stripes;
  2597. for (j = 0; j < factor; j++) {
  2598. u32 test;
  2599. div_u64_rem(stripe_nr_end - 1 - j,
  2600. factor, &test);
  2601. if (test ==
  2602. stripe_index / map->sub_stripes)
  2603. break;
  2604. }
  2605. stripes = stripe_nr_end - 1 - j;
  2606. do_div(stripes, factor);
  2607. multi->stripes[i].length = map->stripe_len *
  2608. (stripes - stripe_nr + 1);
  2609. if (i < map->sub_stripes) {
  2610. multi->stripes[i].length -=
  2611. stripe_offset;
  2612. if (i == map->sub_stripes - 1)
  2613. stripe_offset = 0;
  2614. }
  2615. if (stripe_index >= last_stripe &&
  2616. stripe_index <= (last_stripe +
  2617. map->sub_stripes - 1)) {
  2618. multi->stripes[i].length -=
  2619. stripe_end_offset;
  2620. }
  2621. } else
  2622. multi->stripes[i].length = *length;
  2623. stripe_index++;
  2624. if (stripe_index == map->num_stripes) {
  2625. /* This could only happen for RAID0/10 */
  2626. stripe_index = 0;
  2627. stripe_nr++;
  2628. }
  2629. }
  2630. } else {
  2631. for (i = 0; i < num_stripes; i++) {
  2632. multi->stripes[i].physical =
  2633. map->stripes[stripe_index].physical +
  2634. stripe_offset +
  2635. stripe_nr * map->stripe_len;
  2636. multi->stripes[i].dev =
  2637. map->stripes[stripe_index].dev;
  2638. stripe_index++;
  2639. }
  2640. }
  2641. if (multi_ret) {
  2642. *multi_ret = multi;
  2643. multi->num_stripes = num_stripes;
  2644. multi->max_errors = max_errors;
  2645. }
  2646. out:
  2647. free_extent_map(em);
  2648. return 0;
  2649. }
  2650. int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
  2651. u64 logical, u64 *length,
  2652. struct btrfs_multi_bio **multi_ret, int mirror_num)
  2653. {
  2654. return __btrfs_map_block(map_tree, rw, logical, length, multi_ret,
  2655. mirror_num);
  2656. }
  2657. int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
  2658. u64 chunk_start, u64 physical, u64 devid,
  2659. u64 **logical, int *naddrs, int *stripe_len)
  2660. {
  2661. struct extent_map_tree *em_tree = &map_tree->map_tree;
  2662. struct extent_map *em;
  2663. struct map_lookup *map;
  2664. u64 *buf;
  2665. u64 bytenr;
  2666. u64 length;
  2667. u64 stripe_nr;
  2668. int i, j, nr = 0;
  2669. read_lock(&em_tree->lock);
  2670. em = lookup_extent_mapping(em_tree, chunk_start, 1);
  2671. read_unlock(&em_tree->lock);
  2672. BUG_ON(!em || em->start != chunk_start);
  2673. map = (struct map_lookup *)em->bdev;
  2674. length = em->len;
  2675. if (map->type & BTRFS_BLOCK_GROUP_RAID10)
  2676. do_div(length, map->num_stripes / map->sub_stripes);
  2677. else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
  2678. do_div(length, map->num_stripes);
  2679. buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
  2680. BUG_ON(!buf);
  2681. for (i = 0; i < map->num_stripes; i++) {
  2682. if (devid && map->stripes[i].dev->devid != devid)
  2683. continue;
  2684. if (map->stripes[i].physical > physical ||
  2685. map->stripes[i].physical + length <= physical)
  2686. continue;
  2687. stripe_nr = physical - map->stripes[i].physical;
  2688. do_div(stripe_nr, map->stripe_len);
  2689. if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
  2690. stripe_nr = stripe_nr * map->num_stripes + i;
  2691. do_div(stripe_nr, map->sub_stripes);
  2692. } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
  2693. stripe_nr = stripe_nr * map->num_stripes + i;
  2694. }
  2695. bytenr = chunk_start + stripe_nr * map->stripe_len;
  2696. WARN_ON(nr >= map->num_stripes);
  2697. for (j = 0; j < nr; j++) {
  2698. if (buf[j] == bytenr)
  2699. break;
  2700. }
  2701. if (j == nr) {
  2702. WARN_ON(nr >= map->num_stripes);
  2703. buf[nr++] = bytenr;
  2704. }
  2705. }
  2706. *logical = buf;
  2707. *naddrs = nr;
  2708. *stripe_len = map->stripe_len;
  2709. free_extent_map(em);
  2710. return 0;
  2711. }
  2712. static void end_bio_multi_stripe(struct bio *bio, int err)
  2713. {
  2714. struct btrfs_multi_bio *multi = bio->bi_private;
  2715. int is_orig_bio = 0;
  2716. if (err)
  2717. atomic_inc(&multi->error);
  2718. if (bio == multi->orig_bio)
  2719. is_orig_bio = 1;
  2720. if (atomic_dec_and_test(&multi->stripes_pending)) {
  2721. if (!is_orig_bio) {
  2722. bio_put(bio);
  2723. bio = multi->orig_bio;
  2724. }
  2725. bio->bi_private = multi->private;
  2726. bio->bi_end_io = multi->end_io;
  2727. /* only send an error to the higher layers if it is
  2728. * beyond the tolerance of the multi-bio
  2729. */
  2730. if (atomic_read(&multi->error) > multi->max_errors) {
  2731. err = -EIO;
  2732. } else if (err) {
  2733. /*
  2734. * this bio is actually up to date, we didn't
  2735. * go over the max number of errors
  2736. */
  2737. set_bit(BIO_UPTODATE, &bio->bi_flags);
  2738. err = 0;
  2739. }
  2740. kfree(multi);
  2741. bio_endio(bio, err);
  2742. } else if (!is_orig_bio) {
  2743. bio_put(bio);
  2744. }
  2745. }
  2746. struct async_sched {
  2747. struct bio *bio;
  2748. int rw;
  2749. struct btrfs_fs_info *info;
  2750. struct btrfs_work work;
  2751. };
  2752. /*
  2753. * see run_scheduled_bios for a description of why bios are collected for
  2754. * async submit.
  2755. *
  2756. * This will add one bio to the pending list for a device and make sure
  2757. * the work struct is scheduled.
  2758. */
  2759. static noinline int schedule_bio(struct btrfs_root *root,
  2760. struct btrfs_device *device,
  2761. int rw, struct bio *bio)
  2762. {
  2763. int should_queue = 1;
  2764. struct btrfs_pending_bios *pending_bios;
  2765. /* don't bother with additional async steps for reads, right now */
  2766. if (!(rw & REQ_WRITE)) {
  2767. bio_get(bio);
  2768. submit_bio(rw, bio);
  2769. bio_put(bio);
  2770. return 0;
  2771. }
  2772. /*
  2773. * nr_async_bios allows us to reliably return congestion to the
  2774. * higher layers. Otherwise, the async bio makes it appear we have
  2775. * made progress against dirty pages when we've really just put it
  2776. * on a queue for later
  2777. */
  2778. atomic_inc(&root->fs_info->nr_async_bios);
  2779. WARN_ON(bio->bi_next);
  2780. bio->bi_next = NULL;
  2781. bio->bi_rw |= rw;
  2782. spin_lock(&device->io_lock);
  2783. if (bio->bi_rw & REQ_SYNC)
  2784. pending_bios = &device->pending_sync_bios;
  2785. else
  2786. pending_bios = &device->pending_bios;
  2787. if (pending_bios->tail)
  2788. pending_bios->tail->bi_next = bio;
  2789. pending_bios->tail = bio;
  2790. if (!pending_bios->head)
  2791. pending_bios->head = bio;
  2792. if (device->running_pending)
  2793. should_queue = 0;
  2794. spin_unlock(&device->io_lock);
  2795. if (should_queue)
  2796. btrfs_queue_worker(&root->fs_info->submit_workers,
  2797. &device->work);
  2798. return 0;
  2799. }
  2800. int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
  2801. int mirror_num, int async_submit)
  2802. {
  2803. struct btrfs_mapping_tree *map_tree;
  2804. struct btrfs_device *dev;
  2805. struct bio *first_bio = bio;
  2806. u64 logical = (u64)bio->bi_sector << 9;
  2807. u64 length = 0;
  2808. u64 map_length;
  2809. struct btrfs_multi_bio *multi = NULL;
  2810. int ret;
  2811. int dev_nr = 0;
  2812. int total_devs = 1;
  2813. length = bio->bi_size;
  2814. map_tree = &root->fs_info->mapping_tree;
  2815. map_length = length;
  2816. ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi,
  2817. mirror_num);
  2818. BUG_ON(ret);
  2819. total_devs = multi->num_stripes;
  2820. if (map_length < length) {
  2821. printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
  2822. "len %llu\n", (unsigned long long)logical,
  2823. (unsigned long long)length,
  2824. (unsigned long long)map_length);
  2825. BUG();
  2826. }
  2827. multi->end_io = first_bio->bi_end_io;
  2828. multi->private = first_bio->bi_private;
  2829. multi->orig_bio = first_bio;
  2830. atomic_set(&multi->stripes_pending, multi->num_stripes);
  2831. while (dev_nr < total_devs) {
  2832. if (total_devs > 1) {
  2833. if (dev_nr < total_devs - 1) {
  2834. bio = bio_clone(first_bio, GFP_NOFS);
  2835. BUG_ON(!bio);
  2836. } else {
  2837. bio = first_bio;
  2838. }
  2839. bio->bi_private = multi;
  2840. bio->bi_end_io = end_bio_multi_stripe;
  2841. }
  2842. bio->bi_sector = multi->stripes[dev_nr].physical >> 9;
  2843. dev = multi->stripes[dev_nr].dev;
  2844. if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
  2845. bio->bi_bdev = dev->bdev;
  2846. if (async_submit)
  2847. schedule_bio(root, dev, rw, bio);
  2848. else
  2849. submit_bio(rw, bio);
  2850. } else {
  2851. bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
  2852. bio->bi_sector = logical >> 9;
  2853. bio_endio(bio, -EIO);
  2854. }
  2855. dev_nr++;
  2856. }
  2857. if (total_devs == 1)
  2858. kfree(multi);
  2859. return 0;
  2860. }
  2861. struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
  2862. u8 *uuid, u8 *fsid)
  2863. {
  2864. struct btrfs_device *device;
  2865. struct btrfs_fs_devices *cur_devices;
  2866. cur_devices = root->fs_info->fs_devices;
  2867. while (cur_devices) {
  2868. if (!fsid ||
  2869. !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
  2870. device = __find_device(&cur_devices->devices,
  2871. devid, uuid);
  2872. if (device)
  2873. return device;
  2874. }
  2875. cur_devices = cur_devices->seed;
  2876. }
  2877. return NULL;
  2878. }
  2879. static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
  2880. u64 devid, u8 *dev_uuid)
  2881. {
  2882. struct btrfs_device *device;
  2883. struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
  2884. device = kzalloc(sizeof(*device), GFP_NOFS);
  2885. if (!device)
  2886. return NULL;
  2887. list_add(&device->dev_list,
  2888. &fs_devices->devices);
  2889. device->dev_root = root->fs_info->dev_root;
  2890. device->devid = devid;
  2891. device->work.func = pending_bios_fn;
  2892. device->fs_devices = fs_devices;
  2893. device->missing = 1;
  2894. fs_devices->num_devices++;
  2895. fs_devices->missing_devices++;
  2896. spin_lock_init(&device->io_lock);
  2897. INIT_LIST_HEAD(&device->dev_alloc_list);
  2898. memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
  2899. return device;
  2900. }
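/*
 * read_one_chunk() turns an on-disk chunk item into an in-memory
 * extent_map whose bdev pointer carries a struct map_lookup describing
 * the stripes, and inserts it into the fs-wide mapping tree.  A stripe
 * on an unknown device is fatal (-EIO) unless the filesystem is mounted
 * degraded, in which case add_missing_dev() above supplies a
 * placeholder device.
 */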
  2901. static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
  2902. struct extent_buffer *leaf,
  2903. struct btrfs_chunk *chunk)
  2904. {
  2905. struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
  2906. struct map_lookup *map;
  2907. struct extent_map *em;
  2908. u64 logical;
  2909. u64 length;
  2910. u64 devid;
  2911. u8 uuid[BTRFS_UUID_SIZE];
  2912. int num_stripes;
  2913. int ret;
  2914. int i;
  2915. logical = key->offset;
  2916. length = btrfs_chunk_length(leaf, chunk);
  2917. read_lock(&map_tree->map_tree.lock);
  2918. em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
  2919. read_unlock(&map_tree->map_tree.lock);
  2920. /* already mapped? */
  2921. if (em && em->start <= logical && em->start + em->len > logical) {
  2922. free_extent_map(em);
  2923. return 0;
  2924. } else if (em) {
  2925. free_extent_map(em);
  2926. }
  2927. em = alloc_extent_map();
  2928. if (!em)
  2929. return -ENOMEM;
  2930. num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
  2931. map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
  2932. if (!map) {
  2933. free_extent_map(em);
  2934. return -ENOMEM;
  2935. }
  2936. em->bdev = (struct block_device *)map;
  2937. em->start = logical;
  2938. em->len = length;
  2939. em->block_start = 0;
  2940. em->block_len = em->len;
  2941. map->num_stripes = num_stripes;
  2942. map->io_width = btrfs_chunk_io_width(leaf, chunk);
  2943. map->io_align = btrfs_chunk_io_align(leaf, chunk);
  2944. map->sector_size = btrfs_chunk_sector_size(leaf, chunk);
  2945. map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
  2946. map->type = btrfs_chunk_type(leaf, chunk);
  2947. map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
  2948. for (i = 0; i < num_stripes; i++) {
  2949. map->stripes[i].physical =
  2950. btrfs_stripe_offset_nr(leaf, chunk, i);
  2951. devid = btrfs_stripe_devid_nr(leaf, chunk, i);
  2952. read_extent_buffer(leaf, uuid, (unsigned long)
  2953. btrfs_stripe_dev_uuid_nr(chunk, i),
  2954. BTRFS_UUID_SIZE);
  2955. map->stripes[i].dev = btrfs_find_device(root, devid, uuid,
  2956. NULL);
  2957. if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
  2958. kfree(map);
  2959. free_extent_map(em);
  2960. return -EIO;
  2961. }
  2962. if (!map->stripes[i].dev) {
  2963. map->stripes[i].dev =
  2964. add_missing_dev(root, devid, uuid);
  2965. if (!map->stripes[i].dev) {
  2966. kfree(map);
  2967. free_extent_map(em);
  2968. return -EIO;
  2969. }
  2970. }
  2971. map->stripes[i].dev->in_fs_metadata = 1;
  2972. }
  2973. write_lock(&map_tree->map_tree.lock);
  2974. ret = add_extent_mapping(&map_tree->map_tree, em);
  2975. write_unlock(&map_tree->map_tree.lock);
  2976. BUG_ON(ret);
  2977. free_extent_map(em);
  2978. return 0;
  2979. }
  2980. static int fill_device_from_item(struct extent_buffer *leaf,
  2981. struct btrfs_dev_item *dev_item,
  2982. struct btrfs_device *device)
  2983. {
  2984. unsigned long ptr;
  2985. device->devid = btrfs_device_id(leaf, dev_item);
  2986. device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
  2987. device->total_bytes = device->disk_total_bytes;
  2988. device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
  2989. device->type = btrfs_device_type(leaf, dev_item);
  2990. device->io_align = btrfs_device_io_align(leaf, dev_item);
  2991. device->io_width = btrfs_device_io_width(leaf, dev_item);
  2992. device->sector_size = btrfs_device_sector_size(leaf, dev_item);
  2993. ptr = (unsigned long)btrfs_device_uuid(dev_item);
  2994. read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
  2995. return 0;
  2996. }
  2997. static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
  2998. {
  2999. struct btrfs_fs_devices *fs_devices;
  3000. int ret;
  3001. mutex_lock(&uuid_mutex);
  3002. fs_devices = root->fs_info->fs_devices->seed;
  3003. while (fs_devices) {
  3004. if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
  3005. ret = 0;
  3006. goto out;
  3007. }
  3008. fs_devices = fs_devices->seed;
  3009. }
  3010. fs_devices = find_fsid(fsid);
  3011. if (!fs_devices) {
  3012. ret = -ENOENT;
  3013. goto out;
  3014. }
  3015. fs_devices = clone_fs_devices(fs_devices);
  3016. if (IS_ERR(fs_devices)) {
  3017. ret = PTR_ERR(fs_devices);
  3018. goto out;
  3019. }
  3020. ret = __btrfs_open_devices(fs_devices, FMODE_READ,
  3021. root->fs_info->bdev_holder);
  3022. if (ret)
  3023. goto out;
  3024. if (!fs_devices->seeding) {
  3025. __btrfs_close_devices(fs_devices);
  3026. free_fs_devices(fs_devices);
  3027. ret = -EINVAL;
  3028. goto out;
  3029. }
  3030. fs_devices->seed = root->fs_info->fs_devices->seed;
  3031. root->fs_info->fs_devices->seed = fs_devices;
  3032. out:
  3033. mutex_unlock(&uuid_mutex);
  3034. return ret;
  3035. }
  3036. static int read_one_dev(struct btrfs_root *root,
  3037. struct extent_buffer *leaf,
  3038. struct btrfs_dev_item *dev_item)
  3039. {
  3040. struct btrfs_device *device;
  3041. u64 devid;
  3042. int ret;
  3043. u8 fs_uuid[BTRFS_UUID_SIZE];
  3044. u8 dev_uuid[BTRFS_UUID_SIZE];
  3045. devid = btrfs_device_id(leaf, dev_item);
  3046. read_extent_buffer(leaf, dev_uuid,
  3047. (unsigned long)btrfs_device_uuid(dev_item),
  3048. BTRFS_UUID_SIZE);
  3049. read_extent_buffer(leaf, fs_uuid,
  3050. (unsigned long)btrfs_device_fsid(dev_item),
  3051. BTRFS_UUID_SIZE);
  3052. if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
  3053. ret = open_seed_devices(root, fs_uuid);
  3054. if (ret && !btrfs_test_opt(root, DEGRADED))
  3055. return ret;
  3056. }
  3057. device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
  3058. if (!device || !device->bdev) {
  3059. if (!btrfs_test_opt(root, DEGRADED))
  3060. return -EIO;
  3061. if (!device) {
  3062. printk(KERN_WARNING "warning devid %llu missing\n",
  3063. (unsigned long long)devid);
  3064. device = add_missing_dev(root, devid, dev_uuid);
  3065. if (!device)
  3066. return -ENOMEM;
  3067. } else if (!device->missing) {
  3068. /*
  3069. * this happens when a device that was properly set up
  3070. * in the device info lists suddenly goes bad.
  3071. * device->bdev is NULL, and so we have to set
  3072. * device->missing to one here
  3073. */
  3074. root->fs_info->fs_devices->missing_devices++;
  3075. device->missing = 1;
  3076. }
  3077. }
  3078. if (device->fs_devices != root->fs_info->fs_devices) {
  3079. BUG_ON(device->writeable);
  3080. if (device->generation !=
  3081. btrfs_device_generation(leaf, dev_item))
  3082. return -EINVAL;
  3083. }
  3084. fill_device_from_item(leaf, dev_item, device);
  3085. device->dev_root = root->fs_info->dev_root;
  3086. device->in_fs_metadata = 1;
  3087. if (device->writeable)
  3088. device->fs_devices->total_rw_bytes += device->total_bytes;
  3089. ret = 0;
  3090. return ret;
  3091. }
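/*
 * btrfs_read_sys_array() copies the superblock into a private extent
 * buffer so that the chunk items embedded in sys_chunk_array can be
 * parsed with the regular extent-buffer helpers (read_one_chunk() and
 * friends), bootstrapping the mapping tree before the chunk tree itself
 * can be read.
 */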
  3092. int btrfs_read_sys_array(struct btrfs_root *root)
  3093. {
  3094. struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
  3095. struct extent_buffer *sb;
  3096. struct btrfs_disk_key *disk_key;
  3097. struct btrfs_chunk *chunk;
  3098. u8 *ptr;
  3099. unsigned long sb_ptr;
  3100. int ret = 0;
  3101. u32 num_stripes;
  3102. u32 array_size;
  3103. u32 len = 0;
  3104. u32 cur;
  3105. struct btrfs_key key;
  3106. sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
  3107. BTRFS_SUPER_INFO_SIZE);
  3108. if (!sb)
  3109. return -ENOMEM;
  3110. btrfs_set_buffer_uptodate(sb);
  3111. btrfs_set_buffer_lockdep_class(sb, 0);
  3112. write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
  3113. array_size = btrfs_super_sys_array_size(super_copy);
  3114. ptr = super_copy->sys_chunk_array;
  3115. sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
  3116. cur = 0;
  3117. while (cur < array_size) {
  3118. disk_key = (struct btrfs_disk_key *)ptr;
  3119. btrfs_disk_key_to_cpu(&key, disk_key);
  3120. len = sizeof(*disk_key); ptr += len;
  3121. sb_ptr += len;
  3122. cur += len;
  3123. if (key.type == BTRFS_CHUNK_ITEM_KEY) {
  3124. chunk = (struct btrfs_chunk *)sb_ptr;
  3125. ret = read_one_chunk(root, &key, sb, chunk);
  3126. if (ret)
  3127. break;
  3128. num_stripes = btrfs_chunk_num_stripes(sb, chunk);
  3129. len = btrfs_chunk_item_size(num_stripes);
  3130. } else {
  3131. ret = -EIO;
  3132. break;
  3133. }
  3134. ptr += len;
  3135. sb_ptr += len;
  3136. cur += len;
  3137. }
  3138. free_extent_buffer(sb);
  3139. return ret;
  3140. }
  3141. int btrfs_read_chunk_tree(struct btrfs_root *root)
  3142. {
  3143. struct btrfs_path *path;
  3144. struct extent_buffer *leaf;
  3145. struct btrfs_key key;
  3146. struct btrfs_key found_key;
  3147. int ret;
  3148. int slot;
  3149. root = root->fs_info->chunk_root;
  3150. path = btrfs_alloc_path();
  3151. if (!path)
  3152. return -ENOMEM;
  3153. /* first we search for all of the device items, and then we
  3154. * read in all of the chunk items. This way we can create chunk
  3155. * mappings that reference all of the devices that are found
  3156. */
  3157. key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
  3158. key.offset = 0;
  3159. key.type = 0;
  3160. again:
  3161. ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
  3162. if (ret < 0)
  3163. goto error;
  3164. while (1) {
  3165. leaf = path->nodes[0];
  3166. slot = path->slots[0];
  3167. if (slot >= btrfs_header_nritems(leaf)) {
  3168. ret = btrfs_next_leaf(root, path);
  3169. if (ret == 0)
  3170. continue;
  3171. if (ret < 0)
  3172. goto error;
  3173. break;
  3174. }
  3175. btrfs_item_key_to_cpu(leaf, &found_key, slot);
  3176. if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
  3177. if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID)
  3178. break;
  3179. if (found_key.type == BTRFS_DEV_ITEM_KEY) {
  3180. struct btrfs_dev_item *dev_item;
  3181. dev_item = btrfs_item_ptr(leaf, slot,
  3182. struct btrfs_dev_item);
  3183. ret = read_one_dev(root, leaf, dev_item);
  3184. if (ret)
  3185. goto error;
  3186. }
  3187. } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
  3188. struct btrfs_chunk *chunk;
  3189. chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
  3190. ret = read_one_chunk(root, &found_key, leaf, chunk);
  3191. if (ret)
  3192. goto error;
  3193. }
  3194. path->slots[0]++;
  3195. }
  3196. if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
  3197. key.objectid = 0;
  3198. btrfs_release_path(path);
  3199. goto again;
  3200. }
  3201. ret = 0;
  3202. error:
  3203. btrfs_free_path(path);
  3204. return ret;
  3205. }