/fs/btrfs/volumes.c

Source: https://bitbucket.org/slukk/jb-tsm-kernel-4.2 · C · 3718 lines (2949 code, 513 blank, 256 comment)

(Large file: the listing below is truncated; only the beginning of the file is shown.)

/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */
#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
#include <asm/div64.h>
#include "compat.h"
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "async-thread.h"

static int init_first_rw_device(struct btrfs_trans_handle *trans,
				struct btrfs_root *root,
				struct btrfs_device *device);
static int btrfs_relocate_sys_chunks(struct btrfs_root *root);

static DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);

static void lock_chunks(struct btrfs_root *root)
{
	mutex_lock(&root->fs_info->chunk_mutex);
}

static void unlock_chunks(struct btrfs_root *root)
{
	mutex_unlock(&root->fs_info->chunk_mutex);
}

static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device;

	WARN_ON(fs_devices->opened);
	while (!list_empty(&fs_devices->devices)) {
		device = list_entry(fs_devices->devices.next,
				    struct btrfs_device, dev_list);
		list_del(&device->dev_list);
		kfree(device->name);
		kfree(device);
	}
	kfree(fs_devices);
}

int btrfs_cleanup_fs_uuids(void)
{
	struct btrfs_fs_devices *fs_devices;

	while (!list_empty(&fs_uuids)) {
		fs_devices = list_entry(fs_uuids.next,
					struct btrfs_fs_devices, list);
		list_del(&fs_devices->list);
		free_fs_devices(fs_devices);
	}
	return 0;
}

static noinline struct btrfs_device *__find_device(struct list_head *head,
						   u64 devid, u8 *uuid)
{
	struct btrfs_device *dev;

	list_for_each_entry(dev, head, dev_list) {
		if (dev->devid == devid &&
		    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
			return dev;
		}
	}
	return NULL;
}

static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
{
	struct btrfs_fs_devices *fs_devices;

	list_for_each_entry(fs_devices, &fs_uuids, list) {
		if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
			return fs_devices;
	}
	return NULL;
}

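/*
 * Splice the bios from @head..@tail back onto the front of @pending_bios,
 * preserving submission order, so they are retried first on the next pass.
 */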
static void requeue_list(struct btrfs_pending_bios *pending_bios,
			 struct bio *head, struct bio *tail)
{
	struct bio *old_head;

	old_head = pending_bios->head;
	pending_bios->head = head;
	if (pending_bios->tail)
		tail->bi_next = old_head;
	else
		pending_bios->tail = tail;
}

/*
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device.  This greatly
 * improves the scheduler's ability to collect and merge the bios.
 *
 * But, it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block.  The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested.  This way, multiple devices
 * can make progress from a single worker thread.
 */
static noinline int run_scheduled_bios(struct btrfs_device *device)
{
	struct bio *pending;
	struct backing_dev_info *bdi;
	struct btrfs_fs_info *fs_info;
	struct btrfs_pending_bios *pending_bios;
	struct bio *tail;
	struct bio *cur;
	int again = 0;
	unsigned long num_run;
	unsigned long batch_run = 0;
	unsigned long limit;
	unsigned long last_waited = 0;
	int force_reg = 0;
	struct blk_plug plug;

	/*
	 * this function runs all the bios we've collected for
	 * a particular device.  We don't want to wander off to
	 * another device without first sending all of these down.
	 * So, setup a plug here and finish it off before we return
	 */
	blk_start_plug(&plug);

	bdi = blk_get_backing_dev_info(device->bdev);
	fs_info = device->dev_root->fs_info;
	limit = btrfs_async_submit_limit(fs_info);
	limit = limit * 2 / 3;

loop:
	spin_lock(&device->io_lock);

loop_lock:
	num_run = 0;

	/* take all the bios off the list at once and process them
	 * later on (without the lock held).  But, remember the
	 * tail and other pointers so the bios can be properly reinserted
	 * into the list if we hit congestion
	 */
	if (!force_reg && device->pending_sync_bios.head) {
		pending_bios = &device->pending_sync_bios;
		force_reg = 1;
	} else {
		pending_bios = &device->pending_bios;
		force_reg = 0;
	}

	pending = pending_bios->head;
	tail = pending_bios->tail;
	WARN_ON(pending && !tail);

	/*
	 * if pending was null this time around, no bios need processing
	 * at all and we can stop.  Otherwise it'll loop back up again
	 * and do an additional check so no bios are missed.
	 *
	 * device->running_pending is used to synchronize with the
	 * schedule_bio code.
	 */
	if (device->pending_sync_bios.head == NULL &&
	    device->pending_bios.head == NULL) {
		again = 0;
		device->running_pending = 0;
	} else {
		again = 1;
		device->running_pending = 1;
	}

	pending_bios->head = NULL;
	pending_bios->tail = NULL;

	spin_unlock(&device->io_lock);

	while (pending) {
		rmb();
		/* we want to work on both lists, but do more bios on the
		 * sync list than the regular list
		 */
		if ((num_run > 32 &&
		     pending_bios != &device->pending_sync_bios &&
		     device->pending_sync_bios.head) ||
		    (num_run > 64 && pending_bios == &device->pending_sync_bios &&
		     device->pending_bios.head)) {
			spin_lock(&device->io_lock);
			requeue_list(pending_bios, pending, tail);
			goto loop_lock;
		}

		cur = pending;
		pending = pending->bi_next;
		cur->bi_next = NULL;
		atomic_dec(&fs_info->nr_async_bios);

		if (atomic_read(&fs_info->nr_async_bios) < limit &&
		    waitqueue_active(&fs_info->async_submit_wait))
			wake_up(&fs_info->async_submit_wait);

		BUG_ON(atomic_read(&cur->bi_cnt) == 0);

		submit_bio(cur->bi_rw, cur);
		num_run++;
		batch_run++;
		if (need_resched())
			cond_resched();

		/*
		 * we made progress, there is more work to do and the bdi
		 * is now congested.  Back off and let other work structs
		 * run instead
		 */
		if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
		    fs_info->fs_devices->open_devices > 1) {
			struct io_context *ioc;

			ioc = current->io_context;

			/*
			 * the main goal here is that we don't want to
			 * block if we're going to be able to submit
			 * more requests without blocking.
			 *
			 * This code does two great things, it pokes into
			 * the elevator code from a filesystem _and_
			 * it makes assumptions about how batching works.
			 */
			if (ioc && ioc->nr_batch_requests > 0 &&
			    time_before(jiffies, ioc->last_waited + HZ/50UL) &&
			    (last_waited == 0 ||
			     ioc->last_waited == last_waited)) {
				/*
				 * we want to go through our batch of
				 * requests and stop.  So, we copy out
				 * the ioc->last_waited time and test
				 * against it before looping
				 */
				last_waited = ioc->last_waited;
				if (need_resched())
					cond_resched();
				continue;
			}
			spin_lock(&device->io_lock);
			requeue_list(pending_bios, pending, tail);
			device->running_pending = 1;

			spin_unlock(&device->io_lock);
			btrfs_requeue_work(&device->work);
			goto done;
		}
	}

	cond_resched();
	if (again)
		goto loop;

	spin_lock(&device->io_lock);
	if (device->pending_bios.head || device->pending_sync_bios.head)
		goto loop_lock;
	spin_unlock(&device->io_lock);

done:
	blk_finish_plug(&plug);
	return 0;
}

static void pending_bios_fn(struct btrfs_work *work)
{
	struct btrfs_device *device;

	device = container_of(work, struct btrfs_device, work);
	run_scheduled_bios(device);
}

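/*
 * Register a scanned device in the in-memory device lists.  The first
 * time an fsid is seen a new btrfs_fs_devices is allocated; a rescan of
 * a known devid only updates the cached path.  The fs_devices entry also
 * tracks the devid and generation of the most recent super block seen.
 * Called with uuid_mutex held (see btrfs_scan_one_device() below).
 */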
static noinline int device_list_add(const char *path,
			   struct btrfs_super_block *disk_super,
			   u64 devid, struct btrfs_fs_devices **fs_devices_ret)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *fs_devices;
	u64 found_transid = btrfs_super_generation(disk_super);
	char *name;

	fs_devices = find_fsid(disk_super->fsid);
	if (!fs_devices) {
		fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
		if (!fs_devices)
			return -ENOMEM;
		INIT_LIST_HEAD(&fs_devices->devices);
		INIT_LIST_HEAD(&fs_devices->alloc_list);
		list_add(&fs_devices->list, &fs_uuids);
		memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
		fs_devices->latest_devid = devid;
		fs_devices->latest_trans = found_transid;
		mutex_init(&fs_devices->device_list_mutex);
		device = NULL;
	} else {
		device = __find_device(&fs_devices->devices, devid,
				       disk_super->dev_item.uuid);
	}
	if (!device) {
		if (fs_devices->opened)
			return -EBUSY;

		device = kzalloc(sizeof(*device), GFP_NOFS);
		if (!device) {
			/* we can safely leave the fs_devices entry around */
			return -ENOMEM;
		}
		device->devid = devid;
		device->work.func = pending_bios_fn;
		memcpy(device->uuid, disk_super->dev_item.uuid,
		       BTRFS_UUID_SIZE);
		spin_lock_init(&device->io_lock);
		device->name = kstrdup(path, GFP_NOFS);
		if (!device->name) {
			kfree(device);
			return -ENOMEM;
		}
		INIT_LIST_HEAD(&device->dev_alloc_list);

		mutex_lock(&fs_devices->device_list_mutex);
		list_add_rcu(&device->dev_list, &fs_devices->devices);
		mutex_unlock(&fs_devices->device_list_mutex);

		device->fs_devices = fs_devices;
		fs_devices->num_devices++;
	} else if (!device->name || strcmp(device->name, path)) {
		name = kstrdup(path, GFP_NOFS);
		if (!name)
			return -ENOMEM;
		kfree(device->name);
		device->name = name;
		if (device->missing) {
			fs_devices->missing_devices--;
			device->missing = 0;
		}
	}

	if (found_transid > fs_devices->latest_trans) {
		fs_devices->latest_devid = devid;
		fs_devices->latest_trans = found_transid;
	}
	*fs_devices_ret = fs_devices;
	return 0;
}

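/*
 * Duplicate an fs_devices list and its devices.  Used when sprouting off
 * a seed filesystem: the clone keeps the old fsid registered in fs_uuids
 * while the mounted copy gets a new one (see btrfs_prepare_sprout()).
 */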
static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
	struct btrfs_fs_devices *fs_devices;
	struct btrfs_device *device;
	struct btrfs_device *orig_dev;

	fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
	if (!fs_devices)
		return ERR_PTR(-ENOMEM);

	INIT_LIST_HEAD(&fs_devices->devices);
	INIT_LIST_HEAD(&fs_devices->alloc_list);
	INIT_LIST_HEAD(&fs_devices->list);
	mutex_init(&fs_devices->device_list_mutex);
	fs_devices->latest_devid = orig->latest_devid;
	fs_devices->latest_trans = orig->latest_trans;
	memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));

	/* We have held the volume lock, it is safe to get the devices. */
	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
		device = kzalloc(sizeof(*device), GFP_NOFS);
		if (!device)
			goto error;

		device->name = kstrdup(orig_dev->name, GFP_NOFS);
		if (!device->name) {
			kfree(device);
			goto error;
		}

		device->devid = orig_dev->devid;
		device->work.func = pending_bios_fn;
		memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
		spin_lock_init(&device->io_lock);
		INIT_LIST_HEAD(&device->dev_list);
		INIT_LIST_HEAD(&device->dev_alloc_list);

		list_add(&device->dev_list, &fs_devices->devices);
		device->fs_devices = fs_devices;
		fs_devices->num_devices++;
	}
	return fs_devices;
error:
	free_fs_devices(fs_devices);
	return ERR_PTR(-ENOMEM);
}

int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device, *next;

	mutex_lock(&uuid_mutex);
again:
	/* This is the initialized path, it is safe to release the devices. */
	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
		if (device->in_fs_metadata)
			continue;

		if (device->bdev) {
			blkdev_put(device->bdev, device->mode);
			device->bdev = NULL;
			fs_devices->open_devices--;
		}
		if (device->writeable) {
			list_del_init(&device->dev_alloc_list);
			device->writeable = 0;
			fs_devices->rw_devices--;
		}
		list_del_init(&device->dev_list);
		fs_devices->num_devices--;
		kfree(device->name);
		kfree(device);
	}

	if (fs_devices->seed) {
		fs_devices = fs_devices->seed;
		goto again;
	}

	mutex_unlock(&uuid_mutex);
	return 0;
}

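/*
 * Devices are freed in two stages: call_rcu() defers to free_device()
 * after a grace period, which in turn punts the actual teardown to a
 * workqueue, because blkdev_put() may sleep and RCU callbacks run in
 * softirq context.
 */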
static void __free_device(struct work_struct *work)
{
	struct btrfs_device *device;

	device = container_of(work, struct btrfs_device, rcu_work);

	if (device->bdev)
		blkdev_put(device->bdev, device->mode);

	kfree(device->name);
	kfree(device);
}

static void free_device(struct rcu_head *head)
{
	struct btrfs_device *device;

	device = container_of(head, struct btrfs_device, rcu);

	INIT_WORK(&device->rcu_work, __free_device);
	schedule_work(&device->rcu_work);
}

static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device;

	if (--fs_devices->opened > 0)
		return 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		struct btrfs_device *new_device;

		if (device->bdev)
			fs_devices->open_devices--;

		if (device->writeable) {
			list_del_init(&device->dev_alloc_list);
			fs_devices->rw_devices--;
		}

		if (device->can_discard)
			fs_devices->num_can_discard--;

		new_device = kmalloc(sizeof(*new_device), GFP_NOFS);
		BUG_ON(!new_device);
		memcpy(new_device, device, sizeof(*new_device));
		new_device->name = kstrdup(device->name, GFP_NOFS);
		BUG_ON(device->name && !new_device->name);
		new_device->bdev = NULL;
		new_device->writeable = 0;
		new_device->in_fs_metadata = 0;
		new_device->can_discard = 0;
		list_replace_rcu(&device->dev_list, &new_device->dev_list);

		call_rcu(&device->rcu, free_device);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	WARN_ON(fs_devices->open_devices);
	WARN_ON(fs_devices->rw_devices);
	fs_devices->opened = 0;
	fs_devices->seeding = 0;

	return 0;
}

int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_fs_devices *seed_devices = NULL;
	int ret;

	mutex_lock(&uuid_mutex);
	ret = __btrfs_close_devices(fs_devices);
	if (!fs_devices->opened) {
		seed_devices = fs_devices->seed;
		fs_devices->seed = NULL;
	}
	mutex_unlock(&uuid_mutex);

	while (seed_devices) {
		fs_devices = seed_devices;
		seed_devices = fs_devices->seed;
		__btrfs_close_devices(fs_devices);
		free_fs_devices(fs_devices);
	}
	return ret;
}

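/*
 * Open every registered device of the filesystem exclusively, read and
 * verify its super block, and remember the device with the highest
 * generation as latest_bdev.  Devices that fail to open or verify are
 * skipped; only a completely empty set is an error.
 */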
static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
				fmode_t flags, void *holder)
{
	struct request_queue *q;
	struct block_device *bdev;
	struct list_head *head = &fs_devices->devices;
	struct btrfs_device *device;
	struct block_device *latest_bdev = NULL;
	struct buffer_head *bh;
	struct btrfs_super_block *disk_super;
	u64 latest_devid = 0;
	u64 latest_transid = 0;
	u64 devid;
	int seeding = 1;
	int ret = 0;

	flags |= FMODE_EXCL;

	list_for_each_entry(device, head, dev_list) {
		if (device->bdev)
			continue;
		if (!device->name)
			continue;

		bdev = blkdev_get_by_path(device->name, flags, holder);
		if (IS_ERR(bdev)) {
			printk(KERN_INFO "open %s failed\n", device->name);
			goto error;
		}
		set_blocksize(bdev, 4096);

		bh = btrfs_read_dev_super(bdev);
		if (!bh) {
			ret = -EINVAL;
			goto error_close;
		}

		disk_super = (struct btrfs_super_block *)bh->b_data;
		devid = btrfs_stack_device_id(&disk_super->dev_item);
		if (devid != device->devid)
			goto error_brelse;

		if (memcmp(device->uuid, disk_super->dev_item.uuid,
			   BTRFS_UUID_SIZE))
			goto error_brelse;

		device->generation = btrfs_super_generation(disk_super);
		if (!latest_transid || device->generation > latest_transid) {
			latest_devid = devid;
			latest_transid = device->generation;
			latest_bdev = bdev;
		}

		if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
			device->writeable = 0;
		} else {
			device->writeable = !bdev_read_only(bdev);
			seeding = 0;
		}

		q = bdev_get_queue(bdev);
		if (blk_queue_discard(q)) {
			device->can_discard = 1;
			fs_devices->num_can_discard++;
		}

		device->bdev = bdev;
		device->in_fs_metadata = 0;
		device->mode = flags;

		if (!blk_queue_nonrot(bdev_get_queue(bdev)))
			fs_devices->rotating = 1;

		fs_devices->open_devices++;
		if (device->writeable) {
			fs_devices->rw_devices++;
			list_add(&device->dev_alloc_list,
				 &fs_devices->alloc_list);
		}
		brelse(bh);
		continue;

error_brelse:
		brelse(bh);
error_close:
		blkdev_put(bdev, flags);
error:
		continue;
	}
	if (fs_devices->open_devices == 0) {
		ret = -EIO;
		goto out;
	}
	fs_devices->seeding = seeding;
	fs_devices->opened = 1;
	fs_devices->latest_bdev = latest_bdev;
	fs_devices->latest_devid = latest_devid;
	fs_devices->latest_trans = latest_transid;
	fs_devices->total_rw_bytes = 0;
out:
	return ret;
}

int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
		       fmode_t flags, void *holder)
{
	int ret;

	mutex_lock(&uuid_mutex);
	if (fs_devices->opened) {
		fs_devices->opened++;
		ret = 0;
	} else {
		ret = __btrfs_open_devices(fs_devices, flags, holder);
	}
	mutex_unlock(&uuid_mutex);
	return ret;
}

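/*
 * Read the super block of a single device (e.g. for a device scan) and
 * register it with the in-memory device lists via device_list_add().
 */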
int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
			  struct btrfs_fs_devices **fs_devices_ret)
{
	struct btrfs_super_block *disk_super;
	struct block_device *bdev;
	struct buffer_head *bh;
	int ret;
	u64 devid;
	u64 transid;

	mutex_lock(&uuid_mutex);

	flags |= FMODE_EXCL;
	bdev = blkdev_get_by_path(path, flags, holder);

	if (IS_ERR(bdev)) {
		ret = PTR_ERR(bdev);
		goto error;
	}

	ret = set_blocksize(bdev, 4096);
	if (ret)
		goto error_close;
	bh = btrfs_read_dev_super(bdev);
	if (!bh) {
		ret = -EINVAL;
		goto error_close;
	}
	disk_super = (struct btrfs_super_block *)bh->b_data;
	devid = btrfs_stack_device_id(&disk_super->dev_item);
	transid = btrfs_super_generation(disk_super);
	if (disk_super->label[0])
		printk(KERN_INFO "device label %s ", disk_super->label);
	else
		printk(KERN_INFO "device fsid %pU ", disk_super->fsid);
	printk(KERN_CONT "devid %llu transid %llu %s\n",
	       (unsigned long long)devid, (unsigned long long)transid, path);
	ret = device_list_add(path, disk_super, devid, fs_devices_ret);

	brelse(bh);
error_close:
	blkdev_put(bdev, flags);
error:
	mutex_unlock(&uuid_mutex);
	return ret;
}

/* helper to account the used device space in the range */
int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
				   u64 end, u64 *length)
{
	struct btrfs_key key;
	struct btrfs_root *root = device->dev_root;
	struct btrfs_dev_extent *dev_extent;
	struct btrfs_path *path;
	u64 extent_end;
	int ret;
	int slot;
	struct extent_buffer *l;

	*length = 0;

	if (start >= device->total_bytes)
		return 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = 2;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid, key.type);
		if (ret < 0)
			goto out;
	}

	while (1) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto out;

			break;
		}
		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid < device->devid)
			goto next;

		if (key.objectid > device->devid)
			break;

		if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
			goto next;

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		extent_end = key.offset + btrfs_dev_extent_length(l,
								  dev_extent);
		if (key.offset <= start && extent_end > end) {
			*length = end - start + 1;
			break;
		} else if (key.offset <= start && extent_end > start)
			*length += extent_end - start;
		else if (key.offset > start && extent_end <= end)
			*length += extent_end - key.offset;
		else if (key.offset > start && key.offset <= end) {
			*length += end - key.offset + 1;
			break;
		} else if (key.offset > end)
			break;

next:
		path->slots[0]++;
	}
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * find_free_dev_extent - find free space in the specified device
 * @trans:	transaction handler
 * @device:	the device which we search the free space in
 * @num_bytes:	the size of the free space that we need
 * @start:	store the start of the free space.
 * @len:	the size of the free space that we find, or the size of the
 *		max free space if we don't find suitable free space
 *
 * this uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents
 *
 * @start is used to store the start of the free space if we find it. But if
 * we don't find suitable free space, it will be used to store the start
 * position of the max free space.
 *
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the max free space.
 */
int find_free_dev_extent(struct btrfs_trans_handle *trans,
			 struct btrfs_device *device, u64 num_bytes,
			 u64 *start, u64 *len)
{
	struct btrfs_key key;
	struct btrfs_root *root = device->dev_root;
	struct btrfs_dev_extent *dev_extent;
	struct btrfs_path *path;
	u64 hole_size;
	u64 max_hole_start;
	u64 max_hole_size;
	u64 extent_end;
	u64 search_start;
	u64 search_end = device->total_bytes;
	int ret;
	int slot;
	struct extent_buffer *l;

	/* FIXME use last free of some kind */

	/* we don't want to overwrite the superblock on the drive,
	 * so we make sure to start at an offset of at least 1MB
	 */
	search_start = max(root->fs_info->alloc_start, 1024ull * 1024);

	max_hole_start = search_start;
	max_hole_size = 0;

	if (search_start >= search_end) {
		ret = -ENOSPC;
		goto error;
	}

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto error;
	}
	path->reada = 2;

	key.objectid = device->devid;
	key.offset = search_start;
	key.type = BTRFS_DEV_EXTENT_KEY;

	ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid, key.type);
		if (ret < 0)
			goto out;
	}

	while (1) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto out;

			break;
		}
		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid < device->devid)
			goto next;

		if (key.objectid > device->devid)
			break;

		if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
			goto next;

		if (key.offset > search_start) {
			hole_size = key.offset - search_start;

			if (hole_size > max_hole_size) {
				max_hole_start = search_start;
				max_hole_size = hole_size;
			}

			/*
			 * If this free space is greater than what we need,
			 * it must be the max free space that we have found
			 * until now, so max_hole_start must point to the
			 * start of this free space and the length of this
			 * free space is stored in max_hole_size.  Thus, we
			 * return max_hole_start and max_hole_size and go
			 * back to the caller.
			 */
			if (hole_size >= num_bytes) {
				ret = 0;
				goto out;
			}
		}

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		extent_end = key.offset + btrfs_dev_extent_length(l,
								  dev_extent);
		if (extent_end > search_start)
			search_start = extent_end;
next:
		path->slots[0]++;
		cond_resched();
	}

	hole_size = search_end - search_start;
	if (hole_size > max_hole_size) {
		max_hole_start = search_start;
		max_hole_size = hole_size;
	}

	/* See above. */
	if (hole_size < num_bytes)
		ret = -ENOSPC;
	else
		ret = 0;

out:
	btrfs_free_path(path);
error:
	*start = max_hole_start;
	if (len)
		*len = max_hole_size;
	return ret;
}

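/*
 * Remove the dev extent item that covers @start on @device and credit
 * its length back against device->bytes_used.
 */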
static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
				 struct btrfs_device *device,
				 u64 start)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_root *root = device->dev_root;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct extent_buffer *leaf = NULL;
	struct btrfs_dev_extent *extent = NULL;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid,
					  BTRFS_DEV_EXTENT_KEY);
		if (ret)
			goto out;
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
		BUG_ON(found_key.offset > start || found_key.offset +
		       btrfs_dev_extent_length(leaf, extent) < start);
	} else if (ret == 0) {
		leaf = path->nodes[0];
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
	}
	BUG_ON(ret);

	if (device->bytes_used > 0)
		device->bytes_used -= btrfs_dev_extent_length(leaf, extent);
	ret = btrfs_del_item(trans, root, path);

out:
	btrfs_free_path(path);
	return ret;
}

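/*
 * Insert a dev extent item recording that [start, start + num_bytes)
 * on @device backs the chunk at @chunk_offset.
 */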
int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
			   struct btrfs_device *device,
			   u64 chunk_tree, u64 chunk_objectid,
			   u64 chunk_offset, u64 start, u64 num_bytes)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_root *root = device->dev_root;
	struct btrfs_dev_extent *extent;
	struct extent_buffer *leaf;
	struct btrfs_key key;

	WARN_ON(!device->in_fs_metadata);
	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;
	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(*extent));
	BUG_ON(ret);

	leaf = path->nodes[0];
	extent = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_dev_extent);
	btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
	btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
	btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);

	write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
		    (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent),
		    BTRFS_UUID_SIZE);

	btrfs_set_dev_extent_length(leaf, extent, num_bytes);
	btrfs_mark_buffer_dirty(leaf);
	btrfs_free_path(path);
	return ret;
}

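/*
 * Find the logical offset just past the last chunk item for @objectid,
 * i.e. where the next chunk can be placed; 0 if none exist yet.
 */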
static noinline int find_next_chunk(struct btrfs_root *root,
				    u64 objectid, u64 *offset)
{
	struct btrfs_path *path;
	int ret;
	struct btrfs_key key;
	struct btrfs_chunk *chunk;
	struct btrfs_key found_key;

	path = btrfs_alloc_path();
	BUG_ON(!path);

	key.objectid = objectid;
	key.offset = (u64)-1;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto error;

	BUG_ON(ret == 0);

	ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
	if (ret) {
		*offset = 0;
	} else {
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		if (found_key.objectid != objectid)
			*offset = 0;
		else {
			chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
					       struct btrfs_chunk);
			*offset = found_key.offset +
				btrfs_chunk_length(path->nodes[0], chunk);
		}
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}

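/*
 * Pick the next unused devid: one past the highest existing dev item,
 * or 1 for the first device.
 */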
static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_path *path;

	root = root->fs_info->chunk_root;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto error;

	BUG_ON(ret == 0);

	ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID,
				  BTRFS_DEV_ITEM_KEY);
	if (ret) {
		*objectid = 1;
	} else {
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		*objectid = found_key.offset + 1;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}

/*
 * the device information is stored in the chunk root
 * the btrfs_device struct should be fully filled in
 */
int btrfs_add_device(struct btrfs_trans_handle *trans,
		     struct btrfs_root *root,
		     struct btrfs_device *device)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_dev_item *dev_item;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	unsigned long ptr;

	root = root->fs_info->chunk_root;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(*dev_item));
	if (ret)
		goto out;

	leaf = path->nodes[0];
	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

	btrfs_set_device_id(leaf, dev_item, device->devid);
	btrfs_set_device_generation(leaf, dev_item, 0);
	btrfs_set_device_type(leaf, dev_item, device->type);
	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
	btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
	btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
	btrfs_set_device_group(leaf, dev_item, 0);
	btrfs_set_device_seek_speed(leaf, dev_item, 0);
	btrfs_set_device_bandwidth(leaf, dev_item, 0);
	btrfs_set_device_start_offset(leaf, dev_item, 0);

	ptr = (unsigned long)btrfs_device_uuid(dev_item);
	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
	ptr = (unsigned long)btrfs_device_fsid(dev_item);
	write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE);
	btrfs_mark_buffer_dirty(leaf);
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

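/*
 * Delete the dev item for @device from the chunk root, in its own
 * transaction.
 */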
static int btrfs_rm_dev_item(struct btrfs_root *root,
			     struct btrfs_device *device)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_trans_handle *trans;

	root = root->fs_info->chunk_root;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;
	lock_chunks(root);

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0)
		goto out;

	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	ret = btrfs_del_item(trans, root, path);
	if (ret)
		goto out;
out:
	btrfs_free_path(path);
	unlock_chunks(root);
	btrfs_commit_transaction(trans, root);
	return ret;
}

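/*
 * Remove a device from the filesystem.  @device_path may also be the
 * literal string "missing", which removes the first device that is
 * recorded in the metadata but has no block device present.
 */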
int btrfs_rm_device(struct btrfs_root *root, char *device_path)
{
	struct btrfs_device *device;
	struct btrfs_device *next_device;
	struct block_device *bdev;
	struct buffer_head *bh = NULL;
	struct btrfs_super_block *disk_super;
	struct btrfs_fs_devices *cur_devices;
	u64 all_avail;
	u64 devid;
	u64 num_devices;
	u8 *dev_uuid;
	int ret = 0;
	bool clear_super = false;

	mutex_lock(&uuid_mutex);
	mutex_lock(&root->fs_info->volume_mutex);

	all_avail = root->fs_info->avail_data_alloc_bits |
		root->fs_info->avail_system_alloc_bits |
		root->fs_info->avail_metadata_alloc_bits;

	if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
	    root->fs_info->fs_devices->num_devices <= 4) {
		printk(KERN_ERR "btrfs: unable to go below four devices "
		       "on raid10\n");
		ret = -EINVAL;
		goto out;
	}

	if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
	    root->fs_info->fs_devices->num_devices <= 2) {
		printk(KERN_ERR "btrfs: unable to go below two "
		       "devices on raid1\n");
		ret = -EINVAL;
		goto out;
	}

	if (strcmp(device_path, "missing") == 0) {
		struct list_head *devices;
		struct btrfs_device *tmp;

		device = NULL;
		devices = &root->fs_info->fs_devices->devices;
		/*
		 * It is safe to read the devices since the volume_mutex
		 * is held.
		 */
		list_for_each_entry(tmp, devices, dev_list) {
			if (tmp->in_fs_metadata && !tmp->bdev) {
				device = tmp;
				break;
			}
		}
		bdev = NULL;
		bh = NULL;
		disk_super = NULL;
		if (!device) {
			printk(KERN_ERR "btrfs: no missing devices found to "
			       "remove\n");
			goto out;
		}
	} else {
		bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL,
					  root->fs_info->bdev_holder);
		if (IS_ERR(bdev)) {
			ret = PTR_ERR(bdev);
			goto out;
		}

		set_blocksize(bdev, 4096);
		bh = btrfs_read_dev_super(bdev);
		if (!bh) {
			ret = -EINVAL;
			goto error_close;
		}
		disk_super = (struct btrfs_super_block *)bh->b_data;
		devid = btrfs_stack_device_id(&disk_super->dev_item);
		dev_uuid = disk_super->dev_item.uuid;
		device = btrfs_find_device(root, devid, dev_uuid,
					   disk_super->fsid);
		if (!device) {
			ret = -ENOENT;
			goto error_brelse;
		}
	}

	if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
		printk(KERN_ERR "btrfs: unable to remove the only writeable "
		       "device\n");
		ret = -EINVAL;
		goto error_brelse;
	}

	if (device->writeable) {
		lock_chunks(root);
		list_del_init(&device->dev_alloc_list);
		unlock_chunks(root);
		root->fs_info->fs_devices->rw_devices--;
		clear_super = true;
	}

	ret = btrfs_shrink_device(device, 0);
	if (ret)
		goto error_undo;

	ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
	if (ret)
		goto error_undo;

	device->in_fs_metadata = 0;
	btrfs_scrub_cancel_dev(root, device);

	/*
	 * the device list mutex makes sure that we don't change
	 * the device list while someone else is writing out all
	 * the device supers.
	 */

	cur_devices = device->fs_devices;
	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
	list_del_rcu(&device->dev_list);

	device->fs_devices->num_devices--;

	if (device->missing)
		root->fs_info->fs_devices->missing_devices--;

	next_device = list_entry(root->fs_info->fs_devices->devices.next,
				 struct btrfs_device, dev_list);
	if (device->bdev == root->fs_info->sb->s_bdev)
		root->fs_info->sb->s_bdev = next_device->bdev;
	if (device->bdev == root->fs_info->fs_devices->latest_bdev)
		root->fs_info->fs_devices->latest_bdev = next_device->bdev;

	if (device->bdev)
		device->fs_devices->open_devices--;

	call_rcu(&device->rcu, free_device);
	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);

	num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
	btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices);

	if (cur_devices->open_devices == 0) {
		struct btrfs_fs_devices *fs_devices;
		fs_devices = root->fs_info->fs_devices;
		while (fs_devices) {
			if (fs_devices->seed == cur_devices)
				break;
			fs_devices = fs_devices->seed;
		}
		fs_devices->seed = cur_devices->seed;
		cur_devices->seed = NULL;
		lock_chunks(root);
		__btrfs_close_devices(cur_devices);
		unlock_chunks(root);
		free_fs_devices(cur_devices);
	}

	/*
	 * at this point, the device is zero sized.  We want to
	 * remove it from the devices list and zero out the old super
	 */
	if (clear_super) {
		/* make sure this device isn't detected as part of
		 * the FS anymore
		 */
		memset(&disk_super->magic, 0, sizeof(disk_super->magic));
		set_buffer_dirty(bh);
		sync_dirty_buffer(bh);
	}

	ret = 0;

error_brelse:
	brelse(bh);
error_close:
	if (bdev)
		blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
out:
	mutex_unlock(&root->fs_info->volume_mutex);
	mutex_unlock(&uuid_mutex);
	return ret;
error_undo:
	if (device->writeable) {
		lock_chunks(root);
		list_add(&device->dev_alloc_list,
			 &root->fs_info->fs_devices->alloc_list);
		unlock_chunks(root);
		root->fs_info->fs_devices->rw_devices++;
	}
	goto error_brelse;
}

/*
 * does all the dirty work required for changing the file system's UUID.
 */
static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
				struct btrfs_root *root)
{
	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
	struct btrfs_fs_devices *old_devices;
	struct btrfs_fs_devices *seed_devices;
	struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
	struct btrfs_device *device;
	u64 super_flags;

	BUG_ON(!mutex_is_locked(&uuid_mutex));
	if (!fs_devices->seeding)
		return -EINVAL;

	seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
	if (!seed_devices)
		return -ENOMEM;

	old_devices = clone_fs_devices(fs_devices);
	if (IS_ERR(old_devices)) {
		kfree(seed_devices);
		return PTR_ERR(old_devices);
	}

	list_add(&old_devices->list, &fs_uuids);

	memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
	seed_devices->opened = 1;
	INIT_LIST_HEAD(&seed_devices->devices);
	INIT_LIST_HEAD(&seed_devices->alloc_list);
	mutex_init(&seed_devices->device_list_mutex);

	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
	list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
			     synchronize_rcu);
	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);

	list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
	list_for_each_entry(device, &seed_devices->devices, dev_list) {
		device->fs_devices = seed_devices;
	}

	fs_devices->seeding = 0;
	fs_devices->num_devices = 0;
	fs_devices->open_devices = 0;
	fs_devices->seed = seed_devices;

	generate_random_uuid(fs_devices->fsid);
	memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
	memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
	super_flags = btrfs_super_flags(disk_super) &
		      ~BTRFS_SUPER_FLAG_SEEDING;
	btrfs_set_super_flags(disk_super, super_flags);

	return 0;
}

/*
 * store the expected generation for seed devices in device items.
 */
static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
{
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_dev_item *dev_item;
	struct btrfs_device *device;
	struct btrfs_key key;
	u8 fs_uuid[BTRFS_UUID_SIZE];
	u8 dev_uuid[BTRFS_UUID_SIZE];
	u64 devid;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	root = root->fs_info->chunk_root;
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.offset = 0;
	key.type = BTRFS_DEV_ITEM_KEY;

	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
		if (ret < 0)
			goto error;

		leaf = path->nodes[0];
next_slot:
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret > 0)
				break;
			if (ret < 0)
				goto error;
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
			btrfs_release_path(path);
			continue;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
		    key.type != BTRFS_DEV_ITEM_KEY)
			break;

		dev_item = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_dev_item);
		devid = btrfs_device_id(leaf, dev_item);
		read_extent_buffer(leaf, dev_uuid,
				   (unsigned long)btrfs_device_uuid(dev_item),
				   BTRFS_UUID_SIZE);
		read_extent_buffer(leaf, fs_uuid,
				   (unsigned long)btrfs_device_fsid(dev_item),
				   BTRFS_UUID_SIZE);
		device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
		BUG_ON(!device);

		if (device->fs_devices->seeding) {
			btrfs_set_device_generation(leaf, dev_item,
						    device->generation);
			btrfs_mark_buffer_dirty(leaf);
		}

		path->slots[0]++;
		goto next_slot;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}

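/*
 * Add a new device to a mounted filesystem: open @device_path
 * exclusively, assign it the next free devid and publish it on the
 * device lists.  Adding to a seed filesystem additionally sprouts a new
 * writable fsid via btrfs_prepare_sprout()/btrfs_finish_sprout().
 */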
int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
{
	struct request_queue *q;
	struct btrfs_trans_handle *trans;
	struct btrfs_device *device;
	struct block_device *bdev;
	struct list_head *devices;
	struct super_block *sb = root->fs_info->sb;
	u64 total_bytes;
	int seeding_dev = 0;
	int ret = 0;

	if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
		return -EINVAL;

	bdev = blkdev_get_by_path(device_path, FMODE_EXCL,
				  root->fs_info->bdev_holder);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);

	if (root->fs_info->fs_devices->seeding) {
		seeding_dev = 1;
		down_write(&sb->s_umount);
		mutex_lock(&uuid_mutex);
	}

	filemap_write_and_wait(bdev->bd_inode->i_mapping);
	mutex_lock(&root->fs_info->volume_mutex);

	devices = &root->fs_info->fs_devices->devices;
	/*
	 * we have the volume lock, so we don't need the extra
	 * device list mutex while reading the list here.
	 */
	list_for_each_entry(device, devices, dev_list) {
		if (device->bdev == bdev) {
			ret = -EEXIST;
			goto error;
		}
	}

	device = kzalloc(sizeof(*device), GFP_NOFS);
	if (!device) {
		/* we can safely leave the fs_devices entry around */
		ret = -ENOMEM;
		goto error;
	}

	device->name = kstrdup(device_path, GFP_NOFS);
	if (!device->name) {
		kfree(device);
		ret = -ENOMEM;
		goto error;
	}

	ret = find_next_devid(root, &device->devid);
	if (ret) {
		kfree(device->name);
		kfree(device);
		goto error;
	}

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		kfree(device->name);
		kfree(device);
		ret = PTR_ERR(trans);
		goto error;
	}

	lock_chunks(root);

	q = bdev_get_queue(bdev);
	if (blk_queue_discard(q))
		device->can_discard = 1;
	device->writeable = 1;
	device->work.func = pending_bios_fn;
	generate_random_uuid(device->uuid);
	spin_lock_init(&device->io_lock);
	device->generation = trans->transid;
	device->io_width = root->sectorsize;
	device->io_align = root->sectorsize;
	device->sector_size = root->sectorsize;
	device->total_bytes = i_size_read(bdev->bd_inode);
	device->disk_total_bytes = device->total_bytes;
	device->dev_root = root->fs_info->dev_root;
	device->bdev = bdev;
	device->in_fs_metadata = 1;
	device->mode = FMODE_EXCL;
	set_blocksize(device->bdev, 4096);

	if (seeding_dev) {
		sb->s_flags &= ~MS_RDONLY;
		ret = btrfs_prepare_sprout(trans, root);
		BUG_ON(ret);
	}

	device->fs_devices = root->fs_info->fs_devices;

	/*
	 * we don't want write_supers to jump in here with our device
	 * half setup
	 */
	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
	list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices);
	list_add(&device->dev_alloc_list,
		 &root->fs_info->fs_devices->alloc_list);
	root->fs_info->fs_devices->num_devices++;
	root->fs_info->fs_devices->open_devices++;
	root->fs_info->fs_devices->rw_devices++;
	if (device->can_discard)
		root->fs_info->fs_devices->num_can_discard++;
	root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;

	if (!blk_queue_nonrot(bdev_get_queue(bdev)))
		root->fs_info->fs_devices->rotating = 1;

	total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
	btrfs_set_super_total_bytes(&root->fs_info->super_copy,
				    total_bytes + device->total_bytes);

	total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
	btrfs_set_super_num_devices(&root->fs_info->super_copy,
				    total_bytes + 1);
	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);

	if (seeding_dev) {
		ret = init_first_rw_device(trans, root, device);
		BUG_ON(ret);
		ret = btrfs_finish_sprout(trans, root);
		BUG_ON(ret);
	} else {
		ret = btrfs_add_device(trans, root, device);
	}

	/*
	 * we've got more storage, clear any full flags on the space
	 * infos
	 */
	btrfs_clear_space_info_full(root->fs_info);

	unlock_chunks(root);
	btrfs_commit_transaction(trans, root);

	if (seeding_dev) {
		mutex_unlock(&uuid_mutex);
		up_write(&sb->s_umount);

		ret = btrfs_relocate_sys_chunks(root);
		BUG_ON(ret);
	}
out:
	mutex_unlock(&root->fs_info->volume_mutex);
	return ret;
error:
	blkdev_put(bdev, FMODE_EXCL);
	if (seeding_dev) {
		mutex_unlock(&uuid_mutex);
		up_write(&sb->s_umount);
	}
	goto out;
}

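/*
 * Write the current in-memory sizes and geometry of @device back into
 * its dev item in the chunk root.
 */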
static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
					struct btrfs_device *device)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_root *root;
	struct btrfs_dev_item *dev_item;
	struct extent_buffer *leaf;
	struct btrfs_key key;

	root = device->dev_root->fs_info->chunk_root;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
	if (ret < 0)
		goto out;

	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	leaf = path->nodes[0];
	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

	btrfs_set_device_id(leaf, dev_item, device->devid);
	btrfs_set_device_type(leaf, dev_item, device->type);
	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
	btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes);
	btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
	btrfs_mark_buffer_dirty(leaf);

out:
	btrfs_free_path(path);
	return ret;
}

static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
			       struct btrfs_device *device, u64 new_size)
{
	struct btrfs_super_block *super_copy =
		&device->dev_root->fs_info->super_copy;
	u64 old_total = btrfs_super_total_bytes(super_copy);
	u64 diff = new_size - device->total_bytes;

	if (!device->writeable)
		return -EACCES;
	if (new_size <= device->total_bytes)
		return -EINVAL;

	btrfs_set_super_total_bytes(super_copy, old_total + diff);
	device->fs_devices->total_rw_bytes += diff;

	device->total_bytes = new_size;
	device->disk_total_bytes = new_size;
	btrfs_clear_space_info_full(device->dev_root->fs_info);

	return btrfs_update_device(trans, device);
}

int btrfs_grow_device(struct btrfs_trans_handle *trans,
		      struct btrfs_device *device, u64 new_size)
{
	int ret;
	lock_chunks(device->dev_root);
	ret = __btrfs_grow_device(trans, device, new_size);
	unlock_chunks(device->dev_root);
	return ret;
}

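/*
 * Delete the chunk item for @chunk_offset from the chunk root.
 */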
static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root,
			    u64 chunk_tree, u64 chunk_objectid,
			    u64 chunk_offset)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;

	root = root->fs_info->chunk_root;
	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = chunk_objectid;
	key.offset = chunk_offset;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	BUG_ON(ret);

	ret = btrfs_del_item(trans, root, path);
	btrfs_free_path(path);
	return ret;
}

static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
			       chunk_offset)
{
	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
	struct btrfs_disk_key *disk_key;
	struct btrfs_chunk *chunk;
	u8 *ptr;
	int ret = 0;
	u32 num_stripes;
	u32 array_size;
	u32 len = 0;
	u32 cur;
	struct btrfs_key key;

	array_size = btrfs_super_sys_array_size(super_copy);

	ptr = super_copy->sys_chunk_array;
	cur = 0;

	while (cur < array_size) {
		disk_key = (struct btrfs_disk_key *)ptr;
		btrfs_disk_key_to_cpu(&key, disk_key);

		len = sizeof(*disk_key);

		if (key.type == BTRFS_CHUNK_ITEM_KEY) {
			chunk = (struct btrfs_chunk *)(ptr + len);
			num_stripes = btrfs_stack_chunk_num_stripes(chunk);
			len += btrfs_chunk_item_size(num_stripes);
		} else {
			ret = -EIO;
			break;
		}
		if (key.objectid == chunk_objectid &&
		    key.offset == chunk_offset) {
			memmove(ptr, ptr + len, array_size - (cur + len));
			array_size -= len;
			btrfs_set_super_sys_array_size(super_copy, array_size);
		} else {
			ptr += len;
			cur += len;
		}
	}
	return ret;
}

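/*
 * Move everything out of a chunk and delete it: relocate the live
 * extents, then drop the dev extents, the chunk item (plus its copy in
 * the superblock's sys_chunk_array for system chunks), the block group
 * and the extent mapping.
 */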
static int btrfs_relocate_chunk(struct btrfs_root *root,
				u64 chunk_tree, u64 chunk_objectid,
				u64 chunk_offset)
{
	struct extent_map_tree *em_tree;
	struct btrfs_root *extent_root;
	struct btrfs_trans_handle *trans;
	struct extent_map *em;
	struct map_lookup *map;
	int ret;
	int i;

	root = root->fs_info->chunk_root;
	extent_root = root->fs_info->extent_root;
	em_tree = &root->fs_info->mapping_tree.map_tree;

	ret = btrfs_can_relocate(extent_root, chunk_offset);
	if (ret)
		return -ENOSPC;

	/* step one, relocate all the extents inside this chunk */
	ret = btrfs_relocate_block_group(extent_root, chunk_offset);
	if (ret)
		return ret;

	trans = btrfs_start_transaction(root, 0);
	BUG_ON(IS_ERR(trans));

	lock_chunks(root);

	/*
	 * step two, delete the device extents and the
	 * chunk tree entries
	 */
	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
	read_unlock(&em_tree->lock);

	BUG_ON(em->start > chunk_offset ||
	       em->start + em->len < chunk_offset);
	map = (struct map_lookup *)em->bdev;

	for (i = 0; i < map->num_stripes; i++) {
		ret = btrfs_free_dev_extent(trans, map->stripes[i].dev,
					    map->stripes[i].physical);
		BUG_ON(ret);

		if (map->stripes[i].dev) {
			ret = btrfs_update_device(trans, map->stripes[i].dev);
			BUG_ON(ret);
		}
	}
	ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
			       chunk_offset);

	BUG_ON(ret);

	trace_btrfs_chunk_free(root, map, chunk_offset, em->len);

	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
		ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
		BUG_ON(ret);
	}

	ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
	BUG_ON(ret);

	write_lock(&em_tree->lock);
	remove_extent_mapping(em_tree, em);
	write_unlock(&em_tree->lock);

	kfree(map);
	em->bdev = NULL;

	/* once for the tree */
	free_extent_map(em);
	/* once for us */
	free_extent_map(em);

	unlock_chunks(root);
	btrfs_end_transaction(trans, root);
	return 0;
}

static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
{
	struct btrfs_root *chunk_root = root->fs_info->chunk_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_chunk *chunk;
	struct btrfs_key key;
	struct btrfs_key found_key;
	u64 chunk_tree = chunk_root->root_key.objectid;
	u64 chunk_type;
	bool retried = false;
	int failed = 0;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

again:
	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.offset = (u64)-1;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	while (1) {
		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
		if (ret < 0)
			goto error;
		BUG_ON(ret == 0);

		ret = btrfs_previous_item(chunk_root, path, key.objectid,
					  key.type);
		if (ret