
/usr/src/uts/common/fs/zfs/zvol.c

https://bitbucket.org/0xffea/illumos-dccp
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 *
 * Portions Copyright 2010 Robert Milkowski
 *
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2012 by Delphix. All rights reserved.
 */

/*
 * ZFS volume emulation driver.
 *
 * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
 * Volumes are accessed through the symbolic links named:
 *
 *	/dev/zvol/dsk/<pool_name>/<dataset_name>
 *	/dev/zvol/rdsk/<pool_name>/<dataset_name>
 *
 * These links are created by the /dev filesystem (sdev_zvolops.c).
 * Volumes are persistent through reboot. No user command needs to be
 * run before opening and using a device.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/uio.h>
#include <sys/buf.h>
#include <sys/modctl.h>
#include <sys/open.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/cmn_err.h>
#include <sys/stat.h>
#include <sys/zap.h>
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/dmu_traverse.h>
#include <sys/dnode.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dkio.h>
#include <sys/efi_partition.h>
#include <sys/byteorder.h>
#include <sys/pathname.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/crc32.h>
#include <sys/dirent.h>
#include <sys/policy.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_ioctl.h>
#include <sys/mkdev.h>
#include <sys/zil.h>
#include <sys/refcount.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_rlock.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_impl.h>
#include <sys/zvol.h>
#include <sys/dumphdr.h>
#include <sys/zil_impl.h>
#include "zfs_namecheck.h"

void *zfsdev_state;
static char *zvol_tag = "zvol_tag";

#define	ZVOL_DUMPSIZE		"dumpsize"

/*
 * This lock protects the zfsdev_state structure from being modified
 * while it's being used, e.g. an open that comes in before a create
 * finishes. It also protects temporary opens of the dataset so that,
 * e.g., an open doesn't get a spurious EBUSY.
 */
kmutex_t zfsdev_state_lock;
static uint32_t zvol_minors;

typedef struct zvol_extent {
	list_node_t	ze_node;
	dva_t		ze_dva;		/* dva associated with this extent */
	uint64_t	ze_nblks;	/* number of blocks in extent */
} zvol_extent_t;

/*
 * The in-core state of each volume.
 */
typedef struct zvol_state {
	char		zv_name[MAXPATHLEN]; /* pool/dd name */
	uint64_t	zv_volsize;	/* amount of space we advertise */
	uint64_t	zv_volblocksize; /* volume block size */
	minor_t		zv_minor;	/* minor number */
	uint8_t		zv_min_bs;	/* minimum addressable block shift */
	uint8_t		zv_flags;	/* readonly, dumpified, etc. */
	objset_t	*zv_objset;	/* objset handle */
	uint32_t	zv_open_count[OTYPCNT];	/* open counts */
	uint32_t	zv_total_opens;	/* total open count */
	zilog_t		*zv_zilog;	/* ZIL handle */
	list_t		zv_extents;	/* List of extents for dump */
	znode_t		zv_znode;	/* for range locking */
	dmu_buf_t	*zv_dbuf;	/* bonus handle */
} zvol_state_t;

/*
 * zvol specific flags
 */
#define	ZVOL_RDONLY	0x1
#define	ZVOL_DUMPIFIED	0x2
#define	ZVOL_EXCL	0x4
#define	ZVOL_WCE	0x8

/*
 * zvol maximum transfer in one DMU tx.
 */
int zvol_maxphys = DMU_MAX_ACCESS/2;

extern int zfs_set_prop_nvlist(const char *, zprop_source_t,
    nvlist_t *, nvlist_t *);
static int zvol_remove_zv(zvol_state_t *);
static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio);
static int zvol_dumpify(zvol_state_t *zv);
static int zvol_dump_fini(zvol_state_t *zv);
static int zvol_dump_init(zvol_state_t *zv, boolean_t resize);
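
/*
 * Update the device's "Size" and "Nblocks" properties and tell specfs
 * to invalidate its cached size so readers see the new volume size.
 */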
static void
zvol_size_changed(uint64_t volsize, major_t maj, minor_t min)
{
	dev_t dev = makedevice(maj, min);

	VERIFY(ddi_prop_update_int64(dev, zfs_dip,
	    "Size", volsize) == DDI_SUCCESS);
	VERIFY(ddi_prop_update_int64(dev, zfs_dip,
	    "Nblocks", lbtodb(volsize)) == DDI_SUCCESS);

	/* Notify specfs to invalidate the cached size */
	spec_size_invalidate(dev, VBLK);
	spec_size_invalidate(dev, VCHR);
}
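
/*
 * Sanity-check a proposed volume size: non-zero, a multiple of the
 * block size, and (on 32-bit kernels) addressable through specfs.
 */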
int
zvol_check_volsize(uint64_t volsize, uint64_t blocksize)
{
	if (volsize == 0)
		return (EINVAL);

	if (volsize % blocksize != 0)
		return (EINVAL);

#ifdef _ILP32
	if (volsize - 1 > SPEC_MAXOFFSET_T)
		return (EOVERFLOW);
#endif
	return (0);
}
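
/*
 * A valid volblocksize is a power of two between SPA_MINBLOCKSIZE and
 * SPA_MAXBLOCKSIZE.
 */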
int
zvol_check_volblocksize(uint64_t volblocksize)
{
	if (volblocksize < SPA_MINBLOCKSIZE ||
	    volblocksize > SPA_MAXBLOCKSIZE ||
	    !ISP2(volblocksize))
		return (EDOM);

	return (0);
}
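
/*
 * Add the volsize (stored in the ZAP) and volblocksize (from the DMU
 * object) to the caller's property nvlist.
 */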
int
zvol_get_stats(objset_t *os, nvlist_t *nv)
{
	int error;
	dmu_object_info_t doi;
	uint64_t val;

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val);
	if (error)
		return (error);

	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val);

	error = dmu_object_info(os, ZVOL_OBJ, &doi);

	if (error == 0) {
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE,
		    doi.doi_data_block_size);
	}

	return (error);
}
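
/*
 * Find the zvol with the given dataset name by scanning the allocated
 * minors; returns NULL if none exists. Caller holds zfsdev_state_lock.
 */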
static zvol_state_t *
zvol_minor_lookup(const char *name)
{
	minor_t minor;
	zvol_state_t *zv;

	ASSERT(MUTEX_HELD(&zfsdev_state_lock));

	for (minor = 1; minor <= ZFSDEV_MAX_MINOR; minor++) {
		zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
		if (zv == NULL)
			continue;
		if (strcmp(zv->zv_name, name) == 0)
			return (zv);
	}

	return (NULL);
}

/* extent mapping arg */
struct maparg {
	zvol_state_t	*ma_zv;
	uint64_t	ma_blks;
};
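
/*
 * traverse_dataset() callback: record the DVA of each level-0 block of
 * the volume, extending the previous extent when blocks are physically
 * contiguous. Gang blocks can't be dumped to, so they abort the walk.
 */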
/*ARGSUSED*/
static int
zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf,
    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
	struct maparg *ma = arg;
	zvol_extent_t *ze;
	int bs = ma->ma_zv->zv_volblocksize;

	if (bp == NULL || zb->zb_object != ZVOL_OBJ || zb->zb_level != 0)
		return (0);

	VERIFY3U(ma->ma_blks, ==, zb->zb_blkid);
	ma->ma_blks++;

	/* Abort immediately if we have encountered gang blocks */
	if (BP_IS_GANG(bp))
		return (EFRAGS);

	/*
	 * See if the block is at the end of the previous extent.
	 */
	ze = list_tail(&ma->ma_zv->zv_extents);
	if (ze &&
	    DVA_GET_VDEV(BP_IDENTITY(bp)) == DVA_GET_VDEV(&ze->ze_dva) &&
	    DVA_GET_OFFSET(BP_IDENTITY(bp)) ==
	    DVA_GET_OFFSET(&ze->ze_dva) + ze->ze_nblks * bs) {
		ze->ze_nblks++;
		return (0);
	}

	dprintf_bp(bp, "%s", "next blkptr:");

	/* start a new extent */
	ze = kmem_zalloc(sizeof (zvol_extent_t), KM_SLEEP);
	ze->ze_dva = bp->blk_dva[0];	/* structure assignment */
	ze->ze_nblks = 1;
	list_insert_tail(&ma->ma_zv->zv_extents, ze);
	return (0);
}

static void
zvol_free_extents(zvol_state_t *zv)
{
	zvol_extent_t *ze;

	while ((ze = list_head(&zv->zv_extents)) != NULL) {
		list_remove(&zv->zv_extents, ze);
		kmem_free(ze, sizeof (zvol_extent_t));
	}
}
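
/*
 * Rebuild the extent list by traversing every block of the volume. The
 * traversal must account for exactly volsize/volblocksize blocks or the
 * mapping is unusable (EIO).
 */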
static int
zvol_get_lbas(zvol_state_t *zv)
{
	objset_t *os = zv->zv_objset;
	struct maparg	ma;
	int		err;

	ma.ma_zv = zv;
	ma.ma_blks = 0;
	zvol_free_extents(zv);

	/* commit any in-flight changes before traversing the dataset */
	txg_wait_synced(dmu_objset_pool(os), 0);
	err = traverse_dataset(dmu_objset_ds(os), 0,
	    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, zvol_map_block, &ma);
	if (err || ma.ma_blks != (zv->zv_volsize / zv->zv_volblocksize)) {
		zvol_free_extents(zv);
		return (err ? err : EIO);
	}

	return (0);
}

/* ARGSUSED */
void
zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
{
	zfs_creat_t *zct = arg;
	nvlist_t *nvprops = zct->zct_props;
	int error;
	uint64_t volblocksize, volsize;

	VERIFY(nvlist_lookup_uint64(nvprops,
	    zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0);
	if (nvlist_lookup_uint64(nvprops,
	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0)
		volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);

	/*
	 * These properties must be removed from the list so the generic
	 * property setting step won't apply to them.
	 */
	VERIFY(nvlist_remove_all(nvprops,
	    zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0);
	(void) nvlist_remove_all(nvprops,
	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE));

	error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize,
	    DMU_OT_NONE, 0, tx);
	ASSERT(error == 0);

	error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP,
	    DMU_OT_NONE, 0, tx);
	ASSERT(error == 0);

	error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx);
	ASSERT(error == 0);
}

/*
 * Replay a TX_TRUNCATE ZIL transaction if asked. TX_TRUNCATE is how we
 * implement DKIOCFREE/free-long-range.
 */
static int
zvol_replay_truncate(zvol_state_t *zv, lr_truncate_t *lr, boolean_t byteswap)
{
	uint64_t offset, length;

	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	offset = lr->lr_offset;
	length = lr->lr_length;

	return (dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, length));
}

/*
 * Replay a TX_WRITE ZIL transaction that didn't get committed
 * after a system failure
 */
static int
zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap)
{
	objset_t *os = zv->zv_objset;
	char *data = (char *)(lr + 1);	/* data follows lr_write_t */
	uint64_t offset, length;
	dmu_tx_t *tx;
	int error;

	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	offset = lr->lr_offset;
	length = lr->lr_length;

	/* If it's a dmu_sync() block, write the whole block */
	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
		uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
		if (length < blocksize) {
			offset -= offset % blocksize;
			length = blocksize;
		}
	}

	tx = dmu_tx_create(os);
	dmu_tx_hold_write(tx, ZVOL_OBJ, offset, length);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
	} else {
		dmu_write(os, ZVOL_OBJ, offset, length, data, tx);
		dmu_tx_commit(tx);
	}

	return (error);
}

/* ARGSUSED */
static int
zvol_replay_err(zvol_state_t *zv, lr_t *lr, boolean_t byteswap)
{
	return (ENOTSUP);
}

/*
 * Callback vectors for replaying records.
 * Only TX_WRITE and TX_TRUNCATE are needed for zvol.
 */
zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
	zvol_replay_err,	/* 0 no such transaction type */
	zvol_replay_err,	/* TX_CREATE */
	zvol_replay_err,	/* TX_MKDIR */
	zvol_replay_err,	/* TX_MKXATTR */
	zvol_replay_err,	/* TX_SYMLINK */
	zvol_replay_err,	/* TX_REMOVE */
	zvol_replay_err,	/* TX_RMDIR */
	zvol_replay_err,	/* TX_LINK */
	zvol_replay_err,	/* TX_RENAME */
	zvol_replay_write,	/* TX_WRITE */
	zvol_replay_truncate,	/* TX_TRUNCATE */
	zvol_replay_err,	/* TX_SETATTR */
	zvol_replay_err,	/* TX_ACL */
	zvol_replay_err,	/* TX_CREATE_ACL */
	zvol_replay_err,	/* TX_CREATE_ATTR */
	zvol_replay_err,	/* TX_CREATE_ACL_ATTR */
	zvol_replay_err,	/* TX_MKDIR_ACL */
	zvol_replay_err,	/* TX_MKDIR_ATTR */
	zvol_replay_err,	/* TX_MKDIR_ACL_ATTR */
	zvol_replay_err,	/* TX_WRITE2 */
};
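
/*
 * Look up the minor number for a dataset name; returns 0 on success or
 * -1 if no minor node exists for the volume.
 */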
int
zvol_name2minor(const char *name, minor_t *minor)
{
	zvol_state_t *zv;

	mutex_enter(&zfsdev_state_lock);
	zv = zvol_minor_lookup(name);
	if (minor && zv)
		*minor = zv->zv_minor;
	mutex_exit(&zfsdev_state_lock);
	return (zv ? 0 : -1);
}

/*
 * Create a minor node (plus a whole lot more) for the specified volume.
 */
int
zvol_create_minor(const char *name)
{
	zfs_soft_state_t *zs;
	zvol_state_t *zv;
	objset_t *os;
	dmu_object_info_t doi;
	minor_t minor = 0;
	char chrbuf[30], blkbuf[30];
	int error;

	mutex_enter(&zfsdev_state_lock);

	if (zvol_minor_lookup(name) != NULL) {
		mutex_exit(&zfsdev_state_lock);
		return (EEXIST);
	}

	/* lie and say we're read-only */
	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, FTAG, &os);

	if (error) {
		mutex_exit(&zfsdev_state_lock);
		return (error);
	}

	if ((minor = zfsdev_minor_alloc()) == 0) {
		dmu_objset_disown(os, FTAG);
		mutex_exit(&zfsdev_state_lock);
		return (ENXIO);
	}

	if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS) {
		dmu_objset_disown(os, FTAG);
		mutex_exit(&zfsdev_state_lock);
		return (EAGAIN);
	}

	(void) ddi_prop_update_string(minor, zfs_dip, ZVOL_PROP_NAME,
	    (char *)name);

	(void) snprintf(chrbuf, sizeof (chrbuf), "%u,raw", minor);

	if (ddi_create_minor_node(zfs_dip, chrbuf, S_IFCHR,
	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
		ddi_soft_state_free(zfsdev_state, minor);
		dmu_objset_disown(os, FTAG);
		mutex_exit(&zfsdev_state_lock);
		return (EAGAIN);
	}

	(void) snprintf(blkbuf, sizeof (blkbuf), "%u", minor);

	if (ddi_create_minor_node(zfs_dip, blkbuf, S_IFBLK,
	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
		ddi_remove_minor_node(zfs_dip, chrbuf);
		ddi_soft_state_free(zfsdev_state, minor);
		dmu_objset_disown(os, FTAG);
		mutex_exit(&zfsdev_state_lock);
		return (EAGAIN);
	}

	zs = ddi_get_soft_state(zfsdev_state, minor);
	zs->zss_type = ZSST_ZVOL;
	zv = zs->zss_data = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
	(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
	zv->zv_min_bs = DEV_BSHIFT;
	zv->zv_minor = minor;
	zv->zv_objset = os;
	if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
		zv->zv_flags |= ZVOL_RDONLY;
	mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare,
	    sizeof (rl_t), offsetof(rl_t, r_node));
	list_create(&zv->zv_extents, sizeof (zvol_extent_t),
	    offsetof(zvol_extent_t, ze_node));
	/* get and cache the blocksize */
	error = dmu_object_info(os, ZVOL_OBJ, &doi);
	ASSERT(error == 0);
	zv->zv_volblocksize = doi.doi_data_block_size;

	if (spa_writeable(dmu_objset_spa(os))) {
		if (zil_replay_disable)
			zil_destroy(dmu_objset_zil(os), B_FALSE);
		else
			zil_replay(os, zv, zvol_replay_vector);
	}
	dmu_objset_disown(os, FTAG);
	zv->zv_objset = NULL;

	zvol_minors++;

	mutex_exit(&zfsdev_state_lock);

	return (0);
}

/*
 * Remove minor node for the specified volume.
 */
static int
zvol_remove_zv(zvol_state_t *zv)
{
	char nmbuf[20];
	minor_t minor = zv->zv_minor;

	ASSERT(MUTEX_HELD(&zfsdev_state_lock));
	if (zv->zv_total_opens != 0)
		return (EBUSY);

	(void) snprintf(nmbuf, sizeof (nmbuf), "%u,raw", minor);
	ddi_remove_minor_node(zfs_dip, nmbuf);

	(void) snprintf(nmbuf, sizeof (nmbuf), "%u", minor);
	ddi_remove_minor_node(zfs_dip, nmbuf);

	avl_destroy(&zv->zv_znode.z_range_avl);
	mutex_destroy(&zv->zv_znode.z_range_lock);

	kmem_free(zv, sizeof (zvol_state_t));

	ddi_soft_state_free(zfsdev_state, minor);

	zvol_minors--;
	return (0);
}

int
zvol_remove_minor(const char *name)
{
	zvol_state_t *zv;
	int rc;

	mutex_enter(&zfsdev_state_lock);
	if ((zv = zvol_minor_lookup(name)) == NULL) {
		mutex_exit(&zfsdev_state_lock);
		return (ENXIO);
	}
	rc = zvol_remove_zv(zv);
	mutex_exit(&zfsdev_state_lock);
	return (rc);
}
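
/*
 * First open of a zvol: take a long-term hold on the objset, cache the
 * volume size and bonus buffer, open the ZIL, and recompute the
 * effective read-only state.
 */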
int
zvol_first_open(zvol_state_t *zv)
{
	objset_t *os;
	uint64_t volsize;
	int error;
	uint64_t readonly;

	/* lie and say we're read-only */
	error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, B_TRUE,
	    zvol_tag, &os);
	if (error)
		return (error);

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
	if (error) {
		ASSERT(error == 0);
		dmu_objset_disown(os, zvol_tag);
		return (error);
	}
	zv->zv_objset = os;
	error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf);
	if (error) {
		dmu_objset_disown(os, zvol_tag);
		return (error);
	}
	zv->zv_volsize = volsize;
	zv->zv_zilog = zil_open(os, zvol_get_data);
	zvol_size_changed(zv->zv_volsize, ddi_driver_major(zfs_dip),
	    zv->zv_minor);

	VERIFY(dsl_prop_get_integer(zv->zv_name, "readonly", &readonly,
	    NULL) == 0);
	if (readonly || dmu_objset_is_snapshot(os) ||
	    !spa_writeable(dmu_objset_spa(os)))
		zv->zv_flags |= ZVOL_RDONLY;
	else
		zv->zv_flags &= ~ZVOL_RDONLY;
	return (error);
}
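
/*
 * Undo zvol_first_open(): close the ZIL and release the bonus buffer
 * and objset holds.
 */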
void
zvol_last_close(zvol_state_t *zv)
{
	zil_close(zv->zv_zilog);
	zv->zv_zilog = NULL;

	dmu_buf_rele(zv->zv_dbuf, zvol_tag);
	zv->zv_dbuf = NULL;

	dmu_objset_disown(zv->zv_objset, zvol_tag);
	zv->zv_objset = NULL;
}
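
/*
 * Preallocate blocks for the entire volume so that a crash dump has a
 * fully mapped set of extents to write into; on failure, free whatever
 * was allocated.
 */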
int
zvol_prealloc(zvol_state_t *zv)
{
	objset_t *os = zv->zv_objset;
	dmu_tx_t *tx;
	uint64_t refd, avail, usedobjs, availobjs;
	uint64_t resid = zv->zv_volsize;
	uint64_t off = 0;

	/* Check the space usage before attempting to allocate the space */
	dmu_objset_space(os, &refd, &avail, &usedobjs, &availobjs);
	if (avail < zv->zv_volsize)
		return (ENOSPC);

	/* Free old extents if they exist */
	zvol_free_extents(zv);

	while (resid != 0) {
		int error;
		uint64_t bytes = MIN(resid, SPA_MAXBLOCKSIZE);

		tx = dmu_tx_create(os);
		dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			(void) dmu_free_long_range(os, ZVOL_OBJ, 0, off);
			return (error);
		}
		dmu_prealloc(os, ZVOL_OBJ, off, bytes, tx);
		dmu_tx_commit(tx);
		off += bytes;
		resid -= bytes;
	}
	txg_wait_synced(dmu_objset_pool(os), 0);

	return (0);
}
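
/*
 * Record the new volume size in the ZAP and free any blocks now beyond
 * the end of the volume.
 */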
int
zvol_update_volsize(objset_t *os, uint64_t volsize)
{
	dmu_tx_t *tx;
	int error;

	ASSERT(MUTEX_HELD(&zfsdev_state_lock));

	tx = dmu_tx_create(os);
	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		return (error);
	}

	error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1,
	    &volsize, tx);
	dmu_tx_commit(tx);

	if (error == 0)
		error = dmu_free_long_range(os,
		    ZVOL_OBJ, volsize, DMU_OBJECT_END);
	return (error);
}
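
/*
 * Remove the minor nodes of every zvol beneath the named dataset;
 * the trailing "/" limits the match to descendants of 'name'.
 */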
void
zvol_remove_minors(const char *name)
{
	zvol_state_t *zv;
	char *namebuf;
	minor_t minor;

	namebuf = kmem_zalloc(strlen(name) + 2, KM_SLEEP);
	(void) strncpy(namebuf, name, strlen(name));
	(void) strcat(namebuf, "/");
	mutex_enter(&zfsdev_state_lock);
	for (minor = 1; minor <= ZFSDEV_MAX_MINOR; minor++) {

		zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
		if (zv == NULL)
			continue;
		if (strncmp(namebuf, zv->zv_name, strlen(namebuf)) == 0)
			(void) zvol_remove_zv(zv);
	}
	kmem_free(namebuf, strlen(name) + 2);
	mutex_exit(&zfsdev_state_lock);
}
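
/*
 * Resize the volume: validate and record the new size, resize the dump
 * area for dumpified zvols (rolling back on failure), update the cached
 * size, and post a LUN-expansion (ESC_DEV_DLE) sysevent.
 */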
int
zvol_set_volsize(const char *name, major_t maj, uint64_t volsize)
{
	zvol_state_t *zv = NULL;
	objset_t *os;
	int error;
	dmu_object_info_t doi;
	uint64_t old_volsize = 0ULL;
	uint64_t readonly;

	mutex_enter(&zfsdev_state_lock);
	zv = zvol_minor_lookup(name);
	if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) {
		mutex_exit(&zfsdev_state_lock);
		return (error);
	}

	if ((error = dmu_object_info(os, ZVOL_OBJ, &doi)) != 0 ||
	    (error = zvol_check_volsize(volsize,
	    doi.doi_data_block_size)) != 0)
		goto out;

	VERIFY(dsl_prop_get_integer(name, "readonly", &readonly,
	    NULL) == 0);
	if (readonly) {
		error = EROFS;
		goto out;
	}

	error = zvol_update_volsize(os, volsize);
	/*
	 * Reinitialize the dump area to the new size. If we
	 * failed to resize the dump area then restore it back to
	 * its original size.
	 */
	if (zv && error == 0) {
		if (zv->zv_flags & ZVOL_DUMPIFIED) {
			old_volsize = zv->zv_volsize;
			zv->zv_volsize = volsize;
			if ((error = zvol_dumpify(zv)) != 0 ||
			    (error = dumpvp_resize()) != 0) {
				(void) zvol_update_volsize(os, old_volsize);
				zv->zv_volsize = old_volsize;
				error = zvol_dumpify(zv);
			}
		}
		if (error == 0) {
			zv->zv_volsize = volsize;
			zvol_size_changed(volsize, maj, zv->zv_minor);
		}
	}

	/*
	 * Generate a LUN expansion event.
	 */
	if (zv && error == 0) {
		sysevent_id_t eid;
		nvlist_t *attr;
		char *physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);

		(void) snprintf(physpath, MAXPATHLEN, "%s%u", ZVOL_PSEUDO_DEV,
		    zv->zv_minor);

		VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);

		(void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
		    ESC_DEV_DLE, attr, &eid, DDI_SLEEP);

		nvlist_free(attr);
		kmem_free(physpath, MAXPATHLEN);
	}
out:
	dmu_objset_rele(os, FTAG);

	mutex_exit(&zfsdev_state_lock);

	return (error);
}

/*ARGSUSED*/
int
zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr)
{
	zvol_state_t *zv;
	int err = 0;

	mutex_enter(&zfsdev_state_lock);

	zv = zfsdev_get_soft_state(getminor(*devp), ZSST_ZVOL);
	if (zv == NULL) {
		mutex_exit(&zfsdev_state_lock);
		return (ENXIO);
	}

	if (zv->zv_total_opens == 0)
		err = zvol_first_open(zv);
	if (err) {
		mutex_exit(&zfsdev_state_lock);
		return (err);
	}
	if ((flag & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
		err = EROFS;
		goto out;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = EBUSY;
		goto out;
	}
	if (flag & FEXCL) {
		if (zv->zv_total_opens != 0) {
			err = EBUSY;
			goto out;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}

	if (zv->zv_open_count[otyp] == 0 || otyp == OTYP_LYR) {
		zv->zv_open_count[otyp]++;
		zv->zv_total_opens++;
	}
	mutex_exit(&zfsdev_state_lock);

	return (err);
out:
	if (zv->zv_total_opens == 0)
		zvol_last_close(zv);
	mutex_exit(&zfsdev_state_lock);
	return (err);
}

/*ARGSUSED*/
int
zvol_close(dev_t dev, int flag, int otyp, cred_t *cr)
{
	minor_t minor = getminor(dev);
	zvol_state_t *zv;
	int error = 0;

	mutex_enter(&zfsdev_state_lock);

	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
	if (zv == NULL) {
		mutex_exit(&zfsdev_state_lock);
		return (ENXIO);
	}

	if (zv->zv_flags & ZVOL_EXCL) {
		ASSERT(zv->zv_total_opens == 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT(zv->zv_open_count[otyp] != 0);
	ASSERT(zv->zv_total_opens != 0);

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_open_count[otyp]--;
	zv->zv_total_opens--;

	if (zv->zv_total_opens == 0)
		zvol_last_close(zv);

	mutex_exit(&zfsdev_state_lock);
	return (error);
}
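
/*
 * dmu_sync() completion callback: release the dbuf and range lock taken
 * in zvol_get_data() and, on success, note the block in the ZIL.
 */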
static void
zvol_get_done(zgd_t *zgd, int error)
{
	if (zgd->zgd_db)
		dmu_buf_rele(zgd->zgd_db, zgd);

	zfs_range_unlock(zgd->zgd_rl);

	if (error == 0 && zgd->zgd_bp)
		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);

	kmem_free(zgd, sizeof (zgd_t));
}

/*
 * Get data to generate a TX_WRITE intent log record.
 */
static int
zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
{
	zvol_state_t *zv = arg;
	objset_t *os = zv->zv_objset;
	uint64_t object = ZVOL_OBJ;
	uint64_t offset = lr->lr_offset;
	uint64_t size = lr->lr_length;	/* length of user data */
	blkptr_t *bp = &lr->lr_blkptr;
	dmu_buf_t *db;
	zgd_t *zgd;
	int error;

	ASSERT(zio != NULL);
	ASSERT(size != 0);

	zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
	zgd->zgd_zilog = zv->zv_zilog;
	zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER);

	/*
	 * Write records come in two flavors: immediate and indirect.
	 * For small writes it's cheaper to store the data with the
	 * log record (immediate); for large writes it's cheaper to
	 * sync the data and get a pointer to it (indirect) so that
	 * we don't have to write the data twice.
	 */
	if (buf != NULL) { /* immediate write */
		error = dmu_read(os, object, offset, size, buf,
		    DMU_READ_NO_PREFETCH);
	} else {
		size = zv->zv_volblocksize;
		offset = P2ALIGN(offset, size);
		error = dmu_buf_hold(os, object, offset, zgd, &db,
		    DMU_READ_NO_PREFETCH);
		if (error == 0) {
			zgd->zgd_db = db;
			zgd->zgd_bp = bp;

			ASSERT(db->db_offset == offset);
			ASSERT(db->db_size == size);

			error = dmu_sync(zio, lr->lr_common.lrc_txg,
			    zvol_get_done, zgd);

			if (error == 0)
				return (0);
		}
	}

	zvol_get_done(zgd, error);

	return (error);
}

/*
 * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
 *
 * We store data in the log buffers if it's small enough.
 * Otherwise we will later flush the data out via dmu_sync().
 */
ssize_t zvol_immediate_write_sz = 32768;

static void
zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid,
    boolean_t sync)
{
	uint32_t blocksize = zv->zv_volblocksize;
	zilog_t *zilog = zv->zv_zilog;
	boolean_t slogging;
	ssize_t immediate_write_sz;

	if (zil_replaying(zilog, tx))
		return;

	immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
	    ? 0 : zvol_immediate_write_sz;

	slogging = spa_has_slogs(zilog->zl_spa) &&
	    (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);

	while (resid) {
		itx_t *itx;
		lr_write_t *lr;
		ssize_t len;
		itx_wr_state_t write_state;

		/*
		 * Unlike zfs_log_write() we can be called with
		 * up to DMU_MAX_ACCESS/2 (5MB) writes.
		 */
		if (blocksize > immediate_write_sz && !slogging &&
		    resid >= blocksize && off % blocksize == 0) {
			write_state = WR_INDIRECT; /* uses dmu_sync */
			len = blocksize;
		} else if (sync) {
			write_state = WR_COPIED;
			len = MIN(ZIL_MAX_LOG_DATA, resid);
		} else {
			write_state = WR_NEED_COPY;
			len = MIN(ZIL_MAX_LOG_DATA, resid);
		}

		itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
		    (write_state == WR_COPIED ? len : 0));
		lr = (lr_write_t *)&itx->itx_lr;
		if (write_state == WR_COPIED && dmu_read(zv->zv_objset,
		    ZVOL_OBJ, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
			zil_itx_destroy(itx);
			itx = zil_itx_create(TX_WRITE, sizeof (*lr));
			lr = (lr_write_t *)&itx->itx_lr;
			write_state = WR_NEED_COPY;
		}

		itx->itx_wr_state = write_state;
		if (write_state == WR_NEED_COPY)
			itx->itx_sod += len;
		lr->lr_foid = ZVOL_OBJ;
		lr->lr_offset = off;
		lr->lr_length = len;
		lr->lr_blkoff = 0;
		BP_ZERO(&lr->lr_blkptr);

		itx->itx_private = zv;
		itx->itx_sync = sync;

		zil_itx_assign(zilog, itx, tx);

		off += len;
		resid -= len;
	}
}
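
/*
 * Issue raw I/O for a dump-capable zvol, recursing through mirror,
 * replacing, and spare vdevs to reach a leaf. In panic/dump context
 * the I/O goes through ldi_dump(); otherwise vdev_disk_physio() is used.
 */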
static int
zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t size,
    boolean_t doread, boolean_t isdump)
{
	vdev_disk_t *dvd;
	int c;
	int numerrors = 0;

	for (c = 0; c < vd->vdev_children; c++) {
		ASSERT(vd->vdev_ops == &vdev_mirror_ops ||
		    vd->vdev_ops == &vdev_replacing_ops ||
		    vd->vdev_ops == &vdev_spare_ops);
		int err = zvol_dumpio_vdev(vd->vdev_child[c],
		    addr, offset, size, doread, isdump);
		if (err != 0) {
			numerrors++;
		} else if (doread) {
			break;
		}
	}

	if (!vd->vdev_ops->vdev_op_leaf)
		return (numerrors < vd->vdev_children ? 0 : EIO);

	if (doread && !vdev_readable(vd))
		return (EIO);
	else if (!doread && !vdev_writeable(vd))
		return (EIO);

	dvd = vd->vdev_tsd;
	ASSERT3P(dvd, !=, NULL);
	offset += VDEV_LABEL_START_SIZE;

	if (ddi_in_panic() || isdump) {
		ASSERT(!doread);
		if (doread)
			return (EIO);
		return (ldi_dump(dvd->vd_lh, addr, lbtodb(offset),
		    lbtodb(size)));
	} else {
		return (vdev_disk_physio(dvd->vd_lh, addr, size, offset,
		    doread ? B_READ : B_WRITE));
	}
}
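
/*
 * Translate a logical volume offset to its physical location using the
 * extent list, then perform the raw I/O against the top-level vdev.
 */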
static int
zvol_dumpio(zvol_state_t *zv, void *addr, uint64_t offset, uint64_t size,
    boolean_t doread, boolean_t isdump)
{
	vdev_t *vd;
	int error;
	zvol_extent_t *ze;
	spa_t *spa = dmu_objset_spa(zv->zv_objset);

	/* Must be sector aligned, and not straddle a block boundary. */
	if (P2PHASE(offset, DEV_BSIZE) || P2PHASE(size, DEV_BSIZE) ||
	    P2BOUNDARY(offset, size, zv->zv_volblocksize)) {
		return (EINVAL);
	}
	ASSERT(size <= zv->zv_volblocksize);

	/* Locate the extent this belongs to */
	ze = list_head(&zv->zv_extents);
	while (offset >= ze->ze_nblks * zv->zv_volblocksize) {
		offset -= ze->ze_nblks * zv->zv_volblocksize;
		ze = list_next(&zv->zv_extents, ze);
	}

	if (!ddi_in_panic())
		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

	vd = vdev_lookup_top(spa, DVA_GET_VDEV(&ze->ze_dva));
	offset += DVA_GET_OFFSET(&ze->ze_dva);
	error = zvol_dumpio_vdev(vd, addr, offset, size, doread, isdump);

	if (!ddi_in_panic())
		spa_config_exit(spa, SCL_STATE, FTAG);

	return (error);
}
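
/*
 * The bulk of the block-device I/O path: validate the request, then
 * read or write each chunk under a range lock. Dumpified volumes go
 * straight to the physical extents via zvol_dumpio(); synchronous
 * writes are committed to the ZIL before biodone().
 */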
int
zvol_strategy(buf_t *bp)
{
	zfs_soft_state_t *zs = NULL;
	zvol_state_t *zv;
	uint64_t off, volsize;
	size_t resid;
	char *addr;
	objset_t *os;
	rl_t *rl;
	int error = 0;
	boolean_t doread = bp->b_flags & B_READ;
	boolean_t is_dump;
	boolean_t sync;

	if (getminor(bp->b_edev) == 0) {
		error = EINVAL;
	} else {
		zs = ddi_get_soft_state(zfsdev_state, getminor(bp->b_edev));
		if (zs == NULL)
			error = ENXIO;
		else if (zs->zss_type != ZSST_ZVOL)
			error = EINVAL;
	}

	if (error) {
		bioerror(bp, error);
		biodone(bp);
		return (0);
	}

	zv = zs->zss_data;

	if (!(bp->b_flags & B_READ) && (zv->zv_flags & ZVOL_RDONLY)) {
		bioerror(bp, EROFS);
		biodone(bp);
		return (0);
	}

	off = ldbtob(bp->b_blkno);
	volsize = zv->zv_volsize;

	os = zv->zv_objset;
	ASSERT(os != NULL);

	bp_mapin(bp);
	addr = bp->b_un.b_addr;
	resid = bp->b_bcount;

	if (resid > 0 && (off < 0 || off >= volsize)) {
		bioerror(bp, EIO);
		biodone(bp);
		return (0);
	}

	is_dump = zv->zv_flags & ZVOL_DUMPIFIED;
	sync = ((!(bp->b_flags & B_ASYNC) &&
	    !(zv->zv_flags & ZVOL_WCE)) ||
	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS)) &&
	    !doread && !is_dump;

	/*
	 * There must be no buffer changes when doing a dmu_sync() because
	 * we can't change the data whilst calculating the checksum.
	 */
	rl = zfs_range_lock(&zv->zv_znode, off, resid,
	    doread ? RL_READER : RL_WRITER);

	while (resid != 0 && off < volsize) {
		size_t size = MIN(resid, zvol_maxphys);
		if (is_dump) {
			size = MIN(size, P2END(off, zv->zv_volblocksize) - off);
			error = zvol_dumpio(zv, addr, off, size,
			    doread, B_FALSE);
		} else if (doread) {
			error = dmu_read(os, ZVOL_OBJ, off, size, addr,
			    DMU_READ_PREFETCH);
		} else {
			dmu_tx_t *tx = dmu_tx_create(os);
			dmu_tx_hold_write(tx, ZVOL_OBJ, off, size);
			error = dmu_tx_assign(tx, TXG_WAIT);
			if (error) {
				dmu_tx_abort(tx);
			} else {
				dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
				zvol_log_write(zv, tx, off, size, sync);
				dmu_tx_commit(tx);
			}
		}
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = EIO;
			break;
		}
		off += size;
		addr += size;
		resid -= size;
	}
	zfs_range_unlock(rl);

	if ((bp->b_resid = resid) == bp->b_bcount)
		bioerror(bp, off > volsize ? EINVAL : error);

	if (sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	biodone(bp);

	return (0);
}

/*
 * Set the buffer count to the zvol maximum transfer.
 * Using our own routine instead of the default minphys()
 * means that for larger writes we write bigger buffers on X86
 * (128K instead of 56K) and flush the disk write cache less often
 * (every zvol_maxphys - currently 1MB) instead of minphys (currently
 * 56K on X86 and 128K on sparc).
 */
void
zvol_minphys(struct buf *bp)
{
	if (bp->b_bcount > zvol_maxphys)
		bp->b_bcount = zvol_maxphys;
}
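
/*
 * dump(9E) entry point: write kernel crash dump data directly to the
 * preallocated extents, one volblocksize-bounded chunk at a time.
 */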
int
zvol_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblocks)
{
	minor_t minor = getminor(dev);
	zvol_state_t *zv;
	int error = 0;
	uint64_t size;
	uint64_t boff;
	uint64_t resid;

	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
	if (zv == NULL)
		return (ENXIO);

	boff = ldbtob(blkno);
	resid = ldbtob(nblocks);

	VERIFY3U(boff + resid, <=, zv->zv_volsize);

	while (resid) {
		size = MIN(resid, P2END(boff, zv->zv_volblocksize) - boff);
		error = zvol_dumpio(zv, addr, boff, size, B_FALSE, B_TRUE);
		if (error)
			break;
		boff += size;
		addr += size;
		resid -= size;
	}

	return (error);
}
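
/*
 * read(9E) entry point: dumpified zvols are serviced through physio()
 * and the extent map; otherwise read directly from the DMU, one chunk
 * at a time, under a range lock.
 */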
/*ARGSUSED*/
int
zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
{
	minor_t minor = getminor(dev);
	zvol_state_t *zv;
	uint64_t volsize;
	rl_t *rl;
	int error = 0;

	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
	if (zv == NULL)
		return (ENXIO);

	volsize = zv->zv_volsize;
	if (uio->uio_resid > 0 &&
	    (uio->uio_loffset < 0 || uio->uio_loffset >= volsize))
		return (EIO);

	if (zv->zv_flags & ZVOL_DUMPIFIED) {
		error = physio(zvol_strategy, NULL, dev, B_READ,
		    zvol_minphys, uio);
		return (error);
	}

	rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
	    RL_READER);
	while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
		uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);

		/* don't read past the end */
		if (bytes > volsize - uio->uio_loffset)
			bytes = volsize - uio->uio_loffset;

		error = dmu_read_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes);
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = EIO;
			break;
		}
	}
	zfs_range_unlock(rl);
	return (error);
}
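
/*
 * write(9E) entry point: as with reads, dumpified zvols go through
 * physio(). Otherwise each chunk is written in its own transaction and
 * logged to the ZIL; when the write cache is disabled or sync=always,
 * the log is committed before returning.
 */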
/*ARGSUSED*/
int
zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
{
	minor_t minor = getminor(dev);
	zvol_state_t *zv;
	uint64_t volsize;
	rl_t *rl;
	int error = 0;
	boolean_t sync;

	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
	if (zv == NULL)
		return (ENXIO);

	volsize = zv->zv_volsize;
	if (uio->uio_resid > 0 &&
	    (uio->uio_loffset < 0 || uio->uio_loffset >= volsize))
		return (EIO);

	if (zv->zv_flags & ZVOL_DUMPIFIED) {
		error = physio(zvol_strategy, NULL, dev, B_WRITE,
		    zvol_minphys, uio);
		return (error);
	}

	sync = !(zv->zv_flags & ZVOL_WCE) ||
	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);

	rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
	    RL_WRITER);
	while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
		uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
		uint64_t off = uio->uio_loffset;
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

		if (bytes > volsize - off)	/* don't write past the end */
			bytes = volsize - off;

		dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			break;
		}
		error = dmu_write_uio_dbuf(zv->zv_dbuf, uio, bytes, tx);
		if (error == 0)
			zvol_log_write(zv, tx, off, bytes, sync);
		dmu_tx_commit(tx);

		if (error)
			break;
	}
	zfs_range_unlock(rl);
	if (sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	return (error);
}
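
/*
 * Fabricate an EFI label for the zvol: a GPT header (LBA 1) and a
 * single reserved partition entry (LBA 2) spanning the whole volume.
 */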
int
zvol_getefi(void *arg, int flag, uint64_t vs, uint8_t bs)
{
	struct uuid uuid = EFI_RESERVED;
	efi_gpe_t gpe = { 0 };
	uint32_t crc;
	dk_efi_t efi;
	int length;
	char *ptr;

	if (ddi_copyin(arg, &efi, sizeof (dk_efi_t), flag))
		return (EFAULT);
	ptr = (char *)(uintptr_t)efi.dki_data_64;
	length = efi.dki_length;
	/*
	 * Some clients may attempt to request a PMBR for the
	 * zvol.  Currently this interface will return EINVAL to
	 * such requests.  These requests could be supported by
	 * adding a check for lba == 0 and consing up an appropriate
	 * PMBR.
	 */
	if (efi.dki_lba < 1 || efi.dki_lba > 2 || length <= 0)
		return (EINVAL);

	gpe.efi_gpe_StartingLBA = LE_64(34ULL);
	gpe.efi_gpe_EndingLBA = LE_64((vs >> bs) - 1);
	UUID_LE_CONVERT(gpe.efi_gpe_PartitionTypeGUID, uuid);

	if (efi.dki_lba == 1) {
		efi_gpt_t gpt = { 0 };

		gpt.efi_gpt_Signature = LE_64(EFI_SIGNATURE);
		gpt.efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT);
		gpt.efi_gpt_HeaderSize = LE_32(sizeof (gpt));
		gpt.efi_gpt_MyLBA = LE_64(1ULL);
		gpt.efi_gpt_FirstUsableLBA = LE_64(34ULL);
		gpt.efi_gpt_LastUsableLBA = LE_64((vs >> bs) - 1);
		gpt.efi_gpt_PartitionEntryLBA = LE_64(2ULL);
		gpt.efi_gpt_NumberOfPartitionEntries = LE_32(1);
		gpt.efi_gpt_SizeOfPartitionEntry =
		    LE_32(sizeof (efi_gpe_t));
		CRC32(crc, &gpe, sizeof (gpe), -1U, crc32_table);
		gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);
		CRC32(crc, &gpt, sizeof (gpt), -1U, crc32_table);
		gpt.efi_gpt_HeaderCRC32 = LE_32(~crc);
		if (ddi_copyout(&gpt, ptr, MIN(sizeof (gpt), length),
		    flag))
			return (EFAULT);
		ptr += sizeof (gpt);
		length -= sizeof (gpt);
	}
	if (length > 0 && ddi_copyout(&gpe, ptr, MIN(sizeof (gpe),
	    length), flag))
		return (EFAULT);
	return (0);
}

/*
 * BEGIN entry points to allow external callers access to the volume.
 */
/*
 * Return the volume parameters needed for access from an external caller.
 * These values are invariant as long as the volume is held open.
 */
int
zvol_get_volume_params(minor_t minor, uint64_t *blksize,
    uint64_t *max_xfer_len, void **minor_hdl, void **objset_hdl, void **zil_hdl,
    void **rl_hdl, void **bonus_hdl)
{
	zvol_state_t *zv;

	zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
	if (zv == NULL)
		return (ENXIO);
	if (zv->zv_flags & ZVOL_DUMPIFIED)
		return (ENXIO);

	ASSERT(blksize && max_xfer_len && minor_hdl &&
	    objset_hdl && zil_hdl && rl_hdl && bonus_hdl);

	*blksize = zv->zv_volblocksize;
	*max_xfer_len = (uint64_t)zvol_maxphys;
	*minor_hdl = zv;
	*objset_hdl = zv->zv_objset;
	*zil_hdl = zv->zv_zilog;
	*rl_hdl = &zv->zv_znode;
	*bonus_hdl = zv->zv_dbuf;
	return (0);
}

/*
 * Return the current volume size to an external caller.
 * The size can change while the volume is open.
 */
uint64_t
zvol_get_volume_size(void *minor_hdl)
{
	zvol_state_t *zv = minor_hdl;

	return (zv->zv_volsize);
}

/*
 * Return the current WCE setting to an external caller.
 * The WCE setting can change while the volume is open.
 */
int
zvol_get_volume_wce(void *minor_hdl)
{
	zvol_state_t *zv = minor_hdl;

	return ((zv->zv_flags & ZVOL_WCE) ? 1 : 0);
}

/*
 * Entry point for external callers to zvol_log_write
 */
void
zvol_log_write_minor(void *minor_hdl, dmu_tx_t *tx, offset_t off, ssize_t resid,
    boolean_t sync)
{
	zvol_state_t *zv = minor_hdl;

	zvol_log_write(zv, tx, off, resid, sync);
}
/*
 * END entry points to allow external callers access to the volume.
 */

/*
 * Log a DKIOCFREE/free-long-range to the ZIL with TX_TRUNCATE.
 */
static void
zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len,
    boolean_t sync)
{
	itx_t *itx;
	lr_truncate_t *lr;
	zilog_t *zilog = zv->zv_zilog;

	if (zil_replaying(zilog, tx))
		return;

	itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr));
	lr = (lr_truncate_t *)&itx->itx_lr;
	lr->lr_foid = ZVOL_OBJ;
	lr->lr_offset = off;
	lr->lr_length = len;

	itx->itx_sync = sync;
	zil_itx_assign(zilog, itx, tx);
}

/*
 * Dirtbag ioctls to support mkfs(1M) for UFS filesystems.  See dkio(7I).
 * Also a dirtbag dkio ioctl for unmap/free-block functionality.
 */
/*ARGSUSED*/
int
zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
{
	zvol_state_t *zv;
	struct dk_cinfo dki;
	struct dk_minfo dkm;
	struct dk_callback *dkc;
	int error = 0;
	rl_t *rl;

	mutex_enter(&zfsdev_state_lock);

	zv = zfsdev_get_soft_state(getminor(dev), ZSST_ZVOL);

	if (zv == NULL) {
		mutex_exit(&zfsdev_state_lock);
		return (ENXIO);
	}
	ASSERT(zv->zv_total_opens > 0);

	switch (cmd) {

	case DKIOCINFO:
		bzero(&dki, sizeof (dki));
		(void) strcpy(dki.dki_cname, "zvol");
		(void) strcpy(dki.dki_dname, "zvol");
		dki.dki_ctype = DKC_UNKNOWN;
		dki.dki_unit = getminor(dev);
		dki.dki_maxtransfer = 1 << (SPA_MAXBLOCKSHIFT - zv->zv_min_bs);
		mutex_exit(&zfsdev_state_lock);
		if (ddi_copyout(&dki, (void *)arg, sizeof (dki), flag))
			error = EFAULT;
		return (error);

	case DKIOCGMEDIAINFO:
		bzero(&dkm, sizeof (dkm));
		dkm.dki_lbsize = 1U << zv->zv_min_bs;
		dkm.dki_capacity = zv->zv_volsize >> zv->zv_min_bs;
		dkm.dki_media_type = DK_UNKNOWN;
		mutex_exit(&zfsdev_state_lock);
		if (ddi_copyout(&dkm, (void *)arg, sizeof (dkm), flag))
			error = EFAULT;
		return (error);

	case DKIOCGETEFI:
		{
			uint64_t vs = zv->zv_volsize;
			uint8_t bs = zv->zv_min_bs;

			mutex_exit(&zfsdev_state_lock);
			error = zvol_getefi((void *)arg, flag, vs, bs);
			return (error);
		}

	case DKIOCFLUSHWRITECACHE:
		dkc = (struct dk_callback *)arg;
		mutex_exit(&zfsdev_state_lock);
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
		if ((flag & FKIOCTL) && dkc != NULL && dkc->dkc_callback) {
			(*dkc->dkc_callback)(dkc->dkc_cookie, error);
			error = 0;
		}
		return (error);

	case DKIOCGETWCE:
		{
			int wce = (zv->zv_flags & ZVOL_WCE) ? 1 : 0;
			if (ddi_copyout(&wce, (void *)arg, sizeof (int),
			    flag))
				error = EFAULT;
			break;
		}
	case DKIOCSETWCE:
		{
			int wce;
			if (ddi_copyin((void *)arg, &wce, sizeof (int),
			    flag)) {
				error = EFAULT;
				break;
			}
			if (wce) {
				zv->zv_flags |= ZVOL_WCE;
				mutex_exit(&zfsdev_state_lock);
			} else {
				zv->zv_flags &= ~ZVOL_WCE;
				mutex_exit(&zfsdev_state_lock);
				zil_commit(zv->zv_zilog, ZVOL_OBJ);
			}
			return (0);
		}

	case DKIOCGGEOM:
	case DKIOCGVTOC:
		/*
		 * commands using these (like prtvtoc) expect ENOTSUP
		 * since we're emulating an EFI label
		 */
		error = ENOTSUP;
		break;

	case DKIOCDUMPINIT:
		rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
		    RL_WRITER);
		error = zvol_dumpify(zv);
		zfs_range_unlock(rl);
		break;

	case DKIOCDUMPFINI:
		if (!(zv->zv_flags & ZVOL_DUMPIFIED))
			break;
		rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
		    RL_WRITER);
		error = zvol_dump_fini(zv);
		zfs_range_unlock(rl);
		break;

	case DKIOCFREE:
	{
		dkioc_free_t df;
		dmu_tx_t *tx;

		if (ddi_copyin((void *)arg, &df, sizeof (df), flag)) {
			error = EFAULT;
			break;
		}

		/*
		 * Apply Postel's Law to length-checking.  If they overshoot,
		 * just blank out until the end, if there's a need to blank
		 * out anything.
		 */
		if (df.df_start >= zv->zv_volsize)
			break;	/* No need to do anything... */
		if (df.df_start + df.df_length > zv->zv_volsize)
			df.df_length = DMU_OBJECT_END;

		rl = zfs_range_lock(&zv->zv_znode, df.df_start, df.df_length,
		    RL_WRITER);
		tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			dmu_tx_abort(tx);
		} else {
			zvol_log_truncate(zv, tx, df.df_start,
			    df.df_length, B_TRUE);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    df.df_start, df.df_length);
		}

		zfs_range_unlock(rl);

		if (error == 0) {
			/*
			 * If the write-cache is disabled or 'sync' property
			 * is set to 'always' then treat this as a synchronous
			 * operation (i.e. commit to zil).
			 */
			if (!(zv->zv_flags & ZVOL_WCE) ||
			    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS))
				zil_commit(zv->zv_zilog, ZVOL_OBJ);

			/*
			 * If the caller really wants synchronous writes, and
			 * can't wait for them, don't return until the write
			 * is done.
			 */
			if (df.df_flags & DF_WAIT_SYNC) {
				txg_wait_synced(
				    dmu_objset_pool(zv->zv_objset), 0);
			}
		}
		break;
	}

	default:
		error = ENOTTY;
		break;

	}
	mutex_exit(&zfsdev_state_lock);
	return (error);
}
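
/*
 * Report whether any zvol minors are currently allocated.
 */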
int
zvol_busy(void)
{
	return (zvol_minors != 0);
}
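
/*
 * Load/unload hooks: set up and tear down the global soft-state table
 * and state lock.
 */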
void
zvol_init(void)
{
	VERIFY(ddi_soft_state_init(&zfsdev_state, sizeof (zfs_soft_state_t),
	    1) == 0);
	mutex_init(&zfsdev_state_lock, NULL, MUTEX_DEFAULT, NULL);
}

void
zvol_fini(void)
{
	mutex_destroy(&zfsdev_state_lock);
	ddi_soft_state_fini(&zfsdev_state);
}
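
/*
 * Prepare the volume for use as a dump device: free its current
 * contents, force dump-safe settings (no compression, no checksums, no
 * refreservation, blocksize SPA_MAXBLOCKSIZE), saving the originals in
 * the ZAP so zvol_dump_fini() can restore them, and then preallocate
 * the entire volume.
 */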
static int
zvol_dump_init(zvol_state_t *zv, boolean_t resize)
{
	dmu_tx_t *tx;
	int error = 0;
	objset_t *os = zv->zv_objset;
	nvlist_t *nv = NULL;
	uint64_t version = spa_version(dmu_objset_spa(zv->zv_objset));

	ASSERT(MUTEX_HELD(&zfsdev_state_lock));
	error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, 0,
	    DMU_OBJECT_END);
	/* wait for dmu_free_long_range to actually free the blocks */
	txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
	tx = dmu_tx_create(os);
	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
	dmu_tx_hold_bonus(tx, ZVOL_OBJ);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		return (error);
	}

	/*
	 * If we are resizing the dump device then we only need to
	 * update the refreservation to match the newly updated
	 * zvol size. Otherwise, we save off the original state of the
	 * zvol so that we can restore it if the zvol is ever undumpified.
	 */
	if (resize) {
		error = zap_update(os, ZVOL_ZAP_OBJ,
		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
		    &zv->zv_volsize, tx);
	} else {
		uint64_t checksum, compress, refresrv, vbs, dedup;

		error = dsl_prop_get_integer(zv->zv_name,
		    zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL);
		error = error ? error : dsl_prop_get_integer(zv->zv_name,
		    zfs_prop_to_name(ZFS_PROP_CHECKSUM), &checksum, NULL);
		error = error ? error : dsl_prop_get_integer(zv->zv_name,
		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &refresrv, NULL);
		error = error ? error : dsl_prop_get_integer(zv->zv_name,
		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &vbs, NULL);
		if (version >= SPA_VERSION_DEDUP) {
			error = error ? error :
			    dsl_prop_get_integer(zv->zv_name,
			    zfs_prop_to_name(ZFS_PROP_DEDUP), &dedup, NULL);
		}

		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
		    zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1,
		    &compress, tx);
		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
		    zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum, tx);
		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
		    &refresrv, tx);
		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1,
		    &vbs, tx);
		error = error ? error : dmu_object_set_blocksize(
		    os, ZVOL_OBJ, SPA_MAXBLOCKSIZE, 0, tx);
		if (version >= SPA_VERSION_DEDUP) {
			error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
			    zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1,
			    &dedup, tx);
		}
		if (error == 0)
			zv->zv_volblocksize = SPA_MAXBLOCKSIZE;
	}
	dmu_tx_commit(tx);

	/*
	 * We only need to update the zvol's properties if we are
	 * initializing the dump area for the first time.
	 */
	if (!resize) {
		VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_uint64(nv,
		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 0) == 0);
		VERIFY(nvlist_add_uint64(nv,
		    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
		    ZIO_COMPRESS_OFF) == 0);
		VERIFY(nvlist_add_uint64(nv,
		    zfs_prop_to_name(ZFS_PROP_CHECKSUM),
		    ZIO_CHECKSUM_OFF) == 0);
		if (version >= SPA_VERSION_DEDUP) {
			VERIFY(nvlist_add_uint64(nv,
			    zfs_prop_to_name(ZFS_PROP_DEDUP),
			    ZIO_CHECKSUM_OFF) == 0);
		}

		error = zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL,
		    nv, NULL);
		nvlist_free(nv);

		if (error)
			return (error);
	}

	/* Allocate the space for the dump */
	error = zvol_prealloc(zv);
	return (error);
}
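
/*
 * Make the zvol usable as a dump device: initialize (or resize) the
 * dump area, build the lba-to-dva extent mapping, and record the dump
 * size in the ZAP.
 */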
static int
zvol_dumpify(zvol_state_t *zv)
{
	int error = 0;
	uint64_t dumpsize = 0;
	dmu_tx_t *tx;
	objset_t *os = zv->zv_objset;

	if (zv->zv_flags & ZVOL_RDONLY)
		return (EROFS);

	if (zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE,
	    8, 1, &dumpsize) != 0 || dumpsize != zv->zv_volsize) {
		boolean_t resize = (dumpsize > 0);

		if ((error = zvol_dump_init(zv, resize)) != 0) {
			(void) zvol_dump_fini(zv);
			return (error);
		}
	}

	/*
	 * Build up our lba mapping.
	 */
	error = zvol_get_lbas(zv);
	if (error) {
		(void) zvol_dump_fini(zv);
		return (error);
	}

	tx = dmu_tx_create(os);
	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		(void) zvol_dump_fini(zv);
		return (error);
	}

	zv->zv_flags |= ZVOL_DUMPIFIED;
	error = zap_update(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, 8, 1,
	    &zv->zv_volsize, tx);
	dmu_tx_commit(tx);

	if (error) {
		(void) zvol_dump_fini(zv);
		return (error);
	}

	txg_wait_synced(dmu_objset_pool(os), 0);
	return (0);
}
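
/*
 * Undo zvol_dumpify(): remove the dumpsize entry, restore the saved
 * properties and block size, and free the preallocated blocks.
 */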
static int
zvol_dump_fini(zvol_state_t *zv)
{
	dmu_tx_t *tx;
	objset_t *os = zv->zv_objset;
	nvlist_t *nv;
	int error = 0;
	uint64_t checksum, compress, refresrv, vbs, dedup;
	uint64_t version = spa_version(dmu_objset_spa(zv->zv_objset));

	/*
	 * Attempt to restore the zvol back to its pre-dumpified state.
	 * This is a best-effort attempt as it's possible that not all
	 * of these properties were initialized during the dumpify process
	 * (i.e. error during zvol_dump_init).
	 */

	tx = dmu_tx_create(os);
	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		return (error);
	}
	(void) zap_remove(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, tx);
	dmu_tx_commit(tx);

	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
	    zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum);
	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
	    zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1, &compress);
	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, &refresrv);
	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1, &vbs);

	VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	(void) nvlist_add_uint64(nv,
	    zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum);
	(void) nvlist_add_uint64(nv,
	    zfs_prop_to_name(ZFS_PROP_COMPRESSION), compress);
	(void) nvlist_add_uint64(nv,
	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), refresrv);
	if (version >= SPA_VERSION_DEDUP &&
	    zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
	    zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1, &dedup) == 0) {
		(void) nvlist_add_uint64(nv,
		    zfs_prop_to_name(ZFS_PROP_DEDUP), dedup);
	}
	(void) zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL,
	    nv, NULL);
	nvlist_free(nv);

	zvol_free_extents(zv);
	zv->zv_flags &= ~ZVOL_DUMPIFIED;
	(void) dmu_free_long_range(os, ZVOL_OBJ, 0, DMU_OBJECT_END);
	/* wait for dmu_free_long_range to actually free the blocks */
	txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
	tx = dmu_tx_create(os);
	dmu_tx_hold_bonus(tx, ZVOL_OBJ);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		return (error);
	}
	if (dmu_object_set_blocksize(os, ZVOL_OBJ, vbs, 0, tx) == 0)
		zv->zv_volblocksize = vbs;
	dmu_tx_commit(tx);

	return (0);
}