
/drivers/md/dm-thin.c

http://github.com/mirrors/linux


   1/*
   2 * Copyright (C) 2011-2012 Red Hat UK.
   3 *
   4 * This file is released under the GPL.
   5 */
   6
   7#include "dm-thin-metadata.h"
   8#include "dm-bio-prison-v1.h"
   9#include "dm.h"
  10
  11#include <linux/device-mapper.h>
  12#include <linux/dm-io.h>
  13#include <linux/dm-kcopyd.h>
  14#include <linux/jiffies.h>
  15#include <linux/log2.h>
  16#include <linux/list.h>
  17#include <linux/rculist.h>
  18#include <linux/init.h>
  19#include <linux/module.h>
  20#include <linux/slab.h>
  21#include <linux/vmalloc.h>
  22#include <linux/sort.h>
  23#include <linux/rbtree.h>
  24
  25#define	DM_MSG_PREFIX	"thin"
  26
  27/*
  28 * Tunable constants
  29 */
  30#define ENDIO_HOOK_POOL_SIZE 1024
  31#define MAPPING_POOL_SIZE 1024
  32#define COMMIT_PERIOD HZ
  33#define NO_SPACE_TIMEOUT_SECS 60
  34
  35static unsigned no_space_timeout_secs = NO_SPACE_TIMEOUT_SECS;
  36
  37DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
  38		"A percentage of time allocated for copy on write");
  39
  40/*
  41 * The block size of the device holding pool data must be
  42 * between 64KB and 1GB.
  43 */
  44#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
  45#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
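/*
 * Editor's note (illustrative, not part of this file): with the kernel's
 * standard 512-byte sector (SECTOR_SHIFT == 9), the limits above work out
 * to 128 sectors (64 KiB) and 2097152 sectors (1 GiB).  A minimal
 * user-space check of that arithmetic:
 */
#include <stdio.h>

int main(void)
{
	const unsigned sector_shift = 9;	/* 512-byte sectors */

	printf("min data block: %u sectors\n", (64u * 1024) >> sector_shift);
	printf("max data block: %u sectors\n", (1024u * 1024 * 1024) >> sector_shift);
	return 0;
}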
  46
  47/*
  48 * Device id is restricted to 24 bits.
  49 */
  50#define MAX_DEV_ID ((1 << 24) - 1)
  51
  52/*
  53 * How do we handle breaking sharing of data blocks?
  54 * =================================================
  55 *
  56 * We use a standard copy-on-write btree to store the mappings for the
  57 * devices (note I'm talking about copy-on-write of the metadata here, not
  58 * the data).  When you take an internal snapshot you clone the root node
  59 * of the origin btree.  After this there is no concept of an origin or a
  60 * snapshot.  They are just two device trees that happen to point to the
  61 * same data blocks.
  62 *
  63 * When we get a write in we decide if it's to a shared data block using
  64 * some timestamp magic.  If it is, we have to break sharing.
  65 *
  66 * Let's say we write to a shared block in what was the origin.  The
  67 * steps are:
  68 *
  69 * i) plug io further to this physical block. (see bio_prison code).
  70 *
  71 * ii) quiesce any read io to that shared data block.  Obviously
  72 * including all devices that share this block.  (see dm_deferred_set code)
  73 *
  74 * iii) copy the data block to a newly allocated block.  This step can be
  75 * skipped if the io covers the whole block. (schedule_copy).
  76 *
  77 * iv) insert the new mapping into the origin's btree
  78 * (process_prepared_mapping).  This act of inserting breaks some
  79 * sharing of btree nodes between the two devices.  Breaking sharing only
  80 * affects the btree of that specific device.  Btrees for the other
  81 * devices that share the block never change.  The btree for the origin
  82 * device as it was after the last commit is untouched, ie. we're using
  83 * persistent data structures in the functional programming sense.
  84 *
  85 * v) unplug io to this physical block, including the io that triggered
  86 * the breaking of sharing.
  87 *
  88 * Steps (ii) and (iii) occur in parallel.
  89 *
  90 * The metadata _doesn't_ need to be committed before the io continues.  We
  91 * get away with this because the io is always written to a _new_ block.
  92 * If there's a crash, then:
  93 *
  94 * - The origin mapping will point to the old origin block (the shared
  95 * one).  This will contain the data as it was before the io that triggered
  96 * the breaking of sharing came in.
  97 *
  98 * - The snap mapping still points to the old block.  As it would after
  99 * the commit.
 100 *
 101 * The downside of this scheme is that the timestamp magic isn't perfect: it
 102 * will continue to think the data block in the snapshot device is shared
 103 * even after the write to the origin has broken sharing.  I suspect data
 104 * blocks will typically be shared by many different devices, so we're
 105 * breaking sharing n + 1 times, rather than n, where n is the number of
 106 * devices that reference this data block.  At the moment I think the
 107 * benefits far, far outweigh the disadvantages.
 108 */
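/*
 * Editor's sketch (user-space toy, not part of dm-thin): steps (iii) and
 * (iv) above in miniature.  Two "devices" share a mapping to the same data
 * block; a write through one of them allocates a fresh block, copies the
 * old contents, and repoints only that device's mapping, so the other
 * device (the snapshot) keeps seeing the original data.  All names below
 * are invented for illustration.
 */
#include <stdio.h>
#include <string.h>

#define NR_BLOCKS  8
#define BLOCK_SIZE 16

static char data[NR_BLOCKS][BLOCK_SIZE];	/* the "data device" */
static int refcount[NR_BLOCKS];			/* mappings pointing at each block */
static int next_free = 1;			/* block 0 is used below */

struct device {
	int map[1];				/* virtual block -> data block */
};

/* Break sharing: copy-on-write for one virtual block of one device. */
static void cow_write(struct device *dev, int vblock, const char *buf)
{
	int old = dev->map[vblock];

	if (refcount[old] > 1) {		/* shared: allocate + copy */
		int fresh = next_free++;

		memcpy(data[fresh], data[old], BLOCK_SIZE);
		refcount[old]--;
		refcount[fresh] = 1;
		dev->map[vblock] = fresh;	/* only this device's tree changes */
	}
	snprintf(data[dev->map[vblock]], BLOCK_SIZE, "%s", buf);
}

int main(void)
{
	struct device origin = { { 0 } };
	struct device snap;

	snprintf(data[0], BLOCK_SIZE, "original");
	refcount[0] = 1;

	snap = origin;				/* take a "snapshot": share block 0 */
	refcount[snap.map[0]]++;

	cow_write(&origin, 0, "new data");	/* write to the origin breaks sharing */

	printf("origin sees: %s\n", data[origin.map[0]]);	/* "new data" */
	printf("snap sees:   %s\n", data[snap.map[0]]);		/* "original" */
	return 0;
}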
 109
 110/*----------------------------------------------------------------*/
 111
 112/*
 113 * Key building.
 114 */
 115enum lock_space {
 116	VIRTUAL,
 117	PHYSICAL
 118};
 119
 120static void build_key(struct dm_thin_device *td, enum lock_space ls,
 121		      dm_block_t b, dm_block_t e, struct dm_cell_key *key)
 122{
 123	key->virtual = (ls == VIRTUAL);
 124	key->dev = dm_thin_dev_id(td);
 125	key->block_begin = b;
 126	key->block_end = e;
 127}
 128
 129static void build_data_key(struct dm_thin_device *td, dm_block_t b,
 130			   struct dm_cell_key *key)
 131{
 132	build_key(td, PHYSICAL, b, b + 1llu, key);
 133}
 134
 135static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
 136			      struct dm_cell_key *key)
 137{
 138	build_key(td, VIRTUAL, b, b + 1llu, key);
 139}
 140
 141/*----------------------------------------------------------------*/
 142
 143#define THROTTLE_THRESHOLD (1 * HZ)
 144
 145struct throttle {
 146	struct rw_semaphore lock;
 147	unsigned long threshold;
 148	bool throttle_applied;
 149};
 150
 151static void throttle_init(struct throttle *t)
 152{
 153	init_rwsem(&t->lock);
 154	t->throttle_applied = false;
 155}
 156
 157static void throttle_work_start(struct throttle *t)
 158{
 159	t->threshold = jiffies + THROTTLE_THRESHOLD;
 160}
 161
 162static void throttle_work_update(struct throttle *t)
 163{
 164	if (!t->throttle_applied && jiffies > t->threshold) {
 165		down_write(&t->lock);
 166		t->throttle_applied = true;
 167	}
 168}
 169
 170static void throttle_work_complete(struct throttle *t)
 171{
 172	if (t->throttle_applied) {
 173		t->throttle_applied = false;
 174		up_write(&t->lock);
 175	}
 176}
 177
 178static void throttle_lock(struct throttle *t)
 179{
 180	down_read(&t->lock);
 181}
 182
 183static void throttle_unlock(struct throttle *t)
 184{
 185	up_read(&t->lock);
 186}
 187
 188/*----------------------------------------------------------------*/
 189
 190/*
 191 * A pool device ties together a metadata device and a data device.  It
 192 * also provides the interface for creating and destroying internal
 193 * devices.
 194 */
 195struct dm_thin_new_mapping;
 196
 197/*
 198 * The pool runs in various modes, ordered from least to most degraded so they can be compared.
 199 */
 200enum pool_mode {
 201	PM_WRITE,		/* metadata may be changed */
 202	PM_OUT_OF_DATA_SPACE,	/* metadata may be changed, though data may not be allocated */
 203
 204	/*
 205	 * Like READ_ONLY, except may switch back to WRITE on metadata resize. Reported as READ_ONLY.
 206	 */
 207	PM_OUT_OF_METADATA_SPACE,
 208	PM_READ_ONLY,		/* metadata may not be changed */
 209
 210	PM_FAIL,		/* all I/O fails */
 211};
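/*
 * Editor's note (illustrative, not part of this file): because the modes
 * above are ordered from least to most degraded, later code can compare
 * them numerically -- e.g. commit() below bails out once the mode is
 * PM_OUT_OF_METADATA_SPACE or worse.  A minimal standalone sketch of that
 * comparison style (DEMO_* names are invented):
 */
#include <stdio.h>

enum pool_mode_demo {
	DEMO_PM_WRITE,
	DEMO_PM_OUT_OF_DATA_SPACE,
	DEMO_PM_OUT_OF_METADATA_SPACE,
	DEMO_PM_READ_ONLY,
	DEMO_PM_FAIL,
};

static int metadata_changeable(enum pool_mode_demo m)
{
	/* same test commit() uses: anything from out-of-metadata-space
	 * onwards may no longer change metadata */
	return m < DEMO_PM_OUT_OF_METADATA_SPACE;
}

int main(void)
{
	printf("out-of-data-space can commit: %d\n",
	       metadata_changeable(DEMO_PM_OUT_OF_DATA_SPACE));	/* 1 */
	printf("read-only can commit:         %d\n",
	       metadata_changeable(DEMO_PM_READ_ONLY));			/* 0 */
	return 0;
}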
 212
 213struct pool_features {
 214	enum pool_mode mode;
 215
 216	bool zero_new_blocks:1;
 217	bool discard_enabled:1;
 218	bool discard_passdown:1;
 219	bool error_if_no_space:1;
 220};
 221
 222struct thin_c;
 223typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio);
 224typedef void (*process_cell_fn)(struct thin_c *tc, struct dm_bio_prison_cell *cell);
 225typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m);
 226
 227#define CELL_SORT_ARRAY_SIZE 8192
 228
 229struct pool {
 230	struct list_head list;
 231	struct dm_target *ti;	/* Only set if a pool target is bound */
 232
 233	struct mapped_device *pool_md;
 234	struct block_device *data_dev;
 235	struct block_device *md_dev;
 236	struct dm_pool_metadata *pmd;
 237
 238	dm_block_t low_water_blocks;
 239	uint32_t sectors_per_block;
 240	int sectors_per_block_shift;
 241
 242	struct pool_features pf;
 243	bool low_water_triggered:1;	/* A dm event has been sent */
 244	bool suspended:1;
 245	bool out_of_data_space:1;
 246
 247	struct dm_bio_prison *prison;
 248	struct dm_kcopyd_client *copier;
 249
 250	struct work_struct worker;
 251	struct workqueue_struct *wq;
 252	struct throttle throttle;
 253	struct delayed_work waker;
 254	struct delayed_work no_space_timeout;
 255
 256	unsigned long last_commit_jiffies;
 257	unsigned ref_count;
 258
 259	spinlock_t lock;
 260	struct bio_list deferred_flush_bios;
 261	struct bio_list deferred_flush_completions;
 262	struct list_head prepared_mappings;
 263	struct list_head prepared_discards;
 264	struct list_head prepared_discards_pt2;
 265	struct list_head active_thins;
 266
 267	struct dm_deferred_set *shared_read_ds;
 268	struct dm_deferred_set *all_io_ds;
 269
 270	struct dm_thin_new_mapping *next_mapping;
 271
 272	process_bio_fn process_bio;
 273	process_bio_fn process_discard;
 274
 275	process_cell_fn process_cell;
 276	process_cell_fn process_discard_cell;
 277
 278	process_mapping_fn process_prepared_mapping;
 279	process_mapping_fn process_prepared_discard;
 280	process_mapping_fn process_prepared_discard_pt2;
 281
 282	struct dm_bio_prison_cell **cell_sort_array;
 283
 284	mempool_t mapping_pool;
 285
 286	struct bio flush_bio;
 287};
 288
 289static void metadata_operation_failed(struct pool *pool, const char *op, int r);
 290
 291static enum pool_mode get_pool_mode(struct pool *pool)
 292{
 293	return pool->pf.mode;
 294}
 295
 296static void notify_of_pool_mode_change(struct pool *pool)
 297{
 298	const char *descs[] = {
 299		"write",
 300		"out-of-data-space",
 301		"read-only",
 302		"read-only",
 303		"fail"
 304	};
 305	const char *extra_desc = NULL;
 306	enum pool_mode mode = get_pool_mode(pool);
 307
 308	if (mode == PM_OUT_OF_DATA_SPACE) {
 309		if (!pool->pf.error_if_no_space)
 310			extra_desc = " (queue IO)";
 311		else
 312			extra_desc = " (error IO)";
 313	}
 314
 315	dm_table_event(pool->ti->table);
 316	DMINFO("%s: switching pool to %s%s mode",
 317	       dm_device_name(pool->pool_md),
 318	       descs[(int)mode], extra_desc ? : "");
 319}
 320
 321/*
 322 * Target context for a pool.
 323 */
 324struct pool_c {
 325	struct dm_target *ti;
 326	struct pool *pool;
 327	struct dm_dev *data_dev;
 328	struct dm_dev *metadata_dev;
 329	struct dm_target_callbacks callbacks;
 330
 331	dm_block_t low_water_blocks;
 332	struct pool_features requested_pf; /* Features requested during table load */
 333	struct pool_features adjusted_pf;  /* Features used after adjusting for constituent devices */
 334};
 335
 336/*
 337 * Target context for a thin.
 338 */
 339struct thin_c {
 340	struct list_head list;
 341	struct dm_dev *pool_dev;
 342	struct dm_dev *origin_dev;
 343	sector_t origin_size;
 344	dm_thin_id dev_id;
 345
 346	struct pool *pool;
 347	struct dm_thin_device *td;
 348	struct mapped_device *thin_md;
 349
 350	bool requeue_mode:1;
 351	spinlock_t lock;
 352	struct list_head deferred_cells;
 353	struct bio_list deferred_bio_list;
 354	struct bio_list retry_on_resume_list;
 355	struct rb_root sort_bio_list; /* sorted list of deferred bios */
 356
 357	/*
 358	 * Ensures the thin is not destroyed until the worker has finished
 359	 * iterating the active_thins list.
 360	 */
 361	refcount_t refcount;
 362	struct completion can_destroy;
 363};
 364
 365/*----------------------------------------------------------------*/
 366
 367static bool block_size_is_power_of_two(struct pool *pool)
 368{
 369	return pool->sectors_per_block_shift >= 0;
 370}
 371
 372static sector_t block_to_sectors(struct pool *pool, dm_block_t b)
 373{
 374	return block_size_is_power_of_two(pool) ?
 375		(b << pool->sectors_per_block_shift) :
 376		(b * pool->sectors_per_block);
 377}
 378
 379/*----------------------------------------------------------------*/
 380
 381struct discard_op {
 382	struct thin_c *tc;
 383	struct blk_plug plug;
 384	struct bio *parent_bio;
 385	struct bio *bio;
 386};
 387
 388static void begin_discard(struct discard_op *op, struct thin_c *tc, struct bio *parent)
 389{
 390	BUG_ON(!parent);
 391
 392	op->tc = tc;
 393	blk_start_plug(&op->plug);
 394	op->parent_bio = parent;
 395	op->bio = NULL;
 396}
 397
 398static int issue_discard(struct discard_op *op, dm_block_t data_b, dm_block_t data_e)
 399{
 400	struct thin_c *tc = op->tc;
 401	sector_t s = block_to_sectors(tc->pool, data_b);
 402	sector_t len = block_to_sectors(tc->pool, data_e - data_b);
 403
 404	return __blkdev_issue_discard(tc->pool_dev->bdev, s, len,
 405				      GFP_NOWAIT, 0, &op->bio);
 406}
 407
 408static void end_discard(struct discard_op *op, int r)
 409{
 410	if (op->bio) {
 411		/*
 412		 * Even if one of the calls to issue_discard failed, we
 413		 * need to wait for the chain to complete.
 414		 */
 415		bio_chain(op->bio, op->parent_bio);
 416		bio_set_op_attrs(op->bio, REQ_OP_DISCARD, 0);
 417		submit_bio(op->bio);
 418	}
 419
 420	blk_finish_plug(&op->plug);
 421
 422	/*
 423	 * Even if r is set, there could be sub discards in flight that we
 424	 * need to wait for.
 425	 */
 426	if (r && !op->parent_bio->bi_status)
 427		op->parent_bio->bi_status = errno_to_blk_status(r);
 428	bio_endio(op->parent_bio);
 429}
 430
 431/*----------------------------------------------------------------*/
 432
 433/*
 434 * wake_worker() is used when new work is queued and when pool_resume is
 435 * ready to continue deferred IO processing.
 436 */
 437static void wake_worker(struct pool *pool)
 438{
 439	queue_work(pool->wq, &pool->worker);
 440}
 441
 442/*----------------------------------------------------------------*/
 443
 444static int bio_detain(struct pool *pool, struct dm_cell_key *key, struct bio *bio,
 445		      struct dm_bio_prison_cell **cell_result)
 446{
 447	int r;
 448	struct dm_bio_prison_cell *cell_prealloc;
 449
 450	/*
 451	 * Allocate a cell from the prison's mempool.
 452	 * This might block but it can't fail.
 453	 */
 454	cell_prealloc = dm_bio_prison_alloc_cell(pool->prison, GFP_NOIO);
 455
 456	r = dm_bio_detain(pool->prison, key, bio, cell_prealloc, cell_result);
 457	if (r)
 458		/*
 459		 * We reused an old cell; we can get rid of
 460		 * the new one.
 461		 */
 462		dm_bio_prison_free_cell(pool->prison, cell_prealloc);
 463
 464	return r;
 465}
 466
 467static void cell_release(struct pool *pool,
 468			 struct dm_bio_prison_cell *cell,
 469			 struct bio_list *bios)
 470{
 471	dm_cell_release(pool->prison, cell, bios);
 472	dm_bio_prison_free_cell(pool->prison, cell);
 473}
 474
 475static void cell_visit_release(struct pool *pool,
 476			       void (*fn)(void *, struct dm_bio_prison_cell *),
 477			       void *context,
 478			       struct dm_bio_prison_cell *cell)
 479{
 480	dm_cell_visit_release(pool->prison, fn, context, cell);
 481	dm_bio_prison_free_cell(pool->prison, cell);
 482}
 483
 484static void cell_release_no_holder(struct pool *pool,
 485				   struct dm_bio_prison_cell *cell,
 486				   struct bio_list *bios)
 487{
 488	dm_cell_release_no_holder(pool->prison, cell, bios);
 489	dm_bio_prison_free_cell(pool->prison, cell);
 490}
 491
 492static void cell_error_with_code(struct pool *pool,
 493		struct dm_bio_prison_cell *cell, blk_status_t error_code)
 494{
 495	dm_cell_error(pool->prison, cell, error_code);
 496	dm_bio_prison_free_cell(pool->prison, cell);
 497}
 498
 499static blk_status_t get_pool_io_error_code(struct pool *pool)
 500{
 501	return pool->out_of_data_space ? BLK_STS_NOSPC : BLK_STS_IOERR;
 502}
 503
 504static void cell_error(struct pool *pool, struct dm_bio_prison_cell *cell)
 505{
 506	cell_error_with_code(pool, cell, get_pool_io_error_code(pool));
 507}
 508
 509static void cell_success(struct pool *pool, struct dm_bio_prison_cell *cell)
 510{
 511	cell_error_with_code(pool, cell, 0);
 512}
 513
 514static void cell_requeue(struct pool *pool, struct dm_bio_prison_cell *cell)
 515{
 516	cell_error_with_code(pool, cell, BLK_STS_DM_REQUEUE);
 517}
 518
 519/*----------------------------------------------------------------*/
 520
 521/*
 522 * A global list of pools that uses a struct mapped_device as a key.
 523 */
 524static struct dm_thin_pool_table {
 525	struct mutex mutex;
 526	struct list_head pools;
 527} dm_thin_pool_table;
 528
 529static void pool_table_init(void)
 530{
 531	mutex_init(&dm_thin_pool_table.mutex);
 532	INIT_LIST_HEAD(&dm_thin_pool_table.pools);
 533}
 534
 535static void pool_table_exit(void)
 536{
 537	mutex_destroy(&dm_thin_pool_table.mutex);
 538}
 539
 540static void __pool_table_insert(struct pool *pool)
 541{
 542	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
 543	list_add(&pool->list, &dm_thin_pool_table.pools);
 544}
 545
 546static void __pool_table_remove(struct pool *pool)
 547{
 548	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
 549	list_del(&pool->list);
 550}
 551
 552static struct pool *__pool_table_lookup(struct mapped_device *md)
 553{
 554	struct pool *pool = NULL, *tmp;
 555
 556	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
 557
 558	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
 559		if (tmp->pool_md == md) {
 560			pool = tmp;
 561			break;
 562		}
 563	}
 564
 565	return pool;
 566}
 567
 568static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev)
 569{
 570	struct pool *pool = NULL, *tmp;
 571
 572	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
 573
 574	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
 575		if (tmp->md_dev == md_dev) {
 576			pool = tmp;
 577			break;
 578		}
 579	}
 580
 581	return pool;
 582}
 583
 584/*----------------------------------------------------------------*/
 585
 586struct dm_thin_endio_hook {
 587	struct thin_c *tc;
 588	struct dm_deferred_entry *shared_read_entry;
 589	struct dm_deferred_entry *all_io_entry;
 590	struct dm_thin_new_mapping *overwrite_mapping;
 591	struct rb_node rb_node;
 592	struct dm_bio_prison_cell *cell;
 593};
 594
 595static void __merge_bio_list(struct bio_list *bios, struct bio_list *master)
 596{
 597	bio_list_merge(bios, master);
 598	bio_list_init(master);
 599}
 600
 601static void error_bio_list(struct bio_list *bios, blk_status_t error)
 602{
 603	struct bio *bio;
 604
 605	while ((bio = bio_list_pop(bios))) {
 606		bio->bi_status = error;
 607		bio_endio(bio);
 608	}
 609}
 610
 611static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master,
 612		blk_status_t error)
 613{
 614	struct bio_list bios;
 615
 616	bio_list_init(&bios);
 617
 618	spin_lock_irq(&tc->lock);
 619	__merge_bio_list(&bios, master);
 620	spin_unlock_irq(&tc->lock);
 621
 622	error_bio_list(&bios, error);
 623}
 624
 625static void requeue_deferred_cells(struct thin_c *tc)
 626{
 627	struct pool *pool = tc->pool;
 628	struct list_head cells;
 629	struct dm_bio_prison_cell *cell, *tmp;
 630
 631	INIT_LIST_HEAD(&cells);
 632
 633	spin_lock_irq(&tc->lock);
 634	list_splice_init(&tc->deferred_cells, &cells);
 635	spin_unlock_irq(&tc->lock);
 636
 637	list_for_each_entry_safe(cell, tmp, &cells, user_list)
 638		cell_requeue(pool, cell);
 639}
 640
 641static void requeue_io(struct thin_c *tc)
 642{
 643	struct bio_list bios;
 644
 645	bio_list_init(&bios);
 646
 647	spin_lock_irq(&tc->lock);
 648	__merge_bio_list(&bios, &tc->deferred_bio_list);
 649	__merge_bio_list(&bios, &tc->retry_on_resume_list);
 650	spin_unlock_irq(&tc->lock);
 651
 652	error_bio_list(&bios, BLK_STS_DM_REQUEUE);
 653	requeue_deferred_cells(tc);
 654}
 655
 656static void error_retry_list_with_code(struct pool *pool, blk_status_t error)
 657{
 658	struct thin_c *tc;
 659
 660	rcu_read_lock();
 661	list_for_each_entry_rcu(tc, &pool->active_thins, list)
 662		error_thin_bio_list(tc, &tc->retry_on_resume_list, error);
 663	rcu_read_unlock();
 664}
 665
 666static void error_retry_list(struct pool *pool)
 667{
 668	error_retry_list_with_code(pool, get_pool_io_error_code(pool));
 669}
 670
 671/*
 672 * This section of code contains the logic for processing a thin device's IO.
 673 * Much of the code depends on pool object resources (lists, workqueues, etc)
 674 * but most is exclusively called from the thin target rather than the thin-pool
 675 * target.
 676 */
 677
 678static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
 679{
 680	struct pool *pool = tc->pool;
 681	sector_t block_nr = bio->bi_iter.bi_sector;
 682
 683	if (block_size_is_power_of_two(pool))
 684		block_nr >>= pool->sectors_per_block_shift;
 685	else
 686		(void) sector_div(block_nr, pool->sectors_per_block);
 687
 688	return block_nr;
 689}
 690
 691/*
 692 * Returns the _complete_ blocks that this bio covers.
 693 */
 694static void get_bio_block_range(struct thin_c *tc, struct bio *bio,
 695				dm_block_t *begin, dm_block_t *end)
 696{
 697	struct pool *pool = tc->pool;
 698	sector_t b = bio->bi_iter.bi_sector;
 699	sector_t e = b + (bio->bi_iter.bi_size >> SECTOR_SHIFT);
 700
 701	b += pool->sectors_per_block - 1ull; /* so we round up */
 702
 703	if (block_size_is_power_of_two(pool)) {
 704		b >>= pool->sectors_per_block_shift;
 705		e >>= pool->sectors_per_block_shift;
 706	} else {
 707		(void) sector_div(b, pool->sectors_per_block);
 708		(void) sector_div(e, pool->sectors_per_block);
 709	}
 710
 711	if (e < b)
 712		/* Can happen if the bio is within a single block. */
 713		e = b;
 714
 715	*begin = b;
 716	*end = e;
 717}
 718
 719static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
 720{
 721	struct pool *pool = tc->pool;
 722	sector_t bi_sector = bio->bi_iter.bi_sector;
 723
 724	bio_set_dev(bio, tc->pool_dev->bdev);
 725	if (block_size_is_power_of_two(pool))
 726		bio->bi_iter.bi_sector =
 727			(block << pool->sectors_per_block_shift) |
 728			(bi_sector & (pool->sectors_per_block - 1));
 729	else
 730		bio->bi_iter.bi_sector = (block * pool->sectors_per_block) +
 731				 sector_div(bi_sector, pool->sectors_per_block);
 732}
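/*
 * Editor's sketch (user-space, not part of this file): get_bio_block(),
 * get_bio_block_range() and remap() above all share the same block/sector
 * arithmetic.  The range helper rounds the start up and the end down so
 * that only blocks the bio covers completely are returned.  A worked
 * example with 128-sector (64 KiB) blocks and made-up numbers:
 */
#include <stdio.h>

typedef unsigned long long sector_t;

int main(void)
{
	const sector_t spb = 128;	/* sectors_per_block */
	sector_t bi_sector = 300;	/* bio start */
	sector_t bi_sectors = 800;	/* bio length in sectors */

	/* get_bio_block(): the block the bio starts in */
	sector_t block = bi_sector / spb;			/* 300 / 128 = 2 */

	/* get_bio_block_range(): only _complete_ blocks */
	sector_t b = (bi_sector + spb - 1) / spb;		/* round up   -> 3 */
	sector_t e = (bi_sector + bi_sectors) / spb;		/* round down -> 8 */
	if (e < b)			/* bio entirely inside one block */
		e = b;

	/* remap(): pretend virtual block 2 maps to data block 5 */
	sector_t data_block = 5;
	sector_t remapped = data_block * spb + (bi_sector % spb);	/* 684 */

	printf("block=%llu complete=[%llu,%llu) remapped=%llu\n",
	       block, b, e, remapped);
	return 0;
}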
 733
 734static void remap_to_origin(struct thin_c *tc, struct bio *bio)
 735{
 736	bio_set_dev(bio, tc->origin_dev->bdev);
 737}
 738
 739static int bio_triggers_commit(struct thin_c *tc, struct bio *bio)
 740{
 741	return op_is_flush(bio->bi_opf) &&
 742		dm_thin_changed_this_transaction(tc->td);
 743}
 744
 745static void inc_all_io_entry(struct pool *pool, struct bio *bio)
 746{
 747	struct dm_thin_endio_hook *h;
 748
 749	if (bio_op(bio) == REQ_OP_DISCARD)
 750		return;
 751
 752	h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
 753	h->all_io_entry = dm_deferred_entry_inc(pool->all_io_ds);
 754}
 755
 756static void issue(struct thin_c *tc, struct bio *bio)
 757{
 758	struct pool *pool = tc->pool;
 759
 760	if (!bio_triggers_commit(tc, bio)) {
 761		generic_make_request(bio);
 762		return;
 763	}
 764
 765	/*
 766	 * Complete bio with an error if earlier I/O caused changes to
 767 * the metadata that can't be committed, e.g., due to I/O errors
 768	 * on the metadata device.
 769	 */
 770	if (dm_thin_aborted_changes(tc->td)) {
 771		bio_io_error(bio);
 772		return;
 773	}
 774
 775	/*
 776	 * Batch together any bios that trigger commits and then issue a
 777	 * single commit for them in process_deferred_bios().
 778	 */
 779	spin_lock_irq(&pool->lock);
 780	bio_list_add(&pool->deferred_flush_bios, bio);
 781	spin_unlock_irq(&pool->lock);
 782}
 783
 784static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
 785{
 786	remap_to_origin(tc, bio);
 787	issue(tc, bio);
 788}
 789
 790static void remap_and_issue(struct thin_c *tc, struct bio *bio,
 791			    dm_block_t block)
 792{
 793	remap(tc, bio, block);
 794	issue(tc, bio);
 795}
 796
 797/*----------------------------------------------------------------*/
 798
 799/*
 800 * Bio endio functions.
 801 */
 802struct dm_thin_new_mapping {
 803	struct list_head list;
 804
 805	bool pass_discard:1;
 806	bool maybe_shared:1;
 807
 808	/*
 809	 * Track quiescing, copying and zeroing preparation actions.  When this
 810	 * counter hits zero the block is prepared and can be inserted into the
 811	 * btree.
 812	 */
 813	atomic_t prepare_actions;
 814
 815	blk_status_t status;
 816	struct thin_c *tc;
 817	dm_block_t virt_begin, virt_end;
 818	dm_block_t data_block;
 819	struct dm_bio_prison_cell *cell;
 820
 821	/*
 822	 * If the bio covers the whole area of a block then we can avoid
 823	 * zeroing or copying.  Instead this bio is hooked.  The bio will
 824	 * still be in the cell, so care has to be taken to avoid issuing
 825	 * the bio twice.
 826	 */
 827	struct bio *bio;
 828	bio_end_io_t *saved_bi_end_io;
 829};
 830
 831static void __complete_mapping_preparation(struct dm_thin_new_mapping *m)
 832{
 833	struct pool *pool = m->tc->pool;
 834
 835	if (atomic_dec_and_test(&m->prepare_actions)) {
 836		list_add_tail(&m->list, &pool->prepared_mappings);
 837		wake_worker(pool);
 838	}
 839}
 840
 841static void complete_mapping_preparation(struct dm_thin_new_mapping *m)
 842{
 843	unsigned long flags;
 844	struct pool *pool = m->tc->pool;
 845
 846	spin_lock_irqsave(&pool->lock, flags);
 847	__complete_mapping_preparation(m);
 848	spin_unlock_irqrestore(&pool->lock, flags);
 849}
 850
 851static void copy_complete(int read_err, unsigned long write_err, void *context)
 852{
 853	struct dm_thin_new_mapping *m = context;
 854
 855	m->status = read_err || write_err ? BLK_STS_IOERR : 0;
 856	complete_mapping_preparation(m);
 857}
 858
 859static void overwrite_endio(struct bio *bio)
 860{
 861	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
 862	struct dm_thin_new_mapping *m = h->overwrite_mapping;
 863
 864	bio->bi_end_io = m->saved_bi_end_io;
 865
 866	m->status = bio->bi_status;
 867	complete_mapping_preparation(m);
 868}
 869
 870/*----------------------------------------------------------------*/
 871
 872/*
 873 * Workqueue.
 874 */
 875
 876/*
 877 * Prepared mapping jobs.
 878 */
 879
 880/*
 881 * This sends the bios in the cell, except the original holder, back
 882 * to the deferred_bios list.
 883 */
 884static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell)
 885{
 886	struct pool *pool = tc->pool;
 887	unsigned long flags;
 888	int has_work;
 889
 890	spin_lock_irqsave(&tc->lock, flags);
 891	cell_release_no_holder(pool, cell, &tc->deferred_bio_list);
 892	has_work = !bio_list_empty(&tc->deferred_bio_list);
 893	spin_unlock_irqrestore(&tc->lock, flags);
 894
 895	if (has_work)
 896		wake_worker(pool);
 897}
 898
 899static void thin_defer_bio(struct thin_c *tc, struct bio *bio);
 900
 901struct remap_info {
 902	struct thin_c *tc;
 903	struct bio_list defer_bios;
 904	struct bio_list issue_bios;
 905};
 906
 907static void __inc_remap_and_issue_cell(void *context,
 908				       struct dm_bio_prison_cell *cell)
 909{
 910	struct remap_info *info = context;
 911	struct bio *bio;
 912
 913	while ((bio = bio_list_pop(&cell->bios))) {
 914		if (op_is_flush(bio->bi_opf) || bio_op(bio) == REQ_OP_DISCARD)
 915			bio_list_add(&info->defer_bios, bio);
 916		else {
 917			inc_all_io_entry(info->tc->pool, bio);
 918
 919			/*
 920			 * We can't issue the bios with the bio prison lock
 921			 * held, so we add them to a list to issue on
 922			 * return from this function.
 923			 */
 924			bio_list_add(&info->issue_bios, bio);
 925		}
 926	}
 927}
 928
 929static void inc_remap_and_issue_cell(struct thin_c *tc,
 930				     struct dm_bio_prison_cell *cell,
 931				     dm_block_t block)
 932{
 933	struct bio *bio;
 934	struct remap_info info;
 935
 936	info.tc = tc;
 937	bio_list_init(&info.defer_bios);
 938	bio_list_init(&info.issue_bios);
 939
 940	/*
 941	 * We have to be careful to inc any bios we're about to issue
 942	 * before the cell is released, and avoid a race with new bios
 943	 * being added to the cell.
 944	 */
 945	cell_visit_release(tc->pool, __inc_remap_and_issue_cell,
 946			   &info, cell);
 947
 948	while ((bio = bio_list_pop(&info.defer_bios)))
 949		thin_defer_bio(tc, bio);
 950
 951	while ((bio = bio_list_pop(&info.issue_bios)))
 952		remap_and_issue(info.tc, bio, block);
 953}
 954
 955static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
 956{
 957	cell_error(m->tc->pool, m->cell);
 958	list_del(&m->list);
 959	mempool_free(m, &m->tc->pool->mapping_pool);
 960}
 961
 962static void complete_overwrite_bio(struct thin_c *tc, struct bio *bio)
 963{
 964	struct pool *pool = tc->pool;
 965
 966	/*
 967	 * If the bio has the REQ_FUA flag set we must commit the metadata
 968	 * before signaling its completion.
 969	 */
 970	if (!bio_triggers_commit(tc, bio)) {
 971		bio_endio(bio);
 972		return;
 973	}
 974
 975	/*
 976	 * Complete bio with an error if earlier I/O caused changes to the
 977 * metadata that can't be committed, e.g., due to I/O errors on the
 978	 * metadata device.
 979	 */
 980	if (dm_thin_aborted_changes(tc->td)) {
 981		bio_io_error(bio);
 982		return;
 983	}
 984
 985	/*
 986	 * Batch together any bios that trigger commits and then issue a
 987	 * single commit for them in process_deferred_bios().
 988	 */
 989	spin_lock_irq(&pool->lock);
 990	bio_list_add(&pool->deferred_flush_completions, bio);
 991	spin_unlock_irq(&pool->lock);
 992}
 993
 994static void process_prepared_mapping(struct dm_thin_new_mapping *m)
 995{
 996	struct thin_c *tc = m->tc;
 997	struct pool *pool = tc->pool;
 998	struct bio *bio = m->bio;
 999	int r;
1000
1001	if (m->status) {
1002		cell_error(pool, m->cell);
1003		goto out;
1004	}
1005
1006	/*
1007	 * Commit the prepared block into the mapping btree.
1008	 * Any I/O for this block arriving after this point will get
1009	 * remapped to it directly.
1010	 */
1011	r = dm_thin_insert_block(tc->td, m->virt_begin, m->data_block);
1012	if (r) {
1013		metadata_operation_failed(pool, "dm_thin_insert_block", r);
1014		cell_error(pool, m->cell);
1015		goto out;
1016	}
1017
1018	/*
1019	 * Release any bios held while the block was being provisioned.
1020	 * If we are processing a write bio that completely covers the block,
1021	 * we already processed it so can ignore it now when processing
1022	 * the bios in the cell.
1023	 */
1024	if (bio) {
1025		inc_remap_and_issue_cell(tc, m->cell, m->data_block);
1026		complete_overwrite_bio(tc, bio);
1027	} else {
1028		inc_all_io_entry(tc->pool, m->cell->holder);
1029		remap_and_issue(tc, m->cell->holder, m->data_block);
1030		inc_remap_and_issue_cell(tc, m->cell, m->data_block);
1031	}
1032
1033out:
1034	list_del(&m->list);
1035	mempool_free(m, &pool->mapping_pool);
1036}
1037
1038/*----------------------------------------------------------------*/
1039
1040static void free_discard_mapping(struct dm_thin_new_mapping *m)
1041{
1042	struct thin_c *tc = m->tc;
1043	if (m->cell)
1044		cell_defer_no_holder(tc, m->cell);
1045	mempool_free(m, &tc->pool->mapping_pool);
1046}
1047
1048static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
1049{
1050	bio_io_error(m->bio);
1051	free_discard_mapping(m);
1052}
1053
1054static void process_prepared_discard_success(struct dm_thin_new_mapping *m)
1055{
1056	bio_endio(m->bio);
1057	free_discard_mapping(m);
1058}
1059
1060static void process_prepared_discard_no_passdown(struct dm_thin_new_mapping *m)
1061{
1062	int r;
1063	struct thin_c *tc = m->tc;
1064
1065	r = dm_thin_remove_range(tc->td, m->cell->key.block_begin, m->cell->key.block_end);
1066	if (r) {
1067		metadata_operation_failed(tc->pool, "dm_thin_remove_range", r);
1068		bio_io_error(m->bio);
1069	} else
1070		bio_endio(m->bio);
1071
1072	cell_defer_no_holder(tc, m->cell);
1073	mempool_free(m, &tc->pool->mapping_pool);
1074}
1075
1076/*----------------------------------------------------------------*/
1077
1078static void passdown_double_checking_shared_status(struct dm_thin_new_mapping *m,
1079						   struct bio *discard_parent)
1080{
1081	/*
1082	 * We've already unmapped this range of blocks, but before we
1083	 * passdown we have to check that these blocks are now unused.
1084	 */
1085	int r = 0;
1086	bool shared = true;
1087	struct thin_c *tc = m->tc;
1088	struct pool *pool = tc->pool;
1089	dm_block_t b = m->data_block, e, end = m->data_block + m->virt_end - m->virt_begin;
1090	struct discard_op op;
1091
1092	begin_discard(&op, tc, discard_parent);
1093	while (b != end) {
1094		/* find start of unmapped run */
1095		for (; b < end; b++) {
1096			r = dm_pool_block_is_shared(pool->pmd, b, &shared);
1097			if (r)
1098				goto out;
1099
1100			if (!shared)
1101				break;
1102		}
1103
1104		if (b == end)
1105			break;
1106
1107		/* find end of run */
1108		for (e = b + 1; e != end; e++) {
1109			r = dm_pool_block_is_shared(pool->pmd, e, &shared);
1110			if (r)
1111				goto out;
1112
1113			if (shared)
1114				break;
1115		}
1116
1117		r = issue_discard(&op, b, e);
1118		if (r)
1119			goto out;
1120
1121		b = e;
1122	}
1123out:
1124	end_discard(&op, r);
1125}
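/*
 * Editor's sketch (user-space, not part of this file): the loop above scans
 * [data_block, end) for maximal runs of blocks that are no longer shared
 * and issues one discard per run.  The same run-finding logic over a plain
 * array, with shared[] standing in for dm_pool_block_is_shared():
 */
#include <stdio.h>
#include <stdbool.h>

int main(void)
{
	/* true = still referenced elsewhere, false = safe to pass the discard down */
	bool shared[] = { true, false, false, true, false, false, false, true };
	unsigned b = 0, e, end = sizeof(shared) / sizeof(shared[0]);

	while (b != end) {
		while (b < end && shared[b])	/* find start of unshared run */
			b++;
		if (b == end)
			break;

		for (e = b + 1; e != end && !shared[e]; e++)	/* find end of run */
			;

		printf("discard blocks [%u, %u)\n", b, e);	/* [1,3) then [4,7) */
		b = e;
	}
	return 0;
}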
1126
1127static void queue_passdown_pt2(struct dm_thin_new_mapping *m)
1128{
1129	unsigned long flags;
1130	struct pool *pool = m->tc->pool;
1131
1132	spin_lock_irqsave(&pool->lock, flags);
1133	list_add_tail(&m->list, &pool->prepared_discards_pt2);
1134	spin_unlock_irqrestore(&pool->lock, flags);
1135	wake_worker(pool);
1136}
1137
1138static void passdown_endio(struct bio *bio)
1139{
1140	/*
1141	 * It doesn't matter if the passdown discard failed, we still want
1142	 * to unmap (we ignore err).
1143	 */
1144	queue_passdown_pt2(bio->bi_private);
1145	bio_put(bio);
1146}
1147
1148static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m)
1149{
1150	int r;
1151	struct thin_c *tc = m->tc;
1152	struct pool *pool = tc->pool;
1153	struct bio *discard_parent;
1154	dm_block_t data_end = m->data_block + (m->virt_end - m->virt_begin);
1155
1156	/*
1157	 * Only this thread allocates blocks, so we can be sure that the
1158	 * newly unmapped blocks will not be allocated before the end of
1159	 * the function.
1160	 */
1161	r = dm_thin_remove_range(tc->td, m->virt_begin, m->virt_end);
1162	if (r) {
1163		metadata_operation_failed(pool, "dm_thin_remove_range", r);
1164		bio_io_error(m->bio);
1165		cell_defer_no_holder(tc, m->cell);
1166		mempool_free(m, &pool->mapping_pool);
1167		return;
1168	}
1169
1170	/*
1171	 * Increment the unmapped blocks.  This prevents a race between the
1172	 * passdown io and reallocation of freed blocks.
1173	 */
1174	r = dm_pool_inc_data_range(pool->pmd, m->data_block, data_end);
1175	if (r) {
1176		metadata_operation_failed(pool, "dm_pool_inc_data_range", r);
1177		bio_io_error(m->bio);
1178		cell_defer_no_holder(tc, m->cell);
1179		mempool_free(m, &pool->mapping_pool);
1180		return;
1181	}
1182
1183	discard_parent = bio_alloc(GFP_NOIO, 1);
1184	if (!discard_parent) {
1185		DMWARN("%s: unable to allocate top level discard bio for passdown. Skipping passdown.",
1186		       dm_device_name(tc->pool->pool_md));
1187		queue_passdown_pt2(m);
1188
1189	} else {
1190		discard_parent->bi_end_io = passdown_endio;
1191		discard_parent->bi_private = m;
1192
1193		if (m->maybe_shared)
1194			passdown_double_checking_shared_status(m, discard_parent);
1195		else {
1196			struct discard_op op;
1197
1198			begin_discard(&op, tc, discard_parent);
1199			r = issue_discard(&op, m->data_block, data_end);
1200			end_discard(&op, r);
1201		}
1202	}
1203}
1204
1205static void process_prepared_discard_passdown_pt2(struct dm_thin_new_mapping *m)
1206{
1207	int r;
1208	struct thin_c *tc = m->tc;
1209	struct pool *pool = tc->pool;
1210
1211	/*
1212	 * The passdown has completed, so now we can decrement all those
1213	 * unmapped blocks.
1214	 */
1215	r = dm_pool_dec_data_range(pool->pmd, m->data_block,
1216				   m->data_block + (m->virt_end - m->virt_begin));
1217	if (r) {
1218		metadata_operation_failed(pool, "dm_pool_dec_data_range", r);
1219		bio_io_error(m->bio);
1220	} else
1221		bio_endio(m->bio);
1222
1223	cell_defer_no_holder(tc, m->cell);
1224	mempool_free(m, &pool->mapping_pool);
1225}
1226
1227static void process_prepared(struct pool *pool, struct list_head *head,
1228			     process_mapping_fn *fn)
1229{
1230	struct list_head maps;
1231	struct dm_thin_new_mapping *m, *tmp;
1232
1233	INIT_LIST_HEAD(&maps);
1234	spin_lock_irq(&pool->lock);
1235	list_splice_init(head, &maps);
1236	spin_unlock_irq(&pool->lock);
1237
1238	list_for_each_entry_safe(m, tmp, &maps, list)
1239		(*fn)(m);
1240}
1241
1242/*
1243 * Deferred bio jobs.
1244 */
1245static int io_overlaps_block(struct pool *pool, struct bio *bio)
1246{
1247	return bio->bi_iter.bi_size ==
1248		(pool->sectors_per_block << SECTOR_SHIFT);
1249}
1250
1251static int io_overwrites_block(struct pool *pool, struct bio *bio)
1252{
1253	return (bio_data_dir(bio) == WRITE) &&
1254		io_overlaps_block(pool, bio);
1255}
1256
1257static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
1258			       bio_end_io_t *fn)
1259{
1260	*save = bio->bi_end_io;
1261	bio->bi_end_io = fn;
1262}
1263
1264static int ensure_next_mapping(struct pool *pool)
1265{
1266	if (pool->next_mapping)
1267		return 0;
1268
1269	pool->next_mapping = mempool_alloc(&pool->mapping_pool, GFP_ATOMIC);
1270
1271	return pool->next_mapping ? 0 : -ENOMEM;
1272}
1273
1274static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool)
1275{
1276	struct dm_thin_new_mapping *m = pool->next_mapping;
1277
1278	BUG_ON(!pool->next_mapping);
1279
1280	memset(m, 0, sizeof(struct dm_thin_new_mapping));
1281	INIT_LIST_HEAD(&m->list);
1282	m->bio = NULL;
1283
1284	pool->next_mapping = NULL;
1285
1286	return m;
1287}
1288
1289static void ll_zero(struct thin_c *tc, struct dm_thin_new_mapping *m,
1290		    sector_t begin, sector_t end)
1291{
1292	struct dm_io_region to;
1293
1294	to.bdev = tc->pool_dev->bdev;
1295	to.sector = begin;
1296	to.count = end - begin;
1297
1298	dm_kcopyd_zero(tc->pool->copier, 1, &to, 0, copy_complete, m);
1299}
1300
1301static void remap_and_issue_overwrite(struct thin_c *tc, struct bio *bio,
1302				      dm_block_t data_begin,
1303				      struct dm_thin_new_mapping *m)
1304{
1305	struct pool *pool = tc->pool;
1306	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1307
1308	h->overwrite_mapping = m;
1309	m->bio = bio;
1310	save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
1311	inc_all_io_entry(pool, bio);
1312	remap_and_issue(tc, bio, data_begin);
1313}
1314
1315/*
1316 * A partial copy also needs to zero the uncopied region.
1317 */
1318static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
1319			  struct dm_dev *origin, dm_block_t data_origin,
1320			  dm_block_t data_dest,
1321			  struct dm_bio_prison_cell *cell, struct bio *bio,
1322			  sector_t len)
1323{
1324	struct pool *pool = tc->pool;
1325	struct dm_thin_new_mapping *m = get_next_mapping(pool);
1326
1327	m->tc = tc;
1328	m->virt_begin = virt_block;
1329	m->virt_end = virt_block + 1u;
1330	m->data_block = data_dest;
1331	m->cell = cell;
1332
1333	/*
1334	 * quiesce action + copy action + an extra reference held for the
1335	 * duration of this function (we may need to inc later for a
1336	 * partial zero).
1337	 */
1338	atomic_set(&m->prepare_actions, 3);
1339
1340	if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list))
1341		complete_mapping_preparation(m); /* already quiesced */
1342
1343	/*
1344	 * IO to pool_dev remaps to the pool target's data_dev.
1345	 *
1346	 * If the whole block of data is being overwritten, we can issue the
1347	 * bio immediately. Otherwise we use kcopyd to clone the data first.
1348	 */
1349	if (io_overwrites_block(pool, bio))
1350		remap_and_issue_overwrite(tc, bio, data_dest, m);
1351	else {
1352		struct dm_io_region from, to;
1353
1354		from.bdev = origin->bdev;
1355		from.sector = data_origin * pool->sectors_per_block;
1356		from.count = len;
1357
1358		to.bdev = tc->pool_dev->bdev;
1359		to.sector = data_dest * pool->sectors_per_block;
1360		to.count = len;
1361
1362		dm_kcopyd_copy(pool->copier, &from, 1, &to,
1363			       0, copy_complete, m);
1364
1365		/*
1366		 * Do we need to zero a tail region?
1367		 */
1368		if (len < pool->sectors_per_block && pool->pf.zero_new_blocks) {
1369			atomic_inc(&m->prepare_actions);
1370			ll_zero(tc, m,
1371				data_dest * pool->sectors_per_block + len,
1372				(data_dest + 1) * pool->sectors_per_block);
1373		}
1374	}
1375
1376	complete_mapping_preparation(m); /* drop our ref */
1377}
1378
1379static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
1380				   dm_block_t data_origin, dm_block_t data_dest,
1381				   struct dm_bio_prison_cell *cell, struct bio *bio)
1382{
1383	schedule_copy(tc, virt_block, tc->pool_dev,
1384		      data_origin, data_dest, cell, bio,
1385		      tc->pool->sectors_per_block);
1386}
1387
1388static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
1389			  dm_block_t data_block, struct dm_bio_prison_cell *cell,
1390			  struct bio *bio)
1391{
1392	struct pool *pool = tc->pool;
1393	struct dm_thin_new_mapping *m = get_next_mapping(pool);
1394
1395	atomic_set(&m->prepare_actions, 1); /* no need to quiesce */
1396	m->tc = tc;
1397	m->virt_begin = virt_block;
1398	m->virt_end = virt_block + 1u;
1399	m->data_block = data_block;
1400	m->cell = cell;
1401
1402	/*
1403	 * If the whole block of data is being overwritten or we are not
1404	 * zeroing pre-existing data, we can issue the bio immediately.
1405	 * Otherwise we use kcopyd to zero the data first.
1406	 */
1407	if (pool->pf.zero_new_blocks) {
1408		if (io_overwrites_block(pool, bio))
1409			remap_and_issue_overwrite(tc, bio, data_block, m);
1410		else
1411			ll_zero(tc, m, data_block * pool->sectors_per_block,
1412				(data_block + 1) * pool->sectors_per_block);
1413	} else
1414		process_prepared_mapping(m);
1415}
1416
1417static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
1418				   dm_block_t data_dest,
1419				   struct dm_bio_prison_cell *cell, struct bio *bio)
1420{
1421	struct pool *pool = tc->pool;
1422	sector_t virt_block_begin = virt_block * pool->sectors_per_block;
1423	sector_t virt_block_end = (virt_block + 1) * pool->sectors_per_block;
1424
1425	if (virt_block_end <= tc->origin_size)
1426		schedule_copy(tc, virt_block, tc->origin_dev,
1427			      virt_block, data_dest, cell, bio,
1428			      pool->sectors_per_block);
1429
1430	else if (virt_block_begin < tc->origin_size)
1431		schedule_copy(tc, virt_block, tc->origin_dev,
1432			      virt_block, data_dest, cell, bio,
1433			      tc->origin_size - virt_block_begin);
1434
1435	else
1436		schedule_zero(tc, virt_block, data_dest, cell, bio);
1437}
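/*
 * Editor's sketch (user-space, not part of this file): the three branches
 * above depend on where the block sits relative to the external origin's
 * size: fully inside it (copy a whole block), straddling the end (copy the
 * tail of the origin, zero the rest), or past it (just zero).  With
 * made-up numbers, 128-sector blocks and a 300-sector origin:
 */
#include <stdio.h>

typedef unsigned long long sector_t;

int main(void)
{
	const sector_t spb = 128;	/* sectors_per_block */
	const sector_t origin_size = 300;
	sector_t virt_block;

	for (virt_block = 0; virt_block < 4; virt_block++) {
		sector_t begin = virt_block * spb;
		sector_t end = begin + spb;

		if (end <= origin_size)
			printf("block %llu: copy %llu sectors\n", virt_block, spb);
		else if (begin < origin_size)
			printf("block %llu: copy %llu sectors, zero the tail\n",
			       virt_block, origin_size - begin);
		else
			printf("block %llu: no origin data, zero it\n", virt_block);
	}
	return 0;
}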
1438
1439static void set_pool_mode(struct pool *pool, enum pool_mode new_mode);
1440
1441static void requeue_bios(struct pool *pool);
1442
1443static bool is_read_only_pool_mode(enum pool_mode mode)
1444{
1445	return (mode == PM_OUT_OF_METADATA_SPACE || mode == PM_READ_ONLY);
1446}
1447
1448static bool is_read_only(struct pool *pool)
1449{
1450	return is_read_only_pool_mode(get_pool_mode(pool));
1451}
1452
1453static void check_for_metadata_space(struct pool *pool)
1454{
1455	int r;
1456	const char *ooms_reason = NULL;
1457	dm_block_t nr_free;
1458
1459	r = dm_pool_get_free_metadata_block_count(pool->pmd, &nr_free);
1460	if (r)
1461		ooms_reason = "Could not get free metadata blocks";
1462	else if (!nr_free)
1463		ooms_reason = "No free metadata blocks";
1464
1465	if (ooms_reason && !is_read_only(pool)) {
1466		DMERR("%s", ooms_reason);
1467		set_pool_mode(pool, PM_OUT_OF_METADATA_SPACE);
1468	}
1469}
1470
1471static void check_for_data_space(struct pool *pool)
1472{
1473	int r;
1474	dm_block_t nr_free;
1475
1476	if (get_pool_mode(pool) != PM_OUT_OF_DATA_SPACE)
1477		return;
1478
1479	r = dm_pool_get_free_block_count(pool->pmd, &nr_free);
1480	if (r)
1481		return;
1482
1483	if (nr_free) {
1484		set_pool_mode(pool, PM_WRITE);
1485		requeue_bios(pool);
1486	}
1487}
1488
1489/*
1490 * A non-zero return indicates read_only or fail_io mode.
1491 * Many callers don't care about the return value.
1492 */
1493static int commit(struct pool *pool)
1494{
1495	int r;
1496
1497	if (get_pool_mode(pool) >= PM_OUT_OF_METADATA_SPACE)
1498		return -EINVAL;
1499
1500	r = dm_pool_commit_metadata(pool->pmd);
1501	if (r)
1502		metadata_operation_failed(pool, "dm_pool_commit_metadata", r);
1503	else {
1504		check_for_metadata_space(pool);
1505		check_for_data_space(pool);
1506	}
1507
1508	return r;
1509}
1510
1511static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks)
1512{
1513	if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
1514		DMWARN("%s: reached low water mark for data device: sending event.",
1515		       dm_device_name(pool->pool_md));
1516		spin_lock_irq(&pool->lock);
1517		pool->low_water_triggered = true;
1518		spin_unlock_irq(&pool->lock);
1519		dm_table_event(pool->ti->table);
1520	}
1521}
1522
1523static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
1524{
1525	int r;
1526	dm_block_t free_blocks;
1527	struct pool *pool = tc->pool;
1528
1529	if (WARN_ON(get_pool_mode(pool) != PM_WRITE))
1530		return -EINVAL;
1531
1532	r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
1533	if (r) {
1534		metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
1535		return r;
1536	}
1537
1538	check_low_water_mark(pool, free_blocks);
1539
1540	if (!free_blocks) {
1541		/*
1542		 * Try to commit to see if that will free up some
1543		 * more space.
1544		 */
1545		r = commit(pool);
1546		if (r)
1547			return r;
1548
1549		r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
1550		if (r) {
1551			metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
1552			return r;
1553		}
1554
1555		if (!free_blocks) {
1556			set_pool_mode(pool, PM_OUT_OF_DATA_SPACE);
1557			return -ENOSPC;
1558		}
1559	}
1560
1561	r = dm_pool_alloc_data_block(pool->pmd, result);
1562	if (r) {
1563		if (r == -ENOSPC)
1564			set_pool_mode(pool, PM_OUT_OF_DATA_SPACE);
1565		else
1566			metadata_operation_failed(pool, "dm_pool_alloc_data_block", r);
1567		return r;
1568	}
1569
1570	r = dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks);
1571	if (r) {
1572		metadata_operation_failed(pool, "dm_pool_get_free_metadata_block_count", r);
1573		return r;
1574	}
1575
1576	if (!free_blocks) {
1577		/* Let's commit before we use up the metadata reserve. */
1578		r = commit(pool);
1579		if (r)
1580			return r;
1581	}
1582
1583	return 0;
1584}
1585
1586/*
1587 * If we have run out of space, queue bios until the device is
1588 * resumed, presumably after having been reloaded with more space.
1589 */
1590static void retry_on_resume(struct bio *bio)
1591{
1592	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1593	struct thin_c *tc = h->tc;
1594
1595	spin_lock_irq(&tc->lock);
1596	bio_list_add(&tc->retry_on_resume_list, bio);
1597	spin_unlock_irq(&tc->lock);
1598}
1599
1600static blk_status_t should_error_unserviceable_bio(struct pool *pool)
1601{
1602	enum pool_mode m = get_pool_mode(pool);
1603
1604	switch (m) {
1605	case PM_WRITE:
1606		/* Shouldn't get here */
1607		DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode");
1608		return BLK_STS_IOERR;
1609
1610	case PM_OUT_OF_DATA_SPACE:
1611		return pool->pf.error_if_no_space ? BLK_STS_NOSPC : 0;
1612
1613	case PM_OUT_OF_METADATA_SPACE:
1614	case PM_READ_ONLY:
1615	case PM_FAIL:
1616		return BLK_STS_IOERR;
1617	default:
1618		/* Shouldn't get here */
1619		DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode");
1620		return BLK_STS_IOERR;
1621	}
1622}
1623
1624static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
1625{
1626	blk_status_t error = should_error_unserviceable_bio(pool);
1627
1628	if (error) {
1629		bio->bi_status = error;
1630		bio_endio(bio);
1631	} else
1632		retry_on_resume(bio);
1633}
1634
1635static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *cell)
1636{
1637	struct bio *bio;
1638	struct bio_list bios;
1639	blk_status_t error;
1640
1641	error = should_error_unserviceable_bio(pool);
1642	if (error) {
1643		cell_error_with_code(pool, cell, error);
1644		return;
1645	}
1646
1647	bio_list_init(&bios);
1648	cell_release(pool, cell, &bios);
1649
1650	while ((bio = bio_list_pop(&bios)))
1651		retry_on_resume(bio);
1652}
1653
1654static void process_discard_cell_no_passdown(struct thin_c *tc,
1655					     struct dm_bio_prison_cell *virt_cell)
1656{
1657	struct pool *pool = tc->pool;
1658	struct dm_thin_new_mapping *m = get_next_mapping(pool);
1659
1660	/*
1661	 * We don't need to lock the data blocks, since there's no
1662	 * passdown.  We only lock data blocks for allocation and breaking sharing.
1663	 */
1664	m->tc = tc;
1665	m->virt_begin = virt_cell->key.block_begin;
1666	m->virt_end = virt_cell->key.block_end;
1667	m->cell = virt_cell;
1668	m->bio = virt_cell->holder;
1669
1670	if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
1671		pool->process_prepared_discard(m);
1672}
1673
1674static void break_up_discard_bio(struct thin_c *tc, dm_block_t begin, dm_block_t end,
1675				 struct bio *bio)
1676{
1677	struct pool *pool = tc->pool;
1678
1679	int r;
1680	bool maybe_shared;
1681	struct dm_cell_key data_key;
1682	struct dm_bio_prison_cell *data_cell;
1683	struct dm_thin_new_mapping *m;
1684	dm_block_t virt_begin, virt_end, data_begin;
1685
1686	while (begin != end) {
1687		r = ensure_next_mapping(pool);
1688		if (r)
1689			/* we did our best */
1690			return;
1691
1692		r = dm_thin_find_mapped_range(tc->td, begin, end, &virt_begin, &virt_end,
1693					      &data_begin, &maybe_shared);
1694		if (r)
1695			/*
1696			 * Silently fail, letting any mappings we've
1697			 * created complete.
1698			 */
1699			break;
1700
1701		build_key(tc->td, PHYSICAL, data_begin, data_begin + (virt_end - virt_begin), &data_key);
1702		if (bio_detain(tc->pool, &data_key, NULL, &data_cell)) {
1703			/* contention, we'll give up with this range */
1704			begin = virt_end;
1705			continue;
1706		}
1707
1708		/*
1709		 * IO may still be going to the destination block.  We must
1710		 * quiesce before we can do the removal.
1711		 */
1712		m = get_next_mapping(pool);
1713		m->tc = tc;
1714		m->maybe_shared = maybe_shared;
1715		m->virt_begin = virt_begin;
1716		m->virt_end = virt_end;
1717		m->data_block = data_begin;
1718		m->cell = data_cell;
1719		m->bio = bio;
1720
1721		/*
1722		 * The parent bio must not complete before sub discard bios are
1723		 * chained to it (see end_discard's bio_chain)!
1724		 *
1725		 * This per-mapping bi_remaining increment is paired with
1726		 * the implicit decrement that occurs via bio_endio() in
1727		 * end_discard().
1728		 */
1729		bio_inc_remaining(bio);
1730		if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
1731			pool->process_prepared_discard(m);
1732
1733		begin = virt_end;
1734	}
1735}
1736
1737static void process_discard_cell_passdown(struct thin_c *tc, struct dm_bio_prison_cell *virt_cell)
1738{
1739	struct bio *bio = virt_cell->holder;
1740	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1741
1742	/*
1743	 * The virt_cell will only get freed once the origin bio completes.
1744	 * This means it will remain locked while all the individual
1745	 * passdown bios are in flight.
1746	 */
1747	h->cell = virt_cell;
1748	break_up_discard_bio(tc, virt_cell->key.block_begin, virt_cell->key.block_end, bio);
1749
1750	/*
1751	 * We complete the bio now, knowing that the bi_remaining field
1752	 * will prevent completion until the sub range discards have
1753	 * completed.
1754	 */
1755	bio_endio(bio);
1756}
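/*
 * Editor's sketch (user-space toy, not part of this file): the comments in
 * break_up_discard_bio() and above describe how the original discard bio is
 * held open by its remaining count -- bio_inc_remaining() per sub-discard,
 * one implicit decrement per chained completion, and the final completion
 * only fires when the count reaches zero.  A toy model with a plain counter
 * (all names below are invented):
 */
#include <stdio.h>

struct parent {
	int remaining;		/* stands in for the bio's __bi_remaining */
	int status;		/* first error wins, as in end_discard() */
};

static void parent_put(struct parent *p)
{
	if (--p->remaining == 0)
		printf("parent bio completes, status=%d\n", p->status);
}

static void sub_discard_done(struct parent *p, int err)
{
	if (err && !p->status)
		p->status = err;
	parent_put(p);		/* chained child completion drops one count */
}

int main(void)
{
	struct parent p = { .remaining = 1, .status = 0 };
	int i;

	for (i = 0; i < 3; i++)
		p.remaining++;		/* bio_inc_remaining() per sub-discard */

	parent_put(&p);			/* the early completion of the parent above */

	for (i = 0; i < 3; i++)
		sub_discard_done(&p, i == 1 ? 5 : 0);	/* one sub-discard fails */
	return 0;
}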
1757
1758static void process_discard_bio(struct thin_c *tc, struct bio *bio)
1759{
1760	dm_block_t begin, end;
1761	struct dm_cell_key virt_key;
1762	struct dm_bio_prison_cell *virt_cell;
1763
1764	get_bio_block_range(tc, bio, &begin, &end);
1765	if (begin == end) {
1766		/*
1767		 * The discard covers less than a block.
1768		 */
1769		bio_endio(bio);
1770		return;
1771	}
1772
1773	build_key(tc->td, VIRTUAL, begin, end, &virt_key);
1774	if (bio_detain(tc->pool, &virt_key, bio, &virt_cell))
1775		/*
1776		 * Potential starvation issue: We're relying on the
1777		 * fs/application being well behaved, and not trying to
1778		 * send IO to a region at the same time as discarding it.
1779		 * If they do this persistently then it's possible this
1780		 * cell will never be granted.
1781		 */
1782		return;
1783
1784	tc->pool->process_discard_cell(tc, virt_cell);
1785}
1786
1787static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
1788			  struct dm_cell_key *key,
1789			  struct dm_thin_lookup_result *lookup_result,
1790			  struct dm_bio_prison_cell *cell)
1791{
1792	int r;
1793	dm_block_t data_block;
1794	struct pool *pool = tc->pool;
1795
1796	r = alloc_data_block(tc, &data_block);
1797	switch (r) {
1798	case 0:
1799		schedule_internal_copy(tc, block, lookup_result->block,
1800				       data_block, cell, bio);
1801		break;
1802
1803	case -ENOSPC:
1804		retry_bios_on_resume(pool, cell);
1805		break;
1806
1807	default:
1808		DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
1809			    __func__, r);
1810		cell_error(pool, cell);
1811		break;
1812	}
1813}
1814
1815static void __remap_and_issue_shared_cell(void *context,
1816					  struct dm_bio_prison_cell *cell)
1817{
1818	struct remap_info *info = context;
1819	struct bio *bio;
1820
1821	while ((bio = bio_list_pop(&cell->bios))) {
1822		if (bio_data_dir(bio) == WRITE || op_is_flush(bio->bi_opf) ||
1823		    bio_op(bio) == REQ_OP_DISCARD)
1824			bio_list_add(&info->defer_bios, bio);
1825		else {
1826			struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1827
1828			h->shared_read_entry = dm_deferred_entry_inc(info->tc->pool->shared_read_ds);
1829			inc_all_io_entry(info->tc->pool, bio);
1830			bio_list_add(&info->issue_bios, bio);
1831		}
1832	}
1833}
1834
1835static void remap_and_issue_shared_cell(struct thin_c *tc,
1836					struct dm_bio_prison_cell *cell,
1837					dm_block_t block)
1838{
1839	struct bio *bio;
1840	struct remap_info info;
1841
1842	info.tc = tc;
1843	bio_list_init(&info.defer_bios);
1844	bio_list_init(&info.issue_bios);
1845
1846	cell_visit_release(tc->pool, __remap_and_issue_shared_cell,
1847			   &info, cell);
1848
1849	while ((bio = bio_list_pop(&info.defer_bios)))
1850		thin_defer_bio(tc, bio);
1851
1852	while ((bio = bio_list_pop(&info.issue_bios)))
1853		remap_and_issue(tc, bio, block);
1854}
1855
1856static void process_shared_bio(struct thin_c *tc, struct bio *bio,
1857			       dm_block_t block,
1858			       struct dm_thin_lookup_result *lookup_result,
1859			       struct dm_bio_prison_cell *virt_cell)
1860{
1861	struct dm_bio_prison_cell *data_cell;
1862	struct pool *pool = tc->pool;
1863	struct dm_cell_key key;
1864
1865	/*
1866	 * If cell is already occupied, then sharing is already in the process
1867	 * of being broken so we have nothing further to do here.
1868	 */
1869	build_data_key(tc->td, lookup_result->block, &key);
1870	if (bio_detain(pool, &key, bio, &data_cell)) {
1871		cell_defer_no_holder(tc, virt_cell);
1872		return;
1873	}
1874
1875	if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size) {
1876		break_sharing(tc, bio, block, &key, lookup_result, data_cell);
1877		cell_defer_no_holder(tc, virt_cell);
1878	} else {
1879		struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1880
1881		h->shared_read_entry = dm_deferred_entry_inc(pool->shared_read_ds);
1882		inc_all_io_entry(pool, bio);
1883		remap_and_issue(tc, bio, lookup_result->block);
1884
1885		remap_and_issue_shared_cell(tc, data_cell, lookup_result->block);
1886		remap_and_issue_shared_cell(tc, virt_cell, lookup_result->block);
1887	}
1888}
1889
1890static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block,
1891			    struct dm_bio_prison_cell *cell)
1892{
1893	int r;
1894	dm_block_t data_block;
1895	struct pool *pool = tc->pool;
1896
1897	/*
1898	 * Remap empty bios (flushes) immediately, without provisioning.
1899	 */
1900	if (!bio->bi_iter.bi_size) {
1901		inc_all_io_entry(pool, bio);
1902		cell_defer_no_holder(tc, cell);
1903
1904		remap_and_issue(tc, bio, 0);
1905		return;
1906	}
1907
1908	/*
1909	 * Fill read bios with zeroes and complete them immediately.
1910	 */
1911	if (bio_data_dir(bio) == READ) {
1912		zero_fill_bio(bio);
1913		cell_defer_no_holder(tc, cell);
1914		bio_endio(bio);
1915		return;
1916	}
1917
1918	r = alloc_data_block(tc, &data_block);
1919	switch (r) {
1920	case 0:
1921		if (tc->origin_dev)
1922			schedule_external_copy(tc, block, data_block, cell, bio);
1923		else
1924			schedule_zero(tc, block, data_block, cell, bio);
1925		break;
1926
1927	case -ENOSPC:
1928		retry_bios_on_resume(pool, cell);
1929		break;
1930
1931	default:
1932		DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
1933			    __func__, r);
1934		cell_error(pool, cell);
1935		break;
1936	}
1937}
1938
1939static void process_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
1940{
1941	int r;
1942	struct pool *pool = tc->pool;
1943	struct bio *bio = cell->holder;
1944	dm_block_t block = get_bio_block(tc, bio);
1945	struct dm_thin_lookup_result lookup_result;
1946
1947	if (tc->requeue_mode) {
1948		cell_requeue(pool, cell);
1949		return;
1950	}
1951
1952	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1953	switch (r) {
1954	case 0:
1955		if (lookup_result.shared)
1956			process_shared_bio(tc, bio, block, &lookup_result, cell);
1957		else {
1958			inc_all_io_entry(pool, bio);
1959			remap_and_issue(tc, bio, lookup_result.block);
1960			inc_remap_and_issue_cell(tc, cell, lookup_result.block);
1961		}
1962		break;
1963
1964	case -ENODATA:
1965		if (bio_data_dir(bio) == READ && tc->origin_dev) {
1966			inc_all_io_entry(pool, bio);
1967			cell_defer_no_holder(tc, cell);
1968
1969			if (bio_end_sector(bio) <= tc->origin_size)
1970				remap_to_origin_and_issue(tc, bio);
1971
1972			else if (bio->bi_iter.bi_sector < tc->origin_size)

Large file truncated; the remainder of dm-thin.c is not shown here.