
/fs/jbd2/transaction.c

https://bitbucket.org/cyanogenmod/android_kernel_asus_tf300t
C | 2225 lines | 1135 code | 248 blank | 842 comment | 224 complexity | aeb233a4b2fc90793f7b349382ba7d3a MD5
Possible License(s): LGPL-2.0, AGPL-1.0, GPL-2.0


   1/*
   2 * linux/fs/jbd2/transaction.c
   3 *
   4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
   5 *
   6 * Copyright 1998 Red Hat corp --- All Rights Reserved
   7 *
   8 * This file is part of the Linux kernel and is made available under
   9 * the terms of the GNU General Public License, version 2, or at your
  10 * option, any later version, incorporated herein by reference.
  11 *
  12 * Generic filesystem transaction handling code; part of the ext2fs
  13 * journaling system.
  14 *
  15 * This file manages transactions (compound commits managed by the
  16 * journaling code) and handles (individual atomic operations by the
  17 * filesystem).
  18 */
  19
  20#include <linux/time.h>
  21#include <linux/fs.h>
  22#include <linux/jbd2.h>
  23#include <linux/errno.h>
  24#include <linux/slab.h>
  25#include <linux/timer.h>
  26#include <linux/mm.h>
  27#include <linux/highmem.h>
  28#include <linux/hrtimer.h>
  29#include <linux/backing-dev.h>
  30#include <linux/module.h>
  31
  32static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
  33static void __jbd2_journal_unfile_buffer(struct journal_head *jh);
  34
  35/*
  36 * jbd2_get_transaction: obtain a new transaction_t object.
  37 *
  38 * Simply allocate and initialise a new transaction.  Create it in
  39 * RUNNING state and add it to the current journal (which should not
  40 * have an existing running transaction: we only make a new transaction
  41 * once we have started to commit the old one).
  42 *
  43 * Preconditions:
  44 *	The journal MUST be locked.  We don't perform atomic mallocs on the
  45 *	new transaction	and we can't block without protecting against other
  46 *	processes trying to touch the journal while it is in transition.
  47 *
  48 */
  49
  50static transaction_t *
  51jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
  52{
  53	transaction->t_journal = journal;
  54	transaction->t_state = T_RUNNING;
  55	transaction->t_start_time = ktime_get();
  56	transaction->t_tid = journal->j_transaction_sequence++;
  57	transaction->t_expires = jiffies + journal->j_commit_interval;
  58	spin_lock_init(&transaction->t_handle_lock);
  59	atomic_set(&transaction->t_updates, 0);
  60	atomic_set(&transaction->t_outstanding_credits, 0);
  61	atomic_set(&transaction->t_handle_count, 0);
  62	INIT_LIST_HEAD(&transaction->t_inode_list);
  63	INIT_LIST_HEAD(&transaction->t_private_list);
  64
  65	/* Set up the commit timer for the new transaction. */
  66	journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires);
  67	add_timer(&journal->j_commit_timer);
  68
  69	J_ASSERT(journal->j_running_transaction == NULL);
  70	journal->j_running_transaction = transaction;
  71	transaction->t_max_wait = 0;
  72	transaction->t_start = jiffies;
  73
  74	return transaction;
  75}
  76
  77/*
  78 * Handle management.
  79 *
  80 * A handle_t is an object which represents a single atomic update to a
  81 * filesystem, and which tracks all of the modifications which form part
  82 * of that one update.
  83 */
  84
  85/*
  86 * Update transaction's maximum wait time, if debugging is enabled.
  87 *
  88 * In order for t_max_wait to be reliable, it must be protected by a
  89 * lock.  But doing so will mean that start_this_handle() can not be
  90 * run in parallel on SMP systems, which limits our scalability.  So
  91 * unless debugging is enabled, we no longer update t_max_wait, which
  92 * means that maximum wait time reported by the jbd2_run_stats
  93 * tracepoint will always be zero.
  94 */
  95static inline void update_t_max_wait(transaction_t *transaction,
  96				     unsigned long ts)
  97{
  98#ifdef CONFIG_JBD2_DEBUG
  99	if (jbd2_journal_enable_debug &&
 100	    time_after(transaction->t_start, ts)) {
 101		ts = jbd2_time_diff(ts, transaction->t_start);
 102		spin_lock(&transaction->t_handle_lock);
 103		if (ts > transaction->t_max_wait)
 104			transaction->t_max_wait = ts;
 105		spin_unlock(&transaction->t_handle_lock);
 106	}
 107#endif
 108}
 109
 110/*
 111 * start_this_handle: Given a handle, deal with any locking or stalling
 112 * needed to make sure that there is enough journal space for the handle
 113 * to begin.  Attach the handle to a transaction and set up the
 114 * transaction's buffer credits.
 115 */
 116
 117static int start_this_handle(journal_t *journal, handle_t *handle,
 118			     int gfp_mask)
 119{
 120	transaction_t	*transaction, *new_transaction = NULL;
 121	tid_t		tid;
 122	int		needed, need_to_start;
 123	int		nblocks = handle->h_buffer_credits;
 124	unsigned long ts = jiffies;
 125
 126	if (nblocks > journal->j_max_transaction_buffers) {
 127		printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
 128		       current->comm, nblocks,
 129		       journal->j_max_transaction_buffers);
 130		return -ENOSPC;
 131	}
 132
 133alloc_transaction:
 134	if (!journal->j_running_transaction) {
 135		new_transaction = kzalloc(sizeof(*new_transaction), gfp_mask);
 136		if (!new_transaction) {
 137			/*
 138			 * If __GFP_FS is not present, then we may be
 139			 * being called from inside the fs writeback
 140			 * layer, so we MUST NOT fail.  Since
 141			 * __GFP_NOFAIL is going away, we will arrange
 142			 * to retry the allocation ourselves.
 143			 */
 144			if ((gfp_mask & __GFP_FS) == 0) {
 145				congestion_wait(BLK_RW_ASYNC, HZ/50);
 146				goto alloc_transaction;
 147			}
 148			return -ENOMEM;
 149		}
 150	}
 151
 152	jbd_debug(3, "New handle %p going live.\n", handle);
 153
 154	/*
 155	 * We need to hold j_state_lock until t_updates has been incremented,
 156	 * for proper journal barrier handling
 157	 */
 158repeat:
 159	read_lock(&journal->j_state_lock);
 160	BUG_ON(journal->j_flags & JBD2_UNMOUNT);
 161	if (is_journal_aborted(journal) ||
 162	    (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) {
 163		read_unlock(&journal->j_state_lock);
 164		kfree(new_transaction);
 165		return -EROFS;
 166	}
 167
 168	/* Wait on the journal's transaction barrier if necessary */
 169	if (journal->j_barrier_count) {
 170		read_unlock(&journal->j_state_lock);
 171		wait_event(journal->j_wait_transaction_locked,
 172				journal->j_barrier_count == 0);
 173		goto repeat;
 174	}
 175
 176	if (!journal->j_running_transaction) {
 177		read_unlock(&journal->j_state_lock);
 178		if (!new_transaction)
 179			goto alloc_transaction;
 180		write_lock(&journal->j_state_lock);
 181		if (!journal->j_running_transaction) {
 182			jbd2_get_transaction(journal, new_transaction);
 183			new_transaction = NULL;
 184		}
 185		write_unlock(&journal->j_state_lock);
 186		goto repeat;
 187	}
 188
 189	transaction = journal->j_running_transaction;
 190
 191	/*
 192	 * If the current transaction is locked down for commit, wait for the
 193	 * lock to be released.
 194	 */
 195	if (transaction->t_state == T_LOCKED) {
 196		DEFINE_WAIT(wait);
 197
 198		prepare_to_wait(&journal->j_wait_transaction_locked,
 199					&wait, TASK_UNINTERRUPTIBLE);
 200		read_unlock(&journal->j_state_lock);
 201		schedule();
 202		finish_wait(&journal->j_wait_transaction_locked, &wait);
 203		goto repeat;
 204	}
 205
 206	/*
 207	 * If there is not enough space left in the log to write all potential
 208	 * buffers requested by this operation, we need to stall pending a log
 209	 * checkpoint to free some more log space.
 210	 */
 211	needed = atomic_add_return(nblocks,
 212				   &transaction->t_outstanding_credits);
 213
 214	if (needed > journal->j_max_transaction_buffers) {
 215		/*
 216		 * If the current transaction is already too large, then start
 217		 * to commit it: we can then go back and attach this handle to
 218		 * a new transaction.
 219		 */
 220		DEFINE_WAIT(wait);
 221
 222		jbd_debug(2, "Handle %p starting new commit...\n", handle);
 223		atomic_sub(nblocks, &transaction->t_outstanding_credits);
 224		prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
 225				TASK_UNINTERRUPTIBLE);
 226		tid = transaction->t_tid;
 227		need_to_start = !tid_geq(journal->j_commit_request, tid);
 228		read_unlock(&journal->j_state_lock);
 229		if (need_to_start)
 230			jbd2_log_start_commit(journal, tid);
 231		schedule();
 232		finish_wait(&journal->j_wait_transaction_locked, &wait);
 233		goto repeat;
 234	}
 235
 236	/*
 237	 * The commit code assumes that it can get enough log space
 238	 * without forcing a checkpoint.  This is *critical* for
 239	 * correctness: a checkpoint of a buffer which is also
 240	 * associated with a committing transaction creates a deadlock,
 241	 * so commit simply cannot force through checkpoints.
 242	 *
 243	 * We must therefore ensure the necessary space in the journal
 244	 * *before* starting to dirty potentially checkpointed buffers
 245	 * in the new transaction.
 246	 *
 247	 * The worst part is, any transaction currently committing can
 248	 * reduce the free space arbitrarily.  Be careful to account for
 249	 * those buffers when checkpointing.
 250	 */
 251
 252	/*
 253	 * @@@ AKPM: This seems rather over-defensive.  We're giving commit
 254	 * a _lot_ of headroom: 1/4 of the journal plus the size of
 255	 * the committing transaction.  Really, we only need to give it
 256	 * committing_transaction->t_outstanding_credits plus "enough" for
 257	 * the log control blocks.
 258	 * Also, this test is inconsistent with the matching one in
 259	 * jbd2_journal_extend().
 260	 */
 261	if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) {
 262		jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle);
 263		atomic_sub(nblocks, &transaction->t_outstanding_credits);
 264		read_unlock(&journal->j_state_lock);
 265		write_lock(&journal->j_state_lock);
 266		if (__jbd2_log_space_left(journal) < jbd_space_needed(journal))
 267			__jbd2_log_wait_for_space(journal);
 268		write_unlock(&journal->j_state_lock);
 269		goto repeat;
 270	}
 271
 272	/* OK, account for the buffers that this operation expects to
  273	 * use and add the handle to the running transaction.
 274	 */
 275	update_t_max_wait(transaction, ts);
 276	handle->h_transaction = transaction;
 277	atomic_inc(&transaction->t_updates);
 278	atomic_inc(&transaction->t_handle_count);
 279	jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n",
 280		  handle, nblocks,
 281		  atomic_read(&transaction->t_outstanding_credits),
 282		  __jbd2_log_space_left(journal));
 283	read_unlock(&journal->j_state_lock);
 284
 285	lock_map_acquire(&handle->h_lockdep_map);
 286	kfree(new_transaction);
 287	return 0;
 288}
 289
 290static struct lock_class_key jbd2_handle_key;
 291
 292/* Allocate a new handle.  This should probably be in a slab... */
 293static handle_t *new_handle(int nblocks)
 294{
 295	handle_t *handle = jbd2_alloc_handle(GFP_NOFS);
 296	if (!handle)
 297		return NULL;
 298	memset(handle, 0, sizeof(*handle));
 299	handle->h_buffer_credits = nblocks;
 300	handle->h_ref = 1;
 301
 302	lockdep_init_map(&handle->h_lockdep_map, "jbd2_handle",
 303						&jbd2_handle_key, 0);
 304
 305	return handle;
 306}
 307
 308/**
 309 * handle_t *jbd2_journal_start() - Obtain a new handle.
 310 * @journal: Journal to start transaction on.
  311 * @nblocks: number of block buffers we might modify
 312 *
 313 * We make sure that the transaction can guarantee at least nblocks of
 314 * modified buffers in the log.  We block until the log can guarantee
 315 * that much space.
 316 *
 317 * This function is visible to journal users (like ext3fs), so is not
 318 * called with the journal already locked.
 319 *
 320 * Return a pointer to a newly allocated handle, or an ERR_PTR() value
 321 * on failure.
 322 */
 323handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask)
 324{
 325	handle_t *handle = journal_current_handle();
 326	int err;
 327
 328	if (!journal)
 329		return ERR_PTR(-EROFS);
 330
 331	if (handle) {
 332		J_ASSERT(handle->h_transaction->t_journal == journal);
 333		handle->h_ref++;
 334		return handle;
 335	}
 336
 337	handle = new_handle(nblocks);
 338	if (!handle)
 339		return ERR_PTR(-ENOMEM);
 340
 341	current->journal_info = handle;
 342
 343	err = start_this_handle(journal, handle, gfp_mask);
 344	if (err < 0) {
 345		jbd2_free_handle(handle);
 346		current->journal_info = NULL;
 347		handle = ERR_PTR(err);
 348	}
 349	return handle;
 350}
 351EXPORT_SYMBOL(jbd2__journal_start);
 352
 353
 354handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
 355{
 356	return jbd2__journal_start(journal, nblocks, GFP_NOFS);
 357}
 358EXPORT_SYMBOL(jbd2_journal_start);
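
/*
 * A minimal caller-side sketch (illustrative, not part of this file) of the
 * start/stop pairing described above: reserve credits, get write access,
 * modify, mark dirty, stop.  The function name and the single-credit
 * reservation are assumptions for the example.
 */
static int example_update_metadata_block(journal_t *journal,
					 struct buffer_head *bh)
{
	handle_t *handle;
	int err;

	/* Reserve one buffer credit; blocks until the log can guarantee it */
	handle = jbd2_journal_start(journal, 1);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	err = jbd2_journal_get_write_access(handle, bh);
	if (!err) {
		/* ... modify bh->b_data under the handle ... */
		err = jbd2_journal_dirty_metadata(handle, bh);
	}
	jbd2_journal_stop(handle);
	return err;
}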
 359
 360
 361/**
 362 * int jbd2_journal_extend() - extend buffer credits.
 363 * @handle:  handle to 'extend'
 364 * @nblocks: nr blocks to try to extend by.
 365 *
 366 * Some transactions, such as large extends and truncates, can be done
 367 * atomically all at once or in several stages.  The operation requests
  368 * a credit for a number of buffer modifications in advance, but can
 369 * extend its credit if it needs more.
 370 *
 371 * jbd2_journal_extend tries to give the running handle more buffer credits.
  372 * It does not guarantee the allocation - this is best-effort only.
 373 * The calling process MUST be able to deal cleanly with a failure to
 374 * extend here.
 375 *
 376 * Return 0 on success, non-zero on failure.
 377 *
 378 * return code < 0 implies an error
 379 * return code > 0 implies normal transaction-full status.
 380 */
 381int jbd2_journal_extend(handle_t *handle, int nblocks)
 382{
 383	transaction_t *transaction = handle->h_transaction;
 384	journal_t *journal = transaction->t_journal;
 385	int result;
 386	int wanted;
 387
 388	result = -EIO;
 389	if (is_handle_aborted(handle))
 390		goto out;
 391
 392	result = 1;
 393
 394	read_lock(&journal->j_state_lock);
 395
 396	/* Don't extend a locked-down transaction! */
 397	if (handle->h_transaction->t_state != T_RUNNING) {
 398		jbd_debug(3, "denied handle %p %d blocks: "
 399			  "transaction not running\n", handle, nblocks);
 400		goto error_out;
 401	}
 402
 403	spin_lock(&transaction->t_handle_lock);
 404	wanted = atomic_read(&transaction->t_outstanding_credits) + nblocks;
 405
 406	if (wanted > journal->j_max_transaction_buffers) {
 407		jbd_debug(3, "denied handle %p %d blocks: "
 408			  "transaction too large\n", handle, nblocks);
 409		goto unlock;
 410	}
 411
 412	if (wanted > __jbd2_log_space_left(journal)) {
 413		jbd_debug(3, "denied handle %p %d blocks: "
 414			  "insufficient log space\n", handle, nblocks);
 415		goto unlock;
 416	}
 417
 418	handle->h_buffer_credits += nblocks;
 419	atomic_add(nblocks, &transaction->t_outstanding_credits);
 420	result = 0;
 421
 422	jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
 423unlock:
 424	spin_unlock(&transaction->t_handle_lock);
 425error_out:
 426	read_unlock(&journal->j_state_lock);
 427out:
 428	return result;
 429}
 430
 431
 432/**
  433 * int jbd2_journal_restart() - restart a handle.
 434 * @handle:  handle to restart
 435 * @nblocks: nr credits requested
 436 *
 437 * Restart a handle for a multi-transaction filesystem
 438 * operation.
 439 *
 440 * If the jbd2_journal_extend() call above fails to grant new buffer credits
 441 * to a running handle, a call to jbd2_journal_restart will commit the
 442 * handle's transaction so far and reattach the handle to a new
  443 * transaction capable of guaranteeing the requested number of
 444 * credits.
 445 */
 446int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask)
 447{
 448	transaction_t *transaction = handle->h_transaction;
 449	journal_t *journal = transaction->t_journal;
 450	tid_t		tid;
 451	int		need_to_start, ret;
 452
 453	/* If we've had an abort of any type, don't even think about
 454	 * actually doing the restart! */
 455	if (is_handle_aborted(handle))
 456		return 0;
 457
 458	/*
 459	 * First unlink the handle from its current transaction, and start the
 460	 * commit on that.
 461	 */
 462	J_ASSERT(atomic_read(&transaction->t_updates) > 0);
 463	J_ASSERT(journal_current_handle() == handle);
 464
 465	read_lock(&journal->j_state_lock);
 466	spin_lock(&transaction->t_handle_lock);
 467	atomic_sub(handle->h_buffer_credits,
 468		   &transaction->t_outstanding_credits);
 469	if (atomic_dec_and_test(&transaction->t_updates))
 470		wake_up(&journal->j_wait_updates);
 471	spin_unlock(&transaction->t_handle_lock);
 472
 473	jbd_debug(2, "restarting handle %p\n", handle);
 474	tid = transaction->t_tid;
 475	need_to_start = !tid_geq(journal->j_commit_request, tid);
 476	read_unlock(&journal->j_state_lock);
 477	if (need_to_start)
 478		jbd2_log_start_commit(journal, tid);
 479
 480	lock_map_release(&handle->h_lockdep_map);
 481	handle->h_buffer_credits = nblocks;
 482	ret = start_this_handle(journal, handle, gfp_mask);
 483	return ret;
 484}
 485EXPORT_SYMBOL(jbd2__journal_restart);
 486
 487
 488int jbd2_journal_restart(handle_t *handle, int nblocks)
 489{
 490	return jbd2__journal_restart(handle, nblocks, GFP_NOFS);
 491}
 492EXPORT_SYMBOL(jbd2_journal_restart);
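
/*
 * Illustrative pattern (an assumption modelled on ext4-style callers, not
 * code from this file) combining the two calls above: try to extend the
 * current handle, and fall back to a restart when the running transaction
 * cannot grant more credits.
 */
static int example_extend_or_restart(handle_t *handle, int nblocks)
{
	int err = jbd2_journal_extend(handle, nblocks);

	if (err < 0)		/* hard error */
		return err;
	if (err > 0)		/* transaction full: commit and reattach */
		err = jbd2_journal_restart(handle, nblocks);
	return err;
}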
 493
 494/**
  495 * void jbd2_journal_lock_updates() - establish a transaction barrier.
 496 * @journal:  Journal to establish a barrier on.
 497 *
 498 * This locks out any further updates from being started, and blocks
 499 * until all existing updates have completed, returning only once the
 500 * journal is in a quiescent state with no updates running.
 501 *
 502 * The journal lock should not be held on entry.
 503 */
 504void jbd2_journal_lock_updates(journal_t *journal)
 505{
 506	DEFINE_WAIT(wait);
 507
 508	write_lock(&journal->j_state_lock);
 509	++journal->j_barrier_count;
 510
 511	/* Wait until there are no running updates */
 512	while (1) {
 513		transaction_t *transaction = journal->j_running_transaction;
 514
 515		if (!transaction)
 516			break;
 517
 518		spin_lock(&transaction->t_handle_lock);
 519		if (!atomic_read(&transaction->t_updates)) {
 520			spin_unlock(&transaction->t_handle_lock);
 521			break;
 522		}
 523		prepare_to_wait(&journal->j_wait_updates, &wait,
 524				TASK_UNINTERRUPTIBLE);
 525		spin_unlock(&transaction->t_handle_lock);
 526		write_unlock(&journal->j_state_lock);
 527		schedule();
 528		finish_wait(&journal->j_wait_updates, &wait);
 529		write_lock(&journal->j_state_lock);
 530	}
 531	write_unlock(&journal->j_state_lock);
 532
 533	/*
 534	 * We have now established a barrier against other normal updates, but
 535	 * we also need to barrier against other jbd2_journal_lock_updates() calls
 536	 * to make sure that we serialise special journal-locked operations
 537	 * too.
 538	 */
 539	mutex_lock(&journal->j_barrier);
 540}
 541
 542/**
  543 * void jbd2_journal_unlock_updates(journal_t* journal) - release barrier
 544 * @journal:  Journal to release the barrier on.
 545 *
 546 * Release a transaction barrier obtained with jbd2_journal_lock_updates().
 547 *
 548 * Should be called without the journal lock held.
 549 */
  550void jbd2_journal_unlock_updates(journal_t *journal)
 551{
 552	J_ASSERT(journal->j_barrier_count != 0);
 553
 554	mutex_unlock(&journal->j_barrier);
 555	write_lock(&journal->j_state_lock);
 556	--journal->j_barrier_count;
 557	write_unlock(&journal->j_state_lock);
 558	wake_up(&journal->j_wait_transaction_locked);
 559}
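
/*
 * Illustrative pairing of the barrier calls above (not part of this file):
 * quiesce the journal around an operation that must not race with running
 * updates.  The operation body is a placeholder.
 */
static void example_quiesced_operation(journal_t *journal)
{
	jbd2_journal_lock_updates(journal);	/* blocks new handles, drains old */
	/* ... perform the special journal-locked operation here ... */
	jbd2_journal_unlock_updates(journal);
}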
 560
 561static void warn_dirty_buffer(struct buffer_head *bh)
 562{
 563	char b[BDEVNAME_SIZE];
 564
 565	printk(KERN_WARNING
 566	       "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). "
 567	       "There's a risk of filesystem corruption in case of system "
 568	       "crash.\n",
 569	       bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
 570}
 571
 572/*
 573 * If the buffer is already part of the current transaction, then there
 574 * is nothing we need to do.  If it is already part of a prior
 575 * transaction which we are still committing to disk, then we need to
 576 * make sure that we do not overwrite the old copy: we do copy-out to
 577 * preserve the copy going to disk.  We also account the buffer against
 578 * the handle's metadata buffer credits (unless the buffer is already
 579 * part of the transaction, that is).
 580 *
 581 */
 582static int
 583do_get_write_access(handle_t *handle, struct journal_head *jh,
 584			int force_copy)
 585{
 586	struct buffer_head *bh;
 587	transaction_t *transaction;
 588	journal_t *journal;
 589	int error;
 590	char *frozen_buffer = NULL;
 591	int need_copy = 0;
 592
 593	if (is_handle_aborted(handle))
 594		return -EROFS;
 595
 596	transaction = handle->h_transaction;
 597	journal = transaction->t_journal;
 598
 599	jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
 600
 601	JBUFFER_TRACE(jh, "entry");
 602repeat:
 603	bh = jh2bh(jh);
 604
 605	/* @@@ Need to check for errors here at some point. */
 606
 607	lock_buffer(bh);
 608	jbd_lock_bh_state(bh);
 609
 610	/* We now hold the buffer lock so it is safe to query the buffer
 611	 * state.  Is the buffer dirty?
 612	 *
 613	 * If so, there are two possibilities.  The buffer may be
 614	 * non-journaled, and undergoing a quite legitimate writeback.
 615	 * Otherwise, it is journaled, and we don't expect dirty buffers
 616	 * in that state (the buffers should be marked JBD_Dirty
 617	 * instead.)  So either the IO is being done under our own
 618	 * control and this is a bug, or it's a third party IO such as
 619	 * dump(8) (which may leave the buffer scheduled for read ---
 620	 * ie. locked but not dirty) or tune2fs (which may actually have
 621	 * the buffer dirtied, ugh.)  */
 622
 623	if (buffer_dirty(bh)) {
 624		/*
 625		 * First question: is this buffer already part of the current
 626		 * transaction or the existing committing transaction?
 627		 */
 628		if (jh->b_transaction) {
 629			J_ASSERT_JH(jh,
 630				jh->b_transaction == transaction ||
 631				jh->b_transaction ==
 632					journal->j_committing_transaction);
 633			if (jh->b_next_transaction)
 634				J_ASSERT_JH(jh, jh->b_next_transaction ==
 635							transaction);
 636			warn_dirty_buffer(bh);
 637		}
 638		/*
 639		 * In any case we need to clean the dirty flag and we must
 640		 * do it under the buffer lock to be sure we don't race
 641		 * with running write-out.
 642		 */
 643		JBUFFER_TRACE(jh, "Journalling dirty buffer");
 644		clear_buffer_dirty(bh);
 645		set_buffer_jbddirty(bh);
 646	}
 647
 648	unlock_buffer(bh);
 649
 650	error = -EROFS;
 651	if (is_handle_aborted(handle)) {
 652		jbd_unlock_bh_state(bh);
 653		goto out;
 654	}
 655	error = 0;
 656
 657	/*
 658	 * The buffer is already part of this transaction if b_transaction or
 659	 * b_next_transaction points to it
 660	 */
 661	if (jh->b_transaction == transaction ||
 662	    jh->b_next_transaction == transaction)
 663		goto done;
 664
 665	/*
 666	 * this is the first time this transaction is touching this buffer,
 667	 * reset the modified flag
 668	 */
  669	jh->b_modified = 0;
 670
 671	/*
 672	 * If there is already a copy-out version of this buffer, then we don't
 673	 * need to make another one
 674	 */
 675	if (jh->b_frozen_data) {
 676		JBUFFER_TRACE(jh, "has frozen data");
 677		J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
 678		jh->b_next_transaction = transaction;
 679		goto done;
 680	}
 681
 682	/* Is there data here we need to preserve? */
 683
 684	if (jh->b_transaction && jh->b_transaction != transaction) {
 685		JBUFFER_TRACE(jh, "owned by older transaction");
 686		J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
 687		J_ASSERT_JH(jh, jh->b_transaction ==
 688					journal->j_committing_transaction);
 689
 690		/* There is one case we have to be very careful about.
 691		 * If the committing transaction is currently writing
 692		 * this buffer out to disk and has NOT made a copy-out,
 693		 * then we cannot modify the buffer contents at all
 694		 * right now.  The essence of copy-out is that it is the
 695		 * extra copy, not the primary copy, which gets
 696		 * journaled.  If the primary copy is already going to
 697		 * disk then we cannot do copy-out here. */
 698
 699		if (jh->b_jlist == BJ_Shadow) {
 700			DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow);
 701			wait_queue_head_t *wqh;
 702
 703			wqh = bit_waitqueue(&bh->b_state, BH_Unshadow);
 704
 705			JBUFFER_TRACE(jh, "on shadow: sleep");
 706			jbd_unlock_bh_state(bh);
 707			/* commit wakes up all shadow buffers after IO */
 708			for ( ; ; ) {
 709				prepare_to_wait(wqh, &wait.wait,
 710						TASK_UNINTERRUPTIBLE);
 711				if (jh->b_jlist != BJ_Shadow)
 712					break;
 713				schedule();
 714			}
 715			finish_wait(wqh, &wait.wait);
 716			goto repeat;
 717		}
 718
 719		/* Only do the copy if the currently-owning transaction
 720		 * still needs it.  If it is on the Forget list, the
 721		 * committing transaction is past that stage.  The
 722		 * buffer had better remain locked during the kmalloc,
 723		 * but that should be true --- we hold the journal lock
 724		 * still and the buffer is already on the BUF_JOURNAL
 725		 * list so won't be flushed.
 726		 *
 727		 * Subtle point, though: if this is a get_undo_access,
 728		 * then we will be relying on the frozen_data to contain
 729		 * the new value of the committed_data record after the
 730		 * transaction, so we HAVE to force the frozen_data copy
 731		 * in that case. */
 732
 733		if (jh->b_jlist != BJ_Forget || force_copy) {
 734			JBUFFER_TRACE(jh, "generate frozen data");
 735			if (!frozen_buffer) {
 736				JBUFFER_TRACE(jh, "allocate memory for buffer");
 737				jbd_unlock_bh_state(bh);
 738				frozen_buffer =
 739					jbd2_alloc(jh2bh(jh)->b_size,
 740							 GFP_NOFS);
 741				if (!frozen_buffer) {
 742					printk(KERN_EMERG
 743					       "%s: OOM for frozen_buffer\n",
 744					       __func__);
 745					JBUFFER_TRACE(jh, "oom!");
 746					error = -ENOMEM;
 747					jbd_lock_bh_state(bh);
 748					goto done;
 749				}
 750				goto repeat;
 751			}
 752			jh->b_frozen_data = frozen_buffer;
 753			frozen_buffer = NULL;
 754			need_copy = 1;
 755		}
 756		jh->b_next_transaction = transaction;
 757	}
 758
 759
 760	/*
 761	 * Finally, if the buffer is not journaled right now, we need to make
 762	 * sure it doesn't get written to disk before the caller actually
 763	 * commits the new data
 764	 */
 765	if (!jh->b_transaction) {
 766		JBUFFER_TRACE(jh, "no transaction");
 767		J_ASSERT_JH(jh, !jh->b_next_transaction);
 768		JBUFFER_TRACE(jh, "file as BJ_Reserved");
 769		spin_lock(&journal->j_list_lock);
 770		__jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
 771		spin_unlock(&journal->j_list_lock);
 772	}
 773
 774done:
 775	if (need_copy) {
 776		struct page *page;
 777		int offset;
 778		char *source;
 779
 780		J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)),
 781			    "Possible IO failure.\n");
 782		page = jh2bh(jh)->b_page;
 783		offset = offset_in_page(jh2bh(jh)->b_data);
 784		source = kmap_atomic(page, KM_USER0);
 785		/* Fire data frozen trigger just before we copy the data */
 786		jbd2_buffer_frozen_trigger(jh, source + offset,
 787					   jh->b_triggers);
 788		memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
 789		kunmap_atomic(source, KM_USER0);
 790
 791		/*
 792		 * Now that the frozen data is saved off, we need to store
 793		 * any matching triggers.
 794		 */
 795		jh->b_frozen_triggers = jh->b_triggers;
 796	}
 797	jbd_unlock_bh_state(bh);
 798
 799	/*
 800	 * If we are about to journal a buffer, then any revoke pending on it is
 801	 * no longer valid
 802	 */
 803	jbd2_journal_cancel_revoke(handle, jh);
 804
 805out:
 806	if (unlikely(frozen_buffer))	/* It's usually NULL */
 807		jbd2_free(frozen_buffer, bh->b_size);
 808
 809	JBUFFER_TRACE(jh, "exit");
 810	return error;
 811}
 812
 813/**
 814 * int jbd2_journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update.
 815 * @handle: transaction to add buffer modifications to
 816 * @bh:     bh to be used for metadata writes
 817 *
 818 * Returns an error code or 0 on success.
 819 *
 820 * In full data journalling mode the buffer may be of type BJ_AsyncData,
 821 * because we're write()ing a buffer which is also part of a shared mapping.
 822 */
 823
 824int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh)
 825{
 826	struct journal_head *jh = jbd2_journal_add_journal_head(bh);
 827	int rc;
 828
 829	/* We do not want to get caught playing with fields which the
 830	 * log thread also manipulates.  Make sure that the buffer
 831	 * completes any outstanding IO before proceeding. */
 832	rc = do_get_write_access(handle, jh, 0);
 833	jbd2_journal_put_journal_head(jh);
 834	return rc;
 835}
 836
 837
 838/*
 839 * When the user wants to journal a newly created buffer_head
 840 * (ie. getblk() returned a new buffer and we are going to populate it
 841 * manually rather than reading off disk), then we need to keep the
 842 * buffer_head locked until it has been completely filled with new
 843 * data.  In this case, we should be able to make the assertion that
 844 * the bh is not already part of an existing transaction.
 845 *
 846 * The buffer should already be locked by the caller by this point.
 847 * There is no lock ranking violation: it was a newly created,
 848 * unlocked buffer beforehand. */
 849
 850/**
 851 * int jbd2_journal_get_create_access () - notify intent to use newly created bh
  852 * @handle: transaction to add the new buffer to
 853 * @bh: new buffer.
 854 *
 855 * Call this if you create a new bh.
 856 */
 857int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
 858{
 859	transaction_t *transaction = handle->h_transaction;
 860	journal_t *journal = transaction->t_journal;
 861	struct journal_head *jh = jbd2_journal_add_journal_head(bh);
 862	int err;
 863
 864	jbd_debug(5, "journal_head %p\n", jh);
 865	err = -EROFS;
 866	if (is_handle_aborted(handle))
 867		goto out;
 868	err = 0;
 869
 870	JBUFFER_TRACE(jh, "entry");
 871	/*
 872	 * The buffer may already belong to this transaction due to pre-zeroing
 873	 * in the filesystem's new_block code.  It may also be on the previous,
 874	 * committing transaction's lists, but it HAS to be in Forget state in
 875	 * that case: the transaction must have deleted the buffer for it to be
 876	 * reused here.
 877	 */
 878	jbd_lock_bh_state(bh);
 879	spin_lock(&journal->j_list_lock);
 880	J_ASSERT_JH(jh, (jh->b_transaction == transaction ||
 881		jh->b_transaction == NULL ||
 882		(jh->b_transaction == journal->j_committing_transaction &&
 883			  jh->b_jlist == BJ_Forget)));
 884
 885	J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
 886	J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
 887
 888	if (jh->b_transaction == NULL) {
 889		/*
 890		 * Previous jbd2_journal_forget() could have left the buffer
 891		 * with jbddirty bit set because it was being committed. When
 892		 * the commit finished, we've filed the buffer for
 893		 * checkpointing and marked it dirty. Now we are reallocating
 894		 * the buffer so the transaction freeing it must have
 895		 * committed and so it's safe to clear the dirty bit.
 896		 */
 897		clear_buffer_dirty(jh2bh(jh));
 898		/* first access by this transaction */
 899		jh->b_modified = 0;
 900
 901		JBUFFER_TRACE(jh, "file as BJ_Reserved");
 902		__jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
 903	} else if (jh->b_transaction == journal->j_committing_transaction) {
 904		/* first access by this transaction */
 905		jh->b_modified = 0;
 906
 907		JBUFFER_TRACE(jh, "set next transaction");
 908		jh->b_next_transaction = transaction;
 909	}
 910	spin_unlock(&journal->j_list_lock);
 911	jbd_unlock_bh_state(bh);
 912
 913	/*
 914	 * akpm: I added this.  ext3_alloc_branch can pick up new indirect
 915	 * blocks which contain freed but then revoked metadata.  We need
 916	 * to cancel the revoke in case we end up freeing it yet again
  917 * and then reallocating it as data - this would cause a second revoke,
 918	 * which hits an assertion error.
 919	 */
 920	JBUFFER_TRACE(jh, "cancelling revoke");
 921	jbd2_journal_cancel_revoke(handle, jh);
 922out:
 923	jbd2_journal_put_journal_head(jh);
 924	return err;
 925}
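
/*
 * Caller-side sketch for the create path above (illustrative; the block
 * number and zero-fill are assumptions): getblk() returns a buffer that is
 * populated by hand, so create access is requested with the buffer locked,
 * as the comment above requires.
 */
static int example_init_new_block(handle_t *handle, struct super_block *sb,
				  sector_t blocknr)
{
	struct buffer_head *bh = sb_getblk(sb, blocknr);
	int err;

	if (!bh)
		return -ENOMEM;
	lock_buffer(bh);
	err = jbd2_journal_get_create_access(handle, bh);
	if (!err) {
		memset(bh->b_data, 0, bh->b_size);	/* fill manually */
		set_buffer_uptodate(bh);
	}
	unlock_buffer(bh);
	if (!err)
		err = jbd2_journal_dirty_metadata(handle, bh);
	brelse(bh);
	return err;
}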
 926
 927/**
 928 * int jbd2_journal_get_undo_access() -  Notify intent to modify metadata with
 929 *     non-rewindable consequences
 930 * @handle: transaction
 931 * @bh: buffer to undo
 932 *
 933 * Sometimes there is a need to distinguish between metadata which has
 934 * been committed to disk and that which has not.  The ext3fs code uses
 935 * this for freeing and allocating space, we have to make sure that we
 936 * do not reuse freed space until the deallocation has been committed,
 937 * since if we overwrote that space we would make the delete
 938 * un-rewindable in case of a crash.
 939 *
 940 * To deal with that, jbd2_journal_get_undo_access requests write access to a
 941 * buffer for parts of non-rewindable operations such as delete
 942 * operations on the bitmaps.  The journaling code must keep a copy of
 943 * the buffer's contents prior to the undo_access call until such time
 944 * as we know that the buffer has definitely been committed to disk.
 945 *
 946 * We never need to know which transaction the committed data is part
 947 * of, buffers touched here are guaranteed to be dirtied later and so
 948 * will be committed to a new transaction in due course, at which point
 949 * we can discard the old committed data pointer.
 950 *
 951 * Returns error number or 0 on success.
 952 */
 953int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
 954{
 955	int err;
 956	struct journal_head *jh = jbd2_journal_add_journal_head(bh);
 957	char *committed_data = NULL;
 958
 959	JBUFFER_TRACE(jh, "entry");
 960
 961	/*
 962	 * Do this first --- it can drop the journal lock, so we want to
 963	 * make sure that obtaining the committed_data is done
 964	 * atomically wrt. completion of any outstanding commits.
 965	 */
 966	err = do_get_write_access(handle, jh, 1);
 967	if (err)
 968		goto out;
 969
 970repeat:
 971	if (!jh->b_committed_data) {
 972		committed_data = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS);
 973		if (!committed_data) {
 974			printk(KERN_EMERG "%s: No memory for committed data\n",
 975				__func__);
 976			err = -ENOMEM;
 977			goto out;
 978		}
 979	}
 980
 981	jbd_lock_bh_state(bh);
 982	if (!jh->b_committed_data) {
 983		/* Copy out the current buffer contents into the
 984		 * preserved, committed copy. */
 985		JBUFFER_TRACE(jh, "generate b_committed data");
 986		if (!committed_data) {
 987			jbd_unlock_bh_state(bh);
 988			goto repeat;
 989		}
 990
 991		jh->b_committed_data = committed_data;
 992		committed_data = NULL;
 993		memcpy(jh->b_committed_data, bh->b_data, bh->b_size);
 994	}
 995	jbd_unlock_bh_state(bh);
 996out:
 997	jbd2_journal_put_journal_head(jh);
 998	if (unlikely(committed_data))
 999		jbd2_free(committed_data, bh->b_size);
1000	return err;
1001}
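
/*
 * Sketch of the undo pattern described above (illustrative, modelled on
 * bitmap deallocation): request undo access so the pre-deallocation
 * contents are preserved until the free commits.  The bit-clearing step
 * is elided.
 */
static int example_free_in_bitmap(handle_t *handle,
				  struct buffer_head *bitmap_bh)
{
	int err = jbd2_journal_get_undo_access(handle, bitmap_bh);

	if (err)
		return err;
	/* ... clear the relevant bit(s) in bitmap_bh->b_data ... */
	return jbd2_journal_dirty_metadata(handle, bitmap_bh);
}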
1002
1003/**
1004 * void jbd2_journal_set_triggers() - Add triggers for commit writeout
1005 * @bh: buffer to trigger on
1006 * @type: struct jbd2_buffer_trigger_type containing the trigger(s).
1007 *
1008 * Set any triggers on this journal_head.  This is always safe, because
1009 * triggers for a committing buffer will be saved off, and triggers for
1010 * a running transaction will match the buffer in that transaction.
1011 *
1012 * Call with NULL to clear the triggers.
1013 */
1014void jbd2_journal_set_triggers(struct buffer_head *bh,
1015			       struct jbd2_buffer_trigger_type *type)
1016{
1017	struct journal_head *jh = bh2jh(bh);
1018
1019	jh->b_triggers = type;
1020}
1021
1022void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data,
1023				struct jbd2_buffer_trigger_type *triggers)
1024{
1025	struct buffer_head *bh = jh2bh(jh);
1026
1027	if (!triggers || !triggers->t_frozen)
1028		return;
1029
1030	triggers->t_frozen(triggers, bh, mapped_data, bh->b_size);
1031}
1032
1033void jbd2_buffer_abort_trigger(struct journal_head *jh,
1034			       struct jbd2_buffer_trigger_type *triggers)
1035{
1036	if (!triggers || !triggers->t_abort)
1037		return;
1038
1039	triggers->t_abort(triggers, jh2bh(jh));
1040}
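
/*
 * Sketch of the trigger hooks above (illustrative; the checksum step is a
 * placeholder in the spirit of ocfs2's metadata triggers): t_frozen runs
 * on the frozen image just before it goes to the journal.
 */
static void example_frozen(struct jbd2_buffer_trigger_type *triggers,
			   struct buffer_head *bh, void *mapped_data,
			   size_t size)
{
	/* ... compute and stamp a checksum over mapped_data[0..size) ... */
}

static struct jbd2_buffer_trigger_type example_triggers = {
	.t_frozen = example_frozen,
};

/* Attach with: jbd2_journal_set_triggers(bh, &example_triggers); */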
1041
1042
1043
1044/**
1045 * int jbd2_journal_dirty_metadata() -  mark a buffer as containing dirty metadata
1046 * @handle: transaction to add buffer to.
1047 * @bh: buffer to mark
1048 *
1049 * mark dirty metadata which needs to be journaled as part of the current
1050 * transaction.
1051 *
1052 * The buffer is placed on the transaction's metadata list and is marked
1053 * as belonging to the transaction.
1054 *
1055 * Returns error number or 0 on success.
1056 *
1057 * Special care needs to be taken if the buffer already belongs to the
1058 * current committing transaction (in which case we should have frozen
1059 * data present for that commit).  In that case, we don't relink the
1060 * buffer: that only gets done when the old transaction finally
1061 * completes its commit.
1062 */
1063int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
1064{
1065	transaction_t *transaction = handle->h_transaction;
1066	journal_t *journal = transaction->t_journal;
1067	struct journal_head *jh = bh2jh(bh);
1068
1069	jbd_debug(5, "journal_head %p\n", jh);
1070	JBUFFER_TRACE(jh, "entry");
1071	if (is_handle_aborted(handle))
1072		goto out;
1073
1074	jbd_lock_bh_state(bh);
1075
1076	if (jh->b_modified == 0) {
1077		/*
1078		 * This buffer has been modified and is becoming part
1079		 * of the transaction. This needs to be done
1080		 * once per transaction -bzzz
1081		 */
1082		jh->b_modified = 1;
1083		J_ASSERT_JH(jh, handle->h_buffer_credits > 0);
1084		handle->h_buffer_credits--;
1085	}
1086
1087	/*
1088	 * fastpath, to avoid expensive locking.  If this buffer is already
1089	 * on the running transaction's metadata list there is nothing to do.
1090	 * Nobody can take it off again because there is a handle open.
1091	 * I _think_ we're OK here with SMP barriers - a mistaken decision will
1092	 * result in this test being false, so we go in and take the locks.
1093	 */
1094	if (jh->b_transaction == transaction && jh->b_jlist == BJ_Metadata) {
1095		JBUFFER_TRACE(jh, "fastpath");
1096		J_ASSERT_JH(jh, jh->b_transaction ==
1097					journal->j_running_transaction);
1098		goto out_unlock_bh;
1099	}
1100
1101	set_buffer_jbddirty(bh);
1102
1103	/*
1104	 * Metadata already on the current transaction list doesn't
1105	 * need to be filed.  Metadata on another transaction's list must
1106	 * be committing, and will be refiled once the commit completes:
1107	 * leave it alone for now.
1108	 */
1109	if (jh->b_transaction != transaction) {
1110		JBUFFER_TRACE(jh, "already on other transaction");
1111		J_ASSERT_JH(jh, jh->b_transaction ==
1112					journal->j_committing_transaction);
1113		J_ASSERT_JH(jh, jh->b_next_transaction == transaction);
1114		/* And this case is illegal: we can't reuse another
1115		 * transaction's data buffer, ever. */
1116		goto out_unlock_bh;
1117	}
1118
1119	/* That test should have eliminated the following case: */
1120	J_ASSERT_JH(jh, jh->b_frozen_data == NULL);
1121
1122	JBUFFER_TRACE(jh, "file as BJ_Metadata");
1123	spin_lock(&journal->j_list_lock);
1124	__jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_Metadata);
1125	spin_unlock(&journal->j_list_lock);
1126out_unlock_bh:
1127	jbd_unlock_bh_state(bh);
1128out:
1129	JBUFFER_TRACE(jh, "exit");
1130	return 0;
1131}
1132
1133/*
1134 * jbd2_journal_release_buffer: undo a get_write_access without any buffer
1135 * updates, if the update decided in the end that it didn't need access.
1136 *
1137 */
1138void
1139jbd2_journal_release_buffer(handle_t *handle, struct buffer_head *bh)
1140{
1141	BUFFER_TRACE(bh, "entry");
1142}
1143
1144/**
1145 * void jbd2_journal_forget() - bforget() for potentially-journaled buffers.
1146 * @handle: transaction handle
1147 * @bh:     bh to 'forget'
1148 *
1149 * We can only do the bforget if there are no commits pending against the
1150 * buffer.  If the buffer is dirty in the current running transaction we
1151 * can safely unlink it.
1152 *
1153 * bh may not be a journalled buffer at all - it may be a non-JBD
1154 * buffer which came off the hashtable.  Check for this.
1155 *
1156 * Decrements bh->b_count by one.
1157 *
1158 * Allow this call even if the handle has aborted --- it may be part of
1159 * the caller's cleanup after an abort.
1160 */
1161int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh)
1162{
1163	transaction_t *transaction = handle->h_transaction;
1164	journal_t *journal = transaction->t_journal;
1165	struct journal_head *jh;
1166	int drop_reserve = 0;
1167	int err = 0;
1168	int was_modified = 0;
1169
1170	BUFFER_TRACE(bh, "entry");
1171
1172	jbd_lock_bh_state(bh);
1173	spin_lock(&journal->j_list_lock);
1174
1175	if (!buffer_jbd(bh))
1176		goto not_jbd;
1177	jh = bh2jh(bh);
1178
1179	/* Critical error: attempting to delete a bitmap buffer, maybe?
1180	 * Don't do any jbd operations, and return an error. */
1181	if (!J_EXPECT_JH(jh, !jh->b_committed_data,
1182			 "inconsistent data on disk")) {
1183		err = -EIO;
1184		goto not_jbd;
1185	}
1186
1187	/* keep track of whether or not this transaction modified us */
1188	was_modified = jh->b_modified;
1189
1190	/*
1191	 * The buffer's going from the transaction, we must drop
1192	 * all references -bzzz
1193	 */
1194	jh->b_modified = 0;
1195
1196	if (jh->b_transaction == handle->h_transaction) {
1197		J_ASSERT_JH(jh, !jh->b_frozen_data);
1198
1199		/* If we are forgetting a buffer which is already part
1200		 * of this transaction, then we can just drop it from
1201		 * the transaction immediately. */
1202		clear_buffer_dirty(bh);
1203		clear_buffer_jbddirty(bh);
1204
1205		JBUFFER_TRACE(jh, "belongs to current transaction: unfile");
1206
1207		/*
1208		 * we only want to drop a reference if this transaction
1209		 * modified the buffer
1210		 */
1211		if (was_modified)
1212			drop_reserve = 1;
1213
1214		/*
1215		 * We are no longer going to journal this buffer.
1216		 * However, the commit of this transaction is still
1217		 * important to the buffer: the delete that we are now
1218		 * processing might obsolete an old log entry, so by
1219		 * committing, we can satisfy the buffer's checkpoint.
1220		 *
1221		 * So, if we have a checkpoint on the buffer, we should
1222		 * now refile the buffer on our BJ_Forget list so that
1223		 * we know to remove the checkpoint after we commit.
1224		 */
1225
1226		if (jh->b_cp_transaction) {
1227			__jbd2_journal_temp_unlink_buffer(jh);
1228			__jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
1229		} else {
1230			__jbd2_journal_unfile_buffer(jh);
1231			if (!buffer_jbd(bh)) {
1232				spin_unlock(&journal->j_list_lock);
1233				jbd_unlock_bh_state(bh);
1234				__bforget(bh);
1235				goto drop;
1236			}
1237		}
1238	} else if (jh->b_transaction) {
1239		J_ASSERT_JH(jh, (jh->b_transaction ==
1240				 journal->j_committing_transaction));
1241		/* However, if the buffer is still owned by a prior
1242		 * (committing) transaction, we can't drop it yet... */
1243		JBUFFER_TRACE(jh, "belongs to older transaction");
1244		/* ... but we CAN drop it from the new transaction if we
1245		 * have also modified it since the original commit. */
1246
1247		if (jh->b_next_transaction) {
1248			J_ASSERT(jh->b_next_transaction == transaction);
1249			jh->b_next_transaction = NULL;
1250
1251			/*
1252			 * only drop a reference if this transaction modified
1253			 * the buffer
1254			 */
1255			if (was_modified)
1256				drop_reserve = 1;
1257		}
1258	}
1259
1260not_jbd:
1261	spin_unlock(&journal->j_list_lock);
1262	jbd_unlock_bh_state(bh);
1263	__brelse(bh);
1264drop:
1265	if (drop_reserve) {
1266		/* no need to reserve log space for this block -bzzz */
1267		handle->h_buffer_credits++;
1268	}
1269	return err;
1270}
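
/*
 * Caller-side sketch for the forget path above (illustrative, loosely
 * modelled on ext4_forget()): when deleting metadata, forget the buffer
 * instead of dirtying it so stale data is not journaled.  Note the call
 * consumes one b_count reference.
 */
static int example_forget_metadata(handle_t *handle, struct buffer_head *bh)
{
	BUFFER_TRACE(bh, "forgetting freed metadata buffer");
	return jbd2_journal_forget(handle, bh);
}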
1271
1272/**
1273 * int jbd2_journal_stop() - complete a transaction
1274 * @handle: transaction to complete.
1275 *
1276 * All done for a particular handle.
1277 *
1278 * There is not much action needed here.  We just return any remaining
1279 * buffer credits to the transaction and remove the handle.  The only
1280 * complication is that we need to start a commit operation if the
1281 * filesystem is marked for synchronous update.
1282 *
1283 * jbd2_journal_stop itself will not usually return an error, but it may
1284 * do so in unusual circumstances.  In particular, expect it to
1285 * return -EIO if a jbd2_journal_abort has been executed since the
1286 * transaction began.
1287 */
1288int jbd2_journal_stop(handle_t *handle)
1289{
1290	transaction_t *transaction = handle->h_transaction;
1291	journal_t *journal = transaction->t_journal;
1292	int err, wait_for_commit = 0;
1293	tid_t tid;
1294	pid_t pid;
1295
1296	J_ASSERT(journal_current_handle() == handle);
1297
1298	if (is_handle_aborted(handle))
1299		err = -EIO;
1300	else {
1301		J_ASSERT(atomic_read(&transaction->t_updates) > 0);
1302		err = 0;
1303	}
1304
1305	if (--handle->h_ref > 0) {
1306		jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
1307			  handle->h_ref);
1308		return err;
1309	}
1310
1311	jbd_debug(4, "Handle %p going down\n", handle);
1312
1313	/*
1314	 * Implement synchronous transaction batching.  If the handle
1315	 * was synchronous, don't force a commit immediately.  Let's
1316	 * yield and let another thread piggyback onto this
1317	 * transaction.  Keep doing that while new threads continue to
1318	 * arrive.  It doesn't cost much - we're about to run a commit
1319	 * and sleep on IO anyway.  Speeds up many-threaded, many-dir
1320	 * operations by 30x or more...
1321	 *
1322	 * We try and optimize the sleep time against what the
1323	 * underlying disk can do, instead of having a static sleep
1324	 * time.  This is useful for the case where our storage is so
1325	 * fast that it is more optimal to go ahead and force a flush
1326	 * and wait for the transaction to be committed than it is to
1327	 * wait for an arbitrary amount of time for new writers to
1328	 * join the transaction.  We achieve this by measuring how
1329	 * long it takes to commit a transaction, and compare it with
1330	 * how long this transaction has been running, and if run time
1331	 * < commit time then we sleep for the delta and commit.  This
1332	 * greatly helps super fast disks that would see slowdowns as
1333	 * more threads started doing fsyncs.
1334	 *
1335	 * But don't do this if this process was the most recent one
1336	 * to perform a synchronous write.  We do this to detect the
1337	 * case where a single process is doing a stream of sync
1338	 * writes.  No point in waiting for joiners in that case.
1339	 */
1340	pid = current->pid;
1341	if (handle->h_sync && journal->j_last_sync_writer != pid) {
1342		u64 commit_time, trans_time;
1343
1344		journal->j_last_sync_writer = pid;
1345
1346		read_lock(&journal->j_state_lock);
1347		commit_time = journal->j_average_commit_time;
1348		read_unlock(&journal->j_state_lock);
1349
1350		trans_time = ktime_to_ns(ktime_sub(ktime_get(),
1351						   transaction->t_start_time));
1352
1353		commit_time = max_t(u64, commit_time,
1354				    1000*journal->j_min_batch_time);
1355		commit_time = min_t(u64, commit_time,
1356				    1000*journal->j_max_batch_time);
1357
1358		if (trans_time < commit_time) {
1359			ktime_t expires = ktime_add_ns(ktime_get(),
1360						       commit_time);
1361			set_current_state(TASK_UNINTERRUPTIBLE);
1362			schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
1363		}
1364	}
1365
1366	if (handle->h_sync)
1367		transaction->t_synchronous_commit = 1;
1368	current->journal_info = NULL;
1369	atomic_sub(handle->h_buffer_credits,
1370		   &transaction->t_outstanding_credits);
1371
1372	/*
1373	 * If the handle is marked SYNC, we need to set another commit
1374	 * going!  We also want to force a commit if the current
1375	 * transaction is occupying too much of the log, or if the
1376	 * transaction is too old now.
1377	 */
1378	if (handle->h_sync ||
1379	    (atomic_read(&transaction->t_outstanding_credits) >
1380	     journal->j_max_transaction_buffers) ||
1381	    time_after_eq(jiffies, transaction->t_expires)) {
1382		/* Do this even for aborted journals: an abort still
1383		 * completes the commit thread, it just doesn't write
1384		 * anything to disk. */
1385
1386		jbd_debug(2, "transaction too old, requesting commit for "
1387					"handle %p\n", handle);
1388		/* This is non-blocking */
1389		jbd2_log_start_commit(journal, transaction->t_tid);
1390
1391		/*
1392		 * Special case: JBD2_SYNC synchronous updates require us
1393		 * to wait for the commit to complete.
1394		 */
1395		if (handle->h_sync && !(current->flags & PF_MEMALLOC))
1396			wait_for_commit = 1;
1397	}
1398
1399	/*
1400	 * Once we drop t_updates, if it goes to zero the transaction
1401	 * could start committing on us and eventually disappear.  So
1402	 * once we do this, we must not dereference transaction
1403	 * pointer again.
1404	 */
1405	tid = transaction->t_tid;
1406	if (atomic_dec_and_test(&transaction->t_updates)) {
1407		wake_up(&journal->j_wait_updates);
1408		if (journal->j_barrier_count)
1409			wake_up(&journal->j_wait_transaction_locked);
1410	}
1411
1412	if (wait_for_commit)
1413		err = jbd2_log_wait_commit(journal, tid);
1414
1415	lock_map_release(&handle->h_lockdep_map);
1416
1417	jbd2_free_handle(handle);
1418	return err;
1419}
1420
1421/**
1422 * int jbd2_journal_force_commit() - force any uncommitted transactions
1423 * @journal: journal to force
1424 *
1425 * For synchronous operations: force any uncommitted transactions
1426 * to disk.  May seem kludgy, but it reuses all the handle batching
1427 * code in a very simple manner.
1428 */
1429int jbd2_journal_force_commit(journal_t *journal)
1430{
1431	handle_t *handle;
1432	int ret;
1433
1434	handle = jbd2_journal_start(journal, 1);
1435	if (IS_ERR(handle)) {
1436		ret = PTR_ERR(handle);
1437	} else {
1438		handle->h_sync = 1;
1439		ret = jbd2_journal_stop(handle);
1440	}
1441	return ret;
1442}
1443
1444/*
1445 *
1446 * List management code snippets: various functions for manipulating the
1447 * transaction buffer lists.
1448 *
1449 */
1450
1451/*
1452 * Append a buffer to a transaction list, given the transaction's list head
1453 * pointer.
1454 *
1455 * j_list_lock is held.
1456 *
1457 * jbd_lock_bh_state(jh2bh(jh)) is held.
1458 */
1459
1460static inline void
1461__blist_add_buffer(struct journal_head **list, struct journal_head *jh)
1462{
1463	if (!*list) {
1464		jh->b_tnext = jh->b_tprev = jh;
1465		*list = jh;
1466	} else {
1467		/* Insert at the tail of the list to preserve order */
1468		struct journal_head *first = *list, *last = first->b_tprev;
1469		jh->b_tprev = last;
1470		jh->b_tnext = first;
1471		last->b_tnext = first->b_tprev = jh;
1472	}
1473}
1474
1475/*
1476 * Remove a buffer from a transaction list, given the transaction's list
1477 * head pointer.
1478 *
1479 * Called with j_list_lock held, and the journal may not be locked.
1480 *
1481 * jbd_lock_bh_state(jh2bh(jh)) is held.
1482 */
1483
1484static inline void
1485__blist_del_buffer(struct journal_head **list, struct journal_head *jh)
1486{
1487	if (*list == jh) {
1488		*list = jh->b_tnext;
1489		if (*list == jh)
1490			*list = NULL;
1491	}
1492	jh->b_tprev->b_tnext = jh->b_tnext;
1493	jh->b_tnext->b_tprev = jh->b_tprev;
1494}
1495
1496/*
1497 * Remove a buffer from the appropriate transaction list.
1498 *
1499 * Note that this function can *change* the value of
1500 * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list,
1501 * t_log_list or t_reserved_list.  If the caller is holding onto a copy of one
1502 * of these pointers, it could go bad.  Generally the caller needs to re-read
1503 * the pointer from the transaction_t.
1504 *
1505 * Called under j_list_lock.  The journal may not be locked.
1506 */
1507static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
1508{
1509	struct journal_head **list = NULL;
1510	transaction_t *transaction;
1511	struct buffer_head *bh = jh2bh(jh);
1512
1513	J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
1514	transaction = jh->b_transaction;
1515	if (transaction)
1516		assert_spin_locked(&transaction->t_journal->j_list_lock);
1517
1518	J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
1519	if (jh->b_jlist != BJ_None)
1520		J_ASSERT_JH(jh, transaction != NULL);
1521
1522	switch (jh->b_jlist) {
1523	case BJ_None:
1524		return;
1525	case BJ_Metadata:
1526		transaction->t_nr_buffers--;
1527		J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
1528		list = &transaction->t_buffers;
1529		break;
1530	case BJ_Forget:
1531		list = &transaction->t_forget;
1532		break;
1533	case BJ_IO:
1534		list = &transaction->t_iobuf_list;
1535		break;
1536	case BJ_Shadow:
1537		list = &transaction->t_shadow_list;
1538		break;
1539	case BJ_LogCtl:
1540		list = &transaction->t_log_list;
1541		break;
1542	case BJ_Reserved:
1543		list = &transaction->t_reserved_list;
1544		break;
1545	}
1546
1547	__blist_del_buffer(list, jh);
1548	jh->b_jlist = BJ_None;
1549	if (test_clear_buffer_jbddirty(bh))
1550		mark_buffer_dirty(bh);	/* Expose it to the VM */
1551}
1552
1553/*
1554 * Remove buffer from all transactions.
1555 *
1556 * Called with bh_state lock and j_list_lock
1557 *
1558 * jh and bh may be already freed when this function returns.
1559 */
1560static void __jbd2_journal_unfile_buffer(struct journal_head *jh)
1561{
1562	__jbd2_journal_temp_unlink_buffer(jh);
1563	jh->b_transaction = NULL;
1564	jbd2_journal_put_journal_head(jh);
1565}
1566
1567void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh)
1568{
1569	struct buffer_head *bh = jh2bh(jh);
1570
1571	/* Get reference so that buffer cannot be freed before we unlock it */
1572	get_bh(bh);
1573	jbd_lock_bh_state(bh);
1574	spin_lock(&journal->j_list_lock);
1575	__jbd2_journal_unfile_buffer(jh);
1576	spin_unlock(&journal->j_list_lock);
1577	jbd_unlock_bh_state(bh);
1578	__brelse(bh);
1579}
1580
1581/*
1582 * Called from jbd2_journal_try_to_free_buffers().
1583 *
1584 * Called under jbd_lock_bh_state(bh)
1585 */
1586static void
1587__journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
1588{
1589	struct journal_head *jh;
1590
1591	jh = bh2jh(bh);
1592
1593	if (buffer_locked(bh) || buffer_dirty(bh))
1594		goto out;
1595
1596	if (jh->b_next_transaction != NULL)
1597		goto out;
1598
1599	spin_lock(&journal->j_list_lock);
1600	if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
1601		/* written-back checkpointed metadata buffer */
1602		if (jh->b_jlist == BJ_None) {
1603			JBUFFER_TRACE(jh, "remove from checkpoint list");
1604			__jbd2_journal_remove_checkpoint(jh);
1605		}
1606	}
1607	spin_unlock(&journal->j_list_lock);
1608out:
1609	return;
1610}
1611
1612/**
1613 * int jbd2_journal_try_to_free_buffers() - try to free page buffers.
1614 * @journal: journal for operation
1615 * @page: to try and free
1616 * @gfp_mask: we use the mask to detect how hard we should try to release
1617 * buffers. If __GFP_WAIT and __GFP_FS are set, we wait for the commit code to
1618 * release the buffers.
1619 *
1620 *
1621 * For all the buffers on this page,
1622 * if they are fully written out ordered data, move them onto BUF_CLEAN
1623 * so try_to_free_buffers() can reap them.
1624 *
1625 * This function returns non-zero if we wish try_to_free_buffers()
1626 * to be called. We do this if the page is releasable by try_to_free_buffers().
1627 * We also do it if the page has locked or dirty buffers and the caller wants
1628 * us to perform sync or async writeout.
1629 *
1630 * This complicates JBD locking somewhat.  We aren't protected by the
1631 * BKL here.  We wish to remove the buffer from its committing or
1632 * running transaction's ->t_datalist via __jbd2_journal_unfile_buffer.
1633 *
1634 * This may *change* the value of transaction_t->t_datalist, so anyone
1635 * who looks at t_datalist needs to lock against this function.
1636 *
1637 * Even worse, someone may be doing a jbd2_journal_dirty_data on this
1638 * buffer.  So we need to lock against that.  jbd2_journal_dirty_data()
1639 * will come out of the lock with the buffer dirty, which makes it
1640 * ineligible for release here.
1641 *
1642 * Who else is affected by this?  hmm...  Really the only contender
1643 * is do_get_write_access() - it could be looking at the buffer while
1644 * journal_try_to_free_buffer() is changing its state.  But that
1645 * cannot happen because we never reallocate freed data as metadata
1646 * while the data is part of a transaction.  Yes?
1647 *
1648 * Return 0 on failure, 1 on success
1649 */
1650int jbd2_journal_try_to_free_buffers(journal

Large files are truncated in this view.