PageRenderTime 74ms CodeModel.GetById 13ms app.highlight 51ms RepoModel.GetById 0ms app.codeStats 1ms

/fs/namei.c

https://bitbucket.org/cyanogenmod/android_kernel_asus_tf300t
C | 3411 lines | 2426 code | 405 blank | 580 comment | 587 complexity | fb2fd2e2ce1cc24bb0eb80bc14eb23bf MD5 | raw file
Possible License(s): LGPL-2.0, AGPL-1.0, GPL-2.0

Large files are truncated, but you can click here to view the full file

   1/*
   2 *  linux/fs/namei.c
   3 *
   4 *  Copyright (C) 1991, 1992  Linus Torvalds
   5 */
   6
   7/*
   8 * Some corrections by tytso.
   9 */
  10
  11/* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
  12 * lookup logic.
  13 */
  14/* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
  15 */
  16
  17#include <linux/init.h>
  18#include <linux/module.h>
  19#include <linux/slab.h>
  20#include <linux/fs.h>
  21#include <linux/namei.h>
  22#include <linux/pagemap.h>
  23#include <linux/fsnotify.h>
  24#include <linux/personality.h>
  25#include <linux/security.h>
  26#include <linux/ima.h>
  27#include <linux/syscalls.h>
  28#include <linux/mount.h>
  29#include <linux/audit.h>
  30#include <linux/capability.h>
  31#include <linux/file.h>
  32#include <linux/fcntl.h>
  33#include <linux/device_cgroup.h>
  34#include <linux/fs_struct.h>
  35#include <linux/posix_acl.h>
  36#include <asm/uaccess.h>
  37
  38#include "internal.h"
  39
  40/* [Feb-1997 T. Schoebel-Theuer]
  41 * Fundamental changes in the pathname lookup mechanisms (namei)
  42 * were necessary because of omirr.  The reason is that omirr needs
  43 * to know the _real_ pathname, not the user-supplied one, in case
  44 * of symlinks (and also when transname replacements occur).
  45 *
  46 * The new code replaces the old recursive symlink resolution with
  47 * an iterative one (in case of non-nested symlink chains).  It does
  48 * this with calls to <fs>_follow_link().
  49 * As a side effect, dir_namei(), _namei() and follow_link() are now 
  50 * replaced with a single function lookup_dentry() that can handle all 
  51 * the special cases of the former code.
  52 *
  53 * With the new dcache, the pathname is stored at each inode, at least as
  54 * long as the refcount of the inode is positive.  As a side effect, the
  55 * size of the dcache depends on the inode cache and thus is dynamic.
  56 *
  57 * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
  58 * resolution to correspond with current state of the code.
  59 *
  60 * Note that the symlink resolution is not *completely* iterative.
  61 * There is still a significant amount of tail- and mid- recursion in
  62 * the algorithm.  Also, note that <fs>_readlink() is not used in
  63 * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
  64 * may return different results than <fs>_follow_link().  Many virtual
  65 * filesystems (including /proc) exhibit this behavior.
  66 */
  67
  68/* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
  69 * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
  70 * and the name already exists in form of a symlink, try to create the new
  71 * name indicated by the symlink. The old code always complained that the
  72 * name already exists, due to not following the symlink even if its target
  73 * is nonexistent.  The new semantics affects also mknod() and link() when
  74 * the name is a symlink pointing to a non-existent name.
  75 *
  76 * I don't know which semantics is the right one, since I have no access
  77 * to standards. But I found by trial that HP-UX 9.0 has the full "new"
  78 * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
  79 * "old" one. Personally, I think the new semantics is much more logical.
  80 * Note that "ln old new" where "new" is a symlink pointing to a non-existing
  81 * file does succeed in both HP-UX and SunOs, but not in Solaris
  82 * and in the old Linux semantics.
  83 */
  84
  85/* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
  86 * semantics.  See the comments in "open_namei" and "do_link" below.
  87 *
  88 * [10-Sep-98 Alan Modra] Another symlink change.
  89 */
  90
  91/* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
  92 *	inside the path - always follow.
  93 *	in the last component in creation/removal/renaming - never follow.
  94 *	if LOOKUP_FOLLOW passed - follow.
  95 *	if the pathname has trailing slashes - follow.
  96 *	otherwise - don't follow.
  97 * (applied in that order).
  98 *
  99 * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
 100 * restored for 2.4. This is the last surviving part of old 4.2BSD bug.
 101 * During the 2.4 we need to fix the userland stuff depending on it -
 102 * hopefully we will be able to get rid of that wart in 2.5. So far only
 103 * XEmacs seems to be relying on it...
 104 */
 105/*
 106 * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
 107 * implemented.  Let's see if raised priority of ->s_vfs_rename_mutex gives
 108 * any extra contention...
 109 */
 110
 111/* In order to reduce some races, while at the same time doing additional
 112 * checking and hopefully speeding things up, we copy filenames to the
 113 * kernel data space before using them..
 114 *
 115 * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
 116 * PATH_MAX includes the nul terminator --RR.
 117 */
 118static int do_getname(const char __user *filename, char *page)
 119{
 120	int retval;
 121	unsigned long len = PATH_MAX;
 122
 123	if (!segment_eq(get_fs(), KERNEL_DS)) {
 124		if ((unsigned long) filename >= TASK_SIZE)
 125			return -EFAULT;
 126		if (TASK_SIZE - (unsigned long) filename < PATH_MAX)
 127			len = TASK_SIZE - (unsigned long) filename;
 128	}
 129
 130	retval = strncpy_from_user(page, filename, len);
 131	if (retval > 0) {
 132		if (retval < len)
 133			return 0;
 134		return -ENAMETOOLONG;
 135	} else if (!retval)
 136		retval = -ENOENT;
 137	return retval;
 138}
 139
 140static char *getname_flags(const char __user *filename, int flags, int *empty)
 141{
 142	char *tmp, *result;
 143
 144	result = ERR_PTR(-ENOMEM);
 145	tmp = __getname();
 146	if (tmp)  {
 147		int retval = do_getname(filename, tmp);
 148
 149		result = tmp;
 150		if (retval < 0) {
 151			if (retval == -ENOENT && empty)
 152				*empty = 1;
 153			if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) {
 154				__putname(tmp);
 155				result = ERR_PTR(retval);
 156			}
 157		}
 158	}
 159	audit_getname(result);
 160	return result;
 161}
 162
 163char *getname(const char __user * filename)
 164{
 165	return getname_flags(filename, 0, 0);
 166}
 167
#ifdef CONFIG_AUDITSYSCALL
/*
 * Release a name obtained from getname().  If the current syscall is
 * being audited, the audit context may still reference the name, so
 * let the audit layer decide when to actually free it.
 */
void putname(const char *name)
{
	if (unlikely(!audit_dummy_context()))
		audit_putname(name);
	else
		__putname(name);
}
EXPORT_SYMBOL(putname);
#endif
 178
 179static int check_acl(struct inode *inode, int mask)
 180{
 181#ifdef CONFIG_FS_POSIX_ACL
 182	struct posix_acl *acl;
 183
 184	if (mask & MAY_NOT_BLOCK) {
 185		acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
 186	        if (!acl)
 187	                return -EAGAIN;
 188		/* no ->get_acl() calls in RCU mode... */
 189		if (acl == ACL_NOT_CACHED)
 190			return -ECHILD;
 191	        return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK);
 192	}
 193
 194	acl = get_cached_acl(inode, ACL_TYPE_ACCESS);
 195
 196	/*
 197	 * A filesystem can force a ACL callback by just never filling the
 198	 * ACL cache. But normally you'd fill the cache either at inode
 199	 * instantiation time, or on the first ->get_acl call.
 200	 *
 201	 * If the filesystem doesn't have a get_acl() function at all, we'll
 202	 * just create the negative cache entry.
 203	 */
 204	if (acl == ACL_NOT_CACHED) {
 205	        if (inode->i_op->get_acl) {
 206			acl = inode->i_op->get_acl(inode, ACL_TYPE_ACCESS);
 207			if (IS_ERR(acl))
 208				return PTR_ERR(acl);
 209		} else {
 210		        set_cached_acl(inode, ACL_TYPE_ACCESS, NULL);
 211		        return -EAGAIN;
 212		}
 213	}
 214
 215	if (acl) {
 216	        int error = posix_acl_permission(inode, acl, mask);
 217	        posix_acl_release(acl);
 218	        return error;
 219	}
 220#endif
 221
 222	return -EAGAIN;
 223}
 224
 225/*
 226 * This does basic POSIX ACL permission checking
 227 */
 228static int acl_permission_check(struct inode *inode, int mask)
 229{
 230	unsigned int mode = inode->i_mode;
 231
 232	mask &= MAY_READ | MAY_WRITE | MAY_EXEC | MAY_NOT_BLOCK;
 233
 234	if (current_user_ns() != inode_userns(inode))
 235		goto other_perms;
 236
 237	if (likely(current_fsuid() == inode->i_uid))
 238		mode >>= 6;
 239	else {
 240		if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
 241			int error = check_acl(inode, mask);
 242			if (error != -EAGAIN)
 243				return error;
 244		}
 245
 246		if (in_group_p(inode->i_gid))
 247			mode >>= 3;
 248	}
 249
 250other_perms:
 251	/*
 252	 * If the DACs are ok we don't need any capability check.
 253	 */
 254	if ((mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
 255		return 0;
 256	return -EACCES;
 257}
 258
/**
 * generic_permission -  check for access rights on a Posix-like filesystem
 * @inode:	inode to check access rights for
 * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 *
 * Used to check for read/write/execute permissions on a file.
 * We use "fsuid" for this, letting us set arbitrary permissions
 * for filesystem access without changing the "normal" uids which
 * are used for other things.
 *
 * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
 * request cannot be satisfied (eg. requires blocking or too much complexity).
 * It would then be called again in ref-walk mode.
 */
int generic_permission(struct inode *inode, int mask)
{
	int ret;

	/*
	 * Do the basic POSIX ACL permission checks.
	 */
	ret = acl_permission_check(inode, mask);
	if (ret != -EACCES)
		return ret;

	/* The DAC bits denied access; see whether a capability (in the
	 * inode's user namespace) overrides them. */
	if (S_ISDIR(inode->i_mode)) {
		/* DACs are overridable for directories */
		if (ns_capable(inode_userns(inode), CAP_DAC_OVERRIDE))
			return 0;
		/* Read/search on a directory also yields to DAC_READ_SEARCH */
		if (!(mask & MAY_WRITE))
			if (ns_capable(inode_userns(inode), CAP_DAC_READ_SEARCH))
				return 0;
		return -EACCES;
	}
	/*
	 * Read/write DACs are always overridable.
	 * Executable DACs are overridable when there is
	 * at least one exec bit set.
	 */
	if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
		if (ns_capable(inode_userns(inode), CAP_DAC_OVERRIDE))
			return 0;

	/*
	 * Searching includes executable on directories, else just read.
	 */
	mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
	if (mask == MAY_READ)
		if (ns_capable(inode_userns(inode), CAP_DAC_READ_SEARCH))
			return 0;

	return -EACCES;
}
 312
/*
 * We _really_ want to just do "generic_permission()" without
 * even looking at the inode->i_op values. So we keep a cache
 * flag in inode->i_opflags, that says "this has not special
 * permission function, use the fast case".
 */
static inline int do_inode_permission(struct inode *inode, int mask)
{
	if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) {
		/* A custom ->permission() always wins; the fast-path flag
		 * is only ever set when there is none. */
		if (likely(inode->i_op->permission))
			return inode->i_op->permission(inode, mask);

		/* This gets set once for the inode lifetime */
		spin_lock(&inode->i_lock);
		inode->i_opflags |= IOP_FASTPERM;
		spin_unlock(&inode->i_lock);
	}
	return generic_permission(inode, mask);
}
 332
 333/**
 334 * inode_permission  -  check for access rights to a given inode
 335 * @inode:	inode to check permission on
 336 * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 337 *
 338 * Used to check for read/write/execute permissions on an inode.
 339 * We use "fsuid" for this, letting us set arbitrary permissions
 340 * for filesystem access without changing the "normal" uids which
 341 * are used for other things.
 342 */
 343int inode_permission(struct inode *inode, int mask)
 344{
 345	int retval;
 346
 347	if (unlikely(mask & MAY_WRITE)) {
 348		umode_t mode = inode->i_mode;
 349
 350		/*
 351		 * Nobody gets write access to a read-only fs.
 352		 */
 353		if (IS_RDONLY(inode) &&
 354		    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
 355			return -EROFS;
 356
 357		/*
 358		 * Nobody gets write access to an immutable file.
 359		 */
 360		if (IS_IMMUTABLE(inode))
 361			return -EACCES;
 362	}
 363
 364	retval = do_inode_permission(inode, mask);
 365	if (retval)
 366		return retval;
 367
 368	retval = devcgroup_inode_permission(inode, mask);
 369	if (retval)
 370		return retval;
 371
 372	return security_inode_permission(inode, mask);
 373}
 374
/**
 * path_get - get a reference to a path
 * @path: path to get the reference to
 *
 * Given a path increment the reference count to the dentry and the vfsmount.
 */
void path_get(struct path *path)
{
	mntget(path->mnt);
	dget(path->dentry);
}
EXPORT_SYMBOL(path_get);
 387
/**
 * path_put - put a reference to a path
 * @path: path to put the reference to
 *
 * Given a path decrement the reference count to the dentry and the vfsmount.
 * Note the order is the reverse of path_get(): the dentry ref is dropped
 * before the mount that keeps its filesystem alive.
 */
void path_put(struct path *path)
{
	dput(path->dentry);
	mntput(path->mnt);
}
EXPORT_SYMBOL(path_put);
 400
 401/*
 402 * Path walking has 2 modes, rcu-walk and ref-walk (see
 403 * Documentation/filesystems/path-lookup.txt).  In situations when we can't
 404 * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
 405 * normal reference counts on dentries and vfsmounts to transition to rcu-walk
 406 * mode.  Refcounts are grabbed at the last known good point before rcu-walk
 407 * got stuck, so ref-walk may continue from there. If this is not successful
 408 * (eg. a seqcount has changed), then failure is returned and it's up to caller
 409 * to restart the path walk from the beginning in ref-walk mode.
 410 */
 411
/**
 * unlazy_walk - try to switch to ref-walk mode.
 * @nd: nameidata pathwalk data
 * @dentry: child of nd->path.dentry or NULL
 * Returns: 0 on success, -ECHILD on failure
 *
 * unlazy_walk attempts to legitimize the current nd->path, nd->root and dentry
 * for ref-walk mode.  @dentry must be a path found by a do_lookup call on
 * @nd or NULL.  Must be called from rcu-walk context.
 */
static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
{
	struct fs_struct *fs = current->fs;
	struct dentry *parent = nd->path.dentry;
	int want_root = 0;

	BUG_ON(!(nd->flags & LOOKUP_RCU));
	/* nd->root was sampled lazily from fs->root without a reference;
	 * it is only still valid if fs->root has not changed since.
	 * LOOKUP_ROOT roots are supplied (and pinned) by the caller. */
	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
		want_root = 1;
		spin_lock(&fs->lock);
		if (nd->root.mnt != fs->root.mnt ||
				nd->root.dentry != fs->root.dentry)
			goto err_root;
	}
	spin_lock(&parent->d_lock);
	if (!dentry) {
		/* No child: just turn the current position's seqcount
		 * into a real refcount. */
		if (!__d_rcu_to_refcount(parent, nd->seq))
			goto err_parent;
		BUG_ON(nd->inode != parent->d_inode);
	} else {
		if (dentry->d_parent != parent)
			goto err_parent;
		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
		if (!__d_rcu_to_refcount(dentry, nd->seq))
			goto err_child;
		/*
		 * If the sequence check on the child dentry passed, then
		 * the child has not been removed from its parent. This
		 * means the parent dentry must be valid and able to take
		 * a reference at this point.
		 */
		BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent);
		BUG_ON(!parent->d_count);
		parent->d_count++;
		spin_unlock(&dentry->d_lock);
	}
	spin_unlock(&parent->d_lock);
	if (want_root) {
		path_get(&nd->root);
		spin_unlock(&fs->lock);
	}
	mntget(nd->path.mnt);

	/* All references are now held: safe to leave the RCU read-side
	 * critical section and the mount-tree lock. */
	rcu_read_unlock();
	br_read_unlock(vfsmount_lock);
	nd->flags &= ~LOOKUP_RCU;
	return 0;

err_child:
	spin_unlock(&dentry->d_lock);
err_parent:
	spin_unlock(&parent->d_lock);
err_root:
	if (want_root)
		spin_unlock(&fs->lock);
	return -ECHILD;
}
 479
 480/**
 481 * release_open_intent - free up open intent resources
 482 * @nd: pointer to nameidata
 483 */
 484void release_open_intent(struct nameidata *nd)
 485{
 486	struct file *file = nd->intent.open.file;
 487
 488	if (file && !IS_ERR(file)) {
 489		if (file->f_path.dentry == NULL)
 490			put_filp(file);
 491		else
 492			fput(file);
 493	}
 494}
 495
/*
 * Thin wrapper around ->d_revalidate(); the dentry must have a d_op
 * with d_revalidate set (callers check DCACHE_OP_REVALIDATE first).
 */
static inline int d_revalidate(struct dentry *dentry, struct nameidata *nd)
{
	return dentry->d_op->d_revalidate(dentry, nd);
}
 500
/**
 * complete_walk - successful completion of path walk
 * @nd:  pointer nameidata
 *
 * If we had been in RCU mode, drop out of it and legitimize nd->path.
 * Revalidate the final result, unless we'd already done that during
 * the path walk or the filesystem doesn't ask for it.  Return 0 on
 * success, -error on failure.  In case of failure caller does not
 * need to drop nd->path.
 */
static int complete_walk(struct nameidata *nd)
{
	struct dentry *dentry = nd->path.dentry;
	int status;

	if (nd->flags & LOOKUP_RCU) {
		nd->flags &= ~LOOKUP_RCU;
		if (!(nd->flags & LOOKUP_ROOT))
			nd->root.mnt = NULL;
		/* Turn the rcu-walk position into real refcounts. */
		spin_lock(&dentry->d_lock);
		if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) {
			spin_unlock(&dentry->d_lock);
			rcu_read_unlock();
			br_read_unlock(vfsmount_lock);
			return -ECHILD;
		}
		BUG_ON(nd->inode != dentry->d_inode);
		spin_unlock(&dentry->d_lock);
		mntget(nd->path.mnt);
		rcu_read_unlock();
		br_read_unlock(vfsmount_lock);
	}

	/* Extra revalidation is only needed when the walk jumped
	 * (LOOKUP_JUMPED) on a filesystem that asks for it. */
	if (likely(!(nd->flags & LOOKUP_JUMPED)))
		return 0;

	if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE)))
		return 0;

	if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)))
		return 0;

	/* Note: we do not d_invalidate() */
	status = d_revalidate(dentry, nd);
	if (status > 0)
		return 0;

	if (!status)
		status = -ESTALE;

	/* On failure we consume the caller's nd->path reference. */
	path_put(&nd->path);
	return status;
}
 554
/* Lazily pick up the process's root (with references) for ref-walk;
 * no-op if nd->root is already set (e.g. LOOKUP_ROOT). */
static __always_inline void set_root(struct nameidata *nd)
{
	if (!nd->root.mnt)
		get_fs_root(current->fs, &nd->root);
}
 560
 561static int link_path_walk(const char *, struct nameidata *);
 562
/* rcu-walk counterpart of set_root(): snapshot fs->root without taking
 * references, retrying until the copy is consistent. */
static __always_inline void set_root_rcu(struct nameidata *nd)
{
	if (!nd->root.mnt) {
		struct fs_struct *fs = current->fs;
		unsigned seq;

		/* Record the root dentry's d_seq inside the fs->seq loop so
		 * the (root, seq) pair can be legitimized later. */
		do {
			seq = read_seqcount_begin(&fs->seq);
			nd->root = fs->root;
			nd->seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
		} while (read_seqcount_retry(&fs->seq, seq));
	}
}
 576
/*
 * Walk the body of a symlink.  An absolute link ('/') restarts the walk
 * from nd's root.  On failure the caller's nd->path reference is
 * consumed; @link may itself be an ERR_PTR from ->follow_link().
 */
static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
{
	int ret;

	if (IS_ERR(link))
		goto fail;

	if (*link == '/') {
		set_root(nd);
		path_put(&nd->path);
		nd->path = nd->root;
		/* nd->path now aliases nd->root: take an extra reference. */
		path_get(&nd->root);
		nd->flags |= LOOKUP_JUMPED;
	}
	nd->inode = nd->path.dentry->d_inode;

	ret = link_path_walk(link, nd);
	return ret;
fail:
	path_put(&nd->path);
	return PTR_ERR(link);
}
 599
/* Drop the refs held on @path; the mount ref only if it is not the
 * same mount nd->path still holds a reference on. */
static void path_put_conditional(struct path *path, struct nameidata *nd)
{
	dput(path->dentry);
	if (path->mnt != nd->path.mnt)
		mntput(path->mnt);
}
 606
/* Advance nd->path to @path, dropping the old references in ref-walk
 * mode (in rcu-walk no references are held, so nothing to drop). */
static inline void path_to_nameidata(const struct path *path,
					struct nameidata *nd)
{
	if (!(nd->flags & LOOKUP_RCU)) {
		dput(nd->path.dentry);
		/* Same mount: the existing mnt reference simply carries over. */
		if (nd->path.mnt != path->mnt)
			mntput(nd->path.mnt);
	}
	nd->path.mnt = path->mnt;
	nd->path.dentry = path->dentry;
}
 618
/* Finish with a symlink: give the fs its ->put_link() callback (unless
 * follow_link failed, signalled by an ERR_PTR cookie) and drop @link. */
static inline void put_link(struct nameidata *nd, struct path *link, void *cookie)
{
	struct inode *inode = link->dentry->d_inode;
	if (!IS_ERR(cookie) && inode->i_op->put_link)
		inode->i_op->put_link(link->dentry, nd, cookie);
	path_put(link);
}
 626
/*
 * Traverse one symlink.  *p receives the cookie for put_link(); it is
 * set to an ERR_PTR when no ->put_link() call is wanted.  On error the
 * caller's nd->path reference is consumed.
 */
static __always_inline int
follow_link(struct path *link, struct nameidata *nd, void **p)
{
	int error;
	struct dentry *dentry = link->dentry;

	BUG_ON(nd->flags & LOOKUP_RCU);

	/* If the link lives on nd's mount, take an extra mnt ref so the
	 * eventual path_put(link) in put_link() is always balanced. */
	if (link->mnt == nd->path.mnt)
		mntget(link->mnt);

	/* Cap total symlink traversals for the whole walk at 40. */
	if (unlikely(current->total_link_count >= 40)) {
		*p = ERR_PTR(-ELOOP); /* no ->put_link(), please */
		path_put(&nd->path);
		return -ELOOP;
	}
	cond_resched();
	current->total_link_count++;

	touch_atime(link->mnt, dentry);
	nd_set_link(nd, NULL);

	error = security_inode_follow_link(link->dentry, nd);
	if (error) {
		*p = ERR_PTR(error); /* no ->put_link(), please */
		path_put(&nd->path);
		return error;
	}

	nd->last_type = LAST_BIND;
	*p = dentry->d_inode->i_op->follow_link(dentry, nd);
	error = PTR_ERR(*p);
	if (!IS_ERR(*p)) {
		char *s = nd_get_link(nd);
		error = 0;
		if (s)
			error = __vfs_follow_link(nd, s);
		else if (nd->last_type == LAST_BIND) {
			/* ->follow_link() jumped nd to a path directly
			 * instead of returning link text. */
			nd->flags |= LOOKUP_JUMPED;
			nd->inode = nd->path.dentry->d_inode;
			if (nd->inode->i_op->follow_link) {
				/* stepped on a _really_ weird one */
				path_put(&nd->path);
				error = -ELOOP;
			}
		}
	}
	return error;
}
 676
 677static int follow_up_rcu(struct path *path)
 678{
 679	struct vfsmount *parent;
 680	struct dentry *mountpoint;
 681
 682	parent = path->mnt->mnt_parent;
 683	if (parent == path->mnt)
 684		return 0;
 685	mountpoint = path->mnt->mnt_mountpoint;
 686	path->dentry = mountpoint;
 687	path->mnt = parent;
 688	return 1;
 689}
 690
/*
 * Climb from the root of a mounted filesystem to the mountpoint it
 * covers in the parent mount.  On success, returns 1 with @path updated
 * and referenced; returns 0 if already at the top of the mount tree.
 */
int follow_up(struct path *path)
{
	struct vfsmount *parent;
	struct dentry *mountpoint;

	/* vfsmount_lock keeps the mount tree stable while we look. */
	br_read_lock(vfsmount_lock);
	parent = path->mnt->mnt_parent;
	if (parent == path->mnt) {
		br_read_unlock(vfsmount_lock);
		return 0;
	}
	/* Take refs on the new position before dropping the lock... */
	mntget(parent);
	mountpoint = dget(path->mnt->mnt_mountpoint);
	br_read_unlock(vfsmount_lock);
	/* ...then release the refs held on the old position. */
	dput(path->dentry);
	path->dentry = mountpoint;
	mntput(path->mnt);
	path->mnt = parent;
	return 1;
}
 711
/*
 * Perform an automount
 * - return -EISDIR to tell follow_managed() to stop and return the path we
 *   were called with.
 */
static int follow_automount(struct path *path, unsigned flags,
			    bool *need_mntput)
{
	struct vfsmount *mnt;
	int err;

	if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
		return -EREMOTE;

	/* We don't want to mount if someone's just doing a stat -
	 * unless they're stat'ing a directory and appended a '/' to
	 * the name.
	 *
	 * We do, however, want to mount if someone wants to open or
	 * create a file of any type under the mountpoint, wants to
	 * traverse through the mountpoint or wants to open the
	 * mounted directory.  Also, autofs may mark negative dentries
	 * as being automount points.  These will need the attentions
	 * of the daemon to instantiate them before they can be used.
	 */
	if (!(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
		     LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
	    path->dentry->d_inode)
		return -EISDIR;

	/* Automounts count against the same recursion limit as symlinks. */
	current->total_link_count++;
	if (current->total_link_count >= 40)
		return -ELOOP;

	mnt = path->dentry->d_op->d_automount(path);
	if (IS_ERR(mnt)) {
		/*
		 * The filesystem is allowed to return -EISDIR here to indicate
		 * it doesn't want to automount.  For instance, autofs would do
		 * this so that its userspace daemon can mount on this dentry.
		 *
		 * However, we can only permit this if it's a terminal point in
		 * the path being looked up; if it wasn't then the remainder of
		 * the path is inaccessible and we should say so.
		 */
		if (PTR_ERR(mnt) == -EISDIR && (flags & LOOKUP_PARENT))
			return -EREMOTE;
		return PTR_ERR(mnt);
	}

	if (!mnt) /* mount collision */
		return 0;

	if (!*need_mntput) {
		/* lock_mount() may release path->mnt on error */
		mntget(path->mnt);
		*need_mntput = true;
	}
	err = finish_automount(mnt, path);

	switch (err) {
	case -EBUSY:
		/* Someone else made a mount here whilst we were busy */
		return 0;
	case 0:
		/* Mounted: step into the root of the new mount. */
		path_put(path);
		path->mnt = mnt;
		path->dentry = dget(mnt->mnt_root);
		return 0;
	default:
		return err;
	}

}
 786
/*
 * Handle a dentry that is managed in some way.
 * - Flagged for transit management (autofs)
 * - Flagged as mountpoint
 * - Flagged as automount point
 *
 * This may only be called in refwalk mode.
 *
 * Serialization is taken care of in namespace.c
 *
 * Returns <0 on error, otherwise 1 if the caller must eventually
 * mntput() path->mnt (the path moved to a new mount) and 0 if not.
 */
static int follow_managed(struct path *path, unsigned flags)
{
	struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */
	unsigned managed;
	bool need_mntput = false;
	int ret = 0;

	/* Given that we're not holding a lock here, we retain the value in a
	 * local variable for each dentry as we look at it so that we don't see
	 * the components of that value change under us */
	while (managed = ACCESS_ONCE(path->dentry->d_flags),
	       managed &= DCACHE_MANAGED_DENTRY,
	       unlikely(managed != 0)) {
		/* Allow the filesystem to manage the transit without i_mutex
		 * being held. */
		if (managed & DCACHE_MANAGE_TRANSIT) {
			BUG_ON(!path->dentry->d_op);
			BUG_ON(!path->dentry->d_op->d_manage);
			ret = path->dentry->d_op->d_manage(path->dentry, false);
			if (ret < 0)
				break;
		}

		/* Transit to a mounted filesystem. */
		if (managed & DCACHE_MOUNTED) {
			struct vfsmount *mounted = lookup_mnt(path);
			if (mounted) {
				dput(path->dentry);
				if (need_mntput)
					mntput(path->mnt);
				path->mnt = mounted;
				path->dentry = dget(mounted->mnt_root);
				need_mntput = true;
				continue;
			}

			/* Something is mounted on this dentry in another
			 * namespace and/or whatever was mounted there in this
			 * namespace got unmounted before we managed to get the
			 * vfsmount_lock */
		}

		/* Handle an automount point */
		if (managed & DCACHE_NEED_AUTOMOUNT) {
			ret = follow_automount(path, flags, &need_mntput);
			if (ret < 0)
				break;
			continue;
		}

		/* We didn't change the current path point */
		break;
	}

	/* If we ended up back on the caller's original mount, the ref we
	 * took along the way duplicates the caller's: drop it. */
	if (need_mntput && path->mnt == mnt)
		mntput(path->mnt);
	/* -EISDIR from follow_automount() means "stop here", not failure. */
	if (ret == -EISDIR)
		ret = 0;
	return ret < 0 ? ret : need_mntput;
}
 857
 858int follow_down_one(struct path *path)
 859{
 860	struct vfsmount *mounted;
 861
 862	mounted = lookup_mnt(path);
 863	if (mounted) {
 864		dput(path->dentry);
 865		mntput(path->mnt);
 866		path->mnt = mounted;
 867		path->dentry = dget(mounted->mnt_root);
 868		return 1;
 869	}
 870	return 0;
 871}
 872
/* Would transiting this managed dentry require blocking?  Calls
 * ->d_manage() in rcu mode (second argument true); a negative return
 * means the rcu-walk caller must bail out. */
static inline bool managed_dentry_might_block(struct dentry *dentry)
{
	return (dentry->d_flags & DCACHE_MANAGE_TRANSIT &&
		dentry->d_op->d_manage(dentry, true) < 0);
}
 878
/*
 * Try to skip to top of mountpoint pile in rcuwalk mode.  Fail if
 * we meet a managed dentry that would need blocking.
 */
static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
			       struct inode **inode)
{
	for (;;) {
		struct vfsmount *mounted;
		/*
		 * Don't forget we might have a non-mountpoint managed dentry
		 * that wants to block transit.
		 */
		if (unlikely(managed_dentry_might_block(path->dentry)))
			return false;

		if (!d_mountpoint(path->dentry))
			break;

		mounted = __lookup_mnt(path->mnt, path->dentry, 1);
		if (!mounted)
			break;
		path->mnt = mounted;
		path->dentry = mounted->mnt_root;
		nd->flags |= LOOKUP_JUMPED;
		nd->seq = read_seqcount_begin(&path->dentry->d_seq);
		/*
		 * Update the inode too. We don't need to re-check the
		 * dentry sequence number here after this d_inode read,
		 * because a mount-point is always pinned.
		 */
		*inode = path->dentry->d_inode;
	}
	return true;
}
 914
 915static void follow_mount_rcu(struct nameidata *nd)
 916{
 917	while (d_mountpoint(nd->path.dentry)) {
 918		struct vfsmount *mounted;
 919		mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry, 1);
 920		if (!mounted)
 921			break;
 922		nd->path.mnt = mounted;
 923		nd->path.dentry = mounted->mnt_root;
 924		nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
 925	}
 926}
 927
/* Handle ".." in rcu-walk mode; drops out of rcu-walk (-ECHILD) if a
 * seqcount check fails. */
static int follow_dotdot_rcu(struct nameidata *nd)
{
	set_root_rcu(nd);

	while (1) {
		/* ".." at the (possibly chroot'ed) root stays put. */
		if (nd->path.dentry == nd->root.dentry &&
		    nd->path.mnt == nd->root.mnt) {
			break;
		}
		if (nd->path.dentry != nd->path.mnt->mnt_root) {
			struct dentry *old = nd->path.dentry;
			struct dentry *parent = old->d_parent;
			unsigned seq;

			/* Sample the parent's seq before validating our own,
			 * so the parent we step to is the one we observed. */
			seq = read_seqcount_begin(&parent->d_seq);
			if (read_seqcount_retry(&old->d_seq, nd->seq))
				goto failed;
			nd->path.dentry = parent;
			nd->seq = seq;
			break;
		}
		/* At a mount root: climb to the covering mountpoint. */
		if (!follow_up_rcu(&nd->path))
			break;
		nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
	}
	follow_mount_rcu(nd);
	nd->inode = nd->path.dentry->d_inode;
	return 0;

failed:
	/* Sequence mismatch: abandon rcu-walk entirely. */
	nd->flags &= ~LOOKUP_RCU;
	if (!(nd->flags & LOOKUP_ROOT))
		nd->root.mnt = NULL;
	rcu_read_unlock();
	br_read_unlock(vfsmount_lock);
	return -ECHILD;
}
 965
/*
 * Follow down to the covering mount currently visible to userspace.  At each
 * point, the filesystem owning that dentry may be queried as to whether the
 * caller is permitted to proceed or not.
 */
int follow_down(struct path *path)
{
	unsigned managed;
	int ret;

	while (managed = ACCESS_ONCE(path->dentry->d_flags),
	       unlikely(managed & DCACHE_MANAGED_DENTRY)) {
		/* Allow the filesystem to manage the transit without i_mutex
		 * being held.
		 *
		 * We indicate to the filesystem if someone is trying to mount
		 * something here.  This gives autofs the chance to deny anyone
		 * other than its daemon the right to mount on its
		 * superstructure.
		 *
		 * The filesystem may sleep at this point.
		 */
		if (managed & DCACHE_MANAGE_TRANSIT) {
			BUG_ON(!path->dentry->d_op);
			BUG_ON(!path->dentry->d_op->d_manage);
			ret = path->dentry->d_op->d_manage(
				path->dentry, false);
			/* -EISDIR means "stop here": success, no descent. */
			if (ret < 0)
				return ret == -EISDIR ? 0 : ret;
		}

		/* Transit to a mounted filesystem. */
		if (managed & DCACHE_MOUNTED) {
			struct vfsmount *mounted = lookup_mnt(path);
			if (!mounted)
				break;
			dput(path->dentry);
			mntput(path->mnt);
			path->mnt = mounted;
			path->dentry = dget(mounted->mnt_root);
			continue;
		}

		/* Don't handle automount points here */
		break;
	}
	return 0;
}
1014
1015/*
1016 * Skip to top of mountpoint pile in refwalk mode for follow_dotdot()
1017 */
1018static void follow_mount(struct path *path)
1019{
1020	while (d_mountpoint(path->dentry)) {
1021		struct vfsmount *mounted = lookup_mnt(path);
1022		if (!mounted)
1023			break;
1024		dput(path->dentry);
1025		mntput(path->mnt);
1026		path->mnt = mounted;
1027		path->dentry = dget(mounted->mnt_root);
1028	}
1029}
1030
/* Handle ".." in ref-walk mode: step to the parent, climbing mount
 * boundaries as needed, but never above nd's root. */
static void follow_dotdot(struct nameidata *nd)
{
	set_root(nd);

	while(1) {
		struct dentry *old = nd->path.dentry;

		/* ".." at the (possibly chroot'ed) root stays put. */
		if (nd->path.dentry == nd->root.dentry &&
		    nd->path.mnt == nd->root.mnt) {
			break;
		}
		if (nd->path.dentry != nd->path.mnt->mnt_root) {
			/* rare case of legitimate dget_parent()... */
			nd->path.dentry = dget_parent(nd->path.dentry);
			dput(old);
			break;
		}
		/* At a mount root: climb into the parent mount and retry. */
		if (!follow_up(&nd->path))
			break;
	}
	follow_mount(&nd->path);
	nd->inode = nd->path.dentry->d_inode;
}
1054
1055/*
1056 * Allocate a dentry with name and parent, and perform a parent
1057 * directory ->lookup on it. Returns the new dentry, or ERR_PTR
1058 * on error. parent->d_inode->i_mutex must be held. d_lookup must
1059 * have verified that no child exists while under i_mutex.
1060 */
1061static struct dentry *d_alloc_and_lookup(struct dentry *parent,
1062				struct qstr *name, struct nameidata *nd)
1063{
1064	struct inode *inode = parent->d_inode;
1065	struct dentry *dentry;
1066	struct dentry *old;
1067
1068	/* Don't create child dentry for a dead directory. */
1069	if (unlikely(IS_DEADDIR(inode)))
1070		return ERR_PTR(-ENOENT);
1071
1072	dentry = d_alloc(parent, name);
1073	if (unlikely(!dentry))
1074		return ERR_PTR(-ENOMEM);
1075
1076	old = inode->i_op->lookup(inode, dentry, nd);
1077	if (unlikely(old)) {
1078		dput(dentry);
1079		dentry = old;
1080	}
1081	return dentry;
1082}
1083
1084/*
1085 * We already have a dentry, but require a lookup to be performed on the parent
1086 * directory to fill in d_inode. Returns the new dentry, or ERR_PTR on error.
1087 * parent->d_inode->i_mutex must be held. d_lookup must have verified that no
1088 * child exists while under i_mutex.
1089 */
1090static struct dentry *d_inode_lookup(struct dentry *parent, struct dentry *dentry,
1091				     struct nameidata *nd)
1092{
1093	struct inode *inode = parent->d_inode;
1094	struct dentry *old;
1095
1096	/* Don't create child dentry for a dead directory. */
1097	if (unlikely(IS_DEADDIR(inode)))
1098		return ERR_PTR(-ENOENT);
1099
1100	old = inode->i_op->lookup(inode, dentry, nd);
1101	if (unlikely(old)) {
1102		dput(dentry);
1103		dentry = old;
1104	}
1105	return dentry;
1106}
1107
1108/*
1109 *  It's more convoluted than I'd like it to be, but... it's still fairly
1110 *  small and for now I'd prefer to have fast path as straight as possible.
1111 *  It _is_ time-critical.
1112 */
static int do_lookup(struct nameidata *nd, struct qstr *name,
			struct path *path, struct inode **inode)
{
	struct vfsmount *mnt = nd->path.mnt;
	struct dentry *dentry, *parent = nd->path.dentry;
	int need_reval = 1;
	int status = 1;
	int err;

	/*
	 * Rename seqlock is not required here because in the off chance
	 * of a false negative due to a concurrent rename, we're going to
	 * do the non-racy lookup, below.
	 */
	if (nd->flags & LOOKUP_RCU) {
		unsigned seq;
		/* seed *inode so __d_lookup_rcu() can validate against it */
		*inode = nd->inode;
		dentry = __d_lookup_rcu(parent, name, &seq, inode);
		if (!dentry)
			goto unlazy;

		/* Memory barrier in read_seqcount_begin of child is enough */
		if (__read_seqcount_retry(&parent->d_seq, nd->seq))
			return -ECHILD;
		/* advance the walk to the child's sequence number */
		nd->seq = seq;

		if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
			status = d_revalidate(dentry, nd);
			if (unlikely(status <= 0)) {
				/* -ECHILD means "retry (and revalidate
				 * again) in ref-walk mode" */
				if (status != -ECHILD)
					need_reval = 0;
				goto unlazy;
			}
		}
		if (unlikely(d_need_lookup(dentry)))
			goto unlazy;
		path->mnt = mnt;
		path->dentry = dentry;
		if (unlikely(!__follow_mount_rcu(nd, path, inode)))
			goto unlazy;
		if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
			goto unlazy;
		return 0;
unlazy:
		/* leave rcu-walk, grabbing a ref on dentry if non-NULL */
		if (unlazy_walk(nd, dentry))
			return -ECHILD;
	} else {
		dentry = __d_lookup(parent, name);
	}

	/* a cached dentry that still needs ->lookup() is useless here;
	 * drop it and take the locked slow path below */
	if (dentry && unlikely(d_need_lookup(dentry))) {
		dput(dentry);
		dentry = NULL;
	}
retry:
	if (unlikely(!dentry)) {
		struct inode *dir = parent->d_inode;
		BUG_ON(nd->inode != dir);

		/* serialize the real lookup against create/unlink */
		mutex_lock(&dir->i_mutex);
		dentry = d_lookup(parent, name);
		if (likely(!dentry)) {
			dentry = d_alloc_and_lookup(parent, name, nd);
			if (IS_ERR(dentry)) {
				mutex_unlock(&dir->i_mutex);
				return PTR_ERR(dentry);
			}
			/* known good */
			need_reval = 0;
			status = 1;
		} else if (unlikely(d_need_lookup(dentry))) {
			dentry = d_inode_lookup(parent, dentry, nd);
			if (IS_ERR(dentry)) {
				mutex_unlock(&dir->i_mutex);
				return PTR_ERR(dentry);
			}
			/* known good */
			need_reval = 0;
			status = 1;
		}
		mutex_unlock(&dir->i_mutex);
	}
	if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval)
		status = d_revalidate(dentry, nd);
	if (unlikely(status <= 0)) {
		if (status < 0) {
			dput(dentry);
			return status;
		}
		/* try to kick out the stale dentry and look it up again */
		if (!d_invalidate(dentry)) {
			dput(dentry);
			dentry = NULL;
			need_reval = 1;
			goto retry;
		}
	}

	path->mnt = mnt;
	path->dentry = dentry;
	/* cross mountpoints / trigger automounts stacked on the result */
	err = follow_managed(path, nd->flags);
	if (unlikely(err < 0)) {
		path_put_conditional(path, nd);
		return err;
	}
	if (err)
		nd->flags |= LOOKUP_JUMPED;
	*inode = path->dentry->d_inode;
	return 0;
}
1222
1223static inline int may_lookup(struct nameidata *nd)
1224{
1225	if (nd->flags & LOOKUP_RCU) {
1226		int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
1227		if (err != -ECHILD)
1228			return err;
1229		if (unlazy_walk(nd, NULL))
1230			return -ECHILD;
1231	}
1232	return inode_permission(nd->inode, MAY_EXEC);
1233}
1234
1235static inline int handle_dots(struct nameidata *nd, int type)
1236{
1237	if (type == LAST_DOTDOT) {
1238		if (nd->flags & LOOKUP_RCU) {
1239			if (follow_dotdot_rcu(nd))
1240				return -ECHILD;
1241		} else
1242			follow_dotdot(nd);
1243	}
1244	return 0;
1245}
1246
1247static void terminate_walk(struct nameidata *nd)
1248{
1249	if (!(nd->flags & LOOKUP_RCU)) {
1250		path_put(&nd->path);
1251	} else {
1252		nd->flags &= ~LOOKUP_RCU;
1253		if (!(nd->flags & LOOKUP_ROOT))
1254			nd->root.mnt = NULL;
1255		rcu_read_unlock();
1256		br_read_unlock(vfsmount_lock);
1257	}
1258}
1259
1260/*
1261 * Do we need to follow links? We _really_ want to be able
1262 * to do this check without having to look at inode->i_op,
1263 * so we keep a cache of "no, this doesn't need follow_link"
1264 * for the common case.
1265 */
1266static inline int should_follow_link(struct inode *inode, int follow)
1267{
1268	if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) {
1269		if (likely(inode->i_op->follow_link))
1270			return follow;
1271
1272		/* This gets set once for the inode lifetime */
1273		spin_lock(&inode->i_lock);
1274		inode->i_opflags |= IOP_NOFOLLOW;
1275		spin_unlock(&inode->i_lock);
1276	}
1277	return 0;
1278}
1279
static inline int walk_component(struct nameidata *nd, struct path *path,
		struct qstr *name, int type, int follow)
{
	struct inode *inode;
	int err;
	/*
	 * "." and ".." are special - ".." especially so because it has
	 * to be able to know about the current root directory and
	 * parent relationships.
	 */
	if (unlikely(type != LAST_NORM))
		return handle_dots(nd, type);
	err = do_lookup(nd, name, path, &inode);
	if (unlikely(err)) {
		terminate_walk(nd);
		return err;
	}
	if (!inode) {
		/* negative dentry: the name does not exist */
		path_to_nameidata(path, nd);
		terminate_walk(nd);
		return -ENOENT;
	}
	if (should_follow_link(inode, follow)) {
		if (nd->flags & LOOKUP_RCU) {
			/* symlink traversal may block: leave rcu-walk */
			if (unlikely(unlazy_walk(nd, path->dentry))) {
				terminate_walk(nd);
				return -ECHILD;
			}
		}
		BUG_ON(inode != path->dentry->d_inode);
		/* 1 tells the caller to follow the symlink in *path */
		return 1;
	}
	path_to_nameidata(path, nd);
	nd->inode = inode;
	return 0;
}
1316
1317/*
1318 * This limits recursive symlink follows to 8, while
1319 * limiting consecutive symlinks to 40.
1320 *
1321 * Without that kind of total limit, nasty chains of consecutive
1322 * symlinks can cause almost arbitrarily long lookups.
1323 */
static inline int nested_symlink(struct path *path, struct nameidata *nd)
{
	int res;

	/* refuse to nest any deeper; drop all references we hold */
	if (unlikely(current->link_count >= MAX_NESTED_LINKS)) {
		path_put_conditional(path, nd);
		path_put(&nd->path);
		return -ELOOP;
	}
	BUG_ON(nd->depth >= MAX_NESTED_LINKS);

	nd->depth++;
	current->link_count++;

	/* walk_component() returns 1 each time it lands on another
	 * symlink, so keep resolving until non-link or failure */
	do {
		struct path link = *path;
		void *cookie;

		res = follow_link(&link, nd, &cookie);
		if (!res)
			res = walk_component(nd, path, &nd->last,
					     nd->last_type, LOOKUP_FOLLOW);
		put_link(nd, &link, cookie);
	} while (res > 0);

	current->link_count--;
	nd->depth--;
	return res;
}
1353
1354/*
1355 * We really don't want to look at inode->i_op->lookup
1356 * when we don't have to. So we keep a cache bit in
1357 * the inode ->i_opflags field that says "yes, we can
1358 * do lookup on this inode".
1359 */
1360static inline int can_lookup(struct inode *inode)
1361{
1362	if (likely(inode->i_opflags & IOP_LOOKUP))
1363		return 1;
1364	if (likely(!inode->i_op->lookup))
1365		return 0;
1366
1367	/* We do this once for the lifetime of the inode */
1368	spin_lock(&inode->i_lock);
1369	inode->i_opflags |= IOP_LOOKUP;
1370	spin_unlock(&inode->i_lock);
1371	return 1;
1372}
1373
1374/*
1375 * Name resolution.
1376 * This is the basic name resolution function, turning a pathname into
1377 * the final dentry. We expect 'base' to be positive and a directory.
1378 *
1379 * Returns 0 and nd will have valid dentry and mnt on success.
1380 * Returns error and drops reference to input namei data on failure.
1381 */
static int link_path_walk(const char *name, struct nameidata *nd)
{
	struct path next;
	int err;
	
	/* skip leading slashes; an all-slash name resolves to the start */
	while (*name=='/')
		name++;
	if (!*name)
		return 0;

	/* At this point we know we have a real path component. */
	for(;;) {
		unsigned long hash;
		struct qstr this;
		unsigned int c;
		int type;

		/* need exec permission on the current directory */
		err = may_lookup(nd);
 		if (err)
			break;

		this.name = name;
		c = *(const unsigned char *)name;

		/* hash the component while scanning for its end */
		hash = init_name_hash();
		do {
			name++;
			hash = partial_name_hash(c, hash);
			c = *(const unsigned char *)name;
		} while (c && (c != '/'));
		this.len = name - (const char *) this.name;
		this.hash = end_name_hash(hash);

		/* classify "." and ".." without extra string compares */
		type = LAST_NORM;
		if (this.name[0] == '.') switch (this.len) {
			case 2:
				if (this.name[1] == '.') {
					type = LAST_DOTDOT;
					nd->flags |= LOOKUP_JUMPED;
				}
				break;
			case 1:
				type = LAST_DOT;
		}
		if (likely(type == LAST_NORM)) {
			struct dentry *parent = nd->path.dentry;
			nd->flags &= ~LOOKUP_JUMPED;
			/* let the fs rehash (e.g. case-insensitive) names */
			if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
				err = parent->d_op->d_hash(parent, nd->inode,
							   &this);
				if (err < 0)
					break;
			}
		}

		/* remove trailing slashes? */
		if (!c)
			goto last_component;
		while (*++name == '/');
		if (!*name)
			goto last_component;

		err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW);
		if (err < 0)
			return err;

		/* err > 0 means we stepped onto a symlink */
		if (err) {
			err = nested_symlink(&next, nd);
			if (err)
				return err;
		}
		/* everything but the last component must be a directory */
		if (can_lookup(nd->inode))
			continue;
		err = -ENOTDIR; 
		break;
		/* here ends the main loop */

last_component:
		/* leave the final component for the caller to handle */
		nd->last = this;
		nd->last_type = type;
		return 0;
	}
	terminate_walk(nd);
	return err;
}
1467
static int path_init(int dfd, const char *name, unsigned int flags,
		     struct nameidata *nd, struct file **fp)
{
	int retval = 0;
	int fput_needed;
	struct file *file;

	nd->last_type = LAST_ROOT; /* if there are only slashes... */
	nd->flags = flags | LOOKUP_JUMPED;
	nd->depth = 0;
	if (flags & LOOKUP_ROOT) {
		/* walk is rooted at the caller-supplied nd->root */
		struct inode *inode = nd->root.dentry->d_inode;
		if (*name) {
			if (!inode->i_op->lookup)
				return -ENOTDIR;
			retval = inode_permission(inode, MAY_EXEC);
			if (retval)
				return retval;
		}
		nd->path = nd->root;
		nd->inode = inode;
		if (flags & LOOKUP_RCU) {
			br_read_lock(vfsmount_lock);
			rcu_read_lock();
			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
		} else {
			path_get(&nd->path);
		}
		return 0;
	}

	nd->root.mnt = NULL;

	if (*name=='/') {
		/* absolute path: start from the process root */
		if (flags & LOOKUP_RCU) {
			br_read_lock(vfsmount_lock);
			rcu_read_lock();
			set_root_rcu(nd);
		} else {
			set_root(nd);
			path_get(&nd->root);
		}
		nd->path = nd->root;
	} else if (dfd == AT_FDCWD) {
		/* relative path: start from the current working directory */
		if (flags & LOOKUP_RCU) {
			struct fs_struct *fs = current->fs;
			unsigned seq;

			br_read_lock(vfsmount_lock);
			rcu_read_lock();

			/* sample pwd consistently against concurrent chdir */
			do {
				seq = read_seqcount_begin(&fs->seq);
				nd->path = fs->pwd;
				nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
			} while (read_seqcount_retry(&fs->seq, seq));
		} else {
			get_fs_pwd(current->fs, &nd->path);
		}
	} else {
		/* relative path: start from the directory given by dfd */
		struct dentry *dentry;

		file = fget_raw_light(dfd, &fput_needed);
		retval = -EBADF;
		if (!file)
			goto out_fail;

		dentry = file->f_path.dentry;

		if (*name) {
			retval = -ENOTDIR;
			if (!S_ISDIR(dentry->d_inode->i_mode))
				goto fput_fail;

			retval = inode_permission(dentry->d_inode, MAY_EXEC);
			if (retval)
				goto fput_fail;
		}

		nd->path = file->f_path;
		if (flags & LOOKUP_RCU) {
			/* hand our file reference to the caller via *fp,
			 * to be dropped after the walk completes */
			if (fput_needed)
				*fp = file;
			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
			br_read_lock(vfsmount_lock);
			rcu_read_lock();
		} else {
			path_get(&file->f_path);
			fput_light(file, fput_needed);
		}
	}

	nd->inode = nd->path.dentry->d_inode;
	return 0;

fput_fail:
	fput_light(file, fput_needed);
out_fail:
	return retval;
}
1568
1569static inline int lookup_last(struct nameidata *nd, struct path *path)
1570{
1571	if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
1572		nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
1573
1574	nd->flags &= ~LOOKUP_PARENT;
1575	return walk_component(nd, path, &nd->last, nd->last_type,
1576					nd->flags & LOOKUP_FOLLOW);
1577}
1578
/* Returns 0 and nd will be valid on success; returns error otherwise. */
static int path_lookupat(int dfd, const char *name,
				unsigned int flags, struct nameidata *nd)
{
	struct file *base = NULL;
	struct path path;
	int err;

	/*
	 * Path walking is largely split up into 2 different synchronisation
	 * schemes, rcu-walk and ref-walk (explained in
	 * Documentation/filesystems/path-lookup.txt). These share much of the
	 * path walk code, but some things particularly setup, cleanup, and
	 * following mounts are sufficiently divergent that functions are
	 * duplicated. Typically there is a function foo(), and its RCU
	 * analogue, foo_rcu().
	 *
	 * -ECHILD is the error number of choice (just to avoid clashes) that
	 * is returned if some aspect of an rcu-walk fails. Such an error must
	 * be handled by restarting a traditional ref-walk (which will always
	 * be able to complete).
	 */
	err = path_init(dfd, name, flags | LOOKUP_PARENT, nd, &base);

	if (unlikely(err))
		return err;

	current->total_link_count = 0;
	err = link_path_walk(name, nd);

	if (!err && !(flags & LOOKUP_PARENT)) {
		/* resolve the last component, chasing trailing symlinks */
		err = lookup_last(nd, &path);
		while (err > 0) {
			void *cookie;
			struct path link = path;
			nd->flags |= LOOKUP_PARENT;
			err = follow_link(&link, nd, &cookie);
			if (!err)
				err = lookup_last(nd, &path);
			put_link(nd, &link, cookie);
		}
	}

	if (!err)
		err = complete_walk(nd);

	/* LOOKUP_DIRECTORY demands something we could look up in */
	if (!err && nd->flags & LOOKUP_DIRECTORY) {
		if (!nd->inode->i_op->lookup) {
			path_put(&nd->path);
			err = -ENOTDIR;
		}
	}

	/* base is the dfd's file if path_init() deferred its fput */
	if (base)
		fput(base);

	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
		path_put(&nd->root);
		nd->root.mnt = NULL;
	}
	return err;
}
1641
1642static int do_path_lookup(int dfd, const char *name,
1643				unsigned int flags, struct nameidata *nd)
1644{
1645	int retval = path_lookupat(dfd, name, flags | LOOKUP_RCU, nd);
1646	if (unlikely(retval == -ECHILD))
1647		retval = path_lookupat(dfd, name, flags, nd);
1648	if (unlikely(retval == -ESTALE))
1649		retval = path_lookupat(dfd, name, flags | LOOKUP_REVAL, nd);
1650
1651	if (likely(!retval)) {
1652		if (unlikely(!audit_dummy_context())) {
1653			if (nd->path.dentry && nd->inode)
1654				audit_inode(name, nd->path.dentry);
1655		}
1656	}
1657	return retval;
1658}
1659
/*
 * kern_path_parent - resolve all but the last component of a kernel pathname
 * @name: pathname (kernel space)
 * @nd: nameidata to fill; on success nd->last holds the final component
 */
int kern_path_parent(const char *name, struct nameidata *nd)
{
	return do_path_lookup(AT_FDCWD, name, LOOKUP_PARENT, nd);
}
1664
1665int kern_path(const char *name, unsigned int flags, struct path *path)
1666{
1667	struct nameidata nd;
1668	int res = do_path_lookup(AT_FDCWD, name, flags, &nd);
1669	if (!res)
1670		*path = nd.path;
1671	return res;
1672}
1673
1674/**
1675 * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
1676 * @dentry:  pointer to dentry of the base directory
1677 * @mnt: pointer to vfs mount of the base directory
1678 * @name: pointer to file name
1679 * @flags: lookup flags
1680 * @path: pointer to struct path to fill
1681 */
1682int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
1683		    const char *name, unsigned int flags,
1684		    struct path *path)
1685{
1686	struct nameidata nd;
1687	int err;
1688	nd.root.dentry = dentry;
1689	nd.root.mnt = mnt;
1690	BUG_ON(flags & LOOKUP_PARENT);
1691	/* the first argument of do_path_lookup() is ignored with LOOKUP_ROOT */
1692	err = do_path_lookup(AT_FDCWD, name, flags | LOOKUP_ROOT, &nd);
1693	if (!err)
1694		*path = nd.path;
1695	return err;
1696}
1697
static struct dentry *__lookup_hash(struct qstr *name,
		struct dentry *base, struct nameidata *nd)
{
	struct inode *inode = base->d_inode;
	struct dentry *dentry;
	int err;

	/* caller needs exec permission on the parent directory */
	err = inode_permission(inode, MAY_EXEC);
	if (err)
		return ERR_PTR(err);

	/*
	 * Don't bother with __d_lookup: callers are for creat as
	 * well as unlink, so a lot of the time it would cost
	 * a double lookup.
	 */
	dentry = d_lookup(base, name);

	if (dentry && d_need_lookup(dentry)) {
		/*
		 * __lookup_hash is called with the parent dir's i_mutex already
		 * held, so we are good to go here.
		 */
		dentry = d_inode_lookup(base, dentry, nd);
		if (IS_ERR(dentry))
			return dentry;
	}

	if (dentry && (dentry->d_flags & DCACHE_OP_REVALIDATE)) {
		int status = d_revalidate(dentry, nd);
		if (unlikely(status <= 0)) {
			/*
			 * The dentry failed validation.
			 * If d_revalidate returned 0 attempt to invalidate
			 * the dentry otherwise d_revalidate is asking us
			 * to return a fail status.
			 */
			if (status < 0) {
				dput(dentry);
				return ERR_PTR(status);
			} else if (!d_invalidate(dentry)) {
				dput(dentry);
				dentry = NULL;
			}
		}
	}

	/* nothing usable in the dcache: do the real lookup */
	if (!dentry)
		dentry = d_alloc_and_lookup(base, name, nd);

	return dentry;
}
1750
1751/*
1752 * Restricted form of lookup. Doesn't follow links, single-component only,
1753 * needs parent already locked. Doesn't follow mounts.
1754 * SMP-safe.
1755 */
1756static struct dentry *lookup_hash(struct nameidata *nd)
1757{
1758	return __lookup_hash(&nd->last, nd->path.dentry, nd);
1759}
1760
1761/**
1762 * lookup_one_len - filesystem helper to lookup single pathname component
1763 * @name:	pathname component to lookup
1764 * @base:	base directory to lookup from
1765 * @len:	maximum length @len should be interpreted to
1766 *
1767 * Note that this routine is purely a helper for filesystem usage and should
1768 * not be called by generic code.  Also note that by using this function the
1769 * nameidata argument is passed to the filesystem methods and a filesystem
1770 * using this helper needs to be prepared for that.
1771 */
1772struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
1773{
1774	struct qstr this;
1775	unsigned long hash;
1776	unsigned int c;
1777
1778	WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
1779
1780	this.name = name;
1781	this.len = len;
1782	if (!len)
1783		return ERR_PTR(-EACCES);
1784
1785	hash = init_name_hash();
1786	while (len--) {
1787		c = *(const unsigned char *)name++;
1788		if (c == '/' || c == '\0')
1789			return ERR_PTR(-EACCES);
1790		hash = partial_name_hash(c, hash);
1791	}
1792	this.hash = end_name_hash(hash);
1793	/*
1794	 * See if the low-level filesystem might want
1795	 * to use its own hash..
1796	 */
1797	if (base->d_flags & DCACHE_OP_HASH) {
1798		int err = base->d_op->d_hash(base, base->d_inode, &this);
1799		if (err < 0)
1800			return ERR_PTR(err);
1801	}
1802
1803	return __lookup_hash(&this, base, NULL);
1804}
1805
1806int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
1807		 struct path *path, int *empty)
1808{
1809	struct nameidata nd;
1810	char *tmp = getname_flags(name, flags, empty);
1811	int err = PTR_ERR(tmp);
1812	if (!IS_ERR(tmp)) {
1813
1814		BUG_ON(flags & LOOKUP_PARENT);
1815
1816		err = do_path_lookup(dfd, tmp, flags, &nd);
1817		putname(tmp);
1818		if (!err)
1819			*path = nd.path;
1820	}
1821	return err;
1822}
1823
1824int user_path_at(int dfd, const char __user *name, unsigned flags,
1825		 struct path *path)
1826{
1827	return user_path_at_empty(dfd, name, flags, path, 0);
1828}
1829
1830static int user_path_parent(int dfd, const char __user *path,
1831			struct nameidata *nd, char **name)
1832{
1833	char *s = getname(path);
1834	int error;
1835
1836	if (IS_ERR(s))
1837		return PTR_ERR(s);
1838
1839	error = do_path_lookup(dfd, s, LOOKUP_PARENT, nd);
1840	if (error)
1841		putname(s);
1842	else
1843		*name = s;
1844
1845	return error;
1846}
1847
1848/*
1849 * It's inline, so penalty for filesystems that don't use sticky bit is
1850 * minimal.
1851 */
1852static inline int check_sticky(struct inode *dir, struct inode *inode)
1853{
1854	uid_t fsuid = current_fsuid();
1855
1856	if (!(dir->i_mode & S_ISVTX))
1857		return 0;
1858	if (current_user_ns() != inode_userns(inode))
1859		goto other_userns;
1860	if (inode->i_uid == fsuid)
1861		return 0;
1862	if (dir->i_uid == fsuid)
1863		return 0;
1864
1865other_userns:
1866	return !ns_capable(inode_userns(inode), CAP_FOWNER);
1867}
1868
1869/*
1870 *	Check whether we can remove a link victim from directory dir, check
1871 *  whether the type of victim is right.
1872 *  1. We can't do it if dir is read-only (done in permission())
1873 *  2. We should have write and exec permissions on dir
1874 *  3. We can't remove anything from append-only dir
1875 *  4. We can't do anything with immutable dir (done in permission())
1876 *  5. If the sticky bit on dir is set we should either
1877 *	a. be owner of dir, or
1878 *	b. be owner of victim, or
1879 *	c. have CAP_FOWNER capability
 *  6. If the victim is append-only or immutable we can't do anything with
1881 *     links pointing to it.
1882 *  7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
1883 *  8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
1884 *  9. We can't remove a root or mountpoint.
1885 * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
1886 *     nfs_async_unlink().
1887 */
static int may_delete(struct inode *dir,struct dentry *victim,int isdir)
{
	int error;

	if (!victim->d_inode)
		return -ENOENT;

	BUG_ON(victim->d_parent->d_inode != dir);
	audit_inode_child(victim, dir);

	/* need write+exec on dir (also rejects read-only/immutable) */
	error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
	if (error)
		return error;
	if (IS_APPEND(dir))
		return -EPERM;
	/* sticky-bit rules plus per-victim protections */
	if (check_sticky(dir, victim->d_inode)||IS_APPEND(victim->d_inode)||
	    IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode))
		return -EPERM;
	if (isdir) {
		if (!S_ISDIR(victim->d_inode->i_mode))
			return -ENOTDIR;
		if (IS_ROOT(victim))
			return -EBUSY;
	} else if (S_ISDIR(victim->d_inode->i_mode))
		return -EISDIR;
	if (IS_DEADDIR(dir))
		return -ENOENT;
	/* NFS sillyrenamed files are removed by nfs_async_unlink() */
	if (victim->d_flags & DCACHE_NFSFS_RENAMED)
		return -EBUSY;
	return 0;
}
1919
1920/*	Check whether we can create an object with dentry child in directory
1921 *  dir.
1922 *  1. We can't do it if child already exists (open has special treatment for
1923 *     this case, but since we are inlined it's OK)
1924 *  2. We can't do it if dir is read-only (done in permission())
1925 *  3. We should have write and exec permissions on dir
1926 *  4. We can't do it if dir is immutable (done in permission())
1927 */
static inline int may_create(struct inode *dir, struct dentry *child)
{
	/* the name must not already exist... */
	if (child->d_inode)
		return -EEXIST;
	/* ...and the directory must not be in the middle of deletion */
	if (IS_DEADDIR(dir))
		return -ENOENT;
	/* finally we need write+exec permission on the directory */
	return inode_permission(dir, MAY_WRITE | MAY_EXEC);
}
1936
1937/*
1938 * p1 and p2 should be directories on the same fs.
1939 */
1940struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
1941{
1942	struct dentry *p;
1943
1944	if (p1 == p2) {
1945		mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
1946		return NULL;
1947	}
1948
1949	mutex_lock(&p1->d_inode->i_sb->s_vfs_rena

Large files files are truncated, but you can click here to view the full file