PageRenderTime 12ms CodeModel.GetById 8ms app.highlight 98ms RepoModel.GetById 1ms app.codeStats 0ms

/fs/namei.c

https://bitbucket.org/evzijst/gittest
C | 2454 lines | 1730 code | 261 blank | 463 comment | 448 complexity | c119db45b6d614e4b8154620dc28cf65 MD5 | raw file

Large files are truncated, but you can click here to view the full file

   1/*
   2 *  linux/fs/namei.c
   3 *
   4 *  Copyright (C) 1991, 1992  Linus Torvalds
   5 */
   6
   7/*
   8 * Some corrections by tytso.
   9 */
  10
  11/* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
  12 * lookup logic.
  13 */
  14/* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
  15 */
  16
  17#include <linux/init.h>
  18#include <linux/module.h>
  19#include <linux/slab.h>
  20#include <linux/fs.h>
  21#include <linux/namei.h>
  22#include <linux/quotaops.h>
  23#include <linux/pagemap.h>
  24#include <linux/dnotify.h>
  25#include <linux/smp_lock.h>
  26#include <linux/personality.h>
  27#include <linux/security.h>
  28#include <linux/syscalls.h>
  29#include <linux/mount.h>
  30#include <linux/audit.h>
  31#include <asm/namei.h>
  32#include <asm/uaccess.h>
  33
  34#define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE])
  35
  36/* [Feb-1997 T. Schoebel-Theuer]
  37 * Fundamental changes in the pathname lookup mechanisms (namei)
  38 * were necessary because of omirr.  The reason is that omirr needs
  39 * to know the _real_ pathname, not the user-supplied one, in case
  40 * of symlinks (and also when transname replacements occur).
  41 *
  42 * The new code replaces the old recursive symlink resolution with
  43 * an iterative one (in case of non-nested symlink chains).  It does
  44 * this with calls to <fs>_follow_link().
  45 * As a side effect, dir_namei(), _namei() and follow_link() are now 
  46 * replaced with a single function lookup_dentry() that can handle all 
  47 * the special cases of the former code.
  48 *
  49 * With the new dcache, the pathname is stored at each inode, at least as
  50 * long as the refcount of the inode is positive.  As a side effect, the
  51 * size of the dcache depends on the inode cache and thus is dynamic.
  52 *
  53 * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
  54 * resolution to correspond with current state of the code.
  55 *
  56 * Note that the symlink resolution is not *completely* iterative.
  57 * There is still a significant amount of tail- and mid- recursion in
  58 * the algorithm.  Also, note that <fs>_readlink() is not used in
  59 * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
  60 * may return different results than <fs>_follow_link().  Many virtual
  61 * filesystems (including /proc) exhibit this behavior.
  62 */
  63
  64/* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
  65 * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
  66 * and the name already exists in form of a symlink, try to create the new
  67 * name indicated by the symlink. The old code always complained that the
  68 * name already exists, due to not following the symlink even if its target
  69 * is nonexistent.  The new semantics affects also mknod() and link() when
   70 * the name is a symlink pointing to a non-existent name.
  71 *
  72 * I don't know which semantics is the right one, since I have no access
  73 * to standards. But I found by trial that HP-UX 9.0 has the full "new"
  74 * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
  75 * "old" one. Personally, I think the new semantics is much more logical.
  76 * Note that "ln old new" where "new" is a symlink pointing to a non-existing
  77 * file does succeed in both HP-UX and SunOs, but not in Solaris
  78 * and in the old Linux semantics.
  79 */
  80
  81/* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
  82 * semantics.  See the comments in "open_namei" and "do_link" below.
  83 *
  84 * [10-Sep-98 Alan Modra] Another symlink change.
  85 */
  86
  87/* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
  88 *	inside the path - always follow.
  89 *	in the last component in creation/removal/renaming - never follow.
  90 *	if LOOKUP_FOLLOW passed - follow.
  91 *	if the pathname has trailing slashes - follow.
  92 *	otherwise - don't follow.
  93 * (applied in that order).
  94 *
  95 * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
  96 * restored for 2.4. This is the last surviving part of old 4.2BSD bug.
  97 * During the 2.4 we need to fix the userland stuff depending on it -
  98 * hopefully we will be able to get rid of that wart in 2.5. So far only
  99 * XEmacs seems to be relying on it...
 100 */
 101/*
 102 * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
 103 * implemented.  Let's see if raised priority of ->s_vfs_rename_sem gives
 104 * any extra contention...
 105 */
 106
 107/* In order to reduce some races, while at the same time doing additional
 108 * checking and hopefully speeding things up, we copy filenames to the
 109 * kernel data space before using them..
 110 *
 111 * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
 112 * PATH_MAX includes the nul terminator --RR.
 113 */
 114static inline int do_getname(const char __user *filename, char *page)
 115{
 116	int retval;
 117	unsigned long len = PATH_MAX;
 118
 119	if (!segment_eq(get_fs(), KERNEL_DS)) {
 120		if ((unsigned long) filename >= TASK_SIZE)
 121			return -EFAULT;
 122		if (TASK_SIZE - (unsigned long) filename < PATH_MAX)
 123			len = TASK_SIZE - (unsigned long) filename;
 124	}
 125
 126	retval = strncpy_from_user(page, filename, len);
 127	if (retval > 0) {
 128		if (retval < len)
 129			return 0;
 130		return -ENAMETOOLONG;
 131	} else if (!retval)
 132		retval = -ENOENT;
 133	return retval;
 134}
 135
 136char * getname(const char __user * filename)
 137{
 138	char *tmp, *result;
 139
 140	result = ERR_PTR(-ENOMEM);
 141	tmp = __getname();
 142	if (tmp)  {
 143		int retval = do_getname(filename, tmp);
 144
 145		result = tmp;
 146		if (retval < 0) {
 147			__putname(tmp);
 148			result = ERR_PTR(retval);
 149		}
 150	}
 151	audit_getname(result);
 152	return result;
 153}
 154
 155#ifdef CONFIG_AUDITSYSCALL
 156void putname(const char *name)
 157{
 158	if (unlikely(current->audit_context))
 159		audit_putname(name);
 160	else
 161		__putname(name);
 162}
 163EXPORT_SYMBOL(putname);
 164#endif
 165
 166
 167/**
 168 * generic_permission  -  check for access rights on a Posix-like filesystem
 169 * @inode:	inode to check access rights for
 170 * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 171 * @check_acl:	optional callback to check for Posix ACLs
 172 *
 173 * Used to check for read/write/execute permissions on a file.
 174 * We use "fsuid" for this, letting us set arbitrary permissions
 175 * for filesystem access without changing the "normal" uids which
 176 * are used for other things..
 177 */
 178int generic_permission(struct inode *inode, int mask,
 179		int (*check_acl)(struct inode *inode, int mask))
 180{
 181	umode_t			mode = inode->i_mode;
 182
 183	if (current->fsuid == inode->i_uid)
 184		mode >>= 6;
 185	else {
 186		if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) {
 187			int error = check_acl(inode, mask);
 188			if (error == -EACCES)
 189				goto check_capabilities;
 190			else if (error != -EAGAIN)
 191				return error;
 192		}
 193
 194		if (in_group_p(inode->i_gid))
 195			mode >>= 3;
 196	}
 197
 198	/*
 199	 * If the DACs are ok we don't need any capability check.
 200	 */
 201	if (((mode & mask & (MAY_READ|MAY_WRITE|MAY_EXEC)) == mask))
 202		return 0;
 203
 204 check_capabilities:
 205	/*
 206	 * Read/write DACs are always overridable.
 207	 * Executable DACs are overridable if at least one exec bit is set.
 208	 */
 209	if (!(mask & MAY_EXEC) ||
 210	    (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
 211		if (capable(CAP_DAC_OVERRIDE))
 212			return 0;
 213
 214	/*
 215	 * Searching includes executable on directories, else just read.
 216	 */
 217	if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE)))
 218		if (capable(CAP_DAC_READ_SEARCH))
 219			return 0;
 220
 221	return -EACCES;
 222}
 223
 224int permission(struct inode *inode, int mask, struct nameidata *nd)
 225{
 226	int retval, submask;
 227
 228	if (mask & MAY_WRITE) {
 229		umode_t mode = inode->i_mode;
 230
 231		/*
 232		 * Nobody gets write access to a read-only fs.
 233		 */
 234		if (IS_RDONLY(inode) &&
 235		    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
 236			return -EROFS;
 237
 238		/*
 239		 * Nobody gets write access to an immutable file.
 240		 */
 241		if (IS_IMMUTABLE(inode))
 242			return -EACCES;
 243	}
 244
 245
 246	/* Ordinary permission routines do not understand MAY_APPEND. */
 247	submask = mask & ~MAY_APPEND;
 248	if (inode->i_op && inode->i_op->permission)
 249		retval = inode->i_op->permission(inode, submask, nd);
 250	else
 251		retval = generic_permission(inode, submask, NULL);
 252	if (retval)
 253		return retval;
 254
 255	return security_inode_permission(inode, mask, nd);
 256}
 257
 258/*
 259 * get_write_access() gets write permission for a file.
 260 * put_write_access() releases this write permission.
 261 * This is used for regular files.
 262 * We cannot support write (and maybe mmap read-write shared) accesses and
 263 * MAP_DENYWRITE mmappings simultaneously. The i_writecount field of an inode
 264 * can have the following values:
 265 * 0: no writers, no VM_DENYWRITE mappings
 266 * < 0: (-i_writecount) vm_area_structs with VM_DENYWRITE set exist
 267 * > 0: (i_writecount) users are writing to the file.
 268 *
 269 * Normally we operate on that counter with atomic_{inc,dec} and it's safe
 270 * except for the cases where we don't hold i_writecount yet. Then we need to
 271 * use {get,deny}_write_access() - these functions check the sign and refuse
 272 * to do the change if sign is wrong. Exclusion between them is provided by
 273 * the inode->i_lock spinlock.
 274 */
 275
 276int get_write_access(struct inode * inode)
 277{
 278	spin_lock(&inode->i_lock);
 279	if (atomic_read(&inode->i_writecount) < 0) {
 280		spin_unlock(&inode->i_lock);
 281		return -ETXTBSY;
 282	}
 283	atomic_inc(&inode->i_writecount);
 284	spin_unlock(&inode->i_lock);
 285
 286	return 0;
 287}
 288
 289int deny_write_access(struct file * file)
 290{
 291	struct inode *inode = file->f_dentry->d_inode;
 292
 293	spin_lock(&inode->i_lock);
 294	if (atomic_read(&inode->i_writecount) > 0) {
 295		spin_unlock(&inode->i_lock);
 296		return -ETXTBSY;
 297	}
 298	atomic_dec(&inode->i_writecount);
 299	spin_unlock(&inode->i_lock);
 300
 301	return 0;
 302}
 303
 304void path_release(struct nameidata *nd)
 305{
 306	dput(nd->dentry);
 307	mntput(nd->mnt);
 308}
 309
 310/*
 311 * umount() mustn't call path_release()/mntput() as that would clear
 312 * mnt_expiry_mark
 313 */
 314void path_release_on_umount(struct nameidata *nd)
 315{
 316	dput(nd->dentry);
 317	_mntput(nd->mnt);
 318}
 319
 320/*
 321 * Internal lookup() using the new generic dcache.
 322 * SMP-safe
 323 */
 324static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, struct nameidata *nd)
 325{
 326	struct dentry * dentry = __d_lookup(parent, name);
 327
 328	/* lockess __d_lookup may fail due to concurrent d_move() 
 329	 * in some unrelated directory, so try with d_lookup
 330	 */
 331	if (!dentry)
 332		dentry = d_lookup(parent, name);
 333
 334	if (dentry && dentry->d_op && dentry->d_op->d_revalidate) {
 335		if (!dentry->d_op->d_revalidate(dentry, nd) && !d_invalidate(dentry)) {
 336			dput(dentry);
 337			dentry = NULL;
 338		}
 339	}
 340	return dentry;
 341}
 342
 343/*
 344 * Short-cut version of permission(), for calling by
 345 * path_walk(), when dcache lock is held.  Combines parts
 346 * of permission() and generic_permission(), and tests ONLY for
 347 * MAY_EXEC permission.
 348 *
 349 * If appropriate, check DAC only.  If not appropriate, or
 350 * short-cut DAC fails, then call permission() to do more
 351 * complete permission check.
 352 */
 353static inline int exec_permission_lite(struct inode *inode,
 354				       struct nameidata *nd)
 355{
 356	umode_t	mode = inode->i_mode;
 357
 358	if (inode->i_op && inode->i_op->permission)
 359		return -EAGAIN;
 360
 361	if (current->fsuid == inode->i_uid)
 362		mode >>= 6;
 363	else if (in_group_p(inode->i_gid))
 364		mode >>= 3;
 365
 366	if (mode & MAY_EXEC)
 367		goto ok;
 368
 369	if ((inode->i_mode & S_IXUGO) && capable(CAP_DAC_OVERRIDE))
 370		goto ok;
 371
 372	if (S_ISDIR(inode->i_mode) && capable(CAP_DAC_OVERRIDE))
 373		goto ok;
 374
 375	if (S_ISDIR(inode->i_mode) && capable(CAP_DAC_READ_SEARCH))
 376		goto ok;
 377
 378	return -EACCES;
 379ok:
 380	return security_inode_permission(inode, MAY_EXEC, nd);
 381}
 382
 383/*
 384 * This is called when everything else fails, and we actually have
 385 * to go to the low-level filesystem to find out what we should do..
 386 *
 387 * We get the directory semaphore, and after getting that we also
 388 * make sure that nobody added the entry to the dcache in the meantime..
 389 * SMP-safe
 390 */
 391static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, struct nameidata *nd)
 392{
 393	struct dentry * result;
 394	struct inode *dir = parent->d_inode;
 395
 396	down(&dir->i_sem);
 397	/*
 398	 * First re-do the cached lookup just in case it was created
 399	 * while we waited for the directory semaphore..
 400	 *
 401	 * FIXME! This could use version numbering or similar to
 402	 * avoid unnecessary cache lookups.
 403	 *
 404	 * The "dcache_lock" is purely to protect the RCU list walker
 405	 * from concurrent renames at this point (we mustn't get false
 406	 * negatives from the RCU list walk here, unlike the optimistic
 407	 * fast walk).
 408	 *
 409	 * so doing d_lookup() (with seqlock), instead of lockfree __d_lookup
 410	 */
 411	result = d_lookup(parent, name);
 412	if (!result) {
 413		struct dentry * dentry = d_alloc(parent, name);
 414		result = ERR_PTR(-ENOMEM);
 415		if (dentry) {
 416			result = dir->i_op->lookup(dir, dentry, nd);
 417			if (result)
 418				dput(dentry);
 419			else
 420				result = dentry;
 421		}
 422		up(&dir->i_sem);
 423		return result;
 424	}
 425
 426	/*
 427	 * Uhhuh! Nasty case: the cache was re-populated while
 428	 * we waited on the semaphore. Need to revalidate.
 429	 */
 430	up(&dir->i_sem);
 431	if (result->d_op && result->d_op->d_revalidate) {
 432		if (!result->d_op->d_revalidate(result, nd) && !d_invalidate(result)) {
 433			dput(result);
 434			result = ERR_PTR(-ENOENT);
 435		}
 436	}
 437	return result;
 438}
 439
 440static int __emul_lookup_dentry(const char *, struct nameidata *);
 441
 442/* SMP-safe */
 443static inline int
 444walk_init_root(const char *name, struct nameidata *nd)
 445{
 446	read_lock(&current->fs->lock);
 447	if (current->fs->altroot && !(nd->flags & LOOKUP_NOALT)) {
 448		nd->mnt = mntget(current->fs->altrootmnt);
 449		nd->dentry = dget(current->fs->altroot);
 450		read_unlock(&current->fs->lock);
 451		if (__emul_lookup_dentry(name,nd))
 452			return 0;
 453		read_lock(&current->fs->lock);
 454	}
 455	nd->mnt = mntget(current->fs->rootmnt);
 456	nd->dentry = dget(current->fs->root);
 457	read_unlock(&current->fs->lock);
 458	return 1;
 459}
 460
 461static inline int __vfs_follow_link(struct nameidata *nd, const char *link)
 462{
 463	int res = 0;
 464	char *name;
 465	if (IS_ERR(link))
 466		goto fail;
 467
 468	if (*link == '/') {
 469		path_release(nd);
 470		if (!walk_init_root(link, nd))
 471			/* weird __emul_prefix() stuff did it */
 472			goto out;
 473	}
 474	res = link_path_walk(link, nd);
 475out:
 476	if (nd->depth || res || nd->last_type!=LAST_NORM)
 477		return res;
 478	/*
 479	 * If it is an iterative symlinks resolution in open_namei() we
 480	 * have to copy the last component. And all that crap because of
 481	 * bloody create() on broken symlinks. Furrfu...
 482	 */
 483	name = __getname();
 484	if (unlikely(!name)) {
 485		path_release(nd);
 486		return -ENOMEM;
 487	}
 488	strcpy(name, nd->last.name);
 489	nd->last.name = name;
 490	return 0;
 491fail:
 492	path_release(nd);
 493	return PTR_ERR(link);
 494}
 495
 496static inline int __do_follow_link(struct dentry *dentry, struct nameidata *nd)
 497{
 498	int error;
 499
 500	touch_atime(nd->mnt, dentry);
 501	nd_set_link(nd, NULL);
 502	error = dentry->d_inode->i_op->follow_link(dentry, nd);
 503	if (!error) {
 504		char *s = nd_get_link(nd);
 505		if (s)
 506			error = __vfs_follow_link(nd, s);
 507		if (dentry->d_inode->i_op->put_link)
 508			dentry->d_inode->i_op->put_link(dentry, nd);
 509	}
 510
 511	return error;
 512}
 513
 514/*
 515 * This limits recursive symlink follows to 8, while
 516 * limiting consecutive symlinks to 40.
 517 *
 518 * Without that kind of total limit, nasty chains of consecutive
 519 * symlinks can cause almost arbitrarily long lookups. 
 520 */
 521static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd)
 522{
 523	int err = -ELOOP;
 524	if (current->link_count >= MAX_NESTED_LINKS)
 525		goto loop;
 526	if (current->total_link_count >= 40)
 527		goto loop;
 528	BUG_ON(nd->depth >= MAX_NESTED_LINKS);
 529	cond_resched();
 530	err = security_inode_follow_link(dentry, nd);
 531	if (err)
 532		goto loop;
 533	current->link_count++;
 534	current->total_link_count++;
 535	nd->depth++;
 536	err = __do_follow_link(dentry, nd);
 537	current->link_count--;
 538	nd->depth--;
 539	return err;
 540loop:
 541	path_release(nd);
 542	return err;
 543}
 544
 545int follow_up(struct vfsmount **mnt, struct dentry **dentry)
 546{
 547	struct vfsmount *parent;
 548	struct dentry *mountpoint;
 549	spin_lock(&vfsmount_lock);
 550	parent=(*mnt)->mnt_parent;
 551	if (parent == *mnt) {
 552		spin_unlock(&vfsmount_lock);
 553		return 0;
 554	}
 555	mntget(parent);
 556	mountpoint=dget((*mnt)->mnt_mountpoint);
 557	spin_unlock(&vfsmount_lock);
 558	dput(*dentry);
 559	*dentry = mountpoint;
 560	mntput(*mnt);
 561	*mnt = parent;
 562	return 1;
 563}
 564
 565/* no need for dcache_lock, as serialization is taken care in
 566 * namespace.c
 567 */
 568static int follow_mount(struct vfsmount **mnt, struct dentry **dentry)
 569{
 570	int res = 0;
 571	while (d_mountpoint(*dentry)) {
 572		struct vfsmount *mounted = lookup_mnt(*mnt, *dentry);
 573		if (!mounted)
 574			break;
 575		mntput(*mnt);
 576		*mnt = mounted;
 577		dput(*dentry);
 578		*dentry = dget(mounted->mnt_root);
 579		res = 1;
 580	}
 581	return res;
 582}
 583
 584/* no need for dcache_lock, as serialization is taken care in
 585 * namespace.c
 586 */
 587static inline int __follow_down(struct vfsmount **mnt, struct dentry **dentry)
 588{
 589	struct vfsmount *mounted;
 590
 591	mounted = lookup_mnt(*mnt, *dentry);
 592	if (mounted) {
 593		mntput(*mnt);
 594		*mnt = mounted;
 595		dput(*dentry);
 596		*dentry = dget(mounted->mnt_root);
 597		return 1;
 598	}
 599	return 0;
 600}
 601
 602int follow_down(struct vfsmount **mnt, struct dentry **dentry)
 603{
 604	return __follow_down(mnt,dentry);
 605}
 606 
 607static inline void follow_dotdot(struct vfsmount **mnt, struct dentry **dentry)
 608{
 609	while(1) {
 610		struct vfsmount *parent;
 611		struct dentry *old = *dentry;
 612
 613                read_lock(&current->fs->lock);
 614		if (*dentry == current->fs->root &&
 615		    *mnt == current->fs->rootmnt) {
 616                        read_unlock(&current->fs->lock);
 617			break;
 618		}
 619                read_unlock(&current->fs->lock);
 620		spin_lock(&dcache_lock);
 621		if (*dentry != (*mnt)->mnt_root) {
 622			*dentry = dget((*dentry)->d_parent);
 623			spin_unlock(&dcache_lock);
 624			dput(old);
 625			break;
 626		}
 627		spin_unlock(&dcache_lock);
 628		spin_lock(&vfsmount_lock);
 629		parent = (*mnt)->mnt_parent;
 630		if (parent == *mnt) {
 631			spin_unlock(&vfsmount_lock);
 632			break;
 633		}
 634		mntget(parent);
 635		*dentry = dget((*mnt)->mnt_mountpoint);
 636		spin_unlock(&vfsmount_lock);
 637		dput(old);
 638		mntput(*mnt);
 639		*mnt = parent;
 640	}
 641	follow_mount(mnt, dentry);
 642}
 643
  644struct path {	/* (mount, dentry) pair; filled in by do_lookup() */
  645	struct vfsmount *mnt;	/* mount the lookup was performed on */
  646	struct dentry *dentry;	/* dentry found for the component */
  647};
 648
 649/*
 650 *  It's more convoluted than I'd like it to be, but... it's still fairly
 651 *  small and for now I'd prefer to have fast path as straight as possible.
 652 *  It _is_ time-critical.
 653 */
 654static int do_lookup(struct nameidata *nd, struct qstr *name,
 655		     struct path *path)
 656{
 657	struct vfsmount *mnt = nd->mnt;
 658	struct dentry *dentry = __d_lookup(nd->dentry, name);
 659
 660	if (!dentry)
 661		goto need_lookup;
 662	if (dentry->d_op && dentry->d_op->d_revalidate)
 663		goto need_revalidate;
 664done:
 665	path->mnt = mnt;
 666	path->dentry = dentry;
 667	return 0;
 668
 669need_lookup:
 670	dentry = real_lookup(nd->dentry, name, nd);
 671	if (IS_ERR(dentry))
 672		goto fail;
 673	goto done;
 674
 675need_revalidate:
 676	if (dentry->d_op->d_revalidate(dentry, nd))
 677		goto done;
 678	if (d_invalidate(dentry))
 679		goto done;
 680	dput(dentry);
 681	goto need_lookup;
 682
 683fail:
 684	return PTR_ERR(dentry);
 685}
 686
  687/*
  688 * Name resolution.
  689 *
  690 * This is the basic name resolution function, turning a pathname
  691 * into the final dentry.
  692 *
  693 * We expect 'base' to be positive and a directory.
 *
 * @name: pathname to resolve, relative to the position already stored
 *        in nd->dentry / nd->mnt.
 * @nd:   lookup state; on success it holds the final position (or, with
 *        LOOKUP_PARENT, the parent plus the last component in nd->last).
 *
 * Returns 0 on success or a negative errno.  Error paths that `break`
 * out of the loop drop @nd with path_release(); a failing
 * do_follow_link() returns through return_err without a further
 * path_release() - presumably because the link-following code has
 * already released @nd (TODO confirm for every ->follow_link).
  694 */
  695static fastcall int __link_path_walk(const char * name, struct nameidata *nd)
  696{
  697	struct path next;
  698	struct inode *inode;
  699	int err;
  700	unsigned int lookup_flags = nd->flags;
  701	
  702	while (*name=='/')	/* skip leading slashes */
  703		name++;
  704	if (!*name)		/* nothing left: stay on the starting dentry */
  705		goto return_reval;
  706
  707	inode = nd->dentry->d_inode;
  708	if (nd->depth)		/* nonzero depth: flags become LOOKUP_FOLLOW only */
  709		lookup_flags = LOOKUP_FOLLOW;
  710
  711	/* At this point we know we have a real path component. */
  712	for(;;) {
  713		unsigned long hash;
  714		struct qstr this;
  715		unsigned int c;
  716
  717		err = exec_permission_lite(inode, nd);	/* MAY_EXEC on the current inode */
  718		if (err == -EAGAIN) { 	/* short-cut declined: do the full check */
  719			err = permission(inode, MAY_EXEC, nd);
  720		}
  721 		if (err)
  722			break;
  723
  724		this.name = name;
  725		c = *(const unsigned char *)name;
  726
  727		hash = init_name_hash();
  728		do {	/* hash the component up to the next '/' or NUL */
  729			name++;
  730			hash = partial_name_hash(c, hash);
  731			c = *(const unsigned char *)name;
  732		} while (c && (c != '/'));
  733		this.len = name - (const char *) this.name;
  734		this.hash = end_name_hash(hash);
  735
  736		/* remove trailing slashes? */
  737		if (!c)		/* component ended at NUL: it is the last one */
  738			goto last_component;
  739		while (*++name == '/');	/* skip the run of slashes */
  740		if (!*name)	/* only slashes followed: last component, with slashes */
  741			goto last_with_slashes;
  742
  743		/*
  744		 * "." and ".." are special - ".." especially so because it has
  745		 * to be able to know about the current root directory and
  746		 * parent relationships.
  747		 */
  748		if (this.name[0] == '.') switch (this.len) {
  749			default:
  750				break;
  751			case 2:	
  752				if (this.name[1] != '.')
  753					break;
  754				follow_dotdot(&nd->mnt, &nd->dentry);
  755				inode = nd->dentry->d_inode;
  756				/* fallthrough */
  757			case 1:
  758				continue;	/* "." (or ".." handled above): next component */
  759		}
  760		/*
  761		 * See if the low-level filesystem might want
  762		 * to use its own hash..
  763		 */
  764		if (nd->dentry->d_op && nd->dentry->d_op->d_hash) {
  765			err = nd->dentry->d_op->d_hash(nd->dentry, &this);
  766			if (err < 0)
  767				break;
  768		}
  769		nd->flags |= LOOKUP_CONTINUE;	/* cleared again at last_component */
  770		/* This does the actual lookups.. */
  771		err = do_lookup(nd, &this, &next);
  772		if (err)
  773			break;
  774		/* Check mountpoints.. */
  775		follow_mount(&next.mnt, &next.dentry);
  776
  777		err = -ENOENT;
  778		inode = next.dentry->d_inode;
  779		if (!inode)	/* no inode behind an intermediate component */
  780			goto out_dput;
  781		err = -ENOTDIR; 
  782		if (!inode->i_op)
  783			goto out_dput;
  784
  785		if (inode->i_op->follow_link) {
  786			mntget(next.mnt);	/* extra mount ref, dropped by mntput() below */
  787			err = do_follow_link(next.dentry, nd);
  788			dput(next.dentry);
  789			mntput(next.mnt);
  790			if (err)
  791				goto return_err;	/* no path_release() on this path */
  792			err = -ENOENT;
  793			inode = nd->dentry->d_inode;	/* continue from where the link led */
  794			if (!inode)
  795				break;
  796			err = -ENOTDIR; 
  797			if (!inode->i_op)
  798				break;
  799		} else {
  800			dput(nd->dentry);	/* step into the component just found */
  801			nd->mnt = next.mnt;
  802			nd->dentry = next.dentry;
  803		}
  804		err = -ENOTDIR; 
  805		if (!inode->i_op->lookup)	/* cannot look anything up inside it */
  806			break;
  807		continue;
  808		/* here ends the main loop */
  809
  810last_with_slashes:
  811		lookup_flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
  812last_component:
  813		nd->flags &= ~LOOKUP_CONTINUE;
  814		if (lookup_flags & LOOKUP_PARENT)	/* caller wants the parent only */
  815			goto lookup_parent;
  816		if (this.name[0] == '.') switch (this.len) {
  817			default:
  818				break;
  819			case 2:	
  820				if (this.name[1] != '.')
  821					break;
  822				follow_dotdot(&nd->mnt, &nd->dentry);
  823				inode = nd->dentry->d_inode;
  824				/* fallthrough */
  825			case 1:
  826				goto return_reval;
  827		}
  828		if (nd->dentry->d_op && nd->dentry->d_op->d_hash) {
  829			err = nd->dentry->d_op->d_hash(nd->dentry, &this);
  830			if (err < 0)
  831				break;
  832		}
  833		err = do_lookup(nd, &this, &next);
  834		if (err)
  835			break;
  836		follow_mount(&next.mnt, &next.dentry);
  837		inode = next.dentry->d_inode;
  838		if ((lookup_flags & LOOKUP_FOLLOW)
  839		    && inode && inode->i_op && inode->i_op->follow_link) {
  840			mntget(next.mnt);
  841			err = do_follow_link(next.dentry, nd);
  842			dput(next.dentry);
  843			mntput(next.mnt);
  844			if (err)
  845				goto return_err;
  846			inode = nd->dentry->d_inode;
  847		} else {
  848			dput(nd->dentry);
  849			nd->mnt = next.mnt;
  850			nd->dentry = next.dentry;
  851		}
  852		err = -ENOENT;
  853		if (!inode)	/* final component has no inode */
  854			break;
  855		if (lookup_flags & LOOKUP_DIRECTORY) {
  856			err = -ENOTDIR; 
  857			if (!inode->i_op || !inode->i_op->lookup)
  858				break;
  859		}
  860		goto return_base;
  861lookup_parent:
  862		nd->last = this;	/* record the final component for the caller */
  863		nd->last_type = LAST_NORM;
  864		if (this.name[0] != '.')
  865			goto return_base;
  866		if (this.len == 1)
  867			nd->last_type = LAST_DOT;
  868		else if (this.len == 2 && this.name[1] == '.')
  869			nd->last_type = LAST_DOTDOT;
  870		else
  871			goto return_base;
  872return_reval:
  873		/*
  874		 * We bypassed the ordinary revalidation routines.
  875		 * We may need to check the cached dentry for staleness.
  876		 */
  877		if (nd->dentry && nd->dentry->d_sb &&
  878		    (nd->dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)) {
  879			err = -ESTALE;
  880			/* Note: we do not d_invalidate() */
  881			if (!nd->dentry->d_op->d_revalidate(nd->dentry, nd))
  882				break;
  883		}
  884return_base:
  885		return 0;
  886out_dput:
  887		dput(next.dentry);	/* drop the lookup result, then fail via path_release() */
  888		break;
  889	}
  890	path_release(nd);
  891return_err:
  892	return err;
  893}
 894
 895/*
 896 * Wrapper to retry pathname resolution whenever the underlying
 897 * file system returns an ESTALE.
 898 *
 899 * Retry the whole path once, forcing real lookup requests
 900 * instead of relying on the dcache.
 901 */
 902int fastcall link_path_walk(const char *name, struct nameidata *nd)
 903{
 904	struct nameidata save = *nd;
 905	int result;
 906
 907	/* make sure the stuff we saved doesn't go away */
 908	dget(save.dentry);
 909	mntget(save.mnt);
 910
 911	result = __link_path_walk(name, nd);
 912	if (result == -ESTALE) {
 913		*nd = save;
 914		dget(nd->dentry);
 915		mntget(nd->mnt);
 916		nd->flags |= LOOKUP_REVAL;
 917		result = __link_path_walk(name, nd);
 918	}
 919
 920	dput(save.dentry);
 921	mntput(save.mnt);
 922
 923	return result;
 924}
 925
 926int fastcall path_walk(const char * name, struct nameidata *nd)
 927{
 928	current->total_link_count = 0;
 929	return link_path_walk(name, nd);
 930}
 931
 932/* SMP-safe */
 933/* returns 1 if everything is done */
 934static int __emul_lookup_dentry(const char *name, struct nameidata *nd)
 935{
 936	if (path_walk(name, nd))
 937		return 0;		/* something went wrong... */
 938
 939	if (!nd->dentry->d_inode || S_ISDIR(nd->dentry->d_inode->i_mode)) {
 940		struct dentry *old_dentry = nd->dentry;
 941		struct vfsmount *old_mnt = nd->mnt;
 942		struct qstr last = nd->last;
 943		int last_type = nd->last_type;
 944		/*
 945		 * NAME was not found in alternate root or it's a directory.  Try to find
 946		 * it in the normal root:
 947		 */
 948		nd->last_type = LAST_ROOT;
 949		read_lock(&current->fs->lock);
 950		nd->mnt = mntget(current->fs->rootmnt);
 951		nd->dentry = dget(current->fs->root);
 952		read_unlock(&current->fs->lock);
 953		if (path_walk(name, nd) == 0) {
 954			if (nd->dentry->d_inode) {
 955				dput(old_dentry);
 956				mntput(old_mnt);
 957				return 1;
 958			}
 959			path_release(nd);
 960		}
 961		nd->dentry = old_dentry;
 962		nd->mnt = old_mnt;
 963		nd->last = last;
 964		nd->last_type = last_type;
 965	}
 966	return 1;
 967}
 968
 969void set_fs_altroot(void)
 970{
 971	char *emul = __emul_prefix();
 972	struct nameidata nd;
 973	struct vfsmount *mnt = NULL, *oldmnt;
 974	struct dentry *dentry = NULL, *olddentry;
 975	int err;
 976
 977	if (!emul)
 978		goto set_it;
 979	err = path_lookup(emul, LOOKUP_FOLLOW|LOOKUP_DIRECTORY|LOOKUP_NOALT, &nd);
 980	if (!err) {
 981		mnt = nd.mnt;
 982		dentry = nd.dentry;
 983	}
 984set_it:
 985	write_lock(&current->fs->lock);
 986	oldmnt = current->fs->altrootmnt;
 987	olddentry = current->fs->altroot;
 988	current->fs->altrootmnt = mnt;
 989	current->fs->altroot = dentry;
 990	write_unlock(&current->fs->lock);
 991	if (olddentry) {
 992		dput(olddentry);
 993		mntput(oldmnt);
 994	}
 995}
 996
 997int fastcall path_lookup(const char *name, unsigned int flags, struct nameidata *nd)
 998{
 999	int retval;
1000
1001	nd->last_type = LAST_ROOT; /* if there are only slashes... */
1002	nd->flags = flags;
1003	nd->depth = 0;
1004
1005	read_lock(&current->fs->lock);
1006	if (*name=='/') {
1007		if (current->fs->altroot && !(nd->flags & LOOKUP_NOALT)) {
1008			nd->mnt = mntget(current->fs->altrootmnt);
1009			nd->dentry = dget(current->fs->altroot);
1010			read_unlock(&current->fs->lock);
1011			if (__emul_lookup_dentry(name,nd))
1012				return 0;
1013			read_lock(&current->fs->lock);
1014		}
1015		nd->mnt = mntget(current->fs->rootmnt);
1016		nd->dentry = dget(current->fs->root);
1017	} else {
1018		nd->mnt = mntget(current->fs->pwdmnt);
1019		nd->dentry = dget(current->fs->pwd);
1020	}
1021	read_unlock(&current->fs->lock);
1022	current->total_link_count = 0;
1023	retval = link_path_walk(name, nd);
1024	if (unlikely(current->audit_context
1025		     && nd && nd->dentry && nd->dentry->d_inode))
1026		audit_inode(name, nd->dentry->d_inode);
1027	return retval;
1028}
1029
1030/*
1031 * Restricted form of lookup. Doesn't follow links, single-component only,
1032 * needs parent already locked. Doesn't follow mounts.
1033 * SMP-safe.
1034 */
1035static struct dentry * __lookup_hash(struct qstr *name, struct dentry * base, struct nameidata *nd)
1036{
1037	struct dentry * dentry;
1038	struct inode *inode;
1039	int err;
1040
1041	inode = base->d_inode;
1042	err = permission(inode, MAY_EXEC, nd);
1043	dentry = ERR_PTR(err);
1044	if (err)
1045		goto out;
1046
1047	/*
1048	 * See if the low-level filesystem might want
1049	 * to use its own hash..
1050	 */
1051	if (base->d_op && base->d_op->d_hash) {
1052		err = base->d_op->d_hash(base, name);
1053		dentry = ERR_PTR(err);
1054		if (err < 0)
1055			goto out;
1056	}
1057
1058	dentry = cached_lookup(base, name, nd);
1059	if (!dentry) {
1060		struct dentry *new = d_alloc(base, name);
1061		dentry = ERR_PTR(-ENOMEM);
1062		if (!new)
1063			goto out;
1064		dentry = inode->i_op->lookup(inode, new, nd);
1065		if (!dentry)
1066			dentry = new;
1067		else
1068			dput(new);
1069	}
1070out:
1071	return dentry;
1072}
1073
1074struct dentry * lookup_hash(struct qstr *name, struct dentry * base)
1075{
1076	return __lookup_hash(name, base, NULL);
1077}
1078
1079/* SMP-safe */
1080struct dentry * lookup_one_len(const char * name, struct dentry * base, int len)
1081{
1082	unsigned long hash;
1083	struct qstr this;
1084	unsigned int c;
1085
1086	this.name = name;
1087	this.len = len;
1088	if (!len)
1089		goto access;
1090
1091	hash = init_name_hash();
1092	while (len--) {
1093		c = *(const unsigned char *)name++;
1094		if (c == '/' || c == '\0')
1095			goto access;
1096		hash = partial_name_hash(c, hash);
1097	}
1098	this.hash = end_name_hash(hash);
1099
1100	return lookup_hash(&this, base);
1101access:
1102	return ERR_PTR(-EACCES);
1103}
1104
1105/*
1106 *	namei()
1107 *
1108 * is used by most simple commands to get the inode of a specified name.
1109 * Open, link etc use their own routines, but this is enough for things
1110 * like 'chmod' etc.
1111 *
1112 * namei exists in two versions: namei/lnamei. The only difference is
1113 * that namei follows links, while lnamei does not.
1114 * SMP-safe
1115 */
1116int fastcall __user_walk(const char __user *name, unsigned flags, struct nameidata *nd)
1117{
1118	char *tmp = getname(name);
1119	int err = PTR_ERR(tmp);
1120
1121	if (!IS_ERR(tmp)) {
1122		err = path_lookup(tmp, flags, nd);
1123		putname(tmp);
1124	}
1125	return err;
1126}
1127
1128/*
1129 * It's inline, so penalty for filesystems that don't use sticky bit is
1130 * minimal.
1131 */
1132static inline int check_sticky(struct inode *dir, struct inode *inode)
1133{
1134	if (!(dir->i_mode & S_ISVTX))
1135		return 0;
1136	if (inode->i_uid == current->fsuid)
1137		return 0;
1138	if (dir->i_uid == current->fsuid)
1139		return 0;
1140	return !capable(CAP_FOWNER);
1141}
1142
1143/*
1144 *	Check whether we can remove a link victim from directory dir, check
1145 *  whether the type of victim is right.
1146 *  1. We can't do it if dir is read-only (done in permission())
1147 *  2. We should have write and exec permissions on dir
1148 *  3. We can't remove anything from append-only dir
1149 *  4. We can't do anything with immutable dir (done in permission())
1150 *  5. If the sticky bit on dir is set we should either
1151 *	a. be owner of dir, or
1152 *	b. be owner of victim, or
1153 *	c. have CAP_FOWNER capability
1154 *  6. If the victim is append-only or immutable we can't do antyhing with
1155 *     links pointing to it.
1156 *  7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
1157 *  8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
1158 *  9. We can't remove a root or mountpoint.
1159 * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
1160 *     nfs_async_unlink().
1161 */
1162static inline int may_delete(struct inode *dir,struct dentry *victim,int isdir)
1163{
1164	int error;
1165
1166	if (!victim->d_inode)
1167		return -ENOENT;
1168
1169	BUG_ON(victim->d_parent->d_inode != dir);
1170
1171	error = permission(dir,MAY_WRITE | MAY_EXEC, NULL);
1172	if (error)
1173		return error;
1174	if (IS_APPEND(dir))
1175		return -EPERM;
1176	if (check_sticky(dir, victim->d_inode)||IS_APPEND(victim->d_inode)||
1177	    IS_IMMUTABLE(victim->d_inode))
1178		return -EPERM;
1179	if (isdir) {
1180		if (!S_ISDIR(victim->d_inode->i_mode))
1181			return -ENOTDIR;
1182		if (IS_ROOT(victim))
1183			return -EBUSY;
1184	} else if (S_ISDIR(victim->d_inode->i_mode))
1185		return -EISDIR;
1186	if (IS_DEADDIR(dir))
1187		return -ENOENT;
1188	if (victim->d_flags & DCACHE_NFSFS_RENAMED)
1189		return -EBUSY;
1190	return 0;
1191}
1192
1193/*	Check whether we can create an object with dentry child in directory
1194 *  dir.
1195 *  1. We can't do it if child already exists (open has special treatment for
1196 *     this case, but since we are inlined it's OK)
1197 *  2. We can't do it if dir is read-only (done in permission())
1198 *  3. We should have write and exec permissions on dir
1199 *  4. We can't do it if dir is immutable (done in permission())
1200 */
1201static inline int may_create(struct inode *dir, struct dentry *child,
1202			     struct nameidata *nd)
1203{
1204	if (child->d_inode)
1205		return -EEXIST;
1206	if (IS_DEADDIR(dir))
1207		return -ENOENT;
1208	return permission(dir,MAY_WRITE | MAY_EXEC, nd);
1209}
1210
1211/* 
1212 * Special case: O_CREAT|O_EXCL implies O_NOFOLLOW for security
1213 * reasons.
1214 *
1215 * O_DIRECTORY translates into forcing a directory lookup.
1216 */
1217static inline int lookup_flags(unsigned int f)
1218{
1219	unsigned long retval = LOOKUP_FOLLOW;
1220
1221	if (f & O_NOFOLLOW)
1222		retval &= ~LOOKUP_FOLLOW;
1223	
1224	if ((f & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL))
1225		retval &= ~LOOKUP_FOLLOW;
1226	
1227	if (f & O_DIRECTORY)
1228		retval |= LOOKUP_DIRECTORY;
1229
1230	return retval;
1231}
1232
1233/*
1234 * p1 and p2 should be directories on the same fs.
1235 */
1236struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
1237{
1238	struct dentry *p;
1239
1240	if (p1 == p2) {
1241		down(&p1->d_inode->i_sem);
1242		return NULL;
1243	}
1244
1245	down(&p1->d_inode->i_sb->s_vfs_rename_sem);
1246
1247	for (p = p1; p->d_parent != p; p = p->d_parent) {
1248		if (p->d_parent == p2) {
1249			down(&p2->d_inode->i_sem);
1250			down(&p1->d_inode->i_sem);
1251			return p;
1252		}
1253	}
1254
1255	for (p = p2; p->d_parent != p; p = p->d_parent) {
1256		if (p->d_parent == p1) {
1257			down(&p1->d_inode->i_sem);
1258			down(&p2->d_inode->i_sem);
1259			return p;
1260		}
1261	}
1262
1263	down(&p1->d_inode->i_sem);
1264	down(&p2->d_inode->i_sem);
1265	return NULL;
1266}
1267
1268void unlock_rename(struct dentry *p1, struct dentry *p2)
1269{
1270	up(&p1->d_inode->i_sem);
1271	if (p1 != p2) {
1272		up(&p2->d_inode->i_sem);
1273		up(&p1->d_inode->i_sb->s_vfs_rename_sem);
1274	}
1275}
1276
1277int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
1278		struct nameidata *nd)
1279{
1280	int error = may_create(dir, dentry, nd);
1281
1282	if (error)
1283		return error;
1284
1285	if (!dir->i_op || !dir->i_op->create)
1286		return -EACCES;	/* shouldn't it be ENOSYS? */
1287	mode &= S_IALLUGO;
1288	mode |= S_IFREG;
1289	error = security_inode_create(dir, dentry, mode);
1290	if (error)
1291		return error;
1292	DQUOT_INIT(dir);
1293	error = dir->i_op->create(dir, dentry, mode, nd);
1294	if (!error) {
1295		inode_dir_notify(dir, DN_CREATE);
1296		security_inode_post_create(dir, dentry, mode);
1297	}
1298	return error;
1299}
1300
1301int may_open(struct nameidata *nd, int acc_mode, int flag)
1302{
1303	struct dentry *dentry = nd->dentry;
1304	struct inode *inode = dentry->d_inode;
1305	int error;
1306
1307	if (!inode)
1308		return -ENOENT;
1309
1310	if (S_ISLNK(inode->i_mode))
1311		return -ELOOP;
1312	
1313	if (S_ISDIR(inode->i_mode) && (flag & FMODE_WRITE))
1314		return -EISDIR;
1315
1316	error = permission(inode, acc_mode, nd);
1317	if (error)
1318		return error;
1319
1320	/*
1321	 * FIFO's, sockets and device files are special: they don't
1322	 * actually live on the filesystem itself, and as such you
1323	 * can write to them even if the filesystem is read-only.
1324	 */
1325	if (S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
1326	    	flag &= ~O_TRUNC;
1327	} else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) {
1328		if (nd->mnt->mnt_flags & MNT_NODEV)
1329			return -EACCES;
1330
1331		flag &= ~O_TRUNC;
1332	} else if (IS_RDONLY(inode) && (flag & FMODE_WRITE))
1333		return -EROFS;
1334	/*
1335	 * An append-only file must be opened in append mode for writing.
1336	 */
1337	if (IS_APPEND(inode)) {
1338		if  ((flag & FMODE_WRITE) && !(flag & O_APPEND))
1339			return -EPERM;
1340		if (flag & O_TRUNC)
1341			return -EPERM;
1342	}
1343
1344	/* O_NOATIME can only be set by the owner or superuser */
1345	if (flag & O_NOATIME)
1346		if (current->fsuid != inode->i_uid && !capable(CAP_FOWNER))
1347			return -EPERM;
1348
1349	/*
1350	 * Ensure there are no outstanding leases on the file.
1351	 */
1352	error = break_lease(inode, flag);
1353	if (error)
1354		return error;
1355
1356	if (flag & O_TRUNC) {
1357		error = get_write_access(inode);
1358		if (error)
1359			return error;
1360
1361		/*
1362		 * Refuse to truncate files with mandatory locks held on them.
1363		 */
1364		error = locks_verify_locked(inode);
1365		if (!error) {
1366			DQUOT_INIT(inode);
1367			
1368			error = do_truncate(dentry, 0);
1369		}
1370		put_write_access(inode);
1371		if (error)
1372			return error;
1373	} else
1374		if (flag & FMODE_WRITE)
1375			DQUOT_INIT(inode);
1376
1377	return 0;
1378}
1379
1380/*
1381 *	open_namei()
1382 *
1383 * namei for open - this is in fact almost the whole open-routine.
1384 *
1385 * Note that the low bits of "flag" aren't the same as in the open
1386 * system call - they are 00 - no permissions needed
1387 *			  01 - read permission needed
1388 *			  10 - write permission needed
1389 *			  11 - read/write permissions needed
1390 * which is a lot more logical, and also allows the "no perm" needed
1391 * for symlinks (where the permissions are checked later).
1392 * SMP-safe
1393 */
1394int open_namei(const char * pathname, int flag, int mode, struct nameidata *nd)
1395{
1396	int acc_mode, error = 0;
1397	struct dentry *dentry;
1398	struct dentry *dir;
1399	int count = 0;
1400
1401	acc_mode = ACC_MODE(flag);
1402
1403	/* Allow the LSM permission hook to distinguish append 
1404	   access from general write access. */
1405	if (flag & O_APPEND)
1406		acc_mode |= MAY_APPEND;
1407
1408	/* Fill in the open() intent data */
1409	nd->intent.open.flags = flag;
1410	nd->intent.open.create_mode = mode;
1411
1412	/*
1413	 * The simplest case - just a plain lookup.
1414	 */
1415	if (!(flag & O_CREAT)) {
1416		error = path_lookup(pathname, lookup_flags(flag)|LOOKUP_OPEN, nd);
1417		if (error)
1418			return error;
1419		goto ok;
1420	}
1421
1422	/*
1423	 * Create - we need to know the parent.
1424	 */
1425	error = path_lookup(pathname, LOOKUP_PARENT|LOOKUP_OPEN|LOOKUP_CREATE, nd);
1426	if (error)
1427		return error;
1428
1429	/*
1430	 * We have the parent and last component. First of all, check
1431	 * that we are not asked to creat(2) an obvious directory - that
1432	 * will not do.
1433	 */
1434	error = -EISDIR;
1435	if (nd->last_type != LAST_NORM || nd->last.name[nd->last.len])
1436		goto exit;
1437
1438	dir = nd->dentry;
1439	nd->flags &= ~LOOKUP_PARENT;
1440	down(&dir->d_inode->i_sem);
1441	dentry = __lookup_hash(&nd->last, nd->dentry, nd);
1442
1443do_last:
1444	error = PTR_ERR(dentry);
1445	if (IS_ERR(dentry)) {
1446		up(&dir->d_inode->i_sem);
1447		goto exit;
1448	}
1449
1450	/* Negative dentry, just create the file */
1451	if (!dentry->d_inode) {
1452		if (!IS_POSIXACL(dir->d_inode))
1453			mode &= ~current->fs->umask;
1454		error = vfs_create(dir->d_inode, dentry, mode, nd);
1455		up(&dir->d_inode->i_sem);
1456		dput(nd->dentry);
1457		nd->dentry = dentry;
1458		if (error)
1459			goto exit;
1460		/* Don't check for write permission, don't truncate */
1461		acc_mode = 0;
1462		flag &= ~O_TRUNC;
1463		goto ok;
1464	}
1465
1466	/*
1467	 * It already exists.
1468	 */
1469	up(&dir->d_inode->i_sem);
1470
1471	error = -EEXIST;
1472	if (flag & O_EXCL)
1473		goto exit_dput;
1474
1475	if (d_mountpoint(dentry)) {
1476		error = -ELOOP;
1477		if (flag & O_NOFOLLOW)
1478			goto exit_dput;
1479		while (__follow_down(&nd->mnt,&dentry) && d_mountpoint(dentry));
1480	}
1481	error = -ENOENT;
1482	if (!dentry->d_inode)
1483		goto exit_dput;
1484	if (dentry->d_inode->i_op && dentry->d_inode->i_op->follow_link)
1485		goto do_link;
1486
1487	dput(nd->dentry);
1488	nd->dentry = dentry;
1489	error = -EISDIR;
1490	if (dentry->d_inode && S_ISDIR(dentry->d_inode->i_mode))
1491		goto exit;
1492ok:
1493	error = may_open(nd, acc_mode, flag);
1494	if (error)
1495		goto exit;
1496	return 0;
1497
1498exit_dput:
1499	dput(dentry);
1500exit:
1501	path_release(nd);
1502	return error;
1503
1504do_link:
1505	error = -ELOOP;
1506	if (flag & O_NOFOLLOW)
1507		goto exit_dput;
1508	/*
1509	 * This is subtle. Instead of calling do_follow_link() we do the
1510	 * thing by hands. The reason is that this way we have zero link_count
1511	 * and path_walk() (called from ->follow_link) honoring LOOKUP_PARENT.
1512	 * After that we have the parent and last component, i.e.
1513	 * we are in the same situation as after the first path_walk().
1514	 * Well, almost - if the last component is normal we get its copy
1515	 * stored in nd->last.name and we will have to putname() it when we
1516	 * are done. Procfs-like symlinks just set LAST_BIND.
1517	 */
1518	nd->flags |= LOOKUP_PARENT;
1519	error = security_inode_follow_link(dentry, nd);
1520	if (error)
1521		goto exit_dput;
1522	error = __do_follow_link(dentry, nd);
1523	dput(dentry);
1524	if (error)
1525		return error;
1526	nd->flags &= ~LOOKUP_PARENT;
1527	if (nd->last_type == LAST_BIND) {
1528		dentry = nd->dentry;
1529		goto ok;
1530	}
1531	error = -EISDIR;
1532	if (nd->last_type != LAST_NORM)
1533		goto exit;
1534	if (nd->last.name[nd->last.len]) {
1535		putname(nd->last.name);
1536		goto exit;
1537	}
1538	error = -ELOOP;
1539	if (count++==32) {
1540		putname(nd->last.name);
1541		goto exit;
1542	}
1543	dir = nd->dentry;
1544	down(&dir->d_inode->i_sem);
1545	dentry = __lookup_hash(&nd->last, nd->dentry, nd);
1546	putname(nd->last.name);
1547	goto do_last;
1548}
1549
1550/**
1551 * lookup_create - lookup a dentry, creating it if it doesn't exist
1552 * @nd: nameidata info
1553 * @is_dir: directory flag
1554 *
1555 * Simple function to lookup and return a dentry and create it
1556 * if it doesn't exist.  Is SMP-safe.
1557 */
1558struct dentry *lookup_create(struct nameidata *nd, int is_dir)
1559{
1560	struct dentry *dentry;
1561
1562	down(&nd->dentry->d_inode->i_sem);
1563	dentry = ERR_PTR(-EEXIST);
1564	if (nd->last_type != LAST_NORM)
1565		goto fail;
1566	nd->flags &= ~LOOKUP_PARENT;
1567	dentry = lookup_hash(&nd->last, nd->dentry);
1568	if (IS_ERR(dentry))
1569		goto fail;
1570	if (!is_dir && nd->last.name[nd->last.len] && !dentry->d_inode)
1571		goto enoent;
1572	return dentry;
1573enoent:
1574	dput(dentry);
1575	dentry = ERR_PTR(-ENOENT);
1576fail:
1577	return dentry;
1578}
1579
1580int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
1581{
1582	int error = may_create(dir, dentry, NULL);
1583
1584	if (error)
1585		return error;
1586
1587	if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD))
1588		return -EPERM;
1589
1590	if (!dir->i_op || !dir->i_op->mknod)
1591		return -EPERM;
1592
1593	error = security_inode_mknod(dir, dentry, mode, dev);
1594	if (error)
1595		return error;
1596
1597	DQUOT_INIT(dir);
1598	error = dir->i_op->mknod(dir, dentry, mode, dev);
1599	if (!error) {
1600		inode_dir_notify(dir, DN_CREATE);
1601		security_inode_post_mknod(dir, dentry, mode, dev);
1602	}
1603	return error;
1604}
1605
1606asmlinkage long sys_mknod(const char __user * filename, int mode, unsigned dev)
1607{
1608	int error = 0;
1609	char * tmp;
1610	struct dentry * dentry;
1611	struct nameidata nd;
1612
1613	if (S_ISDIR(mode))
1614		return -EPERM;
1615	tmp = getname(filename);
1616	if (IS_ERR(tmp))
1617		return PTR_ERR(tmp);
1618
1619	error = path_lookup(tmp, LOOKUP_PARENT, &nd);
1620	if (error)
1621		goto out;
1622	dentry = lookup_create(&nd, 0);
1623	error = PTR_ERR(dentry);
1624
1625	if (!IS_POSIXACL(nd.dentry->d_inode))
1626		mode &= ~current->fs->umask;
1627	if (!IS_ERR(dentry)) {
1628		switch (mode & S_IFMT) {
1629		case 0: case S_IFREG:
1630			error = vfs_create(nd.dentry->d_inode,dentry,mode,&nd);
1631			break;
1632		case S_IFCHR: case S_IFBLK:
1633			error = vfs_mknod(nd.dentry->d_inode,dentry,mode,
1634					new_decode_dev(dev));
1635			break;
1636		case S_IFIFO: case S_IFSOCK:
1637			error = vfs_mknod(nd.dentry->d_inode,dentry,mode,0);
1638			break;
1639		case S_IFDIR:
1640			error = -EPERM;
1641			break;
1642		default:
1643			error = -EINVAL;
1644		}
1645		dput(dentry);
1646	}
1647	up(&nd.dentry->d_inode->i_sem);
1648	path_release(&nd);
1649out:
1650	putname(tmp);
1651
1652	return error;
1653}
1654
1655int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1656{
1657	int error = may_create(dir, dentry, NULL);
1658
1659	if (error)
1660		return error;
1661
1662	if (!dir->i_op || !dir->i_op->mkdir)
1663		return -EPERM;
1664
1665	mode &= (S_IRWXUGO|S_ISVTX);
1666	error = security_inode_mkdir(dir, dentry, mode);
1667	if (error)
1668		return error;
1669
1670	DQUOT_INIT(dir);
1671	error = dir->i_op->mkdir(dir, dentry, mode);
1672	if (!error) {
1673		inode_dir_notify(dir, DN_CREATE);
1674		security_inode_post_mkdir(dir,dentry, mode);
1675	}
1676	return error;
1677}
1678
1679asmlinkage long sys_mkdir(const char __user * pathname, int mode)
1680{
1681	int error = 0;
1682	char * tmp;
1683
1684	tmp = getname(pathname);
1685	error = PTR_ERR(tmp);
1686	if (!IS_ERR(tmp)) {
1687		struct dentry *dentry;
1688		struct nameidata nd;
1689
1690		error = path_lookup(tmp, LOOKUP_PARENT, &nd);
1691		if (error)
1692			goto out;
1693		dentry = lookup_create(&nd, 1);
1694		error = PTR_ERR(dentry);
1695		if (!IS_ERR(dentry)) {
1696			if (!IS_POSIXACL(nd.dentry->d_inode))
1697				mode &= ~current->fs->umask;
1698			error = vfs_mkdir(nd.dentry->d_inode, dentry, mode);
1699			dput(dentry);
1700		}
1701		up(&nd.dentry->d_inode->i_sem);
1702		path_release(&nd);
1703out:
1704		putname(tmp);
1705	}
1706
1707	return error;
1708}
1709
1710/*
1711 * We try to drop the dentry early: we should have
1712 * a usage count of 2 if we're the only user of this
1713 * dentry, and if that is true (possibly after pruning
1714 * the dcache), then we drop the dentry now.
1715 *
1716 * A low-level filesystem can, if it choses, legally
1717 * do a
1718 *
1719 *	if (!d_unhashed(dentry))
1720 *		return -EBUSY;
1721 *
1722 * if it cannot handle the case of removing a directory
1723 * that is still in use by something else..
1724 */
1725void dentry_unhash(struct dentry *dentry)
1726{
1727	dget(dentry);
1728	if (atomic_read(&dentry->d_count))
1729		shrink_dcache_parent(dentry);
1730	spin_lock(&dcache_lock);
1731	spin_lock(&dentry->d_lock);
1732	if (atomic_read(&dentry->d_count) == 2)
1733		__d_drop(dentry);
1734	spin_unlock(&dentry->d_lock);
1735	spin_unlock(&dcache_lock);
1736}
1737
1738int vfs_rmdir(struct inode *dir, struct dentry *dentry)
1739{
1740	int error = may_delete(dir, dentry, 1);
1741
1742	if (error)
1743		return error;
1744
1745	if (!dir->i_op || !dir->i_op->rmdir)
1746		return -EPERM;
1747
1748	DQUOT_INIT(dir);
1749
1750	down(&dentry->d_inode->i_sem);
1751	dentry_unhash(dentry);
1752	if (d_mountpoint(dentry))
1753		error = -EBUSY;
1754	else {
1755		error = security_inode_rmdir(dir, dentry);
1756		if (!error) {
1757			error = dir->i_op->rmdir(dir, dentry);
1758			if (!error)
1759				dentry->d_inode->i_flags |= S_DEAD;
1760		}
1761	}
1762	up(&dentry->d_inode->i_sem);
1763	if (!error) {
1764		inode_dir_notify(dir, DN_DELETE);
1765		d_delete(dentry);
1766	}
1767	dput(dentry);
1768
1769	return error;
1770}
1771
1772asmlinkage long sys_rmdir(const char __user * pathname)
1773{
1774	int error = 0;
1775	char * name;
1776	struct dentry *dentry;
1777	struct nameidata nd;
1778
1779	name = getname(pathname);
1780	if(IS_ERR(name))
1781		return PTR_ERR(name);
1782
1783	error = path_lookup(name, LOOKUP_PARENT, &nd);
1784	if (error)
1785		goto exit;
1786
1787	switch(nd.last_type) {
1788		case LAST_DOTDOT:
1789			error = -ENOTEMPTY;
1790			goto exit1;
1791		case LAST_DOT:
1792			error = -EINVAL;
1793			goto exit1;
1794		case LAST_ROOT:
1795			error = -EBUSY;
1796			goto exit1;
1797	}
1798	down(&nd.dentry->d_inode->i_sem);
1799	dentry = lookup_hash(&nd.last, nd.dentry);
1800	error = PTR_ERR(dentry);
1801	if (!IS_ERR(dentry)) {
1802		error = vfs_rmdir(nd.dentry->d_inode, dentry);
1803		dput(dentry);
1804	}
1805	up(&nd.dentry->d_inode->i_sem);
1806exit1:
1807	path_release(&nd);
1808exit:
1809	putname(name);
1810	return error;
1811}
1812
1813int vfs_unlink(struct inode *dir, struct dentry *dentry)
1814{
1815	int error = may_delete(dir, dentry, 0);
1816
1817	if (error)
1818		return error;
1819
1820	if (!dir->i_op || !dir->i_op->unlink)
1821		return -EPERM;
1822
1823	DQUOT_INIT(dir);
1824
1825	down(&dentry->d_inode->i_sem);
1826	if (d_mountpoint(dentry))
1827		error = -EBUSY;
1828	else {
1829		error = security_inode_unlink(dir, dentry);
1830		if (!error)
1831			error = dir->i_op->unlink(dir, dentry);
1832	}
1833	up(&dentry->d_inode->i_sem);
1834
1835	/* We don't d_delete() NFS sillyrenamed files--they still exist. */
1836	if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
1837		d_delete(dentry);
1838		inode_dir_notify(dir, DN_DELETE);
1839	}
1840	return error;
1841}
1842
1843/*
1844 * Make sure that the actual truncation of the file will occur outside its
1845 * directory's i_sem.  Truncate can take a long time if there is a lot of
1846 * writeout happening, and we don't want to prevent access to the directory
1847 * while waiting on the I/O.
1848 */
1849asmlinkage long sys_unlink(const char __user * pathname)
1850{
1851	int error = 0;
1852	char * name;
1853	struct dentry *dentry;
1854	struct nameidata nd;
1855	struct inode *inode = NULL;
1856
1857	name = getname(pathname);
1858	if(IS_ERR(name))
1859		return PTR_ERR(name);
1860
1861	error = path_lookup(name, LOOKUP_PARENT, &nd);
1862	if (error)
1863		goto exit;
1864	error = -EISDIR;
1865	if (nd.last_type != LAST_NORM)
1866		goto exit1;
1867	down(&nd.dentry->d_inode->i_sem);
1868	dentry = lookup_hash(&nd.last, nd.dentry);
1869	error = PTR_ERR(dentry);
1870	if (!IS_ERR(dentry)) {
1871		/* Why not before? Because we want correct error value */
1872		if (nd.last.name[nd.last.len])
1873			goto slashes;
1874		inode = dentry->d_inode;
1875		if (inode)
1876			atomic_inc(&inode->i_count);
1877		error = vfs_unlink(nd.dentry->d_inode, dentry);
1878	exit2:
1879		dput(dentry);
1880	}
1881	up(&nd.dentry->d_inode->i_sem);
1882	if (inode)
1883		iput(inode);	/* truncate the inode here */
1884exit1:
1885	path_release(&nd);
1886exit:
1887	putname(name);
1888	return error;
1889
1890slashes:
1891	error = !dentry->d_inode ? -ENOENT :
1892		S_ISDIR(dentry->d_inode->i_mode) ? -EISDIR : -ENOTDIR;
1893	goto exit2;
1894}
1895
1896int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname, int mode)
1897{
1898	int error = may_create(dir, dentry, NULL);
1899
1900	if (error)
1901		return error;
1902
1903	if (!dir->i_op || !dir->i_op->symlink)
1904		return -EPERM;
1905
1906	error = security_inode_symlink(dir, dentry, oldname);
1907	if (error)
1908		return error;
1909
1910	DQUOT_INIT(dir);
1911	error = dir->i_op->symlink(dir, dentry, oldname);
1912	if (!error) {
1913		inode_dir_notify(dir, DN_CREATE);
1914		security_inode_post_symlink(dir, dentry, oldname);
1915	}
1916	return error;
1917}
1918
1919asmlinkage long sys_symlink(const char __user * oldname, const char __user * newname)
1920{
1921	int error = 0;
1922	char * from;
1923	char * to;
1924
1925	from = getname(oldname);
1926	if(IS_ERR(from))
1927		return PTR_ERR(from);
1928	to = getname(newname);
1929	error = PTR_ERR(to);
1930	if (!IS_ERR(to)) {
1931		struct dentry *dentry;
1932		struct nameidata nd;
1933
1934		error = path_lookup(to, LOOKUP_PARENT, &nd);
1935		if (error)
1936			goto out;
1937		dentry = lookup_create(&nd, 0);
1938		error = PTR_ERR(dentry);
1939		if (!IS_ERR(dentry)) {
1940			error = vfs_symlink(nd.dentry->d_inode, dentry, from, S_IALLUGO);
1941			dput(dentry);
1942		}
1943		up(&nd.dentry->d_inode->i_sem);
1944		path_release(&nd);
1945out:
1946		putname(to);
1947	}
1948	putname(from);
1949	return error;
1950}
1951
1952int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry)
1953{
1954	struct inode *inode = old_dentry->d_inode;
1955	int error;
1956
1957	if (!inode)
1958		return -ENOENT;
1959
1960	error = may_create(dir, new_dentry, NULL);
1961	if (error)
1962		return error;
1963
1964	if (dir->i_sb != inode->i_sb)
1965		return -EXDEV;
1966
1967	/*
1968	 * A link to an append-only or immutable file cannot be created.
1969	 */
1970	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
1971		return -EPERM;
1972	if (!dir->i_op || !dir->i_op->link)
1973		return -EPERM;
1974	if (S_ISDIR(old_dentry->d_inode->i_mode))
1975		return -EPERM;
1976
1977	error = security_inode_link(old_dentry, dir, new_dentry);
1978	if (error)
1979		return error;
1980
1981	down(&old_dentry->d_inode->i_sem);
1982	DQUOT_INIT(dir);
1983	error = dir->i_op->link(old_dentry, dir, new_dentry);
1984	up(&old_dentry->d_inode->i_sem);
1985	if (!error) {
1986		inode_dir_notify(dir, DN_CREATE);
1987		security_inode_post_link(old_dentry, dir, new_dentry);
1988	}
1989	return error;
1990}
1991
1992/*
1993 * Hardlinks are often used in delicate situations.  We avoid
1994 * security-related surprises by not following symlinks on the
1995 * newname.  --KAB
1996 *
1997 * We don't follow them on the oldname either to be compatible
1998 * with linux 2.0, and to avoid hard-linking to directories
1999 * and other special files.  --ADM
2000 */
2001asmlinkage long sys_link(const char __user * oldname, const char __user * newname)
2002{
2003	struct dentry *new_dentry;
2004	struct nameidata nd, old_nd;
2005	int error;
2006	char * to;
2007
2008	to = getname(newname);
2009	if (IS_ERR(to))
2010		return PTR_ERR(to);
2011
2012	error = __user_walk(oldname, 0, &old_nd);
2013	if (error)
2014		goto exit;
2015	error = path_lookup(to, LOOKUP_PARENT, &nd);
2016	if (error)
2017		goto out;
2018	error = -EXDEV;
2019	if (old_nd.mnt != nd.mnt)
2020		goto out_release;
2021	new_dentry = lookup_create(&nd, 0);
2022	error = PTR_ERR(new_dentry);
2023	if (!IS_ERR(new_dentry)) {
2024		error = vfs_link(old_nd.dentry, nd.dentry->d_inode, new_dentry);
2025		dput(new_dentry);
2026	}
2027	up(&nd.dentry->d_inode->i_sem);
2028out_release:
2029	path_release(&nd);
2030out:
2031	path_release(&old_nd);
2032exit:
2033	putname(to);
2034
2035	return error;
2036}
2037
2038/*
2039 * The worst of all namespace operations - renaming directory. "Perverted"
2040 * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
2041 * Problems:
2042 *	a) we can get into loop creation. Check is done in is_subdir().
2043 *	b) race potential - two innocent renames can create a loop together.
2044 *	   That's where 4.4 screws up. Current fix: serialization on
2045 *	   sb->s_vfs_rename_sem. We might be more accurate, but that's another
2046 *	   story.
2047 *	c) we have to lock _three_ objects - parents and victim (if it exists).
2048 *	   And that - after we got ->i_sem on parents (until then we don't know
2049 *	   whether the target exists).  Solution: try to be smart with locking
2050 *	   order for inodes.  We rely on the fact that tree topology may change
2051 *	 …

Large files are truncated, but you can click here to view the full file