
From: Andreas Gruenbacher <agruen@suse.de>

This patch removes the dependency on i_sem in the getxattr and
listxattr iops of ext2 and ext3. In addition, the global ext[23]_xattr
semaphores go away. Instead of i_sem and the global semaphore, mutual
exclusion is now ensured by per-inode xattr semaphores, and by locking
the buffers before modifying them. The detailed locking strategy is
described in comments in fs/ext[23]/xattr.c.

Due to this change it is no longer necessary to take i_sem in
ext[23]_permission() for retrieving acls, so the
ext[23]_permission_locked() functions go away.

Additionally, the patch fixes a race condition in ext[23]_permission:
Accessing inode->i_acl was protected by the BKL in 2.4; in 2.5 there no
longer is such protection. Instead, inode->i_acl (and inode->i_default_acl)
are now accessed under inode->i_lock. (This could be replaced by RCU in
the future.)

In the ext3 extended attribute code, an new uglines results from locking
at the buffer head level: The buffer lock must be held between testing
if an xattr block can be modified and the actual modification to prevent
races from happening. Before a block can be modified,
ext3_journal_get_write_access() must be called. But this requies an unlocked
buffer, so I call ext3_journal_get_write_access() before locking the
buffer. If it turns out that the buffer cannot be modified, 
journal_release_buffer() is called. Calling ext3_journal_get_write_access
after the test but while the buffer is still locked would be much better.



 fs/ext2/acl.c             |  104 ++++++++++-------------
 fs/ext2/acl.h             |    1 
 fs/ext2/ext2.h            |   10 ++
 fs/ext2/super.c           |    3 
 fs/ext2/xattr.c           |  155 ++++++++++++++++++-----------------
 fs/ext2/xattr_user.c      |   12 --
 fs/ext3/acl.c             |   99 ++++++++++------------
 fs/ext3/acl.h             |    1 
 fs/ext3/super.c           |    3 
 fs/ext3/xattr.c           |  201 +++++++++++++++++++++++++---------------------
 fs/ext3/xattr_user.c      |   12 --
 include/linux/ext3_fs_i.h |   10 ++
 12 files changed, 311 insertions(+), 300 deletions(-)

diff -puN fs/ext2/acl.c~xattr-fine-grained-locking fs/ext2/acl.c
--- 25/fs/ext2/acl.c~xattr-fine-grained-locking	2003-07-04 22:31:27.000000000 -0700
+++ 25-akpm/fs/ext2/acl.c	2003-07-04 22:31:39.000000000 -0700
@@ -124,14 +124,38 @@ fail:
 	return ERR_PTR(-EINVAL);
 }
 
+static inline struct posix_acl *
+ext2_iget_acl(struct inode *inode, struct posix_acl **i_acl)
+{
+	struct posix_acl *acl = EXT2_ACL_NOT_CACHED;
+
+	spin_lock(&inode->i_lock);
+	if (*i_acl != EXT2_ACL_NOT_CACHED)
+		acl = posix_acl_dup(*i_acl);
+	spin_unlock(&inode->i_lock);
+
+	return acl;
+}
+
+static inline void
+ext2_iset_acl(struct inode *inode, struct posix_acl **i_acl,
+		   struct posix_acl *acl)
+{
+	spin_lock(&inode->i_lock);
+	if (*i_acl != EXT2_ACL_NOT_CACHED)
+		posix_acl_release(*i_acl);
+	*i_acl = posix_acl_dup(acl);
+	spin_unlock(&inode->i_lock);
+}
+
 /*
- * inode->i_sem: down
+ * inode->i_sem: don't care
  */
 static struct posix_acl *
 ext2_get_acl(struct inode *inode, int type)
 {
 	const size_t max_size = ext2_acl_size(EXT2_ACL_MAX_ENTRIES);
-	struct ext2_inode_inode *ei = EXT2_I(inode);
+	struct ext2_inode_info *ei = EXT2_I(inode);
 	int name_index;
 	char *value;
 	struct posix_acl *acl;
@@ -142,14 +166,16 @@ ext2_get_acl(struct inode *inode, int ty
 
 	switch(type) {
 		case ACL_TYPE_ACCESS:
-			if (ei->i_acl != EXT2_ACL_NOT_CACHED)
-				return posix_acl_dup(ei->i_acl);
+			acl = ext2_iget_acl(inode, &ei->i_acl);
+			if (acl != EXT2_ACL_NOT_CACHED)
+				return acl;
 			name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS;
 			break;
 
 		case ACL_TYPE_DEFAULT:
-			if (ei->i_default_acl != EXT2_ACL_NOT_CACHED)
-				return posix_acl_dup(ei->i_default_acl);
+			acl = ext2_iget_acl(inode, &ei->i_default_acl);
+			if (acl != EXT2_ACL_NOT_CACHED)
+				return acl;
 			name_index = EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT;
 			break;
 
@@ -171,11 +197,11 @@ ext2_get_acl(struct inode *inode, int ty
 	if (!IS_ERR(acl)) {
 		switch(type) {
 			case ACL_TYPE_ACCESS:
-				ei->i_acl = posix_acl_dup(acl);
+				ext2_iset_acl(inode, &ei->i_acl, acl);
 				break;
 
 			case ACL_TYPE_DEFAULT:
-				ei->i_default_acl = posix_acl_dup(acl);
+				ext2_iset_acl(inode, &ei->i_default_acl, acl);
 				break;
 		}
 	}
@@ -240,23 +266,24 @@ ext2_set_acl(struct inode *inode, int ty
 	if (!error) {
 		switch(type) {
 			case ACL_TYPE_ACCESS:
-				if (ei->i_acl != EXT2_ACL_NOT_CACHED)
-					posix_acl_release(ei->i_acl);
-				ei->i_acl = posix_acl_dup(acl);
+				ext2_iset_acl(inode, &ei->i_acl, acl);
 				break;
 
 			case ACL_TYPE_DEFAULT:
-				if (ei->i_default_acl != EXT2_ACL_NOT_CACHED)
-					posix_acl_release(ei->i_default_acl);
-				ei->i_default_acl = posix_acl_dup(acl);
+				ext2_iset_acl(inode, &ei->i_default_acl, acl);
 				break;
 		}
 	}
 	return error;
 }
 
-static int
-__ext2_permission(struct inode *inode, int mask, int lock)
+/*
+ * Inode operation permission().
+ *
+ * inode->i_sem: don't care
+ */
+int
+ext2_permission(struct inode *inode, int mask, struct nameidata *nd)
 {
 	int mode = inode->i_mode;
 
@@ -270,30 +297,16 @@ __ext2_permission(struct inode *inode, i
 	if (current->fsuid == inode->i_uid) {
 		mode >>= 6;
 	} else if (test_opt(inode->i_sb, POSIX_ACL)) {
-		struct ext2_inode_info *ei = EXT2_I(inode);
+		struct posix_acl *acl;
 
 		/* The access ACL cannot grant access if the group class
 		   permission bits don't contain all requested permissions. */
 		if (((mode >> 3) & mask & S_IRWXO) != mask)
 			goto check_groups;
-		if (ei->i_acl == EXT2_ACL_NOT_CACHED) {
-			struct posix_acl *acl;
-
-			if (lock) {
-				down(&inode->i_sem);
-				acl = ext2_get_acl(inode, ACL_TYPE_ACCESS);
-				up(&inode->i_sem);
-			} else
-				acl = ext2_get_acl(inode, ACL_TYPE_ACCESS);
-
-			if (IS_ERR(acl))
-				return PTR_ERR(acl);
+		acl = ext2_get_acl(inode, ACL_TYPE_ACCESS);
+		if (acl) {
+			int error = posix_acl_permission(inode, acl, mask);
 			posix_acl_release(acl);
-			if (ei->i_acl == EXT2_ACL_NOT_CACHED)
-				return -EIO;
-		}
-		if (ei->i_acl) {
-			int error = posix_acl_permission(inode, ei->i_acl,mask);
 			if (error == -EACCES)
 				goto check_capabilities;
 			return error;
@@ -320,32 +333,10 @@ check_capabilities:
 }
 
 /*
- * Inode operation permission().
- *
- * inode->i_sem: up
- * BKL held [before 2.5.x]
- */
-int
-ext2_permission(struct inode *inode, int mask, struct nameidata *nd)
-{
-	return __ext2_permission(inode, mask, 1);
-}
-
-/*
- * Used internally if i_sem is already down.
- */
-int
-ext2_permission_locked(struct inode *inode, int mask)
-{
-	return __ext2_permission(inode, mask, 0);
-}
-
-/*
  * Initialize the ACLs of a new inode. Called from ext2_new_inode.
  *
  * dir->i_sem: down
  * inode->i_sem: up (access to inode is still exclusive)
- * BKL held [before 2.5.x] 
  */
 int
 ext2_init_acl(struct inode *inode, struct inode *dir)
@@ -405,7 +396,6 @@ cleanup:
  * file mode.
  *
  * inode->i_sem: down
- * BKL held [before 2.5.x]
  */
 int
 ext2_acl_chmod(struct inode *inode)
diff -puN fs/ext2/acl.h~xattr-fine-grained-locking fs/ext2/acl.h
--- 25/fs/ext2/acl.h~xattr-fine-grained-locking	2003-07-04 22:31:27.000000000 -0700
+++ 25-akpm/fs/ext2/acl.h	2003-07-04 22:31:27.000000000 -0700
@@ -60,7 +60,6 @@ static inline int ext2_acl_count(size_t 
 
 /* acl.c */
 extern int ext2_permission (struct inode *, int, struct nameidata *);
-extern int ext2_permission_locked (struct inode *, int);
 extern int ext2_acl_chmod (struct inode *);
 extern int ext2_init_acl (struct inode *, struct inode *);
 
diff -puN fs/ext2/ext2.h~xattr-fine-grained-locking fs/ext2/ext2.h
--- 25/fs/ext2/ext2.h~xattr-fine-grained-locking	2003-07-04 22:31:27.000000000 -0700
+++ 25-akpm/fs/ext2/ext2.h	2003-07-04 22:31:27.000000000 -0700
@@ -41,6 +41,16 @@ struct ext2_inode_info {
 	__u32	i_prealloc_block;
 	__u32	i_prealloc_count;
 	__u32	i_dir_start_lookup;
+#ifdef CONFIG_EXT2_FS_XATTR
+	/*
+	 * Extended attributes can be read independently of the main file
+	 * data. Taking i_sem even when reading would cause contention
+	 * between readers of EAs and writers of regular file data, so
+	 * instead we synchronize on xattr_sem when reading or changing
+	 * EAs.
+	 */
+	struct rw_semaphore xattr_sem;
+#endif
 #ifdef CONFIG_EXT2_FS_POSIX_ACL
 	struct posix_acl	*i_acl;
 	struct posix_acl	*i_default_acl;
diff -puN fs/ext2/super.c~xattr-fine-grained-locking fs/ext2/super.c
--- 25/fs/ext2/super.c~xattr-fine-grained-locking	2003-07-04 22:31:27.000000000 -0700
+++ 25-akpm/fs/ext2/super.c	2003-07-04 22:31:27.000000000 -0700
@@ -177,6 +177,9 @@ static void init_once(void * foo, kmem_c
 	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
 	    SLAB_CTOR_CONSTRUCTOR) {
 		rwlock_init(&ei->i_meta_lock);
+#ifdef CONFIG_EXT2_FS_XATTR
+		init_rwsem(&ei->xattr_sem);
+#endif
 		inode_init_once(&ei->vfs_inode);
 	}
 }
diff -puN fs/ext2/xattr.c~xattr-fine-grained-locking fs/ext2/xattr.c
--- 25/fs/ext2/xattr.c~xattr-fine-grained-locking	2003-07-04 22:31:27.000000000 -0700
+++ 25-akpm/fs/ext2/xattr.c	2003-07-04 22:31:27.000000000 -0700
@@ -42,13 +42,12 @@
  *
  * Locking strategy
  * ----------------
- * The VFS already holds the BKL and the inode->i_sem semaphore when any of
- * the xattr inode operations are called, so we are guaranteed that only one
- * processes accesses extended attributes of an inode at any time.
- *
- * For writing we also grab the ext2_xattr_sem semaphore. This ensures that
- * only a single process is modifying an extended attribute block, even
- * if the block is shared among inodes.
+ * EXT2_I(inode)->i_file_acl is protected by EXT2_I(inode)->xattr_sem.
+ * EA blocks are only changed if they are exclusive to an inode, so
+ * holding xattr_sem also means that nothing but the EA block's reference
+ * count will change. Multiple writers to an EA block are synchronized
+ * by the bh lock. No more than a single bh lock is held at any time
+ * to avoid deadlocks.
  */
 
 #include <linux/buffer_head.h>
@@ -57,7 +56,7 @@
 #include <linux/slab.h>
 #include <linux/mbcache.h>
 #include <linux/quotaops.h>
-#include <asm/semaphore.h>
+#include <linux/rwsem.h>
 #include "ext2.h"
 #include "xattr.h"
 #include "acl.h"
@@ -105,15 +104,6 @@ static void ext2_xattr_rehash(struct ext
 			      struct ext2_xattr_entry *);
 
 static struct mb_cache *ext2_xattr_cache;
-
-/*
- * If a file system does not share extended attributes among inodes,
- * we should not need the ext2_xattr_sem semaphore. However, the
- * filesystem may still contain shared blocks, so we always take
- * the lock.
- */
-
-static DECLARE_MUTEX(ext2_xattr_sem);
 static struct ext2_xattr_handler *ext2_xattr_handlers[EXT2_XATTR_INDEX_MAX];
 static rwlock_t ext2_handler_lock = RW_LOCK_UNLOCKED;
 
@@ -196,7 +186,7 @@ ext2_xattr_handler(int name_index)
 /*
  * Inode operation getxattr()
  *
- * dentry->d_inode->i_sem down
+ * dentry->d_inode->i_sem: don't care
  */
 ssize_t
 ext2_getxattr(struct dentry *dentry, const char *name,
@@ -204,39 +194,28 @@ ext2_getxattr(struct dentry *dentry, con
 {
 	struct ext2_xattr_handler *handler;
 	struct inode *inode = dentry->d_inode;
-	ssize_t error;
 
 	handler = ext2_xattr_resolve_name(&name);
 	if (!handler)
 		return -EOPNOTSUPP;
-	down(&inode->i_sem);
-	error = handler->get(inode, name, buffer, size);
-	up(&inode->i_sem);
-
-	return error;
+	return handler->get(inode, name, buffer, size);
 }
 
 /*
  * Inode operation listxattr()
  *
- * dentry->d_inode->i_sem down
+ * dentry->d_inode->i_sem: don't care
  */
 ssize_t
 ext2_listxattr(struct dentry *dentry, char *buffer, size_t size)
 {
-	ssize_t error;
-
-	down(&dentry->d_inode->i_sem);
-	error = ext2_xattr_list(dentry->d_inode, buffer, size);
-	up(&dentry->d_inode->i_sem);
-
-	return error;
+	return ext2_xattr_list(dentry->d_inode, buffer, size);
 }
 
 /*
  * Inode operation setxattr()
  *
- * dentry->d_inode->i_sem down
+ * dentry->d_inode->i_sem: down
  */
 int
 ext2_setxattr(struct dentry *dentry, const char *name,
@@ -256,7 +235,7 @@ ext2_setxattr(struct dentry *dentry, con
 /*
  * Inode operation removexattr()
  *
- * dentry->d_inode->i_sem down
+ * dentry->d_inode->i_sem: down
  */
 int
 ext2_removexattr(struct dentry *dentry, const char *name)
@@ -295,12 +274,15 @@ ext2_xattr_get(struct inode *inode, int 
 
 	if (name == NULL)
 		return -EINVAL;
+	down_read(&EXT2_I(inode)->xattr_sem);
+	error = -ENODATA;
 	if (!EXT2_I(inode)->i_file_acl)
-		return -ENODATA;
+		goto cleanup;
 	ea_idebug(inode, "reading block %d", EXT2_I(inode)->i_file_acl);
 	bh = sb_bread(inode->i_sb, EXT2_I(inode)->i_file_acl);
+	error = -EIO;
 	if (!bh)
-		return -EIO;
+		goto cleanup;
 	ea_bdebug(bh, "b_count=%d, refcount=%d",
 		atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount));
 	end = bh->b_data + bh->b_size;
@@ -365,6 +347,7 @@ found:
 
 cleanup:
 	brelse(bh);
+	up_read(&EXT2_I(inode)->xattr_sem);
 
 	return error;
 }
@@ -391,12 +374,15 @@ ext2_xattr_list(struct inode *inode, cha
 	ea_idebug(inode, "buffer=%p, buffer_size=%ld",
 		  buffer, (long)buffer_size);
 
+	down_read(&EXT2_I(inode)->xattr_sem);
+	error = 0;
 	if (!EXT2_I(inode)->i_file_acl)
-		return 0;
+		goto cleanup;
 	ea_idebug(inode, "reading block %d", EXT2_I(inode)->i_file_acl);
 	bh = sb_bread(inode->i_sb, EXT2_I(inode)->i_file_acl);
+	error = -EIO;
 	if (!bh)
-		return -EIO;
+		goto cleanup;
 	ea_bdebug(bh, "b_count=%d, refcount=%d",
 		atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount));
 	end = bh->b_data + bh->b_size;
@@ -449,6 +435,7 @@ bad_block:	ext2_error(inode->i_sb, "ext2
 
 cleanup:
 	brelse(bh);
+	up_read(&EXT2_I(inode)->xattr_sem);
 
 	return error;
 }
@@ -520,8 +507,7 @@ ext2_xattr_set(struct inode *inode, int 
 	name_len = strlen(name);
 	if (name_len > 255 || value_len > sb->s_blocksize)
 		return -ERANGE;
-	down(&ext2_xattr_sem);
-
+	down_write(&EXT2_I(inode)->xattr_sem);
 	if (EXT2_I(inode)->i_file_acl) {
 		/* The inode already has an extended attribute block. */
 		bh = sb_bread(sb, EXT2_I(inode)->i_file_acl);
@@ -614,12 +600,16 @@ bad_block:		ext2_error(sb, "ext2_xattr_s
 	/* Here we know that we can set the new attribute. */
 
 	if (header) {
+		/* assert(header == HDR(bh)); */
+		lock_buffer(bh);
 		if (header->h_refcount == cpu_to_le32(1)) {
 			ea_bdebug(bh, "modifying in-place");
 			ext2_xattr_cache_remove(bh);
+			/* keep the buffer locked while modifying it. */
 		} else {
 			int offset;
 
+			unlock_buffer(bh);
 			ea_bdebug(bh, "cloning");
 			header = kmalloc(bh->b_size, GFP_KERNEL);
 			error = -ENOMEM;
@@ -644,6 +634,8 @@ bad_block:		ext2_error(sb, "ext2_xattr_s
 		last = here = ENTRY(header+1);
 	}
 
+	/* Iff we are modifying the block in-place, bh is locked here. */
+
 	if (not_found) {
 		/* Insert the new name. */
 		size_t size = EXT2_XATTR_LEN(name_len);
@@ -714,9 +706,13 @@ bad_block:		ext2_error(sb, "ext2_xattr_s
 skip_replace:
 	if (IS_LAST_ENTRY(ENTRY(header+1))) {
 		/* This block is now empty. */
+		if (bh && header == HDR(bh))
+			unlock_buffer(bh);  /* we were modifying in-place. */
 		error = ext2_xattr_set2(inode, bh, NULL);
 	} else {
 		ext2_xattr_rehash(header, here);
+		if (bh && header == HDR(bh))
+			unlock_buffer(bh);  /* we were modifying in-place. */
 		error = ext2_xattr_set2(inode, bh, header);
 	}
 
@@ -724,7 +720,7 @@ cleanup:
 	brelse(bh);
 	if (!(bh && header == HDR(bh)))
 		kfree(header);
-	up(&ext2_xattr_sem);
+	up_write(&EXT2_I(inode)->xattr_sem);
 
 	return error;
 }
@@ -744,24 +740,28 @@ ext2_xattr_set2(struct inode *inode, str
 		new_bh = ext2_xattr_cache_find(inode, header);
 		if (new_bh) {
 			/*
-			 * We found an identical block in the cache.
-			 * The old block will be released after updating
-			 * the inode.
+			 * We found an identical block in the cache. The
+			 * block returned is locked. The old block will
+			 * be released after updating the inode.
 			 */
 			ea_bdebug(new_bh, "%s block %lu",
 				(old_bh == new_bh) ? "keeping" : "reusing",
 				(unsigned long) new_bh->b_blocknr);
 			
 			error = -EDQUOT;
-			if (DQUOT_ALLOC_BLOCK(inode, 1))
+			if (DQUOT_ALLOC_BLOCK(inode, 1)) {
+				unlock_buffer(new_bh);
 				goto cleanup;
+			}
 			
 			HDR(new_bh)->h_refcount = cpu_to_le32(
 				le32_to_cpu(HDR(new_bh)->h_refcount) + 1);
 			ea_bdebug(new_bh, "refcount now=%d",
 				le32_to_cpu(HDR(new_bh)->h_refcount));
+			unlock_buffer(new_bh);
 		} else if (old_bh && header == HDR(old_bh)) {
-			/* Keep this block. */
+			/* Keep this block. No need to lock the block as we
+			   don't need to change the reference count. */
 			new_bh = old_bh;
 			get_bh(new_bh);
 			ext2_xattr_cache_insert(new_bh);
@@ -812,12 +812,11 @@ ext2_xattr_set2(struct inode *inode, str
 	error = 0;
 	if (old_bh && old_bh != new_bh) {
 		/*
-		 * If there was an old block, and we are not still using it,
-		 * we now release the old block.
-		*/
-		unsigned int refcount = le32_to_cpu(HDR(old_bh)->h_refcount);
-
-		if (refcount == 1) {
+		 * If there was an old block and we are no longer using it,
+		 * release the old block.
+		 */
+		lock_buffer(old_bh);
+		if (HDR(old_bh)->h_refcount == cpu_to_le32(1)) {
 			/* Free the old block. */
 			ea_bdebug(old_bh, "freeing");
 			ext2_free_blocks(inode, old_bh->b_blocknr, 1);
@@ -827,12 +826,14 @@ ext2_xattr_set2(struct inode *inode, str
 			bforget(old_bh);
 		} else {
 			/* Decrement the refcount only. */
-			refcount--;
-			HDR(old_bh)->h_refcount = cpu_to_le32(refcount);
+			HDR(old_bh)->h_refcount = cpu_to_le32(
+				le32_to_cpu(HDR(old_bh)->h_refcount) - 1);
 			DQUOT_FREE_BLOCK(inode, 1);
 			mark_buffer_dirty(old_bh);
-			ea_bdebug(old_bh, "refcount now=%d", refcount);
+			ea_bdebug(old_bh, "refcount now=%d",
+				le32_to_cpu(HDR(old_bh)->h_refcount));
 		}
+		unlock_buffer(old_bh);
 	}
 
 cleanup:
@@ -850,12 +851,11 @@ cleanup:
 void
 ext2_xattr_delete_inode(struct inode *inode)
 {
-	struct buffer_head *bh;
+	struct buffer_head *bh = NULL;
 
+	down_write(&EXT2_I(inode)->xattr_sem);
 	if (!EXT2_I(inode)->i_file_acl)
-		return;
-	down(&ext2_xattr_sem);
-
+		goto cleanup;
 	bh = sb_bread(inode->i_sb, EXT2_I(inode)->i_file_acl);
 	if (!bh) {
 		ext2_error(inode->i_sb, "ext2_xattr_delete_inode",
@@ -871,7 +871,7 @@ ext2_xattr_delete_inode(struct inode *in
 			EXT2_I(inode)->i_file_acl);
 		goto cleanup;
 	}
-	ea_bdebug(bh, "refcount now=%d", le32_to_cpu(HDR(bh)->h_refcount) - 1);
+	lock_buffer(bh);
 	if (HDR(bh)->h_refcount == cpu_to_le32(1)) {
 		ext2_xattr_cache_remove(bh);
 		ext2_free_blocks(inode, EXT2_I(inode)->i_file_acl, 1);
@@ -885,11 +885,13 @@ ext2_xattr_delete_inode(struct inode *in
 			sync_dirty_buffer(bh);
 		DQUOT_FREE_BLOCK(inode, 1);
 	}
+	ea_bdebug(bh, "refcount now=%d", le32_to_cpu(HDR(bh)->h_refcount) - 1);
+	unlock_buffer(bh);
 	EXT2_I(inode)->i_file_acl = 0;
 
 cleanup:
 	brelse(bh);
-	up(&ext2_xattr_sem);
+	up_write(&EXT2_I(inode)->xattr_sem);
 }
 
 /*
@@ -982,8 +984,8 @@ ext2_xattr_cmp(struct ext2_xattr_header 
  *
  * Find an identical extended attribute block.
  *
- * Returns a pointer to the block found, or NULL if such a block was
- * not found or an error occurred.
+ * Returns a locked buffer head to the block found, or NULL if such
+ * a block was not found or an error occurred.
  */
 static struct buffer_head *
 ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header)
@@ -1003,18 +1005,23 @@ ext2_xattr_cache_find(struct inode *inod
 			ext2_error(inode->i_sb, "ext2_xattr_cache_find",
 				"inode %ld: block %ld read error",
 				inode->i_ino, (unsigned long) ce->e_block);
-		} else if (le32_to_cpu(HDR(bh)->h_refcount) >
-			   EXT2_XATTR_REFCOUNT_MAX) {
-			ea_idebug(inode, "block %ld refcount %d>%d",
-				  (unsigned long) ce->e_block,
-				  le32_to_cpu(HDR(bh)->h_refcount),
-				  EXT2_XATTR_REFCOUNT_MAX);
-		} else if (!ext2_xattr_cmp(header, HDR(bh))) {
-			ea_bdebug(bh, "b_count=%d",atomic_read(&(bh->b_count)));
-			mb_cache_entry_release(ce);
-			return bh;
+		} else {
+			lock_buffer(bh);
+			if (le32_to_cpu(HDR(bh)->h_refcount) >
+				   EXT2_XATTR_REFCOUNT_MAX) {
+				ea_idebug(inode, "block %ld refcount %d>%d",
+					  (unsigned long) ce->e_block,
+					  le32_to_cpu(HDR(bh)->h_refcount),
+					  EXT2_XATTR_REFCOUNT_MAX);
+			} else if (!ext2_xattr_cmp(header, HDR(bh))) {
+				ea_bdebug(bh, "b_count=%d",
+					  atomic_read(&(bh->b_count)));
+				mb_cache_entry_release(ce);
+				return bh;
+			}
+			unlock_buffer(bh);
+			brelse(bh);
 		}
-		brelse(bh);
 		ce = mb_cache_entry_find_next(ce, 0, inode->i_sb->s_bdev, hash);
 	}
 	return NULL;
diff -puN fs/ext2/xattr_user.c~xattr-fine-grained-locking fs/ext2/xattr_user.c
--- 25/fs/ext2/xattr_user.c~xattr-fine-grained-locking	2003-07-04 22:31:27.000000000 -0700
+++ 25-akpm/fs/ext2/xattr_user.c	2003-07-04 22:31:27.000000000 -0700
@@ -11,10 +11,6 @@
 #include "ext2.h"
 #include "xattr.h"
 
-#ifdef CONFIG_EXT2_FS_POSIX_ACL
-# include "acl.h"
-#endif
-
 #define XATTR_USER_PREFIX "user."
 
 static size_t
@@ -44,11 +40,7 @@ ext2_xattr_user_get(struct inode *inode,
 		return -EINVAL;
 	if (!test_opt(inode->i_sb, XATTR_USER))
 		return -EOPNOTSUPP;
-#ifdef CONFIG_EXT2_FS_POSIX_ACL
-	error = ext2_permission_locked(inode, MAY_READ);
-#else
 	error = permission(inode, MAY_READ, NULL);
-#endif
 	if (error)
 		return error;
 
@@ -68,11 +60,7 @@ ext2_xattr_user_set(struct inode *inode,
 	if ( !S_ISREG(inode->i_mode) &&
 	    (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX))
 		return -EPERM;
-#ifdef CONFIG_EXT2_FS_POSIX_ACL
-	error = ext2_permission_locked(inode, MAY_WRITE);
-#else
 	error = permission(inode, MAY_WRITE, NULL);
-#endif
 	if (error)
 		return error;
 
diff -puN fs/ext3/acl.c~xattr-fine-grained-locking fs/ext3/acl.c
--- 25/fs/ext3/acl.c~xattr-fine-grained-locking	2003-07-04 22:31:27.000000000 -0700
+++ 25-akpm/fs/ext3/acl.c	2003-07-04 22:31:27.000000000 -0700
@@ -125,10 +125,34 @@ fail:
 	return ERR_PTR(-EINVAL);
 }
 
+static inline struct posix_acl *
+ext3_iget_acl(struct inode *inode, struct posix_acl **i_acl)
+{
+	struct posix_acl *acl = EXT3_ACL_NOT_CACHED;
+
+	spin_lock(&inode->i_lock);
+	if (*i_acl != EXT3_ACL_NOT_CACHED)
+		acl = posix_acl_dup(*i_acl);
+	spin_unlock(&inode->i_lock);
+
+	return acl;
+}
+
+static inline void
+ext3_iset_acl(struct inode *inode, struct posix_acl **i_acl,
+                  struct posix_acl *acl)
+{
+	spin_lock(&inode->i_lock);
+	if (*i_acl != EXT3_ACL_NOT_CACHED)
+		posix_acl_release(*i_acl);
+	*i_acl = posix_acl_dup(acl);
+	spin_unlock(&inode->i_lock);
+}
+
 /*
  * Inode operation get_posix_acl().
  *
- * inode->i_sem: down
+ * inode->i_sem: don't care
  */
 static struct posix_acl *
 ext3_get_acl(struct inode *inode, int type)
@@ -145,14 +169,16 @@ ext3_get_acl(struct inode *inode, int ty
 
 	switch(type) {
 		case ACL_TYPE_ACCESS:
-			if (ei->i_acl != EXT3_ACL_NOT_CACHED)
-				return posix_acl_dup(ei->i_acl);
+			acl = ext3_iget_acl(inode, &ei->i_acl);
+			if (acl != EXT3_ACL_NOT_CACHED)
+				return acl;
 			name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS;
 			break;
 
 		case ACL_TYPE_DEFAULT:
-			if (ei->i_default_acl != EXT3_ACL_NOT_CACHED)
-				return posix_acl_dup(ei->i_default_acl);
+			acl = ext3_iget_acl(inode, &ei->i_default_acl);
+			if (acl != EXT3_ACL_NOT_CACHED)
+				return acl;
 			name_index = EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT;
 			break;
 
@@ -174,11 +200,11 @@ ext3_get_acl(struct inode *inode, int ty
 	if (!IS_ERR(acl)) {
 		switch(type) {
 			case ACL_TYPE_ACCESS:
-				ei->i_acl = posix_acl_dup(acl);
+				ext3_iset_acl(inode, &ei->i_acl, acl);
 				break;
 
 			case ACL_TYPE_DEFAULT:
-				ei->i_default_acl = posix_acl_dup(acl);
+				ext3_iset_acl(inode, &ei->i_default_acl, acl);
 				break;
 		}
 	}
@@ -245,23 +271,24 @@ ext3_set_acl(handle_t *handle, struct in
 	if (!error) {
 		switch(type) {
 			case ACL_TYPE_ACCESS:
-				if (ei->i_acl != EXT3_ACL_NOT_CACHED)
-					posix_acl_release(ei->i_acl);
-				ei->i_acl = posix_acl_dup(acl);
+				ext3_iset_acl(inode, &ei->i_acl, acl);
 				break;
 
 			case ACL_TYPE_DEFAULT:
-				if (ei->i_default_acl != EXT3_ACL_NOT_CACHED)
-					posix_acl_release(ei->i_default_acl);
-				ei->i_default_acl = posix_acl_dup(acl);
+				ext3_iset_acl(inode, &ei->i_default_acl, acl);
 				break;
 		}
 	}
 	return error;
 }
 
-static int
-__ext3_permission(struct inode *inode, int mask, int lock)
+/*
+ * Inode operation permission().
+ *
+ * inode->i_sem: don't care
+ */
+int
+ext3_permission(struct inode *inode, int mask, struct nameidata *nd)
 {
 	int mode = inode->i_mode;
 
@@ -275,30 +302,16 @@ __ext3_permission(struct inode *inode, i
 	if (current->fsuid == inode->i_uid) {
 		mode >>= 6;
 	} else if (test_opt(inode->i_sb, POSIX_ACL)) {
-		struct ext3_inode_info *ei = EXT3_I(inode);
+		struct posix_acl *acl;
 
 		/* The access ACL cannot grant access if the group class
 		   permission bits don't contain all requested permissions. */
 		if (((mode >> 3) & mask & S_IRWXO) != mask)
 			goto check_groups;
-		if (ei->i_acl == EXT3_ACL_NOT_CACHED) {
-			struct posix_acl *acl;
-
-			if (lock) {
-				down(&inode->i_sem);
-				acl = ext3_get_acl(inode, ACL_TYPE_ACCESS);
-				up(&inode->i_sem);
-			} else
-				acl = ext3_get_acl(inode, ACL_TYPE_ACCESS);
-
-			if (IS_ERR(acl))
-				return PTR_ERR(acl);
+		acl = ext3_get_acl(inode, ACL_TYPE_ACCESS);
+		if (acl) {
+			int error = posix_acl_permission(inode, acl, mask);
 			posix_acl_release(acl);
-			if (ei->i_acl == EXT3_ACL_NOT_CACHED)
-				return -EIO;
-		}
-		if (ei->i_acl) {
-			int error = posix_acl_permission(inode, ei->i_acl,mask);
 			if (error == -EACCES)
 				goto check_capabilities;
 			return error;
@@ -325,26 +338,6 @@ check_capabilities:
 }
 
 /*
- * Inode operation permission().
- *
- * inode->i_sem: up
- */
-int
-ext3_permission(struct inode *inode, int mask, struct nameidata *nd)
-{
-	return __ext3_permission(inode, mask, 1);
-}
-
-/*
- * Used internally if i_sem is already down.
- */
-int
-ext3_permission_locked(struct inode *inode, int mask)
-{
-	return __ext3_permission(inode, mask, 0);
-}
-
-/*
  * Initialize the ACLs of a new inode. Called from ext3_new_inode.
  *
  * dir->i_sem: down
diff -puN fs/ext3/acl.h~xattr-fine-grained-locking fs/ext3/acl.h
--- 25/fs/ext3/acl.h~xattr-fine-grained-locking	2003-07-04 22:31:27.000000000 -0700
+++ 25-akpm/fs/ext3/acl.h	2003-07-04 22:31:27.000000000 -0700
@@ -60,7 +60,6 @@ static inline int ext3_acl_count(size_t 
 
 /* acl.c */
 extern int ext3_permission (struct inode *, int, struct nameidata *);
-extern int ext3_permission_locked (struct inode *, int);
 extern int ext3_acl_chmod (struct inode *);
 extern int ext3_init_acl (handle_t *, struct inode *, struct inode *);
 
diff -puN fs/ext3/super.c~xattr-fine-grained-locking fs/ext3/super.c
--- 25/fs/ext3/super.c~xattr-fine-grained-locking	2003-07-04 22:31:27.000000000 -0700
+++ 25-akpm/fs/ext3/super.c	2003-07-04 22:31:27.000000000 -0700
@@ -519,6 +519,9 @@ static void init_once(void * foo, kmem_c
 	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
 	    SLAB_CTOR_CONSTRUCTOR) {
 		INIT_LIST_HEAD(&ei->i_orphan);
+#ifdef CONFIG_EXT3_FS_XATTR
+		init_rwsem(&ei->xattr_sem);
+#endif
 		init_rwsem(&ei->truncate_sem);
 		inode_init_once(&ei->vfs_inode);
 	}
diff -puN fs/ext3/xattr.c~xattr-fine-grained-locking fs/ext3/xattr.c
--- 25/fs/ext3/xattr.c~xattr-fine-grained-locking	2003-07-04 22:31:27.000000000 -0700
+++ 25-akpm/fs/ext3/xattr.c	2003-07-04 22:31:27.000000000 -0700
@@ -43,13 +43,12 @@
  *
  * Locking strategy
  * ----------------
- * The VFS holdsinode->i_sem semaphore when any of the xattr inode
- * operations are called, so we are guaranteed that only one
- * processes accesses extended attributes of an inode at any time.
- *
- * For writing we also grab the ext3_xattr_sem semaphore. This ensures that
- * only a single process is modifying an extended attribute block, even
- * if the block is shared among inodes.
+ * EXT3_I(inode)->i_file_acl is protected by EXT3_I(inode)->xattr_sem.
+ * EA blocks are only changed if they are exclusive to an inode, so
+ * holding xattr_sem also means that nothing but the EA block's reference
+ * count will change. Multiple writers to an EA block are synchronized
+ * by the bh lock. No more than a single bh lock is held at any time
+ * to avoid deadlocks.
  */
 
 #include <linux/init.h>
@@ -59,7 +58,7 @@
 #include <linux/ext3_fs.h>
 #include <linux/mbcache.h>
 #include <linux/quotaops.h>
-#include <asm/semaphore.h>
+#include <linux/rwsem.h>
 #include "xattr.h"
 #include "acl.h"
 
@@ -93,22 +92,14 @@ static int ext3_xattr_set_handle2(handle
 				  struct ext3_xattr_header *);
 
 static int ext3_xattr_cache_insert(struct buffer_head *);
-static struct buffer_head *ext3_xattr_cache_find(struct inode *,
-						 struct ext3_xattr_header *);
+static struct buffer_head *ext3_xattr_cache_find(handle_t *, struct inode *,
+						 struct ext3_xattr_header *,
+						 int *);
 static void ext3_xattr_cache_remove(struct buffer_head *);
 static void ext3_xattr_rehash(struct ext3_xattr_header *,
 			      struct ext3_xattr_entry *);
 
 static struct mb_cache *ext3_xattr_cache;
-
-/*
- * If a file system does not share extended attributes among inodes,
- * we should not need the ext3_xattr_sem semaphore. However, the
- * filesystem may still contain shared blocks, so we always take
- * the lock.
- */
-
-static DECLARE_MUTEX(ext3_xattr_sem);
 static struct ext3_xattr_handler *ext3_xattr_handlers[EXT3_XATTR_INDEX_MAX];
 static rwlock_t ext3_handler_lock = RW_LOCK_UNLOCKED;
 
@@ -191,7 +182,7 @@ ext3_xattr_handler(int name_index)
 /*
  * Inode operation getxattr()
  *
- * dentry->d_inode->i_sem down
+ * dentry->d_inode->i_sem: don't care
  */
 ssize_t
 ext3_getxattr(struct dentry *dentry, const char *name,
@@ -199,39 +190,28 @@ ext3_getxattr(struct dentry *dentry, con
 {
 	struct ext3_xattr_handler *handler;
 	struct inode *inode = dentry->d_inode;
-	ssize_t error;
 
 	handler = ext3_xattr_resolve_name(&name);
 	if (!handler)
 		return -EOPNOTSUPP;
-	down(&inode->i_sem);
-	error = handler->get(inode, name, buffer, size);
-	up(&inode->i_sem);
-
-	return error;
+	return handler->get(inode, name, buffer, size);
 }
 
 /*
  * Inode operation listxattr()
  *
- * dentry->d_inode->i_sem down
+ * dentry->d_inode->i_sem: don't care
  */
 ssize_t
 ext3_listxattr(struct dentry *dentry, char *buffer, size_t size)
 {
-	ssize_t error;
-
-	down(&dentry->d_inode->i_sem);
-	error = ext3_xattr_list(dentry->d_inode, buffer, size);
-	up(&dentry->d_inode->i_sem);
-
-	return error;
+	return ext3_xattr_list(dentry->d_inode, buffer, size);
 }
 
 /*
  * Inode operation setxattr()
  *
- * dentry->d_inode->i_sem down
+ * dentry->d_inode->i_sem: down
  */
 int
 ext3_setxattr(struct dentry *dentry, const char *name,
@@ -251,7 +231,7 @@ ext3_setxattr(struct dentry *dentry, con
 /*
  * Inode operation removexattr()
  *
- * dentry->d_inode->i_sem down
+ * dentry->d_inode->i_sem: down
  */
 int
 ext3_removexattr(struct dentry *dentry, const char *name)
@@ -290,12 +270,15 @@ ext3_xattr_get(struct inode *inode, int 
 
 	if (name == NULL)
 		return -EINVAL;
+	down_read(&EXT3_I(inode)->xattr_sem);
+	error = -ENODATA;
 	if (!EXT3_I(inode)->i_file_acl)
-		return -ENODATA;
+		goto cleanup;
 	ea_idebug(inode, "reading block %d", EXT3_I(inode)->i_file_acl);
 	bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
+	error = -EIO;
 	if (!bh)
-		return -EIO;
+		goto cleanup;
 	ea_bdebug(bh, "b_count=%d, refcount=%d",
 		atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount));
 	end = bh->b_data + bh->b_size;
@@ -360,6 +343,7 @@ found:
 
 cleanup:
 	brelse(bh);
+	up_read(&EXT3_I(inode)->xattr_sem);
 
 	return error;
 }
@@ -386,12 +370,15 @@ ext3_xattr_list(struct inode *inode, cha
 	ea_idebug(inode, "buffer=%p, buffer_size=%ld",
 		  buffer, (long)buffer_size);
 
+	down_read(&EXT3_I(inode)->xattr_sem);
+	error = 0;
 	if (!EXT3_I(inode)->i_file_acl)
-		return 0;
+		goto cleanup;
 	ea_idebug(inode, "reading block %d", EXT3_I(inode)->i_file_acl);
 	bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
+	error = -EIO;
 	if (!bh)
-		return -EIO;
+		goto cleanup;
 	ea_bdebug(bh, "b_count=%d, refcount=%d",
 		atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount));
 	end = bh->b_data + bh->b_size;
@@ -444,6 +431,7 @@ bad_block:	ext3_error(inode->i_sb, "ext3
 
 cleanup:
 	brelse(bh);
+	up_read(&EXT3_I(inode)->xattr_sem);
 
 	return error;
 }
@@ -459,11 +447,12 @@ static void ext3_xattr_update_super_bloc
 		return;
 
 	lock_super(sb);
-	ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
-	EXT3_SB(sb)->s_es->s_feature_compat |=
-		cpu_to_le32(EXT3_FEATURE_COMPAT_EXT_ATTR);
-	sb->s_dirt = 1;
-	ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
+	if (ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh) == 0) {
+		EXT3_SB(sb)->s_es->s_feature_compat |=
+			cpu_to_le32(EXT3_FEATURE_COMPAT_EXT_ATTR);
+		sb->s_dirt = 1;
+		ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
+	}
 	unlock_super(sb);
 }
 
@@ -518,8 +507,7 @@ ext3_xattr_set_handle(handle_t *handle, 
 	name_len = strlen(name);
 	if (name_len > 255 || value_len > sb->s_blocksize)
 		return -ERANGE;
-	down(&ext3_xattr_sem);
-
+	down_write(&EXT3_I(inode)->xattr_sem);
 	if (EXT3_I(inode)->i_file_acl) {
 		/* The inode already has an extended attribute block. */
 		bh = sb_bread(sb, EXT3_I(inode)->i_file_acl);
@@ -612,15 +600,28 @@ bad_block:		ext3_error(sb, "ext3_xattr_s
 	/* Here we know that we can set the new attribute. */
 
 	if (header) {
+		int credits = 0;
+
+		/* assert(header == HDR(bh)); */
+		if (header->h_refcount != cpu_to_le32(1))
+			goto skip_get_write_access;
+		/* ext3_journal_get_write_access() requires an unlocked bh,
+		   which complicates things here. */
+		error = ext3_journal_get_write_access_credits(handle, bh,
+							      &credits);
+		if (error)
+			goto cleanup;
+		lock_buffer(bh);
 		if (header->h_refcount == cpu_to_le32(1)) {
 			ea_bdebug(bh, "modifying in-place");
 			ext3_xattr_cache_remove(bh);
-			error = ext3_journal_get_write_access(handle, bh);
-			if (error)
-				goto cleanup;
+			/* keep the buffer locked while modifying it. */
 		} else {
 			int offset;
 
+			unlock_buffer(bh);
+			journal_release_buffer(handle, bh, credits);
+		skip_get_write_access:
 			ea_bdebug(bh, "cloning");
 			header = kmalloc(bh->b_size, GFP_KERNEL);
 			error = -ENOMEM;
@@ -645,6 +646,8 @@ bad_block:		ext3_error(sb, "ext3_xattr_s
 		last = here = ENTRY(header+1);
 	}
 
+	/* Iff we are modifying the block in-place, bh is locked here. */
+
 	if (not_found) {
 		/* Insert the new name. */
 		size_t size = EXT3_XATTR_LEN(name_len);
@@ -715,9 +718,13 @@ bad_block:		ext3_error(sb, "ext3_xattr_s
 skip_replace:
 	if (IS_LAST_ENTRY(ENTRY(header+1))) {
 		/* This block is now empty. */
+		if (bh && header == HDR(bh))
+			unlock_buffer(bh);  /* we were modifying in-place. */
 		error = ext3_xattr_set_handle2(handle, inode, bh, NULL);
 	} else {
 		ext3_xattr_rehash(header, here);
+		if (bh && header == HDR(bh))
+			unlock_buffer(bh);  /* we were modifying in-place. */
 		error = ext3_xattr_set_handle2(handle, inode, bh, header);
 	}
 
@@ -725,7 +732,7 @@ cleanup:
 	brelse(bh);
 	if (!(bh && header == HDR(bh)))
 		kfree(header);
-	up(&ext3_xattr_sem);
+	up_write(&EXT3_I(inode)->xattr_sem);
 
 	return error;
 }
@@ -740,33 +747,34 @@ ext3_xattr_set_handle2(handle_t *handle,
 {
 	struct super_block *sb = inode->i_sb;
 	struct buffer_head *new_bh = NULL;
-	int error;
+	int credits = 0, error;
 
 	if (header) {
-		new_bh = ext3_xattr_cache_find(inode, header);
+		new_bh = ext3_xattr_cache_find(handle, inode, header, &credits);
 		if (new_bh) {
 			/*
-			 * We found an identical block in the cache.
-			 * The old block will be released after updating
-			 * the inode.
+			 * We found an identical block in the cache. The
+			 * block returned is locked. The old block will
+			 * be released after updating the inode.
 			 */
 			ea_bdebug(new_bh, "%s block %lu",
 				(old_bh == new_bh) ? "keeping" : "reusing",
 				(unsigned long) new_bh->b_blocknr);
 
 			error = -EDQUOT;
-			if (DQUOT_ALLOC_BLOCK(inode, 1))
-				goto cleanup;
-
-			error = ext3_journal_get_write_access(handle, new_bh);
-			if (error)
+			if (DQUOT_ALLOC_BLOCK(inode, 1)) {
+				unlock_buffer(new_bh);
+				journal_release_buffer(handle, new_bh, credits);
 				goto cleanup;
+			}
 			HDR(new_bh)->h_refcount = cpu_to_le32(
 				le32_to_cpu(HDR(new_bh)->h_refcount) + 1);
 			ea_bdebug(new_bh, "refcount now=%d",
 				le32_to_cpu(HDR(new_bh)->h_refcount));
+			unlock_buffer(new_bh);
 		} else if (old_bh && header == HDR(old_bh)) {
-			/* Keep this block. */
+			/* Keep this block. No need to lock the block as we
+			 * don't need to change the reference count. */
 			new_bh = old_bh;
 			get_bh(new_bh);
 			ext3_xattr_cache_insert(new_bh);
@@ -817,15 +825,14 @@ getblk_failed:
 	error = 0;
 	if (old_bh && old_bh != new_bh) {
 		/*
-		 * If there was an old block, and we are not still using it,
-		 * we now release the old block.
+		 * If there was an old block, and we are no longer using it,
+		 * release the old block.
 		*/
-		unsigned int refcount = le32_to_cpu(HDR(old_bh)->h_refcount);
-
 		error = ext3_journal_get_write_access(handle, old_bh);
 		if (error)
 			goto cleanup;
-		if (refcount == 1) {
+		lock_buffer(old_bh);
+		if (HDR(old_bh)->h_refcount == cpu_to_le32(1)) {
 			/* Free the old block. */
 			ea_bdebug(old_bh, "freeing");
 			ext3_free_blocks(handle, inode, old_bh->b_blocknr, 1);
@@ -837,12 +844,14 @@ getblk_failed:
 			ext3_forget(handle, 1, inode, old_bh,old_bh->b_blocknr);
 		} else {
 			/* Decrement the refcount only. */
-			refcount--;
-			HDR(old_bh)->h_refcount = cpu_to_le32(refcount);
+			HDR(old_bh)->h_refcount = cpu_to_le32(
+				le32_to_cpu(HDR(old_bh)->h_refcount) - 1);
 			DQUOT_FREE_BLOCK(inode, 1);
 			ext3_journal_dirty_metadata(handle, old_bh);
-			ea_bdebug(old_bh, "refcount now=%d", refcount);
+			ea_bdebug(old_bh, "refcount now=%d",
+				le32_to_cpu(HDR(old_bh)->h_refcount));
 		}
+		unlock_buffer(old_bh);
 	}
 
 cleanup:
@@ -886,12 +895,11 @@ ext3_xattr_set(struct inode *inode, int 
 void
 ext3_xattr_delete_inode(handle_t *handle, struct inode *inode)
 {
-	struct buffer_head *bh;
+	struct buffer_head *bh = NULL;
 
+	down_write(&EXT3_I(inode)->xattr_sem);
 	if (!EXT3_I(inode)->i_file_acl)
-		return;
-	down(&ext3_xattr_sem);
-
+		goto cleanup;
 	bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl);
 	if (!bh) {
 		ext3_error(inode->i_sb, "ext3_xattr_delete_inode",
@@ -899,7 +907,6 @@ ext3_xattr_delete_inode(handle_t *handle
 			EXT3_I(inode)->i_file_acl);
 		goto cleanup;
 	}
-	ea_bdebug(bh, "b_count=%d", atomic_read(&(bh->b_count)));
 	if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
 	    HDR(bh)->h_blocks != cpu_to_le32(1)) {
 		ext3_error(inode->i_sb, "ext3_xattr_delete_inode",
@@ -907,8 +914,9 @@ ext3_xattr_delete_inode(handle_t *handle
 			EXT3_I(inode)->i_file_acl);
 		goto cleanup;
 	}
-	ext3_journal_get_write_access(handle, bh);
-	ea_bdebug(bh, "refcount now=%d", le32_to_cpu(HDR(bh)->h_refcount) - 1);
+	if (ext3_journal_get_write_access(handle, bh) != 0)
+		goto cleanup;
+	lock_buffer(bh);
 	if (HDR(bh)->h_refcount == cpu_to_le32(1)) {
 		ext3_xattr_cache_remove(bh);
 		ext3_free_blocks(handle, inode, EXT3_I(inode)->i_file_acl, 1);
@@ -922,11 +930,13 @@ ext3_xattr_delete_inode(handle_t *handle
 			handle->h_sync = 1;
 		DQUOT_FREE_BLOCK(inode, 1);
 	}
+	ea_bdebug(bh, "refcount now=%d", le32_to_cpu(HDR(bh)->h_refcount) - 1);
+	unlock_buffer(bh);
 	EXT3_I(inode)->i_file_acl = 0;
 
 cleanup:
 	brelse(bh);
-	up(&ext3_xattr_sem);
+	up_write(&EXT3_I(inode)->xattr_sem);
 }
 
 /*
@@ -1022,7 +1032,8 @@ ext3_xattr_cmp(struct ext3_xattr_header 
  * not found or an error occurred.
  */
 static struct buffer_head *
-ext3_xattr_cache_find(struct inode *inode, struct ext3_xattr_header *header)
+ext3_xattr_cache_find(handle_t *handle, struct inode *inode,
+		      struct ext3_xattr_header *header, int *credits)
 {
 	__u32 hash = le32_to_cpu(header->h_hash);
 	struct mb_cache_entry *ce;
@@ -1039,18 +1050,28 @@ ext3_xattr_cache_find(struct inode *inod
 			ext3_error(inode->i_sb, "ext3_xattr_cache_find",
 				"inode %ld: block %ld read error",
 				inode->i_ino, (unsigned long) ce->e_block);
-		} else if (le32_to_cpu(HDR(bh)->h_refcount) >
-			   EXT3_XATTR_REFCOUNT_MAX) {
-			ea_idebug(inode, "block %ld refcount %d>%d",
-				  (unsigned long) ce->e_block,
-				  le32_to_cpu(HDR(bh)->h_refcount),
-				  EXT3_XATTR_REFCOUNT_MAX);
-		} else if (!ext3_xattr_cmp(header, HDR(bh))) {
-			ea_bdebug(bh, "b_count=%d",atomic_read(&(bh->b_count)));
-			mb_cache_entry_release(ce);
-			return bh;
+		} else {
+			/* ext3_journal_get_write_access() requires an unlocked
+			 * bh, which complicates things here. */
+			if (ext3_journal_get_write_access_credits(handle, bh,
+								  credits) != 0)
+				return NULL;
+			lock_buffer(bh);
+			if (le32_to_cpu(HDR(bh)->h_refcount) >
+				   EXT3_XATTR_REFCOUNT_MAX) {
+				ea_idebug(inode, "block %ld refcount %d>%d",
+					  (unsigned long) ce->e_block,
+					  le32_to_cpu(HDR(bh)->h_refcount),
+					  EXT3_XATTR_REFCOUNT_MAX);
+			} else if (!ext3_xattr_cmp(header, HDR(bh))) {
+				mb_cache_entry_release(ce);
+				/* buffer will be unlocked by caller */
+				return bh;
+			}
+			unlock_buffer(bh);
+			journal_release_buffer(handle, bh, *credits);
+			brelse(bh);
 		}
-		brelse(bh);
 		ce = mb_cache_entry_find_next(ce, 0, inode->i_sb->s_bdev, hash);
 	}
 	return NULL;
diff -puN fs/ext3/xattr_user.c~xattr-fine-grained-locking fs/ext3/xattr_user.c
--- 25/fs/ext3/xattr_user.c~xattr-fine-grained-locking	2003-07-04 22:31:27.000000000 -0700
+++ 25-akpm/fs/ext3/xattr_user.c	2003-07-04 22:31:27.000000000 -0700
@@ -13,10 +13,6 @@
 #include <linux/ext3_fs.h>
 #include "xattr.h"
 
-#ifdef CONFIG_EXT3_FS_POSIX_ACL
-# include "acl.h"
-#endif
-
 #define XATTR_USER_PREFIX "user."
 
 static size_t
@@ -46,11 +42,7 @@ ext3_xattr_user_get(struct inode *inode,
 		return -EINVAL;
 	if (!test_opt(inode->i_sb, XATTR_USER))
 		return -EOPNOTSUPP;
-#ifdef CONFIG_EXT3_FS_POSIX_ACL
-	error = ext3_permission_locked(inode, MAY_READ);
-#else
 	error = permission(inode, MAY_READ, NULL);
-#endif
 	if (error)
 		return error;
 
@@ -70,11 +62,7 @@ ext3_xattr_user_set(struct inode *inode,
 	if ( !S_ISREG(inode->i_mode) &&
 	    (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX))
 		return -EPERM;
-#ifdef CONFIG_EXT3_FS_POSIX_ACL
-	error = ext3_permission_locked(inode, MAY_WRITE);
-#else
 	error = permission(inode, MAY_WRITE, NULL);
-#endif
 	if (error)
 		return error;
 
diff -puN include/linux/ext3_fs_i.h~xattr-fine-grained-locking include/linux/ext3_fs_i.h
--- 25/include/linux/ext3_fs_i.h~xattr-fine-grained-locking	2003-07-04 22:31:27.000000000 -0700
+++ 25-akpm/include/linux/ext3_fs_i.h	2003-07-04 22:31:27.000000000 -0700
@@ -62,6 +62,16 @@ struct ext3_inode_info {
 	__u32	i_prealloc_count;
 #endif
 	__u32	i_dir_start_lookup;
+#ifdef CONFIG_EXT3_FS_XATTR
+	/*
+	 * Extended attributes can be read independently of the main file
+	 * data. Taking i_sem even when reading would cause contention
+	 * between readers of EAs and writers of regular file data, so
+	 * instead we synchronize on xattr_sem when reading or changing
+	 * EAs.
+	 */
+	struct rw_semaphore xattr_sem;
+#endif
 #ifdef CONFIG_EXT3_FS_POSIX_ACL
 	struct posix_acl	*i_acl;
 	struct posix_acl	*i_default_acl;

_
