* locking semantics of the file system using the protocol.  It should 
  * be somewhere else, I'm sure, but right now it isn't.
  *
+ * New in version 8:
+ *     - Replace delete inode votes with a cluster lock
+ *
  * New in version 7:
  *     - DLM join domain includes the live nodemap
  *
  *     - full 64 bit i_size in the metadata lock lvbs
  *     - introduction of "rw" lock and pushing meta/data locking down
  */
-#define O2NET_PROTOCOL_VERSION 7ULL
+#define O2NET_PROTOCOL_VERSION 8ULL
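+/*
+ * Nodes exchange protocol_version in this handshake and refuse to connect
+ * when the versions differ, so bumping the version above keeps nodes
+ * without the open lock changes out of the cluster.
+ */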
 struct o2net_handshake {
        __be64  protocol_version;
        __be64  connector_id;
 
        .flags          = 0,
 };
 
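+/*
+ * The open lock carries no lvb and needs no special downconvert handling,
+ * so only the ->get_osb callback is provided.
+ */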
+static struct ocfs2_lock_res_ops ocfs2_inode_open_lops = {
+       .get_osb        = ocfs2_get_inode_osb,
+       .flags          = 0,
+};
+
 static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
 {
        return lockres->l_type == OCFS2_LOCK_TYPE_META ||
                lockres->l_type == OCFS2_LOCK_TYPE_DATA ||
-               lockres->l_type == OCFS2_LOCK_TYPE_RW;
+               lockres->l_type == OCFS2_LOCK_TYPE_RW ||
+               lockres->l_type == OCFS2_LOCK_TYPE_OPEN;
 }
 
 static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres)
                case OCFS2_LOCK_TYPE_DATA:
                        ops = &ocfs2_inode_data_lops;
                        break;
+               case OCFS2_LOCK_TYPE_OPEN:
+                       ops = &ocfs2_inode_open_lops;
+                       break;
                default:
                        mlog_bug_on_msg(1, "type: %d\n", type);
                        ops = NULL; /* thanks, gcc */
                goto bail;
        }
 
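+       /* Create the open lock for this newly allocated inode as well. */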
+       ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_open_lockres, 0, 0);
+       if (ret) {
+               mlog_errno(ret);
+               goto bail;
+       }
+
 bail:
        mlog_exit(ret);
        return ret;
        mlog_exit_void();
 }
 
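+/*
+ * Open lock protocol: every node takes the open lock at PR mode when it
+ * reads an inode in and keeps it for as long as the inode stays in
+ * memory. A node that wants to physically delete an orphaned inode
+ * attempts an EX trylock; success means no other node has the inode
+ * open, while -EAGAIN means it is still in use somewhere in the cluster.
+ */
+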
+/*
+ * ocfs2_open_lock always takes a PR mode lock.
+ */
+int ocfs2_open_lock(struct inode *inode)
+{
+       int status = 0;
+       struct ocfs2_lock_res *lockres;
+       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+       BUG_ON(!inode);
+
+       mlog_entry_void();
+
+       mlog(0, "inode %llu taking PRMODE open lock\n",
+            (unsigned long long)OCFS2_I(inode)->ip_blkno);
+
+       if (ocfs2_mount_local(osb))
+               goto out;
+
+       lockres = &OCFS2_I(inode)->ip_open_lockres;
+
+       status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
+                                   LKM_PRMODE, 0, 0);
+       if (status < 0)
+               mlog_errno(status);
+
+out:
+       mlog_exit(status);
+       return status;
+}
+
+int ocfs2_try_open_lock(struct inode *inode, int write)
+{
+       int status = 0, level;
+       struct ocfs2_lock_res *lockres;
+       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+       BUG_ON(!inode);
+
+       mlog_entry_void();
+
+       mlog(0, "inode %llu trying to take %s open lock\n",
+            (unsigned long long)OCFS2_I(inode)->ip_blkno,
+            write ? "EXMODE" : "PRMODE");
+
+       if (ocfs2_mount_local(osb))
+               goto out;
+
+       lockres = &OCFS2_I(inode)->ip_open_lockres;
+
+       level = write ? LKM_EXMODE : LKM_PRMODE;
+
+       /*
+        * The file system may already be holding a PRMODE/EXMODE open lock.
+        * Since we pass LKM_NOQUEUE, the request won't block waiting on
+        * other nodes, and a return of -EAGAIN tells the caller that
+        * this inode is still in use.
+        */
+       status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
+                                   level, LKM_NOQUEUE, 0);
+
+out:
+       mlog_exit(status);
+       return status;
+}
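+
+/*
+ * Typical ocfs2_try_open_lock() usage, as in ocfs2_request_delete():
+ *
+ *     status = ocfs2_try_open_lock(inode, 1);
+ *     if (status < 0 && status != -EAGAIN)
+ *             mlog_errno(status);
+ *
+ * -EAGAIN is not treated as an error by callers; it simply means that
+ * another node still has the inode open.
+ */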
+
+/*
+ * ocfs2_open_unlock drops whichever PR and EX mode open locks are held.
+ */
+void ocfs2_open_unlock(struct inode *inode)
+{
+       struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_open_lockres;
+       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+       mlog_entry_void();
+
+       mlog(0, "inode %llu dropping open lock\n",
+            (unsigned long long)OCFS2_I(inode)->ip_blkno);
+
+       if (ocfs2_mount_local(osb))
+               goto out;
+
+       if (lockres->l_ro_holders)
+               ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
+                                    LKM_PRMODE);
+       if (lockres->l_ex_holders)
+               ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
+                                    LKM_EXMODE);
+
+out:
+       mlog_exit_void();
+}
+
 int ocfs2_data_lock_full(struct inode *inode,
                         int write,
                         int arg_flags)
         * ocfs2_clear_inode has done it for us. */
 
        err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
-                             &OCFS2_I(inode)->ip_data_lockres);
+                             &OCFS2_I(inode)->ip_open_lockres);
        if (err < 0)
                mlog_errno(err);
 
        status = err;
 
+       err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
+                             &OCFS2_I(inode)->ip_data_lockres);
+       if (err < 0)
+               mlog_errno(err);
+       if (err < 0 && !status)
+               status = err;
+
        err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
                              &OCFS2_I(inode)->ip_meta_lockres);
        if (err < 0)
 
                       int write);
 int ocfs2_rw_lock(struct inode *inode, int write);
 void ocfs2_rw_unlock(struct inode *inode, int write);
+int ocfs2_open_lock(struct inode *inode);
+int ocfs2_try_open_lock(struct inode *inode, int write);
+void ocfs2_open_unlock(struct inode *inode);
 int ocfs2_meta_lock_atime(struct inode *inode,
                          struct vfsmount *vfsmnt,
                          int *level);
 
                     (unsigned long long)fe->i_blkno);
 
        OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
-       OCFS2_I(inode)->ip_orphaned_slot = OCFS2_INVALID_SLOT;
        OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
 
        inode->i_nlink = le16_to_cpu(fe->i_links_count);
 
                ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres,
                                          OCFS2_LOCK_TYPE_META, 0, inode);
+
+               ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres,
+                                         OCFS2_LOCK_TYPE_OPEN, 0, inode);
        }
 
        ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres,
         * cluster lock before trusting anything anyway.
         */
        can_lock = !(args->fi_flags & OCFS2_FI_FLAG_SYSFILE)
-               && !(args->fi_flags & OCFS2_FI_FLAG_NOLOCK)
+               && !(args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY)
                && !ocfs2_mount_local(osb);
 
        /*
                                  OCFS2_LOCK_TYPE_META,
                                  generation, inode);
 
+       ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres,
+                                 OCFS2_LOCK_TYPE_OPEN,
+                                 0, inode);
+
        if (can_lock) {
+               status = ocfs2_open_lock(inode);
+               if (status) {
+                       make_bad_inode(inode);
+                       mlog_errno(status);
+                       return status;
+               }
                status = ocfs2_meta_lock(inode, NULL, 0);
                if (status) {
                        make_bad_inode(inode);
                }
        }
 
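+       /* can_lock is false for orphan recovery, so the open lock was not
+        * taken above. Use a trylock here instead; it will never block
+        * waiting on another node. */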
+       if (args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) {
+               status = ocfs2_try_open_lock(inode, 0);
+               if (status) {
+                       make_bad_inode(inode);
+                       return status;
+               }
+       }
+
        status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0,
                                  can_lock ? inode : NULL);
        if (status < 0) {
        struct inode *orphan_dir_inode = NULL;
        struct buffer_head *orphan_dir_bh = NULL;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+       struct ocfs2_dinode *di;
 
-       /* We've already voted on this so it should be readonly - no
-        * spinlock needed. */
-       orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
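+       /* The slot an inode was orphaned into is now recorded in the dinode
+        * itself (set in ocfs2_orphan_add()) rather than tracked via votes. */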
+       di = (struct ocfs2_dinode *) di_bh->b_data;
+       orphaned_slot = le16_to_cpu(di->i_orphaned_slot);
 
        status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot);
        if (status)
        return ret;
 }
 
+static int ocfs2_request_delete(struct inode *inode)
+{
+       int status = 0;
+       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+       if (ocfs2_inode_is_new(inode))
+               return 0;
+
+       if (ocfs2_node_map_is_only(osb, &osb->mounted_map,
+                                  osb->node_num))
+               return 0;
+
+       /*
+        * This is how ocfs2 determines whether an inode is still live
+        * within the cluster. Every node takes a shared read lock on
+        * the inode open lock in ocfs2_read_locked_inode(). When we
+        * get to ->delete_inode(), each node tries to convert its
+        * lock to an exclusive. Trylocks are serialized by the inode
+        * meta data lock. If the upconvert succeeds, we know the inode
+        * is no longer live and can be deleted.
+        *
+        * Though we call this with the meta data lock held, the
+        * trylock keeps us from ABBA deadlock.
+        */
+       status = ocfs2_try_open_lock(inode, 1);
+       if (status < 0 && status != -EAGAIN)
+               mlog_errno(status);
+       return status;
+}
+
 /* Query the cluster to determine whether we should wipe an inode from
  * disk or not.
  *
                goto bail;
        }
 
-       status = ocfs2_request_delete_vote(inode);
-       /* -EBUSY means that other nodes are still using the
+       status = ocfs2_request_delete(inode);
+       /* -EAGAIN means that other nodes are still using the
         * inode. We're done here though, so avoid doing anything on
         * disk and let them worry about deleting it. */
-       if (status == -EBUSY) {
+       if (status == -EAGAIN) {
                status = 0;
                mlog(0, "Skipping delete of %llu because it is in use on"
                     "other nodes\n", (unsigned long long)oi->ip_blkno);
                goto bail;
        }
 
-       spin_lock(&oi->ip_lock);
-       if (oi->ip_orphaned_slot == OCFS2_INVALID_SLOT) {
-               /* Nobody knew which slot this inode was orphaned
-                * into. This may happen during node death and
-                * recovery knows how to clean it up so we can safely
-                * ignore this inode for now on. */
-               mlog(0, "Nobody knew where inode %llu was orphaned!\n",
-                    (unsigned long long)oi->ip_blkno);
-       } else {
-               *wipe = 1;
-
-               mlog(0, "Inode %llu is ok to wipe from orphan dir %d\n",
-                    (unsigned long long)oi->ip_blkno, oi->ip_orphaned_slot);
-       }
-       spin_unlock(&oi->ip_lock);
+       *wipe = 1;
+       mlog(0, "Inode %llu is ok to wipe from orphan dir %u\n",
+            (unsigned long long)oi->ip_blkno,
+            le16_to_cpu(di->i_orphaned_slot));
 
 bail:
        return status;
        mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
                        "Inode=%lu\n", inode->i_ino);
 
+       /* The open lock replaces the old delete_inode vote: we have held
+        * it since the inode was read in, so drop the PR and EX open
+        * locks now. */
+       ocfs2_open_unlock(inode);
+
        /* Do these before all the other work so that we don't bounce
         * the vote thread while waiting to destroy the locks. */
        ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres);
        ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres);
        ocfs2_mark_lockres_freeing(&oi->ip_data_lockres);
+       ocfs2_mark_lockres_freeing(&oi->ip_open_lockres);
 
        /* We very well may get a clear_inode before all of an inode's
         * metadata has hit disk. Of course, we can't drop any cluster
        ocfs2_lock_res_free(&oi->ip_rw_lockres);
        ocfs2_lock_res_free(&oi->ip_meta_lockres);
        ocfs2_lock_res_free(&oi->ip_data_lockres);
+       ocfs2_lock_res_free(&oi->ip_open_lockres);
 
        ocfs2_metadata_cache_purge(inode);
 
        mlog(0, "Drop inode %llu, nlink = %u, ip_flags = 0x%x\n",
             (unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags);
 
-       /* Testing ip_orphaned_slot here wouldn't work because we may
-        * not have gotten a delete_inode vote from any other nodes
-        * yet. */
        if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)
                generic_delete_inode(inode);
        else
 
        struct ocfs2_lock_res           ip_rw_lockres;
        struct ocfs2_lock_res           ip_meta_lockres;
        struct ocfs2_lock_res           ip_data_lockres;
+       struct ocfs2_lock_res           ip_open_lockres;
 
        /* protects allocation changes on this inode. */
        struct rw_semaphore             ip_alloc_sem;
 /* Flags for ocfs2_iget() */
 #define OCFS2_FI_FLAG_NOWAIT   0x1
 #define OCFS2_FI_FLAG_DELETE   0x2
-#define OCFS2_FI_FLAG_SYSFILE  0x4
-#define OCFS2_FI_FLAG_NOLOCK   0x8
+#define OCFS2_FI_FLAG_SYSFILE          0x4
+#define OCFS2_FI_FLAG_ORPHAN_RECOVERY  0x8
 struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, int flags);
 struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb,
                                     u64 blkno,
 
                                continue;
 
                        iter = ocfs2_iget(osb, le64_to_cpu(de->inode),
-                                         OCFS2_FI_FLAG_NOLOCK);
+                                         OCFS2_FI_FLAG_ORPHAN_RECOVERY);
                        if (IS_ERR(iter))
                                continue;
 
                /* Set the proper information to get us going into
                 * ocfs2_delete_inode. */
                oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
-               oi->ip_orphaned_slot = slot;
                spin_unlock(&oi->ip_lock);
 
                iput(inode);
 
         * unlink. */
        spin_lock(&oi->ip_lock);
        oi->ip_flags &= ~OCFS2_INODE_MAYBE_ORPHANED;
-       oi->ip_orphaned_slot = OCFS2_INVALID_SLOT;
        spin_unlock(&oi->ip_lock);
 
 bail_add:
        /* Record which orphan dir our inode now resides
         * in. delete_inode will use this to determine which orphan
         * dir to lock. */
-       spin_lock(&OCFS2_I(inode)->ip_lock);
-       OCFS2_I(inode)->ip_orphaned_slot = osb->slot_num;
-       spin_unlock(&OCFS2_I(inode)->ip_lock);
+       fe->i_orphaned_slot = cpu_to_le16(osb->slot_num);
 
        mlog(0, "Inode %llu orphaned in slot %d\n",
             (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num);
 
        __le32 i_ctime_nsec;
        __le32 i_mtime_nsec;
        __le32 i_attr;
-       __le32 i_reserved1;
+       __le16 i_orphaned_slot;         /* Only valid when OCFS2_ORPHANED_FL
+                                          was set in i_flags */
+       __le16 i_reserved1;
 /*70*/ __le64 i_reserved2[8];
 /*B8*/ union {
                __le64 i_pad1;          /* Generic way to refer to this
 
        OCFS2_LOCK_TYPE_RENAME,
        OCFS2_LOCK_TYPE_RW,
        OCFS2_LOCK_TYPE_DENTRY,
+       OCFS2_LOCK_TYPE_OPEN,
        OCFS2_NUM_LOCK_TYPES
 };
 
                case OCFS2_LOCK_TYPE_DENTRY:
                        c = 'N';
                        break;
+               case OCFS2_LOCK_TYPE_OPEN:
+                       c = 'O';
+                       break;
                default:
                        c = '\0';
        }
         * important job it does, anyway. */
        [OCFS2_LOCK_TYPE_RW] = "Write/Read",
        [OCFS2_LOCK_TYPE_DENTRY] = "Dentry",
+       [OCFS2_LOCK_TYPE_OPEN] = "Open",
 };
 
 static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
 
                ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
                ocfs2_lock_res_init_once(&oi->ip_meta_lockres);
                ocfs2_lock_res_init_once(&oi->ip_data_lockres);
+               ocfs2_lock_res_init_once(&oi->ip_open_lockres);
 
                ocfs2_metadata_cache_init(&oi->vfs_inode);