]> pilppa.org Git - linux-2.6-omap-h63xx.git/blobdiff - fs/ocfs2/dlm/dlmdomain.c
Merge git://git.infradead.org/mtd-2.6
[linux-2.6-omap-h63xx.git] / fs / ocfs2 / dlm / dlmdomain.c
index 8a208b06fdd770824477c160354b90c25591cd92..d836b98dd99a42614ab0e1afbb4c5f459ef708ae 100644 (file)
 #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN)
 #include "cluster/masklog.h"
 
+/*
+ * ocfs2 node maps are arrays of long int, which cannot be sent freely
+ * across the wire because of endianness issues. To work around this, we
+ * convert the long ints to byte arrays. The following 3 routines are helper
+ * functions to set/test/copy bits within those arrays of bytes.
+ */
+static inline void byte_set_bit(u8 nr, u8 map[])
+{
+       /* Byte nr / 8 holds the bit; bit 0 is the LSB of map[0]. */
+       u8 *slot = &map[nr / 8];
+
+       *slot |= (1UL << (nr % 8));
+}
+
+static inline int byte_test_bit(u8 nr, u8 map[])
+{
+       /* Mask selecting bit nr within its containing byte. */
+       unsigned long mask = 1UL << (nr % 8);
+
+       /* Returns 1 when the bit is set, 0 otherwise. */
+       return (map[nr / 8] & mask) ? 1 : 0;
+}
+
+/*
+ * Copy the first @sz bits of the long-based bitmap @smap into the
+ * byte-array bitmap @dmap. The destination bytes covering @sz bits are
+ * zeroed first; nothing is touched when @sz is 0.
+ */
+static inline void byte_copymap(u8 dmap[], unsigned long smap[],
+                       unsigned int sz)
+{
+       unsigned int bit = 0;
+
+       if (sz == 0)
+               return;
+
+       /* Round the bit count up to whole bytes before clearing. */
+       memset(dmap, 0, (sz + 7) / 8);
+       while (bit < sz) {
+               if (test_bit(bit, smap))
+                       byte_set_bit(bit, dmap);
+               bit++;
+       }
+}
+
 static void dlm_free_pagevec(void **vec, int pages)
 {
        while (pages--)
@@ -108,8 +138,10 @@ static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm);
 
 void __dlm_unhash_lockres(struct dlm_lock_resource *lockres)
 {
-       hlist_del_init(&lockres->hash_node);
-       dlm_lockres_put(lockres);
+       if (!hlist_unhashed(&lockres->hash_node)) {
+               hlist_del_init(&lockres->hash_node);
+               dlm_lockres_put(lockres);
+       }
 }
 
 void __dlm_insert_lockres(struct dlm_ctxt *dlm,
@@ -398,11 +430,10 @@ redo_bucket:
 
                        dlm_lockres_put(res);
 
-                       cond_resched_lock(&dlm->spinlock);
-
                        if (dropped)
                                goto redo_bucket;
                }
+               cond_resched_lock(&dlm->spinlock);
                num += n;
                mlog(0, "%s: touched %d lockreses in bucket %d "
                     "(tot=%d)\n", dlm->name, n, i, num);
@@ -625,6 +656,8 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
                dlm_kick_thread(dlm, NULL);
 
                while (dlm_migrate_all_locks(dlm)) {
+                       /* Give dlm_thread time to purge the lockres' */
+                       msleep(500);
                        mlog(0, "%s: more migration to do\n", dlm->name);
                }
                dlm_mark_domain_leaving(dlm);
@@ -641,6 +674,7 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data,
        struct dlm_query_join_request *query;
        enum dlm_query_join_response response;
        struct dlm_ctxt *dlm = NULL;
+       u8 nodenum;
 
        query = (struct dlm_query_join_request *) msg->buf;
 
@@ -664,6 +698,28 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data,
 
        spin_lock(&dlm_domain_lock);
        dlm = __dlm_lookup_domain_full(query->domain, query->name_len);
+       if (!dlm)
+               goto unlock_respond;
+
+       /*
+        * There is a small window where the joining node may not see the
+        * node(s) that just left but are still part of the cluster. DISALLOW
+        * the join request if the joining node has a different node map.
+        */
+       nodenum=0;
+       while (nodenum < O2NM_MAX_NODES) {
+               if (test_bit(nodenum, dlm->domain_map)) {
+                       if (!byte_test_bit(nodenum, query->node_map)) {
+                               mlog(0, "disallow join as node %u does not "
+                                    "have node %u in its nodemap\n",
+                                    query->node_idx, nodenum);
+                               response = JOIN_DISALLOW;
+                               goto unlock_respond;
+                       }
+               }
+               nodenum++;
+       }
+
        /* Once the dlm ctxt is marked as leaving then we don't want
         * to be put in someone's domain map. 
         * Also, explicitly disallow joining at certain troublesome
@@ -682,15 +738,15 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data,
                        /* Disallow parallel joins. */
                        response = JOIN_DISALLOW;
                } else if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) {
-                       mlog(ML_NOTICE, "node %u trying to join, but recovery "
+                       mlog(0, "node %u trying to join, but recovery "
                             "is ongoing.\n", bit);
                        response = JOIN_DISALLOW;
                } else if (test_bit(bit, dlm->recovery_map)) {
-                       mlog(ML_NOTICE, "node %u trying to join, but it "
+                       mlog(0, "node %u trying to join, but it "
                             "still needs recovery.\n", bit);
                        response = JOIN_DISALLOW;
                } else if (test_bit(bit, dlm->domain_map)) {
-                       mlog(ML_NOTICE, "node %u trying to join, but it "
+                       mlog(0, "node %u trying to join, but it "
                             "is still in the domain! needs recovery?\n",
                             bit);
                        response = JOIN_DISALLOW;
@@ -705,6 +761,7 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data,
 
                spin_unlock(&dlm->spinlock);
        }
+unlock_respond:
        spin_unlock(&dlm_domain_lock);
 
 respond:
@@ -854,6 +911,9 @@ static int dlm_request_join(struct dlm_ctxt *dlm,
        join_msg.name_len = strlen(dlm->name);
        memcpy(join_msg.domain, dlm->name, join_msg.name_len);
 
+       /* copy live node map to join message */
+       byte_copymap(join_msg.node_map, dlm->live_nodes_map, O2NM_MAX_NODES);
+
        status = o2net_send_message(DLM_QUERY_JOIN_MSG, DLM_MOD_KEY, &join_msg,
                                    sizeof(join_msg), node, &retval);
        if (status < 0 && status != -ENOPROTOOPT) {
@@ -974,7 +1034,7 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
 {
        int status = 0, tmpstat, node;
        struct domain_join_ctxt *ctxt;
-       enum dlm_query_join_response response;
+       enum dlm_query_join_response response = JOIN_DISALLOW;
 
        mlog_entry("%p", dlm);
 
@@ -1101,7 +1161,8 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
        status = o2net_register_handler(DLM_ASSERT_MASTER_MSG, dlm->key,
                                        sizeof(struct dlm_assert_master),
                                        dlm_assert_master_handler,
-                                       dlm, NULL, &dlm->dlm_domain_handlers);
+                                       dlm, dlm_assert_master_post_handler,
+                                       &dlm->dlm_domain_handlers);
        if (status)
                goto bail;
 
@@ -1206,6 +1267,8 @@ bail:
 static int dlm_join_domain(struct dlm_ctxt *dlm)
 {
        int status;
+       unsigned int backoff;
+       unsigned int total_backoff = 0;
 
        BUG_ON(!dlm);
 
@@ -1237,18 +1300,27 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
        }
 
        do {
-               unsigned int backoff;
                status = dlm_try_to_join_domain(dlm);
 
                /* If we're racing another node to the join, then we
                 * need to back off temporarily and let them
                 * complete. */
+#define        DLM_JOIN_TIMEOUT_MSECS  90000
                if (status == -EAGAIN) {
                        if (signal_pending(current)) {
                                status = -ERESTARTSYS;
                                goto bail;
                        }
 
+                       /* total_backoff is in msecs, not jiffies */
+                       if (total_backoff > DLM_JOIN_TIMEOUT_MSECS) {
+                               status = -ERESTARTSYS;
+                               mlog(ML_NOTICE, "Timed out joining dlm domain "
+                                    "%s after %u msecs\n", dlm->name,
+                                    total_backoff);
+                               goto bail;
+                       }
+
                        /*
                         * <chip> After you!
                         * <dale> No, after you!
@@ -1258,6 +1330,7 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
                         */
                        backoff = (unsigned int)(jiffies & 0x3);
                        backoff *= DLM_DOMAIN_BACKOFF_MS;
+                       total_backoff += backoff;
                        mlog(0, "backoff %d\n", backoff);
                        msleep(backoff);
                }