IB/ipath: Fix IB_EVENT_PORT_ERR event
index 2e4d544957afd124f2a94d61d359fab71fefb064..5c29b2bfea17b7c6236181f214b4224fbf4cb60d 100644
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006 QLogic, Inc. All rights reserved.
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
  * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
  *
  * This software is available to you under a choice of one of two
@@ -81,9 +81,8 @@ static void ipath_init_restart(struct ipath_qp *qp, struct ipath_swqe *wqe)
  * Note that we are on the responder side of the QP context.
  * Note the QP s_lock must be held.
  */
-static int ipath_make_rc_ack(struct ipath_qp *qp,
-                            struct ipath_other_headers *ohdr,
-                            u32 pmtu, u32 *bth0p, u32 *bth2p)
+static int ipath_make_rc_ack(struct ipath_ibdev *dev, struct ipath_qp *qp,
+                            struct ipath_other_headers *ohdr, u32 pmtu)
 {
        struct ipath_ack_entry *e;
        u32 hwords;
@@ -98,13 +97,21 @@ static int ipath_make_rc_ack(struct ipath_qp *qp,
        case OP(RDMA_READ_RESPONSE_LAST):
        case OP(RDMA_READ_RESPONSE_ONLY):
        case OP(ATOMIC_ACKNOWLEDGE):
-               qp->s_ack_state = OP(ACKNOWLEDGE);
+               /*
+                * We can increment the tail pointer now that the last
+                * response has been sent instead of only being
+                * constructed.
+                */
+               if (++qp->s_tail_ack_queue > IPATH_MAX_RDMA_ATOMIC)
+                       qp->s_tail_ack_queue = 0;
                /* FALLTHROUGH */
+       case OP(SEND_ONLY):
        case OP(ACKNOWLEDGE):
                /* Check for no next entry in the queue. */
                if (qp->r_head_ack_queue == qp->s_tail_ack_queue) {
                        if (qp->s_flags & IPATH_S_ACK_PENDING)
                                goto normal;
+                       qp->s_ack_state = OP(ACKNOWLEDGE);
                        goto bail;
                }
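
The ack queue is a small ring of IPATH_MAX_RDMA_ATOMIC + 1 entries, so indices wrap past IPATH_MAX_RDMA_ATOMIC rather than at a power of two; the increment above now happens once the last response has actually been sent rather than when it is first constructed. A minimal sketch of the wrap (the helper name is illustrative, not from the driver):

    /* Sketch: advance a circular ack-queue index over a ring of
     * IPATH_MAX_RDMA_ATOMIC + 1 slots. */
    static inline unsigned next_ack_queue_index(unsigned n)
    {
            return (n + 1 > IPATH_MAX_RDMA_ATOMIC) ? 0 : n + 1;
    }
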
 
@@ -119,9 +126,7 @@ static int ipath_make_rc_ack(struct ipath_qp *qp,
                                qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST);
                        } else {
                                qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY);
-                               if (++qp->s_tail_ack_queue >
-                                   IPATH_MAX_RDMA_ATOMIC)
-                                       qp->s_tail_ack_queue = 0;
+                               e->sent = 1;
                        }
                        ohdr->u.aeth = ipath_compute_aeth(qp);
                        hwords++;
@@ -139,8 +144,7 @@ static int ipath_make_rc_ack(struct ipath_qp *qp,
                                cpu_to_be32(e->atomic_data);
                        hwords += sizeof(ohdr->u.at) / sizeof(u32);
                        bth2 = e->psn;
-                       if (++qp->s_tail_ack_queue > IPATH_MAX_RDMA_ATOMIC)
-                               qp->s_tail_ack_queue = 0;
+                       e->sent = 1;
                }
                bth0 = qp->s_ack_state << 24;
                break;
@@ -156,8 +160,7 @@ static int ipath_make_rc_ack(struct ipath_qp *qp,
                        ohdr->u.aeth = ipath_compute_aeth(qp);
                        hwords++;
                        qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
-                       if (++qp->s_tail_ack_queue > IPATH_MAX_RDMA_ATOMIC)
-                               qp->s_tail_ack_queue = 0;
+                       qp->s_ack_queue[qp->s_tail_ack_queue].sent = 1;
                }
                bth0 = qp->s_ack_state << 24;
                bth2 = qp->s_ack_rdma_psn++ & IPATH_PSN_MASK;
@@ -171,7 +174,7 @@ static int ipath_make_rc_ack(struct ipath_qp *qp,
                 * the ACK before setting s_ack_state to ACKNOWLEDGE
                 * (see above).
                 */
-               qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
+               qp->s_ack_state = OP(SEND_ONLY);
                qp->s_flags &= ~IPATH_S_ACK_PENDING;
                qp->s_cur_sge = NULL;
                if (qp->s_nak_state)
@@ -188,8 +191,7 @@ static int ipath_make_rc_ack(struct ipath_qp *qp,
        }
        qp->s_hdrwords = hwords;
        qp->s_cur_size = len;
-       *bth0p = bth0;
-       *bth2p = bth2;
+       ipath_make_ruc_header(dev, qp, ohdr, bth0, bth2);
        return 1;
 
 bail:
@@ -199,53 +201,55 @@ bail:
 /**
  * ipath_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)
  * @qp: a pointer to the QP
- * @ohdr: a pointer to the IB header being constructed
- * @pmtu: the path MTU
- * @bth0p: pointer to the BTH opcode word
- * @bth2p: pointer to the BTH PSN word
  *
  * Return 1 if constructed; otherwise, return 0.
- * Note the QP s_lock must be held and interrupts disabled.
  */
-int ipath_make_rc_req(struct ipath_qp *qp,
-                     struct ipath_other_headers *ohdr,
-                     u32 pmtu, u32 *bth0p, u32 *bth2p)
+int ipath_make_rc_req(struct ipath_qp *qp)
 {
        struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+       struct ipath_other_headers *ohdr;
        struct ipath_sge_state *ss;
        struct ipath_swqe *wqe;
        u32 hwords;
        u32 len;
        u32 bth0;
        u32 bth2;
+       u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
        char newreq;
+       unsigned long flags;
+       int ret = 0;
+
+       ohdr = &qp->s_hdr.u.oth;
+       if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
+               ohdr = &qp->s_hdr.u.l.oth;
+
+       /*
+        * The lock is needed to synchronize between the sending tasklet,
+        * the receive interrupt handler, and timeout resends.
+        */
+       spin_lock_irqsave(&qp->s_lock, flags);
 
        /* Sending responses has higher priority than sending requests. */
        if ((qp->r_head_ack_queue != qp->s_tail_ack_queue ||
             (qp->s_flags & IPATH_S_ACK_PENDING) ||
-            qp->s_ack_state != IB_OPCODE_RC_ACKNOWLEDGE) &&
-           ipath_make_rc_ack(qp, ohdr, pmtu, bth0p, bth2p))
+            qp->s_ack_state != OP(ACKNOWLEDGE)) &&
+           ipath_make_rc_ack(dev, qp, ohdr, pmtu))
                goto done;
 
        if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) ||
-           qp->s_rnr_timeout)
+           qp->s_rnr_timeout || qp->s_wait_credit)
                goto bail;
 
        /* Limit the number of packets sent without an ACK. */
        if (ipath_cmp24(qp->s_psn, qp->s_last_psn + IPATH_PSN_CREDIT) > 0) {
                qp->s_wait_credit = 1;
                dev->n_rc_stalls++;
-               spin_lock(&dev->pending_lock);
-               if (list_empty(&qp->timerwait))
-                       list_add_tail(&qp->timerwait,
-                                     &dev->pending[dev->pending_index]);
-               spin_unlock(&dev->pending_lock);
                goto bail;
        }
 
        /* header size in 32-bit words LRH+BTH = (8+12)/4. */
        hwords = 5;
-       bth0 = 0;
+       bth0 = 1 << 22; /* Set M bit */
 
        /* Send a request. */
        wqe = get_swqe_ptr(qp, qp->s_cur);
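
Several hunks below replace open-coded (int)(a - b) > 0 tests with ipath_cmp24(). PSNs are 24-bit sequence numbers, so ordering must be decided modulo 2^24, not 2^32. A sketch of how such a comparison can be written (ipath_cmp24() itself lives in the driver headers; this illustrates the idea rather than quoting it):

    /* Sketch: signed comparison of two 24-bit PSNs.  Shifting both
     * values into the top 24 bits of a 32-bit int makes the
     * subtraction wrap at the PSN width; the sign of the result
     * orders a relative to b (negative: a < b, zero: equal,
     * positive: a > b). */
    static inline int psn_cmp24(u32 a, u32 b)
    {
            return (((int) a) << 8) - (((int) b) << 8);
    }
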
@@ -444,7 +448,7 @@ int ipath_make_rc_req(struct ipath_qp *qp,
                        qp->s_psn = wqe->lpsn + 1;
                else {
                        qp->s_psn++;
-                       if ((int)(qp->s_psn - qp->s_next_psn) > 0)
+                       if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0)
                                qp->s_next_psn = qp->s_psn;
                }
                /*
@@ -471,7 +475,7 @@ int ipath_make_rc_req(struct ipath_qp *qp,
                /* FALLTHROUGH */
        case OP(SEND_MIDDLE):
                bth2 = qp->s_psn++ & IPATH_PSN_MASK;
-               if ((int)(qp->s_psn - qp->s_next_psn) > 0)
+               if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0)
                        qp->s_next_psn = qp->s_psn;
                ss = &qp->s_sge;
                len = qp->s_len;
@@ -507,7 +511,7 @@ int ipath_make_rc_req(struct ipath_qp *qp,
                /* FALLTHROUGH */
        case OP(RDMA_WRITE_MIDDLE):
                bth2 = qp->s_psn++ & IPATH_PSN_MASK;
-               if ((int)(qp->s_psn - qp->s_next_psn) > 0)
+               if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0)
                        qp->s_next_psn = qp->s_psn;
                ss = &qp->s_sge;
                len = qp->s_len;
@@ -546,7 +550,7 @@ int ipath_make_rc_req(struct ipath_qp *qp,
                qp->s_state = OP(RDMA_READ_REQUEST);
                hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
                bth2 = qp->s_psn++ & IPATH_PSN_MASK;
-               if ((int)(qp->s_psn - qp->s_next_psn) > 0)
+               if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0)
                        qp->s_next_psn = qp->s_psn;
                ss = NULL;
                len = 0;
@@ -561,13 +565,12 @@ int ipath_make_rc_req(struct ipath_qp *qp,
        qp->s_hdrwords = hwords;
        qp->s_cur_sge = ss;
        qp->s_cur_size = len;
-       *bth0p = bth0 | (qp->s_state << 24);
-       *bth2p = bth2;
+       ipath_make_ruc_header(dev, qp, ohdr, bth0 | (qp->s_state << 24), bth2);
 done:
-       return 1;
-
+       ret = 1;
 bail:
-       return 0;
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+       return ret;
 }
 
 /**
@@ -587,9 +590,12 @@ static void send_rc_ack(struct ipath_qp *qp)
        u32 hwords;
        struct ipath_ib_header hdr;
        struct ipath_other_headers *ohdr;
+       unsigned long flags;
 
        /* Don't send ACK or NAK if an RDMA read or atomic is pending. */
-       if (qp->r_head_ack_queue != qp->s_tail_ack_queue)
+       if (qp->r_head_ack_queue != qp->s_tail_ack_queue ||
+           (qp->s_flags & IPATH_S_ACK_PENDING) ||
+           qp->s_ack_state != OP(ACKNOWLEDGE))
                goto queue_ack;
 
        /* Construct the header. */
@@ -606,7 +612,7 @@ static void send_rc_ack(struct ipath_qp *qp)
        }
        /* read pkey_index w/o lock (it's atomic) */
        bth0 = ipath_get_pkey(dev->dd, qp->s_pkey_index) |
-               OP(ACKNOWLEDGE) << 24;
+               (OP(ACKNOWLEDGE) << 24) | (1 << 22);
        if (qp->r_nak_state)
                ohdr->u.aeth = cpu_to_be32((qp->r_msn & IPATH_MSN_MASK) |
                                            (qp->r_nak_state <<
@@ -625,7 +631,7 @@ static void send_rc_ack(struct ipath_qp *qp)
        /*
         * If we can send the ACK, clear the ACK state.
         */
-       if (ipath_verbs_send(dev->dd, hwords, (u32 *) &hdr, 0, NULL) == 0) {
+       if (ipath_verbs_send(qp, &hdr, hwords, NULL, 0) == 0) {
                dev->n_unicast_xmit++;
                goto done;
        }
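
The (1 << 22) ORed into bth0 above, like the bth0 = 1 << 22 in ipath_make_rc_req(), sets the MigReq (M) bit of the base transport header. For orientation, a sketch of the first BTH dword as assembled here (field positions assumed from the IBA spec, not quoted from the driver headers):

    /* BTH dword 0 (assumed layout):
     *   bits 31..24  opcode            bits 21..20  pad count
     *   bit  23      SE (solicited)    bits 19..16  header version
     *   bit  22      M  (MigReq)       bits 15..0   P_Key
     */
    bth0 = pkey | (opcode << 24) | (1 << 22);   /* names illustrative */
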
@@ -640,11 +646,11 @@ static void send_rc_ack(struct ipath_qp *qp)
        dev->n_rc_qacks++;
 
 queue_ack:
-       spin_lock_irq(&qp->s_lock);
+       spin_lock_irqsave(&qp->s_lock, flags);
        qp->s_flags |= IPATH_S_ACK_PENDING;
        qp->s_nak_state = qp->r_nak_state;
        qp->s_ack_psn = qp->r_ack_psn;
-       spin_unlock_irq(&qp->s_lock);
+       spin_unlock_irqrestore(&qp->s_lock, flags);
 
        /* Call ipath_do_rc_send() in another thread. */
        tasklet_hi_schedule(&qp->s_task);
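
This hunk also converts spin_lock_irq() to spin_lock_irqsave(). The unconditional form re-enables interrupts on unlock, which would be wrong if send_rc_ack() were entered with interrupts already disabled; the saved-flags form is safe from either context. The pattern, in brief:

    unsigned long flags;

    spin_lock_irqsave(&qp->s_lock, flags);      /* IRQ state saved */
    /* ... update s_flags / s_nak_state / s_ack_psn ... */
    spin_unlock_irqrestore(&qp->s_lock, flags); /* state restored */
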
@@ -755,7 +761,9 @@ void ipath_restart_rc(struct ipath_qp *qp, u32 psn, struct ib_wc *wc)
                wc->vendor_err = 0;
                wc->byte_len = 0;
                wc->qp = &qp->ibqp;
+               wc->imm_data = 0;
                wc->src_qp = qp->remote_qpn;
+               wc->wc_flags = 0;
                wc->pkey_index = 0;
                wc->slid = qp->remote_ah_attr.dlid;
                wc->sl = qp->remote_ah_attr.sl;
@@ -779,7 +787,7 @@ void ipath_restart_rc(struct ipath_qp *qp, u32 psn, struct ib_wc *wc)
        if (wqe->wr.opcode == IB_WR_RDMA_READ)
                dev->n_rc_resends++;
        else
-               dev->n_rc_resends += (int)qp->s_psn - (int)psn;
+               dev->n_rc_resends += (qp->s_psn - psn) & IPATH_PSN_MASK;
 
        reset_psn(qp, psn);
        tasklet_hi_schedule(&qp->s_task);
@@ -808,13 +816,15 @@ static inline void update_last_psn(struct ipath_qp *qp, u32 psn)
  * Called at interrupt level with the QP s_lock held and interrupts disabled.
  * Returns 1 if OK, 0 if current operation should be aborted (NAK).
  */
-static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode)
+static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode,
+                    u64 val)
 {
        struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
        struct ib_wc wc;
        struct ipath_swqe *wqe;
        int ret = 0;
        u32 ack_psn;
+       int diff;
 
        /*
         * Remove the QP from the timeout queue (or RNR timeout queue).
@@ -842,7 +852,19 @@ static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode)
         * The MSN might be for a later WQE than the PSN indicates, so
         * only complete WQEs that the PSN finishes.
         */
-       while (ipath_cmp24(ack_psn, wqe->lpsn) >= 0) {
+       while ((diff = ipath_cmp24(ack_psn, wqe->lpsn)) >= 0) {
+               /*
+                * RDMA_READ_RESPONSE_ONLY is a special case since
+                * we want to generate completion events for everything
+                * before the RDMA read, copy the data, then generate
+                * the completion for the read.
+                */
+               if (wqe->wr.opcode == IB_WR_RDMA_READ &&
+                   opcode == OP(RDMA_READ_RESPONSE_ONLY) &&
+                   diff == 0) {
+                       ret = 1;
+                       goto bail;
+               }
                /*
                 * If this request is an RDMA read or atomic, and the ACK is
                 * for a later operation, this ACK NAKs the RDMA read or
@@ -853,12 +875,10 @@ static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode)
                 * is sent but before the response is received.
                 */
                if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
-                    (opcode != OP(RDMA_READ_RESPONSE_LAST) ||
-                     ipath_cmp24(ack_psn, wqe->lpsn) != 0)) ||
+                    (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) ||
                    ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
                      wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
-                    (opcode != OP(ATOMIC_ACKNOWLEDGE) ||
-                     ipath_cmp24(wqe->psn, psn) != 0))) {
+                    (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) {
                        /*
                         * The last valid PSN seen is the previous
                         * request's.
@@ -872,6 +892,9 @@ static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode)
                         */
                        goto bail;
                }
+               if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+                   wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
+                       *(u64 *) wqe->sg_list[0].vaddr = val;
                if (qp->s_num_rd_atomic &&
                    (wqe->wr.opcode == IB_WR_RDMA_READ ||
                     wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
@@ -915,15 +938,19 @@ static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode)
                if (qp->s_last == qp->s_cur) {
                        if (++qp->s_cur >= qp->s_size)
                                qp->s_cur = 0;
+                       qp->s_last = qp->s_cur;
+                       if (qp->s_last == qp->s_tail)
+                               break;
                        wqe = get_swqe_ptr(qp, qp->s_cur);
                        qp->s_state = OP(SEND_LAST);
                        qp->s_psn = wqe->psn;
+               } else {
+                       if (++qp->s_last >= qp->s_size)
+                               qp->s_last = 0;
+                       if (qp->s_last == qp->s_tail)
+                               break;
+                       wqe = get_swqe_ptr(qp, qp->s_last);
                }
-               if (++qp->s_last >= qp->s_size)
-                       qp->s_last = 0;
-               wqe = get_swqe_ptr(qp, qp->s_last);
-               if (qp->s_last == qp->s_tail)
-                       break;
        }
 
        switch (aeth >> 29) {
@@ -935,6 +962,18 @@ static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode)
                        list_add_tail(&qp->timerwait,
                                      &dev->pending[dev->pending_index]);
                        spin_unlock(&dev->pending_lock);
+                       /*
+                        * If we get a partial ACK for a resent operation,
+                        * we can stop resending the earlier packets and
+                        * continue with the next packet the receiver wants.
+                        */
+                       if (ipath_cmp24(qp->s_psn, psn) <= 0) {
+                               reset_psn(qp, psn + 1);
+                               tasklet_hi_schedule(&qp->s_task);
+                       }
+               } else if (ipath_cmp24(qp->s_psn, psn) <= 0) {
+                       qp->s_state = OP(SEND_LAST);
+                       qp->s_psn = psn + 1;
                }
                ipath_get_credit(qp, aeth);
                qp->s_rnr_retry = qp->s_rnr_retry_cnt;
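
The switch (aeth >> 29) above dispatches on the top three bits of the AETH syndrome. A sketch of the decode, with field positions assumed from the IBA spec (the driver's IPATH_AETH_CREDIT_SHIFT/MASK constants correspond to the five bits just below the class):

    /* AETH dword (assumed layout): 8-bit syndrome in bits 31..24,
     * 24-bit MSN in bits 23..0.  The syndrome's top three bits
     * classify the packet; the low five carry a credit count for
     * ACKs or an RNR/NAK code. */
    static inline u32 aeth_class(u32 aeth)  { return aeth >> 29; } /* 0=ACK 1=RNR 3=NAK */
    static inline u32 aeth_credit(u32 aeth) { return (aeth >> 24) & 0x1f; }
    static inline u32 aeth_msn(u32 aeth)    { return aeth & 0xffffff; }
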
@@ -945,22 +984,23 @@ static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode)
 
        case 1:         /* RNR NAK */
                dev->n_rnr_naks++;
+               if (qp->s_last == qp->s_tail)
+                       goto bail;
                if (qp->s_rnr_retry == 0) {
-                       if (qp->s_last == qp->s_tail)
-                               goto bail;
-
                        wc.status = IB_WC_RNR_RETRY_EXC_ERR;
                        goto class_b;
                }
                if (qp->s_rnr_retry_cnt < 7)
                        qp->s_rnr_retry--;
-               if (qp->s_last == qp->s_tail)
-                       goto bail;
 
                /* The last valid PSN is the previous PSN. */
                update_last_psn(qp, psn - 1);
 
-               dev->n_rc_resends += (int)qp->s_psn - (int)psn;
+               if (wqe->wr.opcode == IB_WR_RDMA_READ)
+                       dev->n_rc_resends++;
+               else
+                       dev->n_rc_resends +=
+                               (qp->s_psn - psn) & IPATH_PSN_MASK;
 
                reset_psn(qp, psn);
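
Like the ipath_restart_rc() hunk earlier, this one counts resends as (qp->s_psn - psn) & IPATH_PSN_MASK instead of a plain signed difference, which keeps the count correct across 24-bit PSN wraparound. A worked example:

    /* Sketch: packets outstanding between two 24-bit PSNs, modulo
     * 2^24.  E.g. s_psn = 0x000002 just wrapped past psn = 0xfffffe:
     * (0x000002 - 0xfffffe) & 0xffffff = 4, not a negative count. */
    u32 outstanding = (qp->s_psn - psn) & IPATH_PSN_MASK;
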
 
@@ -971,26 +1011,20 @@ static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode)
                goto bail;
 
        case 3:         /* NAK */
-               /* The last valid PSN seen is the previous request's. */
-               if (qp->s_last != qp->s_tail)
-                       update_last_psn(qp, wqe->psn - 1);
+               if (qp->s_last == qp->s_tail)
+                       goto bail;
+               /* The last valid PSN is the previous PSN. */
+               update_last_psn(qp, psn - 1);
                switch ((aeth >> IPATH_AETH_CREDIT_SHIFT) &
                        IPATH_AETH_CREDIT_MASK) {
                case 0: /* PSN sequence error */
                        dev->n_seq_naks++;
                        /*
-                        * Back up to the responder's expected PSN.  XXX
+                        * Back up to the responder's expected PSN.
                         * Note that we might get a NAK in the middle of an
                         * RDMA READ response which terminates the RDMA
                         * READ.
                         */
-                       if (qp->s_last == qp->s_tail)
-                               break;
-
-                       if (ipath_cmp24(psn, wqe->psn) < 0)
-                               break;
-
-                       /* Retry the request. */
                        ipath_restart_rc(qp, psn, &wc);
                        break;
 
@@ -1013,7 +1047,9 @@ static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode)
                        wc.vendor_err = 0;
                        wc.byte_len = 0;
                        wc.qp = &qp->ibqp;
+                       wc.imm_data = 0;
                        wc.src_qp = qp->remote_qpn;
+                       wc.wc_flags = 0;
                        wc.pkey_index = 0;
                        wc.slid = qp->remote_ah_attr.dlid;
                        wc.sl = qp->remote_ah_attr.sl;
@@ -1070,6 +1106,7 @@ static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev,
        int diff;
        u32 pad;
        u32 aeth;
+       u64 val;
 
        spin_lock_irqsave(&qp->s_lock, flags);
 
@@ -1109,8 +1146,6 @@ static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev,
                        data += sizeof(__be32);
                }
                if (opcode == OP(ATOMIC_ACKNOWLEDGE)) {
-                       u64 val;
-
                        if (!header_in_data) {
                                __be32 *p = ohdr->u.at.atomic_ack_eth;
 
@@ -1118,14 +1153,15 @@ static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev,
                                        be32_to_cpu(p[1]);
                        } else
                                val = be64_to_cpu(((__be64 *) data)[0]);
-                       *(u64 *) wqe->sg_list[0].vaddr = val;
-               }
-               if (!do_rc_ack(qp, aeth, psn, opcode) ||
+               } else
+                       val = 0;
+               if (!do_rc_ack(qp, aeth, psn, opcode, val) ||
                    opcode != OP(RDMA_READ_RESPONSE_FIRST))
                        goto ack_done;
                hdrsize += 4;
+               wqe = get_swqe_ptr(qp, qp->s_last);
                if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
-                       goto ack_done;
+                       goto ack_op_err;
                /*
                 * If this is a response to a resent RDMA read, we
                 * have to be careful to copy the data to the right
@@ -1143,12 +1179,12 @@ static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev,
                        goto ack_done;
                }
                if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
-                       goto ack_done;
+                       goto ack_op_err;
        read_middle:
                if (unlikely(tlen != (hdrsize + pmtu + 4)))
-                       goto ack_done;
+                       goto ack_len_err;
                if (unlikely(pmtu >= qp->s_rdma_read_len))
-                       goto ack_done;
+                       goto ack_len_err;
 
                /* We got a response so update the timeout. */
                spin_lock(&dev->pending_lock);
@@ -1167,19 +1203,27 @@ static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev,
                goto bail;
 
        case OP(RDMA_READ_RESPONSE_ONLY):
-               if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) {
-                       dev->n_rdma_seq++;
-                       ipath_restart_rc(qp, qp->s_last_psn + 1, &wc);
-                       goto ack_done;
-               }
-               if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
+               if (!header_in_data)
+                       aeth = be32_to_cpu(ohdr->u.aeth);
+               else
+                       aeth = be32_to_cpu(((__be32 *) data)[0]);
+               if (!do_rc_ack(qp, aeth, psn, opcode, 0))
                        goto ack_done;
+               /* Get the number of bytes the message was padded by. */
+               pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+               /*
+                * Check that the data size is >= 0 && <= pmtu.
+                * Remember to account for the AETH header (4) and
+                * ICRC (4).
+                */
+               if (unlikely(tlen < (hdrsize + pad + 8)))
+                       goto ack_len_err;
                /*
                 * If this is a response to a resent RDMA read, we
                 * have to be careful to copy the data to the right
                 * location.
-                * XXX should check PSN and wqe opcode first.
                 */
+               wqe = get_swqe_ptr(qp, qp->s_last);
                qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
                                                  wqe, psn, pmtu);
                goto read_last;
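
The tlen checks in this hunk and the next implement the length accounting spelled out in the comments: everything after the headers, the pad bytes, the 4-byte AETH, and the 4-byte ICRC is read payload. A worked sketch (names illustrative):

    /* Sketch: payload left after headers, padding, AETH and ICRC.
     * RESPONSE_ONLY may be zero-length, so tlen < hdrsize + pad + 8
     * is its error case; RESPONSE_LAST must carry at least one byte,
     * so tlen <= hdrsize + pad + 8 is the error case there. */
    u32 payload = tlen - (hdrsize + pad + 4 /* AETH */ + 4 /* ICRC */);
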
@@ -1192,26 +1236,20 @@ static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev,
                        goto ack_done;
                }
                if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
-                       goto ack_done;
-       read_last:
-               /*
-                * Get the number of bytes the message was padded by.
-                */
+                       goto ack_op_err;
+               /* Get the number of bytes the message was padded by. */
                pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
                /*
                 * Check that the data size is >= 1 && <= pmtu.
                 * Remember to account for the AETH header (4) and
                 * ICRC (4).
                 */
-               if (unlikely(tlen <= (hdrsize + pad + 8))) {
-                       /* XXX Need to generate an error CQ entry. */
-                       goto ack_done;
-               }
+               if (unlikely(tlen <= (hdrsize + pad + 8)))
+                       goto ack_len_err;
+       read_last:
                tlen -= hdrsize + pad + 8;
-               if (unlikely(tlen != qp->s_rdma_read_len)) {
-                       /* XXX Need to generate an error CQ entry. */
-                       goto ack_done;
-               }
+               if (unlikely(tlen != qp->s_rdma_read_len))
+                       goto ack_len_err;
                if (!header_in_data)
                        aeth = be32_to_cpu(ohdr->u.aeth);
                else {
@@ -1219,12 +1257,37 @@ static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev,
                        data += sizeof(__be32);
                }
                ipath_copy_sge(&qp->s_rdma_read_sge, data, tlen);
-               (void) do_rc_ack(qp, aeth, psn, OP(RDMA_READ_RESPONSE_LAST));
+               (void) do_rc_ack(qp, aeth, psn,
+                                OP(RDMA_READ_RESPONSE_LAST), 0);
                goto ack_done;
        }
 
 ack_done:
        spin_unlock_irqrestore(&qp->s_lock, flags);
+       goto bail;
+
+ack_op_err:
+       wc.status = IB_WC_LOC_QP_OP_ERR;
+       goto ack_err;
+
+ack_len_err:
+       wc.status = IB_WC_LOC_LEN_ERR;
+ack_err:
+       wc.wr_id = wqe->wr.wr_id;
+       wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
+       wc.vendor_err = 0;
+       wc.byte_len = 0;
+       wc.imm_data = 0;
+       wc.qp = &qp->ibqp;
+       wc.src_qp = qp->remote_qpn;
+       wc.wc_flags = 0;
+       wc.pkey_index = 0;
+       wc.slid = qp->remote_ah_attr.dlid;
+       wc.sl = qp->remote_ah_attr.sl;
+       wc.dlid_path_bits = 0;
+       wc.port_num = 0;
+       ipath_sqerror_qp(qp, &wc);
+       spin_unlock_irqrestore(&qp->s_lock, flags);
 bail:
        return;
 }
@@ -1258,6 +1321,7 @@ static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev,
        struct ipath_ack_entry *e;
        u8 i, prev;
        int old_req;
+       unsigned long flags;
 
        if (diff > 0) {
                /*
@@ -1291,7 +1355,7 @@ static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev,
        psn &= IPATH_PSN_MASK;
        e = NULL;
        old_req = 1;
-       spin_lock_irq(&qp->s_lock);
+       spin_lock_irqsave(&qp->s_lock, flags);
        for (i = qp->r_head_ack_queue; ; i = prev) {
                if (i == qp->s_tail_ack_queue)
                        old_req = 0;
@@ -1308,8 +1372,11 @@ static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev,
                        e = NULL;
                        break;
                }
-               if (ipath_cmp24(psn, e->psn) >= 0)
+               if (ipath_cmp24(psn, e->psn) >= 0) {
+                       if (prev == qp->s_tail_ack_queue)
+                               old_req = 0;
                        break;
+               }
        }
        switch (opcode) {
        case OP(RDMA_READ_REQUEST): {
@@ -1389,11 +1456,24 @@ static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev,
                 * after all the previous RDMA reads and atomics.
                 */
                if (i == qp->r_head_ack_queue) {
-                       spin_unlock_irq(&qp->s_lock);
+                       spin_unlock_irqrestore(&qp->s_lock, flags);
                        qp->r_nak_state = 0;
                        qp->r_ack_psn = qp->r_psn - 1;
                        goto send_ack;
                }
+               /*
+                * Try to send a simple ACK to work around a Mellanox bug
+                * which doesn't accept an RDMA read response or atomic
+                * response as an ACK for earlier SENDs or RDMA writes.
+                */
+               if (qp->r_head_ack_queue == qp->s_tail_ack_queue &&
+                   !(qp->s_flags & IPATH_S_ACK_PENDING) &&
+                   qp->s_ack_state == OP(ACKNOWLEDGE)) {
+                       spin_unlock_irqrestore(&qp->s_lock, flags);
+                       qp->r_nak_state = 0;
+                       qp->r_ack_psn = qp->s_ack_queue[i].psn - 1;
+                       goto send_ack;
+               }
                /*
                 * Resend the RDMA read or atomic op which
                 * ACKs this duplicate request.
@@ -1403,11 +1483,10 @@ static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev,
                break;
        }
        qp->r_nak_state = 0;
-       spin_unlock_irq(&qp->s_lock);
        tasklet_hi_schedule(&qp->s_task);
 
 unlock_done:
-       spin_unlock_irq(&qp->s_lock);
+       spin_unlock_irqrestore(&qp->s_lock, flags);
 done:
        return 1;
 
@@ -1417,10 +1496,38 @@ send_ack:
 
 static void ipath_rc_error(struct ipath_qp *qp, enum ib_wc_status err)
 {
-       spin_lock_irq(&qp->s_lock);
+       unsigned long flags;
+       int lastwqe;
+
+       spin_lock_irqsave(&qp->s_lock, flags);
        qp->state = IB_QPS_ERR;
-       ipath_error_qp(qp, err);
-       spin_unlock_irq(&qp->s_lock);
+       lastwqe = ipath_error_qp(qp, err);
+       spin_unlock_irqrestore(&qp->s_lock, flags);
+
+       if (lastwqe) {
+               struct ib_event ev;
+
+               ev.device = qp->ibqp.device;
+               ev.element.qp = &qp->ibqp;
+               ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
+               qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+       }
+}
+
+static inline void ipath_update_ack_queue(struct ipath_qp *qp, unsigned n)
+{
+       unsigned long flags;
+       unsigned next;
+
+       next = n + 1;
+       if (next > IPATH_MAX_RDMA_ATOMIC)
+               next = 0;
+       spin_lock_irqsave(&qp->s_lock, flags);
+       if (n == qp->s_tail_ack_queue) {
+               qp->s_tail_ack_queue = next;
+               qp->s_ack_state = OP(ACKNOWLEDGE);
+       }
+       spin_unlock_irqrestore(&qp->s_lock, flags);
 }
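
ipath_rc_error() now reports whether the last WQE was reached and, if so, raises IB_EVENT_QP_LAST_WQE_REACHED to the consumer after dropping the lock. A minimal sketch of a consumer-side handler (the handler and its context struct are hypothetical, purely for illustration):

    #include <linux/completion.h>
    #include <rdma/ib_verbs.h>

    struct my_conn {                        /* hypothetical ULP state */
            struct completion last_wqe_done;
    };

    /* Wake whoever is draining the QP once the last WQE completes. */
    static void my_qp_event(struct ib_event *ev, void *ctx)
    {
            struct my_conn *conn = ctx;

            if (ev->event == IB_EVENT_QP_LAST_WQE_REACHED)
                    complete(&conn->last_wqe_done);
    }
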
 
 /**
@@ -1635,6 +1742,9 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
        case OP(RDMA_WRITE_FIRST):
        case OP(RDMA_WRITE_ONLY):
        case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
+               if (unlikely(!(qp->qp_access_flags &
+                              IB_ACCESS_REMOTE_WRITE)))
+                       goto nack_inv;
                /* consume RWQE */
                /* RETH comes after BTH */
                if (!header_in_data)
@@ -1664,9 +1774,6 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
                        qp->r_sge.sge.length = 0;
                        qp->r_sge.sge.sge_length = 0;
                }
-               if (unlikely(!(qp->qp_access_flags &
-                              IB_ACCESS_REMOTE_WRITE)))
-                       goto nack_acc;
                if (opcode == OP(RDMA_WRITE_FIRST))
                        goto send_middle;
                else if (opcode == OP(RDMA_WRITE_ONLY))
@@ -1680,13 +1787,17 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
                u32 len;
                u8 next;
 
-               if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
-                       goto nack_acc;
+               if (unlikely(!(qp->qp_access_flags &
+                              IB_ACCESS_REMOTE_READ)))
+                       goto nack_inv;
                next = qp->r_head_ack_queue + 1;
                if (next > IPATH_MAX_RDMA_ATOMIC)
                        next = 0;
-               if (unlikely(next == qp->s_tail_ack_queue))
-                       goto nack_inv;
+               if (unlikely(next == qp->s_tail_ack_queue)) {
+                       if (!qp->s_ack_queue[next].sent)
+                               goto nack_inv;
+                       ipath_update_ack_queue(qp, next);
+               }
                e = &qp->s_ack_queue[qp->r_head_ack_queue];
                /* RETH comes after BTH */
                if (!header_in_data)
@@ -1721,6 +1832,7 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
                        e->rdma_sge.sge.sge_length = 0;
                }
                e->opcode = opcode;
+               e->sent = 0;
                e->psn = psn;
                /*
                 * We need to increment the MSN here instead of when we
@@ -1752,12 +1864,15 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
 
                if (unlikely(!(qp->qp_access_flags &
                               IB_ACCESS_REMOTE_ATOMIC)))
-                       goto nack_acc;
+                       goto nack_inv;
                next = qp->r_head_ack_queue + 1;
                if (next > IPATH_MAX_RDMA_ATOMIC)
                        next = 0;
-               if (unlikely(next == qp->s_tail_ack_queue))
-                       goto nack_inv;
+               if (unlikely(next == qp->s_tail_ack_queue)) {
+                       if (!qp->s_ack_queue[next].sent)
+                               goto nack_inv;
+                       ipath_update_ack_queue(qp, next);
+               }
                if (!header_in_data)
                        ateth = &ohdr->u.atomic_eth;
                else
@@ -1782,6 +1897,7 @@ void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
                                      be64_to_cpu(ateth->compare_data),
                                      sdata);
                e->opcode = opcode;
+               e->sent = 0;
                e->psn = psn & IPATH_PSN_MASK;
                qp->r_msn++;
                qp->r_psn++;