Rather than byte-swapping cqe->g_mlpath_rqpn each time we extract a
field from it, byte-swap it once into a temporary variable.  This
results in smaller, better code -- e.g., on 32-bit x86:
add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-5 (-5)
function                                     old     new   delta
mlx4_ib_poll_cq                             1188    1183      -5
Signed-off-by: Roland Dreier <rolandd@cisco.com>
        struct mlx4_ib_srq *srq;
        int is_send;
        int is_error;
+       u32 g_mlpath_rqpn;
        u16 wqe_ctr;
 
        cqe = next_cqe_sw(cq);
 
                wc->slid           = be16_to_cpu(cqe->rlid);
                wc->sl             = cqe->sl >> 4;
-               wc->src_qp         = be32_to_cpu(cqe->g_mlpath_rqpn) & 0xffffff;
-               wc->dlid_path_bits = (be32_to_cpu(cqe->g_mlpath_rqpn) >> 24) & 0x7f;
-               wc->wc_flags      |= be32_to_cpu(cqe->g_mlpath_rqpn) & 0x80000000 ?
-                       IB_WC_GRH : 0;
+               g_mlpath_rqpn      = be32_to_cpu(cqe->g_mlpath_rqpn);
+               wc->src_qp         = g_mlpath_rqpn & 0xffffff;
+               wc->dlid_path_bits = (g_mlpath_rqpn >> 24) & 0x7f;
+               wc->wc_flags      |= g_mlpath_rqpn & 0x80000000 ? IB_WC_GRH : 0;
                wc->pkey_index     = be32_to_cpu(cqe->immed_rss_invalid) & 0x7f;
        }