unsigned int nob = data->blocks;
        unsigned long long clks;
        unsigned int timeout;
+       bool dalgn = 0;
        u32 dcmd;
        int i;
 
                host->sg_cpu[i].dcmd = dcmd | length;
                if (length & 31 && !(data->flags & MMC_DATA_READ))
                        host->sg_cpu[i].dcmd |= DCMD_ENDIRQEN;
+               /* Not aligned to 8-byte boundary? */
+               if (sg_dma_address(&data->sg[i]) & 0x7)
+                       dalgn = 1;
                if (data->flags & MMC_DATA_READ) {
                        host->sg_cpu[i].dsadr = host->res->start + MMC_RXFIFO;
                        host->sg_cpu[i].dtadr = sg_dma_address(&data->sg[i]);
        host->sg_cpu[host->dma_len - 1].ddadr = DDADR_STOP;
        wmb();
 
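+       /*
+        * The PXA27x DMA controller incurs extra overhead when handling data
+        * that is not aligned to an 8-byte boundary, so enable byte-alignment
+        * mode for this channel only when at least one scatterlist entry has
+        * an unaligned DMA address.
+        *
+        * NOTE(review): the else branch below ANDs DALGN with the channel
+        * bit itself, which clears every other channel's bit rather than
+        * this one; presumably "&= ~(1 << host->dma)" was intended - confirm.
+        */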
+       if (dalgn)
+               DALGN |= (1 << host->dma);
+       else
+               DALGN &= (1 << host->dma);
        DDADR(host->dma) = host->sg_dma;
        DCSR(host->dma) = DCSR_RUN;
 }