org.handhelds.familiar/packages/libxine/files/libxine-libavcodec.patch

   1 TO be MERGED!!!
   2
   3
   4 #
   5 # Patch managed by http://www.holgerschurig.de/patcher.html
   6 #
   7
   8 --- xine-lib-1-rc7/src/libffmpeg/libavcodec/Makefile.am~libxine-libavcodec
   9 +++ xine-lib-1-rc7/src/libffmpeg/libavcodec/Makefile.am
  10 @@ -9,9 +9,14 @@
  11  # we need to compile everything in debug mode, including the encoders,
  12  # otherwise we get unresolved symbols, because some unsatisfied function calls
  13  # are not optimized away with debug optimization
  14 -AM_CFLAGS = $(LIBFFMPEG_CFLAGS) `test "$(CFLAGS)" = "$(DEBUG_CFLAGS)" && echo -DCONFIG_ENCODERS`
  15 +#AM_CFLAGS = $(LIBFFMPEG_CFLAGS) `test "$(CFLAGS)" = "$(DEBUG_CFLAGS)" && echo -DCONFIG_ENCODERS`
  16  ASFLAGS =
  17
  18 +if HAVE_ARMV4L
  19 +AM_CFLAGS = -DARCH_ARMV4L $(LIBFFMPEG_CFLAGS)
  20 +else
  21 +AM_CFLAGS = $(LIBFFMPEG_CFLAGS)
  22 +endif
  23  noinst_LTLIBRARIES = libavcodec.la
  24
  25  libavcodec_la_SOURCES = \
  26 --- xine-lib-1-rc7/src/libffmpeg/libavcodec/dsputil.h~libxine-libavcodec
  27 +++ xine-lib-1-rc7/src/libffmpeg/libavcodec/dsputil.h
  28 @@ -45,6 +45,7 @@
  29  void ff_fdct248_islow (DCTELEM *data);
  30
  31  void j_rev_dct (DCTELEM *data);
  32 +void j_rev_dct_ARM ( DCTELEM *data );
  33
  34  void ff_fdct_mmx(DCTELEM *block);
  35  void ff_fdct_mmx2(DCTELEM *block);
  36 --- xine-lib-1-rc7/src/libffmpeg/libavcodec/mpegvideo.c~libxine-libavcodec
  37 +++ xine-lib-1-rc7/src/libffmpeg/libavcodec/mpegvideo.c
  38 @@ -218,6 +218,25 @@
  39  }
  40  #endif //CONFIG_ENCODERS
  41
  42 +
  43 +#ifdef ARCH_ARMV4L
  44 +
  45 +static void ff_jref_idct_put_armv4l(UINT8 *dest, int line_size, DCTELEM *block)
  46 +{
  47 +    /* Run the ARM assembly IDCT (j_rev_dct_ARM) in place on block, then pass block, dest and line_size to ff_put_pixels_clamped. */
  48 +    j_rev_dct_ARM (block);
  49 +    ff_put_pixels_clamped(block, dest, line_size);
  50 +}
  51 +static void ff_jref_idct_add_armv4l(UINT8 *dest, int line_size, DCTELEM *block)
  52 +{
  53 +    /* Run the ARM assembly IDCT (j_rev_dct_ARM) in place on block, then pass block, dest and line_size to ff_add_pixels_clamped. */
  54 +    j_rev_dct_ARM (block);
  55 +    ff_add_pixels_clamped(block, dest, line_size);
  56 +}
  57 +
  58 +#endif
  59 +
  60 +
  61  /* init common dct for both encoder and decoder */
  62  int DCT_common_init(MpegEncContext *s)
  63  {
  64 @@ -246,7 +265,11 @@
  65      MPV_common_init_mmi(s);
  66  #endif
  67  #ifdef ARCH_ARMV4L
  68 -    MPV_common_init_armv4l(s);
  69 +/*    MPV_common_init_armv4l(s); */
  70 +/* Brute-force approach: the IDCT hooks are set directly here to avoid reworking everything for now; to be improved later. */
  71 +    s->idct_put= ff_jref_idct_put_armv4l;
  72 +    s->idct_add= ff_jref_idct_add_armv4l;
  73 +    s->idct_permutation_type= FF_NO_IDCT_PERM;
  74  #endif
  75  #ifdef ARCH_POWERPC
  76      MPV_common_init_ppc(s);
  77 --- xine-lib-1-rc7/src/libffmpeg/libavcodec/armv4l/jrevdct_arm.S~libxine-libavcodec
  78 +++ xine-lib-1-rc7/src/libffmpeg/libavcodec/armv4l/jrevdct_arm.S
  79 @@ -1,386 +1,491 @@
  80  /*
  81 -   C-like prototype :
  82 -       void j_rev_dct_ARM(DCTBLOCK data)
  83 + * jrevdct_arm.S
  84 + * Copyright (C) 2002 Frederic 'dilb' Boulay.
  85 + * All Rights Reserved.
  86 + *
  87 + * Author: Frederic Boulay <dilb@handhelds.org>
  88 + *
  89 + * you can redistribute this file and/or modify
  90 + * it under the terms of the GNU General Public License (version 2)
  91 + * as published by the Free Software Foundation.
  92 + *
  93 + * This file is distributed in the hope that it will be useful,
  94 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  95 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  96 + * GNU General Public License for more details.
  97 + *
  98 + * You should have received a copy of the GNU General Public License
  99 + * along with this program; if not, write to the Free Software
 100 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 101 + *
 102 + *
 103 + * The function defined in this file, is derived from the simple_idct function from
 104 + * the libavcodec library part of the ffmpeg project.
 105 + */
 106
 107 -   With DCTBLOCK being a pointer to an array of 64 'signed shorts'
 108
 109 -   Copyright (c) 2001 Lionel Ulmer (lionel.ulmer@free.fr / bbrox@bbrox.org)
 110
 111 -   Permission is hereby granted, free of charge, to any person obtaining a copy
 112 -   of this software and associated documentation files (the "Software"), to deal
 113 -   in the Software without restriction, including without limitation the rights
 114 -   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 115 -   copies of the Software, and to permit persons to whom the Software is
 116 -   furnished to do so, subject to the following conditions:
 117
 118 -   The above copyright notice and this permission notice shall be included in
 119 -   all copies or substantial portions of the Software.
 120 +/* useful constants for the algorithm, they are stored in __constant_ptr__ at the end of the source code.*/
 121 +#define W1  22725
 122 +#define W2  21407
 123 +#define W3  19266
 124 +#define W4  16383
 125 +#define W5  12873
 126 +#define W6  8867
 127 +#define W7  4520
 128 +#define MASK_MSHW 0xFFFF0000
 129 +
 130 +/* offsets of the constants in the vector*/
 131 +#define offW1  0
 132 +#define offW2  4
 133 +#define offW3  8
 134 +#define offW4  12
 135 +#define offW5  16
 136 +#define offW6  20
 137 +#define offW7  24
 138 +#define offMASK_MSHW 28
 139 +
 140 +#define ROW_SHIFT 11
 141 +#define ROW_SHIFT2MSHW (16-11)
 142 +#define COL_SHIFT 20
 143 +#define ROW_SHIFTED_1 1024 /* 1<< (ROW_SHIFT-1)*/
 144 +#define COL_SHIFTED_1 524288 /* 1<< (COL_SHIFT-1)*/
 145 +
 146
 147 -   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 148 -   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 149 -   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
 150 -   COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
 151 -   IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 152 -   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 153 -
 154 -*/
 155 -#define FIX_0_298631336 2446
 156 -#define FIX_0_541196100 4433
 157 -#define FIX_0_765366865 6270
 158 -#define FIX_1_175875602 9633
 159 -#define FIX_1_501321110 12299
 160 -#define FIX_2_053119869 16819
 161 -#define FIX_3_072711026 25172
 162 -#define FIX_M_0_390180644 -3196
 163 -#define FIX_M_0_899976223 -7373
 164 -#define FIX_M_1_847759065 -15137
 165 -#define FIX_M_1_961570560 -16069
 166 -#define FIX_M_2_562915447 -20995
 167 -#define FIX_0xFFFF 0xFFFF
 168 -
 169 -#define FIX_0_298631336_ID      0
 170 -#define FIX_0_541196100_ID      4
 171 -#define FIX_0_765366865_ID      8
 172 -#define FIX_1_175875602_ID     12
 173 -#define FIX_1_501321110_ID     16
 174 -#define FIX_2_053119869_ID     20
 175 -#define FIX_3_072711026_ID     24
 176 -#define FIX_M_0_390180644_ID   28
 177 -#define FIX_M_0_899976223_ID   32
 178 -#define FIX_M_1_847759065_ID   36
 179 -#define FIX_M_1_961570560_ID   40
 180 -#define FIX_M_2_562915447_ID   44
 181 -#define FIX_0xFFFF_ID          48
 182         .text
 183         .align
 184 -
 185         .global j_rev_dct_ARM
 186 +
 187  j_rev_dct_ARM:
 188 -       stmdb   sp!, { r4 - r12, lr }   @ all callee saved regs
 189 +simple_idct3:
 190 +simple_idct_ARM:
 191 +        @@ void simple_idct_ARM(int16_t *block)
 192 +        @@ save stack for reg needed (take all of them),
 193 +        @@ R0-R3 are scratch regs, so no need to save them, but R0 contains the pointer to block
 194 +        @@ so it must not be overwritten, if it is not saved!!
 195 +        @@ R12 is another scratch register, so it does not need to be saved either
 196 +        @@ save all registers
 197 +        stmfd sp!, {r4-r11, r14} @ R14 is also called LR
 198 +        @@ at this point, R0=block, other registers are free.
 199 +        add r14, r0, #112        @ R14=&block[8*7], better start from the last row, and decrease the value until row=0, i.e. R14=block.
 200 +        add r12, pc, #(__constant_ptr__-.-8) @ R12=__constant_ptr__, the vector containing the constants, probably not necessary to reserve a register for it
 201 +        @@ add 2 temporary variables in the stack: R0 and R14
 202 +        sub sp, sp, #8          @ allow 2 local variables
 203 +        str r0, [sp, #0]        @ save block in sp[0]
 204 +        @@ stack status
 205 +        @@ sp+4   free
 206 +        @@ sp+0   R0  (block)
 207
 208 -       sub sp, sp, #4                  @ reserve some space on the stack
 209 -       str r0, [ sp ]                  @ save the DCT pointer to the stack
 210
 211 -       mov lr, r0                      @ lr = pointer to the current row
 212 -       mov r12, #8                     @ r12 = row-counter
 213 -       add r11, pc, #(const_array-.-8) @ r11 = base pointer to the constants array
 214 -row_loop:
 215 -       ldrsh r0, [lr, # 0]             @ r0 = 'd0'
 216 -       ldrsh r1, [lr, # 8]             @ r1 = 'd1'
 217 +        @@ at this point, R0=block, R14=&block[56], R12=__const_ptr_, R1-R11 free
 218
 219 -       @ Optimization for row that have all items except the first set to 0
 220 -       @ (this works as the DCTELEMS are always 4-byte aligned)
 221 -       ldr r5, [lr, # 0]
 222 -       ldr r2, [lr, # 4]
 223 -       ldr r3, [lr, # 8]
 224 -       ldr r4, [lr, #12]
 225 -       orr r3, r3, r4
 226 -       orr r3, r3, r2
 227 -       orrs r5, r3, r5
 228 -       beq end_of_row_loop             @ nothing to be done as ALL of them are '0'
 229 -       orrs r2, r3, r1
 230 -       beq empty_row
 231 -
 232 -       ldrsh r2, [lr, # 2]             @ r2 = 'd2'
 233 -       ldrsh r4, [lr, # 4]             @ r4 = 'd4'
 234 -       ldrsh r6, [lr, # 6]             @ r6 = 'd6'
 235 -
 236 -       ldr r3, [r11, #FIX_0_541196100_ID]
 237 -       add r7, r2, r6
 238 -       ldr r5, [r11, #FIX_M_1_847759065_ID]
 239 -       mul r7, r3, r7                      @ r7 = z1
 240 -       ldr r3, [r11, #FIX_0_765366865_ID]
 241 -       mla r6, r5, r6, r7                  @ r6 = tmp2
 242 -       add r5, r0, r4                      @ r5 = tmp0
 243 -       mla r2, r3, r2, r7                  @ r2 = tmp3
 244 -       sub r3, r0, r4                      @ r3 = tmp1
 245
 246 -       add r0, r2, r5, lsl #13             @ r0 = tmp10
 247 -       rsb r2, r2, r5, lsl #13             @ r2 = tmp13
 248 -       add r4, r6, r3, lsl #13             @ r4 = tmp11
 249 -       rsb r3, r6, r3, lsl #13             @ r3 = tmp12
 250 +__row_loop:
 251 +        @@ read the row and check if it is null, almost null, or not, according to strongarm specs, it is not necessary to optimise ldr accesses (i.e. split 32bits in 2 16bits words), at least it gives more usable registers :)
 252 +        ldr r1, [r14, #0]        @ R1=((int32*)R14)[0]=ROWr32[0] (current row cast to a 32-bit pointer)
 253 +        ldr r2, [r14, #4]        @ R2=((int32*)R14)[1]=ROWr32[1]
 254 +        ldr r3, [r14, #8]        @ R3=ROWr32[2]
 255 +        ldr r4, [r14, #12]       @ R4=ROWr32[3]
 256 +        @@ check if the words are null, if all of them are null, then proceed with next row (branch __end_row_loop),
 257 +        @@ if ROWr16[0] is the only one not null, then proceed with this special case (branch __almost_empty_row)
 258 +        @@ else follow the complete algorithm.
 259 +        @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
 260 +        @@                R3=ROWr32[2], R4=ROWr32[3], R5-R11 free
 261 +        orr r5, r4, r3           @ R5=R4 | R3
 262 +        orr r5, r5, r2           @ R5=R4 | R3 | R2
 263 +        orrs r6, r5, r1          @ Test R5 | R1 (the aim is to check if everything is null)
 264 +        beq __end_row_loop
 265 +        mov r7, r1, asr #16      @ R7=R1>>16=ROWr16[1] (evaluate it now, as it could be useful later)
 266 +        ldrsh r6, [r14, #0]      @ R6=ROWr16[0]
 267 +        orrs r5, r5, r7          @ R5=R4 | R3 | R2 | R7
 268 +        beq __almost_empty_row
 269
 270 -       stmdb   sp!, { r0, r2, r3, r4 } @ save on the stack tmp10, tmp13, tmp12, tmp11
 271 -
 272 -       ldrsh r3, [lr, #10]             @ r3 = 'd3'
 273 -       ldrsh r5, [lr, #12]             @ r5 = 'd5'
 274 -       ldrsh r7, [lr, #14]             @ r7 = 'd7'
 275 +__b_evaluation:
 276 +        @@ at this point, R0=block (temp),  R1(free), R2=ROWr32[1], R3=ROWr32[2], R4=ROWr32[3],
 277 +        @@     R5=(temp), R6=ROWr16[0], R7=ROWr16[1], R8-R11 free,
 278 +        @@     R12=__const_ptr_, R14=&block[n]
 279 +        @@ to save some registers/calls, proceed with b0-b3 first, followed by a0-a3
 280
 281 -       add r0, r3, r5                  @ r0 = 'z2'
 282 -       add r2, r1, r7                  @ r2 = 'z1'
 283 -       add r4, r3, r7                  @ r4 = 'z3'
 284 -       add r6, r1, r5                  @ r6 = 'z4'
 285 -       ldr r9, [r11, #FIX_1_175875602_ID]
 286 -       add r8, r4, r6                  @ r8 = z3 + z4
 287 -       ldr r10, [r11, #FIX_M_0_899976223_ID]
 288 -       mul r8, r9, r8                  @ r8 = 'z5'
 289 -       ldr r9, [r11, #FIX_M_2_562915447_ID]
 290 -       mul r2, r10, r2                 @ r2 = 'z1'
 291 -       ldr r10, [r11, #FIX_M_1_961570560_ID]
 292 -       mul r0, r9, r0                  @ r0 = 'z2'
 293 -       ldr r9, [r11, #FIX_M_0_390180644_ID]
 294 -       mla r4, r10, r4, r8             @ r4 = 'z3'
 295 -       ldr r10, [r11, #FIX_0_298631336_ID]
 296 -       mla r6, r9, r6, r8              @ r6 = 'z4'
 297 -       ldr r9, [r11, #FIX_2_053119869_ID]
 298 -       mla r7, r10, r7, r2             @ r7 = tmp0 + z1
 299 -       ldr r10, [r11, #FIX_3_072711026_ID]
 300 -       mla r5, r9, r5, r0              @ r5 = tmp1 + z2
 301 -       ldr r9, [r11, #FIX_1_501321110_ID]
 302 -       mla r3, r10, r3, r0             @ r3 = tmp2 + z2
 303 -       add r7, r7, r4                  @ r7 = tmp0
 304 -       mla r1, r9, r1, r2              @ r1 = tmp3 + z1
 305 -       add r5, r5, r6                  @ r5 = tmp1
 306 -       add r3, r3, r4                  @ r3 = tmp2
 307 -       add r1, r1, r6                  @ r1 = tmp3
 308 +        @@ MUL16(b0, W1, row[1]);
 309 +        @@ MUL16(b1, W3, row[1]);
 310 +        @@ MUL16(b2, W5, row[1]);
 311 +        @@ MUL16(b3, W7, row[1]);
 312 +        @@ MAC16(b0, W3, row[3]);
 313 +        @@ MAC16(b1, -W7, row[3]);
 314 +        @@ MAC16(b2, -W1, row[3]);
 315 +        @@ MAC16(b3, -W5, row[3]);
 316 +        ldr r8, [r12, #offW1]    @ R8=W1
 317 +        mov r2, r2, asr #16      @ R2=ROWr16[3]
 318 +        mul r0, r8, r7           @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
 319 +        ldr r9, [r12, #offW3]    @ R9=W3
 320 +        ldr r10, [r12, #offW5]   @ R10=W5
 321 +        mul r1, r9, r7           @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
 322 +        ldr r11, [r12, #offW7]   @ R11=W7
 323 +        mul r5, r10, r7          @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
 324 +        mul r7, r11, r7          @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
 325 +               teq r2, #0               @ if null avoid muls
 326 +               mlane r0, r9, r2, r0     @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
 327 +        rsbne r2, r2, #0         @ R2=-ROWr16[3]
 328 +        mlane r1, r11, r2, r1    @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
 329 +        mlane r5, r8, r2, r5     @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
 330 +        mlane r7, r10, r2, r7    @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
 331
 332 -       ldmia sp!, { r0, r2, r4, r6 } @ r0 = tmp10 / r2 = tmp13 / r4 = tmp12 / r6 = tmp11
 333 -                                     @ r1 = tmp3  / r3 = tmp2  / r5 = tmp1  / r7 = tmp0
 334 -
 335 -       @ Compute DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS)
 336 -       add r8, r0, r1
 337 -       add r8, r8, #(1<<10)
 338 -       mov r8, r8, asr #11
 339 -       strh r8, [lr, # 0]
 340 -
 341 -       @ Compute DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS)
 342 -       sub r8, r0, r1
 343 -       add r8, r8, #(1<<10)
 344 -       mov r8, r8, asr #11
 345 -       strh r8, [lr, #14]
 346 -
 347 -       @ Compute DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS)
 348 -       add r8, r6, r3
 349 -       add r8, r8, #(1<<10)
 350 -       mov r8, r8, asr #11
 351 -       strh r8, [lr, # 2]
 352 -
 353 -       @ Compute DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS)
 354 -       sub r8, r6, r3
 355 -       add r8, r8, #(1<<10)
 356 -       mov r8, r8, asr #11
 357 -       strh r8, [lr, #12]
 358 -
 359 -       @ Compute DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS)
 360 -       add r8, r4, r5
 361 -       add r8, r8, #(1<<10)
 362 -       mov r8, r8, asr #11
 363 -       strh r8, [lr, # 4]
 364 -
 365 -       @ Compute DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS)
 366 -       sub r8, r4, r5
 367 -       add r8, r8, #(1<<10)
 368 -       mov r8, r8, asr #11
 369 -       strh r8, [lr, #10]
 370 -
 371 -       @ Compute DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS)
 372 -       add r8, r2, r7
 373 -       add r8, r8, #(1<<10)
 374 -       mov r8, r8, asr #11
 375 -       strh r8, [lr, # 6]
 376 -
 377 -       @ Compute DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS)
 378 -       sub r8, r2, r7
 379 -       add r8, r8, #(1<<10)
 380 -       mov r8, r8, asr #11
 381 -       strh r8, [lr, # 8]
 382 +        @@ at this point, R0=b0,  R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
 383 +        @@     R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
 384 +        @@     R12=__const_ptr_, R14=&block[n]
 385 +        @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
 386 +        @@ if (temp != 0) {}
 387 +        orrs r2, r3, r4          @ R2=ROWr32[2] | ROWr32[3]
 388 +        beq __end_b_evaluation
 389
 390 -       @ End of row loop
 391 -       add lr, lr, #16
 392 -       subs r12, r12, #1
 393 -       bne row_loop
 394 -       beq start_column_loop
 395 -
 396 -empty_row:
 397 -       ldr r1, [r11, #FIX_0xFFFF_ID]
 398 -       mov r0, r0, lsl #2
 399 -       and r0, r0, r1
 400 -       add r0, r0, r0, lsl #16
 401 -       str r0, [lr, # 0]
 402 -       str r0, [lr, # 4]
 403 -       str r0, [lr, # 8]
 404 -       str r0, [lr, #12]
 405 +        @@ at this point, R0=b0,  R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
 406 +        @@     R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
 407 +        @@     R12=__const_ptr_, R14=&block[n]
 408 +        @@ MAC16(b0, W5, row[5]);
 409 +        @@ MAC16(b2, W7, row[5]);
 410 +        @@ MAC16(b3, W3, row[5]);
 411 +        @@ MAC16(b1, -W1, row[5]);
 412 +        @@ MAC16(b0, W7, row[7]);
 413 +        @@ MAC16(b2, W3, row[7]);
 414 +        @@ MAC16(b3, -W1, row[7]);
 415 +        @@ MAC16(b1, -W5, row[7]);
 416 +        mov r3, r3, asr #16      @ R3=ROWr16[5]
 417 +               teq r3, #0               @ if null avoid muls
 418 +        mlane r0, r10, r3, r0    @ R0+=W5*ROWr16[5]=b0
 419 +        mov r4, r4, asr #16      @ R4=ROWr16[7]
 420 +        mlane r5, r11, r3, r5    @ R5+=W7*ROWr16[5]=b2
 421 +        mlane r7, r9, r3, r7     @ R7+=W3*ROWr16[5]=b3
 422 +        rsbne r3, r3, #0         @ R3=-ROWr16[5]
 423 +        mlane r1, r8, r3, r1     @ R1-=W1*ROWr16[5]=b1
 424 +        @@ R3 is free now
 425 +               teq r4, #0               @ if null avoid muls
 426 +        mlane r0, r11, r4, r0    @ R0+=W7*ROWr16[7]=b0
 427 +        mlane r5, r9, r4, r5     @ R5+=W3*ROWr16[7]=b2
 428 +        rsbne r4, r4, #0         @ R4=-ROWr16[7]
 429 +        mlane r7, r8, r4, r7     @ R7-=W1*ROWr16[7]=b3
 430 +        mlane r1, r10, r4, r1    @ R1-=W5*ROWr16[7]=b1
 431 +        @@ R4 is free now
 432 +__end_b_evaluation:
 433 +        @@ at this point, R0=b0,  R1=b1, R2=ROWr32[2] | ROWr32[3] (tmp), R3 (free), R4 (free),
 434 +        @@     R5=b2, R6=ROWr16[0], R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
 435 +        @@     R12=__const_ptr_, R14=&block[n]
 436
 437 -end_of_row_loop:
 438 -       @ End of loop
 439 -       add lr, lr, #16
 440 -       subs r12, r12, #1
 441 -       bne row_loop
 442 +__a_evaluation:
 443 +        @@ a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1));
 444 +        @@ a1 = a0 + W6 * row[2];
 445 +        @@ a2 = a0 - W6 * row[2];
 446 +        @@ a3 = a0 - W2 * row[2];
 447 +        @@ a0 = a0 + W2 * row[2];
 448 +        ldr r9, [r12, #offW4]    @ R9=W4
 449 +        mul r6, r9, r6           @ R6=W4*ROWr16[0]
 450 +        ldr r10, [r12, #offW6]   @ R10=W6
 451 +        ldrsh r4, [r14, #4]      @ R4=ROWr16[2] (a3 not defined yet)
 452 +        add r6, r6, #ROW_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(ROW_SHIFT-1) (a0)
 453
 454 -start_column_loop:
 455 -       @ Start of column loop
 456 -       ldr lr, [ sp ]
 457 -       mov r12, #8
 458 -column_loop:
 459 -       ldrsh r0, [lr, #( 0*8)]             @ r0 = 'd0'
 460 -       ldrsh r2, [lr, #( 4*8)]             @ r2 = 'd2'
 461 -       ldrsh r4, [lr, #( 8*8)]             @ r4 = 'd4'
 462 -       ldrsh r6, [lr, #(12*8)]             @ r6 = 'd6'
 463 +        mul r11, r10, r4         @ R11=W6*ROWr16[2]
 464 +        ldr r8, [r12, #offW2]    @ R8=W2
 465 +        sub r3, r6, r11          @ R3=a0-W6*ROWr16[2] (a2)
 466 +        @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
 467 +        @@ if (temp != 0) {}
 468 +        teq r2, #0
 469 +        beq __end_bef_a_evaluation
 470
 471 -       ldr r3, [r11, #FIX_0_541196100_ID]
 472 -       add r1, r2, r6
 473 -       ldr r5, [r11, #FIX_M_1_847759065_ID]
 474 -       mul r1, r3, r1                      @ r1 = z1
 475 -       ldr r3, [r11, #FIX_0_765366865_ID]
 476 -       mla r6, r5, r6, r1                  @ r6 = tmp2
 477 -       add r5, r0, r4                      @ r5 = tmp0
 478 -       mla r2, r3, r2, r1                  @ r2 = tmp3
 479 -       sub r3, r0, r4                      @ r3 = tmp1
 480 +       add r2, r6, r11          @ R2=a0+W6*ROWr16[2] (a1)
 481 +        mul r11, r8, r4          @ R11=W2*ROWr16[2]
 482 +        sub r4, r6, r11          @ R4=a0-W2*ROWr16[2] (a3)
 483 +        add r6, r6, r11          @ R6=a0+W2*ROWr16[2] (a0)
 484
 485 -       add r0, r2, r5, lsl #13             @ r0 = tmp10
 486 -       rsb r2, r2, r5, lsl #13             @ r2 = tmp13
 487 -       add r4, r6, r3, lsl #13             @ r4 = tmp11
 488 -       rsb r6, r6, r3, lsl #13             @ r6 = tmp12
 489
 490 -       ldrsh r1, [lr, #( 2*8)]             @ r1 = 'd1'
 491 -       ldrsh r3, [lr, #( 6*8)]             @ r3 = 'd3'
 492 -       ldrsh r5, [lr, #(10*8)]             @ r5 = 'd5'
 493 -       ldrsh r7, [lr, #(14*8)]             @ r7 = 'd7'
 494 +        @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
 495 +        @@     R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
 496 +        @@     R12=__const_ptr_, R14=&block[n]
 497
 498 -       @ Check for empty odd column (happens about 20 to 25 % of the time according to my stats)
 499 -       orr r9, r1, r3
 500 -       orr r10, r5, r7
 501 -       orrs r10, r9, r10
 502 -       beq empty_odd_column
 503
 504 -       stmdb   sp!, { r0, r2, r4, r6 } @ save on the stack tmp10, tmp13, tmp12, tmp11
 505 -
 506 -       add r0, r3, r5                  @ r0 = 'z2'
 507 -       add r2, r1, r7                  @ r2 = 'z1'
 508 -       add r4, r3, r7                  @ r4 = 'z3'
 509 -       add r6, r1, r5                  @ r6 = 'z4'
 510 -       ldr r9, [r11, #FIX_1_175875602_ID]
 511 -       add r8, r4, r6
 512 -       ldr r10, [r11, #FIX_M_0_899976223_ID]
 513 -       mul r8, r9, r8                  @ r8 = 'z5'
 514 -       ldr r9, [r11, #FIX_M_2_562915447_ID]
 515 -       mul r2, r10, r2                 @ r2 = 'z1'
 516 -       ldr r10, [r11, #FIX_M_1_961570560_ID]
 517 -       mul r0, r9, r0                  @ r0 = 'z2'
 518 -       ldr r9, [r11, #FIX_M_0_390180644_ID]
 519 -       mla r4, r10, r4, r8             @ r4 = 'z3'
 520 -       ldr r10, [r11, #FIX_0_298631336_ID]
 521 -       mla r6, r9, r6, r8              @ r6 = 'z4'
 522 -       ldr r9, [r11, #FIX_2_053119869_ID]
 523 -       mla r7, r10, r7, r2             @ r7 = tmp0 + z1
 524 -       ldr r10, [r11, #FIX_3_072711026_ID]
 525 -       mla r5, r9, r5, r0              @ r5 = tmp1 + z2
 526 -       ldr r9, [r11, #FIX_1_501321110_ID]
 527 -       mla r3, r10, r3, r0             @ r3 = tmp2 + z2
 528 -       add r7, r7, r4                  @ r7 = tmp0
 529 -       mla r1, r9, r1, r2              @ r1 = tmp3 + z1
 530 -       add r5, r5, r6                  @ r5 = tmp1
 531 -       add r3, r3, r4                  @ r3 = tmp2
 532 -       add r1, r1, r6                  @ r1 = tmp3
 533 -
 534 -       ldmia sp!, { r0, r2, r4, r6 } @ r0 = tmp10 / r2 = tmp13 / r4 = tmp11 / r6 = tmp12
 535 -                                     @ r1 = tmp3  / r3 = tmp2  / r5 = tmp1  / r7 = tmp0
 536 +        @@ a0 += W4*row[4]
 537 +        @@ a1 -= W4*row[4]
 538 +        @@ a2 -= W4*row[4]
 539 +        @@ a3 += W4*row[4]
 540 +        ldrsh r11, [r14, #8]     @ R11=ROWr16[4]
 541 +               teq r11, #0              @ if null avoid muls
 542 +        mulne r11, r9, r11       @ R11=W4*ROWr16[4]
 543 +        @@ R9 is free now
 544 +        ldrsh r9, [r14, #12]     @ R9=ROWr16[6]
 545 +        addne r6, r6, r11        @ R6+=W4*ROWr16[4] (a0)
 546 +        subne r2, r2, r11        @ R2-=W4*ROWr16[4] (a1)
 547 +        subne r3, r3, r11        @ R3-=W4*ROWr16[4] (a2)
 548 +        addne r4, r4, r11        @ R4+=W4*ROWr16[4] (a3)
 549 +        @@ once W6*ROWr16[6] is computed, W6 (R10) is no longer needed, so R10 is reused for W2*ROWr16[6]
 550 +               teq r9, #0               @ if null avoid muls
 551 +        mulne r11, r10, r9       @ R11=W6*ROWr16[6]
 552 +        addne r6, r6, r11        @ R6+=W6*ROWr16[6] (a0)
 553 +        mulne r10, r8, r9        @ R10=W2*ROWr16[6]
 554 +        @@ a0 += W6*row[6];
 555 +        @@ a3 -= W6*row[6];
 556 +        @@ a1 -= W2*row[6];
 557 +        @@ a2 += W2*row[6];
 558 +        subne r4, r4, r11        @ R4-=W6*ROWr16[6] (a3)
 559 +        subne r2, r2, r10        @ R2-=W2*ROWr16[6] (a1)
 560 +        addne r3, r3, r10        @ R3+=W2*ROWr16[6] (a2)
 561
 562 -       @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3)
 563 -       add r8, r0, r1
 564 -       add r8, r8, #(1<<17)
 565 -       mov r8, r8, asr #18
 566 -       strh r8, [lr, #( 0*8)]
 567 -
 568 -       @ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3)
 569 -       sub r8, r0, r1
 570 -       add r8, r8, #(1<<17)
 571 -       mov r8, r8, asr #18
 572 -       strh r8, [lr, #(14*8)]
 573 -
 574 -       @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3)
 575 -       add r8, r4, r3
 576 -       add r8, r8, #(1<<17)
 577 -       mov r8, r8, asr #18
 578 -       strh r8, [lr, #( 2*8)]
 579 -
 580 -       @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3)
 581 -       sub r8, r4, r3
 582 -       add r8, r8, #(1<<17)
 583 -       mov r8, r8, asr #18
 584 -       strh r8, [lr, #(12*8)]
 585 -
 586 -       @ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3)
 587 -       add r8, r6, r5
 588 -       add r8, r8, #(1<<17)
 589 -       mov r8, r8, asr #18
 590 -       strh r8, [lr, #( 4*8)]
 591 -
 592 -       @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3)
 593 -       sub r8, r6, r5
 594 -       add r8, r8, #(1<<17)
 595 -       mov r8, r8, asr #18
 596 -       strh r8, [lr, #(10*8)]
 597 -
 598 -       @ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3)
 599 -       add r8, r2, r7
 600 -       add r8, r8, #(1<<17)
 601 -       mov r8, r8, asr #18
 602 -       strh r8, [lr, #( 6*8)]
 603 -
 604 -       @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3)
 605 -       sub r8, r2, r7
 606 -       add r8, r8, #(1<<17)
 607 -       mov r8, r8, asr #18
 608 -       strh r8, [lr, #( 8*8)]
 609 +__end_a_evaluation:
 610 +        @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
 611 +        @@     R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
 612 +        @@     R12=__const_ptr_, R14=&block[n]
 613 +        @@ row[0] = (a0 + b0) >> ROW_SHIFT;
 614 +        @@ row[1] = (a1 + b1) >> ROW_SHIFT;
 615 +        @@ row[2] = (a2 + b2) >> ROW_SHIFT;
 616 +        @@ row[3] = (a3 + b3) >> ROW_SHIFT;
 617 +        @@ row[4] = (a3 - b3) >> ROW_SHIFT;
 618 +        @@ row[5] = (a2 - b2) >> ROW_SHIFT;
 619 +        @@ row[6] = (a1 - b1) >> ROW_SHIFT;
 620 +        @@ row[7] = (a0 - b0) >> ROW_SHIFT;
 621 +        add r8, r6, r0           @ R8=a0+b0
 622 +        add r9, r2, r1           @ R9=a1+b1
 623 +        @@ put 2 16 bits half-words in a 32bits word
 624 +        @@ ROWr32[0]=ROWr16[0] | (ROWr16[1]<<16) (only Little Endian compliant then!!!)
 625 +        ldr r10, [r12, #offMASK_MSHW] @ R10=0xFFFF0000
 626 +        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a1+b1)<<5)
 627 +        mvn r11, r10             @ R11= NOT R10= 0x0000FFFF
 628 +        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a0+b0)>>11)
 629 +        orr r8, r8, r9
 630 +        str r8, [r14, #0]
 631
 632 -       @ End of row loop
 633 -       add lr, lr, #2
 634 -       subs r12, r12, #1
 635 -       bne column_loop
 636 -       beq the_end
 637 -
 638 -empty_odd_column:
 639 -       @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3)
 640 -       @ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3)
 641 -       add r0, r0, #(1<<17)
 642 -       mov r0, r0, asr #18
 643 -       strh r0, [lr, #( 0*8)]
 644 -       strh r0, [lr, #(14*8)]
 645 -
 646 -       @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3)
 647 -       @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3)
 648 -       add r4, r4, #(1<<17)
 649 -       mov r4, r4, asr #18
 650 -       strh r4, [lr, #( 2*8)]
 651 -       strh r4, [lr, #(12*8)]
 652 -
 653 -       @ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3)
 654 -       @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3)
 655 -       add r6, r6, #(1<<17)
 656 -       mov r6, r6, asr #18
 657 -       strh r6, [lr, #( 4*8)]
 658 -       strh r6, [lr, #(10*8)]
 659 -
 660 -       @ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3)
 661 -       @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3)
 662 -       add r2, r2, #(1<<17)
 663 -       mov r2, r2, asr #18
 664 -       strh r2, [lr, #( 6*8)]
 665 -       strh r2, [lr, #( 8*8)]
 666 +        add r8, r3, r5           @ R8=a2+b2
 667 +        add r9, r4, r7           @ R9=a3+b3
 668 +        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a3+b3)<<5)
 669 +        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a2+b2)>>11)
 670 +        orr r8, r8, r9
 671 +        str r8, [r14, #4]
 672
 673 -       @ End of row loop
 674 -       add lr, lr, #2
 675 -       subs r12, r12, #1
 676 -       bne column_loop
 677 -
 678 -the_end:
 679 -       @ The end....
 680 -       add sp, sp, #4
 681 -       ldmia   sp!, { r4 - r12, pc }   @ restore callee saved regs and return
 682 +        sub r8, r4, r7           @ R8=a3-b3
 683 +        sub r9, r3, r5           @ R9=a2-b2
 684 +        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a2-b2)<<5)
 685 +        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a3-b3)>>11)
 686 +        orr r8, r8, r9
 687 +        str r8, [r14, #8]
 688
 689 -const_array:
 690 +        sub r8, r2, r1           @ R8=a1-b1
 691 +        sub r9, r6, r0           @ R9=a0-b0
 692 +        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a0-b0)<<5)
 693 +        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a1-b1)>>11)
 694 +        orr r8, r8, r9
 695 +        str r8, [r14, #12]
 696 +
 697 +        bal __end_row_loop
 698 +
 699 +__almost_empty_row:
 700 +        @@ special case: the row is empty except for ROWr16[0], so every output word of the row is built from that single value
 701 +        @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
 702 +        @@                R3=ROWr32[2], R4=ROWr32[3], R5=(temp), R6=ROWr16[0], R7=ROWr16[1],
 703 +        @@                R8=0xFFFF (temp), R9-R11 free
 704 +        mov r8, #0x10000         @ R8=0x10000: first of the 2 steps that build 0xFFFF; done this way to save an ldr (original note: "because of delay run").
 705 +        sub r8, r8, #1           @ R8=0x10000-1=0xFFFF, the mask is now ready.
 706 +        and r5, r8, r6, lsl #3   @ R5=R8 & (R6<<3) = (ROWr16[0]<<3) & 0xFFFF
 707 +        orr r5, r5, r5, lsl #16  @ R5=R5 | (R5<<16): the same 16-bit value in both halfwords
 708 +        str r5, [r14, #0]        @ ROWr32[0]=R5 (word at R14+0)
 709 +        str r5, [r14, #4]        @ ROWr32[1]=R5 (word at R14+4)
 710 +        str r5, [r14, #8]        @ ROWr32[2]=R5 (word at R14+8)
 711 +        str r5, [r14, #12]       @ ROWr32[3]=R5 (word at R14+12)
 712 +
 713 +__end_row_loop:
 714 +        @@ end of one row iteration; at this point, R0-R11 (free)
 715 +        @@     R12=__const_ptr_, R14=&block[n]
 716 +        ldr r0, [sp, #0]         @ R0=block (reloaded from the local slot at the top of the stack)
 717 +        teq r0, r14              @ Z is set when R14==block, i.e. the row just processed starts at block: the loop is then finished.
 718 +        sub r14, r14, #16        @ R14-=16 bytes (one row of 8 halfwords); no S suffix, so the flags set by teq are preserved
 719 +        bne __row_loop           @ loop again while R14 was different from block
 720 +
 721 +
 722 +
 723 +       @@ column pass; at this point, R0=block, R1-R11 (free)
 724 +       @@     R12=__const_ptr_, R14=&block[n]
 725 +       add r14, r0, #14        @ R14=&block[7]: start from the last column and step back until column 0, i.e. R14=block.
 726 +__col_loop:
 727 +
 728 +__b_evaluation2:
 729 +       @@ at this point, R0=block (temp),  R1-R11 (free)
 730 +       @@     R12=__const_ptr_, R14=&block[n]
 731 +       @@ proceed with b0-b3 first, followed by a0-a3
 732 +       @@ MUL16(b0, W1, col[8x1]);
 733 +       @@ MUL16(b1, W3, col[8x1]);
 734 +       @@ MUL16(b2, W5, col[8x1]);
 735 +       @@ MUL16(b3, W7, col[8x1]);
 736 +       @@ MAC16(b0, W3, col[8x3]);
 737 +       @@ MAC16(b1, -W7, col[8x3]);
 738 +       @@ MAC16(b2, -W1, col[8x3]);
 739 +       @@ MAC16(b3, -W5, col[8x3]);
 740 +       ldr r8, [r12, #offW1]    @ R8=W1
 741 +       ldrsh r7, [r14, #16]     @ R7=COLr16[8x1] (signed halfword, 16 bytes after R14)
 742 +       mul r0, r8, r7           @ R0=W1*COLr16[8x1]=b0 (COLr16[8x1] must be the second arg, to have the possibility to save 1 cycle)
 743 +       ldr r9, [r12, #offW3]    @ R9=W3
 744 +       ldr r10, [r12, #offW5]   @ R10=W5
 745 +       mul r1, r9, r7           @ R1=W3*COLr16[8x1]=b1 (COLr16[8x1] must be the second arg, to have the possibility to save 1 cycle)
 746 +       ldr r11, [r12, #offW7]   @ R11=W7
 747 +       mul r5, r10, r7          @ R5=W5*COLr16[8x1]=b2 (COLr16[8x1] must be the second arg, to have the possibility to save 1 cycle)
 748 +       ldrsh r2, [r14, #48]     @ R2=COLr16[8x3] (signed halfword, 48 bytes after R14)
 749 +       mul r7, r11, r7          @ R7=W7*COLr16[8x1]=b3 (COLr16[8x1] must be the second arg, to have the possibility to save 1 cycle); R7 no longer holds COLr16[8x1]
 750 +       teq r2, #0               @ if COLr16[8x3] is 0, then avoid the muls below (all of them are conditional on NE)
 751 +       mlane r0, r9, r2, r0     @ R0+=W3*COLr16[8x3]=b0 (COLr16[8x3] must be the second arg, to have the possibility to save 1 cycle)
 752 +       rsbne r2, r2, #0         @ R2=-COLr16[8x3]
 753 +       mlane r1, r11, r2, r1    @ R1-=W7*COLr16[8x3]=b1 (COLr16[8x3] must be the second arg, to have the possibility to save 1 cycle)
 754 +       mlane r5, r8, r2, r5     @ R5-=W1*COLr16[8x3]=b2 (COLr16[8x3] must be the second arg, to have the possibility to save 1 cycle)
 755 +       mlane r7, r10, r2, r7    @ R7-=W5*COLr16[8x3]=b3 (COLr16[8x3] must be the second arg, to have the possibility to save 1 cycle)
 756 +
 757 +       @@ at this point, R0=b0,  R1=b1, R2 (free), R3 (free), R4 (free),
 758 +       @@     R5=b2, R6 (free), R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
 759 +       @@     R12=__const_ptr_, R14=&block[n]
 760 +       @@ MAC16(b0, W5, col[5x8]);
 761 +       @@ MAC16(b2, W7, col[5x8]);
 762 +       @@ MAC16(b3, W3, col[5x8]);
 763 +       @@ MAC16(b1, -W1, col[5x8]);
 764 +       @@ MAC16(b0, W7, col[7x8]);
 765 +       @@ MAC16(b2, W3, col[7x8]);
 766 +       @@ MAC16(b3, -W1, col[7x8]);
 767 +       @@ MAC16(b1, -W5, col[7x8]);
 768 +       ldrsh r3, [r14, #80]     @ R3=COLr16[5x8]
 769 +       teq r3, #0               @ if 0 then avoid muls
 770 +       mlane r0, r10, r3, r0    @ R0+=W5*COLr16[5x8]=b0
 771 +       mlane r5, r11, r3, r5    @ R5+=W7*COLr16[5x8]=b2
 772 +       mlane r7, r9, r3, r7     @ R7+=W3*COLr16[5x8]=b3
 773 +       rsbne r3, r3, #0         @ R3=-COLr16[5x8]
 774 +       ldrsh r4, [r14, #112]    @ R4=COLr16[7x8] (the load does not alter the flags, so the next mlane still depends on the teq of R3)
 775 +       mlane r1, r8, r3, r1     @ R1-=W1*COLr16[5x8]=b1
 776 +       @@ R3 is free now
 777 +       teq r4, #0               @ if 0 then avoid muls
 778 +       mlane r0, r11, r4, r0    @ R0+=W7*COLr16[7x8]=b0
 779 +       mlane r5, r9, r4, r5     @ R5+=W3*COLr16[7x8]=b2
 780 +       rsbne r4, r4, #0         @ R4=-COLr16[7x8]
 781 +       mlane r7, r8, r4, r7     @ R7-=W1*COLr16[7x8]=b3
 782 +       mlane r1, r10, r4, r1    @ R1-=W5*COLr16[7x8]=b1
 783 +       @@ R4 is free now
 784 +__end_b_evaluation2:
 785 +       @@ at this point, R0=b0,  R1=b1, R2 (free), R3 (free), R4 (free),
 786 +       @@     R5=b2, R6 (free), R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
 787 +       @@     R12=__const_ptr_, R14=&block[n]
 788 +
 789 +__a_evaluation2:
 790 +       @@ a0 = (W4 * col[8x0]) + (1 << (COL_SHIFT - 1));
 791 +       @@ a1 = a0 + W6 * col[8x2];
 792 +       @@ a2 = a0 - W6 * col[8x2];
 793 +       @@ a3 = a0 - W2 * col[8x2];
 794 +       @@ a0 = a0 + W2 * col[8x2];
 795 +       ldrsh r6, [r14, #0]      @ R6=COLr16[8x0]
 796 +       ldr r9, [r12, #offW4]    @ R9=W4
 797 +       mul r6, r9, r6           @ R6=W4*COLr16[8x0]
 798 +       ldr r10, [r12, #offW6]   @ R10=W6
 799 +       ldrsh r4, [r14, #32]      @ R4=COLr16[8x2] (a3 not defined yet)
 800 +       add r6, r6, #COL_SHIFTED_1 @ R6=W4*COLr16[8x0] + 1<<(COL_SHIFT-1) (a0)
 801 +       mul r11, r10, r4         @ R11=W6*COLr16[8x2]
 802 +       ldr r8, [r12, #offW2]    @ R8=W2
 803 +       add r2, r6, r11          @ R2=a0+W6*COLr16[8x2] (a1)
 804 +       sub r3, r6, r11          @ R3=a0-W6*COLr16[8x2] (a2)
 805 +       mul r11, r8, r4          @ R11=W2*COLr16[8x2]
 806 +       sub r4, r6, r11          @ R4=a0-W2*COLr16[8x2] (a3)
 807 +       add r6, r6, r11          @ R6=a0+W2*COLr16[8x2] (a0)
 808 +
 809 +       @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
 810 +       @@     R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
 811 +       @@     R12=__const_ptr_, R14=&block[n]
 812 +       @@ a0 += W4*col[8x4]
 813 +       @@ a1 -= W4*col[8x4]
 814 +       @@ a2 -= W4*col[8x4]
 815 +       @@ a3 += W4*col[8x4]
 816 +       ldrsh r11, [r14, #64]     @ R11=COLr16[8x4]
 817 +       teq r11, #0              @ if null avoid muls
 818 +       mulne r11, r9, r11       @ R11=W4*COLr16[8x4]
 819 +       @@ R9 is free now
 820 +       addne r6, r6, r11        @ R6+=W4*COLr16[8x4] (a0)
 821 +       subne r2, r2, r11        @ R2-=W4*COLr16[8x4] (a1)
 822 +       subne r3, r3, r11        @ R3-=W4*COLr16[8x4] (a2)
 823 +       ldrsh r9, [r14, #96]     @ R9=COLr16[8x6] (loaded early; the load does not alter the flags)
 824 +       addne r4, r4, r11        @ R4+=W4*COLr16[8x4] (a3), still conditional on the teq of R11 above
 825 +       @@ W6 (R10) is consumed by the next mul only, so R10 is then reused to hold W2*COLr16[8x6]
 826 +       teq r9, #0               @ if null avoid muls
 827 +       mulne r11, r10, r9       @ R11=W6*COLr16[8x6]
 828 +       addne r6, r6, r11        @ R6+=W6*COLr16[8x6] (a0)
 829 +       mulne r10, r8, r9        @ R10=W2*COLr16[8x6]
 830 +       @@ a0 += W6*col[8x6];
 831 +       @@ a3 -= W6*col[8x6];
 832 +       @@ a1 -= W2*col[8x6];
 833 +       @@ a2 += W2*col[8x6];
 834 +       subne r4, r4, r11        @ R4-=W6*COLr16[8x6] (a3)
 835 +       subne r2, r2, r10        @ R2-=W2*COLr16[8x6] (a1)
 836 +       addne r3, r3, r10        @ R3+=W2*COLr16[8x6] (a2)
 837 +__end_a_evaluation2:
 838 +       @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
 839 +       @@     R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
 840 +       @@     R12=__const_ptr_, R14=&block[n]
 841 +       @@ col[0 ] = ((a0 + b0) >> COL_SHIFT);
 842 +       @@ col[8 ] = ((a1 + b1) >> COL_SHIFT);
 843 +       @@ col[16] = ((a2 + b2) >> COL_SHIFT);
 844 +       @@ col[24] = ((a3 + b3) >> COL_SHIFT);
 845 +       @@ col[32] = ((a3 - b3) >> COL_SHIFT);
 846 +       @@ col[40] = ((a2 - b2) >> COL_SHIFT);
 847 +       @@ col[48] = ((a1 - b1) >> COL_SHIFT);
 848 +       @@ col[56] = ((a0 - b0) >> COL_SHIFT);
 849 +       @@@@@ no optimisation here: plain add/sub, arithmetic shift and halfword store, two outputs at a time @@@@@
 850 +       add r8, r6, r0           @ R8=a0+b0
 851 +       add r9, r2, r1           @ R9=a1+b1
 852 +       mov r8, r8, asr #COL_SHIFT @ R8=(a0+b0)>>COL_SHIFT
 853 +       mov r9, r9, asr #COL_SHIFT @ R9=(a1+b1)>>COL_SHIFT
 854 +       strh r8, [r14, #0]       @ col[0]=R8
 855 +       strh r9, [r14, #16]      @ col[8]=R9
 856 +       add r8, r3, r5           @ R8=a2+b2
 857 +       add r9, r4, r7           @ R9=a3+b3
 858 +       mov r8, r8, asr #COL_SHIFT @ R8=(a2+b2)>>COL_SHIFT
 859 +       mov r9, r9, asr #COL_SHIFT @ R9=(a3+b3)>>COL_SHIFT
 860 +       strh r8, [r14, #32]      @ col[16]=R8
 861 +       strh r9, [r14, #48]      @ col[24]=R9
 862 +       sub r8, r4, r7           @ R8=a3-b3
 863 +       sub r9, r3, r5           @ R9=a2-b2
 864 +       mov r8, r8, asr #COL_SHIFT @ R8=(a3-b3)>>COL_SHIFT
 865 +       mov r9, r9, asr #COL_SHIFT @ R9=(a2-b2)>>COL_SHIFT
 866 +       strh r8, [r14, #64]      @ col[32]=R8
 867 +       strh r9, [r14, #80]      @ col[40]=R9
 868 +       sub r8, r2, r1           @ R8=a1-b1
 869 +       sub r9, r6, r0           @ R9=a0-b0
 870 +       mov r8, r8, asr #COL_SHIFT @ R8=(a1-b1)>>COL_SHIFT
 871 +       mov r9, r9, asr #COL_SHIFT @ R9=(a0-b0)>>COL_SHIFT
 872 +       strh r8, [r14, #96]      @ col[48]=R8
 873 +       strh r9, [r14, #112]     @ col[56]=R9
 874 +
 875 +__end_col_loop:
 876 +       @@ end of one column iteration; at this point, R0-R11 (free)
 877 +       @@     R12=__const_ptr_, R14=&block[n]
 878 +       ldr r0, [sp, #0]         @ R0=block (reloaded from the local slot at the top of the stack)
 879 +       teq r0, r14              @ Z is set when R14==block, i.e. column 0 has just been processed: the loop is then finished.
 880 +       sub r14, r14, #2         @ R14-=2 bytes (previous column); no S suffix, so the flags set by teq are preserved
 881 +       bne __col_loop           @ loop again while R14 was different from block
 882 +
 883 +
 884 +
 885 +
 886 +__end_simple_idct_ARM:
 887 +        @@ function epilogue: release the locals, restore the saved registers and return
 888 +        add sp, sp, #8 @@ drop the 8 bytes of local variables (the slot at [sp, #0] holds block)
 889 +        ldmfd sp!, {r4-r11, r15} @@ pop R4-R11 and load PC from the stack (presumably the LR saved on entry - the prologue is outside this excerpt), which returns to the caller
 890 +
 891 +
 892 +
 893 +@@ out-of-line fragment ("kind of sub-function"): finishes a1, a3 and a0, then rejoins __end_a_evaluation; placed here so it does not weigh on the common path.
 894 +__end_bef_a_evaluation:
 895 +       add r2, r6, r11          @ R2=a0+W6*ROWr16[2] (a1); expects R6=a0 and R11=W6*ROWr16[2] on entry
 896 +        mul r11, r8, r4          @ R11=W2*ROWr16[2]; expects R8=W2 and R4=ROWr16[2] on entry
 897 +        sub r4, r6, r11          @ R4=a0-W2*ROWr16[2] (a3)
 898 +        add r6, r6, r11          @ R6=a0+W2*ROWr16[2] (a0)
 899 +       bal __end_a_evaluation   @ unconditional branch back to the end of the a-evaluation
 900 +
 901 +
 902 +__constant_ptr__:  @@ constants table; see #defines at the beginning of the source code for values. Presumably the table R12 points to (read as [r12, #offW1] etc.), so the word order would have to match the offW* offsets - TODO confirm.
 903         .align
 904 -       .word FIX_0_298631336
 905 -       .word FIX_0_541196100
 906 -       .word FIX_0_765366865
 907 -       .word FIX_1_175875602
 908 -       .word FIX_1_501321110
 909 -       .word FIX_2_053119869
 910 -       .word FIX_3_072711026
 911 -       .word FIX_M_0_390180644
 912 -       .word FIX_M_0_899976223
 913 -       .word FIX_M_1_847759065
 914 -       .word FIX_M_1_961570560
 915 -       .word FIX_M_2_562915447
 916 -       .word FIX_0xFFFF
 917 +        .word   W1
 918 +        .word   W2
 919 +        .word   W3
 920 +        .word   W4
 921 +        .word   W5
 922 +        .word   W6
 923 +        .word   W7
 924 +        .word   MASK_MSHW
 923 +        .word   W7
 924 +        .word   MASK_MSHW
 925 +
 926 +