5 # Patch managed by http://www.holgerschurig.de/patcher.html
8 --- xine-lib-1-rc7/src/libffmpeg/libavcodec/Makefile.am~libxine-libavcodec
9 +++ xine-lib-1-rc7/src/libffmpeg/libavcodec/Makefile.am
11 # we need to compile everything in debug mode, including the encoders,
12 # otherwise we get unresolved symbols, because some unsatisfied function calls
13 # are not optimized away with debug optimization
14 -AM_CFLAGS = $(LIBFFMPEG_CFLAGS) `test "$(CFLAGS)" = "$(DEBUG_CFLAGS)" && echo -DCONFIG_ENCODERS`
15 +#AM_CFLAGS = $(LIBFFMPEG_CFLAGS) `test "$(CFLAGS)" = "$(DEBUG_CFLAGS)" && echo -DCONFIG_ENCODERS`
19 +AM_CFLAGS = -DARCH_ARMV4L $(LIBFFMPEG_CFLAGS)
21 +AM_CFLAGS = $(LIBFFMPEG_CFLAGS)
23 noinst_LTLIBRARIES = libavcodec.la
25 libavcodec_la_SOURCES = \
26 --- xine-lib-1-rc7/src/libffmpeg/libavcodec/dsputil.h~libxine-libavcodec
27 +++ xine-lib-1-rc7/src/libffmpeg/libavcodec/dsputil.h
29 void ff_fdct248_islow (DCTELEM *data);
31 void j_rev_dct (DCTELEM *data);
32 +void j_rev_dct_ARM ( DCTELEM *data );
34 void ff_fdct_mmx(DCTELEM *block);
35 void ff_fdct_mmx2(DCTELEM *block);
36 --- xine-lib-1-rc7/src/libffmpeg/libavcodec/mpegvideo.c~libxine-libavcodec
37 +++ xine-lib-1-rc7/src/libffmpeg/libavcodec/mpegvideo.c
40 #endif //CONFIG_ENCODERS
45 +static void ff_jref_idct_put_armv4l(UINT8 *dest, int line_size, DCTELEM *block)
47 +// fprintf(stderr, "src/libffmpeg/libavcodec/mpegvideo.c : ff_jref_idct_put_armv4l using the ARMv4l ASM iDCT\n");
48 + j_rev_dct_ARM (block);
49 + ff_put_pixels_clamped(block, dest, line_size);
51 +static void ff_jref_idct_add_armv4l(UINT8 *dest, int line_size, DCTELEM *block)
53 +// fprintf(stderr, "src/libffmpeg/libavcodec/mpegvideo.c : ff_jref_idct_add_armv4l using the ARMv4l ASM iDCT\n");
54 + j_rev_dct_ARM (block);
55 + ff_add_pixels_clamped(block, dest, line_size);
61 /* init common dct for both encoder and decoder */
62 int DCT_common_init(MpegEncContext *s)
65 MPV_common_init_mmi(s);
68 - MPV_common_init_armv4l(s);
69 +/* MPV_common_init_armv4l(s); */
70 +/* Brute-force approach, but it avoids reworking everything right now; to be improved later */
71 + s->idct_put= ff_jref_idct_put_armv4l;
72 + s->idct_add= ff_jref_idct_add_armv4l;
73 + s->idct_permutation_type= FF_NO_IDCT_PERM;
76 MPV_common_init_ppc(s);
77 --- xine-lib-1-rc7/src/libffmpeg/libavcodec/armv4l/jrevdct_arm.S~libxine-libavcodec
78 +++ xine-lib-1-rc7/src/libffmpeg/libavcodec/armv4l/jrevdct_arm.S
82 - void j_rev_dct_ARM(DCTBLOCK data)
84 + * Copyright (C) 2002 Frederic 'dilb' Boulay.
85 + * All Rights Reserved.
87 + * Author: Frederic Boulay <dilb@handhelds.org>
89 + * you can redistribute this file and/or modify
90 + * it under the terms of the GNU General Public License (version 2)
91 + * as published by the Free Software Foundation.
93 + * This file is distributed in the hope that it will be useful,
94 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
95 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
96 + * GNU General Public License for more details.
98 + * You should have received a copy of the GNU General Public License
99 + * along with this program; if not, write to the Free Software
100 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
103 + * The function defined in this file is derived from the simple_idct function from
104 + * the libavcodec library part of the ffmpeg project.
107 - With DCTBLOCK being a pointer to an array of 64 'signed shorts'
109 - Copyright (c) 2001 Lionel Ulmer (lionel.ulmer@free.fr / bbrox@bbrox.org)
111 - Permission is hereby granted, free of charge, to any person obtaining a copy
112 - of this software and associated documentation files (the "Software"), to deal
113 - in the Software without restriction, including without limitation the rights
114 - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
115 - copies of the Software, and to permit persons to whom the Software is
116 - furnished to do so, subject to the following conditions:
118 - The above copyright notice and this permission notice shall be included in
119 - all copies or substantial portions of the Software.
120 +/* useful constants for the algorithm; they are saved in __constant_ptr__ at the end of the source code.*/
128 +#define MASK_MSHW 0xFFFF0000
130 +/* offsets of the constants in the vector*/
138 +#define offMASK_MSHW 28
140 +#define ROW_SHIFT 11
141 +#define ROW_SHIFT2MSHW (16-11)
142 +#define COL_SHIFT 20
143 +#define ROW_SHIFTED_1 1024 /* 1<< (ROW_SHIFT-1)*/
144 +#define COL_SHIFTED_1 524288 /* 1<< (COL_SHIFT-1)*/
147 - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
148 - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
149 - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
150 - COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
151 - IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
152 - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
155 -#define FIX_0_298631336 2446
156 -#define FIX_0_541196100 4433
157 -#define FIX_0_765366865 6270
158 -#define FIX_1_175875602 9633
159 -#define FIX_1_501321110 12299
160 -#define FIX_2_053119869 16819
161 -#define FIX_3_072711026 25172
162 -#define FIX_M_0_390180644 -3196
163 -#define FIX_M_0_899976223 -7373
164 -#define FIX_M_1_847759065 -15137
165 -#define FIX_M_1_961570560 -16069
166 -#define FIX_M_2_562915447 -20995
167 -#define FIX_0xFFFF 0xFFFF
169 -#define FIX_0_298631336_ID 0
170 -#define FIX_0_541196100_ID 4
171 -#define FIX_0_765366865_ID 8
172 -#define FIX_1_175875602_ID 12
173 -#define FIX_1_501321110_ID 16
174 -#define FIX_2_053119869_ID 20
175 -#define FIX_3_072711026_ID 24
176 -#define FIX_M_0_390180644_ID 28
177 -#define FIX_M_0_899976223_ID 32
178 -#define FIX_M_1_847759065_ID 36
179 -#define FIX_M_1_961570560_ID 40
180 -#define FIX_M_2_562915447_ID 44
181 -#define FIX_0xFFFF_ID 48
185 .global j_rev_dct_ARM
188 - stmdb sp!, { r4 - r12, lr } @ all callee saved regs
191 + @@ void simple_idct_ARM(int16_t *block)
192 + @@ save stack for reg needed (take all of them),
193 + @@ R0-R3 are scratch regs, so no need to save them, but R0 contains the pointer to block
194 + @@ so it must not be overwritten, if it is not saved!!
195 + @@ R12 is another scratch register, so it should not be saved too
196 + @@ save all registers
197 + stmfd sp!, {r4-r11, r14} @ R14 is also called LR
198 + @@ at this point, R0=block, other registers are free.
199 +        add r14, r0, #112        @ R14=&block[8*7], better start from the last row, and decrease the value until row=0, i.e. R14=block.
200 + add r12, pc, #(__constant_ptr__-.-8) @ R12=__constant_ptr__, the vector containing the constants, probably not necessary to reserve a register for it
201 + @@ add 2 temporary variables in the stack: R0 and R14
202 + sub sp, sp, #8 @ allow 2 local variables
203 + str r0, [sp, #0] @ save block in sp[0]
208 - sub sp, sp, #4 @ reserve some space on the stack
209 - str r0, [ sp ] @ save the DCT pointer to the stack
211 - mov lr, r0 @ lr = pointer to the current row
212 - mov r12, #8 @ r12 = row-counter
213 - add r11, pc, #(const_array-.-8) @ r11 = base pointer to the constants array
215 - ldrsh r0, [lr, # 0] @ r0 = 'd0'
216 - ldrsh r1, [lr, # 8] @ r1 = 'd1'
217 + @@ at this point, R0=block, R14=&block[56], R12=__const_ptr_, R1-R11 free
219 - @ Optimization for row that have all items except the first set to 0
220 - @ (this works as the DCTELEMS are always 4-byte aligned)
228 - beq end_of_row_loop @ nothing to be done as ALL of them are '0'
232 - ldrsh r2, [lr, # 2] @ r2 = 'd2'
233 - ldrsh r4, [lr, # 4] @ r4 = 'd4'
234 - ldrsh r6, [lr, # 6] @ r6 = 'd6'
236 - ldr r3, [r11, #FIX_0_541196100_ID]
238 - ldr r5, [r11, #FIX_M_1_847759065_ID]
239 - mul r7, r3, r7 @ r7 = z1
240 - ldr r3, [r11, #FIX_0_765366865_ID]
241 - mla r6, r5, r6, r7 @ r6 = tmp2
242 - add r5, r0, r4 @ r5 = tmp0
243 - mla r2, r3, r2, r7 @ r2 = tmp3
244 - sub r3, r0, r4 @ r3 = tmp1
246 - add r0, r2, r5, lsl #13 @ r0 = tmp10
247 - rsb r2, r2, r5, lsl #13 @ r2 = tmp13
248 - add r4, r6, r3, lsl #13 @ r4 = tmp11
249 - rsb r3, r6, r3, lsl #13 @ r3 = tmp12
251 + @@ read the row and check if it is null, almost null, or not, according to strongarm specs, it is not necessary to optimise ldr accesses (i.e. split 32bits in 2 16bits words), at least it gives more usable registers :)
252 +        ldr r1, [r14, #0]        @ R1=(int32)(R14)[0]=ROWr32[0] (relative row cast to a 32b pointer)
253 +        ldr r2, [r14, #4]        @ R2=(int32)(R14)[1]=ROWr32[1]
254 + ldr r3, [r14, #8] @ R3=ROWr32[2]
255 + ldr r4, [r14, #12] @ R4=ROWr32[3]
256 + @@ check if the words are null, if all of them are null, then proceed with next row (branch __end_row_loop),
257 + @@ if ROWr16[0] is the only one not null, then proceed with this special case (branch __almost_empty_row)
258 + @@ else follow the complete algorithm.
259 + @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
260 + @@ R3=ROWr32[2], R4=ROWr32[3], R5-R11 free
261 + orr r5, r4, r3 @ R5=R4 | R3
262 + orr r5, r5, r2 @ R5=R4 | R3 | R2
263 + orrs r6, r5, r1 @ Test R5 | R1 (the aim is to check if everything is null)
265 + mov r7, r1, asr #16 @ R7=R1>>16=ROWr16[1] (evaluate it now, as it could be useful later)
266 + ldrsh r6, [r14, #0] @ R6=ROWr16[0]
267 + orrs r5, r5, r7 @ R5=R4 | R3 | R2 | R7
268 + beq __almost_empty_row
270 - stmdb sp!, { r0, r2, r3, r4 } @ save on the stack tmp10, tmp13, tmp12, tmp11
272 - ldrsh r3, [lr, #10] @ r3 = 'd3'
273 - ldrsh r5, [lr, #12] @ r5 = 'd5'
274 - ldrsh r7, [lr, #14] @ r7 = 'd7'
276 + @@ at this point, R0=block (temp), R1(free), R2=ROWr32[1], R3=ROWr32[2], R4=ROWr32[3],
277 + @@ R5=(temp), R6=ROWr16[0], R7=ROWr16[1], R8-R11 free,
278 + @@ R12=__const_ptr_, R14=&block[n]
279 + @@ to save some registers/calls, proceed with b0-b3 first, followed by a0-a3
281 - add r0, r3, r5 @ r0 = 'z2'
282 - add r2, r1, r7 @ r2 = 'z1'
283 - add r4, r3, r7 @ r4 = 'z3'
284 - add r6, r1, r5 @ r6 = 'z4'
285 - ldr r9, [r11, #FIX_1_175875602_ID]
286 - add r8, r4, r6 @ r8 = z3 + z4
287 - ldr r10, [r11, #FIX_M_0_899976223_ID]
288 - mul r8, r9, r8 @ r8 = 'z5'
289 - ldr r9, [r11, #FIX_M_2_562915447_ID]
290 - mul r2, r10, r2 @ r2 = 'z1'
291 - ldr r10, [r11, #FIX_M_1_961570560_ID]
292 - mul r0, r9, r0 @ r0 = 'z2'
293 - ldr r9, [r11, #FIX_M_0_390180644_ID]
294 - mla r4, r10, r4, r8 @ r4 = 'z3'
295 - ldr r10, [r11, #FIX_0_298631336_ID]
296 - mla r6, r9, r6, r8 @ r6 = 'z4'
297 - ldr r9, [r11, #FIX_2_053119869_ID]
298 - mla r7, r10, r7, r2 @ r7 = tmp0 + z1
299 - ldr r10, [r11, #FIX_3_072711026_ID]
300 - mla r5, r9, r5, r0 @ r5 = tmp1 + z2
301 - ldr r9, [r11, #FIX_1_501321110_ID]
302 - mla r3, r10, r3, r0 @ r3 = tmp2 + z2
303 - add r7, r7, r4 @ r7 = tmp0
304 - mla r1, r9, r1, r2 @ r1 = tmp3 + z1
305 - add r5, r5, r6 @ r5 = tmp1
306 - add r3, r3, r4 @ r3 = tmp2
307 - add r1, r1, r6 @ r1 = tmp3
308 + @@ MUL16(b0, W1, row[1]);
309 + @@ MUL16(b1, W3, row[1]);
310 + @@ MUL16(b2, W5, row[1]);
311 + @@ MUL16(b3, W7, row[1]);
312 + @@ MAC16(b0, W3, row[3]);
313 + @@ MAC16(b1, -W7, row[3]);
314 + @@ MAC16(b2, -W1, row[3]);
315 + @@ MAC16(b3, -W5, row[3]);
316 + ldr r8, [r12, #offW1] @ R8=W1
317 + mov r2, r2, asr #16 @ R2=ROWr16[3]
318 + mul r0, r8, r7 @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
319 + ldr r9, [r12, #offW3] @ R9=W3
320 + ldr r10, [r12, #offW5] @ R10=W5
321 + mul r1, r9, r7 @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
322 + ldr r11, [r12, #offW7] @ R11=W7
323 + mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
324 + mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
325 + teq r2, #0 @ if null avoid muls
326 + mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
327 + rsbne r2, r2, #0 @ R2=-ROWr16[3]
328 + mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
329 + mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
330 + mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
332 - ldmia sp!, { r0, r2, r4, r6 } @ r0 = tmp10 / r2 = tmp13 / r4 = tmp12 / r6 = tmp11
333 - @ r1 = tmp3 / r3 = tmp2 / r5 = tmp1 / r7 = tmp0
335 - @ Compute DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS)
337 - add r8, r8, #(1<<10)
338 - mov r8, r8, asr #11
341 - @ Compute DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS)
343 - add r8, r8, #(1<<10)
344 - mov r8, r8, asr #11
347 - @ Compute DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS)
349 - add r8, r8, #(1<<10)
350 - mov r8, r8, asr #11
353 - @ Compute DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS)
355 - add r8, r8, #(1<<10)
356 - mov r8, r8, asr #11
359 - @ Compute DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS)
361 - add r8, r8, #(1<<10)
362 - mov r8, r8, asr #11
365 - @ Compute DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS)
367 - add r8, r8, #(1<<10)
368 - mov r8, r8, asr #11
371 - @ Compute DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS)
373 - add r8, r8, #(1<<10)
374 - mov r8, r8, asr #11
377 - @ Compute DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS)
379 - add r8, r8, #(1<<10)
380 - mov r8, r8, asr #11
382 + @@ at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
383 + @@ R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
384 + @@ R12=__const_ptr_, R14=&block[n]
385 + @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
386 + @@ if (temp != 0) {}
387 + orrs r2, r3, r4 @ R2=ROWr32[2] | ROWr32[3]
388 + beq __end_b_evaluation
394 - beq start_column_loop
397 - ldr r1, [r11, #FIX_0xFFFF_ID]
400 - add r0, r0, r0, lsl #16
405 + @@ at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
406 + @@ R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
407 + @@ R12=__const_ptr_, R14=&block[n]
408 + @@ MAC16(b0, W5, row[5]);
409 + @@ MAC16(b2, W7, row[5]);
410 + @@ MAC16(b3, W3, row[5]);
411 + @@ MAC16(b1, -W1, row[5]);
412 + @@ MAC16(b0, W7, row[7]);
413 + @@ MAC16(b2, W3, row[7]);
414 + @@ MAC16(b3, -W1, row[7]);
415 + @@ MAC16(b1, -W5, row[7]);
416 + mov r3, r3, asr #16 @ R3=ROWr16[5]
417 + teq r3, #0 @ if null avoid muls
418 + mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5]=b0
419 + mov r4, r4, asr #16 @ R4=ROWr16[7]
420 + mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5]=b2
421 + mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5]=b3
422 + rsbne r3, r3, #0 @ R3=-ROWr16[5]
423 +        mlane r1, r8, r3, r1     @ R1-=W1*ROWr16[5]=b1
425 + teq r4, #0 @ if null avoid muls
426 + mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7]=b0
427 + mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7]=b2
428 + rsbne r4, r4, #0 @ R4=-ROWr16[7]
429 + mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7]=b3
430 + mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7]=b1
433 + @@ at this point, R0=b0, R1=b1, R2=ROWr32[2] | ROWr32[3] (tmp), R3 (free), R4 (free),
434 + @@ R5=b2, R6=ROWr16[0], R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
435 + @@ R12=__const_ptr_, R14=&block[n]
443 + @@ a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1));
444 + @@ a1 = a0 + W6 * row[2];
445 + @@ a2 = a0 - W6 * row[2];
446 + @@ a3 = a0 - W2 * row[2];
447 + @@ a0 = a0 + W2 * row[2];
448 + ldr r9, [r12, #offW4] @ R9=W4
449 + mul r6, r9, r6 @ R6=W4*ROWr16[0]
450 + ldr r10, [r12, #offW6] @ R10=W6
451 + ldrsh r4, [r14, #4] @ R4=ROWr16[2] (a3 not defined yet)
452 + add r6, r6, #ROW_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(ROW_SHIFT-1) (a0)
455 - @ Start of column loop
459 - ldrsh r0, [lr, #( 0*8)] @ r0 = 'd0'
460 - ldrsh r2, [lr, #( 4*8)] @ r2 = 'd2'
461 - ldrsh r4, [lr, #( 8*8)] @ r4 = 'd4'
462 - ldrsh r6, [lr, #(12*8)] @ r6 = 'd6'
463 + mul r11, r10, r4 @ R11=W6*ROWr16[2]
464 + ldr r8, [r12, #offW2] @ R8=W2
465 + sub r3, r6, r11 @ R3=a0-W6*ROWr16[2] (a2)
466 + @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
467 + @@ if (temp != 0) {}
469 + beq __end_bef_a_evaluation
471 - ldr r3, [r11, #FIX_0_541196100_ID]
473 - ldr r5, [r11, #FIX_M_1_847759065_ID]
474 - mul r1, r3, r1 @ r1 = z1
475 - ldr r3, [r11, #FIX_0_765366865_ID]
476 - mla r6, r5, r6, r1 @ r6 = tmp2
477 - add r5, r0, r4 @ r5 = tmp0
478 - mla r2, r3, r2, r1 @ r2 = tmp3
479 - sub r3, r0, r4 @ r3 = tmp1
480 + add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1)
481 + mul r11, r8, r4 @ R11=W2*ROWr16[2]
482 + sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3)
483 + add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0)
485 - add r0, r2, r5, lsl #13 @ r0 = tmp10
486 - rsb r2, r2, r5, lsl #13 @ r2 = tmp13
487 - add r4, r6, r3, lsl #13 @ r4 = tmp11
488 - rsb r6, r6, r3, lsl #13 @ r6 = tmp12
490 - ldrsh r1, [lr, #( 2*8)] @ r1 = 'd1'
491 - ldrsh r3, [lr, #( 6*8)] @ r3 = 'd3'
492 - ldrsh r5, [lr, #(10*8)] @ r5 = 'd5'
493 - ldrsh r7, [lr, #(14*8)] @ r7 = 'd7'
494 + @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
495 + @@ R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
496 + @@ R12=__const_ptr_, R14=&block[n]
498 - @ Check for empty odd column (happens about 20 to 25 % of the time according to my stats)
502 - beq empty_odd_column
504 - stmdb sp!, { r0, r2, r4, r6 } @ save on the stack tmp10, tmp13, tmp12, tmp11
506 - add r0, r3, r5 @ r0 = 'z2'
507 - add r2, r1, r7 @ r2 = 'z1'
508 - add r4, r3, r7 @ r4 = 'z3'
509 - add r6, r1, r5 @ r6 = 'z4'
510 - ldr r9, [r11, #FIX_1_175875602_ID]
512 - ldr r10, [r11, #FIX_M_0_899976223_ID]
513 - mul r8, r9, r8 @ r8 = 'z5'
514 - ldr r9, [r11, #FIX_M_2_562915447_ID]
515 - mul r2, r10, r2 @ r2 = 'z1'
516 - ldr r10, [r11, #FIX_M_1_961570560_ID]
517 - mul r0, r9, r0 @ r0 = 'z2'
518 - ldr r9, [r11, #FIX_M_0_390180644_ID]
519 - mla r4, r10, r4, r8 @ r4 = 'z3'
520 - ldr r10, [r11, #FIX_0_298631336_ID]
521 - mla r6, r9, r6, r8 @ r6 = 'z4'
522 - ldr r9, [r11, #FIX_2_053119869_ID]
523 - mla r7, r10, r7, r2 @ r7 = tmp0 + z1
524 - ldr r10, [r11, #FIX_3_072711026_ID]
525 - mla r5, r9, r5, r0 @ r5 = tmp1 + z2
526 - ldr r9, [r11, #FIX_1_501321110_ID]
527 - mla r3, r10, r3, r0 @ r3 = tmp2 + z2
528 - add r7, r7, r4 @ r7 = tmp0
529 - mla r1, r9, r1, r2 @ r1 = tmp3 + z1
530 - add r5, r5, r6 @ r5 = tmp1
531 - add r3, r3, r4 @ r3 = tmp2
532 - add r1, r1, r6 @ r1 = tmp3
534 - ldmia sp!, { r0, r2, r4, r6 } @ r0 = tmp10 / r2 = tmp13 / r4 = tmp11 / r6 = tmp12
535 - @ r1 = tmp3 / r3 = tmp2 / r5 = tmp1 / r7 = tmp0
540 + ldrsh r11, [r14, #8] @ R11=ROWr16[4]
541 + teq r11, #0 @ if null avoid muls
542 + mulne r11, r9, r11 @ R11=W4*ROWr16[4]
544 + ldrsh r9, [r14, #12] @ R9=ROWr16[6]
545 + addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0)
546 + subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1)
547 + subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2)
548 + addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3)
549 + @@ W6 alone is no more useful, save W2*ROWr16[6] in it instead
550 + teq r9, #0 @ if null avoid muls
551 + mulne r11, r10, r9 @ R11=W6*ROWr16[6]
552 + addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0)
553 + mulne r10, r8, r9 @ R10=W2*ROWr16[6]
554 + @@ a0 += W6*row[6];
555 + @@ a3 -= W6*row[6];
556 + @@ a1 -= W2*row[6];
557 + @@ a2 += W2*row[6];
558 + subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3)
559 + subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1)
560 + addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2)
562 - @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3)
564 - add r8, r8, #(1<<17)
565 - mov r8, r8, asr #18
566 - strh r8, [lr, #( 0*8)]
568 - @ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3)
570 - add r8, r8, #(1<<17)
571 - mov r8, r8, asr #18
572 - strh r8, [lr, #(14*8)]
574 - @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3)
576 - add r8, r8, #(1<<17)
577 - mov r8, r8, asr #18
578 - strh r8, [lr, #( 2*8)]
580 - @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3)
582 - add r8, r8, #(1<<17)
583 - mov r8, r8, asr #18
584 - strh r8, [lr, #(12*8)]
586 - @ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3)
588 - add r8, r8, #(1<<17)
589 - mov r8, r8, asr #18
590 - strh r8, [lr, #( 4*8)]
592 - @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3)
594 - add r8, r8, #(1<<17)
595 - mov r8, r8, asr #18
596 - strh r8, [lr, #(10*8)]
598 - @ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3)
600 - add r8, r8, #(1<<17)
601 - mov r8, r8, asr #18
602 - strh r8, [lr, #( 6*8)]
604 - @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3)
606 - add r8, r8, #(1<<17)
607 - mov r8, r8, asr #18
608 - strh r8, [lr, #( 8*8)]
610 + @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
611 + @@ R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
612 + @@ R12=__const_ptr_, R14=&block[n]
613 + @@ row[0] = (a0 + b0) >> ROW_SHIFT;
614 + @@ row[1] = (a1 + b1) >> ROW_SHIFT;
615 + @@ row[2] = (a2 + b2) >> ROW_SHIFT;
616 + @@ row[3] = (a3 + b3) >> ROW_SHIFT;
617 + @@ row[4] = (a3 - b3) >> ROW_SHIFT;
618 + @@ row[5] = (a2 - b2) >> ROW_SHIFT;
619 + @@ row[6] = (a1 - b1) >> ROW_SHIFT;
620 + @@ row[7] = (a0 - b0) >> ROW_SHIFT;
621 + add r8, r6, r0 @ R8=a0+b0
622 + add r9, r2, r1 @ R9=a1+b1
623 + @@ put 2 16 bits half-words in a 32bits word
624 + @@ ROWr32[0]=ROWr16[0] | (ROWr16[1]<<16) (only Little Endian compliant then!!!)
625 + ldr r10, [r12, #offMASK_MSHW] @ R10=0xFFFF0000
626 + and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a1+b1)<<5)
627 + mvn r11, r10 @ R11= NOT R10= 0x0000FFFF
628 + and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a0+b0)>>11)
639 - @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3)
640 - @ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3)
641 - add r0, r0, #(1<<17)
642 - mov r0, r0, asr #18
643 - strh r0, [lr, #( 0*8)]
644 - strh r0, [lr, #(14*8)]
646 - @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3)
647 - @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3)
648 - add r4, r4, #(1<<17)
649 - mov r4, r4, asr #18
650 - strh r4, [lr, #( 2*8)]
651 - strh r4, [lr, #(12*8)]
653 - @ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3)
654 - @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3)
655 - add r6, r6, #(1<<17)
656 - mov r6, r6, asr #18
657 - strh r6, [lr, #( 4*8)]
658 - strh r6, [lr, #(10*8)]
660 - @ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3)
661 - @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3)
662 - add r2, r2, #(1<<17)
663 - mov r2, r2, asr #18
664 - strh r2, [lr, #( 6*8)]
665 - strh r2, [lr, #( 8*8)]
666 + add r8, r3, r5 @ R8=a2+b2
667 + add r9, r4, r7 @ R9=a3+b3
668 + and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a3+b3)<<5)
669 + and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a2+b2)>>11)
681 - ldmia sp!, { r4 - r12, pc } @ restore callee saved regs and return
682 + sub r8, r4, r7 @ R8=a3-b3
683 + sub r9, r3, r5 @ R9=a2-b2
684 + and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a2-b2)<<5)
685 + and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a3-b3)>>11)
690 + sub r8, r2, r1 @ R8=a1-b1
691 + sub r9, r6, r0 @ R9=a0-b0
692 + and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a0-b0)<<5)
693 + and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a1-b1)>>11)
700 + @@ the row was empty, except ROWr16[0], now, management of this special case
701 + @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
702 + @@ R3=ROWr32[2], R4=ROWr32[3], R5=(temp), R6=ROWr16[0], R7=ROWr16[1],
703 + @@ R8=0xFFFF (temp), R9-R11 free
704 + mov r8, #0x10000 @ R8=0xFFFF (2 steps needed!) it saves a ldr call (because of delay run).
705 + sub r8, r8, #1 @ R8 is now ready.
706 + and r5, r8, r6, lsl #3 @ R5=R8 & (R6<<3)= (ROWr16[0]<<3) & 0xFFFF
707 + orr r5, r5, r5, lsl #16 @ R5=R5 | (R5<<16)
708 + str r5, [r14, #0] @ R14[0]=ROWr32[0]=R5
709 + str r5, [r14, #4] @ R14[4]=ROWr32[1]=R5
710 + str r5, [r14, #8] @ R14[8]=ROWr32[2]=R5
711 + str r5, [r14, #12] @ R14[12]=ROWr32[3]=R5
714 + @@ at this point, R0-R11 (free)
715 + @@ R12=__const_ptr_, R14=&block[n]
716 + ldr r0, [sp, #0] @ R0=block
717 + teq r0, r14 @ compare current &block[8*n] to block, when block is reached, the loop is finished.
723 + @@ at this point, R0=block, R1-R11 (free)
724 + @@ R12=__const_ptr_, R14=&block[n]
725 + add r14, r0, #14 @ R14=&block[7], better start from the last col, and decrease the value until col=0, i.e. R14=block.
729 + @@ at this point, R0=block (temp), R1-R11 (free)
730 + @@ R12=__const_ptr_, R14=&block[n]
731 + @@ proceed with b0-b3 first, followed by a0-a3
732 + @@ MUL16(b0, W1, col[8x1]);
733 + @@ MUL16(b1, W3, col[8x1]);
734 + @@ MUL16(b2, W5, col[8x1]);
735 + @@ MUL16(b3, W7, col[8x1]);
736 + @@ MAC16(b0, W3, col[8x3]);
737 + @@ MAC16(b1, -W7, col[8x3]);
738 + @@ MAC16(b2, -W1, col[8x3]);
739 + @@ MAC16(b3, -W5, col[8x3]);
740 + ldr r8, [r12, #offW1] @ R8=W1
741 + ldrsh r7, [r14, #16]
742 + mul r0, r8, r7 @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
743 + ldr r9, [r12, #offW3] @ R9=W3
744 + ldr r10, [r12, #offW5] @ R10=W5
745 + mul r1, r9, r7 @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
746 + ldr r11, [r12, #offW7] @ R11=W7
747 + mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
748 + ldrsh r2, [r14, #48]
749 + mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
750 + teq r2, #0 @ if 0, then avoid muls
751 + mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
752 + rsbne r2, r2, #0 @ R2=-ROWr16[3]
753 + mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
754 + mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
755 + mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
757 + @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free),
758 + @@ R5=b2, R6 (free), R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
759 + @@ R12=__const_ptr_, R14=&block[n]
760 + @@ MAC16(b0, W5, col[5x8]);
761 + @@ MAC16(b2, W7, col[5x8]);
762 + @@ MAC16(b3, W3, col[5x8]);
763 + @@ MAC16(b1, -W1, col[5x8]);
764 + @@ MAC16(b0, W7, col[7x8]);
765 + @@ MAC16(b2, W3, col[7x8]);
766 + @@ MAC16(b3, -W1, col[7x8]);
767 + @@ MAC16(b1, -W5, col[7x8]);
768 + ldrsh r3, [r14, #80] @ R3=COLr16[5x8]
769 + teq r3, #0 @ if 0 then avoid muls
770 + mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5x8]=b0
771 + mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5x8]=b2
772 + mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5x8]=b3
773 + rsbne r3, r3, #0 @ R3=-ROWr16[5x8]
774 + ldrsh r4, [r14, #112] @ R4=COLr16[7x8]
775 +        mlane r1, r8, r3, r1     @ R1-=W1*ROWr16[5x8]=b1
777 + teq r4, #0 @ if 0 then avoid muls
778 + mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7x8]=b0
779 + mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7x8]=b2
780 + rsbne r4, r4, #0 @ R4=-ROWr16[7x8]
781 + mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7x8]=b3
782 + mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7x8]=b1
784 +__end_b_evaluation2:
785 + @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free),
786 + @@ R5=b2, R6 (free), R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
787 + @@ R12=__const_ptr_, R14=&block[n]
790 + @@ a0 = (W4 * col[8x0]) + (1 << (COL_SHIFT - 1));
791 + @@ a1 = a0 + W6 * row[2];
792 + @@ a2 = a0 - W6 * row[2];
793 + @@ a3 = a0 - W2 * row[2];
794 + @@ a0 = a0 + W2 * row[2];
795 + ldrsh r6, [r14, #0]
796 + ldr r9, [r12, #offW4] @ R9=W4
797 + mul r6, r9, r6 @ R6=W4*ROWr16[0]
798 + ldr r10, [r12, #offW6] @ R10=W6
799 + ldrsh r4, [r14, #32] @ R4=ROWr16[2] (a3 not defined yet)
800 + add r6, r6, #COL_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(COL_SHIFT-1) (a0)
801 + mul r11, r10, r4 @ R11=W6*ROWr16[2]
802 + ldr r8, [r12, #offW2] @ R8=W2
803 + add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1)
804 + sub r3, r6, r11 @ R3=a0-W6*ROWr16[2] (a2)
805 + mul r11, r8, r4 @ R11=W2*ROWr16[2]
806 + sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3)
807 + add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0)
809 + @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
810 + @@ R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
811 + @@ R12=__const_ptr_, R14=&block[n]
816 + ldrsh r11, [r14, #64] @ R11=ROWr16[4]
817 + teq r11, #0 @ if null avoid muls
818 + mulne r11, r9, r11 @ R11=W4*ROWr16[4]
820 + addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0)
821 + subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1)
822 + subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2)
823 + ldrsh r9, [r14, #96] @ R9=ROWr16[6]
824 + addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3)
825 + @@ W6 alone is no more useful, save W2*ROWr16[6] in it instead
826 + teq r9, #0 @ if null avoid muls
827 + mulne r11, r10, r9 @ R11=W6*ROWr16[6]
828 + addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0)
829 + mulne r10, r8, r9 @ R10=W2*ROWr16[6]
830 + @@ a0 += W6*row[6];
831 + @@ a3 -= W6*row[6];
832 + @@ a1 -= W2*row[6];
833 + @@ a2 += W2*row[6];
834 + subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3)
835 + subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1)
836 + addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2)
837 +__end_a_evaluation2:
838 + @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
839 + @@ R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
840 + @@ R12=__const_ptr_, R14=&block[n]
841 + @@ col[0 ] = ((a0 + b0) >> COL_SHIFT);
842 + @@ col[8 ] = ((a1 + b1) >> COL_SHIFT);
843 + @@ col[16] = ((a2 + b2) >> COL_SHIFT);
844 + @@ col[24] = ((a3 + b3) >> COL_SHIFT);
845 + @@ col[32] = ((a3 - b3) >> COL_SHIFT);
846 + @@ col[40] = ((a2 - b2) >> COL_SHIFT);
847 + @@ col[48] = ((a1 - b1) >> COL_SHIFT);
848 + @@ col[56] = ((a0 - b0) >> COL_SHIFT);
849 + @@@@@ no optimisation here @@@@@
850 + add r8, r6, r0 @ R8=a0+b0
851 + add r9, r2, r1 @ R9=a1+b1
852 + mov r8, r8, asr #COL_SHIFT
853 + mov r9, r9, asr #COL_SHIFT
855 + strh r9, [r14, #16]
856 + add r8, r3, r5 @ R8=a2+b2
857 + add r9, r4, r7 @ R9=a3+b3
858 + mov r8, r8, asr #COL_SHIFT
859 + mov r9, r9, asr #COL_SHIFT
860 + strh r8, [r14, #32]
861 + strh r9, [r14, #48]
862 + sub r8, r4, r7 @ R8=a3-b3
863 + sub r9, r3, r5 @ R9=a2-b2
864 + mov r8, r8, asr #COL_SHIFT
865 + mov r9, r9, asr #COL_SHIFT
866 + strh r8, [r14, #64]
867 + strh r9, [r14, #80]
868 + sub r8, r2, r1 @ R8=a1-b1
869 + sub r9, r6, r0 @ R9=a0-b0
870 + mov r8, r8, asr #COL_SHIFT
871 + mov r9, r9, asr #COL_SHIFT
872 + strh r8, [r14, #96]
873 + strh r9, [r14, #112]
876 + @@ at this point, R0-R11 (free)
877 + @@ R12=__const_ptr_, R14=&block[n]
878 + ldr r0, [sp, #0] @ R0=block
879 + teq r0, r14 @ compare current &block[n] to block, when block is reached, the loop is finished.
886 +__end_simple_idct_ARM:
887 + @@ restore registers to previous status!
888 + add sp, sp, #8 @@ the local variables!
889 + ldmfd sp!, {r4-r11, r15} @@ update PC with LR content.
893 +@@ kind of sub-function, here not to overload the common case.
894 +__end_bef_a_evaluation:
895 + add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1)
896 + mul r11, r8, r4 @ R11=W2*ROWr16[2]
897 + sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3)
898 + add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0)
899 + bal __end_a_evaluation
902 +__constant_ptr__: @@ see #defines at the beginning of the source code for values.
904 - .word FIX_0_298631336
905 - .word FIX_0_541196100
906 - .word FIX_0_765366865
907 - .word FIX_1_175875602
908 - .word FIX_1_501321110
909 - .word FIX_2_053119869
910 - .word FIX_3_072711026
911 - .word FIX_M_0_390180644
912 - .word FIX_M_0_899976223
913 - .word FIX_M_1_847759065
914 - .word FIX_M_1_961570560
915 - .word FIX_M_2_562915447