/*
 * Optimization of some functions from mpegvideo.c for armv5te
 * Copyright (c) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "asm.S"

/*
 * Special optimized version of dct_unquantize_h263_helper_c.
 *
 * Contract:
 *   - the block must be at least 8 bytes aligned (it is read with ldrd);
 *   - more elements than requested may be processed, but never more
 *     than 64 as long as the count argument is <= 64, so it is safe.
 *
 * The code is tuned for the common distribution of nCoeffs values,
 * which are mostly a multiple of 8 plus one or two extra elements:
 * the main loop handles 8 elements per iteration and an optional
 * 2-element step follows it.
 *
 * Every non-zero 16-bit element x is replaced by
 *       x * mul + add     when x > 0
 *       x * mul - add     when x < 0
 * and zero elements stay zero.
 *
 * Inner loop should take 6 cycles per element on arm926ej-s (Nokia 770)
 */

/*
 * dequant_t: dequantize the TOP halfword of a packed register.
 *
 *   out    receives the result (only meaningful in its low 16 bits once
 *          stored with strh)
 *   pair   register holding two packed 16-bit elements
 *   scale  multiplier, only its bottom halfword is used
 *   bias   value added for positive / subtracted for negative elements
 *   work   scratch register, may be the same register as out
 *
 * Expects ip == 0, so that rsbs yields the sign-extended element and
 * sets the flags from it.  When the element is zero the multiply is
 * skipped; with work == out this leaves zero in out.
 * Clobbers: work, flags.
 */
.macro  dequant_t       out, pair, scale, bias, work
        rsbs            \work, ip, \pair, asr #16
        it              gt
        addgt           \work, \bias, #0
        it              lt
        rsblt           \work, \bias, #0
        it              ne
        smlatbne        \out, \pair, \scale, \work
.endm

/*
 * dequant_b: dequantize the BOTTOM halfword of a packed register.
 *
 * Same operands as dequant_t.  The element is shifted into the top
 * half only to obtain its sign and zero-ness.  When the element is
 * zero, out is left untouched; the callers in this file pass
 * out == pair, so the low 16 bits of out are still zero in that case.
 * Expects ip == 0.
 * Clobbers: work, flags.
 */
.macro  dequant_b       out, pair, scale, bias, work
        rsbs            \work, ip, \pair, lsl #16
        it              gt
        addgt           \work, \bias, #0
        it              lt
        rsblt           \work, \bias, #0
        it              ne
        smlabbne        \out, \pair, \scale, \work
.endm

/*
 * dequant_store4: dequantize the four elements packed in the register
 * pair first/second and write them back through r0 (post-incremented by
 * 8 bytes in total).  Both top halves are handled before the bottom
 * halves, because the bottom results overwrite first/second in place.
 * Uses r1 as multiplier and r2 as bias; clobbers r8, r9, lr, flags.
 */
.macro  dequant_store4  first, second
        dequant_t       r9, \first, r1, r2, r9
        dequant_t       lr, \second, r1, r2, lr
        dequant_b       \first, \first, r1, r2, r8
        dequant_b       \second, \second, r1, r2, r8

        strh            \first, [r0], #2
        strh            r9, [r0], #2
        strh            \second, [r0], #2
        strh            lr, [r0], #2
.endm

/*
 * dequant_single: dequantize one sign-extended element held in val,
 * in place.  work is preloaded with +bias and negated for negative
 * elements; zero elements are left as they are.
 * Clobbers: work, flags.
 */
.macro  dequant_single  val, scale, bias, work
        mov             \work, \bias
        cmp             \val, #0
        it              lt
        rsblt           \work, \bias, #0
        it              ne
        smlabbne        \val, \val, \scale, \work
.endm

/*
 * ff_dct_unquantize_h263_armv5te
 *
 * In:    r0 = pointer to the 16-bit elements (8-byte aligned)
 *        r1 = multiplier (bottom halfword is used)
 *        r2 = bias added to positive / subtracted from negative elements
 *        r3 = number of elements requested
 * Out:   none; the elements are rewritten in place
 * Saved: r4-r9 and lr are pushed on entry and restored on return
 * Other: ip is set to zero for the dequant_t / dequant_b macros
 */
function ff_dct_unquantize_h263_armv5te, export=1
        push            {r4-r9,lr}
        mov             ip, #0
        /* bias the counter by -2: a count of 2 or less skips the main loop */
        subs            r3, r3, #2
        ble             2f
        ldrd            r4, [r0, #0]
1:
        /* fetch elements 4..7 before r0 is advanced by the stores below */
        ldrd            r6, [r0, #8]

        dequant_store4  r4, r5

        dequant_store4  r6, r7

        subs            r3, r3, #8
        it              gt
        ldrdgt          r4, [r0, #0] /* load data early to avoid load/use pipeline stall */
        bgt             1b

        /* undo the bias: return unless one or two elements are still owed */
        adds            r3, r3, #2
        it              le
        pople           {r4-r9,pc}
2:
        /* optional tail: always handles exactly two elements */
        ldrsh           r9, [r0, #0]
        ldrsh           lr, [r0, #2]
        dequant_single  r9, r1, r2, r8
        dequant_single  lr, r1, r2, r8
        strh            r9, [r0], #2
        strh            lr, [r0], #2
        pop             {r4-r9,pc}
endfunc