/*
 * Optimization of some functions from mpegvideo.c for armv5te
 * Copyright (c) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "asm.S"

/*
 * Dequantize the coefficient held in the TOP halfword of \src and leave
 * the result in \dst:
 *      coefficient >  0  ->  \dst = coefficient * r1.lo16 + r2
 *      coefficient <  0  ->  \dst = coefficient * r1.lo16 - r2
 *      coefficient == 0  ->  \dst = 0
 * Expects ip == 0; the rsbs then yields the sign-extended coefficient and
 * sets the flags that steer the three conditional instructions.
 */
.macro  dequant_hi      dst, src
        rsbs            \dst, ip, \src, asr #16
        addgt           \dst, r2, #0
        rsblt           \dst, r2, #0
        smlatbne        \dst, \src, r1, \dst
.endm

/*
 * Same operation for the coefficient held in the BOTTOM halfword of \reg;
 * the result replaces \reg (only its low 16 bits are stored afterwards).
 * A zero coefficient leaves \reg untouched, so its low halfword is still 0.
 * Expects ip == 0.  Uses r8 as scratch.
 */
.macro  dequant_lo      reg
        rsbs            r8, ip, \reg, asl #16
        addgt           r8, r2, #0
        rsblt           r8, r2, #0
        smlabbne        \reg, \reg, r1, r8
.endm

/*
 * Same operation for a coefficient that was loaded sign-extended into
 * \val (tail path, via ldrsh).  Uses r8 as scratch.
 */
.macro  dequant_one     val
        mov             r8, r2
        cmp             \val, #0
        rsblt           r8, r2, #0
        smlabbne        \val, \val, r1, r8
.endm

/*
 * Special optimized version of dct_unquantize_h263_helper_c.
 *
 * Inputs:
 *      r0 = pointer to the block of 16-bit coefficients, updated in place;
 *           must be at least 8 bytes aligned (the main loop uses ldrd)
 *      r1 = multiplier (only the bottom halfword is used)
 *      r2 = value added to positive and subtracted from negative
 *           coefficients
 *      r3 = count (number of coefficients requested)
 *
 * Every non-zero coefficient c becomes c * r1 + r2 (c > 0) or
 * c * r1 - r2 (c < 0); zero coefficients stay zero.
 *
 * The routine may touch more elements than requested, but never more
 * than 64 as long as count <= 64, so it is safe.  It is tuned for the
 * usual distribution of nCoeffs (mostly a multiple of 8 plus one or two
 * extra elements): the main loop handles 8 elements per iteration and an
 * optional tail handles one further pair.  A count of 2 or less goes
 * straight to the tail and processes exactly two elements.
 *
 * Inner loop should take 6 cycles per element on arm926ej-s (Nokia 770)
 */
function ff_dct_unquantize_h263_armv5te, export=1
        push            {r4-r9,lr}
        mov             ip, #0                  /* constant zero for the rsbs in the macros */
        subs            r3, r3, #2
        ble             2f                      /* count <= 2: tail pair only */
        ldrd            r4, [r0, #0]            /* r4,r5 = elements 0..3 */
1:
        ldrd            r6, [r0, #8]            /* r6,r7 = elements 4..7 */

        /* elements 0..3 (packed in r4, r5) */
        dequant_hi      r9, r4
        dequant_hi      lr, r5
        dequant_lo      r4
        dequant_lo      r5

        strh            r4, [r0], #2
        strh            r9, [r0], #2
        strh            r5, [r0], #2
        strh            lr, [r0], #2

        /* elements 4..7 (packed in r6, r7) */
        dequant_hi      r9, r6
        dequant_hi      lr, r7
        dequant_lo      r6
        dequant_lo      r7

        strh            r6, [r0], #2
        strh            r9, [r0], #2
        strh            r7, [r0], #2
        strh            lr, [r0], #2

        subs            r3, r3, #8
        ldrgtd          r4, [r0, #0] /* load data early to avoid load/use pipeline stall */
        bgt             1b

        adds            r3, r3, #2
        pople           {r4-r9,pc}              /* nothing left over: done */
2:
        /* trailing pair of elements */
        ldrsh           r9, [r0, #0]
        ldrsh           lr, [r0, #2]
        dequant_one     r9
        dequant_one     lr
        strh            r9, [r0], #2
        strh            lr, [r0], #2
        pop             {r4-r9,pc}
        .endfunc