/*
 * Alpha optimized DSP utils
 * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/*
 * These functions are scheduled for pca56. They should work
 * reasonably on ev6, though.
 *
 * The instructions are laid out in groups of four, separated by blank
 * lines.  Do not reorder them without re-checking the schedule.
 */

#include "regdef.h"

/* Some nicer register names. */
#define ta t10
#define tb t11
#define tc t12
#define td AT
/* Danger: these overlap with the argument list and the return value */
#define te a5
#define tf a4
#define tg a3
#define th v0

        .set noat               /* AT is used explicitly (td, gprof hook) */
        .set noreorder          /* keep the hand-written instruction order */
        .arch pca56
        .text

/************************************************************************
 * void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels,
 *                         int line_size, int h)
 *
 * Copies h rows of 8 bytes from pixels to block.  Both pointers advance
 * by line_size after every row.
 *
 * In:    a0 = block, a1 = pixels, a2 = line_size, a3 = h
 * Clobb: t0-t9, ta (t10), a0, a1, a3 (and AT when CONFIG_GPROF is set)
 *
 * Preconditions visible in the code:
 *  - Four rows are handled per iteration and the loops exit only when
 *    a3 becomes exactly zero, so h must be a positive multiple of 4.
 *  - Rows are written with stq, so block (and therefore line_size) is
 *    expected to be 8-byte aligned.
 *  - pixels may be unaligned: when (pixels & 7) != 0 the $unaligned
 *    loop builds each row from two ldq_u loads merged with extql/extqh.
 */
        .align 6
        .globl put_pixels_axp_asm
        .ent put_pixels_axp_asm
put_pixels_axp_asm:
        .frame sp, 0, ra
        .prologue 0

#if CONFIG_GPROF
        lda     AT, _mcount
        jsr     AT, (AT), _mcount
#endif

        and     a1, 7, t0               # t0 = misalignment of the source
        beq     t0, $aligned

        /*
         * Unaligned source.  Row k is fetched as the two aligned quadwords
         * that straddle it (t(2k), t(2k+1)).  extql/extqh take their byte
         * offset from the low three bits of a1; a1 has already been moved
         * on by one or more rows when they execute, so the offset is only
         * right if line_size is a multiple of 8.
         * NOTE(review): confirm that all callers guarantee this.
         */
        .align 4
$unaligned:
        ldq_u   t0, 0(a1)               # row 0, low quadword
        ldq_u   t1, 8(a1)               # row 0, high quadword
        addq    a1, a2, a1
        nop

        ldq_u   t2, 0(a1)               # row 1
        ldq_u   t3, 8(a1)
        addq    a1, a2, a1
        nop

        ldq_u   t4, 0(a1)               # row 2
        ldq_u   t5, 8(a1)
        addq    a1, a2, a1
        nop

        ldq_u   t6, 0(a1)               # row 3
        ldq_u   t7, 8(a1)
        extql   t0, a1, t0
        addq    a1, a2, a1              # a1 = source of the next 4 rows

        extqh   t1, a1, t1
        addq    a0, a2, t8              # t8 = destination row 1
        extql   t2, a1, t2
        addq    t8, a2, t9              # t9 = destination row 2

        extqh   t3, a1, t3
        addq    t9, a2, ta              # ta = destination row 3
        extql   t4, a1, t4
        or      t0, t1, t0              # merged row 0

        extqh   t5, a1, t5
        or      t2, t3, t2              # merged row 1
        extql   t6, a1, t6
        or      t4, t5, t4              # merged row 2

        extqh   t7, a1, t7
        or      t6, t7, t6              # merged row 3
        stq     t0, 0(a0)
        stq     t2, 0(t8)

        stq     t4, 0(t9)
        subq    a3, 4, a3               # h -= 4
        stq     t6, 0(ta)
        addq    ta, a2, a0              # a0 = destination of the next 4 rows

        bne     a3, $unaligned
        ret

        /* Source is 8-byte aligned: plain quadword loads, four rows per pass. */
        .align 4
$aligned:
        ldq     t0, 0(a1)               # row 0
        addq    a1, a2, a1
        ldq     t1, 0(a1)               # row 1
        addq    a1, a2, a1

        ldq     t2, 0(a1)               # row 2
        addq    a1, a2, a1
        ldq     t3, 0(a1)               # row 3

        addq    a0, a2, t4              # t4 = destination row 1
        addq    a1, a2, a1              # a1 = source of the next 4 rows
        addq    t4, a2, t5              # t5 = destination row 2
        subq    a3, 4, a3               # h -= 4

        stq     t0, 0(a0)
        addq    t5, a2, t6              # t6 = destination row 3
        stq     t1, 0(t4)
        addq    t6, a2, a0              # a0 = destination of the next 4 rows

        stq     t2, 0(t5)
        stq     t3, 0(t6)

        bne     a3, $aligned
        ret
        .end put_pixels_axp_asm

/************************************************************************
 * void put_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
 *                                 int line_size)
 *
 * Converts 64 signed 16-bit values (8 rows of 8) read from block into
 * bytes, saturating each value to 0..255, and stores them as 8 rows of
 * 8 bytes at pixels with a row stride of line_size.
 *
 * In:    a0 = block, a1 = pixels, a2 = line_size
 * Clobb: t0-t3, t8, t9, ta (t10), a0, a1 (and AT when CONFIG_GPROF is set)
 *
 * block is read with ldq and pixels is written with stl, so the code
 * expects block to be 8-byte aligned and every output row to be at
 * least 4-byte aligned.  Each pass of the loop produces two rows.
 */
        .align 6
        .globl put_pixels_clamped_mvi_asm
        .ent put_pixels_clamped_mvi_asm
put_pixels_clamped_mvi_asm:
        .frame sp, 0, ra
        .prologue 0

#if CONFIG_GPROF
        lda     AT, _mcount
        jsr     AT, (AT), _mcount
#endif

        lda     t8, -1
        lda     t9, 8                   # loop counter
        zap     t8, 0xaa, t8            # 00ff00ff00ff00ff

        .align 4
1:      ldq     t0,  0(a0)              # row n,   values 0..3
        ldq     t1,  8(a0)              # row n,   values 4..7
        ldq     t2, 16(a0)              # row n+1, values 0..3
        ldq     t3, 24(a0)              # row n+1, values 4..7

        maxsw4  t0, zero, t0            # clamp below at 0
        subq    t9, 2, t9               # two rows per pass
        maxsw4  t1, zero, t1
        lda     a0, 32(a0)              # block += 16

        maxsw4  t2, zero, t2
        addq    a1, a2, ta              # ta = row n+1 of the output
        maxsw4  t3, zero, t3
        minsw4  t0, t8, t0              # clamp above at 255

        minsw4  t1, t8, t1
        minsw4  t2, t8, t2
        minsw4  t3, t8, t3
        pkwb    t0, t0                  # 4 words -> 4 bytes

        pkwb    t1, t1
        pkwb    t2, t2
        pkwb    t3, t3
        stl     t0, 0(a1)

        stl     t1, 4(a1)
        addq    ta, a2, a1              # a1 = row n+2 of the output
        stl     t2, 0(ta)
        stl     t3, 4(ta)

        bne     t9, 1b
        ret
        .end put_pixels_clamped_mvi_asm

/************************************************************************
 * void add_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
 *                                 int line_size)
 *
 * Adds 64 signed 16-bit values (8 rows of 8) read from block to the
 * 8x8 bytes at pixels (row stride line_size), saturates each sum to
 * 0..255 and writes the result back to pixels in place.
 *
 * In:    a0 = block, a1 = pixels, a2 = line_size
 * Clobb: t0-t9, ta (t10), tb (t11), a0, a1, a3 (tg), a4 (tf), a5 (te),
 *        v0 (th) (and AT when CONFIG_GPROF is set)
 *
 * Register roles inside the loop:
 *   tg = 0x8000800080008000   sign bit of every 16-bit lane
 *   tf = 0x00ff00ff00ff00ff   upper clamp for every lane
 *   th = rows left to process (starts at 8, two rows per pass)
 *   te = address of the second row handled in the current pass
 *
 * The addition is a plain 64-bit addq on four packed lanes.  To keep a
 * lane from carrying into its neighbour, the sign bit of each lane is
 * saved (and), cleared (bic), the zero-extended pixel is added, and the
 * saved sign bit is xor-ed back in, which gives the lane-wise sum
 * modulo 2^16.
 *
 * The trailing "q s" comments give the quarter (0..3, i.e. which group
 * of four pixels) and the step number within that quarter.
 */
        .align 6
        .globl add_pixels_clamped_mvi_asm
        .ent add_pixels_clamped_mvi_asm
add_pixels_clamped_mvi_asm:
        .frame sp, 0, ra
        .prologue 0

#if CONFIG_GPROF
        lda     AT, _mcount
        jsr     AT, (AT), _mcount
#endif

        lda     t1, -1
        lda     th, 8                   # row counter
        zap     t1, 0x33, tg            # 0xffff0000ffff0000
        nop

        srl     tg, 1, t0
        xor     tg, t0, tg              # 0x8000800080008000
        zap     t1, 0xaa, tf            # 0x00ff00ff00ff00ff

        .align 4
1:      ldl     t1, 0(a1)               # pix0 (try to hit cache line soon)
        ldl     t4, 4(a1)               # pix1
        addq    a1, a2, te              # pixels += line_size
        ldq     t0, 0(a0)               # shorts0

        ldl     t7, 0(te)               # pix2 (try to hit cache line soon)
        ldl     ta, 4(te)               # pix3
        ldq     t3, 8(a0)               # shorts1
        ldq     t6, 16(a0)              # shorts2

        ldq     t9, 24(a0)              # shorts3
        unpkbw  t1, t1                  # 0 0 (quarter/op no.)
        and     t0, tg, t2              # 0 1
        unpkbw  t4, t4                  # 1 0

        bic     t0, tg, t0              # 0 2
        unpkbw  t7, t7                  # 2 0
        and     t3, tg, t5              # 1 1
        addq    t0, t1, t0              # 0 3

        xor     t0, t2, t0              # 0 4
        unpkbw  ta, ta                  # 3 0
        and     t6, tg, t8              # 2 1
        maxsw4  t0, zero, t0            # 0 5

        bic     t3, tg, t3              # 1 2
        bic     t6, tg, t6              # 2 2
        minsw4  t0, tf, t0              # 0 6
        addq    t3, t4, t3              # 1 3

        pkwb    t0, t0                  # 0 7
        xor     t3, t5, t3              # 1 4
        maxsw4  t3, zero, t3            # 1 5
        addq    t6, t7, t6              # 2 3

        xor     t6, t8, t6              # 2 4
        and     t9, tg, tb              # 3 1
        minsw4  t3, tf, t3              # 1 6
        bic     t9, tg, t9              # 3 2

        maxsw4  t6, zero, t6            # 2 5
        addq    t9, ta, t9              # 3 3
        stl     t0, 0(a1)               # 0 8
        minsw4  t6, tf, t6              # 2 6

        xor     t9, tb, t9              # 3 4
        maxsw4  t9, zero, t9            # 3 5
        lda     a0, 32(a0)              # block += 16;
        pkwb    t3, t3                  # 1 7

        minsw4  t9, tf, t9              # 3 6
        subq    th, 2, th
        pkwb    t6, t6                  # 2 7
        pkwb    t9, t9                  # 3 7

        stl     t3, 4(a1)               # 1 8
        addq    te, a2, a1              # pixels += line_size
        stl     t6, 0(te)               # 2 8
        stl     t9, 4(te)               # 3 8

        bne     th, 1b
        ret
        .end add_pixels_clamped_mvi_asm