/*
 * Alpha optimized DSP utils
 * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/*
 * These functions are scheduled for pca56. They should work
 * reasonably on ev6, though.
 *
 * Layout note: inside the loops the instructions are written in groups
 * of four separated by blank lines.  The order within and between the
 * groups is part of the hand scheduling and is intentionally left
 * untouched.
 */

#include "regdef.h"

/* Extra scratch register aliases on top of the names from regdef.h. */
#define ta t10
#define tb t11
#define tc t12
#define td AT
/*
 * Danger: the next four overlap with the argument list (a3-a5) and the
 * return value (v0).  Only use them in functions that take at most
 * three arguments and return nothing.
 */
#define te a5
#define tf a4
#define tg a3
#define th v0

        .set noat
        .set noreorder
        .arch pca56
        .text

/************************************************************************
 * void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels,
 *                         int line_size, int h)
 *
 * Copies h rows of 8 bytes each from pixels to block.  Both pointers
 * are advanced by line_size after every row.
 *
 * In:      a0 = block, a1 = pixels, a2 = line_size, a3 = h
 * Out:     nothing
 * Changes: a0, a1, a3, t0-t9, ta
 *
 * Every loop iteration handles four rows, and the loop only ends when
 * h has been counted down to exactly zero, so h must be a positive
 * multiple of 4.
 *
 * The destination is written with stq in both paths.
 * NOTE(review): this presumably requires block and line_size to keep
 * every destination row 8-byte aligned - confirm against the callers.
 *
 * In the unaligned path the byte shift for all four rows is taken from
 * the low three bits of a1 after a1 has already been advanced, so the
 * rows are only merged correctly when line_size is a multiple of 8.
 */
        .align 6
        .globl put_pixels_axp_asm
        .ent put_pixels_axp_asm
put_pixels_axp_asm:
        .frame sp, 0, ra
        .prologue 0

        and     a1, 7, t0               # t0 = pixels & 7
        beq     t0, $aligned_loop       # source is 8-byte aligned

        .align 4
$unaligned_loop:
        ldq_u   t0, 0(a1)               # row 0, low quadword
        ldq_u   t1, 8(a1)               # row 0, high quadword
        addq    a1, a2, a1              # pixels += line_size
        nop

        ldq_u   t2, 0(a1)               # row 1, low quadword
        ldq_u   t3, 8(a1)               # row 1, high quadword
        addq    a1, a2, a1              # pixels += line_size
        nop

        ldq_u   t4, 0(a1)               # row 2, low quadword
        ldq_u   t5, 8(a1)               # row 2, high quadword
        addq    a1, a2, a1              # pixels += line_size
        nop

        ldq_u   t6, 0(a1)               # row 3, low quadword
        ldq_u   t7, 8(a1)               # row 3, high quadword
        extql   t0, a1, t0              # row 0: shift low part into place
        addq    a1, a2, a1              # pixels += line_size

        extqh   t1, a1, t1              # row 0: shift high part into place
        addq    a0, a2, t8              # t8 = block + 1 * line_size
        extql   t2, a1, t2              # row 1: low part
        addq    t8, a2, t9              # t9 = block + 2 * line_size

        extqh   t3, a1, t3              # row 1: high part
        addq    t9, a2, ta              # ta = block + 3 * line_size
        extql   t4, a1, t4              # row 2: low part
        or      t0, t1, t0              # row 0: merge both parts

        extqh   t5, a1, t5              # row 2: high part
        or      t2, t3, t2              # row 1: merge both parts
        extql   t6, a1, t6              # row 3: low part
        or      t4, t5, t4              # row 2: merge both parts

        extqh   t7, a1, t7              # row 3: high part
        or      t6, t7, t6              # row 3: merge both parts
        stq     t0, 0(a0)               # store row 0
        stq     t2, 0(t8)               # store row 1

        stq     t4, 0(t9)               # store row 2
        subq    a3, 4, a3               # h -= 4
        stq     t6, 0(ta)               # store row 3
        addq    ta, a2, a0              # block += 4 * line_size

        bne     a3, $unaligned_loop     # more rows left
        ret

        .align 4
$aligned_loop:
        ldq     t0, 0(a1)               # row 0
        addq    a1, a2, a1              # pixels += line_size
        ldq     t1, 0(a1)               # row 1
        addq    a1, a2, a1              # pixels += line_size

        ldq     t2, 0(a1)               # row 2
        addq    a1, a2, a1              # pixels += line_size
        ldq     t3, 0(a1)               # row 3

        addq    a0, a2, t4              # t4 = block + 1 * line_size
        addq    a1, a2, a1              # pixels += line_size
        addq    t4, a2, t5              # t5 = block + 2 * line_size
        subq    a3, 4, a3               # h -= 4

        stq     t0, 0(a0)               # store row 0
        addq    t5, a2, t6              # t6 = block + 3 * line_size
        stq     t1, 0(t4)               # store row 1
        addq    t6, a2, a0              # block += 4 * line_size

        stq     t2, 0(t5)               # store row 2
        stq     t3, 0(t6)               # store row 3

        bne     a3, $aligned_loop       # more rows left
        ret
        .end put_pixels_axp_asm

/************************************************************************
 * void put_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
 *                                 int line_size)
 *
 * Converts 8 rows of 8 signed 16-bit values from block into bytes:
 * each value is limited to the range 0..255, packed to one byte and
 * stored to pixels.  The input is read contiguously (32 bytes per two
 * rows); the output pointer advances by line_size per row.
 *
 * In:      a0 = block, a1 = pixels, a2 = line_size
 * Out:     nothing
 * Changes: a0, a1, t0-t3, t8, t9, ta
 *
 * The loop counter starts at 8 and drops by 2 per iteration, so the
 * loop body runs four times and handles two rows each time.
 */
        .align 6
        .globl put_pixels_clamped_mvi_asm
        .ent put_pixels_clamped_mvi_asm
put_pixels_clamped_mvi_asm:
        .frame sp, 0, ra
        .prologue 0

        lda     t8, -1                  # all ones
        lda     t9, 8                   # loop counter
        zap     t8, 0xaa, t8            # 00ff00ff00ff00ff

        .align 4
1:
        ldq     t0, 0(a0)               # row n,     values 0-3
        ldq     t1, 8(a0)               # row n,     values 4-7
        ldq     t2, 16(a0)              # row n + 1, values 0-3
        ldq     t3, 24(a0)              # row n + 1, values 4-7

        maxsw4  t0, zero, t0            # lower bound 0
        subq    t9, 2, t9               # two rows per iteration
        maxsw4  t1, zero, t1
        lda     a0, 32(a0)              # block += 16

        maxsw4  t2, zero, t2
        addq    a1, a2, ta              # ta = address of row n + 1
        maxsw4  t3, zero, t3
        minsw4  t0, t8, t0              # upper bound 255

        minsw4  t1, t8, t1
        minsw4  t2, t8, t2
        minsw4  t3, t8, t3
        pkwb    t0, t0                  # four words -> four bytes

        pkwb    t1, t1
        pkwb    t2, t2
        pkwb    t3, t3
        stl     t0, 0(a1)               # row n, bytes 0-3

        stl     t1, 4(a1)               # row n, bytes 4-7
        addq    ta, a2, a1              # pixels += 2 * line_size
        stl     t2, 0(ta)               # row n + 1, bytes 0-3
        stl     t3, 4(ta)               # row n + 1, bytes 4-7

        bne     t9, 1b
        ret
        .end put_pixels_clamped_mvi_asm

/************************************************************************
 * void add_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
 *                                 int line_size)
 *
 * For 8 rows of 8 pixels: adds the signed 16-bit value from block to
 * the byte already stored in pixels, limits the sum to 0..255 and
 * writes it back to pixels.
 *
 * In:      a0 = block, a1 = pixels, a2 = line_size
 * Out:     nothing
 * Changes: a0, a1, t0-t9, ta, tb, te (a5), tf (a4), tg (a3), th (v0)
 *
 * Each iteration handles two rows as four "quarters" of four pixels:
 * quarter 0/1 are the left/right half of the first row, quarter 2/3
 * the left/right half of the second row.  The trailing comments give
 * "quarter op", where the ops are
 *   0 unpack pixel bytes to words    5 lower bound 0
 *   1 save the word sign bits        6 upper bound 255
 *   2 clear the word sign bits       7 pack words to bytes
 *   3 add pixels to the shorts       8 store
 *   4 xor the saved sign bits back
 * Ops 1-4 perform the four 16-bit additions with a single 64-bit addq:
 * with the sign bits removed, adding a value of at most 255 cannot
 * carry out of a 16-bit lane into its neighbour.
 */
        .align 6
        .globl add_pixels_clamped_mvi_asm
        .ent add_pixels_clamped_mvi_asm
add_pixels_clamped_mvi_asm:
        .frame sp, 0, ra
        .prologue 0

        lda     t1, -1                  # all ones
        lda     th, 8                   # loop counter
        zap     t1, 0x33, tg            # 0xffff0000ffff0000
        nop

        srl     tg, 1, t0               # 0x7fff80007fff8000
        xor     tg, t0, tg              # 0x8000800080008000
        zap     t1, 0xaa, tf            # 0x00ff00ff00ff00ff

        .align 4
1:
        ldl     t1, 0(a1)               # pix0 (try to hit cache line soon)
        ldl     t4, 4(a1)               # pix1
        addq    a1, a2, te              # pixels += line_size
        ldq     t0, 0(a0)               # shorts0

        ldl     t7, 0(te)               # pix2 (try to hit cache line soon)
        ldl     ta, 4(te)               # pix3
        ldq     t3, 8(a0)               # shorts1
        ldq     t6, 16(a0)              # shorts2

        ldq     t9, 24(a0)              # shorts3
        unpkbw  t1, t1                  # 0 0 (quarter/op no.)
        and     t0, tg, t2              # 0 1
        unpkbw  t4, t4                  # 1 0

        bic     t0, tg, t0              # 0 2
        unpkbw  t7, t7                  # 2 0
        and     t3, tg, t5              # 1 1
        addq    t0, t1, t0              # 0 3

        xor     t0, t2, t0              # 0 4
        unpkbw  ta, ta                  # 3 0
        and     t6, tg, t8              # 2 1
        maxsw4  t0, zero, t0            # 0 5

        bic     t3, tg, t3              # 1 2
        bic     t6, tg, t6              # 2 2
        minsw4  t0, tf, t0              # 0 6
        addq    t3, t4, t3              # 1 3

        pkwb    t0, t0                  # 0 7
        xor     t3, t5, t3              # 1 4
        maxsw4  t3, zero, t3            # 1 5
        addq    t6, t7, t6              # 2 3

        xor     t6, t8, t6              # 2 4
        and     t9, tg, tb              # 3 1
        minsw4  t3, tf, t3              # 1 6
        bic     t9, tg, t9              # 3 2

        maxsw4  t6, zero, t6            # 2 5
        addq    t9, ta, t9              # 3 3
        stl     t0, 0(a1)               # 0 8
        minsw4  t6, tf, t6              # 2 6

        xor     t9, tb, t9              # 3 4
        maxsw4  t9, zero, t9            # 3 5
        lda     a0, 32(a0)              # block += 16;
        pkwb    t3, t3                  # 1 7

        minsw4  t9, tf, t9              # 3 6
        subq    th, 2, th               # two rows per iteration
        pkwb    t6, t6                  # 2 7
        pkwb    t9, t9                  # 3 7

        stl     t3, 4(a1)               # 1 8
        addq    te, a2, a1              # pixels += line_size
        stl     t6, 0(te)               # 2 8
        stl     t9, 4(te)               # 3 8

        bne     th, 1b
        ret
        .end add_pixels_clamped_mvi_asm