1/* 2 * iWMMXt optimized DSP utils 3 * copyright (c) 2004 AGAWA Koji 4 * 5 * This file is part of FFmpeg. 6 * 7 * FFmpeg is free software; you can redistribute it and/or 8 * modify it under the terms of the GNU Lesser General Public 9 * License as published by the Free Software Foundation; either 10 * version 2.1 of the License, or (at your option) any later version. 11 * 12 * FFmpeg is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 * Lesser General Public License for more details. 16 * 17 * You should have received a copy of the GNU Lesser General Public 18 * License along with FFmpeg; if not, write to the Free Software 19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 20 */ 21 22void DEF(put, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) 23{ 24 int stride = line_size; 25 __asm__ volatile ( 26 "and r12, %[pixels], #7 \n\t" 27 "bic %[pixels], %[pixels], #7 \n\t" 28 "tmcr wcgr1, r12 \n\t" 29 "add r4, %[pixels], %[line_size] \n\t" 30 "add r5, %[block], %[line_size] \n\t" 31 "mov %[line_size], %[line_size], lsl #1 \n\t" 32 "1: \n\t" 33 "wldrd wr0, [%[pixels]] \n\t" 34 "subs %[h], %[h], #2 \n\t" 35 "wldrd wr1, [%[pixels], #8] \n\t" 36 "add %[pixels], %[pixels], %[line_size] \n\t" 37 "wldrd wr3, [r4] \n\t" 38 "pld [%[pixels]] \n\t" 39 "pld [%[pixels], #32] \n\t" 40 "wldrd wr4, [r4, #8] \n\t" 41 "add r4, r4, %[line_size] \n\t" 42 "walignr1 wr8, wr0, wr1 \n\t" 43 "pld [r4] \n\t" 44 "pld [r4, #32] \n\t" 45 "walignr1 wr10, wr3, wr4 \n\t" 46 "wstrd wr8, [%[block]] \n\t" 47 "add %[block], %[block], %[line_size] \n\t" 48 "wstrd wr10, [r5] \n\t" 49 "add r5, r5, %[line_size] \n\t" 50 "bne 1b \n\t" 51 : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h) 52 : 53 : "memory", "r4", "r5", "r12"); 54} 55 56void DEF(avg, pixels8)(uint8_t *block, const uint8_t 
*pixels, const int line_size, int h) 57{ 58 int stride = line_size; 59 __asm__ volatile ( 60 "and r12, %[pixels], #7 \n\t" 61 "bic %[pixels], %[pixels], #7 \n\t" 62 "tmcr wcgr1, r12 \n\t" 63 "add r4, %[pixels], %[line_size] \n\t" 64 "add r5, %[block], %[line_size] \n\t" 65 "mov %[line_size], %[line_size], lsl #1 \n\t" 66 "1: \n\t" 67 "wldrd wr0, [%[pixels]] \n\t" 68 "subs %[h], %[h], #2 \n\t" 69 "wldrd wr1, [%[pixels], #8] \n\t" 70 "add %[pixels], %[pixels], %[line_size] \n\t" 71 "wldrd wr3, [r4] \n\t" 72 "pld [%[pixels]] \n\t" 73 "pld [%[pixels], #32] \n\t" 74 "wldrd wr4, [r4, #8] \n\t" 75 "add r4, r4, %[line_size] \n\t" 76 "walignr1 wr8, wr0, wr1 \n\t" 77 "wldrd wr0, [%[block]] \n\t" 78 "wldrd wr2, [r5] \n\t" 79 "pld [r4] \n\t" 80 "pld [r4, #32] \n\t" 81 "walignr1 wr10, wr3, wr4 \n\t" 82 WAVG2B" wr8, wr8, wr0 \n\t" 83 WAVG2B" wr10, wr10, wr2 \n\t" 84 "wstrd wr8, [%[block]] \n\t" 85 "add %[block], %[block], %[line_size] \n\t" 86 "wstrd wr10, [r5] \n\t" 87 "pld [%[block]] \n\t" 88 "pld [%[block], #32] \n\t" 89 "add r5, r5, %[line_size] \n\t" 90 "pld [r5] \n\t" 91 "pld [r5, #32] \n\t" 92 "bne 1b \n\t" 93 : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h) 94 : 95 : "memory", "r4", "r5", "r12"); 96} 97 98void DEF(put, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) 99{ 100 int stride = line_size; 101 __asm__ volatile ( 102 "and r12, %[pixels], #7 \n\t" 103 "bic %[pixels], %[pixels], #7 \n\t" 104 "tmcr wcgr1, r12 \n\t" 105 "add r4, %[pixels], %[line_size] \n\t" 106 "add r5, %[block], %[line_size] \n\t" 107 "mov %[line_size], %[line_size], lsl #1 \n\t" 108 "1: \n\t" 109 "wldrd wr0, [%[pixels]] \n\t" 110 "wldrd wr1, [%[pixels], #8] \n\t" 111 "subs %[h], %[h], #2 \n\t" 112 "wldrd wr2, [%[pixels], #16] \n\t" 113 "add %[pixels], %[pixels], %[line_size] \n\t" 114 "wldrd wr3, [r4] \n\t" 115 "pld [%[pixels]] \n\t" 116 "pld [%[pixels], #32] \n\t" 117 "walignr1 wr8, wr0, wr1 \n\t" 118 "wldrd wr4, [r4, #8] \n\t" 119 
"walignr1 wr9, wr1, wr2 \n\t" 120 "wldrd wr5, [r4, #16] \n\t" 121 "add r4, r4, %[line_size] \n\t" 122 "pld [r4] \n\t" 123 "pld [r4, #32] \n\t" 124 "walignr1 wr10, wr3, wr4 \n\t" 125 "wstrd wr8, [%[block]] \n\t" 126 "walignr1 wr11, wr4, wr5 \n\t" 127 "wstrd wr9, [%[block], #8] \n\t" 128 "add %[block], %[block], %[line_size] \n\t" 129 "wstrd wr10, [r5] \n\t" 130 "wstrd wr11, [r5, #8] \n\t" 131 "add r5, r5, %[line_size] \n\t" 132 "bne 1b \n\t" 133 : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h) 134 : 135 : "memory", "r4", "r5", "r12"); 136} 137 138void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) 139{ 140 int stride = line_size; 141 __asm__ volatile ( 142 "pld [%[pixels]] \n\t" 143 "pld [%[pixels], #32] \n\t" 144 "pld [%[block]] \n\t" 145 "pld [%[block], #32] \n\t" 146 "and r12, %[pixels], #7 \n\t" 147 "bic %[pixels], %[pixels], #7 \n\t" 148 "tmcr wcgr1, r12 \n\t" 149 "add r4, %[pixels], %[line_size]\n\t" 150 "add r5, %[block], %[line_size] \n\t" 151 "mov %[line_size], %[line_size], lsl #1 \n\t" 152 "1: \n\t" 153 "wldrd wr0, [%[pixels]] \n\t" 154 "wldrd wr1, [%[pixels], #8] \n\t" 155 "subs %[h], %[h], #2 \n\t" 156 "wldrd wr2, [%[pixels], #16] \n\t" 157 "add %[pixels], %[pixels], %[line_size] \n\t" 158 "wldrd wr3, [r4] \n\t" 159 "pld [%[pixels]] \n\t" 160 "pld [%[pixels], #32] \n\t" 161 "walignr1 wr8, wr0, wr1 \n\t" 162 "wldrd wr4, [r4, #8] \n\t" 163 "walignr1 wr9, wr1, wr2 \n\t" 164 "wldrd wr5, [r4, #16] \n\t" 165 "add r4, r4, %[line_size] \n\t" 166 "wldrd wr0, [%[block]] \n\t" 167 "pld [r4] \n\t" 168 "wldrd wr1, [%[block], #8] \n\t" 169 "pld [r4, #32] \n\t" 170 "wldrd wr2, [r5] \n\t" 171 "walignr1 wr10, wr3, wr4 \n\t" 172 "wldrd wr3, [r5, #8] \n\t" 173 WAVG2B" wr8, wr8, wr0 \n\t" 174 WAVG2B" wr9, wr9, wr1 \n\t" 175 WAVG2B" wr10, wr10, wr2 \n\t" 176 "wstrd wr8, [%[block]] \n\t" 177 "walignr1 wr11, wr4, wr5 \n\t" 178 WAVG2B" wr11, wr11, wr3 \n\t" 179 "wstrd wr9, [%[block], #8] \n\t" 180 "add 
%[block], %[block], %[line_size] \n\t" 181 "wstrd wr10, [r5] \n\t" 182 "pld [%[block]] \n\t" 183 "pld [%[block], #32] \n\t" 184 "wstrd wr11, [r5, #8] \n\t" 185 "add r5, r5, %[line_size] \n\t" 186 "pld [r5] \n\t" 187 "pld [r5, #32] \n\t" 188 "bne 1b \n\t" 189 : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h) 190 : 191 : "memory", "r4", "r5", "r12"); 192} 193 194void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) 195{ 196 int stride = line_size; 197 // [wr0 wr1 wr2 wr3] for previous line 198 // [wr4 wr5 wr6 wr7] for current line 199 SET_RND(wr15); // =2 for rnd and =1 for no_rnd version 200 __asm__ volatile( 201 "pld [%[pixels]] \n\t" 202 "pld [%[pixels], #32] \n\t" 203 "and r12, %[pixels], #7 \n\t" 204 "bic %[pixels], %[pixels], #7 \n\t" 205 "tmcr wcgr1, r12 \n\t" 206 "add r12, r12, #1 \n\t" 207 "add r4, %[pixels], %[line_size]\n\t" 208 "tmcr wcgr2, r12 \n\t" 209 "add r5, %[block], %[line_size] \n\t" 210 "mov %[line_size], %[line_size], lsl #1 \n\t" 211 212 "1: \n\t" 213 "wldrd wr10, [%[pixels]] \n\t" 214 "cmp r12, #8 \n\t" 215 "wldrd wr11, [%[pixels], #8] \n\t" 216 "add %[pixels], %[pixels], %[line_size] \n\t" 217 "wldrd wr13, [r4] \n\t" 218 "pld [%[pixels]] \n\t" 219 "wldrd wr14, [r4, #8] \n\t" 220 "pld [%[pixels], #32] \n\t" 221 "add r4, r4, %[line_size] \n\t" 222 "walignr1 wr0, wr10, wr11 \n\t" 223 "pld [r4] \n\t" 224 "pld [r4, #32] \n\t" 225 "walignr1 wr2, wr13, wr14 \n\t" 226 "wmoveq wr4, wr11 \n\t" 227 "wmoveq wr6, wr14 \n\t" 228 "walignr2ne wr4, wr10, wr11 \n\t" 229 "walignr2ne wr6, wr13, wr14 \n\t" 230 WAVG2B" wr0, wr0, wr4 \n\t" 231 WAVG2B" wr2, wr2, wr6 \n\t" 232 "wstrd wr0, [%[block]] \n\t" 233 "subs %[h], %[h], #2 \n\t" 234 "wstrd wr2, [r5] \n\t" 235 "add %[block], %[block], %[line_size] \n\t" 236 "add r5, r5, %[line_size] \n\t" 237 "bne 1b \n\t" 238 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) 239 : 240 : "r4", "r5", "r12", "memory"); 241} 242 
243void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) 244{ 245 int stride = line_size; 246 // [wr0 wr1 wr2 wr3] for previous line 247 // [wr4 wr5 wr6 wr7] for current line 248 SET_RND(wr15); // =2 for rnd and =1 for no_rnd version 249 __asm__ volatile( 250 "pld [%[pixels]] \n\t" 251 "pld [%[pixels], #32] \n\t" 252 "and r12, %[pixels], #7 \n\t" 253 "bic %[pixels], %[pixels], #7 \n\t" 254 "tmcr wcgr1, r12 \n\t" 255 "add r12, r12, #1 \n\t" 256 "add r4, %[pixels], %[line_size]\n\t" 257 "tmcr wcgr2, r12 \n\t" 258 "add r5, %[block], %[line_size] \n\t" 259 "mov %[line_size], %[line_size], lsl #1 \n\t" 260 261 "1: \n\t" 262 "wldrd wr10, [%[pixels]] \n\t" 263 "cmp r12, #8 \n\t" 264 "wldrd wr11, [%[pixels], #8] \n\t" 265 "wldrd wr12, [%[pixels], #16] \n\t" 266 "add %[pixels], %[pixels], %[line_size] \n\t" 267 "wldrd wr13, [r4] \n\t" 268 "pld [%[pixels]] \n\t" 269 "wldrd wr14, [r4, #8] \n\t" 270 "pld [%[pixels], #32] \n\t" 271 "wldrd wr15, [r4, #16] \n\t" 272 "add r4, r4, %[line_size] \n\t" 273 "walignr1 wr0, wr10, wr11 \n\t" 274 "pld [r4] \n\t" 275 "pld [r4, #32] \n\t" 276 "walignr1 wr1, wr11, wr12 \n\t" 277 "walignr1 wr2, wr13, wr14 \n\t" 278 "walignr1 wr3, wr14, wr15 \n\t" 279 "wmoveq wr4, wr11 \n\t" 280 "wmoveq wr5, wr12 \n\t" 281 "wmoveq wr6, wr14 \n\t" 282 "wmoveq wr7, wr15 \n\t" 283 "walignr2ne wr4, wr10, wr11 \n\t" 284 "walignr2ne wr5, wr11, wr12 \n\t" 285 "walignr2ne wr6, wr13, wr14 \n\t" 286 "walignr2ne wr7, wr14, wr15 \n\t" 287 WAVG2B" wr0, wr0, wr4 \n\t" 288 WAVG2B" wr1, wr1, wr5 \n\t" 289 "wstrd wr0, [%[block]] \n\t" 290 WAVG2B" wr2, wr2, wr6 \n\t" 291 "wstrd wr1, [%[block], #8] \n\t" 292 WAVG2B" wr3, wr3, wr7 \n\t" 293 "add %[block], %[block], %[line_size] \n\t" 294 "wstrd wr2, [r5] \n\t" 295 "subs %[h], %[h], #2 \n\t" 296 "wstrd wr3, [r5, #8] \n\t" 297 "add r5, r5, %[line_size] \n\t" 298 "bne 1b \n\t" 299 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) 300 : 301 : "r4", "r5", "r12", 
"memory"); 302} 303 304void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) 305{ 306 int stride = line_size; 307 // [wr0 wr1 wr2 wr3] for previous line 308 // [wr4 wr5 wr6 wr7] for current line 309 SET_RND(wr15); // =2 for rnd and =1 for no_rnd version 310 __asm__ volatile( 311 "pld [%[pixels]] \n\t" 312 "pld [%[pixels], #32] \n\t" 313 "pld [%[block]] \n\t" 314 "pld [%[block], #32] \n\t" 315 "and r12, %[pixels], #7 \n\t" 316 "bic %[pixels], %[pixels], #7 \n\t" 317 "tmcr wcgr1, r12 \n\t" 318 "add r12, r12, #1 \n\t" 319 "add r4, %[pixels], %[line_size]\n\t" 320 "tmcr wcgr2, r12 \n\t" 321 "add r5, %[block], %[line_size] \n\t" 322 "mov %[line_size], %[line_size], lsl #1 \n\t" 323 "pld [r5] \n\t" 324 "pld [r5, #32] \n\t" 325 326 "1: \n\t" 327 "wldrd wr10, [%[pixels]] \n\t" 328 "cmp r12, #8 \n\t" 329 "wldrd wr11, [%[pixels], #8] \n\t" 330 "add %[pixels], %[pixels], %[line_size] \n\t" 331 "wldrd wr13, [r4] \n\t" 332 "pld [%[pixels]] \n\t" 333 "wldrd wr14, [r4, #8] \n\t" 334 "pld [%[pixels], #32] \n\t" 335 "add r4, r4, %[line_size] \n\t" 336 "walignr1 wr0, wr10, wr11 \n\t" 337 "pld [r4] \n\t" 338 "pld [r4, #32] \n\t" 339 "walignr1 wr2, wr13, wr14 \n\t" 340 "wmoveq wr4, wr11 \n\t" 341 "wmoveq wr6, wr14 \n\t" 342 "walignr2ne wr4, wr10, wr11 \n\t" 343 "wldrd wr10, [%[block]] \n\t" 344 "walignr2ne wr6, wr13, wr14 \n\t" 345 "wldrd wr12, [r5] \n\t" 346 WAVG2B" wr0, wr0, wr4 \n\t" 347 WAVG2B" wr2, wr2, wr6 \n\t" 348 WAVG2B" wr0, wr0, wr10 \n\t" 349 WAVG2B" wr2, wr2, wr12 \n\t" 350 "wstrd wr0, [%[block]] \n\t" 351 "subs %[h], %[h], #2 \n\t" 352 "wstrd wr2, [r5] \n\t" 353 "add %[block], %[block], %[line_size] \n\t" 354 "add r5, r5, %[line_size] \n\t" 355 "pld [%[block]] \n\t" 356 "pld [%[block], #32] \n\t" 357 "pld [r5] \n\t" 358 "pld [r5, #32] \n\t" 359 "bne 1b \n\t" 360 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) 361 : 362 : "r4", "r5", "r12", "memory"); 363} 364 365void DEF(avg, pixels16_x2)(uint8_t 
*block, const uint8_t *pixels, const int line_size, int h) 366{ 367 int stride = line_size; 368 // [wr0 wr1 wr2 wr3] for previous line 369 // [wr4 wr5 wr6 wr7] for current line 370 SET_RND(wr15); // =2 for rnd and =1 for no_rnd version 371 __asm__ volatile( 372 "pld [%[pixels]] \n\t" 373 "pld [%[pixels], #32] \n\t" 374 "pld [%[block]] \n\t" 375 "pld [%[block], #32] \n\t" 376 "and r12, %[pixels], #7 \n\t" 377 "bic %[pixels], %[pixels], #7 \n\t" 378 "tmcr wcgr1, r12 \n\t" 379 "add r12, r12, #1 \n\t" 380 "add r4, %[pixels], %[line_size]\n\t" 381 "tmcr wcgr2, r12 \n\t" 382 "add r5, %[block], %[line_size] \n\t" 383 "mov %[line_size], %[line_size], lsl #1 \n\t" 384 "pld [r5] \n\t" 385 "pld [r5, #32] \n\t" 386 387 "1: \n\t" 388 "wldrd wr10, [%[pixels]] \n\t" 389 "cmp r12, #8 \n\t" 390 "wldrd wr11, [%[pixels], #8] \n\t" 391 "wldrd wr12, [%[pixels], #16] \n\t" 392 "add %[pixels], %[pixels], %[line_size] \n\t" 393 "wldrd wr13, [r4] \n\t" 394 "pld [%[pixels]] \n\t" 395 "wldrd wr14, [r4, #8] \n\t" 396 "pld [%[pixels], #32] \n\t" 397 "wldrd wr15, [r4, #16] \n\t" 398 "add r4, r4, %[line_size] \n\t" 399 "walignr1 wr0, wr10, wr11 \n\t" 400 "pld [r4] \n\t" 401 "pld [r4, #32] \n\t" 402 "walignr1 wr1, wr11, wr12 \n\t" 403 "walignr1 wr2, wr13, wr14 \n\t" 404 "walignr1 wr3, wr14, wr15 \n\t" 405 "wmoveq wr4, wr11 \n\t" 406 "wmoveq wr5, wr12 \n\t" 407 "wmoveq wr6, wr14 \n\t" 408 "wmoveq wr7, wr15 \n\t" 409 "walignr2ne wr4, wr10, wr11 \n\t" 410 "walignr2ne wr5, wr11, wr12 \n\t" 411 "walignr2ne wr6, wr13, wr14 \n\t" 412 "walignr2ne wr7, wr14, wr15 \n\t" 413 "wldrd wr10, [%[block]] \n\t" 414 WAVG2B" wr0, wr0, wr4 \n\t" 415 "wldrd wr11, [%[block], #8] \n\t" 416 WAVG2B" wr1, wr1, wr5 \n\t" 417 "wldrd wr12, [r5] \n\t" 418 WAVG2B" wr2, wr2, wr6 \n\t" 419 "wldrd wr13, [r5, #8] \n\t" 420 WAVG2B" wr3, wr3, wr7 \n\t" 421 WAVG2B" wr0, wr0, wr10 \n\t" 422 WAVG2B" wr1, wr1, wr11 \n\t" 423 WAVG2B" wr2, wr2, wr12 \n\t" 424 WAVG2B" wr3, wr3, wr13 \n\t" 425 "wstrd wr0, [%[block]] \n\t" 426 "subs %[h], 
%[h], #2 \n\t" 427 "wstrd wr1, [%[block], #8] \n\t" 428 "add %[block], %[block], %[line_size] \n\t" 429 "wstrd wr2, [r5] \n\t" 430 "pld [%[block]] \n\t" 431 "wstrd wr3, [r5, #8] \n\t" 432 "add r5, r5, %[line_size] \n\t" 433 "pld [%[block], #32] \n\t" 434 "pld [r5] \n\t" 435 "pld [r5, #32] \n\t" 436 "bne 1b \n\t" 437 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) 438 : 439 :"r4", "r5", "r12", "memory"); 440} 441 442void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) 443{ 444 int stride = line_size; 445 // [wr0 wr1 wr2 wr3] for previous line 446 // [wr4 wr5 wr6 wr7] for current line 447 __asm__ volatile( 448 "pld [%[pixels]] \n\t" 449 "pld [%[pixels], #32] \n\t" 450 "and r12, %[pixels], #7 \n\t" 451 "tmcr wcgr1, r12 \n\t" 452 "bic %[pixels], %[pixels], #7 \n\t" 453 454 "wldrd wr10, [%[pixels]] \n\t" 455 "wldrd wr11, [%[pixels], #8] \n\t" 456 "pld [%[block]] \n\t" 457 "add %[pixels], %[pixels], %[line_size] \n\t" 458 "walignr1 wr0, wr10, wr11 \n\t" 459 "pld [%[pixels]] \n\t" 460 "pld [%[pixels], #32] \n\t" 461 462 "1: \n\t" 463 "wldrd wr10, [%[pixels]] \n\t" 464 "wldrd wr11, [%[pixels], #8] \n\t" 465 "add %[pixels], %[pixels], %[line_size] \n\t" 466 "pld [%[pixels]] \n\t" 467 "pld [%[pixels], #32] \n\t" 468 "walignr1 wr4, wr10, wr11 \n\t" 469 "wldrd wr10, [%[block]] \n\t" 470 WAVG2B" wr8, wr0, wr4 \n\t" 471 WAVG2B" wr8, wr8, wr10 \n\t" 472 "wstrd wr8, [%[block]] \n\t" 473 "add %[block], %[block], %[line_size] \n\t" 474 475 "wldrd wr10, [%[pixels]] \n\t" 476 "wldrd wr11, [%[pixels], #8] \n\t" 477 "pld [%[block]] \n\t" 478 "add %[pixels], %[pixels], %[line_size] \n\t" 479 "pld [%[pixels]] \n\t" 480 "pld [%[pixels], #32] \n\t" 481 "walignr1 wr0, wr10, wr11 \n\t" 482 "wldrd wr10, [%[block]] \n\t" 483 WAVG2B" wr8, wr0, wr4 \n\t" 484 WAVG2B" wr8, wr8, wr10 \n\t" 485 "wstrd wr8, [%[block]] \n\t" 486 "add %[block], %[block], %[line_size] \n\t" 487 488 "subs %[h], %[h], #2 \n\t" 489 "pld [%[block]] \n\t" 
490 "bne 1b \n\t" 491 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) 492 : 493 : "cc", "memory", "r12"); 494} 495 496void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) 497{ 498 int stride = line_size; 499 // [wr0 wr1 wr2 wr3] for previous line 500 // [wr4 wr5 wr6 wr7] for current line 501 __asm__ volatile( 502 "pld [%[pixels]] \n\t" 503 "pld [%[pixels], #32] \n\t" 504 "and r12, %[pixels], #7 \n\t" 505 "tmcr wcgr1, r12 \n\t" 506 "bic %[pixels], %[pixels], #7 \n\t" 507 508 "wldrd wr10, [%[pixels]] \n\t" 509 "wldrd wr11, [%[pixels], #8] \n\t" 510 "wldrd wr12, [%[pixels], #16] \n\t" 511 "add %[pixels], %[pixels], %[line_size] \n\t" 512 "pld [%[pixels]] \n\t" 513 "pld [%[pixels], #32] \n\t" 514 "walignr1 wr0, wr10, wr11 \n\t" 515 "walignr1 wr1, wr11, wr12 \n\t" 516 517 "1: \n\t" 518 "wldrd wr10, [%[pixels]] \n\t" 519 "wldrd wr11, [%[pixels], #8] \n\t" 520 "wldrd wr12, [%[pixels], #16] \n\t" 521 "add %[pixels], %[pixels], %[line_size] \n\t" 522 "pld [%[pixels]] \n\t" 523 "pld [%[pixels], #32] \n\t" 524 "walignr1 wr4, wr10, wr11 \n\t" 525 "walignr1 wr5, wr11, wr12 \n\t" 526 WAVG2B" wr8, wr0, wr4 \n\t" 527 WAVG2B" wr9, wr1, wr5 \n\t" 528 "wstrd wr8, [%[block]] \n\t" 529 "wstrd wr9, [%[block], #8] \n\t" 530 "add %[block], %[block], %[line_size] \n\t" 531 532 "wldrd wr10, [%[pixels]] \n\t" 533 "wldrd wr11, [%[pixels], #8] \n\t" 534 "wldrd wr12, [%[pixels], #16] \n\t" 535 "add %[pixels], %[pixels], %[line_size] \n\t" 536 "pld [%[pixels]] \n\t" 537 "pld [%[pixels], #32] \n\t" 538 "walignr1 wr0, wr10, wr11 \n\t" 539 "walignr1 wr1, wr11, wr12 \n\t" 540 WAVG2B" wr8, wr0, wr4 \n\t" 541 WAVG2B" wr9, wr1, wr5 \n\t" 542 "wstrd wr8, [%[block]] \n\t" 543 "wstrd wr9, [%[block], #8] \n\t" 544 "add %[block], %[block], %[line_size] \n\t" 545 546 "subs %[h], %[h], #2 \n\t" 547 "bne 1b \n\t" 548 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) 549 : 550 : "r4", "r5", "r12", "memory"); 
551} 552 553void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) 554{ 555 int stride = line_size; 556 // [wr0 wr1 wr2 wr3] for previous line 557 // [wr4 wr5 wr6 wr7] for current line 558 __asm__ volatile( 559 "pld [%[pixels]] \n\t" 560 "pld [%[pixels], #32] \n\t" 561 "and r12, %[pixels], #7 \n\t" 562 "tmcr wcgr1, r12 \n\t" 563 "bic %[pixels], %[pixels], #7 \n\t" 564 565 "wldrd wr10, [%[pixels]] \n\t" 566 "wldrd wr11, [%[pixels], #8] \n\t" 567 "pld [%[block]] \n\t" 568 "wldrd wr12, [%[pixels], #16] \n\t" 569 "add %[pixels], %[pixels], %[line_size] \n\t" 570 "pld [%[pixels]] \n\t" 571 "pld [%[pixels], #32] \n\t" 572 "walignr1 wr0, wr10, wr11 \n\t" 573 "walignr1 wr1, wr11, wr12 \n\t" 574 575 "1: \n\t" 576 "wldrd wr10, [%[pixels]] \n\t" 577 "wldrd wr11, [%[pixels], #8] \n\t" 578 "wldrd wr12, [%[pixels], #16] \n\t" 579 "add %[pixels], %[pixels], %[line_size] \n\t" 580 "pld [%[pixels]] \n\t" 581 "pld [%[pixels], #32] \n\t" 582 "walignr1 wr4, wr10, wr11 \n\t" 583 "walignr1 wr5, wr11, wr12 \n\t" 584 "wldrd wr10, [%[block]] \n\t" 585 "wldrd wr11, [%[block], #8] \n\t" 586 WAVG2B" wr8, wr0, wr4 \n\t" 587 WAVG2B" wr9, wr1, wr5 \n\t" 588 WAVG2B" wr8, wr8, wr10 \n\t" 589 WAVG2B" wr9, wr9, wr11 \n\t" 590 "wstrd wr8, [%[block]] \n\t" 591 "wstrd wr9, [%[block], #8] \n\t" 592 "add %[block], %[block], %[line_size] \n\t" 593 594 "wldrd wr10, [%[pixels]] \n\t" 595 "wldrd wr11, [%[pixels], #8] \n\t" 596 "pld [%[block]] \n\t" 597 "wldrd wr12, [%[pixels], #16] \n\t" 598 "add %[pixels], %[pixels], %[line_size] \n\t" 599 "pld [%[pixels]] \n\t" 600 "pld [%[pixels], #32] \n\t" 601 "walignr1 wr0, wr10, wr11 \n\t" 602 "walignr1 wr1, wr11, wr12 \n\t" 603 "wldrd wr10, [%[block]] \n\t" 604 "wldrd wr11, [%[block], #8] \n\t" 605 WAVG2B" wr8, wr0, wr4 \n\t" 606 WAVG2B" wr9, wr1, wr5 \n\t" 607 WAVG2B" wr8, wr8, wr10 \n\t" 608 WAVG2B" wr9, wr9, wr11 \n\t" 609 "wstrd wr8, [%[block]] \n\t" 610 "wstrd wr9, [%[block], #8] \n\t" 611 "add %[block], %[block], 
%[line_size] \n\t" 612 613 "subs %[h], %[h], #2 \n\t" 614 "pld [%[block]] \n\t" 615 "bne 1b \n\t" 616 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) 617 : 618 : "r4", "r5", "r12", "memory"); 619} 620 621void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) 622{ 623 // [wr0 wr1 wr2 wr3] for previous line 624 // [wr4 wr5 wr6 wr7] for current line 625 SET_RND(wr15); // =2 for rnd and =1 for no_rnd version 626 __asm__ volatile( 627 "pld [%[pixels]] \n\t" 628 "mov r12, #2 \n\t" 629 "pld [%[pixels], #32] \n\t" 630 "tmcr wcgr0, r12 \n\t" /* for shift value */ 631 "and r12, %[pixels], #7 \n\t" 632 "bic %[pixels], %[pixels], #7 \n\t" 633 "tmcr wcgr1, r12 \n\t" 634 635 // [wr0 wr1 wr2 wr3] <= * 636 // [wr4 wr5 wr6 wr7] 637 "wldrd wr12, [%[pixels]] \n\t" 638 "add r12, r12, #1 \n\t" 639 "wldrd wr13, [%[pixels], #8] \n\t" 640 "tmcr wcgr2, r12 \n\t" 641 "add %[pixels], %[pixels], %[line_size] \n\t" 642 "cmp r12, #8 \n\t" 643 "pld [%[pixels]] \n\t" 644 "pld [%[pixels], #32] \n\t" 645 "walignr1 wr2, wr12, wr13 \n\t" 646 "wmoveq wr10, wr13 \n\t" 647 "walignr2ne wr10, wr12, wr13 \n\t" 648 "wunpckelub wr0, wr2 \n\t" 649 "wunpckehub wr1, wr2 \n\t" 650 "wunpckelub wr8, wr10 \n\t" 651 "wunpckehub wr9, wr10 \n\t" 652 "waddhus wr0, wr0, wr8 \n\t" 653 "waddhus wr1, wr1, wr9 \n\t" 654 655 "1: \n\t" 656 // [wr0 wr1 wr2 wr3] 657 // [wr4 wr5 wr6 wr7] <= * 658 "wldrd wr12, [%[pixels]] \n\t" 659 "cmp r12, #8 \n\t" 660 "wldrd wr13, [%[pixels], #8] \n\t" 661 "add %[pixels], %[pixels], %[line_size] \n\t" 662 "walignr1 wr6, wr12, wr13 \n\t" 663 "pld [%[pixels]] \n\t" 664 "pld [%[pixels], #32] \n\t" 665 "wmoveq wr10, wr13 \n\t" 666 "walignr2ne wr10, wr12, wr13 \n\t" 667 "wunpckelub wr4, wr6 \n\t" 668 "wunpckehub wr5, wr6 \n\t" 669 "wunpckelub wr8, wr10 \n\t" 670 "wunpckehub wr9, wr10 \n\t" 671 "waddhus wr4, wr4, wr8 \n\t" 672 "waddhus wr5, wr5, wr9 \n\t" 673 "waddhus wr8, wr0, wr4 \n\t" 674 "waddhus wr9, wr1, wr5 \n\t" 675 
"waddhus wr8, wr8, wr15 \n\t" 676 "waddhus wr9, wr9, wr15 \n\t" 677 "wsrlhg wr8, wr8, wcgr0 \n\t" 678 "wsrlhg wr9, wr9, wcgr0 \n\t" 679 "wpackhus wr8, wr8, wr9 \n\t" 680 "wstrd wr8, [%[block]] \n\t" 681 "add %[block], %[block], %[line_size] \n\t" 682 683 // [wr0 wr1 wr2 wr3] <= * 684 // [wr4 wr5 wr6 wr7] 685 "wldrd wr12, [%[pixels]] \n\t" 686 "wldrd wr13, [%[pixels], #8] \n\t" 687 "add %[pixels], %[pixels], %[line_size] \n\t" 688 "walignr1 wr2, wr12, wr13 \n\t" 689 "pld [%[pixels]] \n\t" 690 "pld [%[pixels], #32] \n\t" 691 "wmoveq wr10, wr13 \n\t" 692 "walignr2ne wr10, wr12, wr13 \n\t" 693 "wunpckelub wr0, wr2 \n\t" 694 "wunpckehub wr1, wr2 \n\t" 695 "wunpckelub wr8, wr10 \n\t" 696 "wunpckehub wr9, wr10 \n\t" 697 "waddhus wr0, wr0, wr8 \n\t" 698 "waddhus wr1, wr1, wr9 \n\t" 699 "waddhus wr8, wr0, wr4 \n\t" 700 "waddhus wr9, wr1, wr5 \n\t" 701 "waddhus wr8, wr8, wr15 \n\t" 702 "waddhus wr9, wr9, wr15 \n\t" 703 "wsrlhg wr8, wr8, wcgr0 \n\t" 704 "wsrlhg wr9, wr9, wcgr0 \n\t" 705 "wpackhus wr8, wr8, wr9 \n\t" 706 "subs %[h], %[h], #2 \n\t" 707 "wstrd wr8, [%[block]] \n\t" 708 "add %[block], %[block], %[line_size] \n\t" 709 "bne 1b \n\t" 710 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block) 711 : [line_size]"r"(line_size) 712 : "r12", "memory"); 713} 714 715void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) 716{ 717 // [wr0 wr1 wr2 wr3] for previous line 718 // [wr4 wr5 wr6 wr7] for current line 719 SET_RND(wr15); // =2 for rnd and =1 for no_rnd version 720 __asm__ volatile( 721 "pld [%[pixels]] \n\t" 722 "mov r12, #2 \n\t" 723 "pld [%[pixels], #32] \n\t" 724 "tmcr wcgr0, r12 \n\t" /* for shift value */ 725 /* alignment */ 726 "and r12, %[pixels], #7 \n\t" 727 "bic %[pixels], %[pixels], #7 \n\t" 728 "tmcr wcgr1, r12 \n\t" 729 "add r12, r12, #1 \n\t" 730 "tmcr wcgr2, r12 \n\t" 731 732 // [wr0 wr1 wr2 wr3] <= * 733 // [wr4 wr5 wr6 wr7] 734 "wldrd wr12, [%[pixels]] \n\t" 735 "cmp r12, #8 \n\t" 736 "wldrd wr13, [%[pixels], 
#8] \n\t" 737 "wldrd wr14, [%[pixels], #16] \n\t" 738 "add %[pixels], %[pixels], %[line_size] \n\t" 739 "pld [%[pixels]] \n\t" 740 "walignr1 wr2, wr12, wr13 \n\t" 741 "pld [%[pixels], #32] \n\t" 742 "walignr1 wr3, wr13, wr14 \n\t" 743 "wmoveq wr10, wr13 \n\t" 744 "wmoveq wr11, wr14 \n\t" 745 "walignr2ne wr10, wr12, wr13 \n\t" 746 "walignr2ne wr11, wr13, wr14 \n\t" 747 "wunpckelub wr0, wr2 \n\t" 748 "wunpckehub wr1, wr2 \n\t" 749 "wunpckelub wr2, wr3 \n\t" 750 "wunpckehub wr3, wr3 \n\t" 751 "wunpckelub wr8, wr10 \n\t" 752 "wunpckehub wr9, wr10 \n\t" 753 "wunpckelub wr10, wr11 \n\t" 754 "wunpckehub wr11, wr11 \n\t" 755 "waddhus wr0, wr0, wr8 \n\t" 756 "waddhus wr1, wr1, wr9 \n\t" 757 "waddhus wr2, wr2, wr10 \n\t" 758 "waddhus wr3, wr3, wr11 \n\t" 759 760 "1: \n\t" 761 // [wr0 wr1 wr2 wr3] 762 // [wr4 wr5 wr6 wr7] <= * 763 "wldrd wr12, [%[pixels]] \n\t" 764 "cmp r12, #8 \n\t" 765 "wldrd wr13, [%[pixels], #8] \n\t" 766 "wldrd wr14, [%[pixels], #16] \n\t" 767 "add %[pixels], %[pixels], %[line_size] \n\t" 768 "walignr1 wr6, wr12, wr13 \n\t" 769 "pld [%[pixels]] \n\t" 770 "pld [%[pixels], #32] \n\t" 771 "walignr1 wr7, wr13, wr14 \n\t" 772 "wmoveq wr10, wr13 \n\t" 773 "wmoveq wr11, wr14 \n\t" 774 "walignr2ne wr10, wr12, wr13 \n\t" 775 "walignr2ne wr11, wr13, wr14 \n\t" 776 "wunpckelub wr4, wr6 \n\t" 777 "wunpckehub wr5, wr6 \n\t" 778 "wunpckelub wr6, wr7 \n\t" 779 "wunpckehub wr7, wr7 \n\t" 780 "wunpckelub wr8, wr10 \n\t" 781 "wunpckehub wr9, wr10 \n\t" 782 "wunpckelub wr10, wr11 \n\t" 783 "wunpckehub wr11, wr11 \n\t" 784 "waddhus wr4, wr4, wr8 \n\t" 785 "waddhus wr5, wr5, wr9 \n\t" 786 "waddhus wr6, wr6, wr10 \n\t" 787 "waddhus wr7, wr7, wr11 \n\t" 788 "waddhus wr8, wr0, wr4 \n\t" 789 "waddhus wr9, wr1, wr5 \n\t" 790 "waddhus wr10, wr2, wr6 \n\t" 791 "waddhus wr11, wr3, wr7 \n\t" 792 "waddhus wr8, wr8, wr15 \n\t" 793 "waddhus wr9, wr9, wr15 \n\t" 794 "waddhus wr10, wr10, wr15 \n\t" 795 "waddhus wr11, wr11, wr15 \n\t" 796 "wsrlhg wr8, wr8, wcgr0 \n\t" 797 "wsrlhg wr9, wr9, 
wcgr0 \n\t" 798 "wsrlhg wr10, wr10, wcgr0 \n\t" 799 "wsrlhg wr11, wr11, wcgr0 \n\t" 800 "wpackhus wr8, wr8, wr9 \n\t" 801 "wpackhus wr9, wr10, wr11 \n\t" 802 "wstrd wr8, [%[block]] \n\t" 803 "wstrd wr9, [%[block], #8] \n\t" 804 "add %[block], %[block], %[line_size] \n\t" 805 806 // [wr0 wr1 wr2 wr3] <= * 807 // [wr4 wr5 wr6 wr7] 808 "wldrd wr12, [%[pixels]] \n\t" 809 "wldrd wr13, [%[pixels], #8] \n\t" 810 "wldrd wr14, [%[pixels], #16] \n\t" 811 "add %[pixels], %[pixels], %[line_size] \n\t" 812 "walignr1 wr2, wr12, wr13 \n\t" 813 "pld [%[pixels]] \n\t" 814 "pld [%[pixels], #32] \n\t" 815 "walignr1 wr3, wr13, wr14 \n\t" 816 "wmoveq wr10, wr13 \n\t" 817 "wmoveq wr11, wr14 \n\t" 818 "walignr2ne wr10, wr12, wr13 \n\t" 819 "walignr2ne wr11, wr13, wr14 \n\t" 820 "wunpckelub wr0, wr2 \n\t" 821 "wunpckehub wr1, wr2 \n\t" 822 "wunpckelub wr2, wr3 \n\t" 823 "wunpckehub wr3, wr3 \n\t" 824 "wunpckelub wr8, wr10 \n\t" 825 "wunpckehub wr9, wr10 \n\t" 826 "wunpckelub wr10, wr11 \n\t" 827 "wunpckehub wr11, wr11 \n\t" 828 "waddhus wr0, wr0, wr8 \n\t" 829 "waddhus wr1, wr1, wr9 \n\t" 830 "waddhus wr2, wr2, wr10 \n\t" 831 "waddhus wr3, wr3, wr11 \n\t" 832 "waddhus wr8, wr0, wr4 \n\t" 833 "waddhus wr9, wr1, wr5 \n\t" 834 "waddhus wr10, wr2, wr6 \n\t" 835 "waddhus wr11, wr3, wr7 \n\t" 836 "waddhus wr8, wr8, wr15 \n\t" 837 "waddhus wr9, wr9, wr15 \n\t" 838 "waddhus wr10, wr10, wr15 \n\t" 839 "waddhus wr11, wr11, wr15 \n\t" 840 "wsrlhg wr8, wr8, wcgr0 \n\t" 841 "wsrlhg wr9, wr9, wcgr0 \n\t" 842 "wsrlhg wr10, wr10, wcgr0 \n\t" 843 "wsrlhg wr11, wr11, wcgr0 \n\t" 844 "wpackhus wr8, wr8, wr9 \n\t" 845 "wpackhus wr9, wr10, wr11 \n\t" 846 "wstrd wr8, [%[block]] \n\t" 847 "wstrd wr9, [%[block], #8] \n\t" 848 "add %[block], %[block], %[line_size] \n\t" 849 850 "subs %[h], %[h], #2 \n\t" 851 "bne 1b \n\t" 852 : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block) 853 : [line_size]"r"(line_size) 854 : "r12", "memory"); 855}

/**
 * Blend an 8-pixel-wide, 2x2-filtered copy of the source into the destination.
 *
 * For every output row the code sums four source bytes per pixel (the pixel
 * and its right-hand neighbour, on the previous and on the current source
 * row), adds the constant held in wr15, shifts the sum right by 2 and packs
 * it back to bytes. The packed row is then combined with the 8 bytes already
 * stored at block via WAVG2B and written back.
 *
 * @param block     destination; read and written 8 bytes per row with
 *                  wldrd/wstrd (presumably must be 8-byte aligned -- confirm)
 * @param pixels    source; may have any byte alignment. The pointer is rounded
 *                  down to a multiple of 8 and 16 bytes are loaded per row
 *                  starting there. h + 1 source rows are read in total.
 * @param line_size byte distance between rows, applied to both pointers
 * @param h         number of rows. Two rows are produced per loop pass and the
 *                  loop only ends when the counter hits exactly zero, so h
 *                  must be a positive even number.
 *
 * SET_RND and WAVG2B are macros defined elsewhere in the project; WAVG2B
 * presumably supplies the averaging mnemonic for the rnd/no_rnd variant.
 */
void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
{
    // [wr0 wr1 wr2 wr3] for previous line
    // [wr4 wr5 wr6 wr7] for current line
    SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
    __asm__ volatile(
        "pld [%[block]] \n\t"
        "pld [%[block], #32] \n\t"
        "pld [%[pixels]] \n\t"
        "mov r12, #2 \n\t"
        "pld [%[pixels], #32] \n\t"
        "tmcr wcgr0, r12 \n\t" /* for shift value */
        /* wcgr1 = byte offset of pixels inside its 8-byte word */
        "and r12, %[pixels], #7 \n\t"
        "bic %[pixels], %[pixels], #7 \n\t"
        "tmcr wcgr1, r12 \n\t"

        // [wr0 wr1 wr2 wr3] <= *
        // [wr4 wr5 wr6 wr7]
        /* Prime the "previous line" sums with the first source row. */
        "wldrd wr12, [%[pixels]] \n\t"
        /* r12 = offset + 1 from here on; it also goes into wcgr2 */
        "add r12, r12, #1 \n\t"
        "wldrd wr13, [%[pixels], #8] \n\t"
        "tmcr wcgr2, r12 \n\t"
        "add %[pixels], %[pixels], %[line_size] \n\t"
        /* offset + 1 == 8: the data shifted by one byte is exactly wr13,
         * so it is copied (wmoveq) instead of realigned (walignr2ne) */
        "cmp r12, #8 \n\t"
        "pld [%[pixels]] \n\t"
        "pld [%[pixels], #32] \n\t"
        "walignr1 wr2, wr12, wr13 \n\t"
        "wmoveq wr10, wr13 \n\t"
        "walignr2ne wr10, wr12, wr13 \n\t"
        "wunpckelub wr0, wr2 \n\t"
        "wunpckehub wr1, wr2 \n\t"
        "wunpckelub wr8, wr10 \n\t"
        "wunpckehub wr9, wr10 \n\t"
        "waddhus wr0, wr0, wr8 \n\t"
        "waddhus wr1, wr1, wr9 \n\t"

        "1: \n\t"
        // [wr0 wr1 wr2 wr3]
        // [wr4 wr5 wr6 wr7] <= *
        "wldrd wr12, [%[pixels]] \n\t"
        /* Re-establish the flags: the subs at the loop tail overwrote them.
         * No flag-setting instruction follows until that subs, so both
         * halves of the loop body rely on this single compare. */
        "cmp r12, #8 \n\t"
        "wldrd wr13, [%[pixels], #8] \n\t"
        "add %[pixels], %[pixels], %[line_size] \n\t"
        "walignr1 wr6, wr12, wr13 \n\t"
        "pld [%[pixels]] \n\t"
        "pld [%[pixels], #32] \n\t"
        "wmoveq wr10, wr13 \n\t"
        "walignr2ne wr10, wr12, wr13 \n\t"
        "wunpckelub wr4, wr6 \n\t"
        "wunpckehub wr5, wr6 \n\t"
        "wunpckelub wr8, wr10 \n\t"
        "wunpckehub wr9, wr10 \n\t"
        "waddhus wr4, wr4, wr8 \n\t"
        "waddhus wr5, wr5, wr9 \n\t"
        /* previous-row sums + current-row sums + rounding constant */
        "waddhus wr8, wr0, wr4 \n\t"
        "waddhus wr9, wr1, wr5 \n\t"
        "waddhus wr8, wr8, wr15 \n\t"
        "waddhus wr9, wr9, wr15 \n\t"
        /* existing destination bytes, needed for the final WAVG2B */
        "wldrd wr12, [%[block]] \n\t"
        "wsrlhg wr8, wr8, wcgr0 \n\t"
        "wsrlhg wr9, wr9, wcgr0 \n\t"
        "wpackhus wr8, wr8, wr9 \n\t"
        WAVG2B" wr8, wr8, wr12 \n\t"
        "wstrd wr8, [%[block]] \n\t"
        "add %[block], %[block], %[line_size] \n\t"
        /* first load of the next source row is issued early */
        "wldrd wr12, [%[pixels]] \n\t"
        "pld [%[block]] \n\t"
        "pld [%[block], #32] \n\t"

        // [wr0 wr1 wr2 wr3] <= *
        // [wr4 wr5 wr6 wr7]
        "wldrd wr13, [%[pixels], #8] \n\t"
        "add %[pixels], %[pixels], %[line_size] \n\t"
        "walignr1 wr2, wr12, wr13 \n\t"
        "pld [%[pixels]] \n\t"
        "pld [%[pixels], #32] \n\t"
        "wmoveq wr10, wr13 \n\t"
        "walignr2ne wr10, wr12, wr13 \n\t"
        "wunpckelub wr0, wr2 \n\t"
        "wunpckehub wr1, wr2 \n\t"
        "wunpckelub wr8, wr10 \n\t"
        "wunpckehub wr9, wr10 \n\t"
        "waddhus wr0, wr0, wr8 \n\t"
        "waddhus wr1, wr1, wr9 \n\t"
        "waddhus wr8, wr0, wr4 \n\t"
        "waddhus wr9, wr1, wr5 \n\t"
        "waddhus wr8, wr8, wr15 \n\t"
        "waddhus wr9, wr9, wr15 \n\t"
        "wldrd wr12, [%[block]] \n\t"
        "wsrlhg wr8, wr8, wcgr0 \n\t"
        "wsrlhg wr9, wr9, wcgr0 \n\t"
        "wpackhus wr8, wr8, wr9 \n\t"
        "subs %[h], %[h], #2 \n\t"
        WAVG2B" wr8, wr8, wr12 \n\t"
        "wstrd wr8, [%[block]] \n\t"
        "add %[block], %[block], %[line_size] \n\t"
        "pld [%[block]] \n\t"
        "pld [%[block], #32] \n\t"
        "bne 1b \n\t"
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
        : [line_size]"r"(line_size)
        /* "cc": cmp and subs above modify the condition flags.
         * NOTE(review): the wr and wcgr registers written here are not listed
         * as clobbers; presumably the compiler never allocates them -- confirm. */
        : "r12", "cc", "memory");
}

/**
 * Blend a 16-pixel-wide, 2x2-filtered copy of the source into the destination.
 *
 * Same computation as the 8-pixel variant, carried out on two 8-byte halves
 * per row: four source bytes per pixel (the pixel and its right-hand
 * neighbour, on the previous and on the current source row) are summed, the
 * constant held in wr15 is added, the sum is shifted right by 2 and packed
 * back to bytes. Each packed half is combined with the bytes already stored
 * at block via WAVG2B and written back.
 *
 * @param block     destination; read and written 16 bytes per row with
 *                  wldrd/wstrd (presumably must be 8-byte aligned -- confirm)
 * @param pixels    source; may have any byte alignment. The pointer is rounded
 *                  down to a multiple of 8 and 24 bytes are loaded per row
 *                  starting there. h + 1 source rows are read in total.
 * @param line_size byte distance between rows, applied to both pointers
 * @param h         number of rows. Two rows are produced per loop pass and the
 *                  loop only ends when the counter hits exactly zero, so h
 *                  must be a positive even number.
 *
 * SET_RND and WAVG2B are macros defined elsewhere in the project; WAVG2B
 * presumably supplies the averaging mnemonic for the rnd/no_rnd variant.
 */
void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
{
    // [wr0 wr1 wr2 wr3] for previous line
    // [wr4 wr5 wr6 wr7] for current line
    SET_RND(wr15); // =2 for rnd and =1 for no_rnd version
    __asm__ volatile(
        "pld [%[block]] \n\t"
        "pld [%[block], #32] \n\t"
        "pld [%[pixels]] \n\t"
        "mov r12, #2 \n\t"
        "pld [%[pixels], #32] \n\t"
        "tmcr wcgr0, r12 \n\t" /* for shift value */
        /* alignment: wcgr1 = offset of pixels in its 8-byte word,
         * wcgr2 = offset + 1; r12 keeps offset + 1 for the compares below */
        "and r12, %[pixels], #7 \n\t"
        "bic %[pixels], %[pixels], #7 \n\t"
        "tmcr wcgr1, r12 \n\t"
        "add r12, r12, #1 \n\t"
        "tmcr wcgr2, r12 \n\t"

        // [wr0 wr1 wr2 wr3] <= *
        // [wr4 wr5 wr6 wr7]
        /* Prime the "previous line" sums with the first source row. */
        "wldrd wr12, [%[pixels]] \n\t"
        /* offset + 1 == 8: the data shifted by one byte is exactly
         * wr13/wr14, so it is copied (wmoveq) instead of realigned */
        "cmp r12, #8 \n\t"
        "wldrd wr13, [%[pixels], #8] \n\t"
        "wldrd wr14, [%[pixels], #16] \n\t"
        "add %[pixels], %[pixels], %[line_size] \n\t"
        "pld [%[pixels]] \n\t"
        "walignr1 wr2, wr12, wr13 \n\t"
        "pld [%[pixels], #32] \n\t"
        "walignr1 wr3, wr13, wr14 \n\t"
        "wmoveq wr10, wr13 \n\t"
        "wmoveq wr11, wr14 \n\t"
        "walignr2ne wr10, wr12, wr13 \n\t"
        "walignr2ne wr11, wr13, wr14 \n\t"
        "wunpckelub wr0, wr2 \n\t"
        "wunpckehub wr1, wr2 \n\t"
        "wunpckelub wr2, wr3 \n\t"
        "wunpckehub wr3, wr3 \n\t"
        "wunpckelub wr8, wr10 \n\t"
        "wunpckehub wr9, wr10 \n\t"
        "wunpckelub wr10, wr11 \n\t"
        "wunpckehub wr11, wr11 \n\t"
        "waddhus wr0, wr0, wr8 \n\t"
        "waddhus wr1, wr1, wr9 \n\t"
        "waddhus wr2, wr2, wr10 \n\t"
        "waddhus wr3, wr3, wr11 \n\t"

        "1: \n\t"
        // [wr0 wr1 wr2 wr3]
        // [wr4 wr5 wr6 wr7] <= *
        "wldrd wr12, [%[pixels]] \n\t"
        /* Re-establish the flags: the subs at the loop tail overwrote them.
         * No flag-setting instruction follows until that subs, so both
         * halves of the loop body rely on this single compare. */
        "cmp r12, #8 \n\t"
        "wldrd wr13, [%[pixels], #8] \n\t"
        "wldrd wr14, [%[pixels], #16] \n\t"
        "add %[pixels], %[pixels], %[line_size] \n\t"
        "walignr1 wr6, wr12, wr13 \n\t"
        "pld [%[pixels]] \n\t"
        "pld [%[pixels], #32] \n\t"
        "walignr1 wr7, wr13, wr14 \n\t"
        "wmoveq wr10, wr13 \n\t"
        "wmoveq wr11, wr14 \n\t"
        "walignr2ne wr10, wr12, wr13 \n\t"
        "walignr2ne wr11, wr13, wr14 \n\t"
        "wunpckelub wr4, wr6 \n\t"
        "wunpckehub wr5, wr6 \n\t"
        "wunpckelub wr6, wr7 \n\t"
        "wunpckehub wr7, wr7 \n\t"
        "wunpckelub wr8, wr10 \n\t"
        "wunpckehub wr9, wr10 \n\t"
        "wunpckelub wr10, wr11 \n\t"
        "wunpckehub wr11, wr11 \n\t"
        "waddhus wr4, wr4, wr8 \n\t"
        "waddhus wr5, wr5, wr9 \n\t"
        "waddhus wr6, wr6, wr10 \n\t"
        "waddhus wr7, wr7, wr11 \n\t"
        /* previous-row sums + current-row sums + rounding constant */
        "waddhus wr8, wr0, wr4 \n\t"
        "waddhus wr9, wr1, wr5 \n\t"
        "waddhus wr10, wr2, wr6 \n\t"
        "waddhus wr11, wr3, wr7 \n\t"
        "waddhus wr8, wr8, wr15 \n\t"
        "waddhus wr9, wr9, wr15 \n\t"
        "waddhus wr10, wr10, wr15 \n\t"
        "waddhus wr11, wr11, wr15 \n\t"
        "wsrlhg wr8, wr8, wcgr0 \n\t"
        "wsrlhg wr9, wr9, wcgr0 \n\t"
        /* existing destination bytes, needed for the final WAVG2B */
        "wldrd wr12, [%[block]] \n\t"
        "wldrd wr13, [%[block], #8] \n\t"
        "wsrlhg wr10, wr10, wcgr0 \n\t"
        "wsrlhg wr11, wr11, wcgr0 \n\t"
        "wpackhus wr8, wr8, wr9 \n\t"
        "wpackhus wr9, wr10, wr11 \n\t"
        WAVG2B" wr8, wr8, wr12 \n\t"
        WAVG2B" wr9, wr9, wr13 \n\t"
        "wstrd wr8, [%[block]] \n\t"
        "wstrd wr9, [%[block], #8] \n\t"
        "add %[block], %[block], %[line_size] \n\t"

        // [wr0 wr1 wr2 wr3] <= *
        // [wr4 wr5 wr6 wr7]
        "wldrd wr12, [%[pixels]] \n\t"
        "pld [%[block]] \n\t"
        "wldrd wr13, [%[pixels], #8] \n\t"
        "pld [%[block], #32] \n\t"
        "wldrd wr14, [%[pixels], #16] \n\t"
        "add %[pixels], %[pixels], %[line_size] \n\t"
        "walignr1 wr2, wr12, wr13 \n\t"
        "pld [%[pixels]] \n\t"
        "pld [%[pixels], #32] \n\t"
        "walignr1 wr3, wr13, wr14 \n\t"
        "wmoveq wr10, wr13 \n\t"
        "wmoveq wr11, wr14 \n\t"
        "walignr2ne wr10, wr12, wr13 \n\t"
        "walignr2ne wr11, wr13, wr14 \n\t"
        "wunpckelub wr0, wr2 \n\t"
        "wunpckehub wr1, wr2 \n\t"
        "wunpckelub wr2, wr3 \n\t"
        "wunpckehub wr3, wr3 \n\t"
        "wunpckelub wr8, wr10 \n\t"
        "wunpckehub wr9, wr10 \n\t"
        "wunpckelub wr10, wr11 \n\t"
        "wunpckehub wr11, wr11 \n\t"
        "waddhus wr0, wr0, wr8 \n\t"
        "waddhus wr1, wr1, wr9 \n\t"
        "waddhus wr2, wr2, wr10 \n\t"
        "waddhus wr3, wr3, wr11 \n\t"
        "waddhus wr8, wr0, wr4 \n\t"
        "waddhus wr9, wr1, wr5 \n\t"
        "waddhus wr10, wr2, wr6 \n\t"
        "waddhus wr11, wr3, wr7 \n\t"
        "waddhus wr8, wr8, wr15 \n\t"
        "waddhus wr9, wr9, wr15 \n\t"
        "waddhus wr10, wr10, wr15 \n\t"
        "waddhus wr11, wr11, wr15 \n\t"
        "wsrlhg wr8, wr8, wcgr0 \n\t"
        "wsrlhg wr9, wr9, wcgr0 \n\t"
        "wldrd wr12, [%[block]] \n\t"
        "wldrd wr13, [%[block], #8] \n\t"
        "wsrlhg wr10, wr10, wcgr0 \n\t"
        "wsrlhg wr11, wr11, wcgr0 \n\t"
        "wpackhus wr8, wr8, wr9 \n\t"
        "wpackhus wr9, wr10, wr11 \n\t"
        WAVG2B" wr8, wr8, wr12 \n\t"
        WAVG2B" wr9, wr9, wr13 \n\t"
        "wstrd wr8, [%[block]] \n\t"
        "wstrd wr9, [%[block], #8] \n\t"
        "add %[block], %[block], %[line_size] \n\t"
        "subs %[h], %[h], #2 \n\t"
        "pld [%[block]] \n\t"
        "pld [%[block], #32] \n\t"
        "bne 1b \n\t"
        : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
        : [line_size]"r"(line_size)
        /* "cc": cmp and subs above modify the condition flags.
         * NOTE(review): the wr and wcgr registers written here are not listed
         * as clobbers; presumably the compiler never allocates them -- confirm. */
        : "r12", "cc", "memory");
}