/*
 * Simple IDCT
 *
 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
 * Copyright (c) 2006 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

#define W1 22725   /* cos(1*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W2 21407   /* cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W3 19266   /* cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W4 16383   /* cos(4*M_PI/16)*sqrt(2)*(1<<14) + 0.5 rounds to 16384; 16383 matches the C version */
#define W5 12873   /* cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W6 8867    /* cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define W7 4520    /* cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define ROW_SHIFT 11
#define COL_SHIFT 20

#define W13 (W1 | (W3 << 16))
#define W26 (W2 | (W6 << 16))
#define W57 (W5 | (W7 << 16))

        .text
        .align
w13:    .long W13
w26:    .long W26
w57:    .long W57

function idct_row_armv5te
        str    lr, [sp, #-4]!

        ldrd   v1, [a1, #8]
        ldrd   a3, [a1]              /* a3 = row[1:0], a4 = row[3:2] */
        orrs   v1, v1, v2
        itt    eq
        cmpeq  v1, a4
        cmpeq  v1, a3, lsr #16
        beq    row_dc_only

        mov    v1, #(1<<(ROW_SHIFT-1))
        mov    ip, #16384
        sub    ip, ip, #1            /* ip = W4 */
        smlabb v1, ip, a3, v1        /* v1 = W4*row[0]+(1<<(RS-1)) */
        ldr    ip, w26               /* ip = W2 | (W6 << 16) */
        smultb a2, ip, a4
        smulbb lr, ip, a4
        add    v2, v1, a2
        sub    v3, v1, a2
        sub    v4, v1, lr
        add    v1, v1, lr

        ldr    ip, w13               /* ip = W1 | (W3 << 16) */
        ldr    lr, w57               /* lr = W5 | (W7 << 16) */
        smulbt v5, ip, a3
        smultt v6, lr, a4
        smlatt v5, ip, a4, v5
        smultt a2, ip, a3
        smulbt v7, lr, a3
        sub    v6, v6, a2
        smulbt a2, ip, a4
        smultt fp, lr, a3
        sub    v7, v7, a2
        smulbt a2, lr, a4
        ldrd   a3, [a1, #8]          /* a3=row[5:4] a4=row[7:6] */
        sub    fp, fp, a2

        orrs   a2, a3, a4
        beq    1f

        smlabt v5, lr, a3, v5
        smlabt v6, ip, a3, v6
        smlatt v5, lr, a4, v5
        smlabt v6, lr, a4, v6
        smlatt v7, lr, a3, v7
        smlatt fp, ip, a3, fp
        smulbt a2, ip, a4
        smlatt v7, ip, a4, v7
        sub    fp, fp, a2

        ldr    ip, w26               /* ip = W2 | (W6 << 16) */
        mov    a2, #16384
        sub    a2, a2, #1            /* a2 =  W4 */
        smulbb a2, a2, a3            /* a2 =  W4*row[4] */
        smultb lr, ip, a4            /* lr =  W6*row[6] */
        add    v1, v1, a2            /* v1 += W4*row[4] */
        add    v1, v1, lr            /* v1 += W6*row[6] */
        add    v4, v4, a2            /* v4 += W4*row[4] */
        sub    v4, v4, lr            /* v4 -= W6*row[6] */
        smulbb lr, ip, a4            /* lr =  W2*row[6] */
        sub    v2, v2, a2            /* v2 -= W4*row[4] */
        sub    v2, v2, lr            /* v2 -= W2*row[6] */
        sub    v3, v3, a2            /* v3 -= W4*row[4] */
        add    v3, v3, lr            /* v3 += W2*row[6] */
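/*
 * Descale and pack.  Each 32-bit term is shifted right by ROW_SHIFT
 * (11), leaving a 21-bit value, and two 16-bit results are packed per
 * word so each strd stores four samples.  After the lsr a negative
 * term still has bits 16-20 set, so the bic #0x1f0000 clears them
 * before the second result is added in with lsl #16.
 */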
1:      add    a2, v1, v5
        mov    a3, a2, lsr #11
        bic    a3, a3, #0x1f0000
        sub    a2, v2, v6
        mov    a2, a2, lsr #11
        add    a3, a3, a2, lsl #16
        add    a2, v3, v7
        mov    a4, a2, lsr #11
        bic    a4, a4, #0x1f0000
        add    a2, v4, fp
        mov    a2, a2, lsr #11
        add    a4, a4, a2, lsl #16
        strd   a3, [a1]

        sub    a2, v4, fp
        mov    a3, a2, lsr #11
        bic    a3, a3, #0x1f0000
        sub    a2, v3, v7
        mov    a2, a2, lsr #11
        add    a3, a3, a2, lsl #16
        add    a2, v2, v6
        mov    a4, a2, lsr #11
        bic    a4, a4, #0x1f0000
        sub    a2, v1, v5
        mov    a2, a2, lsr #11
        add    a4, a4, a2, lsl #16
        strd   a3, [a1, #8]

        ldr    pc, [sp], #4

row_dc_only:
        orr    a3, a3, a3, lsl #16   /* replicate row[0] in both halfwords */
        bic    a3, a3, #0xe000       /* clear bits 13-15 so the lsl cannot spill into the top half */
        mov    a3, a3, lsl #3        /* each halfword = (row[0] << 3) & 0xffff */
        mov    a4, a3
        strd   a3, [a1]
        strd   a3, [a1, #8]

        ldr    pc, [sp], #4
endfunc
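/*
 * The column pass.  idct_col transforms two adjacent columns per
 * invocation: the bottom and top halfwords of each word loaded with a
 * 16-byte (one row) stride.  The even half (rows 0, 2, 4, 6) is
 * computed first and spilled to the stack with stmfd; the odd half
 * (rows 1, 3, 5, 7) then reuses the same registers, and the callers
 * below pop the even terms pairwise and form the sum/difference
 * outputs, descaled by COL_SHIFT.
 */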
        .macro idct_col
        ldr    a4, [a1]              /* a4 = col[1:0] */
        mov    ip, #16384
        sub    ip, ip, #1            /* ip = W4 */
#if 0
        mov    v1, #(1<<(COL_SHIFT-1))
        smlabt v2, ip, a4, v1        /* v2 = W4*col[1] + (1<<(COL_SHIFT-1)) */
        smlabb v1, ip, a4, v1        /* v1 = W4*col[0] + (1<<(COL_SHIFT-1)) */
        ldr    a4, [a1, #(16*4)]
#else
        mov    v1, #((1<<(COL_SHIFT-1))/W4) /* this matches the C version */
        add    v2, v1, a4, asr #16
        rsb    v2, v2, v2, lsl #14
        mov    a4, a4, lsl #16
        add    v1, v1, a4, asr #16
        ldr    a4, [a1, #(16*4)]
        rsb    v1, v1, v1, lsl #14
#endif

        smulbb lr, ip, a4
        smulbt a3, ip, a4
        sub    v3, v1, lr
        sub    v5, v1, lr
        add    v7, v1, lr
        add    v1, v1, lr
        sub    v4, v2, a3
        sub    v6, v2, a3
        add    fp, v2, a3
        ldr    ip, w26
        ldr    a4, [a1, #(16*2)]
        add    v2, v2, a3

        smulbb lr, ip, a4
        smultb a3, ip, a4
        add    v1, v1, lr
        sub    v7, v7, lr
        add    v3, v3, a3
        sub    v5, v5, a3
        smulbt lr, ip, a4
        smultt a3, ip, a4
        add    v2, v2, lr
        sub    fp, fp, lr
        add    v4, v4, a3
        ldr    a4, [a1, #(16*6)]
        sub    v6, v6, a3

        smultb lr, ip, a4
        smulbb a3, ip, a4
        add    v1, v1, lr
        sub    v7, v7, lr
        sub    v3, v3, a3
        add    v5, v5, a3
        smultt lr, ip, a4
        smulbt a3, ip, a4
        add    v2, v2, lr
        sub    fp, fp, lr
        sub    v4, v4, a3
        add    v6, v6, a3

        stmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, fp}

        ldr    ip, w13
        ldr    a4, [a1, #(16*1)]
        ldr    lr, w57
        smulbb v1, ip, a4
        smultb v3, ip, a4
        smulbb v5, lr, a4
        smultb v7, lr, a4
        smulbt v2, ip, a4
        smultt v4, ip, a4
        smulbt v6, lr, a4
        smultt fp, lr, a4
        rsb    v4, v4, #0
        ldr    a4, [a1, #(16*3)]
        rsb    v3, v3, #0

        smlatb v1, ip, a4, v1
        smlatb v3, lr, a4, v3
        smulbb a3, ip, a4
        smulbb a2, lr, a4
        sub    v5, v5, a3
        sub    v7, v7, a2
        smlatt v2, ip, a4, v2
        smlatt v4, lr, a4, v4
        smulbt a3, ip, a4
        smulbt a2, lr, a4
        sub    v6, v6, a3
        ldr    a4, [a1, #(16*5)]
        sub    fp, fp, a2

        smlabb v1, lr, a4, v1
        smlabb v3, ip, a4, v3
        smlatb v5, lr, a4, v5
        smlatb v7, ip, a4, v7
        smlabt v2, lr, a4, v2
        smlabt v4, ip, a4, v4
        smlatt v6, lr, a4, v6
        ldr    a3, [a1, #(16*7)]
        smlatt fp, ip, a4, fp

        smlatb v1, lr, a3, v1
        smlabb v3, lr, a3, v3
        smlatb v5, ip, a3, v5
        smulbb a4, ip, a3
        smlatt v2, lr, a3, v2
        sub    v7, v7, a4
        smlabt v4, lr, a3, v4
        smulbt a4, ip, a3
        smlatt v6, ip, a3, v6
        sub    fp, fp, a4
        .endm

function idct_col_armv5te
        str    lr, [sp, #-4]!

        idct_col

        ldmfd  sp!, {a3, a4}
        adds   a2, a3, v1
        mov    a2, a2, lsr #20
        it     mi
        orrmi  a2, a2, #0xf000
        add    ip, a4, v2
        mov    ip, ip, asr #20
        orr    a2, a2, ip, lsl #16
        str    a2, [a1]
        subs   a3, a3, v1
        mov    a2, a3, lsr #20
        it     mi
        orrmi  a2, a2, #0xf000
        sub    a4, a4, v2
        mov    a4, a4, asr #20
        orr    a2, a2, a4, lsl #16
        ldmfd  sp!, {a3, a4}
        str    a2, [a1, #(16*7)]

        subs   a2, a3, v3
        mov    a2, a2, lsr #20
        it     mi
        orrmi  a2, a2, #0xf000
        sub    ip, a4, v4
        mov    ip, ip, asr #20
        orr    a2, a2, ip, lsl #16
        str    a2, [a1, #(16*1)]
        adds   a3, a3, v3
        mov    a2, a3, lsr #20
        it     mi
        orrmi  a2, a2, #0xf000
        add    a4, a4, v4
        mov    a4, a4, asr #20
        orr    a2, a2, a4, lsl #16
        ldmfd  sp!, {a3, a4}
        str    a2, [a1, #(16*6)]

        adds   a2, a3, v5
        mov    a2, a2, lsr #20
        it     mi
        orrmi  a2, a2, #0xf000
        add    ip, a4, v6
        mov    ip, ip, asr #20
        orr    a2, a2, ip, lsl #16
        str    a2, [a1, #(16*2)]
        subs   a3, a3, v5
        mov    a2, a3, lsr #20
        it     mi
        orrmi  a2, a2, #0xf000
        sub    a4, a4, v6
        mov    a4, a4, asr #20
        orr    a2, a2, a4, lsl #16
        ldmfd  sp!, {a3, a4}
        str    a2, [a1, #(16*5)]

        adds   a2, a3, v7
        mov    a2, a2, lsr #20
        it     mi
        orrmi  a2, a2, #0xf000
        add    ip, a4, fp
        mov    ip, ip, asr #20
        orr    a2, a2, ip, lsl #16
        str    a2, [a1, #(16*3)]
        subs   a3, a3, v7
        mov    a2, a3, lsr #20
        it     mi
        orrmi  a2, a2, #0xf000
        sub    a4, a4, fp
        mov    a4, a4, asr #20
        orr    a2, a2, a4, lsl #16
        str    a2, [a1, #(16*4)]

        ldr    pc, [sp], #4
endfunc

.macro  clip   dst, src:vararg
        movs   \dst, \src
        it     mi
        movmi  \dst, #0
        cmp    \dst, #255
        it     gt
        movgt  \dst, #255
.endm

.macro  aclip  dst, src:vararg
        adds   \dst, \src
        it     mi
        movmi  \dst, #0
        cmp    \dst, #255
        it     gt
        movgt  \dst, #255
.endm

function idct_col_put_armv5te
        str    lr, [sp, #-4]!

        idct_col

        ldmfd  sp!, {a3, a4}
        ldr    lr, [sp, #32]
        add    a2, a3, v1
        clip   a2, a2, asr #20
        add    ip, a4, v2
        clip   ip, ip, asr #20
        orr    a2, a2, ip, lsl #8
        sub    a3, a3, v1
        clip   a3, a3, asr #20
        sub    a4, a4, v2
        clip   a4, a4, asr #20
        ldr    v1, [sp, #28]
        strh   a2, [v1]
        add    a2, v1, #2
        str    a2, [sp, #28]
        orr    a2, a3, a4, lsl #8
        rsb    v2, lr, lr, lsl #3
        ldmfd  sp!, {a3, a4}
        strh_pre a2, v2, v1

        sub    a2, a3, v3
        clip   a2, a2, asr #20
        sub    ip, a4, v4
        clip   ip, ip, asr #20
        orr    a2, a2, ip, lsl #8
        strh_pre a2, v1, lr
        add    a3, a3, v3
        clip   a2, a3, asr #20
        add    a4, a4, v4
        clip   a4, a4, asr #20
        orr    a2, a2, a4, lsl #8
        ldmfd  sp!, {a3, a4}
        strh_dpre a2, v2, lr

        add    a2, a3, v5
        clip   a2, a2, asr #20
        add    ip, a4, v6
        clip   ip, ip, asr #20
        orr    a2, a2, ip, lsl #8
        strh_pre a2, v1, lr
        sub    a3, a3, v5
        clip   a2, a3, asr #20
        sub    a4, a4, v6
        clip   a4, a4, asr #20
        orr    a2, a2, a4, lsl #8
        ldmfd  sp!, {a3, a4}
        strh_dpre a2, v2, lr

        add    a2, a3, v7
        clip   a2, a2, asr #20
        add    ip, a4, fp
        clip   ip, ip, asr #20
        orr    a2, a2, ip, lsl #8
        strh   a2, [v1, lr]
        sub    a3, a3, v7
        clip   a2, a3, asr #20
        sub    a4, a4, fp
        clip   a4, a4, asr #20
        orr    a2, a2, a4, lsl #8
        strh_dpre a2, v2, lr

        ldr    pc, [sp], #4
endfunc
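/*
 * As idct_col_put above, but the destination pixels are read back
 * with ldrh, split into their two bytes, and the IDCT output is
 * accumulated onto them with aclip instead of overwriting them.
 */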
function idct_col_add_armv5te
        str    lr, [sp, #-4]!

        idct_col

        ldr    lr, [sp, #36]

        ldmfd  sp!, {a3, a4}
        ldrh   ip, [lr]
        add    a2, a3, v1
        sub    a3, a3, v1
        and    v1, ip, #255
        aclip  a2, v1, a2, asr #20
        add    v1, a4, v2
        mov    v1, v1, asr #20
        aclip  v1, v1, ip, lsr #8
        orr    a2, a2, v1, lsl #8
        ldr    v1, [sp, #32]
        sub    a4, a4, v2
        rsb    v2, v1, v1, lsl #3
        ldrh_pre ip, v2, lr
        strh   a2, [lr]
        and    a2, ip, #255
        aclip  a3, a2, a3, asr #20
        mov    a4, a4, asr #20
        aclip  a4, a4, ip, lsr #8
        add    a2, lr, #2
        str    a2, [sp, #28]
        orr    a2, a3, a4, lsl #8
        strh   a2, [v2]

        ldmfd  sp!, {a3, a4}
        ldrh_pre ip, lr, v1
        sub    a2, a3, v3
        add    a3, a3, v3
        and    v3, ip, #255
        aclip  a2, v3, a2, asr #20
        sub    v3, a4, v4
        mov    v3, v3, asr #20
        aclip  v3, v3, ip, lsr #8
        orr    a2, a2, v3, lsl #8
        add    a4, a4, v4
        ldrh_dpre ip, v2, v1
        strh   a2, [lr]
        and    a2, ip, #255
        aclip  a3, a2, a3, asr #20
        mov    a4, a4, asr #20
        aclip  a4, a4, ip, lsr #8
        orr    a2, a3, a4, lsl #8
        strh   a2, [v2]

        ldmfd  sp!, {a3, a4}
        ldrh_pre ip, lr, v1
        add    a2, a3, v5
        sub    a3, a3, v5
        and    v3, ip, #255
        aclip  a2, v3, a2, asr #20
        add    v3, a4, v6
        mov    v3, v3, asr #20
        aclip  v3, v3, ip, lsr #8
        orr    a2, a2, v3, lsl #8
        sub    a4, a4, v6
        ldrh_dpre ip, v2, v1
        strh   a2, [lr]
        and    a2, ip, #255
        aclip  a3, a2, a3, asr #20
        mov    a4, a4, asr #20
        aclip  a4, a4, ip, lsr #8
        orr    a2, a3, a4, lsl #8
        strh   a2, [v2]

        ldmfd  sp!, {a3, a4}
        ldrh_pre ip, lr, v1
        add    a2, a3, v7
        sub    a3, a3, v7
        and    v3, ip, #255
        aclip  a2, v3, a2, asr #20
        add    v3, a4, fp
        mov    v3, v3, asr #20
        aclip  v3, v3, ip, lsr #8
        orr    a2, a2, v3, lsl #8
        sub    a4, a4, fp
        ldrh_dpre ip, v2, v1
        strh   a2, [lr]
        and    a2, ip, #255
        aclip  a3, a2, a3, asr #20
        mov    a4, a4, asr #20
        aclip  a4, a4, ip, lsr #8
        orr    a2, a3, a4, lsl #8
        strh   a2, [v2]

        ldr    pc, [sp], #4
endfunc

function ff_simple_idct_armv5te, export=1
        stmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, fp, lr}

        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te

        sub    a1, a1, #(16*7)

        bl     idct_col_armv5te
        add    a1, a1, #4
        bl     idct_col_armv5te
        add    a1, a1, #4
        bl     idct_col_armv5te
        add    a1, a1, #4
        bl     idct_col_armv5te

        ldmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
endfunc
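/*
 * The _add and _put entry points save dest (a1) and line_size (a2) at
 * the bottom of the stack frame before repointing a1 at the
 * coefficient block; idct_col_add and idct_col_put read them back
 * from fixed sp offsets and advance the saved dest by two pixels per
 * call, so neither the pointer nor the stride appears in the call
 * sequences below.
 */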
function ff_simple_idct_add_armv5te, export=1
        stmfd  sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr}

        mov    a1, a3

        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te

        sub    a1, a1, #(16*7)

        bl     idct_col_add_armv5te
        add    a1, a1, #4
        bl     idct_col_add_armv5te
        add    a1, a1, #4
        bl     idct_col_add_armv5te
        add    a1, a1, #4
        bl     idct_col_add_armv5te

        add    sp, sp, #8
        ldmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
endfunc

function ff_simple_idct_put_armv5te, export=1
        stmfd  sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr}

        mov    a1, a3

        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te
        add    a1, a1, #16
        bl     idct_row_armv5te

        sub    a1, a1, #(16*7)

        bl     idct_col_put_armv5te
        add    a1, a1, #4
        bl     idct_col_put_armv5te
        add    a1, a1, #4
        bl     idct_col_put_armv5te
        add    a1, a1, #4
        bl     idct_col_put_armv5te

        add    sp, sp, #8
        ldmfd  sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
endfunc