/*
 * Blackfin Pixel Operations
 * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "config_bfin.h"

/*
 * put_pixels_clamped: clamp an 8x8 block of 16-bit coefficients to 0..255
 * and store them as bytes.
 *
 * R0 = block, R1 = dest, R2 = line_size.
 * Each loop pass handles one row: two pairs of coefficient words are
 * clamped with vector MAX against 0 and vector MIN against 0x00ff,
 * packed into one word each and stored.  The first store advances dest
 * by 4, the second by line_size - 4 (M1), so dest moves one line per pass.
 * The loop runs 8 times (P0 = 8).
 */
DEFUN(put_pixels_clamped,mL1,
        (DCTELEM *block, uint8_t *dest, int line_size)):
        [--SP] = (R7:4);
        R4 = 0;                 // lower clamp bound
        R5.l = 0x00ff;          // upper clamp bound, both halves
        R5.h = 0x00ff;
        I0 = R0;                // block
        I1 = R1;                // dest
        R2 += -4;               // line_size - 4
        M1 = R2;
        P0 = 8;
        R0 = [I0++];
        R1 = [I0++];
        R2 = MAX(R0, R4) (V);
        LSETUP (ppc$0,ppc$1) LC0=P0;
ppc$0:  R2 = MIN(R2, R5) (V);
        R3 = MAX(R1, R4) (V);
        R3 = MIN(R3, R5) (V)      || R0 = [I0++];
        R6 = BYTEPACK (R2,R3)     || R1 = [I0++];
        R2 = MAX(R0, R4) (V)      || [I1++] = R6;
        R2 = MIN(R2, R5) (V);
        R3 = MAX(R1, R4) (V);
        R3 = MIN(R3, R5) (V)      || R0 = [I0++];
        R6 = BYTEPACK (R2,R3)     || R1 = [I0++];
ppc$1:  R2 = MAX(R0, R4) (V)      || [I1++M1] = R6;

        (R7:4) = [SP++];
        RTS;
DEFUN_END(put_pixels_clamped)

/*
 * add_pixels_clamped: combine an 8x8 block of 16-bit coefficients with the
 * bytes already in dest using BYTEOP3P and write the result back to dest.
 *
 * R0 = block, R1 = dest, R2 = line_size.
 * I1 reads dest, I2 writes dest, I3 walks the coefficient block.
 * M0 = line_size - 4 moves both dest pointers to the next line after the
 * second word of a row.  The loop runs 8 times (P0 = 8) on loop counter LC1.
 */
DEFUN(add_pixels_clamped,mL1,
        (DCTELEM *block, uint8_t *dest, int line_size)):
        [-- SP] = (R7:4);
        R4 = 0;
        I0 = 0;                 // NOTE(review): presumably the byte-alignment selector for BYTEOP3P - confirm
        R2 += -4;               // line_size - 4
        M0 = R2;
        I1 = R1;                // dest (read pointer)
        I3 = R0;                // block
        I2 = R1;                // dest (write pointer)
        P0 = 8;
        M3 = 2;
        R0 = [I3++]  || R2 = [I1];
        R2 = R2 << 8                      || R0.H = W[I3--]  || R3 = [I1++];
        R3 = R3 >> 8                      || R1.L = W[I3]    || I3 += 4;
        R6 = BYTEOP3P(R1:0, R3:2) (LO)    || R1.H = W[I3++]  || R2 = [I1];

        LSETUP(apc$2,apc$3) LC1 = P0;
apc$2:  R7 = BYTEOP3P(R1:0, R3:2) (HI, R) || R0 = [I3++]     || R3 = [I1++M0];
        R2 = R2 << 8                      || R0.H = W[I3--];
        R3 = R3 >> 8                      || R1.L = W[I3]    || I3 += 4;
        R6 = R6 + R7 (S)                  || R1.H = W[I3];
        R6 = BYTEOP3P(R1:0, R3:2) (LO)    || I3+=M3          || [I2++]=R6;
        R7 = BYTEOP3P(R1:0, R3:2) (HI, R) || R0 = [I3++]     || R2 = [I1];
        R2 = R2 << 8                      || R0.H = W[I3--]  || R3 = [I1++];
        R3 = R3 >> 8                      || R1.L = W[I3]    || I3 += 4;
        R6 = R6 + R7 (S)                  || R1.H = W[I3++];
apc$3:  R6 = BYTEOP3P(R1:0, R3:2) (LO)    || [I2++M0] = R6   || R2 = [I1];

        (R7:4) = [SP++];
        RTS;
DEFUN_END(add_pixels_clamped)


/*
  motion compensation
  primitives

     * Halfpel motion compensation with rounding (a+b+1)>>1.
     * This is an array[4][4] of motion compensation functions for 4
     * horizontal blocksizes (8,16) and the 4 halfpel positions
     * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
     * @param block destination where the result is stored
     * @param pixels source
     * @param line_size number of bytes in a horizontal line of block
     * @param h height

*/

/*
 * put_pixels8uc: combine two 8-byte-wide sources with BYTEOP1P and store
 * the result, h rows.
 *
 * R0 = block (dest), R1 = s0, R2 = s1; dest_size, line_size and h are read
 * from the stack at [sp+12], [sp+16], [sp+20] before anything is pushed.
 * M3 = dest_size - 4 steps dest to the next row, M0 = line_size - 8 steps
 * both sources to the next row.
 */
DEFUN(put_pixels8uc,mL1,
        (uint8_t *block, const uint8_t *s0, const uint8_t *s1,
                 int dest_size, int line_size, int h)):
        i3=r0;        // dest
        i0=r1;        // src0
        i1=r2;        // src1
        r0=[sp+12];   // dest_size
        r2=[sp+16];   // line_size
        p0=[sp+20];   // h
        [--sp] = (r7:6);
        r0+=-4;
        m3=r0;        // dest_size - 4
        r2+=-8;
        m0=r2;        // line_size - 8
        LSETUP(pp8$0,pp8$1) LC0=P0;
        DISALGNEXCPT                || R0 = [I0++]   || R2  =[I1++];

pp8$0:  DISALGNEXCPT                || R1 = [I0++]   || R3  =[I1++];
        R6 = BYTEOP1P(R1:0,R3:2)    || R0 = [I0++M0] || R2  =[I1++M0];
        R7 = BYTEOP1P(R1:0,R3:2)(R) || R0 = [I0++]   || [I3++] = R6 ;
pp8$1:  DISALGNEXCPT                || R2 = [I1++]   || [I3++M3] = R7;

        (r7:6) = [sp++];
        RTS;
DEFUN_END(put_pixels8uc)

/*
 * put_pixels16uc: 16-byte-wide variant of put_pixels8uc.
 *
 * Uses a frame (link 0), so the stacked arguments dest_size, line_size and
 * h are read at [fp+20], [fp+24], [fp+28].
 * M3 = dest_size - 12 and M0 = line_size - 16 are the end-of-row steps.
 */
DEFUN(put_pixels16uc,mL1,
        (uint8_t *block, const uint8_t *s0, const uint8_t *s1,
                 int dest_size, int line_size, int h)):
        link 0;
        [--sp] = (r7:6);
        i3=r0;        // dest
        i0=r1;        // src0
        i1=r2;        // src1
        r0=[fp+20];   // dest_size
        r2=[fp+24];   // line_size
        p0=[fp+28];   // h


        r0+=-12;
        m3=r0;        // dest_size - 12
        r2+=-16;
        m0=r2;        // line_size - 16

        LSETUP(pp16$0,pp16$1) LC0=P0;
        DISALGNEXCPT                || R0 = [I0++]   || R2  =[I1++];

pp16$0: DISALGNEXCPT                || R1 = [I0++]   || R3  =[I1++];
        R6 = BYTEOP1P(R1:0,R3:2)    || R0 = [I0++]   || R2  =[I1++];
        R7 = BYTEOP1P(R1:0,R3:2)(R) || R1 = [I0++]   || R3  =[I1++];
        [I3++] = R6;
        R6 = BYTEOP1P(R1:0,R3:2)    || R0 = [I0++M0] || R2  =[I1++M0];
        R7 = BYTEOP1P(R1:0,R3:2)(R) || R0 = [I0++]   || [I3++] = R7 ;
        [I3++] = R6;
pp16$1: DISALGNEXCPT                || R2 = [I1++]   || [I3++M3] = R7;

        (r7:6) = [sp++];
        unlink;
        RTS;
DEFUN_END(put_pixels16uc)






/*
 * put_pixels8uc_nornd: same structure as put_pixels8uc but BYTEOP1P is
 * issued with the (T) option, and there is no separate dest_size:
 * line_size ([sp+12]) is used for both the destination step
 * (M3 = line_size - 4) and the source step (M0 = line_size - 8).
 * h is read from [sp+16].
 */
DEFUN(put_pixels8uc_nornd,mL1,
        (uint8_t *block, const uint8_t *s0, const uint8_t *s1,
                 int line_size, int h)):
        i3=r0;        // dest
        i0=r1;        // src0
        i1=r2;        // src1
        r2=[sp+12];   // line_size
        p0=[sp+16];   // h
        [--sp] = (r7:6);
        r2+=-4;
        m3=r2;        // line_size - 4
        r2+=-4;
        m0=r2;        // line_size - 8
        LSETUP(pp8$2,pp8$3) LC0=P0;
        DISALGNEXCPT                || R0 = [I0++]   || R2  =[I1++];

pp8$2:  DISALGNEXCPT                || R1 = [I0++]   || R3  =[I1++];
        R6 = BYTEOP1P(R1:0,R3:2)(T)  || R0 = [I0++M0] || R2  =[I1++M0];
        R7 = BYTEOP1P(R1:0,R3:2)(T,R) || R0 = [I0++]   || [I3++] = R6 ;
pp8$3:  DISALGNEXCPT                || R2 = [I1++]   || [I3++M3] = R7;

        (r7:6) = [sp++];
        RTS;
DEFUN_END(put_pixels8uc_nornd)

/*
 * put_pixels16uc_nornd: 16-byte-wide variant of put_pixels8uc_nornd.
 * M3 = line_size - 12 (destination step), M0 = line_size - 16 (source step).
 */
DEFUN(put_pixels16uc_nornd,mL1,
        (uint8_t *block, const uint8_t *s0, const uint8_t *s1,
                 int line_size, int h)):
        i3=r0;        // dest
        i0=r1;        // src0
        i1=r2;        // src1
        r2=[sp+12];   // line_size
        p0=[sp+16];   // h

        [--sp] = (r7:6);
        r2+=-12;
        m3=r2;        // line_size - 12
        r2+=-4;
        m0=r2;        // line_size - 16

        LSETUP(pp16$2,pp16$3) LC0=P0;
        DISALGNEXCPT                || R0 = [I0++]   || R2  =[I1++];

pp16$2:
        DISALGNEXCPT                || R1 = [I0++]   || R3  =[I1++];
        R6 = BYTEOP1P(R1:0,R3:2)(T)    || R0 = [I0++]   || R2  =[I1++];
        R7 = BYTEOP1P(R1:0,R3:2)(T,R) || R1 = [I0++]   || R3  =[I1++];
        [I3++] = R6;

        R6 = BYTEOP1P(R1:0,R3:2)(T)    || R0 = [I0++M0] || R2  =[I1++M0];
        R7 = BYTEOP1P(R1:0,R3:2)(T,R) || R0 = [I0++]   || [I3++] = R7 ;
        [I3++] = R6;
pp16$3: DISALGNEXCPT                || R2 = [I1++]   || [I3++M3] = R7;

        (r7:6) = [sp++];

        RTS;
DEFUN_END(put_pixels16uc_nornd)

/*
 * z_put_pixels16_xy2: 16-byte-wide two-row BYTEOP2P combine, done in two
 * passes over the same h rows.
 *
 * R0 = block (dest), R1 = s0, R2 = dest_size; line_size and h are read at
 * [fp+20] and [fp+24].  I0 reads the current source row, I1 the row
 * line_size bytes further on.
 *
 * Pass 1 (LS$16E..LE$16E) stores the BYTEOP2P (RNDL) results to dest.
 * The start pointers are then restored from B0/B1/B3, both source
 * pointers are advanced by one byte, and pass 2 (LS$16O..LE$16O) adds its
 * BYTEOP2P (RNDH) results (+|+) to the words pass 1 stored.
 */
DEFUN(z_put_pixels16_xy2,mL1,
        (uint8_t *block, const uint8_t *s0,
                 int dest_size, int line_size, int h)):
        link 0;
        [--sp] = (r7:4);
        i3=r0;        // dest
        i0=r1;        // src0--> pixels
        i1=r1;        // src1--> pixels + line_size
        r2+=-12;
        m2=r2;        // m2 = dest_size - 12
        r2=[fp+20];
        m3=r2;        // line_size
        p0=[fp+24];   // h
        r2+=-16;
        i1+=m3;       /* src1 + line_size */
        m0=r2;        /* line_size - 16 */

        B0 = I0;      // remember start pointers for the second pass
        B1 = I1;
        B3 = I3;

        DISALGNEXCPT                       || R0 = [I0++] || R2  =[I1++];

        LSETUP(LS$16E,LE$16E) LC0=P0;
LS$16E: DISALGNEXCPT                       || R1 = [I0++] || R3  =[I1++];
        R4 = BYTEOP2P (R3:2,R1:0) (RNDL)   || R0 = [I0++] || R2  =[I1++];
        R5 = BYTEOP2P (R3:2,R1:0) (RNDL,R) || R1 = [I0++] || [I3++] = R4 ;
        DISALGNEXCPT                       || R3 = [I1++] || [I3++] = R5;
        R4 = BYTEOP2P (R3:2,R1:0) (RNDL)   || R0 = [I0++M0]|| R2  = [I1++M0];
        R5 = BYTEOP2P (R3:2,R1:0) (RNDL,R) || R0 = [I0++] || [I3++] = R4 ;
LE$16E: DISALGNEXCPT                       || R2 = [I1++] || [I3++M2] = R5;

        M1 = 1;
        I3 = B3;
        I1 = B1;
        I0 = B0;

        I0 += M1;     // second pass starts one byte further into each source row
        I1 += M1;

        DISALGNEXCPT                       || R0 = [I0++] || R2  =[I1++];
        LSETUP(LS$16O,LE$16O) LC0=P0;
LS$16O: DISALGNEXCPT                       || R1 = [I0++] || R3  =[I1++];
        R4 = BYTEOP2P (R3:2,R1:0) (RNDH)   || R0 = [I0++] || R2  =[I1++];
        R5 = BYTEOP2P (R3:2,R1:0) (RNDH,R) || R1 = [I0++] || R6  =[I3++];
        R4 = R4 +|+ R6                       || R7 = [I3--];
        R5 = R5 +|+ R7                       || [I3++] = R4;
        DISALGNEXCPT                       || R3  =[I1++] || [I3++] = R5;
        R4 = BYTEOP2P (R3:2,R1:0) (RNDH)   || R0 = [I0++M0]|| R2  = [I1++M0];
        R5 = BYTEOP2P (R3:2,R1:0) (RNDH,R) || R0 = [I0++] || R6 = [I3++];
        R4 = R4 +|+ R6                       || R7 = [I3--];
        R5 = R5 +|+ R7                       || [I3++] = R4;
LE$16O: DISALGNEXCPT                       || R2 = [I1++] || [I3++M2] = R5;

        (r7:4) = [sp++];
        unlink;
        rts;
DEFUN_END(z_put_pixels16_xy2)

/*
 * put_pixels16_xy2_nornd: same two-pass structure as z_put_pixels16_xy2,
 * using the BYTEOP2P TL/TH options instead of RNDL/RNDH.  There is no
 * separate dest_size: R2 = line_size gives M3 = line_size,
 * M2 = line_size - 12 (destination step) and M0 = line_size - 16
 * (source step).  h is read at [fp+20].
 */
DEFUN(put_pixels16_xy2_nornd,mL1,
        (uint8_t *block, const uint8_t *s0,
                 int line_size, int h)):
        link 0;
        [--sp] = (r7:4);
        i3=r0;        // dest
        i0=r1;        // src0--> pixels
        i1=r1;        // src1--> pixels + line_size
        m3=r2;        // line_size
        r2+=-12;
        m2=r2;        // line_size - 12
        r2+=-4;
        i1+=m3;       /* src1 + line_size */
        m0=r2;        /* line_size - 16 */
        p0=[fp+20];   // h

        B0=I0;        // remember start pointers for the second pass
        B1=I1;
        B3=I3;

        DISALGNEXCPT                       || R0 = [I0++] || R2  =[I1++];

        LSETUP(LS$16ET,LE$16ET) LC0=P0;
LS$16ET:DISALGNEXCPT                       || R1 = [I0++] || R3  =[I1++];
        R4 = BYTEOP2P (R3:2,R1:0) (TL)     || R0 = [I0++] || R2  =[I1++];
        R5 = BYTEOP2P (R3:2,R1:0) (TL,R)   || R1 = [I0++] || [I3++] = R4 ;
        DISALGNEXCPT                       || R3 = [I1++] || [I3++] = R5;
        R4 = BYTEOP2P (R3:2,R1:0) (TL)     || R0 = [I0++M0]|| R2  = [I1++M0];
        R5 = BYTEOP2P (R3:2,R1:0) (TL,R)   || R0 = [I0++] || [I3++] = R4 ;
LE$16ET:DISALGNEXCPT                       || R2 = [I1++] || [I3++M2] = R5;

        M1 = 1;
        I3=B3;
        I1=B1;
        I0=B0;

        I0 += M1;     // second pass starts one byte further into each source row
        I1 += M1;

        DISALGNEXCPT                       || R0 = [I0++] || R2  =[I1++];
        LSETUP(LS$16OT,LE$16OT) LC0=P0;
LS$16OT:DISALGNEXCPT                       || R1 = [I0++] || R3  =[I1++];
        R4 = BYTEOP2P (R3:2,R1:0) (TH)     || R0 = [I0++] || R2  =[I1++];
        R5 = BYTEOP2P (R3:2,R1:0) (TH,R)   || R1 = [I0++] || R6  =[I3++];
        R4 = R4 +|+ R6                                    || R7 = [I3--];
        R5 = R5 +|+ R7                                    || [I3++] = R4;
        DISALGNEXCPT                       || R3  =[I1++] || [I3++] = R5;
        R4 = BYTEOP2P (R3:2,R1:0) (TH)     || R0 = [I0++M0]|| R2  = [I1++M0];
        R5 = BYTEOP2P (R3:2,R1:0) (TH,R)   || R0 = [I0++] || R6 = [I3++];
        R4 = R4 +|+ R6                                    || R7 = [I3--];
        R5 = R5 +|+ R7                                    || [I3++] = R4;
LE$16OT:DISALGNEXCPT                       || R2 = [I1++] || [I3++M2] = R5;

        (r7:4) = [sp++];
        unlink;
        rts;
DEFUN_END(put_pixels16_xy2_nornd)

/*
 * z_put_pixels8_xy2: 8-byte-wide variant of z_put_pixels16_xy2.
 * M2 = dest_size - 4 (destination step), M0 = line_size - 8 (source step).
 * line_size and h are read at [fp+20] and [fp+24].
 */
DEFUN(z_put_pixels8_xy2,mL1,
        (uint8_t *block, const uint8_t *s0,
                 int dest_size, int line_size, int h)):
        link 0;
        [--sp] = (r7:4);
        i3=r0;        // dest
        i0=r1;        // src0--> pixels
        i1=r1;        // src1--> pixels + line_size
        r2+=-4;
        m2=r2;        // m2 = dest_size - 4
        r2=[fp+20];
        m3=r2;        // line_size
        p0=[fp+24];   // h
        r2+=-8;
        i1+=m3;       /* src1 + line_size */
        m0=r2;        /* line_size - 8 */

        b0 = I0;      // remember start pointers for the second pass
        b1 = I1;
        b3 = I3;

        LSETUP(LS$8E,LE$8E) LC0=P0;
        DISALGNEXCPT                       || R0 = [I0++]   || R2  =[I1++];
LS$8E:  DISALGNEXCPT                       || R1 = [I0++]   || R3  =[I1++];
        R4 = BYTEOP2P (R3:2,R1:0) (RNDL)   || R0 = [I0++M0] || R2  =[I1++M0];
        R5 = BYTEOP2P (R3:2,R1:0) (RNDL,R) || R0 = [I0++]   || [I3++] = R4 ;
LE$8E:  DISALGNEXCPT                       || R2 = [I1++]   || [I3++M2] = R5;

        M1 = 1;
        I3 = b3;
        I1 = b1;
        I0 = b0;

        I0 += M1;     // second pass starts one byte further into each source row
        I1 += M1;

        LSETUP(LS$8O,LE$8O) LC0=P0;
        DISALGNEXCPT                       || R0 = [I0++]   || R2  =[I1++];
LS$8O:  DISALGNEXCPT                       || R1 = [I0++]   || R3  =[I1++];
        R4 = BYTEOP2P (R3:2,R1:0) (RNDH)   || R0 = [I0++M0] || R2  =[I1++M0];
        R5 = BYTEOP2P (R3:2,R1:0) (RNDH,R) || R0 = [I0++]   || R6  =[I3++];
        R4 = R4 +|+ R6                                      || R7 = [I3--];
        R5 = R5 +|+ R7                                      || [I3++] = R4;
LE$8O:  DISALGNEXCPT                       || R2  =[I1++]   || [I3++M2] = R5;

        (r7:4) = [sp++];
        unlink;
        rts;
DEFUN_END(z_put_pixels8_xy2)

/*
 * put_pixels8_xy2_nornd: 8-byte-wide variant of put_pixels16_xy2_nornd.
 * M2 = line_size - 4 (destination step), M0 = line_size - 8 (source step).
 * h is read at [fp+20].
 */
DEFUN(put_pixels8_xy2_nornd,mL1,
        (uint8_t *block, const uint8_t *s0, int line_size, int h)):
        link 0;
        [--sp] = (r7:4);
        i3=r0;        // dest
        i0=r1;        // src0--> pixels
        i1=r1;        // src1--> pixels + line_size
        m3=r2;        // line_size
        r2+=-4;
        m2=r2;        // line_size - 4
        r2+=-4;
        i1+=m3;       /* src1 + line_size */
        m0=r2;        /* line_size - 8 */
        p0=[fp+20];   // h


        b0 = I0;      // remember start pointers for the second pass
        b1 = I1;
        b3 = I3;

        LSETUP(LS$8ET,LE$8ET) LC0=P0;
        DISALGNEXCPT                       || R0 = [I0++]   || R2 = [I1++];

LS$8ET: DISALGNEXCPT                       || R1 = [I0++]   || R3 = [I1++];
        R4 = BYTEOP2P (R3:2,R1:0) (TL)     || R0 = [I0++M0] || R2 = [I1++M0];
        R5 = BYTEOP2P (R3:2,R1:0) (TL,R)   || R0 = [I0++]   || [I3++] = R4 ;
LE$8ET: DISALGNEXCPT                       || R2 = [I1++]   || [I3++M2] = R5;

        M1 = 1;
        I3 = b3;
        I1 = b1;
        I0 = b0;

        I0 += M1;     // second pass starts one byte further into each source row
        I1 += M1;

        LSETUP(LS$8OT,LE$8OT) LC0=P0;
        DISALGNEXCPT                       || R0 = [I0++]   || R2 = [I1++];

LS$8OT: DISALGNEXCPT                       || R1 = [I0++]   || R3 = [I1++];
        R4 = BYTEOP2P (R3:2,R1:0) (TH)     || R0 = [I0++M0] || R2 = [I1++M0];
        R5 = BYTEOP2P (R3:2,R1:0) (TH,R)   || R0 = [I0++]   || R6 = [I3++];
        R4 = R4 +|+ R6                                      || R7 = [I3--];
        R5 = R5 +|+ R7                                      || [I3++] = R4;
LE$8OT: DISALGNEXCPT                       || R2  =[I1++]   || [I3++M2] = R5;

        (r7:4) = [sp++];
        unlink;
        rts;
DEFUN_END(put_pixels8_xy2_nornd)

/*
 * diff_pixels: for 8 rows of 8 bytes, run BYTEOP16M on s1 and s2 and store
 * the four result words per row consecutively to block.
 *
 * R0 = block, R1 = s1, R2 = s2; stride is read at [fp+20].
 * M0 = stride - 8 steps both sources to the next row.
 */
DEFUN(diff_pixels,mL1,
       (DCTELEM *block, uint8_t *s1, uint8_t *s2, int stride)):
        link 0;
        [--sp] = (r7:4);
        p0=8;
        i3=r0;        // block
        i0=r1;        // s1
        i1=r2;        // s2
        r2=[fp+20];   // stride
        r2+=-8;
        m0=r2;        // stride - 8


        LSETUP(.LS0,.LE0) LC0=P0;
        DISALGNEXCPT                       || R0 = [I0++]   || R2 = [I1++];

.LS0:   DISALGNEXCPT                       || R1 = [I0++]   || R3 = [I1++];
        (R5,R4) = BYTEOP16M (R1:0,R3:2)    || R0 = [I0++M0] || R2 = [I1++M0];
        (R7,R6) = BYTEOP16M (R1:0,R3:2) (R)|| R0 = [I0++]   || [I3++] = R4;
        DISALGNEXCPT                       || R2 = [I1++]   || [I3++] = R5;
        [i3++]=r6;
.LE0:   [i3++]=r7;

        (r7:4) = [sp++];
        unlink;
        rts;
DEFUN_END(diff_pixels)

/*
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j++) {
            sum += pix[j];
        }
        pix += line_size;
    }
*/
/*
 * pix_sum: R0 = p, R1 = stride.
 * Two rows are handled per loop pass (I0 on one row, I1 on the row stride
 * bytes further on), so the loop runs 8 times and M0 = 2*stride - 16.
 * BYTEOP16P results are accumulated as two 16-bit halves in R6, which are
 * added together into R0.L at the end; R0.H is cleared.
 */
DEFUN(pix_sum,mL1,
        (uint8_t *p, int stride)):
        link 0;
        [--sp] = (r7:4);
        p0=8;
        i0=r0;        // p
        i1=r0;
        m1=r1;        // stride
        r1=r1+r1;
        r1+=-16;      // 2*stride - 16
        m0=r1;
        i1+=m1;       // p + stride

        r6=0;

        LSETUP(LS$PS,LE$PS) LC0=P0;
        DISALGNEXCPT                       || R0 = [I0++]   || R2 = [I1++];

LS$PS:  DISALGNEXCPT                       || R1 = [I0++]   || R3 = [I1++];
        (R5,R4) = BYTEOP16P (R3:2,R1:0)    || R0 = [I0++]   || R2 = [I1++];
        r6=r6+|+r5;
        r6=r6+|+r4;
        (R5,R4) = BYTEOP16P (R3:2,R1:0) (R)|| R1 = [I0++]   || R3 = [I1++];
        r6=r6+|+r5;
        r6=r6+|+r4;
        (R5,R4) = BYTEOP16P (R3:2,R1:0)    || R0 = [I0++m0] || R2 = [I1++m0];
        r6=r6+|+r5;
        r6=r6+|+r4;
        (R5,R4) = BYTEOP16P (R3:2,R1:0) (R)|| R0 = [I0++]   || R2 = [I1++];
        r6=r6+|+r5;
LE$PS:  r6=r6+|+r4;
        r0.l=r6.l+r6.h;
        r0.h=0;

        (r7:4) = [sp++];
        unlink;
        rts;
DEFUN_END(pix_sum)


/*
 * get_pixels: for 8 rows, BYTEUNPACK 8 source bytes and store the four
 * result words per row consecutively to block.
 *
 * R0 = block, R1 = pixels, R2 = line_size.
 * M0 = line_size - 8 steps the source to the next row.
 */
DEFUN(get_pixels,mL1,
        (DCTELEM *restrict block, const uint8_t *pixels, int line_size)):
        [--sp] = (r7:4);
        i3=r0;        // dest
        i0=r1;        // src0
        p0=8;
        r2+=-8;
        m0=r2;        // line_size - 8
        LSETUP(gp8$0,gp8$1) LC0=P0;

        DISALGNEXCPT                   || R0 = [I0++];
        DISALGNEXCPT                   || R1 = [I0++];

gp8$0:  (R7,R6) = byteunpack R1:0      || R0 = [I0++M0];
        (R5,R4) = byteunpack R1:0 (R)  || R0 = [I0++]    || [I3++]=R6;
        DISALGNEXCPT                   || R1 = [I0++]    || [I3++]=R7;
        [I3++]=R4;
gp8$1:  [I3++]=R5;


        (r7:4) = [sp++];
        RTS;
DEFUN_END(get_pixels)


/* sad = sad16x16 (ubyte *mb, ubyte *refwin, srcwidth, refwinwidth, h) */
/* 91 cycles */
/*
 * R0 = blk1, R1 = blk2, R2 = dsz; line_size and h are read at [sp+20] and
 * [sp+24] (SP equals FP right after "link 0").
 * M0 = dsz - 16 steps blk1, M1 = line_size - 16 steps blk2.
 * SAA accumulates into A0/A1; the four 16-bit accumulator halves are
 * summed into R0.
 */
DEFUN(z_sad16x16,mL1,
        (uint8_t *blk1, uint8_t *blk2, int dsz, int line_size, int h)):
        link 0;
        I0 = R0;
        I1 = R1;

        A1 = A0 = 0;
        R0 = [sp+20]; // rwidth (line_size)
        P2 = [sp+24]; // height
        R3 = 16;
        R0 = R0 - R3;
        R3 = R2 - R3;
        M1 = R0;      // line_size - 16
        M0 = R3;      // dsz - 16

        DISALGNEXCPT         || R0 = [I0++]    || R2 = [I1++];
        LSETUP (s$16, e$16) LC0=P2;
s$16:   DISALGNEXCPT         || R1 = [I0++]    || R3 = [I1++];
        SAA (R1:0,R3:2)      || R0 = [I0++]    || R2 = [I1++];
        SAA (R1:0,R3:2) (R)  || R1 = [I0++]    || R3 = [I1++];
        SAA (R1:0,R3:2)      || R0 = [I0++M0]  || R2 = [I1++M1];
e$16:   SAA (R1:0,R3:2) (R)  || R0 = [I0++]    || R2 = [I1++];

        R3=A1.L+A1.H,  R2=A0.L+A0.H ;
        R0 = R2 + R3 ;
        unlink;
        RTS;
DEFUN_END(z_sad16x16)

/* sad = sad8x8 (ubyte *mb, ubyte *refwin, int srcwidth, int refwinwidth, int h) */
/* 36 cycles */
/*
 * R0 = blk1, R1 = blk2, R2 = dsz; line_size and h are read at [sp+12] and
 * [sp+16] (no frame is set up here).
 * M0 = dsz - 8 steps blk1, M1 = line_size - 8 steps blk2.
 */
DEFUN(z_sad8x8,mL1,
        (uint8_t *blk1, uint8_t *blk2, int dsz, int line_size, int h)):
        I0 = R0;
        I1 = R1;

        A1 = A0 = 0;
        r0 = [sp+12]; // rwidth (line_size)
        P2 = [sp+16]; // height
        R3 = 8;
        R0 = R0 - R3;
        R3 = R2 - R3;
        M0 = R3;      // dsz - 8
        M1 = R0;      // line_size - 8

        LSETUP (s$8, e$8) LC0=P2;
        DISALGNEXCPT         || R0 = [I0++]   || R2 = [I1++];
        DISALGNEXCPT         || R1 = [I0++]   || R3 = [I1++];
s$8:    SAA (R1:0,R3:2)      || R0 = [I0++M0] || R2 = [I1++M1];
        SAA (R1:0,R3:2) (R)  || R0 = [I0++]   || R2 = [I1++];
e$8:    DISALGNEXCPT         || R1 = [I0++]   || R3 = [I1++];

        R3=A1.L+A1.H,  R2=A0.L+A0.H ;
        R0 = R2 + R3 ;
        RTS;
DEFUN_END(z_sad8x8)

/*
 * pix_norm1: R0 = pix, R1 = line_size.
 * For 16 rows (loop counter LC1 = 16), the 16 bytes of the row are
 * unpacked to 16-bit values and each value is squared and accumulated
 * into A0/A1.  M0 = line_size - 16 steps to the next row.
 * The result is the saturating sum A0 + A1, returned in R0.
 */
DEFUN(pix_norm1,mL1,
        (uint8_t * pix, int line_size)):
        [--SP]=(R7:4,P5:3);

        // Fetch the input arguments.
        P1 = R0;  // pix
        P0 = R1;  // line_size
        P5 = 16;  // loop ctr.
        P0 -= P5;
        M0 = P0;  // M0 = line_size-16;
        // Now for the real work.
        A1 = A0 = 0;
        lsetup(_pix_norm1_blkfn_loopStart, _pix_norm1_blkfn_loopEnd) LC1 = P5;
        I0 = P1;
        DISALGNEXCPT || r0 = [i0++];

_pix_norm1_blkfn_loopStart:
        // unpack the 16 bytes of the current row and accumulate their squares
        DISALGNEXCPT || r1 = [i0++];

        (r5, r4) = byteunpack r1:0 || r0 = [i0++];
        a1 += r5.h * r5.h, a0 += r5.l * r5.l (is);
        a1 += r4.h * r4.h, a0 += r4.l * r4.l (is);
        (r5, r4) = byteunpack r1:0(r) || r1 = [i0++];
        a1 += r5.h * r5.h, a0 += r5.l * r5.l (is);
        a1 += r4.h * r4.h, a0 += r4.l * r4.l (is);
        (r5, r4) = byteunpack r1:0 || r0 = [i0++M0];
        a1 += r5.h * r5.h, a0 += r5.l * r5.l (is);
        a1 += r4.h * r4.h, a0 += r4.l * r4.l (is);
        (r5, r4) = byteunpack r1:0(r) || r0 = [i0++];
        a1 += r5.h * r5.h, a0 += r5.l * r5.l (is);
_pix_norm1_blkfn_loopEnd:
        a1 += r4.h * r4.h, a0 += r4.l * r4.l (is);


// Clean up at the end:
        R2 = A0, R3 = A1;
        R0 = R2 + R3 (S);

        (R7:4,P5:3)=[SP++];

        RTS;
DEFUN_END(pix_norm1)

/*
 * sse4: R0 = v (not read), R1 = pix1, R2 = pix2; line_size and h are read
 * at [fp+20] and [fp+24].  Each of the h loop passes runs BYTEOP16M on the
 * two sources and accumulates the squares of the four 16-bit results into
 * A0/A1; A0 + A1 is returned in R0.  M0 = line_size - 4.
 *
 * NOTE(review): R0 and R2 are loaded once before the loop and are not
 * reloaded inside it (only R1 and R3 are).  Confirm that rows after the
 * first see the intended low words.
 */
DEFUN(sse4,mL1,
        (void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)):
        link 0;
        [--sp] = (r7:6);
        p0=[fp+24];   // h
        i0=r1;        // pix1
        i1=r2;        // pix2
        r2=[fp+20];   // line_size
        r2+=-4;
        m0=r2;        // line_size - 4

        a0=a1=0;
        LSETUP(.S40,.E40) LC0=P0;
        DISALGNEXCPT                       || R0 = [I0++]   || R2 = [I1++];

.S40:   DISALGNEXCPT                       || R1 = [I0++M0] || R3 = [I1++M0];
        (R7,R6) = BYTEOP16M (R1:0,R3:2);
        a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
.E40:   a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
        a0 += a1;
        r0 = a0;

        (r7:6) = [sp++];
        unlink;
        rts;
DEFUN_END(sse4)

/*
 * sse8: 8-byte-wide sum of squared BYTEOP16M results over h rows.
 * Arguments as for sse4; M0 = line_size - 8.
 */
DEFUN(sse8,mL1,
        (void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)):
        link 0;
        [--sp] = (r7:6);
        p0=[fp+24];   // h
        i0=r1;        // pix1
        i1=r2;        // pix2
        r2=[fp+20];   // line_size
        r2+=-8;
        m0=r2;        // line_size - 8

        a0=a1=0;
        LSETUP(.S80,.E80) LC0=P0;
        DISALGNEXCPT                       || R0 = [I0++]   || R2 = [I1++];

.S80:   DISALGNEXCPT                       || R1 = [I0++]   || R3 = [I1++];
        (R7,R6) = BYTEOP16M (R1:0,R3:2)    || R0 = [I0++M0] || R2 = [I1++M0];
        a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
        a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
        (R7,R6) = BYTEOP16M (R1:0,R3:2) (R)|| R0 = [I0++]   || R2 = [I1++];
        a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
.E80:   a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
        a0 += a1;
        r0 = a0;

        (r7:6) = [sp++];
        unlink;
        rts;
DEFUN_END(sse8)

/*
 * sse16: 16-byte-wide sum of squared BYTEOP16M results over h rows.
 * Arguments as for sse4; M0 = line_size - 16.
 */
DEFUN(sse16,mL1,
        (void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)):
        link 0;
        [--sp] = (r7:6);
        p0=[fp+24];   // h
        i0=r1;        // pix1
        i1=r2;        // pix2
        r2=[fp+20];   // line_size
        r2+=-16;
        m0=r2;        // line_size - 16

        a0=a1=0;
        DISALGNEXCPT                       || R0 = [I0++]   || R2 = [I1++];
        LSETUP(.S160,.E160) LC0=P0;

.S160:  DISALGNEXCPT                       || R1 = [I0++]   || R3 = [I1++];
        (R7,R6) = BYTEOP16M (R1:0,R3:2)    || R0 = [I0++]   || R2 = [I1++];
        a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
        a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
        (R7,R6) = BYTEOP16M (R1:0,R3:2) (R)|| R1 = [I0++]   || R3 = [I1++];
        a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
        a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
        (R7,R6) = BYTEOP16M (R1:0,R3:2)    || R0 = [I0++M0] || R2 = [I1++M0];
        a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
        a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
        (R7,R6) = BYTEOP16M (R1:0,R3:2) (R)|| R0 = [I0++]   || R2 = [I1++];
        a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
.E160:  a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
        a0 += a1;
        r0 = a0;

        (r7:6) = [sp++];
        unlink;
        rts;
DEFUN_END(sse16)