1/* 2 * Copyright (c) 2012 3 * MIPS Technologies, Inc., California. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its 14 * contributors may be used to endorse or promote products derived from 15 * this software without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 * 29 * Author: Stanislav Ocovaj (socovaj@mips.com) 30 * Szabolcs Pal (sabolc@mips.com) 31 * 32 * AAC coefficients encoder optimized for MIPS floating-point architecture 33 * 34 * This file is part of FFmpeg. 
35 * 36 * FFmpeg is free software; you can redistribute it and/or 37 * modify it under the terms of the GNU Lesser General Public 38 * License as published by the Free Software Foundation; either 39 * version 2.1 of the License, or (at your option) any later version. 40 * 41 * FFmpeg is distributed in the hope that it will be useful, 42 * but WITHOUT ANY WARRANTY; without even the implied warranty of 43 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 44 * Lesser General Public License for more details. 45 * 46 * You should have received a copy of the GNU Lesser General Public 47 * License along with FFmpeg; if not, write to the Free Software 48 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 49 */ 50 51/** 52 * @file 53 * Reference: libavcodec/aaccoder.c 54 */ 55 56#include "libavutil/libm.h" 57 58#include <float.h> 59#include "libavutil/mathematics.h" 60#include "libavcodec/avcodec.h" 61#include "libavcodec/put_bits.h" 62#include "libavcodec/aac.h" 63#include "libavcodec/aacenc.h" 64#include "libavcodec/aactab.h" 65 66#if HAVE_INLINE_ASM 67typedef struct BandCodingPath { 68 int prev_idx; 69 float cost; 70 int run; 71} BandCodingPath; 72 73static const uint8_t run_value_bits_long[64] = { 74 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 75 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 10, 76 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 77 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 15 78}; 79 80static const uint8_t run_value_bits_short[16] = { 81 3, 3, 3, 3, 3, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 9 82}; 83 84static const uint8_t *run_value_bits[2] = { 85 run_value_bits_long, run_value_bits_short 86}; 87 88static const uint8_t uquad_sign_bits[81] = { 89 0, 1, 1, 1, 2, 2, 1, 2, 2, 90 1, 2, 2, 2, 3, 3, 2, 3, 3, 91 1, 2, 2, 2, 3, 3, 2, 3, 3, 92 1, 2, 2, 2, 3, 3, 2, 3, 3, 93 2, 3, 3, 3, 4, 4, 3, 4, 4, 94 2, 3, 3, 3, 4, 4, 3, 4, 4, 95 1, 2, 2, 2, 3, 3, 2, 3, 3, 96 2, 3, 3, 3, 4, 4, 3, 4, 4, 97 
2, 3, 3, 3, 4, 4, 3, 4, 4 98}; 99 100static const uint8_t upair7_sign_bits[64] = { 101 0, 1, 1, 1, 1, 1, 1, 1, 102 1, 2, 2, 2, 2, 2, 2, 2, 103 1, 2, 2, 2, 2, 2, 2, 2, 104 1, 2, 2, 2, 2, 2, 2, 2, 105 1, 2, 2, 2, 2, 2, 2, 2, 106 1, 2, 2, 2, 2, 2, 2, 2, 107 1, 2, 2, 2, 2, 2, 2, 2, 108 1, 2, 2, 2, 2, 2, 2, 2, 109}; 110 111static const uint8_t upair12_sign_bits[169] = { 112 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 113 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 114 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 115 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 116 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 117 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 118 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 119 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 120 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 121 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 122 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 123 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 124 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 125}; 126 127static const uint8_t esc_sign_bits[289] = { 128 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 129 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 130 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 131 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 132 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 133 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 134 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 135 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 136 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 137 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 138 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 139 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 140 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 141 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 142 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 143 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 144 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 145}; 146 147static void abs_pow34_v(float *out, const float *in, const int size) 
{ 148#ifndef USE_REALLY_FULL_SEARCH 149 int i; 150 float a, b, c, d; 151 float ax, bx, cx, dx; 152 153 for (i = 0; i < size; i += 4) { 154 a = fabsf(in[i ]); 155 b = fabsf(in[i+1]); 156 c = fabsf(in[i+2]); 157 d = fabsf(in[i+3]); 158 159 ax = sqrtf(a); 160 bx = sqrtf(b); 161 cx = sqrtf(c); 162 dx = sqrtf(d); 163 164 a = a * ax; 165 b = b * bx; 166 c = c * cx; 167 d = d * dx; 168 169 out[i ] = sqrtf(a); 170 out[i+1] = sqrtf(b); 171 out[i+2] = sqrtf(c); 172 out[i+3] = sqrtf(d); 173 } 174#endif /* USE_REALLY_FULL_SEARCH */ 175} 176 177static float find_max_val(int group_len, int swb_size, const float *scaled) { 178 float maxval = 0.0f; 179 int w2, i; 180 for (w2 = 0; w2 < group_len; w2++) { 181 for (i = 0; i < swb_size; i++) { 182 maxval = FFMAX(maxval, scaled[w2*128+i]); 183 } 184 } 185 return maxval; 186} 187 188static int find_min_book(float maxval, int sf) { 189 float Q = ff_aac_pow2sf_tab[POW_SF2_ZERO - sf + SCALE_ONE_POS - SCALE_DIV_512]; 190 float Q34 = sqrtf(Q * sqrtf(Q)); 191 int qmaxval, cb; 192 qmaxval = maxval * Q34 + 0.4054f; 193 if (qmaxval == 0) cb = 0; 194 else if (qmaxval == 1) cb = 1; 195 else if (qmaxval == 2) cb = 3; 196 else if (qmaxval <= 4) cb = 5; 197 else if (qmaxval <= 7) cb = 7; 198 else if (qmaxval <= 12) cb = 9; 199 else cb = 11; 200 return cb; 201} 202 203/** 204 * Functions developed from template function and optimized for quantizing and encoding band 205 */ 206static void quantize_and_encode_band_cost_SQUAD_mips(struct AACEncContext *s, 207 PutBitContext *pb, const float *in, 208 const float *scaled, int size, int scale_idx, 209 int cb, const float lambda, const float uplim, 210 int *bits) 211{ 212 const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512]; 213 int i; 214 int qc1, qc2, qc3, qc4; 215 216 uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1]; 217 uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1]; 218 219 abs_pow34_v(s->scoefs, in, size); 220 scaled = s->scoefs; 221 for (i = 
0; i < size; i += 4) { 222 int curidx; 223 int *in_int = (int *)&in[i]; 224 225 qc1 = scaled[i ] * Q34 + 0.4054f; 226 qc2 = scaled[i+1] * Q34 + 0.4054f; 227 qc3 = scaled[i+2] * Q34 + 0.4054f; 228 qc4 = scaled[i+3] * Q34 + 0.4054f; 229 230 __asm__ volatile ( 231 ".set push \n\t" 232 ".set noreorder \n\t" 233 234 "slt %[qc1], $zero, %[qc1] \n\t" 235 "slt %[qc2], $zero, %[qc2] \n\t" 236 "slt %[qc3], $zero, %[qc3] \n\t" 237 "slt %[qc4], $zero, %[qc4] \n\t" 238 "lw $t0, 0(%[in_int]) \n\t" 239 "lw $t1, 4(%[in_int]) \n\t" 240 "lw $t2, 8(%[in_int]) \n\t" 241 "lw $t3, 12(%[in_int]) \n\t" 242 "srl $t0, $t0, 31 \n\t" 243 "srl $t1, $t1, 31 \n\t" 244 "srl $t2, $t2, 31 \n\t" 245 "srl $t3, $t3, 31 \n\t" 246 "subu $t4, $zero, %[qc1] \n\t" 247 "subu $t5, $zero, %[qc2] \n\t" 248 "subu $t6, $zero, %[qc3] \n\t" 249 "subu $t7, $zero, %[qc4] \n\t" 250 "movn %[qc1], $t4, $t0 \n\t" 251 "movn %[qc2], $t5, $t1 \n\t" 252 "movn %[qc3], $t6, $t2 \n\t" 253 "movn %[qc4], $t7, $t3 \n\t" 254 255 ".set pop \n\t" 256 257 : [qc1]"+r"(qc1), [qc2]"+r"(qc2), 258 [qc3]"+r"(qc3), [qc4]"+r"(qc4) 259 : [in_int]"r"(in_int) 260 : "t0", "t1", "t2", "t3", 261 "t4", "t5", "t6", "t7", 262 "memory" 263 ); 264 265 curidx = qc1; 266 curidx *= 3; 267 curidx += qc2; 268 curidx *= 3; 269 curidx += qc3; 270 curidx *= 3; 271 curidx += qc4; 272 curidx += 40; 273 274 put_bits(pb, p_bits[curidx], p_codes[curidx]); 275 } 276} 277 278static void quantize_and_encode_band_cost_UQUAD_mips(struct AACEncContext *s, 279 PutBitContext *pb, const float *in, 280 const float *scaled, int size, int scale_idx, 281 int cb, const float lambda, const float uplim, 282 int *bits) 283{ 284 const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512]; 285 int i; 286 int qc1, qc2, qc3, qc4; 287 288 uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1]; 289 uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1]; 290 291 abs_pow34_v(s->scoefs, in, size); 292 scaled = s->scoefs; 293 for (i = 0; i < size; i 
+= 4) { 294 int curidx, sign, count; 295 int *in_int = (int *)&in[i]; 296 uint8_t v_bits; 297 unsigned int v_codes; 298 299 qc1 = scaled[i ] * Q34 + 0.4054f; 300 qc2 = scaled[i+1] * Q34 + 0.4054f; 301 qc3 = scaled[i+2] * Q34 + 0.4054f; 302 qc4 = scaled[i+3] * Q34 + 0.4054f; 303 304 __asm__ volatile ( 305 ".set push \n\t" 306 ".set noreorder \n\t" 307 308 "ori $t4, $zero, 2 \n\t" 309 "ori %[sign], $zero, 0 \n\t" 310 "slt $t0, $t4, %[qc1] \n\t" 311 "slt $t1, $t4, %[qc2] \n\t" 312 "slt $t2, $t4, %[qc3] \n\t" 313 "slt $t3, $t4, %[qc4] \n\t" 314 "movn %[qc1], $t4, $t0 \n\t" 315 "movn %[qc2], $t4, $t1 \n\t" 316 "movn %[qc3], $t4, $t2 \n\t" 317 "movn %[qc4], $t4, $t3 \n\t" 318 "lw $t0, 0(%[in_int]) \n\t" 319 "lw $t1, 4(%[in_int]) \n\t" 320 "lw $t2, 8(%[in_int]) \n\t" 321 "lw $t3, 12(%[in_int]) \n\t" 322 "slt $t0, $t0, $zero \n\t" 323 "movn %[sign], $t0, %[qc1] \n\t" 324 "slt $t1, $t1, $zero \n\t" 325 "slt $t2, $t2, $zero \n\t" 326 "slt $t3, $t3, $zero \n\t" 327 "sll $t0, %[sign], 1 \n\t" 328 "or $t0, $t0, $t1 \n\t" 329 "movn %[sign], $t0, %[qc2] \n\t" 330 "slt $t4, $zero, %[qc1] \n\t" 331 "slt $t1, $zero, %[qc2] \n\t" 332 "slt %[count], $zero, %[qc3] \n\t" 333 "sll $t0, %[sign], 1 \n\t" 334 "or $t0, $t0, $t2 \n\t" 335 "movn %[sign], $t0, %[qc3] \n\t" 336 "slt $t2, $zero, %[qc4] \n\t" 337 "addu %[count], %[count], $t4 \n\t" 338 "addu %[count], %[count], $t1 \n\t" 339 "sll $t0, %[sign], 1 \n\t" 340 "or $t0, $t0, $t3 \n\t" 341 "movn %[sign], $t0, %[qc4] \n\t" 342 "addu %[count], %[count], $t2 \n\t" 343 344 ".set pop \n\t" 345 346 : [qc1]"+r"(qc1), [qc2]"+r"(qc2), 347 [qc3]"+r"(qc3), [qc4]"+r"(qc4), 348 [sign]"=&r"(sign), [count]"=&r"(count) 349 : [in_int]"r"(in_int) 350 : "t0", "t1", "t2", "t3", "t4", 351 "memory" 352 ); 353 354 curidx = qc1; 355 curidx *= 3; 356 curidx += qc2; 357 curidx *= 3; 358 curidx += qc3; 359 curidx *= 3; 360 curidx += qc4; 361 362 v_codes = (p_codes[curidx] << count) | (sign & ((1 << count) - 1)); 363 v_bits = p_bits[curidx] + count; 364 
put_bits(pb, v_bits, v_codes); 365 } 366} 367 368static void quantize_and_encode_band_cost_SPAIR_mips(struct AACEncContext *s, 369 PutBitContext *pb, const float *in, 370 const float *scaled, int size, int scale_idx, 371 int cb, const float lambda, const float uplim, 372 int *bits) 373{ 374 const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512]; 375 int i; 376 int qc1, qc2, qc3, qc4; 377 378 uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1]; 379 uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1]; 380 381 abs_pow34_v(s->scoefs, in, size); 382 scaled = s->scoefs; 383 for (i = 0; i < size; i += 4) { 384 int curidx, curidx2; 385 int *in_int = (int *)&in[i]; 386 uint8_t v_bits; 387 unsigned int v_codes; 388 389 qc1 = scaled[i ] * Q34 + 0.4054f; 390 qc2 = scaled[i+1] * Q34 + 0.4054f; 391 qc3 = scaled[i+2] * Q34 + 0.4054f; 392 qc4 = scaled[i+3] * Q34 + 0.4054f; 393 394 __asm__ volatile ( 395 ".set push \n\t" 396 ".set noreorder \n\t" 397 398 "ori $t4, $zero, 4 \n\t" 399 "slt $t0, $t4, %[qc1] \n\t" 400 "slt $t1, $t4, %[qc2] \n\t" 401 "slt $t2, $t4, %[qc3] \n\t" 402 "slt $t3, $t4, %[qc4] \n\t" 403 "movn %[qc1], $t4, $t0 \n\t" 404 "movn %[qc2], $t4, $t1 \n\t" 405 "movn %[qc3], $t4, $t2 \n\t" 406 "movn %[qc4], $t4, $t3 \n\t" 407 "lw $t0, 0(%[in_int]) \n\t" 408 "lw $t1, 4(%[in_int]) \n\t" 409 "lw $t2, 8(%[in_int]) \n\t" 410 "lw $t3, 12(%[in_int]) \n\t" 411 "srl $t0, $t0, 31 \n\t" 412 "srl $t1, $t1, 31 \n\t" 413 "srl $t2, $t2, 31 \n\t" 414 "srl $t3, $t3, 31 \n\t" 415 "subu $t4, $zero, %[qc1] \n\t" 416 "subu $t5, $zero, %[qc2] \n\t" 417 "subu $t6, $zero, %[qc3] \n\t" 418 "subu $t7, $zero, %[qc4] \n\t" 419 "movn %[qc1], $t4, $t0 \n\t" 420 "movn %[qc2], $t5, $t1 \n\t" 421 "movn %[qc3], $t6, $t2 \n\t" 422 "movn %[qc4], $t7, $t3 \n\t" 423 424 ".set pop \n\t" 425 426 : [qc1]"+r"(qc1), [qc2]"+r"(qc2), 427 [qc3]"+r"(qc3), [qc4]"+r"(qc4) 428 : [in_int]"r"(in_int) 429 : "t0", "t1", "t2", "t3", 430 "t4", "t5", "t6", "t7", 431 "memory" 
432 ); 433 434 curidx = 9 * qc1; 435 curidx += qc2 + 40; 436 437 curidx2 = 9 * qc3; 438 curidx2 += qc4 + 40; 439 440 v_codes = (p_codes[curidx] << p_bits[curidx2]) | (p_codes[curidx2]); 441 v_bits = p_bits[curidx] + p_bits[curidx2]; 442 put_bits(pb, v_bits, v_codes); 443 } 444} 445 446static void quantize_and_encode_band_cost_UPAIR7_mips(struct AACEncContext *s, 447 PutBitContext *pb, const float *in, 448 const float *scaled, int size, int scale_idx, 449 int cb, const float lambda, const float uplim, 450 int *bits) 451{ 452 const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512]; 453 int i; 454 int qc1, qc2, qc3, qc4; 455 456 uint8_t *p_bits = (uint8_t*) ff_aac_spectral_bits[cb-1]; 457 uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1]; 458 459 abs_pow34_v(s->scoefs, in, size); 460 scaled = s->scoefs; 461 for (i = 0; i < size; i += 4) { 462 int curidx, sign1, count1, sign2, count2; 463 int *in_int = (int *)&in[i]; 464 uint8_t v_bits; 465 unsigned int v_codes; 466 467 qc1 = scaled[i ] * Q34 + 0.4054f; 468 qc2 = scaled[i+1] * Q34 + 0.4054f; 469 qc3 = scaled[i+2] * Q34 + 0.4054f; 470 qc4 = scaled[i+3] * Q34 + 0.4054f; 471 472 __asm__ volatile ( 473 ".set push \n\t" 474 ".set noreorder \n\t" 475 476 "ori $t4, $zero, 7 \n\t" 477 "ori %[sign1], $zero, 0 \n\t" 478 "ori %[sign2], $zero, 0 \n\t" 479 "slt $t0, $t4, %[qc1] \n\t" 480 "slt $t1, $t4, %[qc2] \n\t" 481 "slt $t2, $t4, %[qc3] \n\t" 482 "slt $t3, $t4, %[qc4] \n\t" 483 "movn %[qc1], $t4, $t0 \n\t" 484 "movn %[qc2], $t4, $t1 \n\t" 485 "movn %[qc3], $t4, $t2 \n\t" 486 "movn %[qc4], $t4, $t3 \n\t" 487 "lw $t0, 0(%[in_int]) \n\t" 488 "lw $t1, 4(%[in_int]) \n\t" 489 "lw $t2, 8(%[in_int]) \n\t" 490 "lw $t3, 12(%[in_int]) \n\t" 491 "slt $t0, $t0, $zero \n\t" 492 "movn %[sign1], $t0, %[qc1] \n\t" 493 "slt $t2, $t2, $zero \n\t" 494 "movn %[sign2], $t2, %[qc3] \n\t" 495 "slt $t1, $t1, $zero \n\t" 496 "sll $t0, %[sign1], 1 \n\t" 497 "or $t0, $t0, $t1 \n\t" 498 "movn %[sign1], $t0, 
%[qc2] \n\t" 499 "slt $t3, $t3, $zero \n\t" 500 "sll $t0, %[sign2], 1 \n\t" 501 "or $t0, $t0, $t3 \n\t" 502 "movn %[sign2], $t0, %[qc4] \n\t" 503 "slt %[count1], $zero, %[qc1] \n\t" 504 "slt $t1, $zero, %[qc2] \n\t" 505 "slt %[count2], $zero, %[qc3] \n\t" 506 "slt $t2, $zero, %[qc4] \n\t" 507 "addu %[count1], %[count1], $t1 \n\t" 508 "addu %[count2], %[count2], $t2 \n\t" 509 510 ".set pop \n\t" 511 512 : [qc1]"+r"(qc1), [qc2]"+r"(qc2), 513 [qc3]"+r"(qc3), [qc4]"+r"(qc4), 514 [sign1]"=&r"(sign1), [count1]"=&r"(count1), 515 [sign2]"=&r"(sign2), [count2]"=&r"(count2) 516 : [in_int]"r"(in_int) 517 : "t0", "t1", "t2", "t3", "t4", 518 "memory" 519 ); 520 521 curidx = 8 * qc1; 522 curidx += qc2; 523 524 v_codes = (p_codes[curidx] << count1) | sign1; 525 v_bits = p_bits[curidx] + count1; 526 put_bits(pb, v_bits, v_codes); 527 528 curidx = 8 * qc3; 529 curidx += qc4; 530 531 v_codes = (p_codes[curidx] << count2) | sign2; 532 v_bits = p_bits[curidx] + count2; 533 put_bits(pb, v_bits, v_codes); 534 } 535} 536 537static void quantize_and_encode_band_cost_UPAIR12_mips(struct AACEncContext *s, 538 PutBitContext *pb, const float *in, 539 const float *scaled, int size, int scale_idx, 540 int cb, const float lambda, const float uplim, 541 int *bits) 542{ 543 const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512]; 544 int i; 545 int qc1, qc2, qc3, qc4; 546 547 uint8_t *p_bits = (uint8_t*) ff_aac_spectral_bits[cb-1]; 548 uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1]; 549 550 abs_pow34_v(s->scoefs, in, size); 551 scaled = s->scoefs; 552 for (i = 0; i < size; i += 4) { 553 int curidx, sign1, count1, sign2, count2; 554 int *in_int = (int *)&in[i]; 555 uint8_t v_bits; 556 unsigned int v_codes; 557 558 qc1 = scaled[i ] * Q34 + 0.4054f; 559 qc2 = scaled[i+1] * Q34 + 0.4054f; 560 qc3 = scaled[i+2] * Q34 + 0.4054f; 561 qc4 = scaled[i+3] * Q34 + 0.4054f; 562 563 __asm__ volatile ( 564 ".set push \n\t" 565 ".set noreorder \n\t" 566 567 "ori 
$t4, $zero, 12 \n\t" 568 "ori %[sign1], $zero, 0 \n\t" 569 "ori %[sign2], $zero, 0 \n\t" 570 "slt $t0, $t4, %[qc1] \n\t" 571 "slt $t1, $t4, %[qc2] \n\t" 572 "slt $t2, $t4, %[qc3] \n\t" 573 "slt $t3, $t4, %[qc4] \n\t" 574 "movn %[qc1], $t4, $t0 \n\t" 575 "movn %[qc2], $t4, $t1 \n\t" 576 "movn %[qc3], $t4, $t2 \n\t" 577 "movn %[qc4], $t4, $t3 \n\t" 578 "lw $t0, 0(%[in_int]) \n\t" 579 "lw $t1, 4(%[in_int]) \n\t" 580 "lw $t2, 8(%[in_int]) \n\t" 581 "lw $t3, 12(%[in_int]) \n\t" 582 "slt $t0, $t0, $zero \n\t" 583 "movn %[sign1], $t0, %[qc1] \n\t" 584 "slt $t2, $t2, $zero \n\t" 585 "movn %[sign2], $t2, %[qc3] \n\t" 586 "slt $t1, $t1, $zero \n\t" 587 "sll $t0, %[sign1], 1 \n\t" 588 "or $t0, $t0, $t1 \n\t" 589 "movn %[sign1], $t0, %[qc2] \n\t" 590 "slt $t3, $t3, $zero \n\t" 591 "sll $t0, %[sign2], 1 \n\t" 592 "or $t0, $t0, $t3 \n\t" 593 "movn %[sign2], $t0, %[qc4] \n\t" 594 "slt %[count1], $zero, %[qc1] \n\t" 595 "slt $t1, $zero, %[qc2] \n\t" 596 "slt %[count2], $zero, %[qc3] \n\t" 597 "slt $t2, $zero, %[qc4] \n\t" 598 "addu %[count1], %[count1], $t1 \n\t" 599 "addu %[count2], %[count2], $t2 \n\t" 600 601 ".set pop \n\t" 602 603 : [qc1]"+r"(qc1), [qc2]"+r"(qc2), 604 [qc3]"+r"(qc3), [qc4]"+r"(qc4), 605 [sign1]"=&r"(sign1), [count1]"=&r"(count1), 606 [sign2]"=&r"(sign2), [count2]"=&r"(count2) 607 : [in_int]"r"(in_int) 608 : "t0", "t1", "t2", "t3", "t4", 609 "memory" 610 ); 611 612 curidx = 13 * qc1; 613 curidx += qc2; 614 615 v_codes = (p_codes[curidx] << count1) | sign1; 616 v_bits = p_bits[curidx] + count1; 617 put_bits(pb, v_bits, v_codes); 618 619 curidx = 13 * qc3; 620 curidx += qc4; 621 622 v_codes = (p_codes[curidx] << count2) | sign2; 623 v_bits = p_bits[curidx] + count2; 624 put_bits(pb, v_bits, v_codes); 625 } 626} 627 628static void quantize_and_encode_band_cost_ESC_mips(struct AACEncContext *s, 629 PutBitContext *pb, const float *in, 630 const float *scaled, int size, int scale_idx, 631 int cb, const float lambda, const float uplim, 632 int *bits) 633{ 634 const 
float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512]; 635 int i; 636 int qc1, qc2, qc3, qc4; 637 638 uint8_t *p_bits = (uint8_t* )ff_aac_spectral_bits[cb-1]; 639 uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1]; 640 float *p_vectors = (float* )ff_aac_codebook_vectors[cb-1]; 641 642 abs_pow34_v(s->scoefs, in, size); 643 scaled = s->scoefs; 644 645 if (cb < 11) { 646 for (i = 0; i < size; i += 4) { 647 int curidx, curidx2, sign1, count1, sign2, count2; 648 int *in_int = (int *)&in[i]; 649 uint8_t v_bits; 650 unsigned int v_codes; 651 652 qc1 = scaled[i ] * Q34 + 0.4054f; 653 qc2 = scaled[i+1] * Q34 + 0.4054f; 654 qc3 = scaled[i+2] * Q34 + 0.4054f; 655 qc4 = scaled[i+3] * Q34 + 0.4054f; 656 657 __asm__ volatile ( 658 ".set push \n\t" 659 ".set noreorder \n\t" 660 661 "ori $t4, $zero, 16 \n\t" 662 "ori %[sign1], $zero, 0 \n\t" 663 "ori %[sign2], $zero, 0 \n\t" 664 "slt $t0, $t4, %[qc1] \n\t" 665 "slt $t1, $t4, %[qc2] \n\t" 666 "slt $t2, $t4, %[qc3] \n\t" 667 "slt $t3, $t4, %[qc4] \n\t" 668 "movn %[qc1], $t4, $t0 \n\t" 669 "movn %[qc2], $t4, $t1 \n\t" 670 "movn %[qc3], $t4, $t2 \n\t" 671 "movn %[qc4], $t4, $t3 \n\t" 672 "lw $t0, 0(%[in_int]) \n\t" 673 "lw $t1, 4(%[in_int]) \n\t" 674 "lw $t2, 8(%[in_int]) \n\t" 675 "lw $t3, 12(%[in_int]) \n\t" 676 "slt $t0, $t0, $zero \n\t" 677 "movn %[sign1], $t0, %[qc1] \n\t" 678 "slt $t2, $t2, $zero \n\t" 679 "movn %[sign2], $t2, %[qc3] \n\t" 680 "slt $t1, $t1, $zero \n\t" 681 "sll $t0, %[sign1], 1 \n\t" 682 "or $t0, $t0, $t1 \n\t" 683 "movn %[sign1], $t0, %[qc2] \n\t" 684 "slt $t3, $t3, $zero \n\t" 685 "sll $t0, %[sign2], 1 \n\t" 686 "or $t0, $t0, $t3 \n\t" 687 "movn %[sign2], $t0, %[qc4] \n\t" 688 "slt %[count1], $zero, %[qc1] \n\t" 689 "slt $t1, $zero, %[qc2] \n\t" 690 "slt %[count2], $zero, %[qc3] \n\t" 691 "slt $t2, $zero, %[qc4] \n\t" 692 "addu %[count1], %[count1], $t1 \n\t" 693 "addu %[count2], %[count2], $t2 \n\t" 694 695 ".set pop \n\t" 696 697 : [qc1]"+r"(qc1), [qc2]"+r"(qc2), 
698 [qc3]"+r"(qc3), [qc4]"+r"(qc4), 699 [sign1]"=&r"(sign1), [count1]"=&r"(count1), 700 [sign2]"=&r"(sign2), [count2]"=&r"(count2) 701 : [in_int]"r"(in_int) 702 : "t0", "t1", "t2", "t3", "t4", 703 "memory" 704 ); 705 706 curidx = 17 * qc1; 707 curidx += qc2; 708 curidx2 = 17 * qc3; 709 curidx2 += qc4; 710 711 v_codes = (p_codes[curidx] << count1) | sign1; 712 v_bits = p_bits[curidx] + count1; 713 put_bits(pb, v_bits, v_codes); 714 715 v_codes = (p_codes[curidx2] << count2) | sign2; 716 v_bits = p_bits[curidx2] + count2; 717 put_bits(pb, v_bits, v_codes); 718 } 719 } else { 720 for (i = 0; i < size; i += 4) { 721 int curidx, curidx2, sign1, count1, sign2, count2; 722 int *in_int = (int *)&in[i]; 723 uint8_t v_bits; 724 unsigned int v_codes; 725 int c1, c2, c3, c4; 726 727 qc1 = scaled[i ] * Q34 + 0.4054f; 728 qc2 = scaled[i+1] * Q34 + 0.4054f; 729 qc3 = scaled[i+2] * Q34 + 0.4054f; 730 qc4 = scaled[i+3] * Q34 + 0.4054f; 731 732 __asm__ volatile ( 733 ".set push \n\t" 734 ".set noreorder \n\t" 735 736 "ori $t4, $zero, 16 \n\t" 737 "ori %[sign1], $zero, 0 \n\t" 738 "ori %[sign2], $zero, 0 \n\t" 739 "shll_s.w %[c1], %[qc1], 18 \n\t" 740 "shll_s.w %[c2], %[qc2], 18 \n\t" 741 "shll_s.w %[c3], %[qc3], 18 \n\t" 742 "shll_s.w %[c4], %[qc4], 18 \n\t" 743 "srl %[c1], %[c1], 18 \n\t" 744 "srl %[c2], %[c2], 18 \n\t" 745 "srl %[c3], %[c3], 18 \n\t" 746 "srl %[c4], %[c4], 18 \n\t" 747 "slt $t0, $t4, %[qc1] \n\t" 748 "slt $t1, $t4, %[qc2] \n\t" 749 "slt $t2, $t4, %[qc3] \n\t" 750 "slt $t3, $t4, %[qc4] \n\t" 751 "movn %[qc1], $t4, $t0 \n\t" 752 "movn %[qc2], $t4, $t1 \n\t" 753 "movn %[qc3], $t4, $t2 \n\t" 754 "movn %[qc4], $t4, $t3 \n\t" 755 "lw $t0, 0(%[in_int]) \n\t" 756 "lw $t1, 4(%[in_int]) \n\t" 757 "lw $t2, 8(%[in_int]) \n\t" 758 "lw $t3, 12(%[in_int]) \n\t" 759 "slt $t0, $t0, $zero \n\t" 760 "movn %[sign1], $t0, %[qc1] \n\t" 761 "slt $t2, $t2, $zero \n\t" 762 "movn %[sign2], $t2, %[qc3] \n\t" 763 "slt $t1, $t1, $zero \n\t" 764 "sll $t0, %[sign1], 1 \n\t" 765 "or $t0, $t0, 
$t1 \n\t" 766 "movn %[sign1], $t0, %[qc2] \n\t" 767 "slt $t3, $t3, $zero \n\t" 768 "sll $t0, %[sign2], 1 \n\t" 769 "or $t0, $t0, $t3 \n\t" 770 "movn %[sign2], $t0, %[qc4] \n\t" 771 "slt %[count1], $zero, %[qc1] \n\t" 772 "slt $t1, $zero, %[qc2] \n\t" 773 "slt %[count2], $zero, %[qc3] \n\t" 774 "slt $t2, $zero, %[qc4] \n\t" 775 "addu %[count1], %[count1], $t1 \n\t" 776 "addu %[count2], %[count2], $t2 \n\t" 777 778 ".set pop \n\t" 779 780 : [qc1]"+r"(qc1), [qc2]"+r"(qc2), 781 [qc3]"+r"(qc3), [qc4]"+r"(qc4), 782 [sign1]"=&r"(sign1), [count1]"=&r"(count1), 783 [sign2]"=&r"(sign2), [count2]"=&r"(count2), 784 [c1]"=&r"(c1), [c2]"=&r"(c2), 785 [c3]"=&r"(c3), [c4]"=&r"(c4) 786 : [in_int]"r"(in_int) 787 : "t0", "t1", "t2", "t3", "t4", 788 "memory" 789 ); 790 791 curidx = 17 * qc1; 792 curidx += qc2; 793 794 curidx2 = 17 * qc3; 795 curidx2 += qc4; 796 797 v_codes = (p_codes[curidx] << count1) | sign1; 798 v_bits = p_bits[curidx] + count1; 799 put_bits(pb, v_bits, v_codes); 800 801 if (p_vectors[curidx*2 ] == 64.0f) { 802 int len = av_log2(c1); 803 v_codes = (((1 << (len - 3)) - 2) << len) | (c1 & ((1 << len) - 1)); 804 put_bits(pb, len * 2 - 3, v_codes); 805 } 806 if (p_vectors[curidx*2+1] == 64.0f) { 807 int len = av_log2(c2); 808 v_codes = (((1 << (len - 3)) - 2) << len) | (c2 & ((1 << len) - 1)); 809 put_bits(pb, len*2-3, v_codes); 810 } 811 812 v_codes = (p_codes[curidx2] << count2) | sign2; 813 v_bits = p_bits[curidx2] + count2; 814 put_bits(pb, v_bits, v_codes); 815 816 if (p_vectors[curidx2*2 ] == 64.0f) { 817 int len = av_log2(c3); 818 v_codes = (((1 << (len - 3)) - 2) << len) | (c3 & ((1 << len) - 1)); 819 put_bits(pb, len* 2 - 3, v_codes); 820 } 821 if (p_vectors[curidx2*2+1] == 64.0f) { 822 int len = av_log2(c4); 823 v_codes = (((1 << (len - 3)) - 2) << len) | (c4 & ((1 << len) - 1)); 824 put_bits(pb, len * 2 - 3, v_codes); 825 } 826 } 827 } 828} 829 830static void (*const quantize_and_encode_band_cost_arr[])(struct AACEncContext *s, 831 PutBitContext *pb, const 
float *in, 832 const float *scaled, int size, int scale_idx, 833 int cb, const float lambda, const float uplim, 834 int *bits) = { 835 NULL, 836 quantize_and_encode_band_cost_SQUAD_mips, 837 quantize_and_encode_band_cost_SQUAD_mips, 838 quantize_and_encode_band_cost_UQUAD_mips, 839 quantize_and_encode_band_cost_UQUAD_mips, 840 quantize_and_encode_band_cost_SPAIR_mips, 841 quantize_and_encode_band_cost_SPAIR_mips, 842 quantize_and_encode_band_cost_UPAIR7_mips, 843 quantize_and_encode_band_cost_UPAIR7_mips, 844 quantize_and_encode_band_cost_UPAIR12_mips, 845 quantize_and_encode_band_cost_UPAIR12_mips, 846 quantize_and_encode_band_cost_ESC_mips, 847}; 848 849#define quantize_and_encode_band_cost( \ 850 s, pb, in, scaled, size, scale_idx, cb, \ 851 lambda, uplim, bits) \ 852 quantize_and_encode_band_cost_arr[cb]( \ 853 s, pb, in, scaled, size, scale_idx, cb, \ 854 lambda, uplim, bits) 855 856static void quantize_and_encode_band_mips(struct AACEncContext *s, PutBitContext *pb, 857 const float *in, int size, int scale_idx, 858 int cb, const float lambda) 859{ 860 quantize_and_encode_band_cost(s, pb, in, NULL, size, scale_idx, cb, lambda, 861 INFINITY, NULL); 862} 863 864/** 865 * Functions developed from template function and optimized for getting the number of bits 866 */ 867static float get_band_numbits_ZERO_mips(struct AACEncContext *s, 868 PutBitContext *pb, const float *in, 869 const float *scaled, int size, int scale_idx, 870 int cb, const float lambda, const float uplim, 871 int *bits) 872{ 873 return 0; 874} 875 876static float get_band_numbits_SQUAD_mips(struct AACEncContext *s, 877 PutBitContext *pb, const float *in, 878 const float *scaled, int size, int scale_idx, 879 int cb, const float lambda, const float uplim, 880 int *bits) 881{ 882 const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512]; 883 int i; 884 int qc1, qc2, qc3, qc4; 885 int curbits = 0; 886 887 uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1]; 888 
889 for (i = 0; i < size; i += 4) { 890 int curidx; 891 int *in_int = (int *)&in[i]; 892 893 qc1 = scaled[i ] * Q34 + 0.4054f; 894 qc2 = scaled[i+1] * Q34 + 0.4054f; 895 qc3 = scaled[i+2] * Q34 + 0.4054f; 896 qc4 = scaled[i+3] * Q34 + 0.4054f; 897 898 __asm__ volatile ( 899 ".set push \n\t" 900 ".set noreorder \n\t" 901 902 "slt %[qc1], $zero, %[qc1] \n\t" 903 "slt %[qc2], $zero, %[qc2] \n\t" 904 "slt %[qc3], $zero, %[qc3] \n\t" 905 "slt %[qc4], $zero, %[qc4] \n\t" 906 "lw $t0, 0(%[in_int]) \n\t" 907 "lw $t1, 4(%[in_int]) \n\t" 908 "lw $t2, 8(%[in_int]) \n\t" 909 "lw $t3, 12(%[in_int]) \n\t" 910 "srl $t0, $t0, 31 \n\t" 911 "srl $t1, $t1, 31 \n\t" 912 "srl $t2, $t2, 31 \n\t" 913 "srl $t3, $t3, 31 \n\t" 914 "subu $t4, $zero, %[qc1] \n\t" 915 "subu $t5, $zero, %[qc2] \n\t" 916 "subu $t6, $zero, %[qc3] \n\t" 917 "subu $t7, $zero, %[qc4] \n\t" 918 "movn %[qc1], $t4, $t0 \n\t" 919 "movn %[qc2], $t5, $t1 \n\t" 920 "movn %[qc3], $t6, $t2 \n\t" 921 "movn %[qc4], $t7, $t3 \n\t" 922 923 ".set pop \n\t" 924 925 : [qc1]"+r"(qc1), [qc2]"+r"(qc2), 926 [qc3]"+r"(qc3), [qc4]"+r"(qc4) 927 : [in_int]"r"(in_int) 928 : "t0", "t1", "t2", "t3", 929 "t4", "t5", "t6", "t7", 930 "memory" 931 ); 932 933 curidx = qc1; 934 curidx *= 3; 935 curidx += qc2; 936 curidx *= 3; 937 curidx += qc3; 938 curidx *= 3; 939 curidx += qc4; 940 curidx += 40; 941 942 curbits += p_bits[curidx]; 943 } 944 return curbits; 945} 946 947static float get_band_numbits_UQUAD_mips(struct AACEncContext *s, 948 PutBitContext *pb, const float *in, 949 const float *scaled, int size, int scale_idx, 950 int cb, const float lambda, const float uplim, 951 int *bits) 952{ 953 const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512]; 954 int i; 955 int curbits = 0; 956 int qc1, qc2, qc3, qc4; 957 958 uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1]; 959 960 for (i = 0; i < size; i += 4) { 961 int curidx; 962 963 qc1 = scaled[i ] * Q34 + 0.4054f; 964 qc2 = scaled[i+1] * Q34 + 0.4054f; 
965 qc3 = scaled[i+2] * Q34 + 0.4054f; 966 qc4 = scaled[i+3] * Q34 + 0.4054f; 967 968 __asm__ volatile ( 969 ".set push \n\t" 970 ".set noreorder \n\t" 971 972 "ori $t4, $zero, 2 \n\t" 973 "slt $t0, $t4, %[qc1] \n\t" 974 "slt $t1, $t4, %[qc2] \n\t" 975 "slt $t2, $t4, %[qc3] \n\t" 976 "slt $t3, $t4, %[qc4] \n\t" 977 "movn %[qc1], $t4, $t0 \n\t" 978 "movn %[qc2], $t4, $t1 \n\t" 979 "movn %[qc3], $t4, $t2 \n\t" 980 "movn %[qc4], $t4, $t3 \n\t" 981 982 ".set pop \n\t" 983 984 : [qc1]"+r"(qc1), [qc2]"+r"(qc2), 985 [qc3]"+r"(qc3), [qc4]"+r"(qc4) 986 : 987 : "t0", "t1", "t2", "t3", "t4" 988 ); 989 990 curidx = qc1; 991 curidx *= 3; 992 curidx += qc2; 993 curidx *= 3; 994 curidx += qc3; 995 curidx *= 3; 996 curidx += qc4; 997 998 curbits += p_bits[curidx]; 999 curbits += uquad_sign_bits[curidx]; 1000 } 1001 return curbits; 1002} 1003 1004static float get_band_numbits_SPAIR_mips(struct AACEncContext *s, 1005 PutBitContext *pb, const float *in, 1006 const float *scaled, int size, int scale_idx, 1007 int cb, const float lambda, const float uplim, 1008 int *bits) 1009{ 1010 const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512]; 1011 int i; 1012 int qc1, qc2, qc3, qc4; 1013 int curbits = 0; 1014 1015 uint8_t *p_bits = (uint8_t*)ff_aac_spectral_bits[cb-1]; 1016 1017 for (i = 0; i < size; i += 4) { 1018 int curidx, curidx2; 1019 int *in_int = (int *)&in[i]; 1020 1021 qc1 = scaled[i ] * Q34 + 0.4054f; 1022 qc2 = scaled[i+1] * Q34 + 0.4054f; 1023 qc3 = scaled[i+2] * Q34 + 0.4054f; 1024 qc4 = scaled[i+3] * Q34 + 0.4054f; 1025 1026 __asm__ volatile ( 1027 ".set push \n\t" 1028 ".set noreorder \n\t" 1029 1030 "ori $t4, $zero, 4 \n\t" 1031 "slt $t0, $t4, %[qc1] \n\t" 1032 "slt $t1, $t4, %[qc2] \n\t" 1033 "slt $t2, $t4, %[qc3] \n\t" 1034 "slt $t3, $t4, %[qc4] \n\t" 1035 "movn %[qc1], $t4, $t0 \n\t" 1036 "movn %[qc2], $t4, $t1 \n\t" 1037 "movn %[qc3], $t4, $t2 \n\t" 1038 "movn %[qc4], $t4, $t3 \n\t" 1039 "lw $t0, 0(%[in_int]) \n\t" 1040 "lw $t1, 
4(%[in_int]) \n\t" 1041 "lw $t2, 8(%[in_int]) \n\t" 1042 "lw $t3, 12(%[in_int]) \n\t" 1043 "srl $t0, $t0, 31 \n\t" 1044 "srl $t1, $t1, 31 \n\t" 1045 "srl $t2, $t2, 31 \n\t" 1046 "srl $t3, $t3, 31 \n\t" 1047 "subu $t4, $zero, %[qc1] \n\t" 1048 "subu $t5, $zero, %[qc2] \n\t" 1049 "subu $t6, $zero, %[qc3] \n\t" 1050 "subu $t7, $zero, %[qc4] \n\t" 1051 "movn %[qc1], $t4, $t0 \n\t" 1052 "movn %[qc2], $t5, $t1 \n\t" 1053 "movn %[qc3], $t6, $t2 \n\t" 1054 "movn %[qc4], $t7, $t3 \n\t" 1055 1056 ".set pop \n\t" 1057 1058 : [qc1]"+r"(qc1), [qc2]"+r"(qc2), 1059 [qc3]"+r"(qc3), [qc4]"+r"(qc4) 1060 : [in_int]"r"(in_int) 1061 : "t0", "t1", "t2", "t3", 1062 "t4", "t5", "t6", "t7", 1063 "memory" 1064 ); 1065 1066 curidx = 9 * qc1; 1067 curidx += qc2 + 40; 1068 1069 curidx2 = 9 * qc3; 1070 curidx2 += qc4 + 40; 1071 1072 curbits += p_bits[curidx] + p_bits[curidx2]; 1073 } 1074 return curbits; 1075} 1076 1077static float get_band_numbits_UPAIR7_mips(struct AACEncContext *s, 1078 PutBitContext *pb, const float *in, 1079 const float *scaled, int size, int scale_idx, 1080 int cb, const float lambda, const float uplim, 1081 int *bits) 1082{ 1083 const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512]; 1084 int i; 1085 int qc1, qc2, qc3, qc4; 1086 int curbits = 0; 1087 1088 uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1]; 1089 1090 for (i = 0; i < size; i += 4) { 1091 int curidx, curidx2; 1092 1093 qc1 = scaled[i ] * Q34 + 0.4054f; 1094 qc2 = scaled[i+1] * Q34 + 0.4054f; 1095 qc3 = scaled[i+2] * Q34 + 0.4054f; 1096 qc4 = scaled[i+3] * Q34 + 0.4054f; 1097 1098 __asm__ volatile ( 1099 ".set push \n\t" 1100 ".set noreorder \n\t" 1101 1102 "ori $t4, $zero, 7 \n\t" 1103 "slt $t0, $t4, %[qc1] \n\t" 1104 "slt $t1, $t4, %[qc2] \n\t" 1105 "slt $t2, $t4, %[qc3] \n\t" 1106 "slt $t3, $t4, %[qc4] \n\t" 1107 "movn %[qc1], $t4, $t0 \n\t" 1108 "movn %[qc2], $t4, $t1 \n\t" 1109 "movn %[qc3], $t4, $t2 \n\t" 1110 "movn %[qc4], $t4, $t3 \n\t" 1111 1112 ".set pop 
\n\t" 1113 1114 : [qc1]"+r"(qc1), [qc2]"+r"(qc2), 1115 [qc3]"+r"(qc3), [qc4]"+r"(qc4) 1116 : 1117 : "t0", "t1", "t2", "t3", "t4" 1118 ); 1119 1120 curidx = 8 * qc1; 1121 curidx += qc2; 1122 1123 curidx2 = 8 * qc3; 1124 curidx2 += qc4; 1125 1126 curbits += p_bits[curidx] + 1127 upair7_sign_bits[curidx] + 1128 p_bits[curidx2] + 1129 upair7_sign_bits[curidx2]; 1130 } 1131 return curbits; 1132} 1133 1134static float get_band_numbits_UPAIR12_mips(struct AACEncContext *s, 1135 PutBitContext *pb, const float *in, 1136 const float *scaled, int size, int scale_idx, 1137 int cb, const float lambda, const float uplim, 1138 int *bits) 1139{ 1140 const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512]; 1141 int i; 1142 int qc1, qc2, qc3, qc4; 1143 int curbits = 0; 1144 1145 uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1]; 1146 1147 for (i = 0; i < size; i += 4) { 1148 int curidx, curidx2; 1149 1150 qc1 = scaled[i ] * Q34 + 0.4054f; 1151 qc2 = scaled[i+1] * Q34 + 0.4054f; 1152 qc3 = scaled[i+2] * Q34 + 0.4054f; 1153 qc4 = scaled[i+3] * Q34 + 0.4054f; 1154 1155 __asm__ volatile ( 1156 ".set push \n\t" 1157 ".set noreorder \n\t" 1158 1159 "ori $t4, $zero, 12 \n\t" 1160 "slt $t0, $t4, %[qc1] \n\t" 1161 "slt $t1, $t4, %[qc2] \n\t" 1162 "slt $t2, $t4, %[qc3] \n\t" 1163 "slt $t3, $t4, %[qc4] \n\t" 1164 "movn %[qc1], $t4, $t0 \n\t" 1165 "movn %[qc2], $t4, $t1 \n\t" 1166 "movn %[qc3], $t4, $t2 \n\t" 1167 "movn %[qc4], $t4, $t3 \n\t" 1168 1169 ".set pop \n\t" 1170 1171 : [qc1]"+r"(qc1), [qc2]"+r"(qc2), 1172 [qc3]"+r"(qc3), [qc4]"+r"(qc4) 1173 : 1174 : "t0", "t1", "t2", "t3", "t4" 1175 ); 1176 1177 curidx = 13 * qc1; 1178 curidx += qc2; 1179 1180 curidx2 = 13 * qc3; 1181 curidx2 += qc4; 1182 1183 curbits += p_bits[curidx] + 1184 p_bits[curidx2] + 1185 upair12_sign_bits[curidx] + 1186 upair12_sign_bits[curidx2]; 1187 } 1188 return curbits; 1189} 1190 1191static float get_band_numbits_ESC_mips(struct AACEncContext *s, 1192 PutBitContext *pb, 
const float *in, 1193 const float *scaled, int size, int scale_idx, 1194 int cb, const float lambda, const float uplim, 1195 int *bits) 1196{ 1197 const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512]; 1198 int i; 1199 int qc1, qc2, qc3, qc4; 1200 int curbits = 0; 1201 1202 uint8_t *p_bits = (uint8_t*)ff_aac_spectral_bits[cb-1]; 1203 1204 for (i = 0; i < size; i += 4) { 1205 int curidx, curidx2; 1206 int cond0, cond1, cond2, cond3; 1207 int c1, c2, c3, c4; 1208 1209 qc1 = scaled[i ] * Q34 + 0.4054f; 1210 qc2 = scaled[i+1] * Q34 + 0.4054f; 1211 qc3 = scaled[i+2] * Q34 + 0.4054f; 1212 qc4 = scaled[i+3] * Q34 + 0.4054f; 1213 1214 __asm__ volatile ( 1215 ".set push \n\t" 1216 ".set noreorder \n\t" 1217 1218 "ori $t4, $zero, 15 \n\t" 1219 "ori $t5, $zero, 16 \n\t" 1220 "shll_s.w %[c1], %[qc1], 18 \n\t" 1221 "shll_s.w %[c2], %[qc2], 18 \n\t" 1222 "shll_s.w %[c3], %[qc3], 18 \n\t" 1223 "shll_s.w %[c4], %[qc4], 18 \n\t" 1224 "srl %[c1], %[c1], 18 \n\t" 1225 "srl %[c2], %[c2], 18 \n\t" 1226 "srl %[c3], %[c3], 18 \n\t" 1227 "srl %[c4], %[c4], 18 \n\t" 1228 "slt %[cond0], $t4, %[qc1] \n\t" 1229 "slt %[cond1], $t4, %[qc2] \n\t" 1230 "slt %[cond2], $t4, %[qc3] \n\t" 1231 "slt %[cond3], $t4, %[qc4] \n\t" 1232 "movn %[qc1], $t5, %[cond0] \n\t" 1233 "movn %[qc2], $t5, %[cond1] \n\t" 1234 "movn %[qc3], $t5, %[cond2] \n\t" 1235 "movn %[qc4], $t5, %[cond3] \n\t" 1236 "ori $t5, $zero, 31 \n\t" 1237 "clz %[c1], %[c1] \n\t" 1238 "clz %[c2], %[c2] \n\t" 1239 "clz %[c3], %[c3] \n\t" 1240 "clz %[c4], %[c4] \n\t" 1241 "subu %[c1], $t5, %[c1] \n\t" 1242 "subu %[c2], $t5, %[c2] \n\t" 1243 "subu %[c3], $t5, %[c3] \n\t" 1244 "subu %[c4], $t5, %[c4] \n\t" 1245 "sll %[c1], %[c1], 1 \n\t" 1246 "sll %[c2], %[c2], 1 \n\t" 1247 "sll %[c3], %[c3], 1 \n\t" 1248 "sll %[c4], %[c4], 1 \n\t" 1249 "addiu %[c1], %[c1], -3 \n\t" 1250 "addiu %[c2], %[c2], -3 \n\t" 1251 "addiu %[c3], %[c3], -3 \n\t" 1252 "addiu %[c4], %[c4], -3 \n\t" 1253 "subu %[cond0], $zero, %[cond0] 
\n\t" 1254 "subu %[cond1], $zero, %[cond1] \n\t" 1255 "subu %[cond2], $zero, %[cond2] \n\t" 1256 "subu %[cond3], $zero, %[cond3] \n\t" 1257 "and %[c1], %[c1], %[cond0] \n\t" 1258 "and %[c2], %[c2], %[cond1] \n\t" 1259 "and %[c3], %[c3], %[cond2] \n\t" 1260 "and %[c4], %[c4], %[cond3] \n\t" 1261 1262 ".set pop \n\t" 1263 1264 : [qc1]"+r"(qc1), [qc2]"+r"(qc2), 1265 [qc3]"+r"(qc3), [qc4]"+r"(qc4), 1266 [cond0]"=&r"(cond0), [cond1]"=&r"(cond1), 1267 [cond2]"=&r"(cond2), [cond3]"=&r"(cond3), 1268 [c1]"=&r"(c1), [c2]"=&r"(c2), 1269 [c3]"=&r"(c3), [c4]"=&r"(c4) 1270 : 1271 : "t4", "t5" 1272 ); 1273 1274 curidx = 17 * qc1; 1275 curidx += qc2; 1276 1277 curidx2 = 17 * qc3; 1278 curidx2 += qc4; 1279 1280 curbits += p_bits[curidx]; 1281 curbits += esc_sign_bits[curidx]; 1282 curbits += p_bits[curidx2]; 1283 curbits += esc_sign_bits[curidx2]; 1284 1285 curbits += c1; 1286 curbits += c2; 1287 curbits += c3; 1288 curbits += c4; 1289 } 1290 return curbits; 1291} 1292 1293static float (*const get_band_numbits_arr[])(struct AACEncContext *s, 1294 PutBitContext *pb, const float *in, 1295 const float *scaled, int size, int scale_idx, 1296 int cb, const float lambda, const float uplim, 1297 int *bits) = { 1298 get_band_numbits_ZERO_mips, 1299 get_band_numbits_SQUAD_mips, 1300 get_band_numbits_SQUAD_mips, 1301 get_band_numbits_UQUAD_mips, 1302 get_band_numbits_UQUAD_mips, 1303 get_band_numbits_SPAIR_mips, 1304 get_band_numbits_SPAIR_mips, 1305 get_band_numbits_UPAIR7_mips, 1306 get_band_numbits_UPAIR7_mips, 1307 get_band_numbits_UPAIR12_mips, 1308 get_band_numbits_UPAIR12_mips, 1309 get_band_numbits_ESC_mips, 1310}; 1311 1312#define get_band_numbits( \ 1313 s, pb, in, scaled, size, scale_idx, cb, \ 1314 lambda, uplim, bits) \ 1315 get_band_numbits_arr[cb]( \ 1316 s, pb, in, scaled, size, scale_idx, cb, \ 1317 lambda, uplim, bits) 1318 1319static float quantize_band_cost_bits(struct AACEncContext *s, const float *in, 1320 const float *scaled, int size, int scale_idx, 1321 int cb, const 
float lambda, const float uplim, 1322 int *bits) 1323{ 1324 return get_band_numbits(s, NULL, in, scaled, size, scale_idx, cb, lambda, uplim, bits); 1325} 1326 1327/** 1328 * Functions developed from template function and optimized for getting the band cost 1329 */ 1330#if HAVE_MIPSFPU 1331static float get_band_cost_ZERO_mips(struct AACEncContext *s, 1332 PutBitContext *pb, const float *in, 1333 const float *scaled, int size, int scale_idx, 1334 int cb, const float lambda, const float uplim, 1335 int *bits) 1336{ 1337 int i; 1338 float cost = 0; 1339 1340 for (i = 0; i < size; i += 4) { 1341 cost += in[i ] * in[i ]; 1342 cost += in[i+1] * in[i+1]; 1343 cost += in[i+2] * in[i+2]; 1344 cost += in[i+3] * in[i+3]; 1345 } 1346 if (bits) 1347 *bits = 0; 1348 return cost * lambda; 1349} 1350 1351static float get_band_cost_SQUAD_mips(struct AACEncContext *s, 1352 PutBitContext *pb, const float *in, 1353 const float *scaled, int size, int scale_idx, 1354 int cb, const float lambda, const float uplim, 1355 int *bits) 1356{ 1357 const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512]; 1358 const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512]; 1359 int i; 1360 float cost = 0; 1361 int qc1, qc2, qc3, qc4; 1362 int curbits = 0; 1363 1364 uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1]; 1365 float *p_codes = (float *)ff_aac_codebook_vectors[cb-1]; 1366 1367 for (i = 0; i < size; i += 4) { 1368 const float *vec; 1369 int curidx; 1370 int *in_int = (int *)&in[i]; 1371 float *in_pos = (float *)&in[i]; 1372 float di0, di1, di2, di3; 1373 1374 qc1 = scaled[i ] * Q34 + 0.4054f; 1375 qc2 = scaled[i+1] * Q34 + 0.4054f; 1376 qc3 = scaled[i+2] * Q34 + 0.4054f; 1377 qc4 = scaled[i+3] * Q34 + 0.4054f; 1378 1379 __asm__ volatile ( 1380 ".set push \n\t" 1381 ".set noreorder \n\t" 1382 1383 "slt %[qc1], $zero, %[qc1] \n\t" 1384 "slt %[qc2], $zero, %[qc2] \n\t" 1385 "slt %[qc3], $zero, %[qc3] \n\t" 1386 "slt 
%[qc4], $zero, %[qc4] \n\t" 1387 "lw $t0, 0(%[in_int]) \n\t" 1388 "lw $t1, 4(%[in_int]) \n\t" 1389 "lw $t2, 8(%[in_int]) \n\t" 1390 "lw $t3, 12(%[in_int]) \n\t" 1391 "srl $t0, $t0, 31 \n\t" 1392 "srl $t1, $t1, 31 \n\t" 1393 "srl $t2, $t2, 31 \n\t" 1394 "srl $t3, $t3, 31 \n\t" 1395 "subu $t4, $zero, %[qc1] \n\t" 1396 "subu $t5, $zero, %[qc2] \n\t" 1397 "subu $t6, $zero, %[qc3] \n\t" 1398 "subu $t7, $zero, %[qc4] \n\t" 1399 "movn %[qc1], $t4, $t0 \n\t" 1400 "movn %[qc2], $t5, $t1 \n\t" 1401 "movn %[qc3], $t6, $t2 \n\t" 1402 "movn %[qc4], $t7, $t3 \n\t" 1403 1404 ".set pop \n\t" 1405 1406 : [qc1]"+r"(qc1), [qc2]"+r"(qc2), 1407 [qc3]"+r"(qc3), [qc4]"+r"(qc4) 1408 : [in_int]"r"(in_int) 1409 : "t0", "t1", "t2", "t3", 1410 "t4", "t5", "t6", "t7", 1411 "memory" 1412 ); 1413 1414 curidx = qc1; 1415 curidx *= 3; 1416 curidx += qc2; 1417 curidx *= 3; 1418 curidx += qc3; 1419 curidx *= 3; 1420 curidx += qc4; 1421 curidx += 40; 1422 1423 curbits += p_bits[curidx]; 1424 vec = &p_codes[curidx*4]; 1425 1426 __asm__ volatile ( 1427 ".set push \n\t" 1428 ".set noreorder \n\t" 1429 1430 "lwc1 $f0, 0(%[in_pos]) \n\t" 1431 "lwc1 $f1, 0(%[vec]) \n\t" 1432 "lwc1 $f2, 4(%[in_pos]) \n\t" 1433 "lwc1 $f3, 4(%[vec]) \n\t" 1434 "lwc1 $f4, 8(%[in_pos]) \n\t" 1435 "lwc1 $f5, 8(%[vec]) \n\t" 1436 "lwc1 $f6, 12(%[in_pos]) \n\t" 1437 "lwc1 $f7, 12(%[vec]) \n\t" 1438 "nmsub.s %[di0], $f0, $f1, %[IQ] \n\t" 1439 "nmsub.s %[di1], $f2, $f3, %[IQ] \n\t" 1440 "nmsub.s %[di2], $f4, $f5, %[IQ] \n\t" 1441 "nmsub.s %[di3], $f6, $f7, %[IQ] \n\t" 1442 1443 ".set pop \n\t" 1444 1445 : [di0]"=&f"(di0), [di1]"=&f"(di1), 1446 [di2]"=&f"(di2), [di3]"=&f"(di3) 1447 : [in_pos]"r"(in_pos), [vec]"r"(vec), 1448 [IQ]"f"(IQ) 1449 : "$f0", "$f1", "$f2", "$f3", 1450 "$f4", "$f5", "$f6", "$f7", 1451 "memory" 1452 ); 1453 1454 cost += di0 * di0 + di1 * di1 1455 + di2 * di2 + di3 * di3; 1456 } 1457 1458 if (bits) 1459 *bits = curbits; 1460 return cost * lambda + curbits; 1461} 1462 1463static float 
get_band_cost_UQUAD_mips(struct AACEncContext *s, 1464 PutBitContext *pb, const float *in, 1465 const float *scaled, int size, int scale_idx, 1466 int cb, const float lambda, const float uplim, 1467 int *bits) 1468{ 1469 const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512]; 1470 const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512]; 1471 int i; 1472 float cost = 0; 1473 int curbits = 0; 1474 int qc1, qc2, qc3, qc4; 1475 1476 uint8_t *p_bits = (uint8_t*)ff_aac_spectral_bits[cb-1]; 1477 float *p_codes = (float *)ff_aac_codebook_vectors[cb-1]; 1478 1479 for (i = 0; i < size; i += 4) { 1480 const float *vec; 1481 int curidx; 1482 float *in_pos = (float *)&in[i]; 1483 float di0, di1, di2, di3; 1484 1485 qc1 = scaled[i ] * Q34 + 0.4054f; 1486 qc2 = scaled[i+1] * Q34 + 0.4054f; 1487 qc3 = scaled[i+2] * Q34 + 0.4054f; 1488 qc4 = scaled[i+3] * Q34 + 0.4054f; 1489 1490 __asm__ volatile ( 1491 ".set push \n\t" 1492 ".set noreorder \n\t" 1493 1494 "ori $t4, $zero, 2 \n\t" 1495 "slt $t0, $t4, %[qc1] \n\t" 1496 "slt $t1, $t4, %[qc2] \n\t" 1497 "slt $t2, $t4, %[qc3] \n\t" 1498 "slt $t3, $t4, %[qc4] \n\t" 1499 "movn %[qc1], $t4, $t0 \n\t" 1500 "movn %[qc2], $t4, $t1 \n\t" 1501 "movn %[qc3], $t4, $t2 \n\t" 1502 "movn %[qc4], $t4, $t3 \n\t" 1503 1504 ".set pop \n\t" 1505 1506 : [qc1]"+r"(qc1), [qc2]"+r"(qc2), 1507 [qc3]"+r"(qc3), [qc4]"+r"(qc4) 1508 : 1509 : "t0", "t1", "t2", "t3", "t4" 1510 ); 1511 1512 curidx = qc1; 1513 curidx *= 3; 1514 curidx += qc2; 1515 curidx *= 3; 1516 curidx += qc3; 1517 curidx *= 3; 1518 curidx += qc4; 1519 1520 curbits += p_bits[curidx]; 1521 curbits += uquad_sign_bits[curidx]; 1522 vec = &p_codes[curidx*4]; 1523 1524 __asm__ volatile ( 1525 ".set push \n\t" 1526 ".set noreorder \n\t" 1527 1528 "lwc1 %[di0], 0(%[in_pos]) \n\t" 1529 "lwc1 %[di1], 4(%[in_pos]) \n\t" 1530 "lwc1 %[di2], 8(%[in_pos]) \n\t" 1531 "lwc1 %[di3], 12(%[in_pos]) \n\t" 1532 "abs.s %[di0], %[di0] \n\t" 
1533 "abs.s %[di1], %[di1] \n\t" 1534 "abs.s %[di2], %[di2] \n\t" 1535 "abs.s %[di3], %[di3] \n\t" 1536 "lwc1 $f0, 0(%[vec]) \n\t" 1537 "lwc1 $f1, 4(%[vec]) \n\t" 1538 "lwc1 $f2, 8(%[vec]) \n\t" 1539 "lwc1 $f3, 12(%[vec]) \n\t" 1540 "nmsub.s %[di0], %[di0], $f0, %[IQ] \n\t" 1541 "nmsub.s %[di1], %[di1], $f1, %[IQ] \n\t" 1542 "nmsub.s %[di2], %[di2], $f2, %[IQ] \n\t" 1543 "nmsub.s %[di3], %[di3], $f3, %[IQ] \n\t" 1544 1545 ".set pop \n\t" 1546 1547 : [di0]"=&f"(di0), [di1]"=&f"(di1), 1548 [di2]"=&f"(di2), [di3]"=&f"(di3) 1549 : [in_pos]"r"(in_pos), [vec]"r"(vec), 1550 [IQ]"f"(IQ) 1551 : "$f0", "$f1", "$f2", "$f3", 1552 "memory" 1553 ); 1554 1555 cost += di0 * di0 + di1 * di1 1556 + di2 * di2 + di3 * di3; 1557 } 1558 1559 if (bits) 1560 *bits = curbits; 1561 return cost * lambda + curbits; 1562} 1563 1564static float get_band_cost_SPAIR_mips(struct AACEncContext *s, 1565 PutBitContext *pb, const float *in, 1566 const float *scaled, int size, int scale_idx, 1567 int cb, const float lambda, const float uplim, 1568 int *bits) 1569{ 1570 const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512]; 1571 const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512]; 1572 int i; 1573 float cost = 0; 1574 int qc1, qc2, qc3, qc4; 1575 int curbits = 0; 1576 1577 uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1]; 1578 float *p_codes = (float *)ff_aac_codebook_vectors[cb-1]; 1579 1580 for (i = 0; i < size; i += 4) { 1581 const float *vec, *vec2; 1582 int curidx, curidx2; 1583 int *in_int = (int *)&in[i]; 1584 float *in_pos = (float *)&in[i]; 1585 float di0, di1, di2, di3; 1586 1587 qc1 = scaled[i ] * Q34 + 0.4054f; 1588 qc2 = scaled[i+1] * Q34 + 0.4054f; 1589 qc3 = scaled[i+2] * Q34 + 0.4054f; 1590 qc4 = scaled[i+3] * Q34 + 0.4054f; 1591 1592 __asm__ volatile ( 1593 ".set push \n\t" 1594 ".set noreorder \n\t" 1595 1596 "ori $t4, $zero, 4 \n\t" 1597 "slt $t0, $t4, %[qc1] \n\t" 1598 "slt $t1, $t4, %[qc2] 
\n\t" 1599 "slt $t2, $t4, %[qc3] \n\t" 1600 "slt $t3, $t4, %[qc4] \n\t" 1601 "movn %[qc1], $t4, $t0 \n\t" 1602 "movn %[qc2], $t4, $t1 \n\t" 1603 "movn %[qc3], $t4, $t2 \n\t" 1604 "movn %[qc4], $t4, $t3 \n\t" 1605 "lw $t0, 0(%[in_int]) \n\t" 1606 "lw $t1, 4(%[in_int]) \n\t" 1607 "lw $t2, 8(%[in_int]) \n\t" 1608 "lw $t3, 12(%[in_int]) \n\t" 1609 "srl $t0, $t0, 31 \n\t" 1610 "srl $t1, $t1, 31 \n\t" 1611 "srl $t2, $t2, 31 \n\t" 1612 "srl $t3, $t3, 31 \n\t" 1613 "subu $t4, $zero, %[qc1] \n\t" 1614 "subu $t5, $zero, %[qc2] \n\t" 1615 "subu $t6, $zero, %[qc3] \n\t" 1616 "subu $t7, $zero, %[qc4] \n\t" 1617 "movn %[qc1], $t4, $t0 \n\t" 1618 "movn %[qc2], $t5, $t1 \n\t" 1619 "movn %[qc3], $t6, $t2 \n\t" 1620 "movn %[qc4], $t7, $t3 \n\t" 1621 1622 ".set pop \n\t" 1623 1624 : [qc1]"+r"(qc1), [qc2]"+r"(qc2), 1625 [qc3]"+r"(qc3), [qc4]"+r"(qc4) 1626 : [in_int]"r"(in_int) 1627 : "t0", "t1", "t2", "t3", 1628 "t4", "t5", "t6", "t7", 1629 "memory" 1630 ); 1631 1632 curidx = 9 * qc1; 1633 curidx += qc2 + 40; 1634 1635 curidx2 = 9 * qc3; 1636 curidx2 += qc4 + 40; 1637 1638 curbits += p_bits[curidx]; 1639 curbits += p_bits[curidx2]; 1640 1641 vec = &p_codes[curidx*2]; 1642 vec2 = &p_codes[curidx2*2]; 1643 1644 __asm__ volatile ( 1645 ".set push \n\t" 1646 ".set noreorder \n\t" 1647 1648 "lwc1 $f0, 0(%[in_pos]) \n\t" 1649 "lwc1 $f1, 0(%[vec]) \n\t" 1650 "lwc1 $f2, 4(%[in_pos]) \n\t" 1651 "lwc1 $f3, 4(%[vec]) \n\t" 1652 "lwc1 $f4, 8(%[in_pos]) \n\t" 1653 "lwc1 $f5, 0(%[vec2]) \n\t" 1654 "lwc1 $f6, 12(%[in_pos]) \n\t" 1655 "lwc1 $f7, 4(%[vec2]) \n\t" 1656 "nmsub.s %[di0], $f0, $f1, %[IQ] \n\t" 1657 "nmsub.s %[di1], $f2, $f3, %[IQ] \n\t" 1658 "nmsub.s %[di2], $f4, $f5, %[IQ] \n\t" 1659 "nmsub.s %[di3], $f6, $f7, %[IQ] \n\t" 1660 1661 ".set pop \n\t" 1662 1663 : [di0]"=&f"(di0), [di1]"=&f"(di1), 1664 [di2]"=&f"(di2), [di3]"=&f"(di3) 1665 : [in_pos]"r"(in_pos), [vec]"r"(vec), 1666 [vec2]"r"(vec2), [IQ]"f"(IQ) 1667 : "$f0", "$f1", "$f2", "$f3", 1668 "$f4", "$f5", "$f6", "$f7", 1669 "memory" 
1670 ); 1671 1672 cost += di0 * di0 + di1 * di1 1673 + di2 * di2 + di3 * di3; 1674 } 1675 1676 if (bits) 1677 *bits = curbits; 1678 return cost * lambda + curbits; 1679} 1680 1681static float get_band_cost_UPAIR7_mips(struct AACEncContext *s, 1682 PutBitContext *pb, const float *in, 1683 const float *scaled, int size, int scale_idx, 1684 int cb, const float lambda, const float uplim, 1685 int *bits) 1686{ 1687 const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512]; 1688 const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512]; 1689 int i; 1690 float cost = 0; 1691 int qc1, qc2, qc3, qc4; 1692 int curbits = 0; 1693 1694 uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1]; 1695 float *p_codes = (float *)ff_aac_codebook_vectors[cb-1]; 1696 1697 for (i = 0; i < size; i += 4) { 1698 const float *vec, *vec2; 1699 int curidx, curidx2, sign1, count1, sign2, count2; 1700 int *in_int = (int *)&in[i]; 1701 float *in_pos = (float *)&in[i]; 1702 float di0, di1, di2, di3; 1703 1704 qc1 = scaled[i ] * Q34 + 0.4054f; 1705 qc2 = scaled[i+1] * Q34 + 0.4054f; 1706 qc3 = scaled[i+2] * Q34 + 0.4054f; 1707 qc4 = scaled[i+3] * Q34 + 0.4054f; 1708 1709 __asm__ volatile ( 1710 ".set push \n\t" 1711 ".set noreorder \n\t" 1712 1713 "ori $t4, $zero, 7 \n\t" 1714 "ori %[sign1], $zero, 0 \n\t" 1715 "ori %[sign2], $zero, 0 \n\t" 1716 "slt $t0, $t4, %[qc1] \n\t" 1717 "slt $t1, $t4, %[qc2] \n\t" 1718 "slt $t2, $t4, %[qc3] \n\t" 1719 "slt $t3, $t4, %[qc4] \n\t" 1720 "movn %[qc1], $t4, $t0 \n\t" 1721 "movn %[qc2], $t4, $t1 \n\t" 1722 "movn %[qc3], $t4, $t2 \n\t" 1723 "movn %[qc4], $t4, $t3 \n\t" 1724 "lw $t0, 0(%[in_int]) \n\t" 1725 "lw $t1, 4(%[in_int]) \n\t" 1726 "lw $t2, 8(%[in_int]) \n\t" 1727 "lw $t3, 12(%[in_int]) \n\t" 1728 "slt $t0, $t0, $zero \n\t" 1729 "movn %[sign1], $t0, %[qc1] \n\t" 1730 "slt $t2, $t2, $zero \n\t" 1731 "movn %[sign2], $t2, %[qc3] \n\t" 1732 "slt $t1, $t1, $zero \n\t" 1733 "sll $t0, 
%[sign1], 1 \n\t" 1734 "or $t0, $t0, $t1 \n\t" 1735 "movn %[sign1], $t0, %[qc2] \n\t" 1736 "slt $t3, $t3, $zero \n\t" 1737 "sll $t0, %[sign2], 1 \n\t" 1738 "or $t0, $t0, $t3 \n\t" 1739 "movn %[sign2], $t0, %[qc4] \n\t" 1740 "slt %[count1], $zero, %[qc1] \n\t" 1741 "slt $t1, $zero, %[qc2] \n\t" 1742 "slt %[count2], $zero, %[qc3] \n\t" 1743 "slt $t2, $zero, %[qc4] \n\t" 1744 "addu %[count1], %[count1], $t1 \n\t" 1745 "addu %[count2], %[count2], $t2 \n\t" 1746 1747 ".set pop \n\t" 1748 1749 : [qc1]"+r"(qc1), [qc2]"+r"(qc2), 1750 [qc3]"+r"(qc3), [qc4]"+r"(qc4), 1751 [sign1]"=&r"(sign1), [count1]"=&r"(count1), 1752 [sign2]"=&r"(sign2), [count2]"=&r"(count2) 1753 : [in_int]"r"(in_int) 1754 : "t0", "t1", "t2", "t3", "t4", 1755 "memory" 1756 ); 1757 1758 curidx = 8 * qc1; 1759 curidx += qc2; 1760 1761 curidx2 = 8 * qc3; 1762 curidx2 += qc4; 1763 1764 curbits += p_bits[curidx]; 1765 curbits += upair7_sign_bits[curidx]; 1766 vec = &p_codes[curidx*2]; 1767 1768 curbits += p_bits[curidx2]; 1769 curbits += upair7_sign_bits[curidx2]; 1770 vec2 = &p_codes[curidx2*2]; 1771 1772 __asm__ volatile ( 1773 ".set push \n\t" 1774 ".set noreorder \n\t" 1775 1776 "lwc1 %[di0], 0(%[in_pos]) \n\t" 1777 "lwc1 %[di1], 4(%[in_pos]) \n\t" 1778 "lwc1 %[di2], 8(%[in_pos]) \n\t" 1779 "lwc1 %[di3], 12(%[in_pos]) \n\t" 1780 "abs.s %[di0], %[di0] \n\t" 1781 "abs.s %[di1], %[di1] \n\t" 1782 "abs.s %[di2], %[di2] \n\t" 1783 "abs.s %[di3], %[di3] \n\t" 1784 "lwc1 $f0, 0(%[vec]) \n\t" 1785 "lwc1 $f1, 4(%[vec]) \n\t" 1786 "lwc1 $f2, 0(%[vec2]) \n\t" 1787 "lwc1 $f3, 4(%[vec2]) \n\t" 1788 "nmsub.s %[di0], %[di0], $f0, %[IQ] \n\t" 1789 "nmsub.s %[di1], %[di1], $f1, %[IQ] \n\t" 1790 "nmsub.s %[di2], %[di2], $f2, %[IQ] \n\t" 1791 "nmsub.s %[di3], %[di3], $f3, %[IQ] \n\t" 1792 1793 ".set pop \n\t" 1794 1795 : [di0]"=&f"(di0), [di1]"=&f"(di1), 1796 [di2]"=&f"(di2), [di3]"=&f"(di3) 1797 : [in_pos]"r"(in_pos), [vec]"r"(vec), 1798 [vec2]"r"(vec2), [IQ]"f"(IQ) 1799 : "$f0", "$f1", "$f2", "$f3", 1800 "memory" 1801 ); 
1802 1803 cost += di0 * di0 + di1 * di1 1804 + di2 * di2 + di3 * di3; 1805 } 1806 1807 if (bits) 1808 *bits = curbits; 1809 return cost * lambda + curbits; 1810} 1811 1812static float get_band_cost_UPAIR12_mips(struct AACEncContext *s, 1813 PutBitContext *pb, const float *in, 1814 const float *scaled, int size, int scale_idx, 1815 int cb, const float lambda, const float uplim, 1816 int *bits) 1817{ 1818 const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512]; 1819 const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512]; 1820 int i; 1821 float cost = 0; 1822 int qc1, qc2, qc3, qc4; 1823 int curbits = 0; 1824 1825 uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1]; 1826 float *p_codes = (float *)ff_aac_codebook_vectors[cb-1]; 1827 1828 for (i = 0; i < size; i += 4) { 1829 const float *vec, *vec2; 1830 int curidx, curidx2; 1831 int sign1, count1, sign2, count2; 1832 int *in_int = (int *)&in[i]; 1833 float *in_pos = (float *)&in[i]; 1834 float di0, di1, di2, di3; 1835 1836 qc1 = scaled[i ] * Q34 + 0.4054f; 1837 qc2 = scaled[i+1] * Q34 + 0.4054f; 1838 qc3 = scaled[i+2] * Q34 + 0.4054f; 1839 qc4 = scaled[i+3] * Q34 + 0.4054f; 1840 1841 __asm__ volatile ( 1842 ".set push \n\t" 1843 ".set noreorder \n\t" 1844 1845 "ori $t4, $zero, 12 \n\t" 1846 "ori %[sign1], $zero, 0 \n\t" 1847 "ori %[sign2], $zero, 0 \n\t" 1848 "slt $t0, $t4, %[qc1] \n\t" 1849 "slt $t1, $t4, %[qc2] \n\t" 1850 "slt $t2, $t4, %[qc3] \n\t" 1851 "slt $t3, $t4, %[qc4] \n\t" 1852 "movn %[qc1], $t4, $t0 \n\t" 1853 "movn %[qc2], $t4, $t1 \n\t" 1854 "movn %[qc3], $t4, $t2 \n\t" 1855 "movn %[qc4], $t4, $t3 \n\t" 1856 "lw $t0, 0(%[in_int]) \n\t" 1857 "lw $t1, 4(%[in_int]) \n\t" 1858 "lw $t2, 8(%[in_int]) \n\t" 1859 "lw $t3, 12(%[in_int]) \n\t" 1860 "slt $t0, $t0, $zero \n\t" 1861 "movn %[sign1], $t0, %[qc1] \n\t" 1862 "slt $t2, $t2, $zero \n\t" 1863 "movn %[sign2], $t2, %[qc3] \n\t" 1864 "slt $t1, $t1, $zero \n\t" 1865 "sll $t0, 
%[sign1], 1 \n\t" 1866 "or $t0, $t0, $t1 \n\t" 1867 "movn %[sign1], $t0, %[qc2] \n\t" 1868 "slt $t3, $t3, $zero \n\t" 1869 "sll $t0, %[sign2], 1 \n\t" 1870 "or $t0, $t0, $t3 \n\t" 1871 "movn %[sign2], $t0, %[qc4] \n\t" 1872 "slt %[count1], $zero, %[qc1] \n\t" 1873 "slt $t1, $zero, %[qc2] \n\t" 1874 "slt %[count2], $zero, %[qc3] \n\t" 1875 "slt $t2, $zero, %[qc4] \n\t" 1876 "addu %[count1], %[count1], $t1 \n\t" 1877 "addu %[count2], %[count2], $t2 \n\t" 1878 1879 ".set pop \n\t" 1880 1881 : [qc1]"+r"(qc1), [qc2]"+r"(qc2), 1882 [qc3]"+r"(qc3), [qc4]"+r"(qc4), 1883 [sign1]"=&r"(sign1), [count1]"=&r"(count1), 1884 [sign2]"=&r"(sign2), [count2]"=&r"(count2) 1885 : [in_int]"r"(in_int) 1886 : "t0", "t1", "t2", "t3", "t4", 1887 "memory" 1888 ); 1889 1890 curidx = 13 * qc1; 1891 curidx += qc2; 1892 1893 curidx2 = 13 * qc3; 1894 curidx2 += qc4; 1895 1896 curbits += p_bits[curidx]; 1897 curbits += p_bits[curidx2]; 1898 curbits += upair12_sign_bits[curidx]; 1899 curbits += upair12_sign_bits[curidx2]; 1900 vec = &p_codes[curidx*2]; 1901 vec2 = &p_codes[curidx2*2]; 1902 1903 __asm__ volatile ( 1904 ".set push \n\t" 1905 ".set noreorder \n\t" 1906 1907 "lwc1 %[di0], 0(%[in_pos]) \n\t" 1908 "lwc1 %[di1], 4(%[in_pos]) \n\t" 1909 "lwc1 %[di2], 8(%[in_pos]) \n\t" 1910 "lwc1 %[di3], 12(%[in_pos]) \n\t" 1911 "abs.s %[di0], %[di0] \n\t" 1912 "abs.s %[di1], %[di1] \n\t" 1913 "abs.s %[di2], %[di2] \n\t" 1914 "abs.s %[di3], %[di3] \n\t" 1915 "lwc1 $f0, 0(%[vec]) \n\t" 1916 "lwc1 $f1, 4(%[vec]) \n\t" 1917 "lwc1 $f2, 0(%[vec2]) \n\t" 1918 "lwc1 $f3, 4(%[vec2]) \n\t" 1919 "nmsub.s %[di0], %[di0], $f0, %[IQ] \n\t" 1920 "nmsub.s %[di1], %[di1], $f1, %[IQ] \n\t" 1921 "nmsub.s %[di2], %[di2], $f2, %[IQ] \n\t" 1922 "nmsub.s %[di3], %[di3], $f3, %[IQ] \n\t" 1923 1924 ".set pop \n\t" 1925 1926 : [di0]"=&f"(di0), [di1]"=&f"(di1), 1927 [di2]"=&f"(di2), [di3]"=&f"(di3) 1928 : [in_pos]"r"(in_pos), [vec]"r"(vec), 1929 [vec2]"r"(vec2), [IQ]"f"(IQ) 1930 : "$f0", "$f1", "$f2", "$f3", 1931 "memory" 1932 ); 
1933 1934 cost += di0 * di0 + di1 * di1 1935 + di2 * di2 + di3 * di3; 1936 } 1937 1938 if (bits) 1939 *bits = curbits; 1940 return cost * lambda + curbits; 1941} 1942
/**
 * Rate/distortion cost of one band for the last entry of the cost dispatch
 * table, where values above 15 are coded through index 16 plus extra bits.
 *
 * Per group of four coefficients:
 *  - qc = (int)(scaled * Q34 + 0.4054f);
 *  - the assembly derives c (a limited copy of qc), cond (1 where qc > 15)
 *    and replaces qc by 16 where cond is set;
 *  - each pair is indexed as 17 * first + second into
 *    ff_aac_spectral_bits[cb-1], esc_sign_bits and the codebook vectors;
 *  - for coefficients with cond set, (av_log2(c) * 2 - 3) extra bits are added;
 *  - the distortion is the squared difference between |in| and the
 *    reconstructed magnitude.
 *
 * @param in        original coefficients (only their absolute value is used)
 * @param scaled    values to quantize
 * @param size      number of coefficients; processed four at a time
 * @param scale_idx selects Q34 and IQ from ff_aac_pow34sf_tab / ff_aac_pow2sf_tab
 * @param cb        codebook number; cb-1 selects the table rows
 * @param lambda    weight applied to the accumulated distortion
 * @param bits      if non-NULL, receives the bit count
 * @return distortion * lambda + bit count
 *
 * s, pb and uplim are not used.
 */
static float get_band_cost_ESC_mips(struct AACEncContext *s,
                                    PutBitContext *pb, const float *in,
                                    const float *scaled, int size, int scale_idx,
                                    int cb, const float lambda, const float uplim,
                                    int *bits)
{
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    const float IQ = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
    /* magnitude used in place of the reconstruction when |in| is at least this large */
    const float CLIPPED_ESCAPE = 165140.0f * IQ;
    int i;
    float cost = 0;
    int qc1, qc2, qc3, qc4;
    int curbits = 0;

    uint8_t *p_bits = (uint8_t*)ff_aac_spectral_bits[cb-1];
    float *p_codes = (float* )ff_aac_codebook_vectors[cb-1];

    for (i = 0; i < size; i += 4) {
        const float *vec, *vec2;
        int curidx, curidx2;
        float t1, t2, t3, t4;
        float di1, di2, di3, di4;
        int cond0, cond1, cond2, cond3;
        int c1, c2, c3, c4;

        /* float -> int conversion truncates the biased product */
        qc1 = scaled[i  ] * Q34 + 0.4054f;
        qc2 = scaled[i+1] * Q34 + 0.4054f;
        qc3 = scaled[i+2] * Q34 + 0.4054f;
        qc4 = scaled[i+3] * Q34 + 0.4054f;

        __asm__ volatile (
            ".set push \n\t"
            ".set noreorder \n\t"

            /* $t4 = 15 (comparison limit), $t5 = 16 (replacement index) */
            "ori $t4, $zero, 15 \n\t"
            "ori $t5, $zero, 16 \n\t"
            /*
             * c = qc shifted left by 18 with shll_s.w, then logically right
             * by 18. NOTE(review): shll_s.w is the DSP ASE saturating shift,
             * so this looks like a cap of c at 8191 -- confirm against the
             * DSP ASE manual.
             */
            "shll_s.w %[c1], %[qc1], 18 \n\t"
            "shll_s.w %[c2], %[qc2], 18 \n\t"
            "shll_s.w %[c3], %[qc3], 18 \n\t"
            "shll_s.w %[c4], %[qc4], 18 \n\t"
            "srl %[c1], %[c1], 18 \n\t"
            "srl %[c2], %[c2], 18 \n\t"
            "srl %[c3], %[c3], 18 \n\t"
            "srl %[c4], %[c4], 18 \n\t"
            /* cond = 1 where qc is greater than 15 */
            "slt %[cond0], $t4, %[qc1] \n\t"
            "slt %[cond1], $t4, %[qc2] \n\t"
            "slt %[cond2], $t4, %[qc3] \n\t"
            "slt %[cond3], $t4, %[qc4] \n\t"
            /* those values are indexed as 16 in the tables */
            "movn %[qc1], $t5, %[cond0] \n\t"
            "movn %[qc2], $t5, %[cond1] \n\t"
            "movn %[qc3], $t5, %[cond2] \n\t"
            "movn %[qc4], $t5, %[cond3] \n\t"

            ".set pop \n\t"

            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
              [cond0]"=&r"(cond0), [cond1]"=&r"(cond1),
              [cond2]"=&r"(cond2), [cond3]"=&r"(cond3),
              [c1]"=&r"(c1), [c2]"=&r"(c2),
              [c3]"=&r"(c3), [c4]"=&r"(c4)
            :
            : "t4", "t5"
        );

        /* 17 possible values (0..16) per element -> row-major pair index */
        curidx = 17 * qc1;
        curidx += qc2;

        curidx2 = 17 * qc3;
        curidx2 += qc4;

        curbits += p_bits[curidx];
        curbits += esc_sign_bits[curidx];
        vec = &p_codes[curidx*2];

        curbits += p_bits[curidx2];
        curbits += esc_sign_bits[curidx2];
        vec2 = &p_codes[curidx2*2];

        /* -cond is all ones when cond is 1, zero otherwise: the term only
         * contributes for coefficients that exceeded 15 */
        curbits += (av_log2(c1) * 2 - 3) & (-cond0);
        curbits += (av_log2(c2) * 2 - 3) & (-cond1);
        curbits += (av_log2(c3) * 2 - 3) & (-cond2);
        curbits += (av_log2(c4) * 2 - 3) & (-cond3);

        t1 = fabsf(in[i  ]);
        t2 = fabsf(in[i+1]);
        t3 = fabsf(in[i+2]);
        t4 = fabsf(in[i+3]);

        /* reconstruction: c * cbrtf(c) * IQ for values above 15 (or
         * CLIPPED_ESCAPE for very large inputs), codebook vector * IQ otherwise */
        if (cond0) {
            if (t1 >= CLIPPED_ESCAPE) {
                di1 = t1 - CLIPPED_ESCAPE;
            } else {
                di1 = t1 - c1 * cbrtf(c1) * IQ;
            }
        } else
            di1 = t1 - vec[0] * IQ;

        if (cond1) {
            if (t2 >= CLIPPED_ESCAPE) {
                di2 = t2 - CLIPPED_ESCAPE;
            } else {
                di2 = t2 - c2 * cbrtf(c2) * IQ;
            }
        } else
            di2 = t2 - vec[1] * IQ;

        if (cond2) {
            if (t3 >= CLIPPED_ESCAPE) {
                di3 = t3 - CLIPPED_ESCAPE;
            } else {
                di3 = t3 - c3 * cbrtf(c3) * IQ;
            }
        } else
            di3 = t3 - vec2[0] * IQ;

        if (cond3) {
            if (t4 >= CLIPPED_ESCAPE) {
                di4 = t4 - CLIPPED_ESCAPE;
            } else {
                di4 = t4 - c4 * cbrtf(c4) * IQ;
            }
        } else
            di4 = t4 - vec2[1]*IQ;

        cost += di1 * di1 + di2 * di2
                + di3 * di3 + di4 * di4;
    }

    if (bits)
        *bits = curbits;
    return cost * lambda + curbits;
}
2076 2077static float (*const get_band_cost_arr[])(struct AACEncContext *s, 2078 PutBitContext *pb, const float *in, 2079 const float *scaled, int size, int scale_idx,
2080 int cb, const float lambda, const float uplim, 2081 int *bits) = { 2082 get_band_cost_ZERO_mips, 2083 get_band_cost_SQUAD_mips, 2084 get_band_cost_SQUAD_mips, 2085 get_band_cost_UQUAD_mips, 2086 get_band_cost_UQUAD_mips, 2087 get_band_cost_SPAIR_mips, 2088 get_band_cost_SPAIR_mips, 2089 get_band_cost_UPAIR7_mips, 2090 get_band_cost_UPAIR7_mips, 2091 get_band_cost_UPAIR12_mips, 2092 get_band_cost_UPAIR12_mips, 2093 get_band_cost_ESC_mips, 2094}; 2095 2096#define get_band_cost( \ 2097 s, pb, in, scaled, size, scale_idx, cb, \ 2098 lambda, uplim, bits) \ 2099 get_band_cost_arr[cb]( \ 2100 s, pb, in, scaled, size, scale_idx, cb, \ 2101 lambda, uplim, bits) 2102 2103static float quantize_band_cost(struct AACEncContext *s, const float *in, 2104 const float *scaled, int size, int scale_idx, 2105 int cb, const float lambda, const float uplim, 2106 int *bits) 2107{ 2108 return get_band_cost(s, NULL, in, scaled, size, scale_idx, cb, lambda, uplim, bits); 2109} 2110 2111static void search_for_quantizers_twoloop_mips(AVCodecContext *avctx, 2112 AACEncContext *s, 2113 SingleChannelElement *sce, 2114 const float lambda) 2115{ 2116 int start = 0, i, w, w2, g; 2117 int destbits = avctx->bit_rate * 1024.0 / avctx->sample_rate / avctx->channels; 2118 float dists[128] = { 0 }, uplims[128]; 2119 float maxvals[128]; 2120 int fflag, minscaler; 2121 int its = 0; 2122 int allz = 0; 2123 float minthr = INFINITY; 2124 2125 destbits = FFMIN(destbits, 5800); 2126 for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) { 2127 for (g = 0; g < sce->ics.num_swb; g++) { 2128 int nz = 0; 2129 float uplim = 0.0f; 2130 for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) { 2131 FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[(w+w2)*16+g]; 2132 uplim += band->threshold; 2133 if (band->energy <= band->threshold || band->threshold == 0.0f) { 2134 sce->zeroes[(w+w2)*16+g] = 1; 2135 continue; 2136 } 2137 nz = 1; 2138 } 2139 uplims[w*16+g] = uplim *512; 2140 sce->zeroes[w*16+g] = !nz; 2141 
if (nz) 2142 minthr = FFMIN(minthr, uplim); 2143 allz |= nz; 2144 } 2145 } 2146 for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) { 2147 for (g = 0; g < sce->ics.num_swb; g++) { 2148 if (sce->zeroes[w*16+g]) { 2149 sce->sf_idx[w*16+g] = SCALE_ONE_POS; 2150 continue; 2151 } 2152 sce->sf_idx[w*16+g] = SCALE_ONE_POS + FFMIN(log2f(uplims[w*16+g]/minthr)*4,59); 2153 } 2154 } 2155 2156 if (!allz) 2157 return; 2158 abs_pow34_v(s->scoefs, sce->coeffs, 1024); 2159 2160 for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) { 2161 start = w*128; 2162 for (g = 0; g < sce->ics.num_swb; g++) { 2163 const float *scaled = s->scoefs + start; 2164 maxvals[w*16+g] = find_max_val(sce->ics.group_len[w], sce->ics.swb_sizes[g], scaled); 2165 start += sce->ics.swb_sizes[g]; 2166 } 2167 } 2168 2169 do { 2170 int tbits, qstep; 2171 minscaler = sce->sf_idx[0]; 2172 qstep = its ? 1 : 32; 2173 do { 2174 int prev = -1; 2175 tbits = 0; 2176 fflag = 0; 2177 2178 if (qstep > 1) { 2179 for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) { 2180 start = w*128; 2181 for (g = 0; g < sce->ics.num_swb; g++) { 2182 const float *coefs = sce->coeffs + start; 2183 const float *scaled = s->scoefs + start; 2184 int bits = 0; 2185 int cb; 2186 2187 if (sce->zeroes[w*16+g] || sce->sf_idx[w*16+g] >= 218) { 2188 start += sce->ics.swb_sizes[g]; 2189 continue; 2190 } 2191 minscaler = FFMIN(minscaler, sce->sf_idx[w*16+g]); 2192 cb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]); 2193 for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) { 2194 int b; 2195 bits += quantize_band_cost_bits(s, coefs + w2*128, 2196 scaled + w2*128, 2197 sce->ics.swb_sizes[g], 2198 sce->sf_idx[w*16+g], 2199 cb, 2200 1.0f, 2201 INFINITY, 2202 &b); 2203 } 2204 if (prev != -1) { 2205 bits += ff_aac_scalefactor_bits[sce->sf_idx[w*16+g] - prev + SCALE_DIFF_ZERO]; 2206 } 2207 tbits += bits; 2208 start += sce->ics.swb_sizes[g]; 2209 prev = sce->sf_idx[w*16+g]; 2210 } 2211 } 2212 } 2213 else { 2214 for (w = 
0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) { 2215 start = w*128; 2216 for (g = 0; g < sce->ics.num_swb; g++) { 2217 const float *coefs = sce->coeffs + start; 2218 const float *scaled = s->scoefs + start; 2219 int bits = 0; 2220 int cb; 2221 float dist = 0.0f; 2222 2223 if (sce->zeroes[w*16+g] || sce->sf_idx[w*16+g] >= 218) { 2224 start += sce->ics.swb_sizes[g]; 2225 continue; 2226 } 2227 minscaler = FFMIN(minscaler, sce->sf_idx[w*16+g]); 2228 cb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]); 2229 for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) { 2230 int b; 2231 dist += quantize_band_cost(s, coefs + w2*128, 2232 scaled + w2*128, 2233 sce->ics.swb_sizes[g], 2234 sce->sf_idx[w*16+g], 2235 cb, 2236 1.0f, 2237 INFINITY, 2238 &b); 2239 bits += b; 2240 } 2241 dists[w*16+g] = dist - bits; 2242 if (prev != -1) { 2243 bits += ff_aac_scalefactor_bits[sce->sf_idx[w*16+g] - prev + SCALE_DIFF_ZERO]; 2244 } 2245 tbits += bits; 2246 start += sce->ics.swb_sizes[g]; 2247 prev = sce->sf_idx[w*16+g]; 2248 } 2249 } 2250 } 2251 if (tbits > destbits) { 2252 for (i = 0; i < 128; i++) 2253 if (sce->sf_idx[i] < 218 - qstep) 2254 sce->sf_idx[i] += qstep; 2255 } else { 2256 for (i = 0; i < 128; i++) 2257 if (sce->sf_idx[i] > 60 - qstep) 2258 sce->sf_idx[i] -= qstep; 2259 } 2260 qstep >>= 1; 2261 if (!qstep && tbits > destbits*1.02 && sce->sf_idx[0] < 217) 2262 qstep = 1; 2263 } while (qstep); 2264 2265 fflag = 0; 2266 minscaler = av_clip(minscaler, 60, 255 - SCALE_MAX_DIFF); 2267 for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) { 2268 for (g = 0; g < sce->ics.num_swb; g++) { 2269 int prevsc = sce->sf_idx[w*16+g]; 2270 if (dists[w*16+g] > uplims[w*16+g] && sce->sf_idx[w*16+g] > 60) { 2271 if (find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]-1)) 2272 sce->sf_idx[w*16+g]--; 2273 else 2274 sce->sf_idx[w*16+g]-=2; 2275 } 2276 sce->sf_idx[w*16+g] = av_clip(sce->sf_idx[w*16+g], minscaler, minscaler + SCALE_MAX_DIFF); 2277 sce->sf_idx[w*16+g] = 
FFMIN(sce->sf_idx[w*16+g], 219); 2278 if (sce->sf_idx[w*16+g] != prevsc) 2279 fflag = 1; 2280 sce->band_type[w*16+g] = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]); 2281 } 2282 } 2283 its++; 2284 } while (fflag && its < 10); 2285} 2286 2287static void search_for_ms_mips(AACEncContext *s, ChannelElement *cpe, 2288 const float lambda) 2289{ 2290 int start = 0, i, w, w2, g; 2291 float M[128], S[128]; 2292 float *L34 = s->scoefs, *R34 = s->scoefs + 128, *M34 = s->scoefs + 128*2, *S34 = s->scoefs + 128*3; 2293 SingleChannelElement *sce0 = &cpe->ch[0]; 2294 SingleChannelElement *sce1 = &cpe->ch[1]; 2295 if (!cpe->common_window) 2296 return; 2297 for (w = 0; w < sce0->ics.num_windows; w += sce0->ics.group_len[w]) { 2298 for (g = 0; g < sce0->ics.num_swb; g++) { 2299 if (!cpe->ch[0].zeroes[w*16+g] && !cpe->ch[1].zeroes[w*16+g]) { 2300 float dist1 = 0.0f, dist2 = 0.0f; 2301 for (w2 = 0; w2 < sce0->ics.group_len[w]; w2++) { 2302 FFPsyBand *band0 = &s->psy.ch[s->cur_channel+0].psy_bands[(w+w2)*16+g]; 2303 FFPsyBand *band1 = &s->psy.ch[s->cur_channel+1].psy_bands[(w+w2)*16+g]; 2304 float minthr = FFMIN(band0->threshold, band1->threshold); 2305 float maxthr = FFMAX(band0->threshold, band1->threshold); 2306 for (i = 0; i < sce0->ics.swb_sizes[g]; i+=4) { 2307 M[i ] = (sce0->coeffs[start+w2*128+i ] 2308 + sce1->coeffs[start+w2*128+i ]) * 0.5; 2309 M[i+1] = (sce0->coeffs[start+w2*128+i+1] 2310 + sce1->coeffs[start+w2*128+i+1]) * 0.5; 2311 M[i+2] = (sce0->coeffs[start+w2*128+i+2] 2312 + sce1->coeffs[start+w2*128+i+2]) * 0.5; 2313 M[i+3] = (sce0->coeffs[start+w2*128+i+3] 2314 + sce1->coeffs[start+w2*128+i+3]) * 0.5; 2315 2316 S[i ] = M[i ] 2317 - sce1->coeffs[start+w2*128+i ]; 2318 S[i+1] = M[i+1] 2319 - sce1->coeffs[start+w2*128+i+1]; 2320 S[i+2] = M[i+2] 2321 - sce1->coeffs[start+w2*128+i+2]; 2322 S[i+3] = M[i+3] 2323 - sce1->coeffs[start+w2*128+i+3]; 2324 } 2325 abs_pow34_v(L34, sce0->coeffs+start+w2*128, sce0->ics.swb_sizes[g]); 2326 abs_pow34_v(R34, 
sce1->coeffs+start+w2*128, sce0->ics.swb_sizes[g]); 2327 abs_pow34_v(M34, M, sce0->ics.swb_sizes[g]); 2328 abs_pow34_v(S34, S, sce0->ics.swb_sizes[g]); 2329 dist1 += quantize_band_cost(s, sce0->coeffs + start + w2*128, 2330 L34, 2331 sce0->ics.swb_sizes[g], 2332 sce0->sf_idx[(w+w2)*16+g], 2333 sce0->band_type[(w+w2)*16+g], 2334 lambda / band0->threshold, INFINITY, NULL); 2335 dist1 += quantize_band_cost(s, sce1->coeffs + start + w2*128, 2336 R34, 2337 sce1->ics.swb_sizes[g], 2338 sce1->sf_idx[(w+w2)*16+g], 2339 sce1->band_type[(w+w2)*16+g], 2340 lambda / band1->threshold, INFINITY, NULL); 2341 dist2 += quantize_band_cost(s, M, 2342 M34, 2343 sce0->ics.swb_sizes[g], 2344 sce0->sf_idx[(w+w2)*16+g], 2345 sce0->band_type[(w+w2)*16+g], 2346 lambda / maxthr, INFINITY, NULL); 2347 dist2 += quantize_band_cost(s, S, 2348 S34, 2349 sce1->ics.swb_sizes[g], 2350 sce1->sf_idx[(w+w2)*16+g], 2351 sce1->band_type[(w+w2)*16+g], 2352 lambda / minthr, INFINITY, NULL); 2353 } 2354 cpe->ms_mask[w*16+g] = dist2 < dist1; 2355 } 2356 start += sce0->ics.swb_sizes[g]; 2357 } 2358 } 2359} 2360#endif /*HAVE_MIPSFPU */ 2361 2362static void codebook_trellis_rate_mips(AACEncContext *s, SingleChannelElement *sce, 2363 int win, int group_len, const float lambda) 2364{ 2365 BandCodingPath path[120][12]; 2366 int w, swb, cb, start, size; 2367 int i, j; 2368 const int max_sfb = sce->ics.max_sfb; 2369 const int run_bits = sce->ics.num_windows == 1 ? 
5 : 3; 2370 const int run_esc = (1 << run_bits) - 1; 2371 int idx, ppos, count; 2372 int stackrun[120], stackcb[120], stack_len; 2373 float next_minbits = INFINITY; 2374 int next_mincb = 0; 2375 2376 abs_pow34_v(s->scoefs, sce->coeffs, 1024); 2377 start = win*128; 2378 for (cb = 0; cb < 12; cb++) { 2379 path[0][cb].cost = run_bits+4; 2380 path[0][cb].prev_idx = -1; 2381 path[0][cb].run = 0; 2382 } 2383 for (swb = 0; swb < max_sfb; swb++) { 2384 size = sce->ics.swb_sizes[swb]; 2385 if (sce->zeroes[win*16 + swb]) { 2386 float cost_stay_here = path[swb][0].cost; 2387 float cost_get_here = next_minbits + run_bits + 4; 2388 if ( run_value_bits[sce->ics.num_windows == 8][path[swb][0].run] 2389 != run_value_bits[sce->ics.num_windows == 8][path[swb][0].run+1]) 2390 cost_stay_here += run_bits; 2391 if (cost_get_here < cost_stay_here) { 2392 path[swb+1][0].prev_idx = next_mincb; 2393 path[swb+1][0].cost = cost_get_here; 2394 path[swb+1][0].run = 1; 2395 } else { 2396 path[swb+1][0].prev_idx = 0; 2397 path[swb+1][0].cost = cost_stay_here; 2398 path[swb+1][0].run = path[swb][0].run + 1; 2399 } 2400 next_minbits = path[swb+1][0].cost; 2401 next_mincb = 0; 2402 for (cb = 1; cb < 12; cb++) { 2403 path[swb+1][cb].cost = 61450; 2404 path[swb+1][cb].prev_idx = -1; 2405 path[swb+1][cb].run = 0; 2406 } 2407 } else { 2408 float minbits = next_minbits; 2409 int mincb = next_mincb; 2410 int startcb = sce->band_type[win*16+swb]; 2411 next_minbits = INFINITY; 2412 next_mincb = 0; 2413 for (cb = 0; cb < startcb; cb++) { 2414 path[swb+1][cb].cost = 61450; 2415 path[swb+1][cb].prev_idx = -1; 2416 path[swb+1][cb].run = 0; 2417 } 2418 for (cb = startcb; cb < 12; cb++) { 2419 float cost_stay_here, cost_get_here; 2420 float bits = 0.0f; 2421 for (w = 0; w < group_len; w++) { 2422 bits += quantize_band_cost_bits(s, sce->coeffs + start + w*128, 2423 s->scoefs + start + w*128, size, 2424 sce->sf_idx[(win+w)*16+swb], cb, 2425 0, INFINITY, NULL); 2426 } 2427 cost_stay_here = path[swb][cb].cost + bits; 
2428 cost_get_here = minbits + bits + run_bits + 4; 2429 if ( run_value_bits[sce->ics.num_windows == 8][path[swb][cb].run] 2430 != run_value_bits[sce->ics.num_windows == 8][path[swb][cb].run+1]) 2431 cost_stay_here += run_bits; 2432 if (cost_get_here < cost_stay_here) { 2433 path[swb+1][cb].prev_idx = mincb; 2434 path[swb+1][cb].cost = cost_get_here; 2435 path[swb+1][cb].run = 1; 2436 } else { 2437 path[swb+1][cb].prev_idx = cb; 2438 path[swb+1][cb].cost = cost_stay_here; 2439 path[swb+1][cb].run = path[swb][cb].run + 1; 2440 } 2441 if (path[swb+1][cb].cost < next_minbits) { 2442 next_minbits = path[swb+1][cb].cost; 2443 next_mincb = cb; 2444 } 2445 } 2446 } 2447 start += sce->ics.swb_sizes[swb]; 2448 } 2449 2450 stack_len = 0; 2451 idx = 0; 2452 for (cb = 1; cb < 12; cb++) 2453 if (path[max_sfb][cb].cost < path[max_sfb][idx].cost) 2454 idx = cb; 2455 ppos = max_sfb; 2456 while (ppos > 0) { 2457 av_assert1(idx >= 0); 2458 cb = idx; 2459 stackrun[stack_len] = path[ppos][cb].run; 2460 stackcb [stack_len] = cb; 2461 idx = path[ppos-path[ppos][cb].run+1][cb].prev_idx; 2462 ppos -= path[ppos][cb].run; 2463 stack_len++; 2464 } 2465 2466 start = 0; 2467 for (i = stack_len - 1; i >= 0; i--) { 2468 put_bits(&s->pb, 4, stackcb[i]); 2469 count = stackrun[i]; 2470 memset(sce->zeroes + win*16 + start, !stackcb[i], count); 2471 for (j = 0; j < count; j++) { 2472 sce->band_type[win*16 + start] = stackcb[i]; 2473 start++; 2474 } 2475 while (count >= run_esc) { 2476 put_bits(&s->pb, run_bits, run_esc); 2477 count -= run_esc; 2478 } 2479 put_bits(&s->pb, run_bits, count); 2480 } 2481} 2482#endif /* HAVE_INLINE_ASM */ 2483 2484void ff_aac_coder_init_mips(AACEncContext *c) { 2485#if HAVE_INLINE_ASM 2486 AACCoefficientsEncoder *e = c->coder; 2487 int option = c->options.aac_coder; 2488 2489 if (option == 2) { 2490 e->quantize_and_encode_band = quantize_and_encode_band_mips; 2491 e->encode_window_bands_info = codebook_trellis_rate_mips; 2492#if HAVE_MIPSFPU 2493 e->search_for_quantizers 
= search_for_quantizers_twoloop_mips; 2494 e->search_for_ms = search_for_ms_mips; 2495#endif /* HAVE_MIPSFPU */ 2496 } 2497#endif /* HAVE_INLINE_ASM */ 2498} 2499