1/*
2 * Copyright (c) 2012
3 *      MIPS Technologies, Inc., California.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
14 *    contributors may be used to endorse or promote products derived from
15 *    this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 * Author:  Stanislav Ocovaj (socovaj@mips.com)
30 *          Szabolcs Pal     (sabolc@mips.com)
31 *
32 * AAC coefficients encoder optimized for MIPS floating-point architecture
33 *
34 * This file is part of FFmpeg.
35 *
36 * FFmpeg is free software; you can redistribute it and/or
37 * modify it under the terms of the GNU Lesser General Public
38 * License as published by the Free Software Foundation; either
39 * version 2.1 of the License, or (at your option) any later version.
40 *
41 * FFmpeg is distributed in the hope that it will be useful,
42 * but WITHOUT ANY WARRANTY; without even the implied warranty of
43 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
44 * Lesser General Public License for more details.
45 *
46 * You should have received a copy of the GNU Lesser General Public
47 * License along with FFmpeg; if not, write to the Free Software
48 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
49 */
50
51/**
52 * @file
53 * Reference: libavcodec/aaccoder.c
54 */
55
56#include "libavutil/libm.h"
57
58#include <float.h>
59#include "libavutil/mathematics.h"
60#include "libavcodec/avcodec.h"
61#include "libavcodec/put_bits.h"
62#include "libavcodec/aac.h"
63#include "libavcodec/aacenc.h"
64#include "libavcodec/aactab.h"
65
66#if HAVE_INLINE_ASM
/**
 * One node of a band coding path.
 *
 * NOTE(review): no user of this structure is visible in this part of the
 * file; the field descriptions below are inferred from the names only and
 * should be confirmed against the code that fills them in.
 */
typedef struct BandCodingPath {
    int prev_idx;   /* presumably the index of the preceding path entry */
    float cost;     /* presumably the accumulated cost of this path */
    int run;        /* presumably the length of the current run */
} BandCodingPath;
72
/**
 * 64-entry lookup table: entries 0..30 hold 5, entries 31..62 hold 10 and
 * entry 63 holds 15.
 *
 * NOTE(review): presumably the number of bits needed to code a run length
 * (the index) for long windows; the user of this table is not visible
 * here, confirm before relying on that reading.
 */
static const uint8_t run_value_bits_long[64] = {
     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
     5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5, 10,
    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 15
};
79
/**
 * 16-entry lookup table: entries 0..6 hold 3, entries 7..14 hold 6 and
 * entry 15 holds 9.
 *
 * NOTE(review): presumably the short-window counterpart of
 * run_value_bits_long; the user of this table is not visible here.
 */
static const uint8_t run_value_bits_short[16] = {
    3, 3, 3, 3, 3, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 9
};
83
84static const uint8_t *run_value_bits[2] = {
85    run_value_bits_long, run_value_bits_short
86};
87
/**
 * 81-entry (3^4) lookup table. Writing the index as four base-3 digits,
 * the stored value is the number of digits that are non-zero (0..4).
 * Each source row below covers nine consecutive indices.
 *
 * NOTE(review): presumably the number of sign bits that accompany an
 * unsigned four-coefficient codeword; the user of this table is not
 * visible here.
 */
static const uint8_t uquad_sign_bits[81] = {
    0, 1, 1, 1, 2, 2, 1, 2, 2,
    1, 2, 2, 2, 3, 3, 2, 3, 3,
    1, 2, 2, 2, 3, 3, 2, 3, 3,
    1, 2, 2, 2, 3, 3, 2, 3, 3,
    2, 3, 3, 3, 4, 4, 3, 4, 4,
    2, 3, 3, 3, 4, 4, 3, 4, 4,
    1, 2, 2, 2, 3, 3, 2, 3, 3,
    2, 3, 3, 3, 4, 4, 3, 4, 4,
    2, 3, 3, 3, 4, 4, 3, 4, 4
};
99
/**
 * 8x8 lookup table stored row by row. For index 8 * a + b (a, b in 0..7)
 * the stored value is (a != 0) + (b != 0), i.e. how many of the two
 * components are non-zero.
 *
 * NOTE(review): presumably the number of sign bits that accompany an
 * unsigned pair codeword with magnitudes up to 7; the user of this table
 * is not visible here.
 */
static const uint8_t upair7_sign_bits[64] = {
    0, 1, 1, 1, 1, 1, 1, 1,
    1, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2,
};
110
/**
 * 13x13 lookup table stored row by row. For index 13 * a + b
 * (a, b in 0..12) the stored value is (a != 0) + (b != 0), i.e. how many
 * of the two components are non-zero.
 *
 * NOTE(review): presumably the number of sign bits that accompany an
 * unsigned pair codeword with magnitudes up to 12; the user of this table
 * is not visible here.
 */
static const uint8_t upair12_sign_bits[169] = {
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
};
126
/**
 * 17x17 lookup table stored row by row. For index 17 * a + b
 * (a, b in 0..16) the stored value is (a != 0) + (b != 0), i.e. how many
 * of the two components are non-zero.
 *
 * NOTE(review): presumably the number of sign bits that accompany a pair
 * codeword of the escape codebook; the user of this table is not visible
 * here.
 */
static const uint8_t esc_sign_bits[289] = {
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
};
146
147static void abs_pow34_v(float *out, const float *in, const int size) {
148#ifndef USE_REALLY_FULL_SEARCH
149    int i;
150    float a, b, c, d;
151    float ax, bx, cx, dx;
152
153    for (i = 0; i < size; i += 4) {
154        a = fabsf(in[i  ]);
155        b = fabsf(in[i+1]);
156        c = fabsf(in[i+2]);
157        d = fabsf(in[i+3]);
158
159        ax = sqrtf(a);
160        bx = sqrtf(b);
161        cx = sqrtf(c);
162        dx = sqrtf(d);
163
164        a = a * ax;
165        b = b * bx;
166        c = c * cx;
167        d = d * dx;
168
169        out[i  ] = sqrtf(a);
170        out[i+1] = sqrtf(b);
171        out[i+2] = sqrtf(c);
172        out[i+3] = sqrtf(d);
173    }
174#endif /* USE_REALLY_FULL_SEARCH */
175}
176
/**
 * Return the largest value found in the first @p swb_size entries of each
 * of @p group_len windows.
 *
 * Consecutive windows start 128 floats apart in @p scaled. The running
 * maximum starts at 0.0f, which is also what is returned when no element
 * is visited.
 *
 * @param group_len number of windows to scan
 * @param swb_size  number of values examined per window
 * @param scaled    buffer holding the windows
 * @return          maximum over all examined values and 0.0f
 */
static float find_max_val(int group_len, int swb_size, const float *scaled) {
    float peak = 0.0f;
    int win;

    for (win = 0; win < group_len; win++) {
        const int offset = win * 128;   /* first sample of this window */
        int k;

        for (k = 0; k < swb_size; k++)
            peak = FFMAX(peak, scaled[offset + k]);
    }

    return peak;
}
187
188static int find_min_book(float maxval, int sf) {
189    float Q = ff_aac_pow2sf_tab[POW_SF2_ZERO - sf + SCALE_ONE_POS - SCALE_DIV_512];
190    float Q34 = sqrtf(Q * sqrtf(Q));
191    int qmaxval, cb;
192    qmaxval = maxval * Q34 + 0.4054f;
193    if      (qmaxval ==  0) cb = 0;
194    else if (qmaxval ==  1) cb = 1;
195    else if (qmaxval ==  2) cb = 3;
196    else if (qmaxval <=  4) cb = 5;
197    else if (qmaxval <=  7) cb = 7;
198    else if (qmaxval <= 12) cb = 9;
199    else                    cb = 11;
200    return cb;
201}
202
/**
 * Functions derived from the generic template function, specialized and
 * optimized for quantizing and encoding a band.
 */
/**
 * Quantize a band and write it to the bitstream, four coefficients per
 * codeword, with the signs folded into the codeword index.
 *
 * Each coefficient is reduced to -1, 0 or +1; the four values of a group
 * form a base-3 index (offset by 40) into the code and length tables of
 * codebook @p cb.
 *
 * @param s         encoder context; s->scoefs is used as scratch and is
 *                  (re)filled by abs_pow34_v()
 * @param pb        bit writer that receives the codewords
 * @param in        input coefficients, consumed four at a time
 * @param scaled    value passed by the caller is not used; the pointer is
 *                  reassigned to s->scoefs
 * @param size      number of coefficients; the loop advances in steps of 4
 * @param scale_idx selects the ff_aac_pow34sf_tab entry used as multiplier
 * @param cb        codebook number; tables at index cb-1 are used
 * @param lambda    not used by this function
 * @param uplim     not used by this function
 * @param bits      not used by this function
 */
static void quantize_and_encode_band_cost_SQUAD_mips(struct AACEncContext *s,
                                                     PutBitContext *pb, const float *in,
                                                     const float *scaled, int size, int scale_idx,
                                                     int cb, const float lambda, const float uplim,
                                                     int *bits)
{
    /* Multiplier applied to the abs_pow34_v() output before truncation. */
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    int i;
    int qc1, qc2, qc3, qc4;

    /* Codeword lengths and codeword values of the selected codebook. */
    uint8_t  *p_bits  = (uint8_t  *)ff_aac_spectral_bits[cb-1];
    uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];

    /* Recompute the scaled magnitudes into the context scratch buffer. */
    abs_pow34_v(s->scoefs, in, size);
    scaled = s->scoefs;
    for (i = 0; i < size; i += 4) {
        int curidx;
        /* Address of the current group; the asm loads the raw 32-bit words
         * from it to inspect the sign bits. */
        int *in_int = (int *)&in[i];

        /* Scale, add the 0.4054 offset and truncate to integer. */
        qc1 = scaled[i  ] * Q34 + 0.4054f;
        qc2 = scaled[i+1] * Q34 + 0.4054f;
        qc3 = scaled[i+2] * Q34 + 0.4054f;
        qc4 = scaled[i+3] * Q34 + 0.4054f;

        __asm__ volatile (
            ".set push                      \n\t"
            ".set noreorder                 \n\t"

            /* qcN = (qcN > 0) ? 1 : 0 */
            "slt    %[qc1], $zero,  %[qc1]  \n\t"
            "slt    %[qc2], $zero,  %[qc2]  \n\t"
            "slt    %[qc3], $zero,  %[qc3]  \n\t"
            "slt    %[qc4], $zero,  %[qc4]  \n\t"
            /* Load the four input words and isolate bit 31 of each. */
            "lw     $t0,    0(%[in_int])    \n\t"
            "lw     $t1,    4(%[in_int])    \n\t"
            "lw     $t2,    8(%[in_int])    \n\t"
            "lw     $t3,    12(%[in_int])   \n\t"
            "srl    $t0,    $t0,    31      \n\t"
            "srl    $t1,    $t1,    31      \n\t"
            "srl    $t2,    $t2,    31      \n\t"
            "srl    $t3,    $t3,    31      \n\t"
            /* Prepare -qcN ... */
            "subu   $t4,    $zero,  %[qc1]  \n\t"
            "subu   $t5,    $zero,  %[qc2]  \n\t"
            "subu   $t6,    $zero,  %[qc3]  \n\t"
            "subu   $t7,    $zero,  %[qc4]  \n\t"
            /* ... and select it when the input's bit 31 is set. */
            "movn   %[qc1], $t4,    $t0     \n\t"
            "movn   %[qc2], $t5,    $t1     \n\t"
            "movn   %[qc3], $t6,    $t2     \n\t"
            "movn   %[qc4], $t7,    $t3     \n\t"

            ".set pop                       \n\t"

            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
              [qc3]"+r"(qc3), [qc4]"+r"(qc4)
            : [in_int]"r"(in_int)
            : "t0", "t1", "t2", "t3",
              "t4", "t5", "t6", "t7",
              "memory"
        );

        /* curidx = 27*qc1 + 9*qc2 + 3*qc3 + qc4 + 40; with every qcN in
         * -1..1 this lies in 0..80. */
        curidx = qc1;
        curidx *= 3;
        curidx += qc2;
        curidx *= 3;
        curidx += qc3;
        curidx *= 3;
        curidx += qc4;
        curidx += 40;

        put_bits(pb, p_bits[curidx], p_codes[curidx]);
    }
}
277
/**
 * Quantize a band and write it to the bitstream, four coefficients per
 * codeword, with magnitudes in the codeword and sign bits appended.
 *
 * Each quantized magnitude is capped at 2; the four magnitudes of a group
 * form a base-3 index into the code and length tables of codebook @p cb.
 * One sign bit is appended for every non-zero magnitude, in coefficient
 * order.
 *
 * @param s         encoder context; s->scoefs is used as scratch and is
 *                  (re)filled by abs_pow34_v()
 * @param pb        bit writer that receives the codewords
 * @param in        input coefficients, consumed four at a time
 * @param scaled    value passed by the caller is not used; the pointer is
 *                  reassigned to s->scoefs
 * @param size      number of coefficients; the loop advances in steps of 4
 * @param scale_idx selects the ff_aac_pow34sf_tab entry used as multiplier
 * @param cb        codebook number; tables at index cb-1 are used
 * @param lambda    not used by this function
 * @param uplim     not used by this function
 * @param bits      not used by this function
 */
static void quantize_and_encode_band_cost_UQUAD_mips(struct AACEncContext *s,
                                                     PutBitContext *pb, const float *in,
                                                     const float *scaled, int size, int scale_idx,
                                                     int cb, const float lambda, const float uplim,
                                                     int *bits)
{
    /* Multiplier applied to the abs_pow34_v() output before truncation. */
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    int i;
    int qc1, qc2, qc3, qc4;

    /* Codeword lengths and codeword values of the selected codebook. */
    uint8_t  *p_bits  = (uint8_t  *)ff_aac_spectral_bits[cb-1];
    uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];

    /* Recompute the scaled magnitudes into the context scratch buffer. */
    abs_pow34_v(s->scoefs, in, size);
    scaled = s->scoefs;
    for (i = 0; i < size; i += 4) {
        int curidx, sign, count;
        /* Address of the current group; the asm loads the raw 32-bit words
         * from it to inspect the sign bits. */
        int *in_int = (int *)&in[i];
        uint8_t v_bits;
        unsigned int v_codes;

        /* Scale, add the 0.4054 offset and truncate to integer. */
        qc1 = scaled[i  ] * Q34 + 0.4054f;
        qc2 = scaled[i+1] * Q34 + 0.4054f;
        qc3 = scaled[i+2] * Q34 + 0.4054f;
        qc4 = scaled[i+3] * Q34 + 0.4054f;

        __asm__ volatile (
            ".set push                              \n\t"
            ".set noreorder                         \n\t"

            /* $t4 = 2 (cap), sign = 0 */
            "ori    $t4,        $zero,      2       \n\t"
            "ori    %[sign],    $zero,      0       \n\t"
            /* qcN = (qcN > 2) ? 2 : qcN */
            "slt    $t0,        $t4,        %[qc1]  \n\t"
            "slt    $t1,        $t4,        %[qc2]  \n\t"
            "slt    $t2,        $t4,        %[qc3]  \n\t"
            "slt    $t3,        $t4,        %[qc4]  \n\t"
            "movn   %[qc1],     $t4,        $t0     \n\t"
            "movn   %[qc2],     $t4,        $t1     \n\t"
            "movn   %[qc3],     $t4,        $t2     \n\t"
            "movn   %[qc4],     $t4,        $t3     \n\t"
            /* Load the four input words; "slt x, x, $zero" turns each into
             * 1 when the word is negative as a signed integer, else 0. */
            "lw     $t0,        0(%[in_int])        \n\t"
            "lw     $t1,        4(%[in_int])        \n\t"
            "lw     $t2,        8(%[in_int])        \n\t"
            "lw     $t3,        12(%[in_int])       \n\t"
            "slt    $t0,        $t0,        $zero   \n\t"
            /* if (qc1 != 0) sign = s1 */
            "movn   %[sign],    $t0,        %[qc1]  \n\t"
            "slt    $t1,        $t1,        $zero   \n\t"
            "slt    $t2,        $t2,        $zero   \n\t"
            "slt    $t3,        $t3,        $zero   \n\t"
            /* if (qc2 != 0) sign = (sign << 1) | s2 */
            "sll    $t0,        %[sign],    1       \n\t"
            "or     $t0,        $t0,        $t1     \n\t"
            "movn   %[sign],    $t0,        %[qc2]  \n\t"
            /* Start counting magnitudes greater than zero. */
            "slt    $t4,        $zero,      %[qc1]  \n\t"
            "slt    $t1,        $zero,      %[qc2]  \n\t"
            "slt    %[count],   $zero,      %[qc3]  \n\t"
            /* if (qc3 != 0) sign = (sign << 1) | s3 */
            "sll    $t0,        %[sign],    1       \n\t"
            "or     $t0,        $t0,        $t2     \n\t"
            "movn   %[sign],    $t0,        %[qc3]  \n\t"
            "slt    $t2,        $zero,      %[qc4]  \n\t"
            "addu   %[count],   %[count],   $t4     \n\t"
            "addu   %[count],   %[count],   $t1     \n\t"
            /* if (qc4 != 0) sign = (sign << 1) | s4 */
            "sll    $t0,        %[sign],    1       \n\t"
            "or     $t0,        $t0,        $t3     \n\t"
            "movn   %[sign],    $t0,        %[qc4]  \n\t"
            /* count = (qc1 > 0) + (qc2 > 0) + (qc3 > 0) + (qc4 > 0) */
            "addu   %[count],   %[count],   $t2     \n\t"

            ".set pop                               \n\t"

            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
              [sign]"=&r"(sign), [count]"=&r"(count)
            : [in_int]"r"(in_int)
            : "t0", "t1", "t2", "t3", "t4",
              "memory"
        );

        /* curidx = 27*qc1 + 9*qc2 + 3*qc3 + qc4 */
        curidx = qc1;
        curidx *= 3;
        curidx += qc2;
        curidx *= 3;
        curidx += qc3;
        curidx *= 3;
        curidx += qc4;

        /* Codeword followed by the low "count" bits of the sign word. */
        v_codes = (p_codes[curidx] << count) | (sign & ((1 << count) - 1));
        v_bits  = p_bits[curidx] + count;
        put_bits(pb, v_bits, v_codes);
    }
}
367
/**
 * Quantize a band and write it to the bitstream, two coefficients per
 * codeword, with the signs folded into the codeword index.
 *
 * Each quantized magnitude is capped at 4 and negated when the input has
 * its sign bit set. A pair (a, b) uses table index 9 * a + b + 40. Four
 * coefficients (two codewords) are handled per iteration and written with
 * a single put_bits() call.
 *
 * @param s         encoder context; s->scoefs is used as scratch and is
 *                  (re)filled by abs_pow34_v()
 * @param pb        bit writer that receives the codewords
 * @param in        input coefficients, consumed four at a time
 * @param scaled    value passed by the caller is not used; the pointer is
 *                  reassigned to s->scoefs
 * @param size      number of coefficients; the loop advances in steps of 4
 * @param scale_idx selects the ff_aac_pow34sf_tab entry used as multiplier
 * @param cb        codebook number; tables at index cb-1 are used
 * @param lambda    not used by this function
 * @param uplim     not used by this function
 * @param bits      not used by this function
 */
static void quantize_and_encode_band_cost_SPAIR_mips(struct AACEncContext *s,
                                                     PutBitContext *pb, const float *in,
                                                     const float *scaled, int size, int scale_idx,
                                                     int cb, const float lambda, const float uplim,
                                                     int *bits)
{
    /* Multiplier applied to the abs_pow34_v() output before truncation. */
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    int i;
    int qc1, qc2, qc3, qc4;

    /* Codeword lengths and codeword values of the selected codebook. */
    uint8_t  *p_bits  = (uint8_t  *)ff_aac_spectral_bits[cb-1];
    uint16_t *p_codes = (uint16_t *)ff_aac_spectral_codes[cb-1];

    /* Recompute the scaled magnitudes into the context scratch buffer. */
    abs_pow34_v(s->scoefs, in, size);
    scaled = s->scoefs;
    for (i = 0; i < size; i += 4) {
        int curidx, curidx2;
        /* Address of the current group; the asm loads the raw 32-bit words
         * from it to inspect the sign bits. */
        int *in_int = (int *)&in[i];
        uint8_t v_bits;
        unsigned int v_codes;

        /* Scale, add the 0.4054 offset and truncate to integer. */
        qc1 = scaled[i  ] * Q34 + 0.4054f;
        qc2 = scaled[i+1] * Q34 + 0.4054f;
        qc3 = scaled[i+2] * Q34 + 0.4054f;
        qc4 = scaled[i+3] * Q34 + 0.4054f;

        __asm__ volatile (
            ".set push                      \n\t"
            ".set noreorder                 \n\t"

            /* $t4 = 4 (cap); qcN = (qcN > 4) ? 4 : qcN */
            "ori    $t4,    $zero,  4       \n\t"
            "slt    $t0,    $t4,    %[qc1]  \n\t"
            "slt    $t1,    $t4,    %[qc2]  \n\t"
            "slt    $t2,    $t4,    %[qc3]  \n\t"
            "slt    $t3,    $t4,    %[qc4]  \n\t"
            "movn   %[qc1], $t4,    $t0     \n\t"
            "movn   %[qc2], $t4,    $t1     \n\t"
            "movn   %[qc3], $t4,    $t2     \n\t"
            "movn   %[qc4], $t4,    $t3     \n\t"
            /* Load the four input words and isolate bit 31 of each. */
            "lw     $t0,    0(%[in_int])    \n\t"
            "lw     $t1,    4(%[in_int])    \n\t"
            "lw     $t2,    8(%[in_int])    \n\t"
            "lw     $t3,    12(%[in_int])   \n\t"
            "srl    $t0,    $t0,    31      \n\t"
            "srl    $t1,    $t1,    31      \n\t"
            "srl    $t2,    $t2,    31      \n\t"
            "srl    $t3,    $t3,    31      \n\t"
            /* Prepare -qcN ... */
            "subu   $t4,    $zero,  %[qc1]  \n\t"
            "subu   $t5,    $zero,  %[qc2]  \n\t"
            "subu   $t6,    $zero,  %[qc3]  \n\t"
            "subu   $t7,    $zero,  %[qc4]  \n\t"
            /* ... and select it when the input's bit 31 is set. */
            "movn   %[qc1], $t4,    $t0     \n\t"
            "movn   %[qc2], $t5,    $t1     \n\t"
            "movn   %[qc3], $t6,    $t2     \n\t"
            "movn   %[qc4], $t7,    $t3     \n\t"

            ".set pop                       \n\t"

            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
              [qc3]"+r"(qc3), [qc4]"+r"(qc4)
            : [in_int]"r"(in_int)
            : "t0", "t1", "t2", "t3",
              "t4", "t5", "t6", "t7",
              "memory"
        );

        /* Table index of the first pair (qc1, qc2). */
        curidx = 9 * qc1;
        curidx += qc2 + 40;

        /* Table index of the second pair (qc3, qc4). */
        curidx2 = 9 * qc3;
        curidx2 += qc4 + 40;

        /* Concatenate both codewords: the first one occupies the upper
         * bits, the second one the lower p_bits[curidx2] bits. */
        v_codes = (p_codes[curidx] << p_bits[curidx2]) | (p_codes[curidx2]);
        v_bits  = p_bits[curidx] + p_bits[curidx2];
        put_bits(pb, v_bits, v_codes);
    }
}
445
/**
 * Quantize a band and write it to the bitstream, two coefficients per
 * codeword, with magnitudes (capped at 7) in the codeword and sign bits
 * appended.
 *
 * A pair (a, b) uses table index 8 * a + b. One sign bit is appended for
 * every non-zero magnitude of the pair, in coefficient order. Four
 * coefficients (two codewords) are handled per iteration.
 *
 * @param s         encoder context; s->scoefs is used as scratch and is
 *                  (re)filled by abs_pow34_v()
 * @param pb        bit writer that receives the codewords
 * @param in        input coefficients, consumed four at a time
 * @param scaled    value passed by the caller is not used; the pointer is
 *                  reassigned to s->scoefs
 * @param size      number of coefficients; the loop advances in steps of 4
 * @param scale_idx selects the ff_aac_pow34sf_tab entry used as multiplier
 * @param cb        codebook number; tables at index cb-1 are used
 * @param lambda    not used by this function
 * @param uplim     not used by this function
 * @param bits      not used by this function
 */
static void quantize_and_encode_band_cost_UPAIR7_mips(struct AACEncContext *s,
                                                      PutBitContext *pb, const float *in,
                                                      const float *scaled, int size, int scale_idx,
                                                      int cb, const float lambda, const float uplim,
                                                      int *bits)
{
    /* Multiplier applied to the abs_pow34_v() output before truncation. */
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    int i;
    int qc1, qc2, qc3, qc4;

    /* Codeword lengths and codeword values of the selected codebook. */
    uint8_t  *p_bits  = (uint8_t*) ff_aac_spectral_bits[cb-1];
    uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1];

    /* Recompute the scaled magnitudes into the context scratch buffer. */
    abs_pow34_v(s->scoefs, in, size);
    scaled = s->scoefs;
    for (i = 0; i < size; i += 4) {
        int curidx, sign1, count1, sign2, count2;
        /* Address of the current group; the asm loads the raw 32-bit words
         * from it to inspect the sign bits. */
        int *in_int = (int *)&in[i];
        uint8_t v_bits;
        unsigned int v_codes;

        /* Scale, add the 0.4054 offset and truncate to integer. */
        qc1 = scaled[i  ] * Q34 + 0.4054f;
        qc2 = scaled[i+1] * Q34 + 0.4054f;
        qc3 = scaled[i+2] * Q34 + 0.4054f;
        qc4 = scaled[i+3] * Q34 + 0.4054f;

        __asm__ volatile (
            ".set push                              \n\t"
            ".set noreorder                         \n\t"

            /* $t4 = 7 (cap), sign1 = sign2 = 0 */
            "ori    $t4,        $zero,      7       \n\t"
            "ori    %[sign1],   $zero,      0       \n\t"
            "ori    %[sign2],   $zero,      0       \n\t"
            /* qcN = (qcN > 7) ? 7 : qcN */
            "slt    $t0,        $t4,        %[qc1]  \n\t"
            "slt    $t1,        $t4,        %[qc2]  \n\t"
            "slt    $t2,        $t4,        %[qc3]  \n\t"
            "slt    $t3,        $t4,        %[qc4]  \n\t"
            "movn   %[qc1],     $t4,        $t0     \n\t"
            "movn   %[qc2],     $t4,        $t1     \n\t"
            "movn   %[qc3],     $t4,        $t2     \n\t"
            "movn   %[qc4],     $t4,        $t3     \n\t"
            /* Load the four input words; "slt x, x, $zero" turns each into
             * 1 when the word is negative as a signed integer, else 0. */
            "lw     $t0,        0(%[in_int])        \n\t"
            "lw     $t1,        4(%[in_int])        \n\t"
            "lw     $t2,        8(%[in_int])        \n\t"
            "lw     $t3,        12(%[in_int])       \n\t"
            "slt    $t0,        $t0,        $zero   \n\t"
            /* if (qc1 != 0) sign1 = s1 */
            "movn   %[sign1],   $t0,        %[qc1]  \n\t"
            "slt    $t2,        $t2,        $zero   \n\t"
            /* if (qc3 != 0) sign2 = s3 */
            "movn   %[sign2],   $t2,        %[qc3]  \n\t"
            "slt    $t1,        $t1,        $zero   \n\t"
            /* if (qc2 != 0) sign1 = (sign1 << 1) | s2 */
            "sll    $t0,        %[sign1],   1       \n\t"
            "or     $t0,        $t0,        $t1     \n\t"
            "movn   %[sign1],   $t0,        %[qc2]  \n\t"
            "slt    $t3,        $t3,        $zero   \n\t"
            /* if (qc4 != 0) sign2 = (sign2 << 1) | s4 */
            "sll    $t0,        %[sign2],   1       \n\t"
            "or     $t0,        $t0,        $t3     \n\t"
            "movn   %[sign2],   $t0,        %[qc4]  \n\t"
            /* count1 = (qc1 > 0) + (qc2 > 0); count2 = (qc3 > 0) + (qc4 > 0) */
            "slt    %[count1],  $zero,      %[qc1]  \n\t"
            "slt    $t1,        $zero,      %[qc2]  \n\t"
            "slt    %[count2],  $zero,      %[qc3]  \n\t"
            "slt    $t2,        $zero,      %[qc4]  \n\t"
            "addu   %[count1],  %[count1],  $t1     \n\t"
            "addu   %[count2],  %[count2],  $t2     \n\t"

            ".set pop                               \n\t"

            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
              [sign1]"=&r"(sign1), [count1]"=&r"(count1),
              [sign2]"=&r"(sign2), [count2]"=&r"(count2)
            : [in_int]"r"(in_int)
            : "t0", "t1", "t2", "t3", "t4",
              "memory"
        );

        /* First pair: table index 8 * qc1 + qc2. */
        curidx  = 8 * qc1;
        curidx += qc2;

        /* Codeword followed by its sign bits. */
        v_codes = (p_codes[curidx] << count1) | sign1;
        v_bits  = p_bits[curidx] + count1;
        put_bits(pb, v_bits, v_codes);

        /* Second pair: table index 8 * qc3 + qc4. */
        curidx  = 8 * qc3;
        curidx += qc4;

        v_codes = (p_codes[curidx] << count2) | sign2;
        v_bits  = p_bits[curidx] + count2;
        put_bits(pb, v_bits, v_codes);
    }
}
536
/**
 * Quantize a band and write it to the bitstream, two coefficients per
 * codeword, with magnitudes (capped at 12) in the codeword and sign bits
 * appended.
 *
 * A pair (a, b) uses table index 13 * a + b. One sign bit is appended for
 * every non-zero magnitude of the pair, in coefficient order. Four
 * coefficients (two codewords) are handled per iteration.
 *
 * @param s         encoder context; s->scoefs is used as scratch and is
 *                  (re)filled by abs_pow34_v()
 * @param pb        bit writer that receives the codewords
 * @param in        input coefficients, consumed four at a time
 * @param scaled    value passed by the caller is not used; the pointer is
 *                  reassigned to s->scoefs
 * @param size      number of coefficients; the loop advances in steps of 4
 * @param scale_idx selects the ff_aac_pow34sf_tab entry used as multiplier
 * @param cb        codebook number; tables at index cb-1 are used
 * @param lambda    not used by this function
 * @param uplim     not used by this function
 * @param bits      not used by this function
 */
static void quantize_and_encode_band_cost_UPAIR12_mips(struct AACEncContext *s,
                                                       PutBitContext *pb, const float *in,
                                                       const float *scaled, int size, int scale_idx,
                                                       int cb, const float lambda, const float uplim,
                                                       int *bits)
{
    /* Multiplier applied to the abs_pow34_v() output before truncation. */
    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
    int i;
    int qc1, qc2, qc3, qc4;

    /* Codeword lengths and codeword values of the selected codebook. */
    uint8_t  *p_bits  = (uint8_t*) ff_aac_spectral_bits[cb-1];
    uint16_t *p_codes = (uint16_t*)ff_aac_spectral_codes[cb-1];

    /* Recompute the scaled magnitudes into the context scratch buffer. */
    abs_pow34_v(s->scoefs, in, size);
    scaled = s->scoefs;
    for (i = 0; i < size; i += 4) {
        int curidx, sign1, count1, sign2, count2;
        /* Address of the current group; the asm loads the raw 32-bit words
         * from it to inspect the sign bits. */
        int *in_int = (int *)&in[i];
        uint8_t v_bits;
        unsigned int v_codes;

        /* Scale, add the 0.4054 offset and truncate to integer. */
        qc1 = scaled[i  ] * Q34 + 0.4054f;
        qc2 = scaled[i+1] * Q34 + 0.4054f;
        qc3 = scaled[i+2] * Q34 + 0.4054f;
        qc4 = scaled[i+3] * Q34 + 0.4054f;

        __asm__ volatile (
            ".set push                              \n\t"
            ".set noreorder                         \n\t"

            /* $t4 = 12 (cap), sign1 = sign2 = 0 */
            "ori    $t4,        $zero,      12      \n\t"
            "ori    %[sign1],   $zero,      0       \n\t"
            "ori    %[sign2],   $zero,      0       \n\t"
            /* qcN = (qcN > 12) ? 12 : qcN */
            "slt    $t0,        $t4,        %[qc1]  \n\t"
            "slt    $t1,        $t4,        %[qc2]  \n\t"
            "slt    $t2,        $t4,        %[qc3]  \n\t"
            "slt    $t3,        $t4,        %[qc4]  \n\t"
            "movn   %[qc1],     $t4,        $t0     \n\t"
            "movn   %[qc2],     $t4,        $t1     \n\t"
            "movn   %[qc3],     $t4,        $t2     \n\t"
            "movn   %[qc4],     $t4,        $t3     \n\t"
            /* Load the four input words; "slt x, x, $zero" turns each into
             * 1 when the word is negative as a signed integer, else 0. */
            "lw     $t0,        0(%[in_int])        \n\t"
            "lw     $t1,        4(%[in_int])        \n\t"
            "lw     $t2,        8(%[in_int])        \n\t"
            "lw     $t3,        12(%[in_int])       \n\t"
            "slt    $t0,        $t0,        $zero   \n\t"
            /* if (qc1 != 0) sign1 = s1 */
            "movn   %[sign1],   $t0,        %[qc1]  \n\t"
            "slt    $t2,        $t2,        $zero   \n\t"
            /* if (qc3 != 0) sign2 = s3 */
            "movn   %[sign2],   $t2,        %[qc3]  \n\t"
            "slt    $t1,        $t1,        $zero   \n\t"
            /* if (qc2 != 0) sign1 = (sign1 << 1) | s2 */
            "sll    $t0,        %[sign1],   1       \n\t"
            "or     $t0,        $t0,        $t1     \n\t"
            "movn   %[sign1],   $t0,        %[qc2]  \n\t"
            "slt    $t3,        $t3,        $zero   \n\t"
            /* if (qc4 != 0) sign2 = (sign2 << 1) | s4 */
            "sll    $t0,        %[sign2],   1       \n\t"
            "or     $t0,        $t0,        $t3     \n\t"
            "movn   %[sign2],   $t0,        %[qc4]  \n\t"
            /* count1 = (qc1 > 0) + (qc2 > 0); count2 = (qc3 > 0) + (qc4 > 0) */
            "slt    %[count1],  $zero,      %[qc1]  \n\t"
            "slt    $t1,        $zero,      %[qc2]  \n\t"
            "slt    %[count2],  $zero,      %[qc3]  \n\t"
            "slt    $t2,        $zero,      %[qc4]  \n\t"
            "addu   %[count1],  %[count1],  $t1     \n\t"
            "addu   %[count2],  %[count2],  $t2     \n\t"

            ".set pop                               \n\t"

            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
              [sign1]"=&r"(sign1), [count1]"=&r"(count1),
              [sign2]"=&r"(sign2), [count2]"=&r"(count2)
            : [in_int]"r"(in_int)
            : "t0", "t1", "t2", "t3", "t4",
              "memory"
        );

        /* First pair: table index 13 * qc1 + qc2. */
        curidx  = 13 * qc1;
        curidx += qc2;

        /* Codeword followed by its sign bits. */
        v_codes = (p_codes[curidx] << count1) | sign1;
        v_bits  = p_bits[curidx] + count1;
        put_bits(pb, v_bits, v_codes);

        /* Second pair: table index 13 * qc3 + qc4. */
        curidx  = 13 * qc3;
        curidx += qc4;

        v_codes = (p_codes[curidx] << count2) | sign2;
        v_bits  = p_bits[curidx] + count2;
        put_bits(pb, v_bits, v_codes);
    }
}
627
628static void quantize_and_encode_band_cost_ESC_mips(struct AACEncContext *s,
629                                                   PutBitContext *pb, const float *in,
630                                                   const float *scaled, int size, int scale_idx,
631                                                   int cb, const float lambda, const float uplim,
632                                                   int *bits)
633{
634    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
635    int i;
636    int qc1, qc2, qc3, qc4;
637
638    uint8_t  *p_bits    = (uint8_t* )ff_aac_spectral_bits[cb-1];
639    uint16_t *p_codes   = (uint16_t*)ff_aac_spectral_codes[cb-1];
640    float    *p_vectors = (float*   )ff_aac_codebook_vectors[cb-1];
641
642    abs_pow34_v(s->scoefs, in, size);
643    scaled = s->scoefs;
644
645    if (cb < 11) {
646        for (i = 0; i < size; i += 4) {
647            int curidx, curidx2, sign1, count1, sign2, count2;
648            int *in_int = (int *)&in[i];
649            uint8_t v_bits;
650            unsigned int v_codes;
651
652            qc1 = scaled[i  ] * Q34 + 0.4054f;
653            qc2 = scaled[i+1] * Q34 + 0.4054f;
654            qc3 = scaled[i+2] * Q34 + 0.4054f;
655            qc4 = scaled[i+3] * Q34 + 0.4054f;
656
657            __asm__ volatile (
658                ".set push                                  \n\t"
659                ".set noreorder                             \n\t"
660
661                "ori        $t4,        $zero,      16      \n\t"
662                "ori        %[sign1],   $zero,      0       \n\t"
663                "ori        %[sign2],   $zero,      0       \n\t"
664                "slt        $t0,        $t4,        %[qc1]  \n\t"
665                "slt        $t1,        $t4,        %[qc2]  \n\t"
666                "slt        $t2,        $t4,        %[qc3]  \n\t"
667                "slt        $t3,        $t4,        %[qc4]  \n\t"
668                "movn       %[qc1],     $t4,        $t0     \n\t"
669                "movn       %[qc2],     $t4,        $t1     \n\t"
670                "movn       %[qc3],     $t4,        $t2     \n\t"
671                "movn       %[qc4],     $t4,        $t3     \n\t"
672                "lw         $t0,        0(%[in_int])        \n\t"
673                "lw         $t1,        4(%[in_int])        \n\t"
674                "lw         $t2,        8(%[in_int])        \n\t"
675                "lw         $t3,        12(%[in_int])       \n\t"
676                "slt        $t0,        $t0,        $zero   \n\t"
677                "movn       %[sign1],   $t0,        %[qc1]  \n\t"
678                "slt        $t2,        $t2,        $zero   \n\t"
679                "movn       %[sign2],   $t2,        %[qc3]  \n\t"
680                "slt        $t1,        $t1,        $zero   \n\t"
681                "sll        $t0,        %[sign1],   1       \n\t"
682                "or         $t0,        $t0,        $t1     \n\t"
683                "movn       %[sign1],   $t0,        %[qc2]  \n\t"
684                "slt        $t3,        $t3,        $zero   \n\t"
685                "sll        $t0,        %[sign2],   1       \n\t"
686                "or         $t0,        $t0,        $t3     \n\t"
687                "movn       %[sign2],   $t0,        %[qc4]  \n\t"
688                "slt        %[count1],  $zero,      %[qc1]  \n\t"
689                "slt        $t1,        $zero,      %[qc2]  \n\t"
690                "slt        %[count2],  $zero,      %[qc3]  \n\t"
691                "slt        $t2,        $zero,      %[qc4]  \n\t"
692                "addu       %[count1],  %[count1],  $t1     \n\t"
693                "addu       %[count2],  %[count2],  $t2     \n\t"
694
695                ".set pop                                   \n\t"
696
697                : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
698                  [qc3]"+r"(qc3), [qc4]"+r"(qc4),
699                  [sign1]"=&r"(sign1), [count1]"=&r"(count1),
700                  [sign2]"=&r"(sign2), [count2]"=&r"(count2)
701                : [in_int]"r"(in_int)
702                : "t0", "t1", "t2", "t3", "t4",
703                  "memory"
704            );
705
706            curidx = 17 * qc1;
707            curidx += qc2;
708            curidx2 = 17 * qc3;
709            curidx2 += qc4;
710
711            v_codes = (p_codes[curidx] << count1) | sign1;
712            v_bits  = p_bits[curidx] + count1;
713            put_bits(pb, v_bits, v_codes);
714
715            v_codes = (p_codes[curidx2] << count2) | sign2;
716            v_bits  = p_bits[curidx2] + count2;
717            put_bits(pb, v_bits, v_codes);
718        }
719    } else {
720        for (i = 0; i < size; i += 4) {
721            int curidx, curidx2, sign1, count1, sign2, count2;
722            int *in_int = (int *)&in[i];
723            uint8_t v_bits;
724            unsigned int v_codes;
725            int c1, c2, c3, c4;
726
727            qc1 = scaled[i  ] * Q34 + 0.4054f;
728            qc2 = scaled[i+1] * Q34 + 0.4054f;
729            qc3 = scaled[i+2] * Q34 + 0.4054f;
730            qc4 = scaled[i+3] * Q34 + 0.4054f;
731
732            __asm__ volatile (
733                ".set push                                  \n\t"
734                ".set noreorder                             \n\t"
735
736                "ori        $t4,        $zero,      16      \n\t"
737                "ori        %[sign1],   $zero,      0       \n\t"
738                "ori        %[sign2],   $zero,      0       \n\t"
739                "shll_s.w   %[c1],      %[qc1],     18      \n\t"
740                "shll_s.w   %[c2],      %[qc2],     18      \n\t"
741                "shll_s.w   %[c3],      %[qc3],     18      \n\t"
742                "shll_s.w   %[c4],      %[qc4],     18      \n\t"
743                "srl        %[c1],      %[c1],      18      \n\t"
744                "srl        %[c2],      %[c2],      18      \n\t"
745                "srl        %[c3],      %[c3],      18      \n\t"
746                "srl        %[c4],      %[c4],      18      \n\t"
747                "slt        $t0,        $t4,        %[qc1]  \n\t"
748                "slt        $t1,        $t4,        %[qc2]  \n\t"
749                "slt        $t2,        $t4,        %[qc3]  \n\t"
750                "slt        $t3,        $t4,        %[qc4]  \n\t"
751                "movn       %[qc1],     $t4,        $t0     \n\t"
752                "movn       %[qc2],     $t4,        $t1     \n\t"
753                "movn       %[qc3],     $t4,        $t2     \n\t"
754                "movn       %[qc4],     $t4,        $t3     \n\t"
755                "lw         $t0,        0(%[in_int])        \n\t"
756                "lw         $t1,        4(%[in_int])        \n\t"
757                "lw         $t2,        8(%[in_int])        \n\t"
758                "lw         $t3,        12(%[in_int])       \n\t"
759                "slt        $t0,        $t0,        $zero   \n\t"
760                "movn       %[sign1],   $t0,        %[qc1]  \n\t"
761                "slt        $t2,        $t2,        $zero   \n\t"
762                "movn       %[sign2],   $t2,        %[qc3]  \n\t"
763                "slt        $t1,        $t1,        $zero   \n\t"
764                "sll        $t0,        %[sign1],   1       \n\t"
765                "or         $t0,        $t0,        $t1     \n\t"
766                "movn       %[sign1],   $t0,        %[qc2]  \n\t"
767                "slt        $t3,        $t3,        $zero   \n\t"
768                "sll        $t0,        %[sign2],   1       \n\t"
769                "or         $t0,        $t0,        $t3     \n\t"
770                "movn       %[sign2],   $t0,        %[qc4]  \n\t"
771                "slt        %[count1],  $zero,      %[qc1]  \n\t"
772                "slt        $t1,        $zero,      %[qc2]  \n\t"
773                "slt        %[count2],  $zero,      %[qc3]  \n\t"
774                "slt        $t2,        $zero,      %[qc4]  \n\t"
775                "addu       %[count1],  %[count1],  $t1     \n\t"
776                "addu       %[count2],  %[count2],  $t2     \n\t"
777
778                ".set pop                                   \n\t"
779
780                : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
781                  [qc3]"+r"(qc3), [qc4]"+r"(qc4),
782                  [sign1]"=&r"(sign1), [count1]"=&r"(count1),
783                  [sign2]"=&r"(sign2), [count2]"=&r"(count2),
784                  [c1]"=&r"(c1), [c2]"=&r"(c2),
785                  [c3]"=&r"(c3), [c4]"=&r"(c4)
786                : [in_int]"r"(in_int)
787                : "t0", "t1", "t2", "t3", "t4",
788                  "memory"
789            );
790
791            curidx = 17 * qc1;
792            curidx += qc2;
793
794            curidx2 = 17 * qc3;
795            curidx2 += qc4;
796
797            v_codes = (p_codes[curidx] << count1) | sign1;
798            v_bits  = p_bits[curidx] + count1;
799            put_bits(pb, v_bits, v_codes);
800
801            if (p_vectors[curidx*2  ] == 64.0f) {
802                int len = av_log2(c1);
803                v_codes = (((1 << (len - 3)) - 2) << len) | (c1 & ((1 << len) - 1));
804                put_bits(pb, len * 2 - 3, v_codes);
805            }
806            if (p_vectors[curidx*2+1] == 64.0f) {
807                int len = av_log2(c2);
808                v_codes = (((1 << (len - 3)) - 2) << len) | (c2 & ((1 << len) - 1));
809                put_bits(pb, len*2-3, v_codes);
810            }
811
812            v_codes = (p_codes[curidx2] << count2) | sign2;
813            v_bits  = p_bits[curidx2] + count2;
814            put_bits(pb, v_bits, v_codes);
815
816            if (p_vectors[curidx2*2  ] == 64.0f) {
817                int len = av_log2(c3);
818                v_codes = (((1 << (len - 3)) - 2) << len) | (c3 & ((1 << len) - 1));
819                put_bits(pb, len* 2 - 3, v_codes);
820            }
821            if (p_vectors[curidx2*2+1] == 64.0f) {
822                int len = av_log2(c4);
823                v_codes = (((1 << (len - 3)) - 2) << len) | (c4 & ((1 << len) - 1));
824                put_bits(pb, len * 2 - 3, v_codes);
825            }
826        }
827    }
828}
829
830static void (*const quantize_and_encode_band_cost_arr[])(struct AACEncContext *s,
831                                                         PutBitContext *pb, const float *in,
832                                                         const float *scaled, int size, int scale_idx,
833                                                         int cb, const float lambda, const float uplim,
834                                                         int *bits) = {
835    NULL,
836    quantize_and_encode_band_cost_SQUAD_mips,
837    quantize_and_encode_band_cost_SQUAD_mips,
838    quantize_and_encode_band_cost_UQUAD_mips,
839    quantize_and_encode_band_cost_UQUAD_mips,
840    quantize_and_encode_band_cost_SPAIR_mips,
841    quantize_and_encode_band_cost_SPAIR_mips,
842    quantize_and_encode_band_cost_UPAIR7_mips,
843    quantize_and_encode_band_cost_UPAIR7_mips,
844    quantize_and_encode_band_cost_UPAIR12_mips,
845    quantize_and_encode_band_cost_UPAIR12_mips,
846    quantize_and_encode_band_cost_ESC_mips,
847};
848
/*
 * Forward a quantize-and-encode request to the routine stored in
 * quantize_and_encode_band_cost_arr for codebook cb.
 * cb is expanded twice (table index and call argument), so it must be an
 * expression without side effects.
 */
#define quantize_and_encode_band_cost(s, pb, in, scaled, size, scale_idx,     \
                                      cb, lambda, uplim, bits)                \
    quantize_and_encode_band_cost_arr[cb](s, pb, in, scaled, size, scale_idx, \
                                          cb, lambda, uplim, bits)
855
856static void quantize_and_encode_band_mips(struct AACEncContext *s, PutBitContext *pb,
857                                          const float *in, int size, int scale_idx,
858                                          int cb, const float lambda)
859{
860    quantize_and_encode_band_cost(s, pb, in, NULL, size, scale_idx, cb, lambda,
861                                  INFINITY, NULL);
862}
863
864/**
865 * Functions developed from template function and optimized for getting the number of bits
866 */
867static float get_band_numbits_ZERO_mips(struct AACEncContext *s,
868                                        PutBitContext *pb, const float *in,
869                                        const float *scaled, int size, int scale_idx,
870                                        int cb, const float lambda, const float uplim,
871                                        int *bits)
872{
873    return 0;
874}
875
876static float get_band_numbits_SQUAD_mips(struct AACEncContext *s,
877                                         PutBitContext *pb, const float *in,
878                                         const float *scaled, int size, int scale_idx,
879                                         int cb, const float lambda, const float uplim,
880                                         int *bits)
881{
882    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
883    int i;
884    int qc1, qc2, qc3, qc4;
885    int curbits = 0;
886
887    uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
888
889    for (i = 0; i < size; i += 4) {
890        int curidx;
891        int *in_int = (int *)&in[i];
892
893        qc1 = scaled[i  ] * Q34 + 0.4054f;
894        qc2 = scaled[i+1] * Q34 + 0.4054f;
895        qc3 = scaled[i+2] * Q34 + 0.4054f;
896        qc4 = scaled[i+3] * Q34 + 0.4054f;
897
898        __asm__ volatile (
899            ".set push                      \n\t"
900            ".set noreorder                 \n\t"
901
902            "slt    %[qc1], $zero,  %[qc1]  \n\t"
903            "slt    %[qc2], $zero,  %[qc2]  \n\t"
904            "slt    %[qc3], $zero,  %[qc3]  \n\t"
905            "slt    %[qc4], $zero,  %[qc4]  \n\t"
906            "lw     $t0,    0(%[in_int])    \n\t"
907            "lw     $t1,    4(%[in_int])    \n\t"
908            "lw     $t2,    8(%[in_int])    \n\t"
909            "lw     $t3,    12(%[in_int])   \n\t"
910            "srl    $t0,    $t0,    31      \n\t"
911            "srl    $t1,    $t1,    31      \n\t"
912            "srl    $t2,    $t2,    31      \n\t"
913            "srl    $t3,    $t3,    31      \n\t"
914            "subu   $t4,    $zero,  %[qc1]  \n\t"
915            "subu   $t5,    $zero,  %[qc2]  \n\t"
916            "subu   $t6,    $zero,  %[qc3]  \n\t"
917            "subu   $t7,    $zero,  %[qc4]  \n\t"
918            "movn   %[qc1], $t4,    $t0     \n\t"
919            "movn   %[qc2], $t5,    $t1     \n\t"
920            "movn   %[qc3], $t6,    $t2     \n\t"
921            "movn   %[qc4], $t7,    $t3     \n\t"
922
923            ".set pop                       \n\t"
924
925            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
926              [qc3]"+r"(qc3), [qc4]"+r"(qc4)
927            : [in_int]"r"(in_int)
928            : "t0", "t1", "t2", "t3",
929              "t4", "t5", "t6", "t7",
930              "memory"
931        );
932
933        curidx = qc1;
934        curidx *= 3;
935        curidx += qc2;
936        curidx *= 3;
937        curidx += qc3;
938        curidx *= 3;
939        curidx += qc4;
940        curidx += 40;
941
942        curbits += p_bits[curidx];
943    }
944    return curbits;
945}
946
947static float get_band_numbits_UQUAD_mips(struct AACEncContext *s,
948                                         PutBitContext *pb, const float *in,
949                                         const float *scaled, int size, int scale_idx,
950                                         int cb, const float lambda, const float uplim,
951                                         int *bits)
952{
953    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
954    int i;
955    int curbits = 0;
956    int qc1, qc2, qc3, qc4;
957
958    uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
959
960    for (i = 0; i < size; i += 4) {
961        int curidx;
962
963        qc1 = scaled[i  ] * Q34 + 0.4054f;
964        qc2 = scaled[i+1] * Q34 + 0.4054f;
965        qc3 = scaled[i+2] * Q34 + 0.4054f;
966        qc4 = scaled[i+3] * Q34 + 0.4054f;
967
968        __asm__ volatile (
969            ".set push                      \n\t"
970            ".set noreorder                 \n\t"
971
972            "ori    $t4,    $zero,  2       \n\t"
973            "slt    $t0,    $t4,    %[qc1]  \n\t"
974            "slt    $t1,    $t4,    %[qc2]  \n\t"
975            "slt    $t2,    $t4,    %[qc3]  \n\t"
976            "slt    $t3,    $t4,    %[qc4]  \n\t"
977            "movn   %[qc1], $t4,    $t0     \n\t"
978            "movn   %[qc2], $t4,    $t1     \n\t"
979            "movn   %[qc3], $t4,    $t2     \n\t"
980            "movn   %[qc4], $t4,    $t3     \n\t"
981
982            ".set pop                       \n\t"
983
984            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
985              [qc3]"+r"(qc3), [qc4]"+r"(qc4)
986            :
987            : "t0", "t1", "t2", "t3", "t4"
988        );
989
990        curidx = qc1;
991        curidx *= 3;
992        curidx += qc2;
993        curidx *= 3;
994        curidx += qc3;
995        curidx *= 3;
996        curidx += qc4;
997
998        curbits += p_bits[curidx];
999        curbits += uquad_sign_bits[curidx];
1000    }
1001    return curbits;
1002}
1003
1004static float get_band_numbits_SPAIR_mips(struct AACEncContext *s,
1005                                         PutBitContext *pb, const float *in,
1006                                         const float *scaled, int size, int scale_idx,
1007                                         int cb, const float lambda, const float uplim,
1008                                         int *bits)
1009{
1010    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1011    int i;
1012    int qc1, qc2, qc3, qc4;
1013    int curbits = 0;
1014
1015    uint8_t *p_bits = (uint8_t*)ff_aac_spectral_bits[cb-1];
1016
1017    for (i = 0; i < size; i += 4) {
1018        int curidx, curidx2;
1019        int *in_int = (int *)&in[i];
1020
1021        qc1 = scaled[i  ] * Q34 + 0.4054f;
1022        qc2 = scaled[i+1] * Q34 + 0.4054f;
1023        qc3 = scaled[i+2] * Q34 + 0.4054f;
1024        qc4 = scaled[i+3] * Q34 + 0.4054f;
1025
1026        __asm__ volatile (
1027            ".set push                      \n\t"
1028            ".set noreorder                 \n\t"
1029
1030            "ori    $t4,    $zero,  4       \n\t"
1031            "slt    $t0,    $t4,    %[qc1]  \n\t"
1032            "slt    $t1,    $t4,    %[qc2]  \n\t"
1033            "slt    $t2,    $t4,    %[qc3]  \n\t"
1034            "slt    $t3,    $t4,    %[qc4]  \n\t"
1035            "movn   %[qc1], $t4,    $t0     \n\t"
1036            "movn   %[qc2], $t4,    $t1     \n\t"
1037            "movn   %[qc3], $t4,    $t2     \n\t"
1038            "movn   %[qc4], $t4,    $t3     \n\t"
1039            "lw     $t0,    0(%[in_int])    \n\t"
1040            "lw     $t1,    4(%[in_int])    \n\t"
1041            "lw     $t2,    8(%[in_int])    \n\t"
1042            "lw     $t3,    12(%[in_int])   \n\t"
1043            "srl    $t0,    $t0,    31      \n\t"
1044            "srl    $t1,    $t1,    31      \n\t"
1045            "srl    $t2,    $t2,    31      \n\t"
1046            "srl    $t3,    $t3,    31      \n\t"
1047            "subu   $t4,    $zero,  %[qc1]  \n\t"
1048            "subu   $t5,    $zero,  %[qc2]  \n\t"
1049            "subu   $t6,    $zero,  %[qc3]  \n\t"
1050            "subu   $t7,    $zero,  %[qc4]  \n\t"
1051            "movn   %[qc1], $t4,    $t0     \n\t"
1052            "movn   %[qc2], $t5,    $t1     \n\t"
1053            "movn   %[qc3], $t6,    $t2     \n\t"
1054            "movn   %[qc4], $t7,    $t3     \n\t"
1055
1056            ".set pop                       \n\t"
1057
1058            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1059              [qc3]"+r"(qc3), [qc4]"+r"(qc4)
1060            : [in_int]"r"(in_int)
1061            : "t0", "t1", "t2", "t3",
1062              "t4", "t5", "t6", "t7",
1063              "memory"
1064        );
1065
1066        curidx  = 9 * qc1;
1067        curidx += qc2 + 40;
1068
1069        curidx2  = 9 * qc3;
1070        curidx2 += qc4 + 40;
1071
1072        curbits += p_bits[curidx] + p_bits[curidx2];
1073    }
1074    return curbits;
1075}
1076
1077static float get_band_numbits_UPAIR7_mips(struct AACEncContext *s,
1078                                          PutBitContext *pb, const float *in,
1079                                          const float *scaled, int size, int scale_idx,
1080                                          int cb, const float lambda, const float uplim,
1081                                          int *bits)
1082{
1083    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1084    int i;
1085    int qc1, qc2, qc3, qc4;
1086    int curbits = 0;
1087
1088    uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
1089
1090    for (i = 0; i < size; i += 4) {
1091        int curidx, curidx2;
1092
1093        qc1 = scaled[i  ] * Q34 + 0.4054f;
1094        qc2 = scaled[i+1] * Q34 + 0.4054f;
1095        qc3 = scaled[i+2] * Q34 + 0.4054f;
1096        qc4 = scaled[i+3] * Q34 + 0.4054f;
1097
1098        __asm__ volatile (
1099            ".set push                      \n\t"
1100            ".set noreorder                 \n\t"
1101
1102            "ori    $t4,    $zero,  7       \n\t"
1103            "slt    $t0,    $t4,    %[qc1]  \n\t"
1104            "slt    $t1,    $t4,    %[qc2]  \n\t"
1105            "slt    $t2,    $t4,    %[qc3]  \n\t"
1106            "slt    $t3,    $t4,    %[qc4]  \n\t"
1107            "movn   %[qc1], $t4,    $t0     \n\t"
1108            "movn   %[qc2], $t4,    $t1     \n\t"
1109            "movn   %[qc3], $t4,    $t2     \n\t"
1110            "movn   %[qc4], $t4,    $t3     \n\t"
1111
1112            ".set pop                       \n\t"
1113
1114            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1115              [qc3]"+r"(qc3), [qc4]"+r"(qc4)
1116            :
1117            : "t0", "t1", "t2", "t3", "t4"
1118        );
1119
1120        curidx  = 8 * qc1;
1121        curidx += qc2;
1122
1123        curidx2  = 8 * qc3;
1124        curidx2 += qc4;
1125
1126        curbits += p_bits[curidx] +
1127                   upair7_sign_bits[curidx] +
1128                   p_bits[curidx2] +
1129                   upair7_sign_bits[curidx2];
1130    }
1131    return curbits;
1132}
1133
1134static float get_band_numbits_UPAIR12_mips(struct AACEncContext *s,
1135                                           PutBitContext *pb, const float *in,
1136                                           const float *scaled, int size, int scale_idx,
1137                                           int cb, const float lambda, const float uplim,
1138                                           int *bits)
1139{
1140    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1141    int i;
1142    int qc1, qc2, qc3, qc4;
1143    int curbits = 0;
1144
1145    uint8_t *p_bits = (uint8_t *)ff_aac_spectral_bits[cb-1];
1146
1147    for (i = 0; i < size; i += 4) {
1148        int curidx, curidx2;
1149
1150        qc1 = scaled[i  ] * Q34 + 0.4054f;
1151        qc2 = scaled[i+1] * Q34 + 0.4054f;
1152        qc3 = scaled[i+2] * Q34 + 0.4054f;
1153        qc4 = scaled[i+3] * Q34 + 0.4054f;
1154
1155        __asm__ volatile (
1156            ".set push                      \n\t"
1157            ".set noreorder                 \n\t"
1158
1159            "ori    $t4,    $zero,  12      \n\t"
1160            "slt    $t0,    $t4,    %[qc1]  \n\t"
1161            "slt    $t1,    $t4,    %[qc2]  \n\t"
1162            "slt    $t2,    $t4,    %[qc3]  \n\t"
1163            "slt    $t3,    $t4,    %[qc4]  \n\t"
1164            "movn   %[qc1], $t4,    $t0     \n\t"
1165            "movn   %[qc2], $t4,    $t1     \n\t"
1166            "movn   %[qc3], $t4,    $t2     \n\t"
1167            "movn   %[qc4], $t4,    $t3     \n\t"
1168
1169            ".set pop                       \n\t"
1170
1171            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1172              [qc3]"+r"(qc3), [qc4]"+r"(qc4)
1173            :
1174            : "t0", "t1", "t2", "t3", "t4"
1175        );
1176
1177        curidx  = 13 * qc1;
1178        curidx += qc2;
1179
1180        curidx2  = 13 * qc3;
1181        curidx2 += qc4;
1182
1183        curbits += p_bits[curidx] +
1184                   p_bits[curidx2] +
1185                   upair12_sign_bits[curidx] +
1186                   upair12_sign_bits[curidx2];
1187    }
1188    return curbits;
1189}
1190
1191static float get_band_numbits_ESC_mips(struct AACEncContext *s,
1192                                       PutBitContext *pb, const float *in,
1193                                       const float *scaled, int size, int scale_idx,
1194                                       int cb, const float lambda, const float uplim,
1195                                       int *bits)
1196{
1197    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1198    int i;
1199    int qc1, qc2, qc3, qc4;
1200    int curbits = 0;
1201
1202    uint8_t *p_bits = (uint8_t*)ff_aac_spectral_bits[cb-1];
1203
1204    for (i = 0; i < size; i += 4) {
1205        int curidx, curidx2;
1206        int cond0, cond1, cond2, cond3;
1207        int c1, c2, c3, c4;
1208
1209        qc1 = scaled[i  ] * Q34 + 0.4054f;
1210        qc2 = scaled[i+1] * Q34 + 0.4054f;
1211        qc3 = scaled[i+2] * Q34 + 0.4054f;
1212        qc4 = scaled[i+3] * Q34 + 0.4054f;
1213
1214        __asm__ volatile (
1215            ".set push                                  \n\t"
1216            ".set noreorder                             \n\t"
1217
1218            "ori        $t4,        $zero,  15          \n\t"
1219            "ori        $t5,        $zero,  16          \n\t"
1220            "shll_s.w   %[c1],      %[qc1], 18          \n\t"
1221            "shll_s.w   %[c2],      %[qc2], 18          \n\t"
1222            "shll_s.w   %[c3],      %[qc3], 18          \n\t"
1223            "shll_s.w   %[c4],      %[qc4], 18          \n\t"
1224            "srl        %[c1],      %[c1],  18          \n\t"
1225            "srl        %[c2],      %[c2],  18          \n\t"
1226            "srl        %[c3],      %[c3],  18          \n\t"
1227            "srl        %[c4],      %[c4],  18          \n\t"
1228            "slt        %[cond0],   $t4,    %[qc1]      \n\t"
1229            "slt        %[cond1],   $t4,    %[qc2]      \n\t"
1230            "slt        %[cond2],   $t4,    %[qc3]      \n\t"
1231            "slt        %[cond3],   $t4,    %[qc4]      \n\t"
1232            "movn       %[qc1],     $t5,    %[cond0]    \n\t"
1233            "movn       %[qc2],     $t5,    %[cond1]    \n\t"
1234            "movn       %[qc3],     $t5,    %[cond2]    \n\t"
1235            "movn       %[qc4],     $t5,    %[cond3]    \n\t"
1236            "ori        $t5,        $zero,  31          \n\t"
1237            "clz        %[c1],      %[c1]               \n\t"
1238            "clz        %[c2],      %[c2]               \n\t"
1239            "clz        %[c3],      %[c3]               \n\t"
1240            "clz        %[c4],      %[c4]               \n\t"
1241            "subu       %[c1],      $t5,    %[c1]       \n\t"
1242            "subu       %[c2],      $t5,    %[c2]       \n\t"
1243            "subu       %[c3],      $t5,    %[c3]       \n\t"
1244            "subu       %[c4],      $t5,    %[c4]       \n\t"
1245            "sll        %[c1],      %[c1],  1           \n\t"
1246            "sll        %[c2],      %[c2],  1           \n\t"
1247            "sll        %[c3],      %[c3],  1           \n\t"
1248            "sll        %[c4],      %[c4],  1           \n\t"
1249            "addiu      %[c1],      %[c1],  -3          \n\t"
1250            "addiu      %[c2],      %[c2],  -3          \n\t"
1251            "addiu      %[c3],      %[c3],  -3          \n\t"
1252            "addiu      %[c4],      %[c4],  -3          \n\t"
1253            "subu       %[cond0],   $zero,  %[cond0]    \n\t"
1254            "subu       %[cond1],   $zero,  %[cond1]    \n\t"
1255            "subu       %[cond2],   $zero,  %[cond2]    \n\t"
1256            "subu       %[cond3],   $zero,  %[cond3]    \n\t"
1257            "and        %[c1],      %[c1],  %[cond0]    \n\t"
1258            "and        %[c2],      %[c2],  %[cond1]    \n\t"
1259            "and        %[c3],      %[c3],  %[cond2]    \n\t"
1260            "and        %[c4],      %[c4],  %[cond3]    \n\t"
1261
1262            ".set pop                                   \n\t"
1263
1264            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1265              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1266              [cond0]"=&r"(cond0), [cond1]"=&r"(cond1),
1267              [cond2]"=&r"(cond2), [cond3]"=&r"(cond3),
1268              [c1]"=&r"(c1), [c2]"=&r"(c2),
1269              [c3]"=&r"(c3), [c4]"=&r"(c4)
1270            :
1271            : "t4", "t5"
1272        );
1273
1274        curidx = 17 * qc1;
1275        curidx += qc2;
1276
1277        curidx2 = 17 * qc3;
1278        curidx2 += qc4;
1279
1280        curbits += p_bits[curidx];
1281        curbits += esc_sign_bits[curidx];
1282        curbits += p_bits[curidx2];
1283        curbits += esc_sign_bits[curidx2];
1284
1285        curbits += c1;
1286        curbits += c2;
1287        curbits += c3;
1288        curbits += c4;
1289    }
1290    return curbits;
1291}
1292
1293static float (*const get_band_numbits_arr[])(struct AACEncContext *s,
1294                                             PutBitContext *pb, const float *in,
1295                                             const float *scaled, int size, int scale_idx,
1296                                             int cb, const float lambda, const float uplim,
1297                                             int *bits) = {
1298    get_band_numbits_ZERO_mips,
1299    get_band_numbits_SQUAD_mips,
1300    get_band_numbits_SQUAD_mips,
1301    get_band_numbits_UQUAD_mips,
1302    get_band_numbits_UQUAD_mips,
1303    get_band_numbits_SPAIR_mips,
1304    get_band_numbits_SPAIR_mips,
1305    get_band_numbits_UPAIR7_mips,
1306    get_band_numbits_UPAIR7_mips,
1307    get_band_numbits_UPAIR12_mips,
1308    get_band_numbits_UPAIR12_mips,
1309    get_band_numbits_ESC_mips,
1310};
1311
/*
 * Forward a bit-count request to the routine stored in get_band_numbits_arr
 * for codebook cb.
 * cb is expanded twice (table index and call argument), so it must be an
 * expression without side effects.
 */
#define get_band_numbits(s, pb, in, scaled, size, scale_idx,     \
                         cb, lambda, uplim, bits)                \
    get_band_numbits_arr[cb](s, pb, in, scaled, size, scale_idx, \
                             cb, lambda, uplim, bits)
1318
/**
 * Return the number of bits needed to code a band with codebook cb.
 *
 * Dispatches to the bit-count routine registered for cb, passing NULL as the
 * bitstream writer. The bit-count routines take a bits argument, but the ones
 * visible in this file never store anything through it, so the count is
 * stored here whenever the caller supplies a destination. This matches the
 * band-cost routines, which fill *bits themselves.
 *
 * @param s         encoder context, passed through to the routine
 * @param in        band coefficients
 * @param scaled    scaled band coefficients
 * @param size      number of coefficients in the band
 * @param scale_idx scalefactor index of the band
 * @param cb        codebook number used to pick the routine
 * @param lambda    passed through to the routine
 * @param uplim     passed through to the routine
 * @param bits      if non-NULL, receives the bit count
 * @return the bit count
 */
static float quantize_band_cost_bits(struct AACEncContext *s, const float *in,
                                     const float *scaled, int size, int scale_idx,
                                     int cb, const float lambda, const float uplim,
                                     int *bits)
{
    const float numbits = get_band_numbits(s, NULL, in, scaled, size, scale_idx,
                                           cb, lambda, uplim, bits);

    if (bits)
        *bits = (int)numbits;
    return numbits;
}
1326
1327/**
1328 * Functions developed from template function and optimized for getting the band cost
1329 */
1330#if HAVE_MIPSFPU
1331static float get_band_cost_ZERO_mips(struct AACEncContext *s,
1332                                     PutBitContext *pb, const float *in,
1333                                     const float *scaled, int size, int scale_idx,
1334                                     int cb, const float lambda, const float uplim,
1335                                     int *bits)
1336{
1337    int i;
1338    float cost = 0;
1339
1340    for (i = 0; i < size; i += 4) {
1341        cost += in[i  ] * in[i  ];
1342        cost += in[i+1] * in[i+1];
1343        cost += in[i+2] * in[i+2];
1344        cost += in[i+3] * in[i+3];
1345    }
1346    if (bits)
1347        *bits = 0;
1348    return cost * lambda;
1349}
1350
1351static float get_band_cost_SQUAD_mips(struct AACEncContext *s,
1352                                      PutBitContext *pb, const float *in,
1353                                      const float *scaled, int size, int scale_idx,
1354                                      int cb, const float lambda, const float uplim,
1355                                      int *bits)
1356{
1357    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1358    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
1359    int i;
1360    float cost = 0;
1361    int qc1, qc2, qc3, qc4;
1362    int curbits = 0;
1363
1364    uint8_t *p_bits  = (uint8_t *)ff_aac_spectral_bits[cb-1];
1365    float   *p_codes = (float   *)ff_aac_codebook_vectors[cb-1];
1366
1367    for (i = 0; i < size; i += 4) {
1368        const float *vec;
1369        int curidx;
1370        int   *in_int = (int   *)&in[i];
1371        float *in_pos = (float *)&in[i];
1372        float di0, di1, di2, di3;
1373
1374        qc1 = scaled[i  ] * Q34 + 0.4054f;
1375        qc2 = scaled[i+1] * Q34 + 0.4054f;
1376        qc3 = scaled[i+2] * Q34 + 0.4054f;
1377        qc4 = scaled[i+3] * Q34 + 0.4054f;
1378
1379        __asm__ volatile (
1380            ".set push                                  \n\t"
1381            ".set noreorder                             \n\t"
1382
1383            "slt        %[qc1], $zero,  %[qc1]          \n\t"
1384            "slt        %[qc2], $zero,  %[qc2]          \n\t"
1385            "slt        %[qc3], $zero,  %[qc3]          \n\t"
1386            "slt        %[qc4], $zero,  %[qc4]          \n\t"
1387            "lw         $t0,    0(%[in_int])            \n\t"
1388            "lw         $t1,    4(%[in_int])            \n\t"
1389            "lw         $t2,    8(%[in_int])            \n\t"
1390            "lw         $t3,    12(%[in_int])           \n\t"
1391            "srl        $t0,    $t0,    31              \n\t"
1392            "srl        $t1,    $t1,    31              \n\t"
1393            "srl        $t2,    $t2,    31              \n\t"
1394            "srl        $t3,    $t3,    31              \n\t"
1395            "subu       $t4,    $zero,  %[qc1]          \n\t"
1396            "subu       $t5,    $zero,  %[qc2]          \n\t"
1397            "subu       $t6,    $zero,  %[qc3]          \n\t"
1398            "subu       $t7,    $zero,  %[qc4]          \n\t"
1399            "movn       %[qc1], $t4,    $t0             \n\t"
1400            "movn       %[qc2], $t5,    $t1             \n\t"
1401            "movn       %[qc3], $t6,    $t2             \n\t"
1402            "movn       %[qc4], $t7,    $t3             \n\t"
1403
1404            ".set pop                                   \n\t"
1405
1406            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1407              [qc3]"+r"(qc3), [qc4]"+r"(qc4)
1408            : [in_int]"r"(in_int)
1409            : "t0", "t1", "t2", "t3",
1410              "t4", "t5", "t6", "t7",
1411              "memory"
1412        );
1413
1414        curidx = qc1;
1415        curidx *= 3;
1416        curidx += qc2;
1417        curidx *= 3;
1418        curidx += qc3;
1419        curidx *= 3;
1420        curidx += qc4;
1421        curidx += 40;
1422
1423        curbits += p_bits[curidx];
1424        vec     = &p_codes[curidx*4];
1425
1426        __asm__ volatile (
1427            ".set push                                  \n\t"
1428            ".set noreorder                             \n\t"
1429
1430            "lwc1       $f0,    0(%[in_pos])            \n\t"
1431            "lwc1       $f1,    0(%[vec])               \n\t"
1432            "lwc1       $f2,    4(%[in_pos])            \n\t"
1433            "lwc1       $f3,    4(%[vec])               \n\t"
1434            "lwc1       $f4,    8(%[in_pos])            \n\t"
1435            "lwc1       $f5,    8(%[vec])               \n\t"
1436            "lwc1       $f6,    12(%[in_pos])           \n\t"
1437            "lwc1       $f7,    12(%[vec])              \n\t"
1438            "nmsub.s    %[di0], $f0,    $f1,    %[IQ]   \n\t"
1439            "nmsub.s    %[di1], $f2,    $f3,    %[IQ]   \n\t"
1440            "nmsub.s    %[di2], $f4,    $f5,    %[IQ]   \n\t"
1441            "nmsub.s    %[di3], $f6,    $f7,    %[IQ]   \n\t"
1442
1443            ".set pop                                   \n\t"
1444
1445            : [di0]"=&f"(di0), [di1]"=&f"(di1),
1446              [di2]"=&f"(di2), [di3]"=&f"(di3)
1447            : [in_pos]"r"(in_pos), [vec]"r"(vec),
1448              [IQ]"f"(IQ)
1449            : "$f0", "$f1", "$f2", "$f3",
1450              "$f4", "$f5", "$f6", "$f7",
1451              "memory"
1452        );
1453
1454        cost += di0 * di0 + di1 * di1
1455                + di2 * di2 + di3 * di3;
1456    }
1457
1458    if (bits)
1459        *bits = curbits;
1460    return cost * lambda + curbits;
1461}
1462
1463static float get_band_cost_UQUAD_mips(struct AACEncContext *s,
1464                                      PutBitContext *pb, const float *in,
1465                                      const float *scaled, int size, int scale_idx,
1466                                      int cb, const float lambda, const float uplim,
1467                                      int *bits)
1468{
1469    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1470    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
1471    int i;
1472    float cost = 0;
1473    int curbits = 0;
1474    int qc1, qc2, qc3, qc4;
1475
1476    uint8_t *p_bits  = (uint8_t*)ff_aac_spectral_bits[cb-1];
1477    float   *p_codes = (float  *)ff_aac_codebook_vectors[cb-1];
1478
1479    for (i = 0; i < size; i += 4) {
1480        const float *vec;
1481        int curidx;
1482        float *in_pos = (float *)&in[i];
1483        float di0, di1, di2, di3;
1484
1485        qc1 = scaled[i  ] * Q34 + 0.4054f;
1486        qc2 = scaled[i+1] * Q34 + 0.4054f;
1487        qc3 = scaled[i+2] * Q34 + 0.4054f;
1488        qc4 = scaled[i+3] * Q34 + 0.4054f;
1489
1490        __asm__ volatile (
1491            ".set push                                  \n\t"
1492            ".set noreorder                             \n\t"
1493
1494            "ori        $t4,    $zero,  2               \n\t"
1495            "slt        $t0,    $t4,    %[qc1]          \n\t"
1496            "slt        $t1,    $t4,    %[qc2]          \n\t"
1497            "slt        $t2,    $t4,    %[qc3]          \n\t"
1498            "slt        $t3,    $t4,    %[qc4]          \n\t"
1499            "movn       %[qc1], $t4,    $t0             \n\t"
1500            "movn       %[qc2], $t4,    $t1             \n\t"
1501            "movn       %[qc3], $t4,    $t2             \n\t"
1502            "movn       %[qc4], $t4,    $t3             \n\t"
1503
1504            ".set pop                                   \n\t"
1505
1506            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1507              [qc3]"+r"(qc3), [qc4]"+r"(qc4)
1508            :
1509            : "t0", "t1", "t2", "t3", "t4"
1510        );
1511
1512        curidx = qc1;
1513        curidx *= 3;
1514        curidx += qc2;
1515        curidx *= 3;
1516        curidx += qc3;
1517        curidx *= 3;
1518        curidx += qc4;
1519
1520        curbits += p_bits[curidx];
1521        curbits += uquad_sign_bits[curidx];
1522        vec     = &p_codes[curidx*4];
1523
1524        __asm__ volatile (
1525            ".set push                                  \n\t"
1526            ".set noreorder                             \n\t"
1527
1528            "lwc1       %[di0], 0(%[in_pos])            \n\t"
1529            "lwc1       %[di1], 4(%[in_pos])            \n\t"
1530            "lwc1       %[di2], 8(%[in_pos])            \n\t"
1531            "lwc1       %[di3], 12(%[in_pos])           \n\t"
1532            "abs.s      %[di0], %[di0]                  \n\t"
1533            "abs.s      %[di1], %[di1]                  \n\t"
1534            "abs.s      %[di2], %[di2]                  \n\t"
1535            "abs.s      %[di3], %[di3]                  \n\t"
1536            "lwc1       $f0,    0(%[vec])               \n\t"
1537            "lwc1       $f1,    4(%[vec])               \n\t"
1538            "lwc1       $f2,    8(%[vec])               \n\t"
1539            "lwc1       $f3,    12(%[vec])              \n\t"
1540            "nmsub.s    %[di0], %[di0], $f0,    %[IQ]   \n\t"
1541            "nmsub.s    %[di1], %[di1], $f1,    %[IQ]   \n\t"
1542            "nmsub.s    %[di2], %[di2], $f2,    %[IQ]   \n\t"
1543            "nmsub.s    %[di3], %[di3], $f3,    %[IQ]   \n\t"
1544
1545            ".set pop                                   \n\t"
1546
1547            : [di0]"=&f"(di0), [di1]"=&f"(di1),
1548              [di2]"=&f"(di2), [di3]"=&f"(di3)
1549            : [in_pos]"r"(in_pos), [vec]"r"(vec),
1550              [IQ]"f"(IQ)
1551            : "$f0", "$f1", "$f2", "$f3",
1552              "memory"
1553        );
1554
1555        cost += di0 * di0 + di1 * di1
1556                + di2 * di2 + di3 * di3;
1557    }
1558
1559    if (bits)
1560        *bits = curbits;
1561    return cost * lambda + curbits;
1562}
1563
1564static float get_band_cost_SPAIR_mips(struct AACEncContext *s,
1565                                      PutBitContext *pb, const float *in,
1566                                      const float *scaled, int size, int scale_idx,
1567                                      int cb, const float lambda, const float uplim,
1568                                      int *bits)
1569{
1570    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1571    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
1572    int i;
1573    float cost = 0;
1574    int qc1, qc2, qc3, qc4;
1575    int curbits = 0;
1576
1577    uint8_t *p_bits  = (uint8_t *)ff_aac_spectral_bits[cb-1];
1578    float   *p_codes = (float   *)ff_aac_codebook_vectors[cb-1];
1579
1580    for (i = 0; i < size; i += 4) {
1581        const float *vec, *vec2;
1582        int curidx, curidx2;
1583        int   *in_int = (int   *)&in[i];
1584        float *in_pos = (float *)&in[i];
1585        float di0, di1, di2, di3;
1586
1587        qc1 = scaled[i  ] * Q34 + 0.4054f;
1588        qc2 = scaled[i+1] * Q34 + 0.4054f;
1589        qc3 = scaled[i+2] * Q34 + 0.4054f;
1590        qc4 = scaled[i+3] * Q34 + 0.4054f;
1591
1592        __asm__ volatile (
1593            ".set push                                  \n\t"
1594            ".set noreorder                             \n\t"
1595
1596            "ori        $t4,    $zero,  4               \n\t"
1597            "slt        $t0,    $t4,    %[qc1]          \n\t"
1598            "slt        $t1,    $t4,    %[qc2]          \n\t"
1599            "slt        $t2,    $t4,    %[qc3]          \n\t"
1600            "slt        $t3,    $t4,    %[qc4]          \n\t"
1601            "movn       %[qc1], $t4,    $t0             \n\t"
1602            "movn       %[qc2], $t4,    $t1             \n\t"
1603            "movn       %[qc3], $t4,    $t2             \n\t"
1604            "movn       %[qc4], $t4,    $t3             \n\t"
1605            "lw         $t0,    0(%[in_int])            \n\t"
1606            "lw         $t1,    4(%[in_int])            \n\t"
1607            "lw         $t2,    8(%[in_int])            \n\t"
1608            "lw         $t3,    12(%[in_int])           \n\t"
1609            "srl        $t0,    $t0,    31              \n\t"
1610            "srl        $t1,    $t1,    31              \n\t"
1611            "srl        $t2,    $t2,    31              \n\t"
1612            "srl        $t3,    $t3,    31              \n\t"
1613            "subu       $t4,    $zero,  %[qc1]          \n\t"
1614            "subu       $t5,    $zero,  %[qc2]          \n\t"
1615            "subu       $t6,    $zero,  %[qc3]          \n\t"
1616            "subu       $t7,    $zero,  %[qc4]          \n\t"
1617            "movn       %[qc1], $t4,    $t0             \n\t"
1618            "movn       %[qc2], $t5,    $t1             \n\t"
1619            "movn       %[qc3], $t6,    $t2             \n\t"
1620            "movn       %[qc4], $t7,    $t3             \n\t"
1621
1622            ".set pop                                   \n\t"
1623
1624            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1625              [qc3]"+r"(qc3), [qc4]"+r"(qc4)
1626            : [in_int]"r"(in_int)
1627            : "t0", "t1", "t2", "t3",
1628              "t4", "t5", "t6", "t7",
1629              "memory"
1630        );
1631
1632        curidx = 9 * qc1;
1633        curidx += qc2 + 40;
1634
1635        curidx2 = 9 * qc3;
1636        curidx2 += qc4 + 40;
1637
1638        curbits += p_bits[curidx];
1639        curbits += p_bits[curidx2];
1640
1641        vec     = &p_codes[curidx*2];
1642        vec2    = &p_codes[curidx2*2];
1643
1644        __asm__ volatile (
1645            ".set push                                  \n\t"
1646            ".set noreorder                             \n\t"
1647
1648            "lwc1       $f0,    0(%[in_pos])            \n\t"
1649            "lwc1       $f1,    0(%[vec])               \n\t"
1650            "lwc1       $f2,    4(%[in_pos])            \n\t"
1651            "lwc1       $f3,    4(%[vec])               \n\t"
1652            "lwc1       $f4,    8(%[in_pos])            \n\t"
1653            "lwc1       $f5,    0(%[vec2])              \n\t"
1654            "lwc1       $f6,    12(%[in_pos])           \n\t"
1655            "lwc1       $f7,    4(%[vec2])              \n\t"
1656            "nmsub.s    %[di0], $f0,    $f1,    %[IQ]   \n\t"
1657            "nmsub.s    %[di1], $f2,    $f3,    %[IQ]   \n\t"
1658            "nmsub.s    %[di2], $f4,    $f5,    %[IQ]   \n\t"
1659            "nmsub.s    %[di3], $f6,    $f7,    %[IQ]   \n\t"
1660
1661            ".set pop                                   \n\t"
1662
1663            : [di0]"=&f"(di0), [di1]"=&f"(di1),
1664              [di2]"=&f"(di2), [di3]"=&f"(di3)
1665            : [in_pos]"r"(in_pos), [vec]"r"(vec),
1666              [vec2]"r"(vec2), [IQ]"f"(IQ)
1667            : "$f0", "$f1", "$f2", "$f3",
1668              "$f4", "$f5", "$f6", "$f7",
1669              "memory"
1670        );
1671
1672        cost += di0 * di0 + di1 * di1
1673                + di2 * di2 + di3 * di3;
1674    }
1675
1676    if (bits)
1677        *bits = curbits;
1678    return cost * lambda + curbits;
1679}
1680
1681static float get_band_cost_UPAIR7_mips(struct AACEncContext *s,
1682                                       PutBitContext *pb, const float *in,
1683                                       const float *scaled, int size, int scale_idx,
1684                                       int cb, const float lambda, const float uplim,
1685                                       int *bits)
1686{
1687    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1688    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
1689    int i;
1690    float cost = 0;
1691    int qc1, qc2, qc3, qc4;
1692    int curbits = 0;
1693
1694    uint8_t *p_bits  = (uint8_t *)ff_aac_spectral_bits[cb-1];
1695    float   *p_codes = (float   *)ff_aac_codebook_vectors[cb-1];
1696
1697    for (i = 0; i < size; i += 4) {
1698        const float *vec, *vec2;
1699        int curidx, curidx2, sign1, count1, sign2, count2;
1700        int   *in_int = (int   *)&in[i];
1701        float *in_pos = (float *)&in[i];
1702        float di0, di1, di2, di3;
1703
1704        qc1 = scaled[i  ] * Q34 + 0.4054f;
1705        qc2 = scaled[i+1] * Q34 + 0.4054f;
1706        qc3 = scaled[i+2] * Q34 + 0.4054f;
1707        qc4 = scaled[i+3] * Q34 + 0.4054f;
1708
1709        __asm__ volatile (
1710            ".set push                                          \n\t"
1711            ".set noreorder                                     \n\t"
1712
1713            "ori        $t4,        $zero,      7               \n\t"
1714            "ori        %[sign1],   $zero,      0               \n\t"
1715            "ori        %[sign2],   $zero,      0               \n\t"
1716            "slt        $t0,        $t4,        %[qc1]          \n\t"
1717            "slt        $t1,        $t4,        %[qc2]          \n\t"
1718            "slt        $t2,        $t4,        %[qc3]          \n\t"
1719            "slt        $t3,        $t4,        %[qc4]          \n\t"
1720            "movn       %[qc1],     $t4,        $t0             \n\t"
1721            "movn       %[qc2],     $t4,        $t1             \n\t"
1722            "movn       %[qc3],     $t4,        $t2             \n\t"
1723            "movn       %[qc4],     $t4,        $t3             \n\t"
1724            "lw         $t0,        0(%[in_int])                \n\t"
1725            "lw         $t1,        4(%[in_int])                \n\t"
1726            "lw         $t2,        8(%[in_int])                \n\t"
1727            "lw         $t3,        12(%[in_int])               \n\t"
1728            "slt        $t0,        $t0,        $zero           \n\t"
1729            "movn       %[sign1],   $t0,        %[qc1]          \n\t"
1730            "slt        $t2,        $t2,        $zero           \n\t"
1731            "movn       %[sign2],   $t2,        %[qc3]          \n\t"
1732            "slt        $t1,        $t1,        $zero           \n\t"
1733            "sll        $t0,        %[sign1],   1               \n\t"
1734            "or         $t0,        $t0,        $t1             \n\t"
1735            "movn       %[sign1],   $t0,        %[qc2]          \n\t"
1736            "slt        $t3,        $t3,        $zero           \n\t"
1737            "sll        $t0,        %[sign2],   1               \n\t"
1738            "or         $t0,        $t0,        $t3             \n\t"
1739            "movn       %[sign2],   $t0,        %[qc4]          \n\t"
1740            "slt        %[count1],  $zero,      %[qc1]          \n\t"
1741            "slt        $t1,        $zero,      %[qc2]          \n\t"
1742            "slt        %[count2],  $zero,      %[qc3]          \n\t"
1743            "slt        $t2,        $zero,      %[qc4]          \n\t"
1744            "addu       %[count1],  %[count1],  $t1             \n\t"
1745            "addu       %[count2],  %[count2],  $t2             \n\t"
1746
1747            ".set pop                                           \n\t"
1748
1749            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1750              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1751              [sign1]"=&r"(sign1), [count1]"=&r"(count1),
1752              [sign2]"=&r"(sign2), [count2]"=&r"(count2)
1753            : [in_int]"r"(in_int)
1754            : "t0", "t1", "t2", "t3", "t4",
1755              "memory"
1756        );
1757
1758        curidx = 8 * qc1;
1759        curidx += qc2;
1760
1761        curidx2 = 8 * qc3;
1762        curidx2 += qc4;
1763
1764        curbits += p_bits[curidx];
1765        curbits += upair7_sign_bits[curidx];
1766        vec     = &p_codes[curidx*2];
1767
1768        curbits += p_bits[curidx2];
1769        curbits += upair7_sign_bits[curidx2];
1770        vec2    = &p_codes[curidx2*2];
1771
1772        __asm__ volatile (
1773            ".set push                                          \n\t"
1774            ".set noreorder                                     \n\t"
1775
1776            "lwc1       %[di0],     0(%[in_pos])                \n\t"
1777            "lwc1       %[di1],     4(%[in_pos])                \n\t"
1778            "lwc1       %[di2],     8(%[in_pos])                \n\t"
1779            "lwc1       %[di3],     12(%[in_pos])               \n\t"
1780            "abs.s      %[di0],     %[di0]                      \n\t"
1781            "abs.s      %[di1],     %[di1]                      \n\t"
1782            "abs.s      %[di2],     %[di2]                      \n\t"
1783            "abs.s      %[di3],     %[di3]                      \n\t"
1784            "lwc1       $f0,        0(%[vec])                   \n\t"
1785            "lwc1       $f1,        4(%[vec])                   \n\t"
1786            "lwc1       $f2,        0(%[vec2])                  \n\t"
1787            "lwc1       $f3,        4(%[vec2])                  \n\t"
1788            "nmsub.s    %[di0],     %[di0],     $f0,    %[IQ]   \n\t"
1789            "nmsub.s    %[di1],     %[di1],     $f1,    %[IQ]   \n\t"
1790            "nmsub.s    %[di2],     %[di2],     $f2,    %[IQ]   \n\t"
1791            "nmsub.s    %[di3],     %[di3],     $f3,    %[IQ]   \n\t"
1792
1793            ".set pop                                           \n\t"
1794
1795            : [di0]"=&f"(di0), [di1]"=&f"(di1),
1796              [di2]"=&f"(di2), [di3]"=&f"(di3)
1797            : [in_pos]"r"(in_pos), [vec]"r"(vec),
1798              [vec2]"r"(vec2), [IQ]"f"(IQ)
1799            : "$f0", "$f1", "$f2", "$f3",
1800              "memory"
1801        );
1802
1803        cost += di0 * di0 + di1 * di1
1804                + di2 * di2 + di3 * di3;
1805    }
1806
1807    if (bits)
1808        *bits = curbits;
1809    return cost * lambda + curbits;
1810}
1811
1812static float get_band_cost_UPAIR12_mips(struct AACEncContext *s,
1813                                        PutBitContext *pb, const float *in,
1814                                        const float *scaled, int size, int scale_idx,
1815                                        int cb, const float lambda, const float uplim,
1816                                        int *bits)
1817{
1818    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1819    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
1820    int i;
1821    float cost = 0;
1822    int qc1, qc2, qc3, qc4;
1823    int curbits = 0;
1824
1825    uint8_t *p_bits  = (uint8_t *)ff_aac_spectral_bits[cb-1];
1826    float   *p_codes = (float   *)ff_aac_codebook_vectors[cb-1];
1827
1828    for (i = 0; i < size; i += 4) {
1829        const float *vec, *vec2;
1830        int curidx, curidx2;
1831        int sign1, count1, sign2, count2;
1832        int   *in_int = (int   *)&in[i];
1833        float *in_pos = (float *)&in[i];
1834        float di0, di1, di2, di3;
1835
1836        qc1 = scaled[i  ] * Q34 + 0.4054f;
1837        qc2 = scaled[i+1] * Q34 + 0.4054f;
1838        qc3 = scaled[i+2] * Q34 + 0.4054f;
1839        qc4 = scaled[i+3] * Q34 + 0.4054f;
1840
1841        __asm__ volatile (
1842            ".set push                                          \n\t"
1843            ".set noreorder                                     \n\t"
1844
1845            "ori        $t4,        $zero,      12              \n\t"
1846            "ori        %[sign1],   $zero,      0               \n\t"
1847            "ori        %[sign2],   $zero,      0               \n\t"
1848            "slt        $t0,        $t4,        %[qc1]          \n\t"
1849            "slt        $t1,        $t4,        %[qc2]          \n\t"
1850            "slt        $t2,        $t4,        %[qc3]          \n\t"
1851            "slt        $t3,        $t4,        %[qc4]          \n\t"
1852            "movn       %[qc1],     $t4,        $t0             \n\t"
1853            "movn       %[qc2],     $t4,        $t1             \n\t"
1854            "movn       %[qc3],     $t4,        $t2             \n\t"
1855            "movn       %[qc4],     $t4,        $t3             \n\t"
1856            "lw         $t0,        0(%[in_int])                \n\t"
1857            "lw         $t1,        4(%[in_int])                \n\t"
1858            "lw         $t2,        8(%[in_int])                \n\t"
1859            "lw         $t3,        12(%[in_int])               \n\t"
1860            "slt        $t0,        $t0,        $zero           \n\t"
1861            "movn       %[sign1],   $t0,        %[qc1]          \n\t"
1862            "slt        $t2,        $t2,        $zero           \n\t"
1863            "movn       %[sign2],   $t2,        %[qc3]          \n\t"
1864            "slt        $t1,        $t1,        $zero           \n\t"
1865            "sll        $t0,        %[sign1],   1               \n\t"
1866            "or         $t0,        $t0,        $t1             \n\t"
1867            "movn       %[sign1],   $t0,        %[qc2]          \n\t"
1868            "slt        $t3,        $t3,        $zero           \n\t"
1869            "sll        $t0,        %[sign2],   1               \n\t"
1870            "or         $t0,        $t0,        $t3             \n\t"
1871            "movn       %[sign2],   $t0,        %[qc4]          \n\t"
1872            "slt        %[count1],  $zero,      %[qc1]          \n\t"
1873            "slt        $t1,        $zero,      %[qc2]          \n\t"
1874            "slt        %[count2],  $zero,      %[qc3]          \n\t"
1875            "slt        $t2,        $zero,      %[qc4]          \n\t"
1876            "addu       %[count1],  %[count1],  $t1             \n\t"
1877            "addu       %[count2],  %[count2],  $t2             \n\t"
1878
1879            ".set pop                                           \n\t"
1880
1881            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1882              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
1883              [sign1]"=&r"(sign1), [count1]"=&r"(count1),
1884              [sign2]"=&r"(sign2), [count2]"=&r"(count2)
1885            : [in_int]"r"(in_int)
1886            : "t0", "t1", "t2", "t3", "t4",
1887              "memory"
1888        );
1889
1890        curidx = 13 * qc1;
1891        curidx += qc2;
1892
1893        curidx2 = 13 * qc3;
1894        curidx2 += qc4;
1895
1896        curbits += p_bits[curidx];
1897        curbits += p_bits[curidx2];
1898        curbits += upair12_sign_bits[curidx];
1899        curbits += upair12_sign_bits[curidx2];
1900        vec     = &p_codes[curidx*2];
1901        vec2    = &p_codes[curidx2*2];
1902
1903        __asm__ volatile (
1904            ".set push                                          \n\t"
1905            ".set noreorder                                     \n\t"
1906
1907            "lwc1       %[di0],     0(%[in_pos])                \n\t"
1908            "lwc1       %[di1],     4(%[in_pos])                \n\t"
1909            "lwc1       %[di2],     8(%[in_pos])                \n\t"
1910            "lwc1       %[di3],     12(%[in_pos])               \n\t"
1911            "abs.s      %[di0],     %[di0]                      \n\t"
1912            "abs.s      %[di1],     %[di1]                      \n\t"
1913            "abs.s      %[di2],     %[di2]                      \n\t"
1914            "abs.s      %[di3],     %[di3]                      \n\t"
1915            "lwc1       $f0,        0(%[vec])                   \n\t"
1916            "lwc1       $f1,        4(%[vec])                   \n\t"
1917            "lwc1       $f2,        0(%[vec2])                  \n\t"
1918            "lwc1       $f3,        4(%[vec2])                  \n\t"
1919            "nmsub.s    %[di0],     %[di0],     $f0,    %[IQ]   \n\t"
1920            "nmsub.s    %[di1],     %[di1],     $f1,    %[IQ]   \n\t"
1921            "nmsub.s    %[di2],     %[di2],     $f2,    %[IQ]   \n\t"
1922            "nmsub.s    %[di3],     %[di3],     $f3,    %[IQ]   \n\t"
1923
1924            ".set pop                                           \n\t"
1925
1926            : [di0]"=&f"(di0), [di1]"=&f"(di1),
1927              [di2]"=&f"(di2), [di3]"=&f"(di3)
1928            : [in_pos]"r"(in_pos), [vec]"r"(vec),
1929              [vec2]"r"(vec2), [IQ]"f"(IQ)
1930            : "$f0", "$f1", "$f2", "$f3",
1931              "memory"
1932        );
1933
1934        cost += di0 * di0 + di1 * di1
1935                + di2 * di2 + di3 * di3;
1936    }
1937
1938    if (bits)
1939        *bits = curbits;
1940    return cost * lambda + curbits;
1941}
1942
1943static float get_band_cost_ESC_mips(struct AACEncContext *s,
1944                                    PutBitContext *pb, const float *in,
1945                                    const float *scaled, int size, int scale_idx,
1946                                    int cb, const float lambda, const float uplim,
1947                                    int *bits)
1948{
1949    const float Q34 = ff_aac_pow34sf_tab[POW_SF2_ZERO - scale_idx + SCALE_ONE_POS - SCALE_DIV_512];
1950    const float IQ  = ff_aac_pow2sf_tab [POW_SF2_ZERO + scale_idx - SCALE_ONE_POS + SCALE_DIV_512];
1951    const float CLIPPED_ESCAPE = 165140.0f * IQ;
1952    int i;
1953    float cost = 0;
1954    int qc1, qc2, qc3, qc4;
1955    int curbits = 0;
1956
1957    uint8_t *p_bits  = (uint8_t*)ff_aac_spectral_bits[cb-1];
1958    float   *p_codes = (float*  )ff_aac_codebook_vectors[cb-1];
1959
1960    for (i = 0; i < size; i += 4) {
1961        const float *vec, *vec2;
1962        int curidx, curidx2;
1963        float t1, t2, t3, t4;
1964        float di1, di2, di3, di4;
1965        int cond0, cond1, cond2, cond3;
1966        int c1, c2, c3, c4;
1967
1968        qc1 = scaled[i  ] * Q34 + 0.4054f;
1969        qc2 = scaled[i+1] * Q34 + 0.4054f;
1970        qc3 = scaled[i+2] * Q34 + 0.4054f;
1971        qc4 = scaled[i+3] * Q34 + 0.4054f;
1972
1973        __asm__ volatile (
1974            ".set push                                  \n\t"
1975            ".set noreorder                             \n\t"
1976
1977            "ori        $t4,        $zero,  15          \n\t"
1978            "ori        $t5,        $zero,  16          \n\t"
1979            "shll_s.w   %[c1],      %[qc1], 18          \n\t"
1980            "shll_s.w   %[c2],      %[qc2], 18          \n\t"
1981            "shll_s.w   %[c3],      %[qc3], 18          \n\t"
1982            "shll_s.w   %[c4],      %[qc4], 18          \n\t"
1983            "srl        %[c1],      %[c1],  18          \n\t"
1984            "srl        %[c2],      %[c2],  18          \n\t"
1985            "srl        %[c3],      %[c3],  18          \n\t"
1986            "srl        %[c4],      %[c4],  18          \n\t"
1987            "slt        %[cond0],   $t4,    %[qc1]      \n\t"
1988            "slt        %[cond1],   $t4,    %[qc2]      \n\t"
1989            "slt        %[cond2],   $t4,    %[qc3]      \n\t"
1990            "slt        %[cond3],   $t4,    %[qc4]      \n\t"
1991            "movn       %[qc1],     $t5,    %[cond0]    \n\t"
1992            "movn       %[qc2],     $t5,    %[cond1]    \n\t"
1993            "movn       %[qc3],     $t5,    %[cond2]    \n\t"
1994            "movn       %[qc4],     $t5,    %[cond3]    \n\t"
1995
1996            ".set pop                                   \n\t"
1997
1998            : [qc1]"+r"(qc1), [qc2]"+r"(qc2),
1999              [qc3]"+r"(qc3), [qc4]"+r"(qc4),
2000              [cond0]"=&r"(cond0), [cond1]"=&r"(cond1),
2001              [cond2]"=&r"(cond2), [cond3]"=&r"(cond3),
2002              [c1]"=&r"(c1), [c2]"=&r"(c2),
2003              [c3]"=&r"(c3), [c4]"=&r"(c4)
2004            :
2005            : "t4", "t5"
2006        );
2007
2008        curidx = 17 * qc1;
2009        curidx += qc2;
2010
2011        curidx2 = 17 * qc3;
2012        curidx2 += qc4;
2013
2014        curbits += p_bits[curidx];
2015        curbits += esc_sign_bits[curidx];
2016        vec     = &p_codes[curidx*2];
2017
2018        curbits += p_bits[curidx2];
2019        curbits += esc_sign_bits[curidx2];
2020        vec2     = &p_codes[curidx2*2];
2021
2022        curbits += (av_log2(c1) * 2 - 3) & (-cond0);
2023        curbits += (av_log2(c2) * 2 - 3) & (-cond1);
2024        curbits += (av_log2(c3) * 2 - 3) & (-cond2);
2025        curbits += (av_log2(c4) * 2 - 3) & (-cond3);
2026
2027        t1 = fabsf(in[i  ]);
2028        t2 = fabsf(in[i+1]);
2029        t3 = fabsf(in[i+2]);
2030        t4 = fabsf(in[i+3]);
2031
2032        if (cond0) {
2033            if (t1 >= CLIPPED_ESCAPE) {
2034                di1 = t1 - CLIPPED_ESCAPE;
2035            } else {
2036                di1 = t1 - c1 * cbrtf(c1) * IQ;
2037            }
2038        } else
2039            di1 = t1 - vec[0] * IQ;
2040
2041        if (cond1) {
2042            if (t2 >= CLIPPED_ESCAPE) {
2043                di2 = t2 - CLIPPED_ESCAPE;
2044            } else {
2045                di2 = t2 - c2 * cbrtf(c2) * IQ;
2046            }
2047        } else
2048            di2 = t2 - vec[1] * IQ;
2049
2050        if (cond2) {
2051            if (t3 >= CLIPPED_ESCAPE) {
2052                di3 = t3 - CLIPPED_ESCAPE;
2053            } else {
2054                di3 = t3 - c3 * cbrtf(c3) * IQ;
2055            }
2056        } else
2057            di3 = t3 - vec2[0] * IQ;
2058
2059        if (cond3) {
2060            if (t4 >= CLIPPED_ESCAPE) {
2061                di4 = t4 - CLIPPED_ESCAPE;
2062            } else {
2063                di4 = t4 - c4 * cbrtf(c4) * IQ;
2064            }
2065        } else
2066            di4 = t4 - vec2[1]*IQ;
2067
2068        cost += di1 * di1 + di2 * di2
2069                + di3 * di3 + di4 * di4;
2070    }
2071
2072    if (bits)
2073        *bits = curbits;
2074    return cost * lambda + curbits;
2075}
2076
2077static float (*const get_band_cost_arr[])(struct AACEncContext *s,
2078                                          PutBitContext *pb, const float *in,
2079                                          const float *scaled, int size, int scale_idx,
2080                                          int cb, const float lambda, const float uplim,
2081                                          int *bits) = {
2082    get_band_cost_ZERO_mips,
2083    get_band_cost_SQUAD_mips,
2084    get_band_cost_SQUAD_mips,
2085    get_band_cost_UQUAD_mips,
2086    get_band_cost_UQUAD_mips,
2087    get_band_cost_SPAIR_mips,
2088    get_band_cost_SPAIR_mips,
2089    get_band_cost_UPAIR7_mips,
2090    get_band_cost_UPAIR7_mips,
2091    get_band_cost_UPAIR12_mips,
2092    get_band_cost_UPAIR12_mips,
2093    get_band_cost_ESC_mips,
2094};
2095
/*
 * Dispatch a band-cost query to the routine stored in get_band_cost_arr for
 * codebook cb. Note that cb is expanded twice (as the table index and as an
 * argument), so it must be an expression without side effects.
 */
#define get_band_cost(s, pb, in, scaled, size, scale_idx, cb,           \
                      lambda, uplim, bits)                              \
    get_band_cost_arr[cb](s, pb, in, scaled, size, scale_idx, cb,       \
                          lambda, uplim, bits)
2102
2103static float quantize_band_cost(struct AACEncContext *s, const float *in,
2104                                const float *scaled, int size, int scale_idx,
2105                                int cb, const float lambda, const float uplim,
2106                                int *bits)
2107{
2108    return get_band_cost(s, NULL, in, scaled, size, scale_idx, cb, lambda, uplim, bits);
2109}
2110
2111static void search_for_quantizers_twoloop_mips(AVCodecContext *avctx,
2112                                               AACEncContext *s,
2113                                               SingleChannelElement *sce,
2114                                               const float lambda)
2115{
2116    int start = 0, i, w, w2, g;
2117    int destbits = avctx->bit_rate * 1024.0 / avctx->sample_rate / avctx->channels;
2118    float dists[128] = { 0 }, uplims[128];
2119    float maxvals[128];
2120    int fflag, minscaler;
2121    int its  = 0;
2122    int allz = 0;
2123    float minthr = INFINITY;
2124
2125    destbits = FFMIN(destbits, 5800);
2126    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
2127        for (g = 0;  g < sce->ics.num_swb; g++) {
2128            int nz = 0;
2129            float uplim = 0.0f;
2130            for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
2131                FFPsyBand *band = &s->psy.ch[s->cur_channel].psy_bands[(w+w2)*16+g];
2132                uplim += band->threshold;
2133                if (band->energy <= band->threshold || band->threshold == 0.0f) {
2134                    sce->zeroes[(w+w2)*16+g] = 1;
2135                    continue;
2136                }
2137                nz = 1;
2138            }
2139            uplims[w*16+g] = uplim *512;
2140            sce->zeroes[w*16+g] = !nz;
2141            if (nz)
2142                minthr = FFMIN(minthr, uplim);
2143            allz |= nz;
2144        }
2145    }
2146    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
2147        for (g = 0;  g < sce->ics.num_swb; g++) {
2148            if (sce->zeroes[w*16+g]) {
2149                sce->sf_idx[w*16+g] = SCALE_ONE_POS;
2150                continue;
2151            }
2152            sce->sf_idx[w*16+g] = SCALE_ONE_POS + FFMIN(log2f(uplims[w*16+g]/minthr)*4,59);
2153        }
2154    }
2155
2156    if (!allz)
2157        return;
2158    abs_pow34_v(s->scoefs, sce->coeffs, 1024);
2159
2160    for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
2161        start = w*128;
2162        for (g = 0;  g < sce->ics.num_swb; g++) {
2163            const float *scaled = s->scoefs + start;
2164            maxvals[w*16+g] = find_max_val(sce->ics.group_len[w], sce->ics.swb_sizes[g], scaled);
2165            start += sce->ics.swb_sizes[g];
2166        }
2167    }
2168
2169    do {
2170        int tbits, qstep;
2171        minscaler = sce->sf_idx[0];
2172        qstep = its ? 1 : 32;
2173        do {
2174            int prev = -1;
2175            tbits = 0;
2176            fflag = 0;
2177
2178            if (qstep > 1) {
2179                for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
2180                    start = w*128;
2181                    for (g = 0;  g < sce->ics.num_swb; g++) {
2182                        const float *coefs = sce->coeffs + start;
2183                        const float *scaled = s->scoefs + start;
2184                        int bits = 0;
2185                        int cb;
2186
2187                        if (sce->zeroes[w*16+g] || sce->sf_idx[w*16+g] >= 218) {
2188                            start += sce->ics.swb_sizes[g];
2189                            continue;
2190                        }
2191                        minscaler = FFMIN(minscaler, sce->sf_idx[w*16+g]);
2192                        cb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
2193                        for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
2194                            int b;
2195                            bits += quantize_band_cost_bits(s, coefs + w2*128,
2196                                                            scaled + w2*128,
2197                                                            sce->ics.swb_sizes[g],
2198                                                            sce->sf_idx[w*16+g],
2199                                                            cb,
2200                                                            1.0f,
2201                                                            INFINITY,
2202                                                            &b);
2203                        }
2204                        if (prev != -1) {
2205                            bits += ff_aac_scalefactor_bits[sce->sf_idx[w*16+g] - prev + SCALE_DIFF_ZERO];
2206                        }
2207                        tbits += bits;
2208                        start += sce->ics.swb_sizes[g];
2209                        prev = sce->sf_idx[w*16+g];
2210                    }
2211                }
2212            }
2213            else {
2214                for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
2215                    start = w*128;
2216                    for (g = 0;  g < sce->ics.num_swb; g++) {
2217                        const float *coefs = sce->coeffs + start;
2218                        const float *scaled = s->scoefs + start;
2219                        int bits = 0;
2220                        int cb;
2221                        float dist = 0.0f;
2222
2223                        if (sce->zeroes[w*16+g] || sce->sf_idx[w*16+g] >= 218) {
2224                            start += sce->ics.swb_sizes[g];
2225                            continue;
2226                        }
2227                        minscaler = FFMIN(minscaler, sce->sf_idx[w*16+g]);
2228                        cb = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
2229                        for (w2 = 0; w2 < sce->ics.group_len[w]; w2++) {
2230                            int b;
2231                            dist += quantize_band_cost(s, coefs + w2*128,
2232                                                       scaled + w2*128,
2233                                                       sce->ics.swb_sizes[g],
2234                                                       sce->sf_idx[w*16+g],
2235                                                       cb,
2236                                                       1.0f,
2237                                                       INFINITY,
2238                                                       &b);
2239                            bits += b;
2240                        }
2241                        dists[w*16+g] = dist - bits;
2242                        if (prev != -1) {
2243                            bits += ff_aac_scalefactor_bits[sce->sf_idx[w*16+g] - prev + SCALE_DIFF_ZERO];
2244                        }
2245                        tbits += bits;
2246                        start += sce->ics.swb_sizes[g];
2247                        prev = sce->sf_idx[w*16+g];
2248                    }
2249                }
2250            }
2251            if (tbits > destbits) {
2252                for (i = 0; i < 128; i++)
2253                    if (sce->sf_idx[i] < 218 - qstep)
2254                        sce->sf_idx[i] += qstep;
2255            } else {
2256                for (i = 0; i < 128; i++)
2257                    if (sce->sf_idx[i] > 60 - qstep)
2258                        sce->sf_idx[i] -= qstep;
2259            }
2260            qstep >>= 1;
2261            if (!qstep && tbits > destbits*1.02 && sce->sf_idx[0] < 217)
2262                qstep = 1;
2263        } while (qstep);
2264
2265        fflag = 0;
2266        minscaler = av_clip(minscaler, 60, 255 - SCALE_MAX_DIFF);
2267        for (w = 0; w < sce->ics.num_windows; w += sce->ics.group_len[w]) {
2268            for (g = 0; g < sce->ics.num_swb; g++) {
2269                int prevsc = sce->sf_idx[w*16+g];
2270                if (dists[w*16+g] > uplims[w*16+g] && sce->sf_idx[w*16+g] > 60) {
2271                    if (find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]-1))
2272                        sce->sf_idx[w*16+g]--;
2273                    else
2274                        sce->sf_idx[w*16+g]-=2;
2275                }
2276                sce->sf_idx[w*16+g] = av_clip(sce->sf_idx[w*16+g], minscaler, minscaler + SCALE_MAX_DIFF);
2277                sce->sf_idx[w*16+g] = FFMIN(sce->sf_idx[w*16+g], 219);
2278                if (sce->sf_idx[w*16+g] != prevsc)
2279                    fflag = 1;
2280                sce->band_type[w*16+g] = find_min_book(maxvals[w*16+g], sce->sf_idx[w*16+g]);
2281            }
2282        }
2283        its++;
2284    } while (fflag && its < 10);
2285}
2286
2287static void search_for_ms_mips(AACEncContext *s, ChannelElement *cpe,
2288                               const float lambda)
2289{
2290    int start = 0, i, w, w2, g;
2291    float M[128], S[128];
2292    float *L34 = s->scoefs, *R34 = s->scoefs + 128, *M34 = s->scoefs + 128*2, *S34 = s->scoefs + 128*3;
2293    SingleChannelElement *sce0 = &cpe->ch[0];
2294    SingleChannelElement *sce1 = &cpe->ch[1];
2295    if (!cpe->common_window)
2296        return;
2297    for (w = 0; w < sce0->ics.num_windows; w += sce0->ics.group_len[w]) {
2298        for (g = 0;  g < sce0->ics.num_swb; g++) {
2299            if (!cpe->ch[0].zeroes[w*16+g] && !cpe->ch[1].zeroes[w*16+g]) {
2300                float dist1 = 0.0f, dist2 = 0.0f;
2301                for (w2 = 0; w2 < sce0->ics.group_len[w]; w2++) {
2302                    FFPsyBand *band0 = &s->psy.ch[s->cur_channel+0].psy_bands[(w+w2)*16+g];
2303                    FFPsyBand *band1 = &s->psy.ch[s->cur_channel+1].psy_bands[(w+w2)*16+g];
2304                    float minthr = FFMIN(band0->threshold, band1->threshold);
2305                    float maxthr = FFMAX(band0->threshold, band1->threshold);
2306                    for (i = 0; i < sce0->ics.swb_sizes[g]; i+=4) {
2307                        M[i  ] = (sce0->coeffs[start+w2*128+i  ]
2308                                + sce1->coeffs[start+w2*128+i  ]) * 0.5;
2309                        M[i+1] = (sce0->coeffs[start+w2*128+i+1]
2310                                + sce1->coeffs[start+w2*128+i+1]) * 0.5;
2311                        M[i+2] = (sce0->coeffs[start+w2*128+i+2]
2312                                + sce1->coeffs[start+w2*128+i+2]) * 0.5;
2313                        M[i+3] = (sce0->coeffs[start+w2*128+i+3]
2314                                + sce1->coeffs[start+w2*128+i+3]) * 0.5;
2315
2316                        S[i  ] =  M[i  ]
2317                                - sce1->coeffs[start+w2*128+i  ];
2318                        S[i+1] =  M[i+1]
2319                                - sce1->coeffs[start+w2*128+i+1];
2320                        S[i+2] =  M[i+2]
2321                                - sce1->coeffs[start+w2*128+i+2];
2322                        S[i+3] =  M[i+3]
2323                                - sce1->coeffs[start+w2*128+i+3];
2324                   }
2325                    abs_pow34_v(L34, sce0->coeffs+start+w2*128, sce0->ics.swb_sizes[g]);
2326                    abs_pow34_v(R34, sce1->coeffs+start+w2*128, sce0->ics.swb_sizes[g]);
2327                    abs_pow34_v(M34, M,                         sce0->ics.swb_sizes[g]);
2328                    abs_pow34_v(S34, S,                         sce0->ics.swb_sizes[g]);
2329                    dist1 += quantize_band_cost(s, sce0->coeffs + start + w2*128,
2330                                                L34,
2331                                                sce0->ics.swb_sizes[g],
2332                                                sce0->sf_idx[(w+w2)*16+g],
2333                                                sce0->band_type[(w+w2)*16+g],
2334                                                lambda / band0->threshold, INFINITY, NULL);
2335                    dist1 += quantize_band_cost(s, sce1->coeffs + start + w2*128,
2336                                                R34,
2337                                                sce1->ics.swb_sizes[g],
2338                                                sce1->sf_idx[(w+w2)*16+g],
2339                                                sce1->band_type[(w+w2)*16+g],
2340                                                lambda / band1->threshold, INFINITY, NULL);
2341                    dist2 += quantize_band_cost(s, M,
2342                                                M34,
2343                                                sce0->ics.swb_sizes[g],
2344                                                sce0->sf_idx[(w+w2)*16+g],
2345                                                sce0->band_type[(w+w2)*16+g],
2346                                                lambda / maxthr, INFINITY, NULL);
2347                    dist2 += quantize_band_cost(s, S,
2348                                                S34,
2349                                                sce1->ics.swb_sizes[g],
2350                                                sce1->sf_idx[(w+w2)*16+g],
2351                                                sce1->band_type[(w+w2)*16+g],
2352                                                lambda / minthr, INFINITY, NULL);
2353                }
2354                cpe->ms_mask[w*16+g] = dist2 < dist1;
2355            }
2356            start += sce0->ics.swb_sizes[g];
2357        }
2358    }
2359}
2360#endif /*HAVE_MIPSFPU */
2361
2362static void codebook_trellis_rate_mips(AACEncContext *s, SingleChannelElement *sce,
2363                                       int win, int group_len, const float lambda)
2364{
2365    BandCodingPath path[120][12];
2366    int w, swb, cb, start, size;
2367    int i, j;
2368    const int max_sfb  = sce->ics.max_sfb;
2369    const int run_bits = sce->ics.num_windows == 1 ? 5 : 3;
2370    const int run_esc  = (1 << run_bits) - 1;
2371    int idx, ppos, count;
2372    int stackrun[120], stackcb[120], stack_len;
2373    float next_minbits = INFINITY;
2374    int next_mincb = 0;
2375
2376    abs_pow34_v(s->scoefs, sce->coeffs, 1024);
2377    start = win*128;
2378    for (cb = 0; cb < 12; cb++) {
2379        path[0][cb].cost     = run_bits+4;
2380        path[0][cb].prev_idx = -1;
2381        path[0][cb].run      = 0;
2382    }
2383    for (swb = 0; swb < max_sfb; swb++) {
2384        size = sce->ics.swb_sizes[swb];
2385        if (sce->zeroes[win*16 + swb]) {
2386            float cost_stay_here = path[swb][0].cost;
2387            float cost_get_here  = next_minbits + run_bits + 4;
2388            if (   run_value_bits[sce->ics.num_windows == 8][path[swb][0].run]
2389                != run_value_bits[sce->ics.num_windows == 8][path[swb][0].run+1])
2390                cost_stay_here += run_bits;
2391            if (cost_get_here < cost_stay_here) {
2392                path[swb+1][0].prev_idx = next_mincb;
2393                path[swb+1][0].cost     = cost_get_here;
2394                path[swb+1][0].run      = 1;
2395            } else {
2396                path[swb+1][0].prev_idx = 0;
2397                path[swb+1][0].cost     = cost_stay_here;
2398                path[swb+1][0].run      = path[swb][0].run + 1;
2399            }
2400            next_minbits = path[swb+1][0].cost;
2401            next_mincb = 0;
2402            for (cb = 1; cb < 12; cb++) {
2403                path[swb+1][cb].cost = 61450;
2404                path[swb+1][cb].prev_idx = -1;
2405                path[swb+1][cb].run = 0;
2406            }
2407        } else {
2408            float minbits = next_minbits;
2409            int mincb = next_mincb;
2410            int startcb = sce->band_type[win*16+swb];
2411            next_minbits = INFINITY;
2412            next_mincb = 0;
2413            for (cb = 0; cb < startcb; cb++) {
2414                path[swb+1][cb].cost = 61450;
2415                path[swb+1][cb].prev_idx = -1;
2416                path[swb+1][cb].run = 0;
2417            }
2418            for (cb = startcb; cb < 12; cb++) {
2419                float cost_stay_here, cost_get_here;
2420                float bits = 0.0f;
2421                for (w = 0; w < group_len; w++) {
2422                    bits += quantize_band_cost_bits(s, sce->coeffs + start + w*128,
2423                                                    s->scoefs + start + w*128, size,
2424                                                    sce->sf_idx[(win+w)*16+swb], cb,
2425                                                    0, INFINITY, NULL);
2426                }
2427                cost_stay_here = path[swb][cb].cost + bits;
2428                cost_get_here  = minbits            + bits + run_bits + 4;
2429                if (   run_value_bits[sce->ics.num_windows == 8][path[swb][cb].run]
2430                    != run_value_bits[sce->ics.num_windows == 8][path[swb][cb].run+1])
2431                    cost_stay_here += run_bits;
2432                if (cost_get_here < cost_stay_here) {
2433                    path[swb+1][cb].prev_idx = mincb;
2434                    path[swb+1][cb].cost     = cost_get_here;
2435                    path[swb+1][cb].run      = 1;
2436                } else {
2437                    path[swb+1][cb].prev_idx = cb;
2438                    path[swb+1][cb].cost     = cost_stay_here;
2439                    path[swb+1][cb].run      = path[swb][cb].run + 1;
2440                }
2441                if (path[swb+1][cb].cost < next_minbits) {
2442                    next_minbits = path[swb+1][cb].cost;
2443                    next_mincb = cb;
2444                }
2445            }
2446        }
2447        start += sce->ics.swb_sizes[swb];
2448    }
2449
2450    stack_len = 0;
2451    idx       = 0;
2452    for (cb = 1; cb < 12; cb++)
2453        if (path[max_sfb][cb].cost < path[max_sfb][idx].cost)
2454            idx = cb;
2455    ppos = max_sfb;
2456    while (ppos > 0) {
2457        av_assert1(idx >= 0);
2458        cb = idx;
2459        stackrun[stack_len] = path[ppos][cb].run;
2460        stackcb [stack_len] = cb;
2461        idx = path[ppos-path[ppos][cb].run+1][cb].prev_idx;
2462        ppos -= path[ppos][cb].run;
2463        stack_len++;
2464    }
2465
2466    start = 0;
2467    for (i = stack_len - 1; i >= 0; i--) {
2468        put_bits(&s->pb, 4, stackcb[i]);
2469        count = stackrun[i];
2470        memset(sce->zeroes + win*16 + start, !stackcb[i], count);
2471        for (j = 0; j < count; j++) {
2472            sce->band_type[win*16 + start] =  stackcb[i];
2473            start++;
2474        }
2475        while (count >= run_esc) {
2476            put_bits(&s->pb, run_bits, run_esc);
2477            count -= run_esc;
2478        }
2479        put_bits(&s->pb, run_bits, count);
2480    }
2481}
2482#endif /* HAVE_INLINE_ASM */
2483
2484void ff_aac_coder_init_mips(AACEncContext *c) {
2485#if HAVE_INLINE_ASM
2486    AACCoefficientsEncoder *e = c->coder;
2487    int option = c->options.aac_coder;
2488
2489    if (option == 2) {
2490        e->quantize_and_encode_band = quantize_and_encode_band_mips;
2491        e->encode_window_bands_info = codebook_trellis_rate_mips;
2492#if HAVE_MIPSFPU
2493        e->search_for_quantizers    = search_for_quantizers_twoloop_mips;
2494        e->search_for_ms            = search_for_ms_mips;
2495#endif /* HAVE_MIPSFPU */
2496    }
2497#endif /* HAVE_INLINE_ASM */
2498}
2499