/**
 * VP8 compatible video decoder
 *
 * Copyright (C) 2010 David Conrad
 * Copyright (C) 2010 Ronald S. Bultje
 * Copyright (C) 2010 Jason Garrett-Glaser
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/imgutils.h"
#include "avcodec.h"
#include "internal.h"
#include "vp8.h"
#include "vp8data.h"
#include "rectangle.h"
#include "thread.h"

#if ARCH_ARM
#   include "arm/vp8.h"
#endif

static void free_buffers(VP8Context *s)
{
    av_freep(&s->macroblocks_base);
    av_freep(&s->filter_strength);
    av_freep(&s->intra4x4_pred_mode_top);
    av_freep(&s->top_nnz);
    av_freep(&s->edge_emu_buffer);
    av_freep(&s->top_border);

    s->macroblocks = NULL;
}

static int vp8_alloc_frame(VP8Context *s, AVFrame *f)
{
    int ret;
    if ((ret = ff_thread_get_buffer(s->avctx, f)) < 0)
        return ret;
    if (s->num_maps_to_be_freed && !s->maps_are_invalid) {
        f->ref_index[0] = s->segmentation_maps[--s->num_maps_to_be_freed];
    } else if (!(f->ref_index[0] = av_mallocz(s->mb_width * s->mb_height))) {
        ff_thread_release_buffer(s->avctx, f);
        return AVERROR(ENOMEM);
    }
    return 0;
}

static void vp8_release_frame(VP8Context *s, AVFrame *f, int prefer_delayed_free, int can_direct_free)
{
    if (f->ref_index[0]) {
        if (prefer_delayed_free) {
            /* Upon a size change, we want to free the maps but other threads may still
             * be using them, so queue them. Upon a seek, all threads are inactive so
             * we want to cache one to prevent re-allocation in the next decoding
             * iteration, but the rest we can free directly. */
            int max_queued_maps = can_direct_free ? 1 : FF_ARRAY_ELEMS(s->segmentation_maps);
            if (s->num_maps_to_be_freed < max_queued_maps) {
                s->segmentation_maps[s->num_maps_to_be_freed++] = f->ref_index[0];
            } else if (can_direct_free) /* vp8_decode_flush(), but our queue is full */ {
                av_free(f->ref_index[0]);
            } /* else: MEMLEAK (should never happen, but better that than crash) */
            f->ref_index[0] = NULL;
        } else /* vp8_decode_free() */ {
            av_free(f->ref_index[0]);
        }
    }
    ff_thread_release_buffer(s->avctx, f);
}

static void vp8_decode_flush_impl(AVCodecContext *avctx,
                                  int prefer_delayed_free, int can_direct_free, int free_mem)
{
    VP8Context *s = avctx->priv_data;
    int i;

    if (!avctx->internal->is_copy) {
        for (i = 0; i < 5; i++)
            if (s->frames[i].data[0])
                vp8_release_frame(s, &s->frames[i], prefer_delayed_free, can_direct_free);
    }
    memset(s->framep, 0, sizeof(s->framep));

    if (free_mem) {
        free_buffers(s);
        s->maps_are_invalid = 1;
    }
}

static void vp8_decode_flush(AVCodecContext *avctx)
{
    vp8_decode_flush_impl(avctx, 1, 1, 0);
}

static int update_dimensions(VP8Context *s, int width, int height)
{
    if (width  != s->avctx->width ||
        height != s->avctx->height) {
        if (av_image_check_size(width, height, 0, s->avctx))
            return AVERROR_INVALIDDATA;

        vp8_decode_flush_impl(s->avctx, 1, 0, 1);

        avcodec_set_dimensions(s->avctx, width, height);
    }

    s->mb_width  = (s->avctx->coded_width +15) / 16;
    s->mb_height = (s->avctx->coded_height+15) / 16;
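    /* Dimensions are rounded up to whole 16x16 macroblocks,
     * e.g. 1920x1080 yields 120x68 macroblocks. */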

    s->macroblocks_base        = av_mallocz((s->mb_width+s->mb_height*2+1)*sizeof(*s->macroblocks));
    s->filter_strength         = av_mallocz(s->mb_width*sizeof(*s->filter_strength));
    s->intra4x4_pred_mode_top  = av_mallocz(s->mb_width*4);
    s->top_nnz                 = av_mallocz(s->mb_width*sizeof(*s->top_nnz));
    s->top_border              = av_mallocz((s->mb_width+1)*sizeof(*s->top_border));

    if (!s->macroblocks_base || !s->filter_strength || !s->intra4x4_pred_mode_top ||
        !s->top_nnz || !s->top_border)
        return AVERROR(ENOMEM);

    s->macroblocks        = s->macroblocks_base + 1;

    return 0;
}

static void parse_segment_info(VP8Context *s)
{
    VP56RangeCoder *c = &s->c;
    int i;

    s->segmentation.update_map = vp8_rac_get(c);

    if (vp8_rac_get(c)) { // update segment feature data
        s->segmentation.absolute_vals = vp8_rac_get(c);

        for (i = 0; i < 4; i++)
            s->segmentation.base_quant[i]   = vp8_rac_get_sint(c, 7);

        for (i = 0; i < 4; i++)
            s->segmentation.filter_level[i] = vp8_rac_get_sint(c, 6);
    }
    if (s->segmentation.update_map)
        for (i = 0; i < 3; i++)
            s->prob->segmentid[i] = vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
}

static void update_lf_deltas(VP8Context *s)
{
    VP56RangeCoder *c = &s->c;
    int i;

    for (i = 0; i < 4; i++)
        s->lf_delta.ref[i]  = vp8_rac_get_sint(c, 6);

    for (i = MODE_I4x4; i <= VP8_MVMODE_SPLIT; i++)
        s->lf_delta.mode[i] = vp8_rac_get_sint(c, 6);
}

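/* Token partition framing: a 2-bit field in the header gives log2 of the
 * partition count (1, 2, 4 or 8). The sizes of all partitions except the
 * last are stored in front of the partition data as 3-byte little-endian
 * values; the last partition simply runs to the end of the buffer. */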
static int setup_partitions(VP8Context *s, const uint8_t *buf, int buf_size)
{
    const uint8_t *sizes = buf;
    int i;

    s->num_coeff_partitions = 1 << vp8_rac_get_uint(&s->c, 2);

    buf      += 3*(s->num_coeff_partitions-1);
    buf_size -= 3*(s->num_coeff_partitions-1);
    if (buf_size < 0)
        return -1;

    for (i = 0; i < s->num_coeff_partitions-1; i++) {
        int size = AV_RL24(sizes + 3*i);
        if (buf_size - size < 0)
            return -1;

        ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, size);
        buf      += size;
        buf_size -= size;
    }
    ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, buf_size);

    return 0;
}

static void get_quants(VP8Context *s)
{
    VP56RangeCoder *c = &s->c;
    int i, base_qi;

    int yac_qi     = vp8_rac_get_uint(c, 7);
    int ydc_delta  = vp8_rac_get_sint(c, 4);
    int y2dc_delta = vp8_rac_get_sint(c, 4);
    int y2ac_delta = vp8_rac_get_sint(c, 4);
    int uvdc_delta = vp8_rac_get_sint(c, 4);
    int uvac_delta = vp8_rac_get_sint(c, 4);

    for (i = 0; i < 4; i++) {
        if (s->segmentation.enabled) {
            base_qi = s->segmentation.base_quant[i];
            if (!s->segmentation.absolute_vals)
                base_qi += yac_qi;
        } else
            base_qi = yac_qi;

        s->qmat[i].luma_qmul[0]    =       vp8_dc_qlookup[av_clip_uintp2(base_qi + ydc_delta , 7)];
        s->qmat[i].luma_qmul[1]    =       vp8_ac_qlookup[av_clip_uintp2(base_qi             , 7)];
        s->qmat[i].luma_dc_qmul[0] =   2 * vp8_dc_qlookup[av_clip_uintp2(base_qi + y2dc_delta, 7)];
        s->qmat[i].luma_dc_qmul[1] = 155 * vp8_ac_qlookup[av_clip_uintp2(base_qi + y2ac_delta, 7)] / 100;
        s->qmat[i].chroma_qmul[0]  =       vp8_dc_qlookup[av_clip_uintp2(base_qi + uvdc_delta, 7)];
        s->qmat[i].chroma_qmul[1]  =       vp8_ac_qlookup[av_clip_uintp2(base_qi + uvac_delta, 7)];

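        /* Dequantization fixups from the VP8 spec (RFC 6386): the
         * second-order luma AC factor has a floor of 8 and the chroma DC
         * factor a cap of 132. */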
        s->qmat[i].luma_dc_qmul[1] = FFMAX(s->qmat[i].luma_dc_qmul[1], 8);
        s->qmat[i].chroma_qmul[0]  = FFMIN(s->qmat[i].chroma_qmul[0], 132);
    }
}

/**
 * Determine which buffers golden and altref should be updated with after this frame.
 * The spec isn't clear here, so I'm going by my understanding of what libvpx does.
 *
 * Intra frames update all 3 references.
 * Inter frames update VP56_FRAME_PREVIOUS if the update_last flag is set.
 * If the update (golden|altref) flag is set, it's updated with the current frame
 *      if update_last is set, and VP56_FRAME_PREVIOUS otherwise.
 * If the flag is not set, the number read means:
 *      0: no update
 *      1: VP56_FRAME_PREVIOUS
 *      2: update golden with altref, or update altref with golden
 */
static VP56Frame ref_to_update(VP8Context *s, int update, VP56Frame ref)
{
    VP56RangeCoder *c = &s->c;

    if (update)
        return VP56_FRAME_CURRENT;

    switch (vp8_rac_get_uint(c, 2)) {
    case 1:
        return VP56_FRAME_PREVIOUS;
    case 2:
        return (ref == VP56_FRAME_GOLDEN) ? VP56_FRAME_GOLDEN2 : VP56_FRAME_GOLDEN;
    }
    return VP56_FRAME_NONE;
}

static void update_refs(VP8Context *s)
{
    VP56RangeCoder *c = &s->c;

    int update_golden = vp8_rac_get(c);
    int update_altref = vp8_rac_get(c);

    s->update_golden = ref_to_update(s, update_golden, VP56_FRAME_GOLDEN);
    s->update_altref = ref_to_update(s, update_altref, VP56_FRAME_GOLDEN2);
}

static int decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
{
    VP56RangeCoder *c = &s->c;
    int header_size, hscale, vscale, i, j, k, l, m, ret;
    int width  = s->avctx->width;
    int height = s->avctx->height;

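    /* 3-byte frame tag, read little-endian: bit 0 is the inverse keyframe
     * flag, bits 1-3 the profile, bit 4 the show_frame flag and bits 5-23
     * the size of the first ("header") partition. */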
    s->keyframe  = !(buf[0] & 1);
    s->profile   =  (buf[0]>>1) & 7;
    s->invisible = !(buf[0] & 0x10);
    header_size  = AV_RL24(buf) >> 5;
    buf      += 3;
    buf_size -= 3;

    if (s->profile > 3)
        av_log(s->avctx, AV_LOG_WARNING, "Unknown profile %d\n", s->profile);

    if (!s->profile)
        memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab, sizeof(s->put_pixels_tab));
    else    // profiles 1-3 use bilinear; 4+ aren't defined, so use bilinear too
        memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_bilinear_pixels_tab, sizeof(s->put_pixels_tab));

    if (header_size > buf_size - 7*s->keyframe) {
        av_log(s->avctx, AV_LOG_ERROR, "Header size larger than data provided\n");
        return AVERROR_INVALIDDATA;
    }

    if (s->keyframe) {
        if (AV_RL24(buf) != 0x2a019d) {
            av_log(s->avctx, AV_LOG_ERROR, "Invalid start code 0x%x\n", AV_RL24(buf));
            return AVERROR_INVALIDDATA;
        }
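        /* 14-bit dimensions; the top two bits of each 16-bit field carry the
         * horizontal/vertical upscaling factor, which we do not support. */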
        width  = AV_RL16(buf+3) & 0x3fff;
        height = AV_RL16(buf+5) & 0x3fff;
        hscale = buf[4] >> 6;
        vscale = buf[6] >> 6;
        buf      += 7;
        buf_size -= 7;

        if (hscale || vscale)
            av_log_missing_feature(s->avctx, "Upscaling", 1);

        s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
        for (i = 0; i < 4; i++)
            for (j = 0; j < 16; j++)
                memcpy(s->prob->token[i][j], vp8_token_default_probs[i][vp8_coeff_band[j]],
                       sizeof(s->prob->token[i][j]));
        memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter, sizeof(s->prob->pred16x16));
        memcpy(s->prob->pred8x8c , vp8_pred8x8c_prob_inter , sizeof(s->prob->pred8x8c));
        memcpy(s->prob->mvc      , vp8_mv_default_prob     , sizeof(s->prob->mvc));
        memset(&s->segmentation, 0, sizeof(s->segmentation));
        memset(&s->lf_delta, 0, sizeof(s->lf_delta));
    }

    if (!s->macroblocks_base || /* first frame */
        width != s->avctx->width || height != s->avctx->height) {
        if ((ret = update_dimensions(s, width, height)) < 0)
            return ret;
    }

    ff_vp56_init_range_decoder(c, buf, header_size);
    buf      += header_size;
    buf_size -= header_size;

    if (s->keyframe) {
        if (vp8_rac_get(c))
            av_log(s->avctx, AV_LOG_WARNING, "Unspecified colorspace\n");
        vp8_rac_get(c); // whether we can skip clamping in dsp functions
    }

    if ((s->segmentation.enabled = vp8_rac_get(c)))
        parse_segment_info(s);
    else
        s->segmentation.update_map = 0; // FIXME: move this to some init function?

    s->filter.simple    = vp8_rac_get(c);
    s->filter.level     = vp8_rac_get_uint(c, 6);
    s->filter.sharpness = vp8_rac_get_uint(c, 3);

    if ((s->lf_delta.enabled = vp8_rac_get(c)))
        if (vp8_rac_get(c))
            update_lf_deltas(s);

    if (setup_partitions(s, buf, buf_size)) {
        av_log(s->avctx, AV_LOG_ERROR, "Invalid partitions\n");
        return AVERROR_INVALIDDATA;
    }

    get_quants(s);

    if (!s->keyframe) {
        update_refs(s);
        s->sign_bias[VP56_FRAME_GOLDEN]               = vp8_rac_get(c);
        s->sign_bias[VP56_FRAME_GOLDEN2 /* altref */] = vp8_rac_get(c);
    }

    // if we aren't saving this frame's probabilities for future frames,
    // make a copy of the current probabilities
    if (!(s->update_probabilities = vp8_rac_get(c)))
        s->prob[1] = s->prob[0];

    s->update_last = s->keyframe || vp8_rac_get(c);

    for (i = 0; i < 4; i++)
        for (j = 0; j < 8; j++)
            for (k = 0; k < 3; k++)
                for (l = 0; l < NUM_DCT_TOKENS-1; l++)
                    if (vp56_rac_get_prob_branchy(c, vp8_token_update_probs[i][j][k][l])) {
                        int prob = vp8_rac_get_uint(c, 8);
                        for (m = 0; vp8_coeff_band_indexes[j][m] >= 0; m++)
                            s->prob->token[i][vp8_coeff_band_indexes[j][m]][k][l] = prob;
                    }

    if ((s->mbskip_enabled = vp8_rac_get(c)))
        s->prob->mbskip = vp8_rac_get_uint(c, 8);

    if (!s->keyframe) {
        s->prob->intra  = vp8_rac_get_uint(c, 8);
        s->prob->last   = vp8_rac_get_uint(c, 8);
        s->prob->golden = vp8_rac_get_uint(c, 8);

        if (vp8_rac_get(c))
            for (i = 0; i < 4; i++)
                s->prob->pred16x16[i] = vp8_rac_get_uint(c, 8);
        if (vp8_rac_get(c))
            for (i = 0; i < 3; i++)
                s->prob->pred8x8c[i]  = vp8_rac_get_uint(c, 8);

        // 17.2 MV probability update
        for (i = 0; i < 2; i++)
            for (j = 0; j < 19; j++)
                if (vp56_rac_get_prob_branchy(c, vp8_mv_update_prob[i][j]))
                    s->prob->mvc[i][j] = vp8_rac_get_nn(c);
    }

    return 0;
}

static av_always_inline void clamp_mv(VP8Context *s, VP56mv *dst, const VP56mv *src)
{
    dst->x = av_clip(src->x, s->mv_min.x, s->mv_max.x);
    dst->y = av_clip(src->y, s->mv_min.y, s->mv_max.y);
}

/**
 * Motion vector coding, 17.1.
 */
static int read_mv_component(VP56RangeCoder *c, const uint8_t *p)
{
    int bit, x = 0;

    if (vp56_rac_get_prob_branchy(c, p[0])) {
        int i;

        for (i = 0; i < 3; i++)
            x += vp56_rac_get_prob(c, p[9 + i]) << i;
        for (i = 9; i > 3; i--)
            x += vp56_rac_get_prob(c, p[9 + i]) << i;
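        /* Bit 3 is coded last: if bits 4-9 are all zero, the magnitude must
         * still be at least 8 (smaller values use the small tree below), so
         * bit 3 is implied and not transmitted. */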
        if (!(x & 0xFFF0) || vp56_rac_get_prob(c, p[12]))
            x += 8;
    } else {
        // small_mvtree
        const uint8_t *ps = p+2;
        bit = vp56_rac_get_prob(c, *ps);
        ps += 1 + 3*bit;
        x  += 4*bit;
        bit = vp56_rac_get_prob(c, *ps);
        ps += 1 + bit;
        x  += 2*bit;
        x  += vp56_rac_get_prob(c, *ps);
    }

    return (x && vp56_rac_get_prob(c, p[1])) ? -x : x;
}

static av_always_inline
const uint8_t *get_submv_prob(uint32_t left, uint32_t top)
{
    if (left == top)
        return vp8_submv_prob[4-!!left];
    if (!top)
        return vp8_submv_prob[2];
    return vp8_submv_prob[1-!!left];
}

/**
 * Split motion vector prediction, 16.4.
 * @returns the number of motion vectors parsed (2, 4 or 16)
 */
static av_always_inline
int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb)
{
    int part_idx;
    int n, num;
    VP8Macroblock *top_mb  = &mb[2];
    VP8Macroblock *left_mb = &mb[-1];
    const uint8_t *mbsplits_left = vp8_mbsplits[left_mb->partitioning],
                  *mbsplits_top = vp8_mbsplits[top_mb->partitioning],
                  *mbsplits_cur, *firstidx;
    VP56mv *top_mv  = top_mb->bmv;
    VP56mv *left_mv = left_mb->bmv;
    VP56mv *cur_mv  = mb->bmv;

    if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[0])) {
        if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[1])) {
            part_idx = VP8_SPLITMVMODE_16x8 + vp56_rac_get_prob(c, vp8_mbsplit_prob[2]);
        } else {
            part_idx = VP8_SPLITMVMODE_8x8;
        }
    } else {
        part_idx = VP8_SPLITMVMODE_4x4;
    }

    num = vp8_mbsplit_count[part_idx];
    mbsplits_cur = vp8_mbsplits[part_idx];
    firstidx = vp8_mbfirstidx[part_idx];
    mb->partitioning = part_idx;

    for (n = 0; n < num; n++) {
        int k = firstidx[n];
        uint32_t left, above;
        const uint8_t *submv_prob;

        if (!(k & 3))
            left = AV_RN32A(&left_mv[mbsplits_left[k + 3]]);
        else
            left = AV_RN32A(&cur_mv[mbsplits_cur[k - 1]]);
        if (k <= 3)
            above = AV_RN32A(&top_mv[mbsplits_top[k + 12]]);
        else
            above = AV_RN32A(&cur_mv[mbsplits_cur[k - 4]]);

        submv_prob = get_submv_prob(left, above);

        if (vp56_rac_get_prob_branchy(c, submv_prob[0])) {
            if (vp56_rac_get_prob_branchy(c, submv_prob[1])) {
                if (vp56_rac_get_prob_branchy(c, submv_prob[2])) {
                    mb->bmv[n].y = mb->mv.y + read_mv_component(c, s->prob->mvc[0]);
                    mb->bmv[n].x = mb->mv.x + read_mv_component(c, s->prob->mvc[1]);
                } else {
                    AV_ZERO32(&mb->bmv[n]);
                }
            } else {
                AV_WN32A(&mb->bmv[n], above);
            }
        } else {
            AV_WN32A(&mb->bmv[n], left);
        }
    }

    return num;
}

static av_always_inline
void decode_mvs(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y)
{
    VP8Macroblock *mb_edge[3] = { mb + 2 /* top */,
                                  mb - 1 /* left */,
                                  mb + 1 /* top-left */ };
    enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV };
    enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
    int idx = CNT_ZERO;
    int cur_sign_bias = s->sign_bias[mb->ref_frame];
    int8_t *sign_bias = s->sign_bias;
    VP56mv near_mv[4];
    uint8_t cnt[4] = { 0 };
    VP56RangeCoder *c = &s->c;

    AV_ZERO32(&near_mv[0]);
    AV_ZERO32(&near_mv[1]);
    AV_ZERO32(&near_mv[2]);

    /* Process MB on top, left and top-left */
    #define MV_EDGE_CHECK(n)\
    {\
        VP8Macroblock *edge = mb_edge[n];\
        int edge_ref = edge->ref_frame;\
        if (edge_ref != VP56_FRAME_CURRENT) {\
            uint32_t mv = AV_RN32A(&edge->mv);\
            if (mv) {\
                if (cur_sign_bias != sign_bias[edge_ref]) {\
                    /* SWAR negate of the values in mv. */\
                    mv = ~mv;\
                    mv = ((mv&0x7fff7fff) + 0x00010001) ^ (mv&0x80008000);\
                }\
                if (!n || mv != AV_RN32A(&near_mv[idx]))\
                    AV_WN32A(&near_mv[++idx], mv);\
                cnt[idx]      += 1 + (n != 2);\
            } else\
                cnt[CNT_ZERO] += 1 + (n != 2);\
        }\
    }
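    /* The SWAR negate above negates both packed int16 halves at once:
     * ~mv is the one's complement; adding 0x00010001 with the sign bits
     * masked off (so no carry crosses the 16-bit lane boundary) and XORing
     * the sign bits back in completes the two's complement per lane. */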

    MV_EDGE_CHECK(0)
    MV_EDGE_CHECK(1)
    MV_EDGE_CHECK(2)

    mb->partitioning = VP8_SPLITMVMODE_NONE;
    if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_ZERO]][0])) {
        mb->mode = VP8_MVMODE_MV;

        /* If we have three distinct MVs, merge first and last if they're the same */
        if (cnt[CNT_SPLITMV] && AV_RN32A(&near_mv[1 + VP8_EDGE_TOP]) == AV_RN32A(&near_mv[1 + VP8_EDGE_TOPLEFT]))
            cnt[CNT_NEAREST] += 1;

        /* Swap near and nearest if necessary */
        if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) {
            FFSWAP(uint8_t,     cnt[CNT_NEAREST],     cnt[CNT_NEAR]);
            FFSWAP( VP56mv, near_mv[CNT_NEAREST], near_mv[CNT_NEAR]);
        }

        if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) {
            if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAR]][2])) {

                /* Choose the best mv out of 0,0 and the nearest mv */
                clamp_mv(s, &mb->mv, &near_mv[CNT_ZERO + (cnt[CNT_NEAREST] >= cnt[CNT_ZERO])]);
                cnt[CNT_SPLITMV] = ((mb_edge[VP8_EDGE_LEFT]->mode    == VP8_MVMODE_SPLIT) +
                                    (mb_edge[VP8_EDGE_TOP]->mode     == VP8_MVMODE_SPLIT)) * 2 +
                                    (mb_edge[VP8_EDGE_TOPLEFT]->mode == VP8_MVMODE_SPLIT);

                if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_SPLITMV]][3])) {
                    mb->mode = VP8_MVMODE_SPLIT;
                    mb->mv = mb->bmv[decode_splitmvs(s, c, mb) - 1];
                } else {
                    mb->mv.y += read_mv_component(c, s->prob->mvc[0]);
                    mb->mv.x += read_mv_component(c, s->prob->mvc[1]);
                    mb->bmv[0] = mb->mv;
                }
            } else {
                clamp_mv(s, &mb->mv, &near_mv[CNT_NEAR]);
                mb->bmv[0] = mb->mv;
            }
        } else {
            clamp_mv(s, &mb->mv, &near_mv[CNT_NEAREST]);
            mb->bmv[0] = mb->mv;
        }
    } else {
        mb->mode = VP8_MVMODE_ZERO;
        AV_ZERO32(&mb->mv);
        mb->bmv[0] = mb->mv;
    }
}

static av_always_inline
void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c,
                           int mb_x, int keyframe)
{
    uint8_t *intra4x4 = s->intra4x4_pred_mode_mb;
    if (keyframe) {
        int x, y;
        uint8_t* const top = s->intra4x4_pred_mode_top + 4 * mb_x;
        uint8_t* const left = s->intra4x4_pred_mode_left;
        for (y = 0; y < 4; y++) {
            for (x = 0; x < 4; x++) {
                const uint8_t *ctx;
                ctx = vp8_pred4x4_prob_intra[top[x]][left[y]];
                *intra4x4 = vp8_rac_get_tree(c, vp8_pred4x4_tree, ctx);
                left[y] = top[x] = *intra4x4;
                intra4x4++;
            }
        }
    } else {
        int i;
        for (i = 0; i < 16; i++)
            intra4x4[i] = vp8_rac_get_tree(c, vp8_pred4x4_tree, vp8_pred4x4_prob_inter);
    }
}

static av_always_inline
void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, uint8_t *segment, uint8_t *ref)
{
    VP56RangeCoder *c = &s->c;

    if (s->segmentation.update_map)
        *segment = vp8_rac_get_tree(c, vp8_segmentid_tree, s->prob->segmentid);
    else
        *segment = ref ? *ref : *segment;
    s->segment = *segment;

    mb->skip = s->mbskip_enabled ? vp56_rac_get_prob(c, s->prob->mbskip) : 0;

    if (s->keyframe) {
        mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra, vp8_pred16x16_prob_intra);

        if (mb->mode == MODE_I4x4) {
            decode_intra4x4_modes(s, c, mb_x, 1);
        } else {
            const uint32_t modes = vp8_pred4x4_mode[mb->mode] * 0x01010101u;
            AV_WN32A(s->intra4x4_pred_mode_top + 4 * mb_x, modes);
            AV_WN32A(s->intra4x4_pred_mode_left, modes);
        }

        s->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, vp8_pred8x8c_prob_intra);
        mb->ref_frame = VP56_FRAME_CURRENT;
    } else if (vp56_rac_get_prob_branchy(c, s->prob->intra)) {
        // inter MB, 16.2
        if (vp56_rac_get_prob_branchy(c, s->prob->last))
            mb->ref_frame = vp56_rac_get_prob(c, s->prob->golden) ?
                VP56_FRAME_GOLDEN2 /* altref */ : VP56_FRAME_GOLDEN;
        else
            mb->ref_frame = VP56_FRAME_PREVIOUS;
        s->ref_count[mb->ref_frame-1]++;

        // motion vectors, 16.3
        decode_mvs(s, mb, mb_x, mb_y);
    } else {
        // intra MB, 16.1
        mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, s->prob->pred16x16);

        if (mb->mode == MODE_I4x4)
            decode_intra4x4_modes(s, c, mb_x, 0);

        s->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, s->prob->pred8x8c);
        mb->ref_frame = VP56_FRAME_CURRENT;
        mb->partitioning = VP8_SPLITMVMODE_NONE;
        AV_ZERO32(&mb->bmv[0]);
    }
}

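/* decode_block_coeffs_internal() is guarded by #ifndef so that an
 * architecture-specific implementation (e.g. the ARM one pulled in via
 * arm/vp8.h above) can replace it with an optimized version. */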
#ifndef decode_block_coeffs_internal
/**
 * @param c arithmetic bitstream reader context
 * @param block destination for block coefficients
 * @param probs probabilities to use when reading trees from the bitstream
 * @param i initial coeff index, 0 unless a separate DC block is coded
 * @param qmul array holding the dc/ac dequant factor at position 0/1
 * @return 0 if no coeffs were decoded
 *         otherwise, the index of the last coeff decoded plus one
 */
static int decode_block_coeffs_internal(VP56RangeCoder *c, DCTELEM block[16],
                                        uint8_t probs[16][3][NUM_DCT_TOKENS-1],
                                        int i, uint8_t *token_prob, int16_t qmul[2])
{
    goto skip_eob;
    do {
        int coeff;
        if (!vp56_rac_get_prob_branchy(c, token_prob[0]))   // DCT_EOB
            return i;

skip_eob:
        if (!vp56_rac_get_prob_branchy(c, token_prob[1])) { // DCT_0
            if (++i == 16)
                return i; // invalid input; blocks should end with EOB
            token_prob = probs[i][0];
            goto skip_eob;
        }

        if (!vp56_rac_get_prob_branchy(c, token_prob[2])) { // DCT_1
            coeff = 1;
            token_prob = probs[i+1][1];
        } else {
            if (!vp56_rac_get_prob_branchy(c, token_prob[3])) { // DCT 2,3,4
                coeff = vp56_rac_get_prob_branchy(c, token_prob[4]);
                if (coeff)
                    coeff += vp56_rac_get_prob(c, token_prob[5]);
                coeff += 2;
            } else {
                // DCT_CAT*
                if (!vp56_rac_get_prob_branchy(c, token_prob[6])) {
                    if (!vp56_rac_get_prob_branchy(c, token_prob[7])) { // DCT_CAT1
                        coeff  = 5 + vp56_rac_get_prob(c, vp8_dct_cat1_prob[0]);
                    } else {                                    // DCT_CAT2
                        coeff  = 7;
                        coeff += vp56_rac_get_prob(c, vp8_dct_cat2_prob[0]) << 1;
                        coeff += vp56_rac_get_prob(c, vp8_dct_cat2_prob[1]);
                    }
                } else {    // DCT_CAT3 and up
                    int a = vp56_rac_get_prob(c, token_prob[8]);
                    int b = vp56_rac_get_prob(c, token_prob[9+a]);
                    int cat = (a<<1) + b;
                    coeff  = 3 + (8<<cat);
                    coeff += vp8_rac_get_coeff(c, ff_vp8_dct_cat_prob[cat]);
                }
            }
            token_prob = probs[i+1][2];
        }
        block[zigzag_scan[i]] = (vp8_rac_get(c) ? -coeff : coeff) * qmul[!!i];
    } while (++i < 16);

    return i;
}
#endif

/**
 * @param c arithmetic bitstream reader context
 * @param block destination for block coefficients
 * @param probs probabilities to use when reading trees from the bitstream
 * @param i initial coeff index, 0 unless a separate DC block is coded
 * @param zero_nhood the initial prediction context for number of surrounding
 *                   all-zero blocks (only left/top, so 0-2)
 * @param qmul array holding the dc/ac dequant factor at position 0/1
 * @return 0 if no coeffs were decoded
 *         otherwise, the index of the last coeff decoded plus one
 */
static av_always_inline
int decode_block_coeffs(VP56RangeCoder *c, DCTELEM block[16],
                        uint8_t probs[16][3][NUM_DCT_TOKENS-1],
                        int i, int zero_nhood, int16_t qmul[2])
{
    uint8_t *token_prob = probs[i][zero_nhood];
    if (!vp56_rac_get_prob_branchy(c, token_prob[0]))   // DCT_EOB
        return 0;
    return decode_block_coeffs_internal(c, block, probs, i, token_prob, qmul);
}
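/* Keeping the first DCT_EOB check in this always-inlined wrapper means a
 * completely empty block, typically the most common case, never pays for a
 * call into the out-of-line internal decoder. */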

static av_always_inline
void decode_mb_coeffs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
                      uint8_t t_nnz[9], uint8_t l_nnz[9])
{
    int i, x, y, luma_start = 0, luma_ctx = 3;
    int nnz_pred, nnz, nnz_total = 0;
    int segment = s->segment;
    int block_dc = 0;

    if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
        nnz_pred = t_nnz[8] + l_nnz[8];

        // decode DC values and do hadamard
        nnz = decode_block_coeffs(c, s->block_dc, s->prob->token[1], 0, nnz_pred,
                                  s->qmat[segment].luma_dc_qmul);
        l_nnz[8] = t_nnz[8] = !!nnz;
        if (nnz) {
            nnz_total += nnz;
            block_dc = 1;
            if (nnz == 1)
                s->vp8dsp.vp8_luma_dc_wht_dc(s->block, s->block_dc);
            else
                s->vp8dsp.vp8_luma_dc_wht(s->block, s->block_dc);
        }
        luma_start = 1;
        luma_ctx = 0;
    }

    // luma blocks
    for (y = 0; y < 4; y++)
        for (x = 0; x < 4; x++) {
            nnz_pred = l_nnz[y] + t_nnz[x];
            nnz = decode_block_coeffs(c, s->block[y][x], s->prob->token[luma_ctx], luma_start,
                                      nnz_pred, s->qmat[segment].luma_qmul);
            // nnz+block_dc may be one more than the actual last index, but we don't care
            s->non_zero_count_cache[y][x] = nnz + block_dc;
            t_nnz[x] = l_nnz[y] = !!nnz;
            nnz_total += nnz;
        }

    // chroma blocks
    // TODO: what to do about dimensions? 2nd dim for luma is x,
    // but for chroma it's (y<<1)|x
    for (i = 4; i < 6; i++)
        for (y = 0; y < 2; y++)
            for (x = 0; x < 2; x++) {
                nnz_pred = l_nnz[i+2*y] + t_nnz[i+2*x];
                nnz = decode_block_coeffs(c, s->block[i][(y<<1)+x], s->prob->token[2], 0,
                                          nnz_pred, s->qmat[segment].chroma_qmul);
                s->non_zero_count_cache[i][(y<<1)+x] = nnz;
                t_nnz[i+2*x] = l_nnz[i+2*y] = !!nnz;
                nnz_total += nnz;
            }

    // if there were no coded coeffs despite the macroblock not being marked skip,
    // we MUST not do the inner loop filter and should not do IDCT
    // Since skip isn't used for bitstream prediction, just manually set it.
    if (!nnz_total)
        mb->skip = 1;
}

static av_always_inline
void backup_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr,
                      int linesize, int uvlinesize, int simple)
{
    AV_COPY128(top_border, src_y + 15*linesize);
    if (!simple) {
        AV_COPY64(top_border+16, src_cb + 7*uvlinesize);
        AV_COPY64(top_border+24, src_cr + 7*uvlinesize);
    }
}

static av_always_inline
void xchg_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr,
                    int linesize, int uvlinesize, int mb_x, int mb_y, int mb_width,
                    int simple, int xchg)
{
    uint8_t *top_border_m1 = top_border-32;     // for TL prediction
    src_y  -=   linesize;
    src_cb -= uvlinesize;
    src_cr -= uvlinesize;

#define XCHG(a,b,xchg) do {                     \
        if (xchg) AV_SWAP64(b,a);               \
        else      AV_COPY64(b,a);               \
    } while (0)

    XCHG(top_border_m1+8, src_y-8, xchg);
    XCHG(top_border,      src_y,   xchg);
    XCHG(top_border+8,    src_y+8, 1);
    if (mb_x < mb_width-1)
        XCHG(top_border+32, src_y+16, 1);

    // only copy chroma for normal loop filter
    // or to initialize the top row to 127
    if (!simple || !mb_y) {
        XCHG(top_border_m1+16, src_cb-8, xchg);
        XCHG(top_border_m1+24, src_cr-8, xchg);
        XCHG(top_border+16,    src_cb, 1);
        XCHG(top_border+24,    src_cr, 1);
    }
}

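/* The check_* helpers below substitute intra prediction modes that would
 * read from a nonexistent left or top edge at the frame borders; VP8 defines
 * the missing above row as 127 and the missing left column as 129, hence the
 * DC_127/DC_128/DC_129 variants. */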
static av_always_inline
int check_dc_pred8x8_mode(int mode, int mb_x, int mb_y)
{
    if (!mb_x) {
        return mb_y ? TOP_DC_PRED8x8 : DC_128_PRED8x8;
    } else {
        return mb_y ? mode : LEFT_DC_PRED8x8;
    }
}

static av_always_inline
int check_tm_pred8x8_mode(int mode, int mb_x, int mb_y)
{
    if (!mb_x) {
        return mb_y ? VERT_PRED8x8 : DC_129_PRED8x8;
    } else {
        return mb_y ? mode : HOR_PRED8x8;
    }
}

static av_always_inline
int check_intra_pred8x8_mode(int mode, int mb_x, int mb_y)
{
    if (mode == DC_PRED8x8) {
        return check_dc_pred8x8_mode(mode, mb_x, mb_y);
    } else {
        return mode;
    }
}

static av_always_inline
int check_intra_pred8x8_mode_emuedge(int mode, int mb_x, int mb_y)
{
    switch (mode) {
    case DC_PRED8x8:
        return check_dc_pred8x8_mode(mode, mb_x, mb_y);
    case VERT_PRED8x8:
        return !mb_y ? DC_127_PRED8x8 : mode;
    case HOR_PRED8x8:
        return !mb_x ? DC_129_PRED8x8 : mode;
    case PLANE_PRED8x8 /*TM*/:
        return check_tm_pred8x8_mode(mode, mb_x, mb_y);
    }
    return mode;
}

static av_always_inline
int check_tm_pred4x4_mode(int mode, int mb_x, int mb_y)
{
    if (!mb_x) {
        return mb_y ? VERT_VP8_PRED : DC_129_PRED;
    } else {
        return mb_y ? mode : HOR_VP8_PRED;
    }
}

static av_always_inline
int check_intra_pred4x4_mode_emuedge(int mode, int mb_x, int mb_y, int *copy_buf)
{
    switch (mode) {
    case VERT_PRED:
        if (!mb_x && mb_y) {
            *copy_buf = 1;
            return mode;
        }
        /* fall-through */
    case DIAG_DOWN_LEFT_PRED:
    case VERT_LEFT_PRED:
        return !mb_y ? DC_127_PRED : mode;
    case HOR_PRED:
        if (!mb_y) {
            *copy_buf = 1;
            return mode;
        }
        /* fall-through */
    case HOR_UP_PRED:
        return !mb_x ? DC_129_PRED : mode;
    case TM_VP8_PRED:
        return check_tm_pred4x4_mode(mode, mb_x, mb_y);
    case DC_PRED: // 4x4 DC doesn't use the same "H.264-style" exceptions as 16x16/8x8 DC
    case DIAG_DOWN_RIGHT_PRED:
    case VERT_RIGHT_PRED:
    case HOR_DOWN_PRED:
        if (!mb_y || !mb_x)
            *copy_buf = 1;
        return mode;
    }
    return mode;
}

static av_always_inline
void intra_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
                   int mb_x, int mb_y)
{
    AVCodecContext *avctx = s->avctx;
    int x, y, mode, nnz;
    uint32_t tr;

    // for the first row, we need to run xchg_mb_border to init the top edge to 127
    // otherwise, skip it if we aren't going to deblock
    if (!(avctx->flags & CODEC_FLAG_EMU_EDGE && !mb_y) && (s->deblock_filter || !mb_y))
        xchg_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2],
                       s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
                       s->filter.simple, 1);

    if (mb->mode < MODE_I4x4) {
        if (avctx->flags & CODEC_FLAG_EMU_EDGE) { // tested
            mode = check_intra_pred8x8_mode_emuedge(mb->mode, mb_x, mb_y);
        } else {
            mode = check_intra_pred8x8_mode(mb->mode, mb_x, mb_y);
        }
        s->hpc.pred16x16[mode](dst[0], s->linesize);
    } else {
        uint8_t *ptr = dst[0];
        uint8_t *intra4x4 = s->intra4x4_pred_mode_mb;
        uint8_t tr_top[4] = { 127, 127, 127, 127 };

        // all blocks on the right edge of the macroblock use the bottom edge
        // of the top macroblock for their topright edge
        uint8_t *tr_right = ptr - s->linesize + 16;

        // if we're on the right edge of the frame, said edge is extended
        // from the top macroblock
        if (!(!mb_y && avctx->flags & CODEC_FLAG_EMU_EDGE) &&
            mb_x == s->mb_width-1) {
            tr = tr_right[-1]*0x01010101u;
            tr_right = (uint8_t *)&tr;
        }

        if (mb->skip)
            AV_ZERO128(s->non_zero_count_cache);

        for (y = 0; y < 4; y++) {
            uint8_t *topright = ptr + 4 - s->linesize;
            for (x = 0; x < 4; x++) {
                int copy = 0, linesize = s->linesize;
                uint8_t *dst = ptr+4*x;
                DECLARE_ALIGNED(4, uint8_t, copy_dst)[5*8];

                if ((y == 0 || x == 3) && mb_y == 0 && avctx->flags & CODEC_FLAG_EMU_EDGE) {
                    topright = tr_top;
                } else if (x == 3)
                    topright = tr_right;

                if (avctx->flags & CODEC_FLAG_EMU_EDGE) { // mb_x+x or mb_y+y is a hack but works
                    mode = check_intra_pred4x4_mode_emuedge(intra4x4[x], mb_x + x, mb_y + y, &copy);
                    if (copy) {
                        dst = copy_dst + 12;
                        linesize = 8;
                        if (!(mb_y + y)) {
                            copy_dst[3] = 127U;
                            AV_WN32A(copy_dst+4, 127U * 0x01010101U);
                        } else {
                            AV_COPY32(copy_dst+4, ptr+4*x-s->linesize);
                            if (!(mb_x + x)) {
                                copy_dst[3] = 129U;
                            } else {
                                copy_dst[3] = ptr[4*x-s->linesize-1];
                            }
                        }
                        if (!(mb_x + x)) {
                            copy_dst[11] =
                            copy_dst[19] =
                            copy_dst[27] =
                            copy_dst[35] = 129U;
                        } else {
                            copy_dst[11] = ptr[4*x              -1];
                            copy_dst[19] = ptr[4*x+s->linesize  -1];
                            copy_dst[27] = ptr[4*x+s->linesize*2-1];
                            copy_dst[35] = ptr[4*x+s->linesize*3-1];
                        }
                    }
                } else {
                    mode = intra4x4[x];
                }
                s->hpc.pred4x4[mode](dst, topright, linesize);
                if (copy) {
                    AV_COPY32(ptr+4*x              , copy_dst+12);
                    AV_COPY32(ptr+4*x+s->linesize  , copy_dst+20);
                    AV_COPY32(ptr+4*x+s->linesize*2, copy_dst+28);
                    AV_COPY32(ptr+4*x+s->linesize*3, copy_dst+36);
                }

                nnz = s->non_zero_count_cache[y][x];
                if (nnz) {
                    if (nnz == 1)
                        s->vp8dsp.vp8_idct_dc_add(ptr+4*x, s->block[y][x], s->linesize);
                    else
                        s->vp8dsp.vp8_idct_add(ptr+4*x, s->block[y][x], s->linesize);
                }
                topright += 4;
            }

            ptr   += 4*s->linesize;
            intra4x4 += 4;
        }
    }

    if (avctx->flags & CODEC_FLAG_EMU_EDGE) {
        mode = check_intra_pred8x8_mode_emuedge(s->chroma_pred_mode, mb_x, mb_y);
    } else {
        mode = check_intra_pred8x8_mode(s->chroma_pred_mode, mb_x, mb_y);
    }
    s->hpc.pred8x8[mode](dst[1], s->uvlinesize);
    s->hpc.pred8x8[mode](dst[2], s->uvlinesize);

    if (!(avctx->flags & CODEC_FLAG_EMU_EDGE && !mb_y) && (s->deblock_filter || !mb_y))
        xchg_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2],
                       s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
                       s->filter.simple, 0);
}

static const uint8_t subpel_idx[3][8] = {
    { 0, 1, 2, 1, 2, 1, 2, 1 }, // nr. of left extra pixels,
                                // also function pointer index
    { 0, 3, 5, 3, 5, 3, 5, 3 }, // nr. of extra pixels required
    { 0, 2, 3, 2, 3, 2, 3, 2 }, // nr. of right extra pixels
};
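/* Example: an MV fraction of 4 (half-pel) selects filter index 2 and needs
 * 2 extra source pixels to the left and 3 to the right of the block, i.e.
 * 5 extra pixels per row for the six-tap filter. */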

/**
 * luma MC function
 *
 * @param s VP8 decoding context
 * @param dst target buffer for block data at block position
 * @param ref reference picture buffer at origin (0, 0)
 * @param mv motion vector (relative to block position) to get pixel data from
 * @param x_off horizontal position of block from origin (0, 0)
 * @param y_off vertical position of block from origin (0, 0)
 * @param block_w width of block (16, 8 or 4)
 * @param block_h height of block (always same as block_w)
 * @param width width of src/dst plane data
 * @param height height of src/dst plane data
 * @param linesize size of a single line of plane data, including padding
 * @param mc_func motion compensation function pointers (bilinear or sixtap MC)
 */
static av_always_inline
void vp8_mc_luma(VP8Context *s, uint8_t *dst, AVFrame *ref, const VP56mv *mv,
                 int x_off, int y_off, int block_w, int block_h,
                 int width, int height, int linesize,
                 vp8_mc_func mc_func[3][3])
{
    uint8_t *src = ref->data[0];

    if (AV_RN32A(mv)) {
        int mx = (mv->x << 1)&7, mx_idx = subpel_idx[0][mx];
        int my = (mv->y << 1)&7, my_idx = subpel_idx[0][my];

        x_off += mv->x >> 2;
        y_off += mv->y >> 2;

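        /* Frame threading: wait until the reference frame's decoder has
         * produced every source row this block (plus its sub-pel filter
         * margin) may read. */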
        // edge emulation
        ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 4, 0);
        src += y_off * linesize + x_off;
        if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
            y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
            s->dsp.emulated_edge_mc(s->edge_emu_buffer, src - my_idx * linesize - mx_idx, linesize,
                                    block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
                                    x_off - mx_idx, y_off - my_idx, width, height);
            src = s->edge_emu_buffer + mx_idx + linesize * my_idx;
        }
        mc_func[my_idx][mx_idx](dst, linesize, src, linesize, block_h, mx, my);
    } else {
        ff_thread_await_progress(ref, (3 + y_off + block_h) >> 4, 0);
        mc_func[0][0](dst, linesize, src + y_off * linesize + x_off, linesize, block_h, 0, 0);
    }
}

/**
 * chroma MC function
 *
 * @param s VP8 decoding context
 * @param dst1 target buffer for block data at block position (U plane)
 * @param dst2 target buffer for block data at block position (V plane)
 * @param ref reference picture buffer at origin (0, 0)
 * @param mv motion vector (relative to block position) to get pixel data from
 * @param x_off horizontal position of block from origin (0, 0)
 * @param y_off vertical position of block from origin (0, 0)
 * @param block_w width of block (16, 8 or 4)
 * @param block_h height of block (always same as block_w)
 * @param width width of src/dst plane data
 * @param height height of src/dst plane data
 * @param linesize size of a single line of plane data, including padding
 * @param mc_func motion compensation function pointers (bilinear or sixtap MC)
 */
static av_always_inline
void vp8_mc_chroma(VP8Context *s, uint8_t *dst1, uint8_t *dst2, AVFrame *ref,
                   const VP56mv *mv, int x_off, int y_off,
                   int block_w, int block_h, int width, int height, int linesize,
                   vp8_mc_func mc_func[3][3])
{
    uint8_t *src1 = ref->data[1], *src2 = ref->data[2];

    if (AV_RN32A(mv)) {
        int mx = mv->x&7, mx_idx = subpel_idx[0][mx];
        int my = mv->y&7, my_idx = subpel_idx[0][my];

        x_off += mv->x >> 3;
        y_off += mv->y >> 3;

        // edge emulation
        src1 += y_off * linesize + x_off;
        src2 += y_off * linesize + x_off;
        ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 3, 0);
        if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
            y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
            s->dsp.emulated_edge_mc(s->edge_emu_buffer, src1 - my_idx * linesize - mx_idx, linesize,
                                    block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
                                    x_off - mx_idx, y_off - my_idx, width, height);
            src1 = s->edge_emu_buffer + mx_idx + linesize * my_idx;
            mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);

            s->dsp.emulated_edge_mc(s->edge_emu_buffer, src2 - my_idx * linesize - mx_idx, linesize,
                                    block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
                                    x_off - mx_idx, y_off - my_idx, width, height);
            src2 = s->edge_emu_buffer + mx_idx + linesize * my_idx;
            mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
        } else {
            mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
            mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
        }
    } else {
        ff_thread_await_progress(ref, (3 + y_off + block_h) >> 3, 0);
        mc_func[0][0](dst1, linesize, src1 + y_off * linesize + x_off, linesize, block_h, 0, 0);
        mc_func[0][0](dst2, linesize, src2 + y_off * linesize + x_off, linesize, block_h, 0, 0);
    }
}

static av_always_inline
void vp8_mc_part(VP8Context *s, uint8_t *dst[3],
                 AVFrame *ref_frame, int x_off, int y_off,
                 int bx_off, int by_off,
                 int block_w, int block_h,
                 int width, int height, VP56mv *mv)
{
    VP56mv uvmv = *mv;

    /* Y */
    vp8_mc_luma(s, dst[0] + by_off * s->linesize + bx_off,
                ref_frame, mv, x_off + bx_off, y_off + by_off,
                block_w, block_h, width, height, s->linesize,
                s->put_pixels_tab[block_w == 8]);

    /* U/V */
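    /* Chroma MVs are in 1/8-pel units; profile 3 clears the fractional
     * bits, forcing full-pel chroma motion compensation. */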
    if (s->profile == 3) {
        uvmv.x &= ~7;
        uvmv.y &= ~7;
    }
    x_off   >>= 1; y_off   >>= 1;
    bx_off  >>= 1; by_off  >>= 1;
    width   >>= 1; height  >>= 1;
    block_w >>= 1; block_h >>= 1;
    vp8_mc_chroma(s, dst[1] + by_off * s->uvlinesize + bx_off,
                  dst[2] + by_off * s->uvlinesize + bx_off, ref_frame,
                  &uvmv, x_off + bx_off, y_off + by_off,
                  block_w, block_h, width, height, s->uvlinesize,
                  s->put_pixels_tab[1 + (block_w == 4)]);
}

/* Fetch pixels for estimated mv 4 macroblocks ahead.
 * Optimized for 64-byte cache lines.  Inspired by ffh264 prefetch_motion. */
static av_always_inline void prefetch_motion(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, int mb_xy, int ref)
{
    /* Don't prefetch refs that haven't been used very often this frame. */
    if (s->ref_count[ref-1] > (mb_xy >> 5)) {
        int x_off = mb_x << 4, y_off = mb_y << 4;
        int mx = (mb->mv.x>>2) + x_off + 8;
        int my = (mb->mv.y>>2) + y_off;
        uint8_t **src= s->framep[ref]->data;
        int off= mx + (my + (mb_x&3)*4)*s->linesize + 64;
        /* For threading, a ff_thread_await_progress here might be useful, but
         * it actually slows down the decoder. Since a bad prefetch doesn't
         * generate bad decoder output, we don't run it here. */
        s->dsp.prefetch(src[0]+off, s->linesize, 4);
        off= (mx>>1) + ((my>>1) + (mb_x&7))*s->uvlinesize + 64;
        s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
    }
}

/**
 * Apply motion vectors to prediction buffer, chapter 18.
 */
static av_always_inline
void inter_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
                   int mb_x, int mb_y)
{
    int x_off = mb_x << 4, y_off = mb_y << 4;
    int width = 16*s->mb_width, height = 16*s->mb_height;
    AVFrame *ref = s->framep[mb->ref_frame];
    VP56mv *bmv = mb->bmv;

    switch (mb->partitioning) {
    case VP8_SPLITMVMODE_NONE:
        vp8_mc_part(s, dst, ref, x_off, y_off,
                    0, 0, 16, 16, width, height, &mb->mv);
        break;
    case VP8_SPLITMVMODE_4x4: {
        int x, y;
        VP56mv uvmv;

        /* Y */
        for (y = 0; y < 4; y++) {
            for (x = 0; x < 4; x++) {
                vp8_mc_luma(s, dst[0] + 4*y*s->linesize + x*4,
                            ref, &bmv[4*y + x],
                            4*x + x_off, 4*y + y_off, 4, 4,
                            width, height, s->linesize,
                            s->put_pixels_tab[2]);
            }
        }

        /* U/V */
        x_off >>= 1; y_off >>= 1; width >>= 1; height >>= 1;
        for (y = 0; y < 2; y++) {
            for (x = 0; x < 2; x++) {
                uvmv.x = mb->bmv[ 2*y    * 4 + 2*x  ].x +
                         mb->bmv[ 2*y    * 4 + 2*x+1].x +
                         mb->bmv[(2*y+1) * 4 + 2*x  ].x +
                         mb->bmv[(2*y+1) * 4 + 2*x+1].x;
                uvmv.y = mb->bmv[ 2*y    * 4 + 2*x  ].y +
                         mb->bmv[ 2*y    * 4 + 2*x+1].y +
                         mb->bmv[(2*y+1) * 4 + 2*x  ].y +
                         mb->bmv[(2*y+1) * 4 + 2*x+1].y;
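                /* Average the four luma MVs: the +2 rounds, and adding the
                 * sign bit makes negative sums round symmetrically (ties
                 * away from zero) rather than toward -infinity. */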
                uvmv.x = (uvmv.x + 2 + (uvmv.x >> (INT_BIT-1))) >> 2;
                uvmv.y = (uvmv.y + 2 + (uvmv.y >> (INT_BIT-1))) >> 2;
                if (s->profile == 3) {
                    uvmv.x &= ~7;
                    uvmv.y &= ~7;
                }
                vp8_mc_chroma(s, dst[1] + 4*y*s->uvlinesize + x*4,
                              dst[2] + 4*y*s->uvlinesize + x*4, ref, &uvmv,
                              4*x + x_off, 4*y + y_off, 4, 4,
                              width, height, s->uvlinesize,
                              s->put_pixels_tab[2]);
            }
        }
        break;
    }
    case VP8_SPLITMVMODE_16x8:
        vp8_mc_part(s, dst, ref, x_off, y_off,
                    0, 0, 16, 8, width, height, &bmv[0]);
        vp8_mc_part(s, dst, ref, x_off, y_off,
                    0, 8, 16, 8, width, height, &bmv[1]);
        break;
    case VP8_SPLITMVMODE_8x16:
        vp8_mc_part(s, dst, ref, x_off, y_off,
                    0, 0, 8, 16, width, height, &bmv[0]);
        vp8_mc_part(s, dst, ref, x_off, y_off,
                    8, 0, 8, 16, width, height, &bmv[1]);
        break;
    case VP8_SPLITMVMODE_8x8:
        vp8_mc_part(s, dst, ref, x_off, y_off,
                    0, 0, 8, 8, width, height, &bmv[0]);
        vp8_mc_part(s, dst, ref, x_off, y_off,
                    8, 0, 8, 8, width, height, &bmv[1]);
        vp8_mc_part(s, dst, ref, x_off, y_off,
                    0, 8, 8, 8, width, height, &bmv[2]);
        vp8_mc_part(s, dst, ref, x_off, y_off,
                    8, 8, 8, 8, width, height, &bmv[3]);
        break;
    }
}

static av_always_inline void idct_mb(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb)
{
    int x, y, ch;

    if (mb->mode != MODE_I4x4) {
        uint8_t *y_dst = dst[0];
        for (y = 0; y < 4; y++) {
            uint32_t nnz4 = AV_RL32(s->non_zero_count_cache[y]);
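            /* nnz4 packs the row's four per-block coefficient counts into
             * one word; if every byte is 0 or 1, each block has at most a
             * DC coefficient and the dc_add4y fast path applies. */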
1349            if (nnz4) {
1350                if (nnz4&~0x01010101) {
1351                    for (x = 0; x < 4; x++) {
1352                        if ((uint8_t)nnz4 == 1)
1353                            s->vp8dsp.vp8_idct_dc_add(y_dst+4*x, s->block[y][x], s->linesize);
1354                        else if((uint8_t)nnz4 > 1)
1355                            s->vp8dsp.vp8_idct_add(y_dst+4*x, s->block[y][x], s->linesize);
1356                        nnz4 >>= 8;
1357                        if (!nnz4)
1358                            break;
1359                    }
1360                } else {
1361                    s->vp8dsp.vp8_idct_dc_add4y(y_dst, s->block[y], s->linesize);
1362                }
1363            }
1364            y_dst += 4*s->linesize;
1365        }
1366    }
1367
1368    for (ch = 0; ch < 2; ch++) {
1369        uint32_t nnz4 = AV_RL32(s->non_zero_count_cache[4+ch]);
1370        if (nnz4) {
1371            uint8_t *ch_dst = dst[1+ch];
1372            if (nnz4&~0x01010101) {
1373                for (y = 0; y < 2; y++) {
1374                    for (x = 0; x < 2; x++) {
1375                        if ((uint8_t)nnz4 == 1)
1376                            s->vp8dsp.vp8_idct_dc_add(ch_dst+4*x, s->block[4+ch][(y<<1)+x], s->uvlinesize);
1377                        else if((uint8_t)nnz4 > 1)
1378                            s->vp8dsp.vp8_idct_add(ch_dst+4*x, s->block[4+ch][(y<<1)+x], s->uvlinesize);
1379                        nnz4 >>= 8;
1380                        if (!nnz4)
1381                            goto chroma_idct_end;
1382                    }
1383                    ch_dst += 4*s->uvlinesize;
1384                }
1385            } else {
1386                s->vp8dsp.vp8_idct_dc_add4uv(ch_dst, s->block[4+ch], s->uvlinesize);
1387            }
1388        }
1389chroma_idct_end: ;
1390    }
1391}
1392
static av_always_inline void filter_level_for_mb(VP8Context *s, VP8Macroblock *mb, VP8FilterStrength *f)
{
    int interior_limit, filter_level;

    if (s->segmentation.enabled) {
        filter_level = s->segmentation.filter_level[s->segment];
        if (!s->segmentation.absolute_vals)
            filter_level += s->filter.level;
    } else
        filter_level = s->filter.level;

    if (s->lf_delta.enabled) {
        filter_level += s->lf_delta.ref[mb->ref_frame];
        filter_level += s->lf_delta.mode[mb->mode];
    }

    filter_level = av_clip_uintp2(filter_level, 6);

    interior_limit = filter_level;
    if (s->filter.sharpness) {
        interior_limit >>= (s->filter.sharpness + 3) >> 2;
        interior_limit = FFMIN(interior_limit, 9 - s->filter.sharpness);
    }
    interior_limit = FFMAX(interior_limit, 1);

    f->filter_level = filter_level;
    f->inner_limit = interior_limit;
    f->inner_filter = !mb->skip || mb->mode == MODE_I4x4 || mb->mode == VP8_MVMODE_SPLIT;
}

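/* Normal loop filter for one macroblock. Macroblock-boundary edges use a
 * stronger limit (mbedge_lim) than interior block edges (bedge_lim), and
 * the high-edge-variance threshold is looked up by frame type and filter
 * level. Edges on the left/top frame borders (mb_x == 0 / mb_y == 0) are
 * not filtered. */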
static av_always_inline void filter_mb(VP8Context *s, uint8_t *dst[3], VP8FilterStrength *f, int mb_x, int mb_y)
{
    int mbedge_lim, bedge_lim, hev_thresh;
    int filter_level = f->filter_level;
    int inner_limit = f->inner_limit;
    int inner_filter = f->inner_filter;
    int linesize = s->linesize;
    int uvlinesize = s->uvlinesize;
    static const uint8_t hev_thresh_lut[2][64] = {
        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
          3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
          3, 3, 3, 3 },
        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
          2, 2, 2, 2 }
    };

    if (!filter_level)
        return;

     bedge_lim = 2*filter_level + inner_limit;
    mbedge_lim = bedge_lim + 4;

    hev_thresh = hev_thresh_lut[s->keyframe][filter_level];

    if (mb_x) {
        s->vp8dsp.vp8_h_loop_filter16y(dst[0],     linesize,
                                       mbedge_lim, inner_limit, hev_thresh);
        s->vp8dsp.vp8_h_loop_filter8uv(dst[1],     dst[2],      uvlinesize,
                                       mbedge_lim, inner_limit, hev_thresh);
    }

    if (inner_filter) {
        s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+ 4, linesize, bedge_lim,
                                             inner_limit, hev_thresh);
        s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+ 8, linesize, bedge_lim,
                                             inner_limit, hev_thresh);
        s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+12, linesize, bedge_lim,
                                             inner_limit, hev_thresh);
        s->vp8dsp.vp8_h_loop_filter8uv_inner(dst[1] + 4, dst[2] + 4,
                                             uvlinesize,  bedge_lim,
                                             inner_limit, hev_thresh);
    }

    if (mb_y) {
        s->vp8dsp.vp8_v_loop_filter16y(dst[0],     linesize,
                                       mbedge_lim, inner_limit, hev_thresh);
        s->vp8dsp.vp8_v_loop_filter8uv(dst[1],     dst[2],      uvlinesize,
                                       mbedge_lim, inner_limit, hev_thresh);
    }

    if (inner_filter) {
        s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+ 4*linesize,
                                             linesize,    bedge_lim,
                                             inner_limit, hev_thresh);
        s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+ 8*linesize,
                                             linesize,    bedge_lim,
                                             inner_limit, hev_thresh);
        s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+12*linesize,
                                             linesize,    bedge_lim,
                                             inner_limit, hev_thresh);
        s->vp8dsp.vp8_v_loop_filter8uv_inner(dst[1] + 4 * uvlinesize,
                                             dst[2] + 4 * uvlinesize,
                                             uvlinesize,  bedge_lim,
                                             inner_limit, hev_thresh);
    }
}

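/* The "simple" filter variant only touches luma and needs no high-edge-
 * variance threshold; the limits are derived as in filter_mb(). */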
static av_always_inline void filter_mb_simple(VP8Context *s, uint8_t *dst, VP8FilterStrength *f, int mb_x, int mb_y)
{
    int mbedge_lim, bedge_lim;
    int filter_level = f->filter_level;
    int inner_limit = f->inner_limit;
    int inner_filter = f->inner_filter;
    int linesize = s->linesize;

    if (!filter_level)
        return;

     bedge_lim = 2*filter_level + inner_limit;
    mbedge_lim = bedge_lim + 4;

    if (mb_x)
        s->vp8dsp.vp8_h_loop_filter_simple(dst, linesize, mbedge_lim);
    if (inner_filter) {
        s->vp8dsp.vp8_h_loop_filter_simple(dst+ 4, linesize, bedge_lim);
        s->vp8dsp.vp8_h_loop_filter_simple(dst+ 8, linesize, bedge_lim);
        s->vp8dsp.vp8_h_loop_filter_simple(dst+12, linesize, bedge_lim);
    }

    if (mb_y)
        s->vp8dsp.vp8_v_loop_filter_simple(dst, linesize, mbedge_lim);
    if (inner_filter) {
        s->vp8dsp.vp8_v_loop_filter_simple(dst+ 4*linesize, linesize, bedge_lim);
        s->vp8dsp.vp8_v_loop_filter_simple(dst+ 8*linesize, linesize, bedge_lim);
        s->vp8dsp.vp8_v_loop_filter_simple(dst+12*linesize, linesize, bedge_lim);
    }
}

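/* Filter one finished macroblock row. The unfiltered pixels along the
 * bottom edge are saved first, since the next row's intra prediction
 * must use the pre-filter values as its top border. */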
static void filter_mb_row(VP8Context *s, AVFrame *curframe, int mb_y)
{
    VP8FilterStrength *f = s->filter_strength;
    uint8_t *dst[3] = {
        curframe->data[0] + 16*mb_y*s->linesize,
        curframe->data[1] +  8*mb_y*s->uvlinesize,
        curframe->data[2] +  8*mb_y*s->uvlinesize
    };
    int mb_x;

    for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
        backup_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2], s->linesize, s->uvlinesize, 0);
        filter_mb(s, dst, f++, mb_x, mb_y);
        dst[0] += 16;
        dst[1] += 8;
        dst[2] += 8;
    }
}

static void filter_mb_row_simple(VP8Context *s, AVFrame *curframe, int mb_y)
{
    VP8FilterStrength *f = s->filter_strength;
    uint8_t *dst = curframe->data[0] + 16*mb_y*s->linesize;
    int mb_x;

    for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
        backup_mb_border(s->top_border[mb_x+1], dst, NULL, NULL, s->linesize, 0, 1);
        filter_mb_simple(s, dst, f++, mb_x, mb_y);
        dst += 16;
    }
}

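/* Free queued segmentation maps. On close everything goes; during
 * decoding a single map is kept (unless a size change invalidated it) so
 * that vp8_alloc_frame() can reuse it instead of reallocating. */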
static void release_queued_segmaps(VP8Context *s, int is_close)
{
    int leave_behind = is_close ? 0 : !s->maps_are_invalid;
    while (s->num_maps_to_be_freed > leave_behind)
        av_freep(&s->segmentation_maps[--s->num_maps_to_be_freed]);
    s->maps_are_invalid = 0;
}

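/* Decode one frame: parse the frame header, pick a free frame buffer,
 * rotate the last/golden/altref references as signalled by the header,
 * then decode (and optionally loop filter) the frame one macroblock row
 * at a time, reporting per-row progress to other frame threads. */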
static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
                            AVPacket *avpkt)
{
    VP8Context *s = avctx->priv_data;
    int ret, mb_x, mb_y, i, y, referenced;
    enum AVDiscard skip_thresh;
    AVFrame *av_uninit(curframe), *prev_frame;

    release_queued_segmaps(s, 0);

    if ((ret = decode_frame_header(s, avpkt->data, avpkt->size)) < 0)
        return ret;

    prev_frame = s->framep[VP56_FRAME_CURRENT];

    referenced = s->update_last || s->update_golden == VP56_FRAME_CURRENT
                                || s->update_altref == VP56_FRAME_CURRENT;

    skip_thresh = !referenced ? AVDISCARD_NONREF :
                    !s->keyframe ? AVDISCARD_NONKEY : AVDISCARD_ALL;

    if (avctx->skip_frame >= skip_thresh) {
        s->invisible = 1;
        goto skip_decode;
    }
    s->deblock_filter = s->filter.level && avctx->skip_loop_filter < skip_thresh;

    // release no longer referenced frames
    for (i = 0; i < 5; i++)
        if (s->frames[i].data[0] &&
            &s->frames[i] != prev_frame &&
            &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
            &s->frames[i] != s->framep[VP56_FRAME_GOLDEN] &&
            &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2])
            vp8_release_frame(s, &s->frames[i], 1, 0);

    // find a free buffer
    for (i = 0; i < 5; i++)
        if (&s->frames[i] != prev_frame &&
            &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
            &s->frames[i] != s->framep[VP56_FRAME_GOLDEN] &&
            &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2]) {
            curframe = s->framep[VP56_FRAME_CURRENT] = &s->frames[i];
            break;
        }
    if (i == 5) {
        av_log(avctx, AV_LOG_FATAL, "Ran out of free frames!\n");
        abort();
    }
    if (curframe->data[0])
        vp8_release_frame(s, curframe, 1, 0);

    curframe->key_frame = s->keyframe;
    curframe->pict_type = s->keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
    curframe->reference = referenced ? 3 : 0;
    if ((ret = vp8_alloc_frame(s, curframe))) {
        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed!\n");
        return ret;
    }

    // select the next golden/altref frames; the update flags may point at
    // another reference, e.g. effectively swapping golden and altref
    if (s->update_altref != VP56_FRAME_NONE) {
        s->next_framep[VP56_FRAME_GOLDEN2]  = s->framep[s->update_altref];
    } else {
        s->next_framep[VP56_FRAME_GOLDEN2]  = s->framep[VP56_FRAME_GOLDEN2];
    }
    if (s->update_golden != VP56_FRAME_NONE) {
        s->next_framep[VP56_FRAME_GOLDEN]   = s->framep[s->update_golden];
    } else {
        s->next_framep[VP56_FRAME_GOLDEN]   = s->framep[VP56_FRAME_GOLDEN];
    }
    if (s->update_last) {
        s->next_framep[VP56_FRAME_PREVIOUS] = curframe;
    } else {
        s->next_framep[VP56_FRAME_PREVIOUS] = s->framep[VP56_FRAME_PREVIOUS];
    }
    s->next_framep[VP56_FRAME_CURRENT]      = curframe;

    ff_thread_finish_setup(avctx);

    // Given that arithmetic probabilities are updated every frame, it's quite
    // likely that the values we have on a random interframe are complete junk
    // if we didn't start decoding on a keyframe. So just don't display anything
    // rather than junk.
    if (!s->keyframe && (!s->framep[VP56_FRAME_PREVIOUS] ||
                         !s->framep[VP56_FRAME_GOLDEN] ||
                         !s->framep[VP56_FRAME_GOLDEN2])) {
        av_log(avctx, AV_LOG_WARNING, "Discarding interframe without a prior keyframe!\n");
        return AVERROR_INVALIDDATA;
    }

    s->linesize   = curframe->linesize[0];
    s->uvlinesize = curframe->linesize[1];

    if (!s->edge_emu_buffer)
        s->edge_emu_buffer = av_malloc(21*s->linesize);

    memset(s->top_nnz, 0, s->mb_width*sizeof(*s->top_nnz));

    /* Zero macroblock structures for top/top-left prediction from outside the frame. */
    memset(s->macroblocks + s->mb_height*2 - 1, 0, (s->mb_width+1)*sizeof(*s->macroblocks));

    // top edge of 127 for intra prediction
    if (!(avctx->flags & CODEC_FLAG_EMU_EDGE)) {
        s->top_border[0][15] = s->top_border[0][23] = 127;
        memset(s->top_border[1]-1, 127, s->mb_width*sizeof(*s->top_border)+1);
    }
    memset(s->ref_count, 0, sizeof(s->ref_count));
    if (s->keyframe)
        memset(s->intra4x4_pred_mode_top, DC_PRED, s->mb_width*4);

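/* Motion vectors are allowed to point at most MARGIN units past the frame
 * edge; with quarter-pel luma units, 16 << 2 corresponds to 16 pixels.
 * mv_min/mv_max track the clamping range for the current mb position
 * (one mb is 1 << 6 units wide). */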
#define MARGIN (16 << 2)
    s->mv_min.y = -MARGIN;
    s->mv_max.y = ((s->mb_height - 1) << 6) + MARGIN;

    for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
        VP56RangeCoder *c = &s->coeff_partition[mb_y & (s->num_coeff_partitions-1)];
        VP8Macroblock *mb = s->macroblocks + (s->mb_height - mb_y - 1)*2;
        int mb_xy = mb_y*s->mb_width;
        uint8_t *dst[3] = {
            curframe->data[0] + 16*mb_y*s->linesize,
            curframe->data[1] +  8*mb_y*s->uvlinesize,
            curframe->data[2] +  8*mb_y*s->uvlinesize
        };

        memset(mb - 1, 0, sizeof(*mb));   // zero left macroblock
        memset(s->left_nnz, 0, sizeof(s->left_nnz));
        AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED*0x01010101);

        // left edge of 129 for intra prediction
        if (!(avctx->flags & CODEC_FLAG_EMU_EDGE)) {
            for (i = 0; i < 3; i++)
                for (y = 0; y < 16>>!!i; y++)   // 16 rows for luma, 8 for chroma
                    dst[i][y*curframe->linesize[i]-1] = 129;
            if (mb_y == 1) // top left edge is also 129
                s->top_border[0][15] = s->top_border[0][23] = s->top_border[0][31] = 129;
        }

        s->mv_min.x = -MARGIN;
        s->mv_max.x = ((s->mb_width  - 1) << 6) + MARGIN;
        if (prev_frame && s->segmentation.enabled && !s->segmentation.update_map)
            ff_thread_await_progress(prev_frame, mb_y, 0);

        for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
            /* Prefetch the current frame, 4 MBs ahead */
            s->dsp.prefetch(dst[0] + (mb_x&3)*4*s->linesize + 64, s->linesize, 4);
            s->dsp.prefetch(dst[1] + (mb_x&7)*s->uvlinesize + 64, dst[2] - dst[1], 2);

            decode_mb_mode(s, mb, mb_x, mb_y, curframe->ref_index[0] + mb_xy,
                           prev_frame && prev_frame->ref_index[0] ? prev_frame->ref_index[0] + mb_xy : NULL);

            prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS);

            if (!mb->skip)
                decode_mb_coeffs(s, c, mb, s->top_nnz[mb_x], s->left_nnz);

            if (mb->mode <= MODE_I4x4)
                intra_predict(s, dst, mb, mb_x, mb_y);
            else
                inter_predict(s, dst, mb, mb_x, mb_y);

            prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);

            if (!mb->skip) {
                idct_mb(s, dst, mb);
            } else {
                AV_ZERO64(s->left_nnz);
                AV_WN64(s->top_nnz[mb_x], 0);   // array of 9, so unaligned

                // Reset the DC block predictors that would exist if this mb
                // had coefficients.
                if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
                    s->left_nnz[8]      = 0;
                    s->top_nnz[mb_x][8] = 0;
                }
            }

            if (s->deblock_filter)
                filter_level_for_mb(s, mb, &s->filter_strength[mb_x]);

            prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN2);

            dst[0] += 16;
            dst[1] += 8;
            dst[2] += 8;
            s->mv_min.x -= 64;
            s->mv_max.x -= 64;
        }
        if (s->deblock_filter) {
            if (s->filter.simple)
                filter_mb_row_simple(s, curframe, mb_y);
            else
                filter_mb_row(s, curframe, mb_y);
        }
        s->mv_min.y -= 64;
        s->mv_max.y -= 64;

        ff_thread_report_progress(curframe, mb_y, 0);
    }

    ff_thread_report_progress(curframe, INT_MAX, 0);
skip_decode:
    // if future frames don't use the updated probabilities,
    // reset them to the values we saved
    if (!s->update_probabilities)
        s->prob[0] = s->prob[1];

    memcpy(&s->framep[0], &s->next_framep[0], sizeof(s->framep[0]) * 4);

    if (!s->invisible) {
        *(AVFrame*)data = *curframe;
        *data_size = sizeof(AVFrame);
    }

    return avpkt->size;
}

static av_cold int vp8_decode_init(AVCodecContext *avctx)
{
    VP8Context *s = avctx->priv_data;

    s->avctx = avctx;
    avctx->pix_fmt = PIX_FMT_YUV420P;

    dsputil_init(&s->dsp, avctx);
    ff_h264_pred_init(&s->hpc, CODEC_ID_VP8, 8, 1);
    ff_vp8dsp_init(&s->vp8dsp);

    return 0;
}

static av_cold int vp8_decode_free(AVCodecContext *avctx)
{
    vp8_decode_flush_impl(avctx, 0, 1, 1);
    release_queued_segmaps(avctx->priv_data, 1);
    return 0;
}

static av_cold int vp8_decode_init_thread_copy(AVCodecContext *avctx)
{
    VP8Context *s = avctx->priv_data;

    s->avctx = avctx;

    return 0;
}

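/* The frames[] array is copied by value between thread contexts, so frame
 * pointers from the source context must be rebased to point into this
 * context's array. */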
#define REBASE(pic) \
    pic ? pic - &s_src->frames[0] + &s->frames[0] : NULL

static int vp8_decode_update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
{
    VP8Context *s = dst->priv_data, *s_src = src->priv_data;

    if (s->macroblocks_base &&
        (s_src->mb_width != s->mb_width || s_src->mb_height != s->mb_height)) {
        free_buffers(s);
        s->maps_are_invalid = 1;
    }

    s->prob[0] = s_src->prob[!s_src->update_probabilities];
    s->segmentation = s_src->segmentation;
    s->lf_delta = s_src->lf_delta;
    memcpy(s->sign_bias, s_src->sign_bias, sizeof(s->sign_bias));

    memcpy(&s->frames, &s_src->frames, sizeof(s->frames));
    s->framep[0] = REBASE(s_src->next_framep[0]);
    s->framep[1] = REBASE(s_src->next_framep[1]);
    s->framep[2] = REBASE(s_src->next_framep[2]);
    s->framep[3] = REBASE(s_src->next_framep[3]);

    return 0;
}

AVCodec ff_vp8_decoder = {
    .name                  = "vp8",
    .type                  = AVMEDIA_TYPE_VIDEO,
    .id                    = CODEC_ID_VP8,
    .priv_data_size        = sizeof(VP8Context),
    .init                  = vp8_decode_init,
    .close                 = vp8_decode_free,
    .decode                = vp8_decode_frame,
    .capabilities          = CODEC_CAP_DR1 | CODEC_CAP_FRAME_THREADS,
    .flush                 = vp8_decode_flush,
    .long_name             = NULL_IF_CONFIG_SMALL("On2 VP8"),
    .init_thread_copy      = ONLY_IF_THREADS_ENABLED(vp8_decode_init_thread_copy),
    .update_thread_context = ONLY_IF_THREADS_ENABLED(vp8_decode_update_thread_context),
};
