1=============================================
2Snow Video Codec Specification Draft 20080110
3=============================================
4
5Introduction:
6=============
7This specification describes the Snow bitstream syntax and semantics as
8well as the formal Snow decoding process.
9
10The decoding process is described precisely and any compliant decoder
11MUST produce the exact same output for a spec-conformant Snow stream.
12For encoding, though, any process which generates a stream compliant to
13the syntactical and semantic requirements and which is decodable by
14the process described in this spec shall be considered a conformant
15Snow encoder.
16
17Definitions:
18============
19
20MUST    the specific part must be done to conform to this standard
21SHOULD  it is recommended to be done that way, but not strictly required
22
23ilog2(x) is the rounded down logarithm of x with base 2
24ilog2(0) = 0
25
26Type definitions:
27=================
28
29b   1-bit range coded
30u   unsigned scalar value range coded
31s   signed scalar value range coded
32
33
34Bitstream syntax:
35=================
36
37frame:
38    header
39    prediction
40    residual
41
42header:
43    keyframe                            b   MID_STATE
44    if(keyframe || always_reset)
45        reset_contexts
46    if(keyframe){
47        version                         u   header_state
48        always_reset                    b   header_state
49        temporal_decomposition_type     u   header_state
50        temporal_decomposition_count    u   header_state
51        spatial_decomposition_count     u   header_state
52        colorspace_type                 u   header_state
53        chroma_h_shift                  u   header_state
54        chroma_v_shift                  u   header_state
55        spatial_scalability             b   header_state
56        max_ref_frames-1                u   header_state
57        qlogs
58    }
59    if(!keyframe){
60        update_mc                       b   header_state
61        if(update_mc){
62            for(plane=0; plane<2; plane++){
63                diag_mc                 b   header_state
64                htaps/2-1               u   header_state
65                for(i= p->htaps/2; i; i--)
66                    |hcoeff[i]|         u   header_state
67            }
68        }
69        update_qlogs                    b   header_state
70        if(update_qlogs){
71            spatial_decomposition_count u   header_state
72            qlogs
73        }
74    }
75
76    spatial_decomposition_type          s   header_state
77    qlog                                s   header_state
78    mv_scale                            s   header_state
79    qbias                               s   header_state
80    block_max_depth                     s   header_state
81
82qlogs:
83    for(plane=0; plane<2; plane++){
84        quant_table[plane][0][0]        s   header_state
85        for(level=0; level < spatial_decomposition_count; level++){
86            quant_table[plane][level][1]s   header_state
87            quant_table[plane][level][3]s   header_state
88        }
89    }
90
91reset_contexts
92    *_state[*]= MID_STATE
93
94prediction:
95    for(y=0; y<block_count_vertical; y++)
96        for(x=0; x<block_count_horizontal; x++)
97            block(0)
98
99block(level):
100    mvx_diff=mvy_diff=y_diff=cb_diff=cr_diff=0
101    if(keyframe){
102        intra=1
103    }else{
104        if(level!=max_block_depth){
105            s_context= 2*left->level + 2*top->level + topleft->level + topright->level
106            leaf                        b   block_state[4 + s_context]
107        }
108        if(level==max_block_depth || leaf){
109            intra                       b   block_state[1 + left->intra + top->intra]
110            if(intra){
111                y_diff                  s   block_state[32]
112                cb_diff                 s   block_state[64]
113                cr_diff                 s   block_state[96]
114            }else{
115                ref_context= ilog2(2*left->ref) + ilog2(2*top->ref)
116                if(ref_frames > 1)
117                    ref                 u   block_state[128 + 1024 + 32*ref_context]
118                mx_context= ilog2(2*abs(left->mx - top->mx))
119                my_context= ilog2(2*abs(left->my - top->my))
120                mvx_diff                s   block_state[128 + 32*(mx_context + 16*!!ref)]
121                mvy_diff                s   block_state[128 + 32*(my_context + 16*!!ref)]
122            }
123        }else{
124            block(level+1)
125            block(level+1)
126            block(level+1)
127            block(level+1)
128        }
129    }
130
131
132residual:
133    residual2(luma)
134    residual2(chroma_cr)
135    residual2(chroma_cb)
136
137residual2:
138    for(level=0; level<spatial_decomposition_count; level++){
139        if(level==0)
140            subband(LL, 0)
141        subband(HL, level)
142        subband(LH, level)
143        subband(HH, level)
144    }
145
146subband:
147    FIXME
148
149
150
151Tag description:
152----------------
153
154version
155    0
156    this MUST NOT change within a bitstream
157
158always_reset
159    if 1 then the range coder contexts will be reset after each frame
160
161temporal_decomposition_type
162    0
163
164temporal_decomposition_count
165    0
166
167spatial_decomposition_count
168    FIXME
169
170colorspace_type
171    0
172    this MUST NOT change within a bitstream
173
174chroma_h_shift
175    log2(luma.width / chroma.width)
176    this MUST NOT change within a bitstream
177
178chroma_v_shift
179    log2(luma.height / chroma.height)
180    this MUST NOT change within a bitstream
181
182spatial_scalability
183    0
184
185max_ref_frames
186    maximum number of reference frames
187    this MUST NOT change within a bitstream
188
189update_mc
190    indicates that motion compensation filter parameters are stored in the
191    header
192
193diag_mc
194    flag to enable faster diagonal interpolation
195    this SHOULD be 1 unless it turns out to be covered by a valid patent
196
197htaps
198    number of half pel interpolation filter taps, MUST be even, >0 and <10
199
200hcoeff
201    half pel interpolation filter coefficients, hcoeff[0] are the 2 middle
202    coefficients [1] are the next outer ones and so on, resulting in a filter
203    like: ...eff[2], hcoeff[1], hcoeff[0], hcoeff[0], hcoeff[1], hcoeff[2] ...
204    the sign of the coefficients is not explicitly stored but alternates
205    after each coeff and coeff[0] is positive, so ...,+,-,+,-,+,+,-,+,-,+,...
206    hcoeff[0] is not explicitly stored but found by subtracting the sum
207    of all stored coefficients with signs from 32
208    hcoeff[0]= 32 - hcoeff[1] - hcoeff[2] - ...
209    a good choice for hcoeff and htaps is
210    htaps= 6
211    hcoeff={40,-10,2}
212    an alternative which requires more computations at both encoder and
213    decoder side and may or may not be better is
214    htaps= 8
215    hcoeff={42,-14,6,-2}
216
217
218ref_frames
219    minimum of the number of available reference frames and max_ref_frames
220    for example the first frame after a key frame always has ref_frames=1
221
222spatial_decomposition_type
223    wavelet type
224    0 is a 9/7 symmetric compact integer wavelet
225    1 is a 5/3 symmetric compact integer wavelet
226    others are reserved
227    stored as delta from last, last is reset to 0 if always_reset || keyframe
228
229qlog
230    quality (logarithmic quantizer scale)
231    stored as delta from last, last is reset to 0 if always_reset || keyframe
232
233mv_scale
234    stored as delta from last, last is reset to 0 if always_reset || keyframe
235    FIXME check that everything works fine if this changes between frames
236
237qbias
238    dequantization bias
239    stored as delta from last, last is reset to 0 if always_reset || keyframe
240
241block_max_depth
242    maximum depth of the block tree
243    stored as delta from last, last is reset to 0 if always_reset || keyframe
244
245quant_table
246    quantization table
247
248
249Highlevel bitstream structure:
250=============================
251 --------------------------------------------
252|                   Header                   |
253 --------------------------------------------
254|    ------------------------------------    |
255|   |               Block0               |   |
256|   |             split?                 |   |
257|   |     yes              no            |   |
258|   |  .........         intra?          |   |
259|   | : Block01 :    yes         no      |   |
260|   | : Block02 :  .......   ..........  |   |
261|   | : Block03 : :  y DC : : ref index: |   |
262|   | : Block04 : : cb DC : : motion x : |   |
263|   |  .........  : cr DC : : motion y : |   |
264|   |              .......   ..........  |   |
265|    ------------------------------------    |
266|    ------------------------------------    |
267|   |               Block1               |   |
268|                    ...                     |
269 --------------------------------------------
270| ------------   ------------   ------------ |
271|| Y subbands | | Cb subbands| | Cr subbands||
272||  ---  ---  | |  ---  ---  | |  ---  ---  ||
273|| |LL0||HL0| | | |LL0||HL0| | | |LL0||HL0| ||
274||  ---  ---  | |  ---  ---  | |  ---  ---  ||
275||  ---  ---  | |  ---  ---  | |  ---  ---  ||
276|| |LH0||HH0| | | |LH0||HH0| | | |LH0||HH0| ||
277||  ---  ---  | |  ---  ---  | |  ---  ---  ||
278||  ---  ---  | |  ---  ---  | |  ---  ---  ||
279|| |HL1||LH1| | | |HL1||LH1| | | |HL1||LH1| ||
280||  ---  ---  | |  ---  ---  | |  ---  ---  ||
281||  ---  ---  | |  ---  ---  | |  ---  ---  ||
282|| |HH1||HL2| | | |HH1||HL2| | | |HH1||HL2| ||
283||    ...     | |    ...     | |    ...     ||
284| ------------   ------------   ------------ |
285 --------------------------------------------
286
287Decoding process:
288=================
289
290                                         ------------
291                                        |            |
292                                        |  Subbands  |
293                   ------------         |            |
294                  |            |         ------------
295                  |  Intra DC  |               |
296                  |            |    LL0 subband prediction
297                   ------------                |
298                                \        Dequantization
299 -------------------             \             |
300|  Reference frames |             \           IDWT
301| -------   ------- |    Motion    \           |
302||Frame 0| |Frame 1|| Compensation  .   OBMC   v      -------
303| -------   ------- | --------------. \------> + --->|Frame n|-->output
304| -------   ------- |                                 -------
305||Frame 2| |Frame 3||<----------------------------------/
306|        ...        |
307 -------------------
308
309
310Range Coder:
311============
312
313Binary Range Coder:
314-------------------
315The implemented range coder is an adapted version based upon "Range encoding:
316an algorithm for removing redundancy from a digitised message." by G. N. N.
317Martin.
318The symbols encoded by the Snow range coder are bits (0|1). The
319associated probabilities are not fixed but change depending on the symbol mix
320seen so far.
321
322
323bit seen | new state
324---------+-----------------------------------------------
325    0    | 256 - state_transition_table[256 - old_state];
326    1    |       state_transition_table[      old_state];
327
328state_transition_table = {
329  0,   0,   0,   0,   0,   0,   0,   0,  20,  21,  22,  23,  24,  25,  26,  27,
330 28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  37,  38,  39,  40,  41,  42,
331 43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  56,  57,
332 58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  72,  73,
333 74,  75,  75,  76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,
334 89,  90,  91,  92,  93,  94,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
335104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 114, 115, 116, 117, 118,
336119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 133,
337134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
338150, 151, 152, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
339165, 166, 167, 168, 169, 170, 171, 171, 172, 173, 174, 175, 176, 177, 178, 179,
340180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 190, 191, 192, 194, 194,
341195, 196, 197, 198, 199, 200, 201, 202, 202, 204, 205, 206, 207, 208, 209, 209,
342210, 211, 212, 213, 215, 215, 216, 217, 218, 219, 220, 220, 222, 223, 224, 225,
343226, 227, 227, 229, 229, 230, 231, 232, 234, 234, 235, 236, 237, 238, 239, 240,
344241, 242, 243, 244, 245, 246, 247, 248, 248,   0,   0,   0,   0,   0,   0,   0};
345
346FIXME
347
348
349Range Coding of integers:
350-------------------------
351FIXME
352
353
354Neighboring Blocks:
355===================
356left and top are set to the respective blocks unless they are outside of
357the image in which case they are set to the Null block
358
359top-left is set to the top left block unless it is outside of the image in
360which case it is set to the left block
361
362if this block has no larger parent block or it is at the left side of its
363parent block and the top right block is not outside of the image then the
364top right block is used for top-right else the top-left block is used
365
366Null block
367y,cb,cr are 128
368level, ref, mx and my are 0
369
370
371Motion Vector Prediction:
372=========================
3731. the motion vectors of all the neighboring blocks are scaled to
374compensate for the difference of reference frames
375
376scaled_mv= (mv * (256 * (current_reference+1) / (mv.reference+1)) + 128)>>8
377
3782. the median of the scaled left, top and top-right vectors is used as
379motion vector prediction
380
3813. the used motion vector is the sum of the predictor and
382   (mvx_diff, mvy_diff)*mv_scale
383
384
385Intra DC Prediction:
386====================
387the luma and chroma values of the left block are used as predictors
388
389the used luma and chroma is the sum of the predictor and y_diff, cb_diff, cr_diff
390to reverse this in the decoder apply the following:
391block[y][x].dc[0] = block[y][x-1].dc[0] +  y_diff;
392block[y][x].dc[1] = block[y][x-1].dc[1] + cb_diff;
393block[y][x].dc[2] = block[y][x-1].dc[2] + cr_diff;
394block[*][-1].dc[*]= 128;
395
396
397Motion Compensation:
398====================
399
400Halfpel interpolation:
401----------------------
402halfpel interpolation is done by convolution with the halfpel filter stored
403in the header:
404
405horizontal halfpel samples are found by
406H1[y][x] =    hcoeff[0]*(F[y][x  ] + F[y][x+1])
407            + hcoeff[1]*(F[y][x-1] + F[y][x+2])
408            + hcoeff[2]*(F[y][x-2] + F[y][x+3])
409            + ...
410h1[y][x] = (H1[y][x] + 32)>>6;
411
412vertical halfpel samples are found by
413H2[y][x] =    hcoeff[0]*(F[y  ][x] + F[y+1][x])
414            + hcoeff[1]*(F[y-1][x] + F[y+2][x])
415            + ...
416h2[y][x] = (H2[y][x] + 32)>>6;
417
418vertical+horizontal halfpel samples are found by
419H3[y][x] =    hcoeff[0]*(H2[y][x  ] + H2[y][x+1])
420            + hcoeff[1]*(H2[y][x-1] + H2[y][x+2])
421            + ...
422H3[y][x] =    hcoeff[0]*(H1[y  ][x] + H1[y+1][x])
423            + hcoeff[1]*(H1[y-1][x] + H1[y+2][x])
424            + ...
425h3[y][x] = (H3[y][x] + 2048)>>12;
426
427
428                   F   H1  F
429                   |   |   |
430                   |   |   |
431                   |   |   |
432                   F   H1  F
433                   |   |   |
434                   |   |   |
435                   |   |   |
436   F-------F-------F-> H1<-F-------F-------F
437                   v   v   v
438                  H2   H3  H2
439                   ^   ^   ^
440   F-------F-------F-> H1<-F-------F-------F
441                   |   |   |
442                   |   |   |
443                   |   |   |
444                   F   H1  F
445                   |   |   |
446                   |   |   |
447                   |   |   |
448                   F   H1  F
449
450
451unavailable fullpel samples (outside the picture for example) shall be equal
452to the closest available fullpel sample
453
454
455Smaller pel interpolation:
456--------------------------
457if diag_mc is set then points which lie on a line between 2 vertically,
458horizontally or diagonally adjacent halfpel points shall be interpolated
459linearly, with rounding to nearest and halfway values rounded up.
460points which lie on 2 diagonals at the same time should only use the one
461diagonal not containing the fullpel point
462
463
464
465           F-->O---q---O<--h1->O---q---O<--F
466           v \           / v \           / v
467           O   O       O   O   O       O   O
468           |         /     |     \         |
469           q       q       q       q       q
470           |     /         |         \     |
471           O   O       O   O   O       O   O
472           ^ /           \ ^ /           \ ^
473          h2-->O---q---O<--h3->O---q---O<--h2
474           v \           / v \           / v
475           O   O       O   O   O       O   O
476           |     \         |         /     |
477           q       q       q       q       q
478           |         \     |     /         |
479           O   O       O   O   O       O   O
480           ^ /           \ ^ /           \ ^
481           F-->O---q---O<--h1->O---q---O<--F
482
483
484
485the remaining points shall be bilinearly interpolated from the
486up to 4 surrounding halfpel and fullpel points, again rounding should be to
487nearest and halfway values rounded up
488
489compliant Snow decoders MUST support 1-1/8 pel luma and 1/2-1/16 pel chroma
490interpolation at least
491
492
493Overlapped block motion compensation:
494-------------------------------------
495FIXME
496
497LL band prediction:
498===================
499Each sample in the LL0 subband is predicted by the median of the left, top and
500left+top-topleft samples, samples outside the subband shall be considered to
501be 0. To reverse this prediction in the decoder apply the following.
502for(y=0; y<height; y++){
503    for(x=0; x<width; x++){
504        sample[y][x] += median(sample[y-1][x],
505                               sample[y][x-1],
506                               sample[y-1][x]+sample[y][x-1]-sample[y-1][x-1]);
507    }
508}
509sample[-1][*]=sample[*][-1]= 0;
510width,height here are the width and height of the LL0 subband not of the final
511video
512
513
514Dequantization:
515===============
516FIXME
517
518Wavelet Transform:
519==================
520
521Snow supports 2 wavelet transforms, the symmetric biorthogonal 5/3 integer
522transform and an integer approximation of the symmetric biorthogonal 9/7
523Daubechies wavelet.
524
5252D IDWT (inverse discrete wavelet transform)
526--------------------------------------------
527The 2D IDWT applies a 2D filter recursively, each time combining the
5284 lowest frequency subbands into a single subband until only 1 subband
529remains.
530The 2D filter is done by first applying a 1D filter in the vertical direction
531and then applying it in the horizontal one.
532 ---------------    ---------------    ---------------    ---------------
533|LL0|HL0|       |  |   |   |       |  |       |       |  |       |       |
534|---+---|  HL1  |  | L0|H0 |  HL1  |  |  LL1  |  HL1  |  |       |       |
535|LH0|HH0|       |  |   |   |       |  |       |       |  |       |       |
536|-------+-------|->|-------+-------|->|-------+-------|->|   L1  |  H1   |->...
537|       |       |  |       |       |  |       |       |  |       |       |
538|  LH1  |  HH1  |  |  LH1  |  HH1  |  |  LH1  |  HH1  |  |       |       |
539|       |       |  |       |       |  |       |       |  |       |       |
540 ---------------    ---------------    ---------------    ---------------
541
542
5431D Filter:
544----------
5451. interleave the samples of the low and high frequency subbands like
546s={L0, H0, L1, H1, L2, H2, L3, H3, ... }
547note, this can end with an L or an H, the number of elements shall be w
548s[-1] shall be considered equivalent to s[1  ]
549s[w ] shall be considered equivalent to s[w-2]
550
5512. perform the lifting steps in order as described below
552
5535/3 Integer filter:
5541. s[i] -= (s[i-1] + s[i+1] + 2)>>2; for all even i < w
5552. s[i] += (s[i-1] + s[i+1]    )>>1; for all odd  i < w
556
557\ | /|\ | /|\ | /|\ | /|\
558 \|/ | \|/ | \|/ | \|/ |
559  +  |  +  |  +  |  +  |   -1/4
560 /|\ | /|\ | /|\ | /|\ |
561/ | \|/ | \|/ | \|/ | \|/
562  |  +  |  +  |  +  |  +   +1/2
563
564
565Snow's 9/7 Integer filter:
5661. s[i] -= (3*(s[i-1] + s[i+1])         + 4)>>3; for all even i < w
5672. s[i] -=     s[i-1] + s[i+1]                 ; for all odd  i < w
5683. s[i] += (   s[i-1] + s[i+1] + 4*s[i] + 8)>>4; for all even i < w
5694. s[i] += (3*(s[i-1] + s[i+1])            )>>1; for all odd  i < w
570
571\ | /|\ | /|\ | /|\ | /|\
572 \|/ | \|/ | \|/ | \|/ |
573  +  |  +  |  +  |  +  |   -3/8
574 /|\ | /|\ | /|\ | /|\ |
575/ | \|/ | \|/ | \|/ | \|/
576 (|  + (|  + (|  + (|  +   -1
577\ + /|\ + /|\ + /|\ + /|\  +1/4
578 \|/ | \|/ | \|/ | \|/ |
579  +  |  +  |  +  |  +  |   +1/16
580 /|\ | /|\ | /|\ | /|\ |
581/ | \|/ | \|/ | \|/ | \|/
582  |  +  |  +  |  +  |  +   +3/2
583
584optimization tips:
585following are exactly identical
586(3a)>>1 == a + (a>>1)
587(a + 4b + 8)>>4 == ((a>>2) + b + 2)>>2
588
58916bit implementation note:
590The IDWT can be implemented with 16bits, but this requires some care to
591prevent overflows; the following list gives the minimum number of bits needed
592for some terms
5931. lifting step
594A= s[i-1] + s[i+1]                              16bit
5953*A + 4                                         18bit
596A + (A>>1) + 2                                  17bit
597
5983. lifting step
599s[i-1] + s[i+1]                                 17bit
600
6014. lifting step
6023*(s[i-1] + s[i+1])                             17bit
603
604
605TODO:
606=====
607Important:
608finetune initial contexts
609flip wavelet?
610try to use the wavelet transformed predicted image (motion compensated image) as context for coding the residual coefficients
611try the MV length as context for coding the residual coefficients
612use extradata for stuff which is in the keyframes now?
613the MV median predictor is patented IIRC
614implement per picture halfpel interpolation
615try different range coder state transition tables for different contexts
616
617Not Important:
618compare the 6 tap and 8 tap hpel filters (psnr/bitrate and subjective quality)
619spatial_scalability b vs u (!= 0 breaks syntax anyway so we can add a u later)
620
621
622Credits:
623========
624Michael Niedermayer
625Loren Merritt
626
627
628Copyright:
629==========
630GPL + GFDL + whatever is needed to make this a RFC
631