1// SPDX-License-Identifier: LGPL-2.1+
2/*
3 * Copyright 2016 Tom aan de Wiel
4 * Copyright 2018 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
5 *
6 * 8x8 Fast Walsh Hadamard Transform in sequency order based on the paper:
7 *
8 * A Recursive Algorithm for Sequency-Ordered Fast Walsh Transforms,
9 * R.D. Brown, 1977
10 */
11
12#include <linux/string.h>
13#include <linux/kernel.h>
14#include <linux/videodev2.h>
15#include "codec-fwht.h"
16
17#define OVERFLOW_BIT BIT(14)
18
19/*
20 * Note: bit 0 of the header must always be 0. Otherwise it cannot
21 * be guaranteed that the magic 8 byte sequence (see below) can
22 * never occur in the rlc output.
23 */
24#define PFRAME_BIT BIT(15)
25#define DUPS_MASK 0x1ffe
26
27#define PBLOCK 0
28#define IBLOCK 1
29
30#define ALL_ZEROS 15
31
/*
 * Zigzag scan order: maps scan index -> raster position inside the
 * 8x8 block. Low-sequency coefficients come first so that the zeros
 * produced by quantization cluster at the end of the scan, which the
 * run-length coder in rlc() exploits.
 */
static const uint8_t zigzag[64] = {
	0,
	1,  8,
	2,  9, 16,
	3, 10, 17, 24,
	4, 11, 18, 25, 32,
	5, 12, 19, 26, 33, 40,
	6, 13, 20, 27, 34, 41, 48,
	7, 14, 21, 28, 35, 42, 49, 56,
	15, 22, 29, 36, 43, 50, 57,
	23, 30, 37, 44, 51, 58,
	31, 38, 45, 52, 59,
	39, 46, 53, 60,
	47, 54, 61,
	55, 62,
	63,
};
49
50/*
51 * noinline_for_stack to work around
52 * https://llvm.org/pr38809
53 */
/*
 * Run-length encode one quantized 8x8 block into @output.
 *
 * The block is scanned in zigzag order. Each emitted 16-bit word packs
 * a zero-run length (low 4 bits, at most 14) together with the next
 * coefficient (upper 12 bits). A run value of 15 (ALL_ZEROS) means
 * "all remaining coefficients are zero". The first emitted word is the
 * block header: PFRAME_BIT set for P-blocks, clear for I-blocks.
 *
 * Returns the number of 16-bit words written to @output.
 */
static int noinline_for_stack
rlc(const s16 *in, __be16 *output, int blocktype)
{
	s16 block[8 * 8];
	s16 *wp = block;
	int i = 0;
	int x, y;
	int ret = 0;

	/* read in block from framebuffer */
	int lastzero_run = 0;
	int to_encode;

	for (y = 0; y < 8; y++) {
		for (x = 0; x < 8; x++) {
			*wp = in[x + y * 8];
			wp++;
		}
	}

	/* keep track of amount of trailing zeros */
	for (i = 63; i >= 0 && !block[zigzag[i]]; i--)
		lastzero_run++;

	*output++ = (blocktype == PBLOCK ? htons(PFRAME_BIT) : 0);
	ret++;

	/*
	 * Only worth terminating with an ALL_ZEROS marker if more than
	 * 14 trailing zeros remain (shorter runs fit in a normal word).
	 */
	to_encode = 8 * 8 - (lastzero_run > 14 ? lastzero_run : 0);

	i = 0;
	while (i < to_encode) {
		int cnt = 0;
		int tmp;

		/* count leading zeros */
		while ((tmp = block[zigzag[i]]) == 0 && cnt < 14) {
			cnt++;
			i++;
			if (i == to_encode) {
				/*
				 * Ran off the end inside a zero run: back up
				 * one so the final (zero) coefficient is
				 * emitted as the word's coefficient field.
				 */
				cnt--;
				break;
			}
		}
		/* 4 bits for run, 12 for coefficient (quantization by 4) */
		*output++ = htons((cnt | tmp << 4));
		i++;
		ret++;
	}
	if (lastzero_run > 14) {
		/* "rest of the block is zero" terminator */
		*output = htons(ALL_ZEROS | 0);
		ret++;
	}

	return ret;
}
109
110/*
111 * This function will worst-case increase rlc_in by 65*2 bytes:
112 * one s16 value for the header and 8 * 8 coefficients of type s16.
113 */
/*
 * De-run-length-encode one 8x8 block from *@rlc_in into @dwht_out.
 *
 * Returns the block header word (PFRAME_BIT plus the duplicate count
 * in DUPS_MASK), or OVERFLOW_BIT if the input would be read past
 * @end_of_input. On success *@rlc_in is advanced past the consumed
 * words; the coefficients are written to @dwht_out de-zigzagged, in
 * raster order.
 */
static noinline_for_stack u16
derlc(const __be16 **rlc_in, s16 *dwht_out, const __be16 *end_of_input)
{
	/* header */
	const __be16 *input = *rlc_in;
	u16 stat;
	int dec_count = 0;
	s16 block[8 * 8 + 16];
	s16 *wp = block;
	int i;

	if (input > end_of_input)
		return OVERFLOW_BIT;
	stat = ntohs(*input++);

	/*
	 * Now de-compress, it expands one byte to up to 15 bytes
	 * (or fills the remainder of the 64 bytes with zeroes if it
	 * is the last byte to expand).
	 *
	 * So block has to be 8 * 8 + 16 bytes, the '+ 16' is to
	 * allow for overflow if the incoming data was malformed.
	 */
	while (dec_count < 8 * 8) {
		s16 in;
		int length;
		int coeff;

		if (input > end_of_input)
			return OVERFLOW_BIT;
		in = ntohs(*input++);
		length = in & 0xf;
		coeff = in >> 4;

		/* fill remainder with zeros */
		if (length == 15) {
			for (i = 0; i < 64 - dec_count; i++)
				*wp++ = 0;
			break;
		}

		for (i = 0; i < length; i++)
			*wp++ = 0;
		*wp++ = coeff;
		dec_count += length + 1;
	}

	wp = block;

	/* undo the zigzag scan: write coefficients back in raster order */
	for (i = 0; i < 64; i++) {
		int pos = zigzag[i];
		int y = pos / 8;
		int x = pos % 8;

		dwht_out[x + y * 8] = *wp++;
	}
	*rlc_in = input;
	return stat;
}
173
/*
 * Per-coefficient quantization shift amounts for intra blocks, in
 * raster order: the higher-sequency coefficients (towards the bottom
 * right) are quantized more coarsely.
 */
static const int quant_table[] = {
	2, 2, 2, 2, 2, 2,  2,  2,
	2, 2, 2, 2, 2, 2,  2,  2,
	2, 2, 2, 2, 2, 2,  2,  3,
	2, 2, 2, 2, 2, 2,  3,  6,
	2, 2, 2, 2, 2, 3,  6,  6,
	2, 2, 2, 2, 3, 6,  6,  6,
	2, 2, 2, 3, 6, 6,  6,  6,
	2, 2, 3, 6, 6, 6,  6,  8,
};
184
/*
 * Per-coefficient quantization shift amounts for predicted (P) blocks,
 * in raster order. P-block deltas get twice the range of intra samples,
 * hence the larger shifts compared to quant_table.
 */
static const int quant_table_p[] = {
	3, 3, 3, 3, 3, 3,  3,  3,
	3, 3, 3, 3, 3, 3,  3,  3,
	3, 3, 3, 3, 3, 3,  3,  3,
	3, 3, 3, 3, 3, 3,  3,  6,
	3, 3, 3, 3, 3, 3,  6,  6,
	3, 3, 3, 3, 3, 6,  6,  9,
	3, 3, 3, 3, 6, 6,  9,  9,
	3, 3, 3, 6, 6, 9,  9,  10,
};
195
196static void quantize_intra(s16 *coeff, s16 *de_coeff, u16 qp)
197{
198	const int *quant = quant_table;
199	int i, j;
200
201	for (j = 0; j < 8; j++) {
202		for (i = 0; i < 8; i++, quant++, coeff++, de_coeff++) {
203			*coeff >>= *quant;
204			if (*coeff >= -qp && *coeff <= qp)
205				*coeff = *de_coeff = 0;
206			else
207				*de_coeff = *coeff << *quant;
208		}
209	}
210}
211
212static void dequantize_intra(s16 *coeff)
213{
214	const int *quant = quant_table;
215	int i, j;
216
217	for (j = 0; j < 8; j++)
218		for (i = 0; i < 8; i++, quant++, coeff++)
219			*coeff <<= *quant;
220}
221
222static void quantize_inter(s16 *coeff, s16 *de_coeff, u16 qp)
223{
224	const int *quant = quant_table_p;
225	int i, j;
226
227	for (j = 0; j < 8; j++) {
228		for (i = 0; i < 8; i++, quant++, coeff++, de_coeff++) {
229			*coeff >>= *quant;
230			if (*coeff >= -qp && *coeff <= qp)
231				*coeff = *de_coeff = 0;
232			else
233				*de_coeff = *coeff << *quant;
234		}
235	}
236}
237
238static void dequantize_inter(s16 *coeff)
239{
240	const int *quant = quant_table_p;
241	int i, j;
242
243	for (j = 0; j < 8; j++)
244		for (i = 0; i < 8; i++, quant++, coeff++)
245			*coeff <<= *quant;
246}
247
/*
 * Forward 8x8 Fast Walsh Hadamard Transform of an 8-bit sample block.
 *
 * @block:        top-left sample of the 8x8 block in the raw frame
 * @output_block: 64 transformed coefficients, raster order
 * @stride:       bytes between vertically adjacent samples
 * @input_step:   bytes between horizontally adjacent samples (1-4, to
 *                skip interleaved components of packed formats)
 * @intra:        for intra blocks, subtract 256 from each stage-1 pair
 *                sum, i.e. level-shift every sample by -128
 *
 * The first pass transforms the rows (three butterfly stages), the
 * second pass transforms the columns of the intermediate result.
 */
static void noinline_for_stack fwht(const u8 *block, s16 *output_block,
				    unsigned int stride,
				    unsigned int input_step, bool intra)
{
	/* we'll need more than 8 bits for the transformed coefficients */
	s32 workspace1[8], workspace2[8];
	const u8 *tmp = block;
	s16 *out = output_block;
	int add = intra ? 256 : 0;
	unsigned int i;

	/* stage 1 */
	for (i = 0; i < 8; i++, tmp += stride, out += 8) {
		/* pick up 8 samples of the row, input_step bytes apart */
		switch (input_step) {
		case 1:
			workspace1[0]  = tmp[0] + tmp[1] - add;
			workspace1[1]  = tmp[0] - tmp[1];

			workspace1[2]  = tmp[2] + tmp[3] - add;
			workspace1[3]  = tmp[2] - tmp[3];

			workspace1[4]  = tmp[4] + tmp[5] - add;
			workspace1[5]  = tmp[4] - tmp[5];

			workspace1[6]  = tmp[6] + tmp[7] - add;
			workspace1[7]  = tmp[6] - tmp[7];
			break;
		case 2:
			workspace1[0]  = tmp[0] + tmp[2] - add;
			workspace1[1]  = tmp[0] - tmp[2];

			workspace1[2]  = tmp[4] + tmp[6] - add;
			workspace1[3]  = tmp[4] - tmp[6];

			workspace1[4]  = tmp[8] + tmp[10] - add;
			workspace1[5]  = tmp[8] - tmp[10];

			workspace1[6]  = tmp[12] + tmp[14] - add;
			workspace1[7]  = tmp[12] - tmp[14];
			break;
		case 3:
			workspace1[0]  = tmp[0] + tmp[3] - add;
			workspace1[1]  = tmp[0] - tmp[3];

			workspace1[2]  = tmp[6] + tmp[9] - add;
			workspace1[3]  = tmp[6] - tmp[9];

			workspace1[4]  = tmp[12] + tmp[15] - add;
			workspace1[5]  = tmp[12] - tmp[15];

			workspace1[6]  = tmp[18] + tmp[21] - add;
			workspace1[7]  = tmp[18] - tmp[21];
			break;
		default:
			workspace1[0]  = tmp[0] + tmp[4] - add;
			workspace1[1]  = tmp[0] - tmp[4];

			workspace1[2]  = tmp[8] + tmp[12] - add;
			workspace1[3]  = tmp[8] - tmp[12];

			workspace1[4]  = tmp[16] + tmp[20] - add;
			workspace1[5]  = tmp[16] - tmp[20];

			workspace1[6]  = tmp[24] + tmp[28] - add;
			workspace1[7]  = tmp[24] - tmp[28];
			break;
		}

		/* stage 2 */
		workspace2[0] = workspace1[0] + workspace1[2];
		workspace2[1] = workspace1[0] - workspace1[2];
		workspace2[2] = workspace1[1] - workspace1[3];
		workspace2[3] = workspace1[1] + workspace1[3];

		workspace2[4] = workspace1[4] + workspace1[6];
		workspace2[5] = workspace1[4] - workspace1[6];
		workspace2[6] = workspace1[5] - workspace1[7];
		workspace2[7] = workspace1[5] + workspace1[7];

		/* stage 3 */
		out[0] = workspace2[0] + workspace2[4];
		out[1] = workspace2[0] - workspace2[4];
		out[2] = workspace2[1] - workspace2[5];
		out[3] = workspace2[1] + workspace2[5];
		out[4] = workspace2[2] + workspace2[6];
		out[5] = workspace2[2] - workspace2[6];
		out[6] = workspace2[3] - workspace2[7];
		out[7] = workspace2[3] + workspace2[7];
	}

	out = output_block;

	/* second pass: same butterfly applied to the columns */
	for (i = 0; i < 8; i++, out++) {
		/* stage 1 */
		workspace1[0]  = out[0] + out[1 * 8];
		workspace1[1]  = out[0] - out[1 * 8];

		workspace1[2]  = out[2 * 8] + out[3 * 8];
		workspace1[3]  = out[2 * 8] - out[3 * 8];

		workspace1[4]  = out[4 * 8] + out[5 * 8];
		workspace1[5]  = out[4 * 8] - out[5 * 8];

		workspace1[6]  = out[6 * 8] + out[7 * 8];
		workspace1[7]  = out[6 * 8] - out[7 * 8];

		/* stage 2 */
		workspace2[0] = workspace1[0] + workspace1[2];
		workspace2[1] = workspace1[0] - workspace1[2];
		workspace2[2] = workspace1[1] - workspace1[3];
		workspace2[3] = workspace1[1] + workspace1[3];

		workspace2[4] = workspace1[4] + workspace1[6];
		workspace2[5] = workspace1[4] - workspace1[6];
		workspace2[6] = workspace1[5] - workspace1[7];
		workspace2[7] = workspace1[5] + workspace1[7];
		/* stage 3 */
		out[0 * 8] = workspace2[0] + workspace2[4];
		out[1 * 8] = workspace2[0] - workspace2[4];
		out[2 * 8] = workspace2[1] - workspace2[5];
		out[3 * 8] = workspace2[1] + workspace2[5];
		out[4 * 8] = workspace2[2] + workspace2[6];
		out[5 * 8] = workspace2[2] - workspace2[6];
		out[6 * 8] = workspace2[3] - workspace2[7];
		out[7 * 8] = workspace2[3] + workspace2[7];
	}
}
375
376/*
377 * Not the nicest way of doing it, but P-blocks get twice the range of
378 * that of the I-blocks. Therefore we need a type bigger than 8 bits.
379 * Furthermore values can be negative... This is just a version that
380 * works with 16 signed data
381 */
/*
 * Forward 8x8 FWHT of a block of 16-bit signed samples (P-block deltas
 * produced by decide_blocktype()). Same two-pass butterfly structure as
 * fwht(), but without the input_step addressing and without the intra
 * level shift.
 *
 * @stride: s16 elements between vertically adjacent samples
 * @intra:  currently unused (the body never references it)
 */
static void noinline_for_stack
fwht16(const s16 *block, s16 *output_block, int stride, int intra)
{
	/* we'll need more than 8 bits for the transformed coefficients */
	s32 workspace1[8], workspace2[8];
	const s16 *tmp = block;
	s16 *out = output_block;
	int i;

	/* first pass: transform the rows */
	for (i = 0; i < 8; i++, tmp += stride, out += 8) {
		/* stage 1 */
		workspace1[0]  = tmp[0] + tmp[1];
		workspace1[1]  = tmp[0] - tmp[1];

		workspace1[2]  = tmp[2] + tmp[3];
		workspace1[3]  = tmp[2] - tmp[3];

		workspace1[4]  = tmp[4] + tmp[5];
		workspace1[5]  = tmp[4] - tmp[5];

		workspace1[6]  = tmp[6] + tmp[7];
		workspace1[7]  = tmp[6] - tmp[7];

		/* stage 2 */
		workspace2[0] = workspace1[0] + workspace1[2];
		workspace2[1] = workspace1[0] - workspace1[2];
		workspace2[2] = workspace1[1] - workspace1[3];
		workspace2[3] = workspace1[1] + workspace1[3];

		workspace2[4] = workspace1[4] + workspace1[6];
		workspace2[5] = workspace1[4] - workspace1[6];
		workspace2[6] = workspace1[5] - workspace1[7];
		workspace2[7] = workspace1[5] + workspace1[7];

		/* stage 3 */
		out[0] = workspace2[0] + workspace2[4];
		out[1] = workspace2[0] - workspace2[4];
		out[2] = workspace2[1] - workspace2[5];
		out[3] = workspace2[1] + workspace2[5];
		out[4] = workspace2[2] + workspace2[6];
		out[5] = workspace2[2] - workspace2[6];
		out[6] = workspace2[3] - workspace2[7];
		out[7] = workspace2[3] + workspace2[7];
	}

	out = output_block;

	/* second pass: transform the columns */
	for (i = 0; i < 8; i++, out++) {
		/* stage 1 */
		workspace1[0]  = out[0] + out[1*8];
		workspace1[1]  = out[0] - out[1*8];

		workspace1[2]  = out[2*8] + out[3*8];
		workspace1[3]  = out[2*8] - out[3*8];

		workspace1[4]  = out[4*8] + out[5*8];
		workspace1[5]  = out[4*8] - out[5*8];

		workspace1[6]  = out[6*8] + out[7*8];
		workspace1[7]  = out[6*8] - out[7*8];

		/* stage 2 */
		workspace2[0] = workspace1[0] + workspace1[2];
		workspace2[1] = workspace1[0] - workspace1[2];
		workspace2[2] = workspace1[1] - workspace1[3];
		workspace2[3] = workspace1[1] + workspace1[3];

		workspace2[4] = workspace1[4] + workspace1[6];
		workspace2[5] = workspace1[4] - workspace1[6];
		workspace2[6] = workspace1[5] - workspace1[7];
		workspace2[7] = workspace1[5] + workspace1[7];

		/* stage 3 */
		out[0*8] = workspace2[0] + workspace2[4];
		out[1*8] = workspace2[0] - workspace2[4];
		out[2*8] = workspace2[1] - workspace2[5];
		out[3*8] = workspace2[1] + workspace2[5];
		out[4*8] = workspace2[2] + workspace2[6];
		out[5*8] = workspace2[2] - workspace2[6];
		out[6*8] = workspace2[3] - workspace2[7];
		out[7*8] = workspace2[3] + workspace2[7];
	}
}
465
466static noinline_for_stack void
467ifwht(const s16 *block, s16 *output_block, int intra)
468{
469	/*
470	 * we'll need more than 8 bits for the transformed coefficients
471	 * use native unit of cpu
472	 */
473	int workspace1[8], workspace2[8];
474	int inter = intra ? 0 : 1;
475	const s16 *tmp = block;
476	s16 *out = output_block;
477	int i;
478
479	for (i = 0; i < 8; i++, tmp += 8, out += 8) {
480		/* stage 1 */
481		workspace1[0]  = tmp[0] + tmp[1];
482		workspace1[1]  = tmp[0] - tmp[1];
483
484		workspace1[2]  = tmp[2] + tmp[3];
485		workspace1[3]  = tmp[2] - tmp[3];
486
487		workspace1[4]  = tmp[4] + tmp[5];
488		workspace1[5]  = tmp[4] - tmp[5];
489
490		workspace1[6]  = tmp[6] + tmp[7];
491		workspace1[7]  = tmp[6] - tmp[7];
492
493		/* stage 2 */
494		workspace2[0] = workspace1[0] + workspace1[2];
495		workspace2[1] = workspace1[0] - workspace1[2];
496		workspace2[2] = workspace1[1] - workspace1[3];
497		workspace2[3] = workspace1[1] + workspace1[3];
498
499		workspace2[4] = workspace1[4] + workspace1[6];
500		workspace2[5] = workspace1[4] - workspace1[6];
501		workspace2[6] = workspace1[5] - workspace1[7];
502		workspace2[7] = workspace1[5] + workspace1[7];
503
504		/* stage 3 */
505		out[0] = workspace2[0] + workspace2[4];
506		out[1] = workspace2[0] - workspace2[4];
507		out[2] = workspace2[1] - workspace2[5];
508		out[3] = workspace2[1] + workspace2[5];
509		out[4] = workspace2[2] + workspace2[6];
510		out[5] = workspace2[2] - workspace2[6];
511		out[6] = workspace2[3] - workspace2[7];
512		out[7] = workspace2[3] + workspace2[7];
513	}
514
515	out = output_block;
516
517	for (i = 0; i < 8; i++, out++) {
518		/* stage 1 */
519		workspace1[0]  = out[0] + out[1 * 8];
520		workspace1[1]  = out[0] - out[1 * 8];
521
522		workspace1[2]  = out[2 * 8] + out[3 * 8];
523		workspace1[3]  = out[2 * 8] - out[3 * 8];
524
525		workspace1[4]  = out[4 * 8] + out[5 * 8];
526		workspace1[5]  = out[4 * 8] - out[5 * 8];
527
528		workspace1[6]  = out[6 * 8] + out[7 * 8];
529		workspace1[7]  = out[6 * 8] - out[7 * 8];
530
531		/* stage 2 */
532		workspace2[0] = workspace1[0] + workspace1[2];
533		workspace2[1] = workspace1[0] - workspace1[2];
534		workspace2[2] = workspace1[1] - workspace1[3];
535		workspace2[3] = workspace1[1] + workspace1[3];
536
537		workspace2[4] = workspace1[4] + workspace1[6];
538		workspace2[5] = workspace1[4] - workspace1[6];
539		workspace2[6] = workspace1[5] - workspace1[7];
540		workspace2[7] = workspace1[5] + workspace1[7];
541
542		/* stage 3 */
543		if (inter) {
544			int d;
545
546			out[0 * 8] = workspace2[0] + workspace2[4];
547			out[1 * 8] = workspace2[0] - workspace2[4];
548			out[2 * 8] = workspace2[1] - workspace2[5];
549			out[3 * 8] = workspace2[1] + workspace2[5];
550			out[4 * 8] = workspace2[2] + workspace2[6];
551			out[5 * 8] = workspace2[2] - workspace2[6];
552			out[6 * 8] = workspace2[3] - workspace2[7];
553			out[7 * 8] = workspace2[3] + workspace2[7];
554
555			for (d = 0; d < 8; d++)
556				out[8 * d] >>= 6;
557		} else {
558			int d;
559
560			out[0 * 8] = workspace2[0] + workspace2[4];
561			out[1 * 8] = workspace2[0] - workspace2[4];
562			out[2 * 8] = workspace2[1] - workspace2[5];
563			out[3 * 8] = workspace2[1] + workspace2[5];
564			out[4 * 8] = workspace2[2] + workspace2[6];
565			out[5 * 8] = workspace2[2] - workspace2[6];
566			out[6 * 8] = workspace2[3] - workspace2[7];
567			out[7 * 8] = workspace2[3] + workspace2[7];
568
569			for (d = 0; d < 8; d++) {
570				out[8 * d] >>= 6;
571				out[8 * d] += 128;
572			}
573		}
574	}
575}
576
577static void fill_encoder_block(const u8 *input, s16 *dst,
578			       unsigned int stride, unsigned int input_step)
579{
580	int i, j;
581
582	for (i = 0; i < 8; i++) {
583		for (j = 0; j < 8; j++, input += input_step)
584			*dst++ = *input;
585		input += stride - 8 * input_step;
586	}
587}
588
589static int var_intra(const s16 *input)
590{
591	int32_t mean = 0;
592	int32_t ret = 0;
593	const s16 *tmp = input;
594	int i;
595
596	for (i = 0; i < 8 * 8; i++, tmp++)
597		mean += *tmp;
598	mean /= 64;
599	tmp = input;
600	for (i = 0; i < 8 * 8; i++, tmp++)
601		ret += (*tmp - mean) < 0 ? -(*tmp - mean) : (*tmp - mean);
602	return ret;
603}
604
605static int var_inter(const s16 *old, const s16 *new)
606{
607	int32_t ret = 0;
608	int i;
609
610	for (i = 0; i < 8 * 8; i++, old++, new++)
611		ret += (*old - *new) < 0 ? -(*old - *new) : (*old - *new);
612	return ret;
613}
614
615static noinline_for_stack int
616decide_blocktype(const u8 *cur, const u8 *reference, s16 *deltablock,
617		 unsigned int stride, unsigned int input_step)
618{
619	s16 tmp[64];
620	s16 old[64];
621	s16 *work = tmp;
622	unsigned int k, l;
623	int vari;
624	int vard;
625
626	fill_encoder_block(cur, tmp, stride, input_step);
627	fill_encoder_block(reference, old, 8, 1);
628	vari = var_intra(tmp);
629
630	for (k = 0; k < 8; k++) {
631		for (l = 0; l < 8; l++) {
632			*deltablock = *work - *reference;
633			deltablock++;
634			work++;
635			reference++;
636		}
637	}
638	deltablock -= 64;
639	vard = var_inter(old, tmp);
640	return vari <= vard ? IBLOCK : PBLOCK;
641}
642
643static void fill_decoder_block(u8 *dst, const s16 *input, int stride,
644			       unsigned int dst_step)
645{
646	int i, j;
647
648	for (i = 0; i < 8; i++) {
649		for (j = 0; j < 8; j++, input++, dst += dst_step) {
650			if (*input < 0)
651				*dst = 0;
652			else if (*input > 255)
653				*dst = 255;
654			else
655				*dst = *input;
656		}
657		dst += stride - (8 * dst_step);
658	}
659}
660
661static void add_deltas(s16 *deltas, const u8 *ref, int stride,
662		       unsigned int ref_step)
663{
664	int k, l;
665
666	for (k = 0; k < 8; k++) {
667		for (l = 0; l < 8; l++) {
668			*deltas += *ref;
669			ref += ref_step;
670			/*
671			 * Due to quantizing, it might possible that the
672			 * decoded coefficients are slightly out of range
673			 */
674			if (*deltas < 0)
675				*deltas = 0;
676			else if (*deltas > 255)
677				*deltas = 255;
678			deltas++;
679		}
680		ref += stride - (8 * ref_step);
681	}
682}
683
/*
 * Encode one plane of the frame into the RLC stream at *@rlco.
 *
 * Every 8x8 block is coded intra (I) or predicted (P): for P-blocks
 * the delta against the reference plane is transformed instead of the
 * raw samples. Unless the next frame is intra, the reference plane is
 * updated with the decoder's view of this block (inverse transform of
 * the de-quantized coefficients), so encoder and decoder stay in sync.
 *
 * Consecutive blocks whose RLC output is identical are deduplicated:
 * instead of emitting the block again, the duplicate counter stored in
 * the DUPS_MASK bits of the previous block header is incremented.
 *
 * If the stream would grow past @rlco_max, the plane is stored
 * uncompressed instead and FWHT_FRAME_UNENCODED is set in the returned
 * encoding flags; FWHT_FRAME_PCODED is set when at least one P-block
 * was coded.
 */
static u32 encode_plane(u8 *input, u8 *refp, __be16 **rlco, __be16 *rlco_max,
			struct fwht_cframe *cf, u32 height, u32 width,
			u32 stride, unsigned int input_step,
			bool is_intra, bool next_is_intra)
{
	u8 *input_start = input;
	__be16 *rlco_start = *rlco;
	s16 deltablock[64];
	__be16 pframe_bit = htons(PFRAME_BIT);
	u32 encoding = 0;
	unsigned int last_size = 0;
	unsigned int i, j;

	width = round_up(width, 8);
	height = round_up(height, 8);

	for (j = 0; j < height / 8; j++) {
		input = input_start + j * 8 * stride;
		for (i = 0; i < width / 8; i++) {
			/* intra code, first frame is always intra coded. */
			int blocktype = IBLOCK;
			unsigned int size;

			if (!is_intra)
				blocktype = decide_blocktype(input, refp,
					deltablock, stride, input_step);
			if (blocktype == IBLOCK) {
				fwht(input, cf->coeffs, stride, input_step, 1);
				quantize_intra(cf->coeffs, cf->de_coeffs,
					       cf->i_frame_qp);
			} else {
				/* inter code */
				encoding |= FWHT_FRAME_PCODED;
				fwht16(deltablock, cf->coeffs, 8, 0);
				quantize_inter(cf->coeffs, cf->de_coeffs,
					       cf->p_frame_qp);
			}
			if (!next_is_intra) {
				/* update the reference with the decoded block */
				ifwht(cf->de_coeffs, cf->de_fwht, blocktype);

				if (blocktype == PBLOCK)
					add_deltas(cf->de_fwht, refp, 8, 1);
				fill_decoder_block(refp, cf->de_fwht, 8, 1);
			}

			input += 8 * input_step;
			refp += 8 * 8;

			size = rlc(cf->coeffs, *rlco, blocktype);
			/*
			 * Deduplicate: compare against the previous block's
			 * RLC words, skipping the header word of each.
			 */
			if (last_size == size &&
			    !memcmp(*rlco + 1, *rlco - size + 1, 2 * size - 2)) {
				__be16 *last_rlco = *rlco - size;
				s16 hdr = ntohs(*last_rlco);

				/*
				 * Same frame-type bit and the duplicate
				 * counter hasn't saturated: bump it (counter
				 * lives in bits 1-12, hence += 2).
				 */
				if (!((*last_rlco ^ **rlco) & pframe_bit) &&
				    (hdr & DUPS_MASK) < DUPS_MASK)
					*last_rlco = htons(hdr + 2);
				else
					*rlco += size;
			} else {
				*rlco += size;
			}
			if (*rlco >= rlco_max) {
				encoding |= FWHT_FRAME_UNENCODED;
				goto exit_loop;
			}
			last_size = size;
		}
	}

exit_loop:
	if (encoding & FWHT_FRAME_UNENCODED) {
		u8 *out = (u8 *)rlco_start;
		u8 *p;

		input = input_start;
		/*
		 * The compressed stream should never contain the magic
		 * header, so when we copy the YUV data we replace 0xff
		 * by 0xfe. Since YUV is limited range such values
		 * shouldn't appear anyway.
		 */
		for (j = 0; j < height; j++) {
			for (i = 0, p = input; i < width; i++, p += input_step)
				*out++ = (*p == 0xff) ? 0xfe : *p;
			input += stride;
		}
		*rlco = (__be16 *)out;
		encoding &= ~FWHT_FRAME_PCODED;
	}
	return encoding;
}
776
/*
 * Encode a raw frame into the compressed stream in @cf->rlc_data.
 *
 * Encodes the luma plane, then (for 3+ component formats) the two
 * chroma planes, then (for 4 component formats) the alpha plane.
 * Each plane's per-plane FWHT_FRAME_UNENCODED result is translated
 * into the corresponding FWHT_*_UNENCODED flag in the returned
 * encoding mask; FWHT_FRAME_PCODED is set if any plane used P-blocks.
 *
 * Each rlco_max leaves a 256-word safety margin before the end of the
 * plane's share of the buffer, since a macroblock can expand the
 * stream before the overflow check triggers.
 */
u32 fwht_encode_frame(struct fwht_raw_frame *frm,
		      struct fwht_raw_frame *ref_frm,
		      struct fwht_cframe *cf,
		      bool is_intra, bool next_is_intra,
		      unsigned int width, unsigned int height,
		      unsigned int stride, unsigned int chroma_stride)
{
	unsigned int size = height * width;
	__be16 *rlco = cf->rlc_data;
	__be16 *rlco_max;
	u32 encoding;

	rlco_max = rlco + size / 2 - 256;
	encoding = encode_plane(frm->luma, ref_frm->luma, &rlco, rlco_max, cf,
				height, width, stride,
				frm->luma_alpha_step, is_intra, next_is_intra);
	if (encoding & FWHT_FRAME_UNENCODED)
		encoding |= FWHT_LUMA_UNENCODED;
	encoding &= ~FWHT_FRAME_UNENCODED;

	if (frm->components_num >= 3) {
		/* chroma planes may be subsampled by width_div/height_div */
		u32 chroma_h = height / frm->height_div;
		u32 chroma_w = width / frm->width_div;
		unsigned int chroma_size = chroma_h * chroma_w;

		rlco_max = rlco + chroma_size / 2 - 256;
		encoding |= encode_plane(frm->cb, ref_frm->cb, &rlco, rlco_max,
					 cf, chroma_h, chroma_w,
					 chroma_stride, frm->chroma_step,
					 is_intra, next_is_intra);
		if (encoding & FWHT_FRAME_UNENCODED)
			encoding |= FWHT_CB_UNENCODED;
		encoding &= ~FWHT_FRAME_UNENCODED;
		rlco_max = rlco + chroma_size / 2 - 256;
		encoding |= encode_plane(frm->cr, ref_frm->cr, &rlco, rlco_max,
					 cf, chroma_h, chroma_w,
					 chroma_stride, frm->chroma_step,
					 is_intra, next_is_intra);
		if (encoding & FWHT_FRAME_UNENCODED)
			encoding |= FWHT_CR_UNENCODED;
		encoding &= ~FWHT_FRAME_UNENCODED;
	}

	if (frm->components_num == 4) {
		rlco_max = rlco + size / 2 - 256;
		encoding |= encode_plane(frm->alpha, ref_frm->alpha, &rlco,
					 rlco_max, cf, height, width,
					 stride, frm->luma_alpha_step,
					 is_intra, next_is_intra);
		if (encoding & FWHT_FRAME_UNENCODED)
			encoding |= FWHT_ALPHA_UNENCODED;
		encoding &= ~FWHT_FRAME_UNENCODED;
	}

	/* total compressed size in bytes */
	cf->size = (rlco - cf->rlc_data) * sizeof(*rlco);
	return encoding;
}
834
835static bool decode_plane(struct fwht_cframe *cf, const __be16 **rlco,
836			 u32 height, u32 width, const u8 *ref, u32 ref_stride,
837			 unsigned int ref_step, u8 *dst,
838			 unsigned int dst_stride, unsigned int dst_step,
839			 bool uncompressed, const __be16 *end_of_rlco_buf)
840{
841	unsigned int copies = 0;
842	s16 copy[8 * 8];
843	u16 stat;
844	unsigned int i, j;
845	bool is_intra = !ref;
846
847	width = round_up(width, 8);
848	height = round_up(height, 8);
849
850	if (uncompressed) {
851		int i;
852
853		if (end_of_rlco_buf + 1 < *rlco + width * height / 2)
854			return false;
855		for (i = 0; i < height; i++) {
856			memcpy(dst, *rlco, width);
857			dst += dst_stride;
858			*rlco += width / 2;
859		}
860		return true;
861	}
862
863	/*
864	 * When decoding each macroblock the rlco pointer will be increased
865	 * by 65 * 2 bytes worst-case.
866	 * To avoid overflow the buffer has to be 65/64th of the actual raw
867	 * image size, just in case someone feeds it malicious data.
868	 */
869	for (j = 0; j < height / 8; j++) {
870		for (i = 0; i < width / 8; i++) {
871			const u8 *refp = ref + j * 8 * ref_stride +
872				i * 8 * ref_step;
873			u8 *dstp = dst + j * 8 * dst_stride + i * 8 * dst_step;
874
875			if (copies) {
876				memcpy(cf->de_fwht, copy, sizeof(copy));
877				if ((stat & PFRAME_BIT) && !is_intra)
878					add_deltas(cf->de_fwht, refp,
879						   ref_stride, ref_step);
880				fill_decoder_block(dstp, cf->de_fwht,
881						   dst_stride, dst_step);
882				copies--;
883				continue;
884			}
885
886			stat = derlc(rlco, cf->coeffs, end_of_rlco_buf);
887			if (stat & OVERFLOW_BIT)
888				return false;
889			if ((stat & PFRAME_BIT) && !is_intra)
890				dequantize_inter(cf->coeffs);
891			else
892				dequantize_intra(cf->coeffs);
893
894			ifwht(cf->coeffs, cf->de_fwht,
895			      ((stat & PFRAME_BIT) && !is_intra) ? 0 : 1);
896
897			copies = (stat & DUPS_MASK) >> 1;
898			if (copies)
899				memcpy(copy, cf->de_fwht, sizeof(copy));
900			if ((stat & PFRAME_BIT) && !is_intra)
901				add_deltas(cf->de_fwht, refp,
902					   ref_stride, ref_step);
903			fill_decoder_block(dstp, cf->de_fwht, dst_stride,
904					   dst_step);
905		}
906	}
907	return true;
908}
909
/*
 * Decode a compressed frame in @cf->rlc_data into the raw planes of
 * @dst: luma first, then (for 3+ components) Cb and Cr — halved in
 * height/width unless the corresponding FULL_HEIGHT/FULL_WIDTH header
 * flags are set — then (for 4 components) alpha. Per-plane
 * *_IS_UNCOMPRESSED header flags select raw copy instead of decoding.
 *
 * Returns false as soon as any plane fails to decode, true on success.
 */
bool fwht_decode_frame(struct fwht_cframe *cf, u32 hdr_flags,
		       unsigned int components_num, unsigned int width,
		       unsigned int height, const struct fwht_raw_frame *ref,
		       unsigned int ref_stride, unsigned int ref_chroma_stride,
		       struct fwht_raw_frame *dst, unsigned int dst_stride,
		       unsigned int dst_chroma_stride)
{
	const __be16 *rlco = cf->rlc_data;
	/* last valid 16-bit word of the compressed input */
	const __be16 *end_of_rlco_buf = cf->rlc_data +
			(cf->size / sizeof(*rlco)) - 1;

	if (!decode_plane(cf, &rlco, height, width, ref->luma, ref_stride,
			  ref->luma_alpha_step, dst->luma, dst_stride,
			  dst->luma_alpha_step,
			  hdr_flags & V4L2_FWHT_FL_LUMA_IS_UNCOMPRESSED,
			  end_of_rlco_buf))
		return false;

	if (components_num >= 3) {
		u32 h = height;
		u32 w = width;

		if (!(hdr_flags & V4L2_FWHT_FL_CHROMA_FULL_HEIGHT))
			h /= 2;
		if (!(hdr_flags & V4L2_FWHT_FL_CHROMA_FULL_WIDTH))
			w /= 2;

		if (!decode_plane(cf, &rlco, h, w, ref->cb, ref_chroma_stride,
				  ref->chroma_step, dst->cb, dst_chroma_stride,
				  dst->chroma_step,
				  hdr_flags & V4L2_FWHT_FL_CB_IS_UNCOMPRESSED,
				  end_of_rlco_buf))
			return false;
		if (!decode_plane(cf, &rlco, h, w, ref->cr, ref_chroma_stride,
				  ref->chroma_step, dst->cr, dst_chroma_stride,
				  dst->chroma_step,
				  hdr_flags & V4L2_FWHT_FL_CR_IS_UNCOMPRESSED,
				  end_of_rlco_buf))
			return false;
	}

	if (components_num == 4)
		if (!decode_plane(cf, &rlco, height, width, ref->alpha, ref_stride,
				  ref->luma_alpha_step, dst->alpha, dst_stride,
				  dst->luma_alpha_step,
				  hdr_flags & V4L2_FWHT_FL_ALPHA_IS_UNCOMPRESSED,
				  end_of_rlco_buf))
			return false;
	return true;
}
960