Lines Matching +full:0 +full:- +full:8

1 // SPDX-License-Identifier: LGPL-2.1+
6 * 8x8 Fast Walsh Hadamard Transform in sequency order based on the paper:
8 * A Recursive Algorithm for Sequency-Ordered Fast Walsh Transforms,
15 #include "codec-fwht.h"
20 * Note: bit 0 of the header must always be 0. Otherwise it cannot
21 * be guaranteed that the magic 8 byte sequence (see below) can
25 #define DUPS_MASK 0x1ffe
27 #define PBLOCK 0
33 0,
34 1, 8,
57 s16 block[8 * 8]; in rlc()
59 int i = 0; in rlc()
61 int ret = 0; in rlc()
64 int lastzero_run = 0; in rlc()
67 for (y = 0; y < 8; y++) { in rlc()
68 for (x = 0; x < 8; x++) { in rlc()
69 *wp = in[x + y * 8]; in rlc()
75 for (i = 63; i >= 0 && !block[zigzag[i]]; i--) in rlc()
78 *output++ = (blocktype == PBLOCK ? htons(PFRAME_BIT) : 0); in rlc()
81 to_encode = 8 * 8 - (lastzero_run > 14 ? lastzero_run : 0); in rlc()
83 i = 0; in rlc()
85 int cnt = 0; in rlc()
89 while ((tmp = block[zigzag[i]]) == 0 && cnt < 14) { in rlc()
93 cnt--; in rlc()
103 *output = htons(ALL_ZEROS | 0); in rlc()
111 * This function will worst-case increase rlc_in by 65*2 bytes:
112 * one s16 value for the header and 8 * 8 coefficients of type s16.
120 int dec_count = 0; in derlc()
121 s16 block[8 * 8 + 16]; in derlc()
130 * Now de-compress, it expands one byte to up to 15 bytes in derlc()
134 * So block has to be 8 * 8 + 16 bytes, the '+ 16' is to in derlc()
137 while (dec_count < 8 * 8) { in derlc()
145 length = in & 0xf; in derlc()
150 for (i = 0; i < 64 - dec_count; i++) in derlc()
151 *wp++ = 0; in derlc()
155 for (i = 0; i < length; i++) in derlc()
156 *wp++ = 0; in derlc()
163 for (i = 0; i < 64; i++) { in derlc()
165 int y = pos / 8; in derlc()
166 int x = pos % 8; in derlc()
168 dwht_out[x + y * 8] = *wp++; in derlc()
182 2, 2, 3, 6, 6, 6, 6, 8,
201 for (j = 0; j < 8; j++) { in quantize_intra()
202 for (i = 0; i < 8; i++, quant++, coeff++, de_coeff++) { in quantize_intra()
204 if (*coeff >= -qp && *coeff <= qp) in quantize_intra()
205 *coeff = *de_coeff = 0; in quantize_intra()
217 for (j = 0; j < 8; j++) in dequantize_intra()
218 for (i = 0; i < 8; i++, quant++, coeff++) in dequantize_intra()
227 for (j = 0; j < 8; j++) { in quantize_inter()
228 for (i = 0; i < 8; i++, quant++, coeff++, de_coeff++) { in quantize_inter()
230 if (*coeff >= -qp && *coeff <= qp) in quantize_inter()
231 *coeff = *de_coeff = 0; in quantize_inter()
243 for (j = 0; j < 8; j++) in dequantize_inter()
244 for (i = 0; i < 8; i++, quant++, coeff++) in dequantize_inter()
252 /* we'll need more than 8 bits for the transformed coefficients */ in fwht()
253 s32 workspace1[8], workspace2[8]; in fwht()
256 int add = intra ? 256 : 0; in fwht()
260 for (i = 0; i < 8; i++, tmp += stride, out += 8) { in fwht()
263 workspace1[0] = tmp[0] + tmp[1] - add; in fwht()
264 workspace1[1] = tmp[0] - tmp[1]; in fwht()
266 workspace1[2] = tmp[2] + tmp[3] - add; in fwht()
267 workspace1[3] = tmp[2] - tmp[3]; in fwht()
269 workspace1[4] = tmp[4] + tmp[5] - add; in fwht()
270 workspace1[5] = tmp[4] - tmp[5]; in fwht()
272 workspace1[6] = tmp[6] + tmp[7] - add; in fwht()
273 workspace1[7] = tmp[6] - tmp[7]; in fwht()
276 workspace1[0] = tmp[0] + tmp[2] - add; in fwht()
277 workspace1[1] = tmp[0] - tmp[2]; in fwht()
279 workspace1[2] = tmp[4] + tmp[6] - add; in fwht()
280 workspace1[3] = tmp[4] - tmp[6]; in fwht()
282 workspace1[4] = tmp[8] + tmp[10] - add; in fwht()
283 workspace1[5] = tmp[8] - tmp[10]; in fwht()
285 workspace1[6] = tmp[12] + tmp[14] - add; in fwht()
286 workspace1[7] = tmp[12] - tmp[14]; in fwht()
289 workspace1[0] = tmp[0] + tmp[3] - add; in fwht()
290 workspace1[1] = tmp[0] - tmp[3]; in fwht()
292 workspace1[2] = tmp[6] + tmp[9] - add; in fwht()
293 workspace1[3] = tmp[6] - tmp[9]; in fwht()
295 workspace1[4] = tmp[12] + tmp[15] - add; in fwht()
296 workspace1[5] = tmp[12] - tmp[15]; in fwht()
298 workspace1[6] = tmp[18] + tmp[21] - add; in fwht()
299 workspace1[7] = tmp[18] - tmp[21]; in fwht()
302 workspace1[0] = tmp[0] + tmp[4] - add; in fwht()
303 workspace1[1] = tmp[0] - tmp[4]; in fwht()
305 workspace1[2] = tmp[8] + tmp[12] - add; in fwht()
306 workspace1[3] = tmp[8] - tmp[12]; in fwht()
308 workspace1[4] = tmp[16] + tmp[20] - add; in fwht()
309 workspace1[5] = tmp[16] - tmp[20]; in fwht()
311 workspace1[6] = tmp[24] + tmp[28] - add; in fwht()
312 workspace1[7] = tmp[24] - tmp[28]; in fwht()
317 workspace2[0] = workspace1[0] + workspace1[2]; in fwht()
318 workspace2[1] = workspace1[0] - workspace1[2]; in fwht()
319 workspace2[2] = workspace1[1] - workspace1[3]; in fwht()
323 workspace2[5] = workspace1[4] - workspace1[6]; in fwht()
324 workspace2[6] = workspace1[5] - workspace1[7]; in fwht()
328 out[0] = workspace2[0] + workspace2[4]; in fwht()
329 out[1] = workspace2[0] - workspace2[4]; in fwht()
330 out[2] = workspace2[1] - workspace2[5]; in fwht()
333 out[5] = workspace2[2] - workspace2[6]; in fwht()
334 out[6] = workspace2[3] - workspace2[7]; in fwht()
340 for (i = 0; i < 8; i++, out++) { in fwht()
342 workspace1[0] = out[0] + out[1 * 8]; in fwht()
343 workspace1[1] = out[0] - out[1 * 8]; in fwht()
345 workspace1[2] = out[2 * 8] + out[3 * 8]; in fwht()
346 workspace1[3] = out[2 * 8] - out[3 * 8]; in fwht()
348 workspace1[4] = out[4 * 8] + out[5 * 8]; in fwht()
349 workspace1[5] = out[4 * 8] - out[5 * 8]; in fwht()
351 workspace1[6] = out[6 * 8] + out[7 * 8]; in fwht()
352 workspace1[7] = out[6 * 8] - out[7 * 8]; in fwht()
355 workspace2[0] = workspace1[0] + workspace1[2]; in fwht()
356 workspace2[1] = workspace1[0] - workspace1[2]; in fwht()
357 workspace2[2] = workspace1[1] - workspace1[3]; in fwht()
361 workspace2[5] = workspace1[4] - workspace1[6]; in fwht()
362 workspace2[6] = workspace1[5] - workspace1[7]; in fwht()
365 out[0 * 8] = workspace2[0] + workspace2[4]; in fwht()
366 out[1 * 8] = workspace2[0] - workspace2[4]; in fwht()
367 out[2 * 8] = workspace2[1] - workspace2[5]; in fwht()
368 out[3 * 8] = workspace2[1] + workspace2[5]; in fwht()
369 out[4 * 8] = workspace2[2] + workspace2[6]; in fwht()
370 out[5 * 8] = workspace2[2] - workspace2[6]; in fwht()
371 out[6 * 8] = workspace2[3] - workspace2[7]; in fwht()
372 out[7 * 8] = workspace2[3] + workspace2[7]; in fwht()
377 * Not the nicest way of doing it, but P-blocks get twice the range of
378 * that of the I-blocks. Therefore we need a type bigger than 8 bits.
385 /* we'll need more than 8 bits for the transformed coefficients */ in fwht16()
386 s32 workspace1[8], workspace2[8]; in fwht16()
391 for (i = 0; i < 8; i++, tmp += stride, out += 8) { in fwht16()
393 workspace1[0] = tmp[0] + tmp[1]; in fwht16()
394 workspace1[1] = tmp[0] - tmp[1]; in fwht16()
397 workspace1[3] = tmp[2] - tmp[3]; in fwht16()
400 workspace1[5] = tmp[4] - tmp[5]; in fwht16()
403 workspace1[7] = tmp[6] - tmp[7]; in fwht16()
406 workspace2[0] = workspace1[0] + workspace1[2]; in fwht16()
407 workspace2[1] = workspace1[0] - workspace1[2]; in fwht16()
408 workspace2[2] = workspace1[1] - workspace1[3]; in fwht16()
412 workspace2[5] = workspace1[4] - workspace1[6]; in fwht16()
413 workspace2[6] = workspace1[5] - workspace1[7]; in fwht16()
417 out[0] = workspace2[0] + workspace2[4]; in fwht16()
418 out[1] = workspace2[0] - workspace2[4]; in fwht16()
419 out[2] = workspace2[1] - workspace2[5]; in fwht16()
422 out[5] = workspace2[2] - workspace2[6]; in fwht16()
423 out[6] = workspace2[3] - workspace2[7]; in fwht16()
429 for (i = 0; i < 8; i++, out++) { in fwht16()
431 workspace1[0] = out[0] + out[1*8]; in fwht16()
432 workspace1[1] = out[0] - out[1*8]; in fwht16()
434 workspace1[2] = out[2*8] + out[3*8]; in fwht16()
435 workspace1[3] = out[2*8] - out[3*8]; in fwht16()
437 workspace1[4] = out[4*8] + out[5*8]; in fwht16()
438 workspace1[5] = out[4*8] - out[5*8]; in fwht16()
440 workspace1[6] = out[6*8] + out[7*8]; in fwht16()
441 workspace1[7] = out[6*8] - out[7*8]; in fwht16()
444 workspace2[0] = workspace1[0] + workspace1[2]; in fwht16()
445 workspace2[1] = workspace1[0] - workspace1[2]; in fwht16()
446 workspace2[2] = workspace1[1] - workspace1[3]; in fwht16()
450 workspace2[5] = workspace1[4] - workspace1[6]; in fwht16()
451 workspace2[6] = workspace1[5] - workspace1[7]; in fwht16()
455 out[0*8] = workspace2[0] + workspace2[4]; in fwht16()
456 out[1*8] = workspace2[0] - workspace2[4]; in fwht16()
457 out[2*8] = workspace2[1] - workspace2[5]; in fwht16()
458 out[3*8] = workspace2[1] + workspace2[5]; in fwht16()
459 out[4*8] = workspace2[2] + workspace2[6]; in fwht16()
460 out[5*8] = workspace2[2] - workspace2[6]; in fwht16()
461 out[6*8] = workspace2[3] - workspace2[7]; in fwht16()
462 out[7*8] = workspace2[3] + workspace2[7]; in fwht16()
470 * we'll need more than 8 bits for the transformed coefficients in ifwht()
473 int workspace1[8], workspace2[8]; in ifwht()
474 int inter = intra ? 0 : 1; in ifwht()
479 for (i = 0; i < 8; i++, tmp += 8, out += 8) { in ifwht()
481 workspace1[0] = tmp[0] + tmp[1]; in ifwht()
482 workspace1[1] = tmp[0] - tmp[1]; in ifwht()
485 workspace1[3] = tmp[2] - tmp[3]; in ifwht()
488 workspace1[5] = tmp[4] - tmp[5]; in ifwht()
491 workspace1[7] = tmp[6] - tmp[7]; in ifwht()
494 workspace2[0] = workspace1[0] + workspace1[2]; in ifwht()
495 workspace2[1] = workspace1[0] - workspace1[2]; in ifwht()
496 workspace2[2] = workspace1[1] - workspace1[3]; in ifwht()
500 workspace2[5] = workspace1[4] - workspace1[6]; in ifwht()
501 workspace2[6] = workspace1[5] - workspace1[7]; in ifwht()
505 out[0] = workspace2[0] + workspace2[4]; in ifwht()
506 out[1] = workspace2[0] - workspace2[4]; in ifwht()
507 out[2] = workspace2[1] - workspace2[5]; in ifwht()
510 out[5] = workspace2[2] - workspace2[6]; in ifwht()
511 out[6] = workspace2[3] - workspace2[7]; in ifwht()
517 for (i = 0; i < 8; i++, out++) { in ifwht()
519 workspace1[0] = out[0] + out[1 * 8]; in ifwht()
520 workspace1[1] = out[0] - out[1 * 8]; in ifwht()
522 workspace1[2] = out[2 * 8] + out[3 * 8]; in ifwht()
523 workspace1[3] = out[2 * 8] - out[3 * 8]; in ifwht()
525 workspace1[4] = out[4 * 8] + out[5 * 8]; in ifwht()
526 workspace1[5] = out[4 * 8] - out[5 * 8]; in ifwht()
528 workspace1[6] = out[6 * 8] + out[7 * 8]; in ifwht()
529 workspace1[7] = out[6 * 8] - out[7 * 8]; in ifwht()
532 workspace2[0] = workspace1[0] + workspace1[2]; in ifwht()
533 workspace2[1] = workspace1[0] - workspace1[2]; in ifwht()
534 workspace2[2] = workspace1[1] - workspace1[3]; in ifwht()
538 workspace2[5] = workspace1[4] - workspace1[6]; in ifwht()
539 workspace2[6] = workspace1[5] - workspace1[7]; in ifwht()
546 out[0 * 8] = workspace2[0] + workspace2[4]; in ifwht()
547 out[1 * 8] = workspace2[0] - workspace2[4]; in ifwht()
548 out[2 * 8] = workspace2[1] - workspace2[5]; in ifwht()
549 out[3 * 8] = workspace2[1] + workspace2[5]; in ifwht()
550 out[4 * 8] = workspace2[2] + workspace2[6]; in ifwht()
551 out[5 * 8] = workspace2[2] - workspace2[6]; in ifwht()
552 out[6 * 8] = workspace2[3] - workspace2[7]; in ifwht()
553 out[7 * 8] = workspace2[3] + workspace2[7]; in ifwht()
555 for (d = 0; d < 8; d++) in ifwht()
556 out[8 * d] >>= 6; in ifwht()
560 out[0 * 8] = workspace2[0] + workspace2[4]; in ifwht()
561 out[1 * 8] = workspace2[0] - workspace2[4]; in ifwht()
562 out[2 * 8] = workspace2[1] - workspace2[5]; in ifwht()
563 out[3 * 8] = workspace2[1] + workspace2[5]; in ifwht()
564 out[4 * 8] = workspace2[2] + workspace2[6]; in ifwht()
565 out[5 * 8] = workspace2[2] - workspace2[6]; in ifwht()
566 out[6 * 8] = workspace2[3] - workspace2[7]; in ifwht()
567 out[7 * 8] = workspace2[3] + workspace2[7]; in ifwht()
569 for (d = 0; d < 8; d++) { in ifwht()
570 out[8 * d] >>= 6; in ifwht()
571 out[8 * d] += 128; in ifwht()
582 for (i = 0; i < 8; i++) { in fill_encoder_block()
583 for (j = 0; j < 8; j++, input += input_step) in fill_encoder_block()
585 input += stride - 8 * input_step; in fill_encoder_block()
591 int32_t mean = 0; in var_intra()
592 int32_t ret = 0; in var_intra()
596 for (i = 0; i < 8 * 8; i++, tmp++) in var_intra()
600 for (i = 0; i < 8 * 8; i++, tmp++) in var_intra()
601 ret += (*tmp - mean) < 0 ? -(*tmp - mean) : (*tmp - mean); in var_intra()
607 int32_t ret = 0; in var_inter()
610 for (i = 0; i < 8 * 8; i++, old++, new++) in var_inter()
611 ret += (*old - *new) < 0 ? -(*old - *new) : (*old - *new); in var_inter()
627 fill_encoder_block(reference, old, 8, 1); in decide_blocktype()
630 for (k = 0; k < 8; k++) { in decide_blocktype()
631 for (l = 0; l < 8; l++) { in decide_blocktype()
632 *deltablock = *work - *reference; in decide_blocktype()
638 deltablock -= 64; in decide_blocktype()
648 for (i = 0; i < 8; i++) { in fill_decoder_block()
649 for (j = 0; j < 8; j++, input++, dst += dst_step) { in fill_decoder_block()
650 if (*input < 0) in fill_decoder_block()
651 *dst = 0; in fill_decoder_block()
657 dst += stride - (8 * dst_step); in fill_decoder_block()
666 for (k = 0; k < 8; k++) { in add_deltas()
667 for (l = 0; l < 8; l++) { in add_deltas()
674 if (*deltas < 0) in add_deltas()
675 *deltas = 0; in add_deltas()
680 ref += stride - (8 * ref_step); in add_deltas()
693 u32 encoding = 0; in encode_plane()
694 unsigned int last_size = 0; in encode_plane()
697 width = round_up(width, 8); in encode_plane()
698 height = round_up(height, 8); in encode_plane()
700 for (j = 0; j < height / 8; j++) { in encode_plane()
701 input = input_start + j * 8 * stride; in encode_plane()
702 for (i = 0; i < width / 8; i++) { in encode_plane()
711 fwht(input, cf->coeffs, stride, input_step, 1); in encode_plane()
712 quantize_intra(cf->coeffs, cf->de_coeffs, in encode_plane()
713 cf->i_frame_qp); in encode_plane()
717 fwht16(deltablock, cf->coeffs, 8, 0); in encode_plane()
718 quantize_inter(cf->coeffs, cf->de_coeffs, in encode_plane()
719 cf->p_frame_qp); in encode_plane()
722 ifwht(cf->de_coeffs, cf->de_fwht, blocktype); in encode_plane()
725 add_deltas(cf->de_fwht, refp, 8, 1); in encode_plane()
726 fill_decoder_block(refp, cf->de_fwht, 8, 1); in encode_plane()
729 input += 8 * input_step; in encode_plane()
730 refp += 8 * 8; in encode_plane()
732 size = rlc(cf->coeffs, *rlco, blocktype); in encode_plane()
734 !memcmp(*rlco + 1, *rlco - size + 1, 2 * size - 2)) { in encode_plane()
735 __be16 *last_rlco = *rlco - size; in encode_plane()
762 * header, so when we copy the YUV data we replace 0xff in encode_plane()
763 * by 0xfe. Since YUV is limited range such values in encode_plane()
766 for (j = 0; j < height; j++) { in encode_plane()
767 for (i = 0, p = input; i < width; i++, p += input_step) in encode_plane()
768 *out++ = (*p == 0xff) ? 0xfe : *p; in encode_plane()
785 __be16 *rlco = cf->rlc_data; in fwht_encode_frame()
789 rlco_max = rlco + size / 2 - 256; in fwht_encode_frame()
790 encoding = encode_plane(frm->luma, ref_frm->luma, &rlco, rlco_max, cf, in fwht_encode_frame()
792 frm->luma_alpha_step, is_intra, next_is_intra); in fwht_encode_frame()
797 if (frm->components_num >= 3) { in fwht_encode_frame()
798 u32 chroma_h = height / frm->height_div; in fwht_encode_frame()
799 u32 chroma_w = width / frm->width_div; in fwht_encode_frame()
802 rlco_max = rlco + chroma_size / 2 - 256; in fwht_encode_frame()
803 encoding |= encode_plane(frm->cb, ref_frm->cb, &rlco, rlco_max, in fwht_encode_frame()
805 chroma_stride, frm->chroma_step, in fwht_encode_frame()
810 rlco_max = rlco + chroma_size / 2 - 256; in fwht_encode_frame()
811 encoding |= encode_plane(frm->cr, ref_frm->cr, &rlco, rlco_max, in fwht_encode_frame()
813 chroma_stride, frm->chroma_step, in fwht_encode_frame()
820 if (frm->components_num == 4) { in fwht_encode_frame()
821 rlco_max = rlco + size / 2 - 256; in fwht_encode_frame()
822 encoding |= encode_plane(frm->alpha, ref_frm->alpha, &rlco, in fwht_encode_frame()
824 stride, frm->luma_alpha_step, in fwht_encode_frame()
831 cf->size = (rlco - cf->rlc_data) * sizeof(*rlco); in fwht_encode_frame()
841 unsigned int copies = 0; in decode_plane()
842 s16 copy[8 * 8]; in decode_plane()
847 width = round_up(width, 8); in decode_plane()
848 height = round_up(height, 8); in decode_plane()
855 for (i = 0; i < height; i++) { in decode_plane()
865 * by 65 * 2 bytes worst-case. in decode_plane()
869 for (j = 0; j < height / 8; j++) { in decode_plane()
870 for (i = 0; i < width / 8; i++) { in decode_plane()
871 const u8 *refp = ref + j * 8 * ref_stride + in decode_plane()
872 i * 8 * ref_step; in decode_plane()
873 u8 *dstp = dst + j * 8 * dst_stride + i * 8 * dst_step; in decode_plane()
876 memcpy(cf->de_fwht, copy, sizeof(copy)); in decode_plane()
878 add_deltas(cf->de_fwht, refp, in decode_plane()
880 fill_decoder_block(dstp, cf->de_fwht, in decode_plane()
882 copies--; in decode_plane()
886 stat = derlc(rlco, cf->coeffs, end_of_rlco_buf); in decode_plane()
890 dequantize_inter(cf->coeffs); in decode_plane()
892 dequantize_intra(cf->coeffs); in decode_plane()
894 ifwht(cf->coeffs, cf->de_fwht, in decode_plane()
895 ((stat & PFRAME_BIT) && !is_intra) ? 0 : 1); in decode_plane()
899 memcpy(copy, cf->de_fwht, sizeof(copy)); in decode_plane()
901 add_deltas(cf->de_fwht, refp, in decode_plane()
903 fill_decoder_block(dstp, cf->de_fwht, dst_stride, in decode_plane()
917 const __be16 *rlco = cf->rlc_data; in fwht_decode_frame()
918 const __be16 *end_of_rlco_buf = cf->rlc_data + in fwht_decode_frame()
919 (cf->size / sizeof(*rlco)) - 1; in fwht_decode_frame()
921 if (!decode_plane(cf, &rlco, height, width, ref->luma, ref_stride, in fwht_decode_frame()
922 ref->luma_alpha_step, dst->luma, dst_stride, in fwht_decode_frame()
923 dst->luma_alpha_step, in fwht_decode_frame()
937 if (!decode_plane(cf, &rlco, h, w, ref->cb, ref_chroma_stride, in fwht_decode_frame()
938 ref->chroma_step, dst->cb, dst_chroma_stride, in fwht_decode_frame()
939 dst->chroma_step, in fwht_decode_frame()
943 if (!decode_plane(cf, &rlco, h, w, ref->cr, ref_chroma_stride, in fwht_decode_frame()
944 ref->chroma_step, dst->cr, dst_chroma_stride, in fwht_decode_frame()
945 dst->chroma_step, in fwht_decode_frame()
952 if (!decode_plane(cf, &rlco, height, width, ref->alpha, ref_stride, in fwht_decode_frame()
953 ref->luma_alpha_step, dst->alpha, dst_stride, in fwht_decode_frame()
954 dst->luma_alpha_step, in fwht_decode_frame()