Bug Summary

File: /root/firefox-clang/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c
Warning: line 4445, column 35
The result of left shift is undefined because the right operand is not smaller than 32, the capacity of 'int'
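
The flagged line (4445) falls outside the excerpt reproduced below, but the file builds its rounding constants with the pattern "1 << (bit - 1)" throughout (see, e.g., source line 462). Shifting a 32-bit int by 32 or more positions is undefined behavior in C, and the analyzer has found a path on which the shift count is not smaller than 32. A minimal sketch of the defect and one defensive rewrite (the names and the assert are illustrative assumptions, not the file's actual fix):

    int bit = 33;                    /* shift count on the analyzer's path  */
    int rnding = 1 << (bit - 1);     /* undefined: count >= width of 'int'  */

    /* One way to make the precondition explicit: */
    assert(bit > 0 && bit < 32);
    int rnding_ok = 1 << (bit - 1);  /* well-defined once bit is in [1, 31] */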

Annotated Source Code


clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name highbd_inv_txfm_sse4.c -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -fhalf-no-semantic-interposition -mframe-pointer=all -relaxed-aliasing -ffp-contract=off -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -target-feature +sse4.1 -tune-cpu generic -debugger-tuning=gdb -fdebug-compilation-dir=/root/firefox-clang/obj-x86_64-pc-linux-gnu/media/libaom -fcoverage-compilation-dir=/root/firefox-clang/obj-x86_64-pc-linux-gnu/media/libaom -resource-dir /usr/lib/llvm-21/lib/clang/21 -include /root/firefox-clang/obj-x86_64-pc-linux-gnu/mozilla-config.h -U _FORTIFY_SOURCE -D _FORTIFY_SOURCE=2 -D _GLIBCXX_ASSERTIONS -D DEBUG=1 -D MOZ_HAS_MOZGLUE -I /root/firefox-clang/media/libaom -I /root/firefox-clang/obj-x86_64-pc-linux-gnu/media/libaom -I /root/firefox-clang/media/libaom/config/linux/x64 -I /root/firefox-clang/media/libaom/config -I /root/firefox-clang/third_party/aom -I /root/firefox-clang/obj-x86_64-pc-linux-gnu/dist/include -I /root/firefox-clang/obj-x86_64-pc-linux-gnu/dist/include/nspr -I /root/firefox-clang/obj-x86_64-pc-linux-gnu/dist/include/nss -D MOZILLA_CLIENT -internal-isystem /usr/lib/llvm-21/lib/clang/21/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/14/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-error=tautological-type-limit-compare -Wno-range-loop-analysis -Wno-error=deprecated-declarations -Wno-error=array-bounds -Wno-error=free-nonheap-object -Wno-error=atomic-alignment -Wno-error=deprecated-builtins -Wno-psabi -Wno-error=builtin-macro-redefined -Wno-unknown-warning-option -Wno-sign-compare -Wno-unused-function -Wno-unreachable-code -Wno-unneeded-internal-declaration -ferror-limit 19 -fstrict-flex-arrays=1 -stack-protector 2 -fstack-clash-protection -ftrivial-auto-var-init=pattern -fgnuc-version=4.2.1 -fskip-odr-check-in-gmf -vectorize-loops -vectorize-slp -analyzer-checker optin.performance.Padding -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2025-06-27-100320-3286336-1 -x c /root/firefox-clang/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c
1/*
2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11#include <assert.h>
12#include <smmintrin.h> /* SSE4.1 */
13
14#include "config/aom_config.h"
15#include "config/av1_rtcd.h"
16
17#include "av1/common/av1_inv_txfm1d_cfg.h"
18#include "av1/common/idct.h"
19#include "av1/common/x86/av1_inv_txfm_ssse3.h"
20#include "av1/common/x86/av1_txfm_sse2.h"
21#include "av1/common/x86/av1_txfm_sse4.h"
22#include "av1/common/x86/highbd_txfm_utility_sse4.h"
23
24static inline __m128i highbd_clamp_epi16(__m128i u, int bd) {
25 const __m128i zero = _mm_setzero_si128();
26 const __m128i one = _mm_set1_epi16(1);
27 const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
28 __m128i clamped, mask;
29
30 mask = _mm_cmpgt_epi16(u, max);
31 clamped = _mm_andnot_si128(mask, u);
32 mask = _mm_and_si128(mask, max);
33 clamped = _mm_or_si128(mask, clamped);
34 mask = _mm_cmpgt_epi16(clamped, zero);
35 clamped = _mm_and_si128(clamped, mask);
36
37 return clamped;
38}
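
/* Annotation (editorial sketch, not part of the original file): the
   compare/and/or sequence above is the per-lane vector form of this scalar
   clamp, assuming bd <= 14 so that (1 << bd) - 1 fits in a signed 16-bit
   lane (AV1 high bitdepth uses bd of 8, 10 or 12):
     int16_t clamp_scalar(int16_t v, int bd) {
       const int16_t max = (int16_t)((1 << bd) - 1);
       return v < 0 ? 0 : (v > max ? max : v);
     }
*/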
39
40static inline void round_shift_4x4(__m128i *in, int shift) {
41 if (shift != 0) {
42 __m128i rnding = _mm_set1_epi32(1 << (shift - 1));
43 in[0] = _mm_add_epi32(in[0], rnding);
44 in[1] = _mm_add_epi32(in[1], rnding);
45 in[2] = _mm_add_epi32(in[2], rnding);
46 in[3] = _mm_add_epi32(in[3], rnding);
47
48 in[0] = _mm_srai_epi32(in[0], shift);
49 in[1] = _mm_srai_epi32(in[1], shift);
50 in[2] = _mm_srai_epi32(in[2], shift);
51 in[3] = _mm_srai_epi32(in[3], shift);
52 }
53}
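
/* Annotation (editorial note): round_shift_4x4 is a rounding right shift;
   per 32-bit lane it computes x = (x + (1 << (shift - 1))) >> shift, i.e.
   division by 2^shift rounded to nearest. The shift != 0 guard matters
   because at shift == 0 the expression shift - 1 would make the left
   shift's count negative, which is undefined. */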
54
55static void round_shift_8x8(__m128i *in, int shift) {
56 round_shift_4x4(&in[0], shift);
57 round_shift_4x4(&in[4], shift);
58 round_shift_4x4(&in[8], shift);
59 round_shift_4x4(&in[12], shift);
60}
61
62static void highbd_clamp_epi32_sse4_1(__m128i *in, __m128i *out,
63 const __m128i *clamp_lo,
64 const __m128i *clamp_hi, int size) {
65 __m128i a0, a1;
66 for (int i = 0; i < size; i += 4) {
67 a0 = _mm_max_epi32(in[i], *clamp_lo);
68 out[i] = _mm_min_epi32(a0, *clamp_hi);
69
70 a1 = _mm_max_epi32(in[i + 1], *clamp_lo);
71 out[i + 1] = _mm_min_epi32(a1, *clamp_hi);
72
73 a0 = _mm_max_epi32(in[i + 2], *clamp_lo);
74 out[i + 2] = _mm_min_epi32(a0, *clamp_hi);
75
76 a1 = _mm_max_epi32(in[i + 3], *clamp_lo);
77 out[i + 3] = _mm_min_epi32(a1, *clamp_hi);
78 }
79}
80
81static inline __m128i highbd_get_recon_8x8_sse4_1(const __m128i pred,
82 __m128i res0, __m128i res1,
83 const int bd) {
84 __m128i x0 = _mm_cvtepi16_epi32(pred);
85  __m128i x1 = _mm_cvtepi16_epi32(_mm_srli_si128(pred, 8));
86 __m128i min_clip_val = _mm_setzero_si128();
87 __m128i max_clip_val = _mm_set1_epi32((1 << bd) - 1);
88 x0 = _mm_add_epi32(res0, x0);
89 x1 = _mm_add_epi32(res1, x1);
90 x0 = _mm_max_epi32(x0, min_clip_val);
91 x0 = _mm_min_epi32(x0, max_clip_val);
92 x1 = _mm_max_epi32(x1, min_clip_val);
93 x1 = _mm_min_epi32(x1, max_clip_val);
94 x0 = _mm_packus_epi32(x0, x1);
95 return x0;
96}
97
98static inline __m128i highbd_get_recon_4xn_sse4_1(const __m128i pred,
99 __m128i res0, const int bd) {
100 __m128i x0 = _mm_cvtepi16_epi32(pred);
101
102 x0 = _mm_add_epi32(res0, x0);
103 x0 = _mm_packus_epi32(x0, x0);
104 x0 = highbd_clamp_epi16(x0, bd);
105 return x0;
106}
107
108static inline void highbd_write_buffer_4xn_sse4_1(__m128i *in, uint16_t *output,
109 int stride, int flipud,
110 int height, const int bd) {
111 int j = flipud ? (height - 1) : 0;
112 const int step = flipud ? -1 : 1;
113 for (int i = 0; i < height; ++i, j += step) {
114 __m128i v = _mm_loadl_epi64((__m128i const *)(output + i * stride));
115 __m128i u = highbd_get_recon_4xn_sse4_1(v, in[j], bd);
116
117 _mm_storel_epi64((__m128i *)(output + i * stride), u);
118 }
119}
120
121static inline void highbd_write_buffer_8xn_sse4_1(__m128i *in, uint16_t *output,
122 int stride, int flipud,
123 int height, const int bd) {
124 int j = flipud ? (height - 1) : 0;
125 const int step = flipud ? -1 : 1;
126 for (int i = 0; i < height; ++i, j += step) {
127 __m128i v = _mm_loadu_si128((__m128i const *)(output + i * stride));
128 __m128i u = highbd_get_recon_8x8_sse4_1(v, in[j], in[j + height], bd);
129
130 _mm_storeu_si128((__m128i *)(output + i * stride), u);
131 }
132}
133
134static inline void load_buffer_32bit_input(const int32_t *in, int stride,
135 __m128i *out, int out_size) {
136 for (int i = 0; i < out_size; ++i) {
137 out[i] = _mm_loadu_si128((const __m128i *)(in + i * stride));
138 }
139}
140
141static inline void load_buffer_4x4(const int32_t *coeff, __m128i *in) {
142 in[0] = _mm_load_si128((const __m128i *)(coeff + 0));
143 in[1] = _mm_load_si128((const __m128i *)(coeff + 4));
144 in[2] = _mm_load_si128((const __m128i *)(coeff + 8));
145 in[3] = _mm_load_si128((const __m128i *)(coeff + 12));
146}
147
148void av1_highbd_iwht4x4_16_add_sse4_1(const tran_low_t *input, uint8_t *dest8,
149 int stride, int bd) {
150 /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
151 0.5 shifts per pixel. */
152 __m128i op[4];
153  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
154
155 load_buffer_4x4(input, op);
156
157 // Shift before-hand.
158  op[0] = _mm_srai_epi32(op[0], UNIT_QUANT_SHIFT);
159  op[1] = _mm_srai_epi32(op[1], UNIT_QUANT_SHIFT);
160  op[2] = _mm_srai_epi32(op[2], UNIT_QUANT_SHIFT);
161  op[3] = _mm_srai_epi32(op[3], UNIT_QUANT_SHIFT);
162
163 for (int i = 0; i < 2; ++i) {
164 __m128i a1 = op[0];
165 __m128i c1 = op[1];
166 __m128i d1 = op[2];
167 __m128i b1 = op[3];
168 a1 = _mm_add_epi32(a1, c1); // a1 += c1
169 d1 = _mm_sub_epi32(d1, b1); // d1 -= b1
170 __m128i e1 = _mm_sub_epi32(a1, d1); // e1 = (a1 - d1) >> 1
171 e1 = _mm_srai_epi32(e1, 1);
172 b1 = _mm_sub_epi32(e1, b1); // b1 = e1 - b1
173 c1 = _mm_sub_epi32(e1, c1); // c1 = e1 - c1
174 a1 = _mm_sub_epi32(a1, b1); // a1 -= b1
175 d1 = _mm_add_epi32(d1, c1); // d1 += c1
176
177 op[0] = a1;
178 op[1] = b1;
179 op[2] = c1;
180 op[3] = d1;
181 if (i == 0) {
182 transpose_32bit_4x4(op, op);
183 }
184 }
185
186 // Convert to int16_t. The C code checks that we are in range.
187 op[0] = _mm_packs_epi32(op[0], op[1]);
188 op[1] = _mm_packs_epi32(op[2], op[3]);
189
190 // Load uint16_t.
191 __m128i dst[2];
192 __m128i tmp[4];
193 tmp[0] = _mm_loadl_epi64((const __m128i *)(dest + 0 * stride));
194 tmp[1] = _mm_loadl_epi64((const __m128i *)(dest + 1 * stride));
195 dst[0] = _mm_unpacklo_epi64(tmp[0], tmp[1]);
196 tmp[2] = _mm_loadl_epi64((const __m128i *)(dest + 2 * stride));
197 tmp[3] = _mm_loadl_epi64((const __m128i *)(dest + 3 * stride));
198 dst[1] = _mm_unpacklo_epi64(tmp[2], tmp[3]);
199
200 // Add to the previous results.
201 dst[0] = _mm_add_epi16(dst[0], op[0]);
202 dst[1] = _mm_add_epi16(dst[1], op[1]);
203
204 // Clamp.
205 dst[0] = highbd_clamp_epi16(dst[0], bd);
206 dst[1] = highbd_clamp_epi16(dst[1], bd);
207
208 // Store.
209 _mm_storel_epi64((__m128i *)(dest + 0 * stride), dst[0]);
210  dst[0] = _mm_srli_si128(dst[0], 8);
211 _mm_storel_epi64((__m128i *)(dest + 1 * stride), dst[0]);
212 _mm_storel_epi64((__m128i *)(dest + 2 * stride), dst[1]);
213  dst[1] = _mm_srli_si128(dst[1], 8);
214 _mm_storel_epi64((__m128i *)(dest + 3 * stride), dst[1]);
215}
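
/* Annotation (editorial sketch): each pass of the loop above applies this
   1-D inverse Walsh-Hadamard butterfly (names match the vector code; i0..i3
   are the four inputs op[0]..op[3]):
     a1 = i0 + i1;        d1 = i2 - i3;
     e1 = (a1 - d1) >> 1;
     b1 = e1 - i3;        c1 = e1 - i1;
     a1 = a1 - b1;        d1 = d1 + c1;
     op = { a1, b1, c1, d1 };
*/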
216
217static void addsub_sse4_1(const __m128i in0, const __m128i in1, __m128i *out0,
218 __m128i *out1, const __m128i *clamp_lo,
219 const __m128i *clamp_hi) {
220 __m128i a0 = _mm_add_epi32(in0, in1);
221 __m128i a1 = _mm_sub_epi32(in0, in1);
222
223 a0 = _mm_max_epi32(a0, *clamp_lo);
224 a0 = _mm_min_epi32(a0, *clamp_hi);
225 a1 = _mm_max_epi32(a1, *clamp_lo);
226 a1 = _mm_min_epi32(a1, *clamp_hi);
227
228 *out0 = a0;
229 *out1 = a1;
230}
231
232static void shift_and_clamp_sse4_1(__m128i *in0, __m128i *in1,
233 const __m128i *clamp_lo,
234 const __m128i *clamp_hi, int shift) {
235 __m128i offset = _mm_set1_epi32((1 << shift) >> 1);
236 __m128i in0_w_offset = _mm_add_epi32(*in0, offset);
237 __m128i in1_w_offset = _mm_add_epi32(*in1, offset);
238
239 in0_w_offset = _mm_sra_epi32(in0_w_offset, _mm_cvtsi32_si128(shift));
240 in1_w_offset = _mm_sra_epi32(in1_w_offset, _mm_cvtsi32_si128(shift));
241
242 in0_w_offset = _mm_max_epi32(in0_w_offset, *clamp_lo);
243 in0_w_offset = _mm_min_epi32(in0_w_offset, *clamp_hi);
244 in1_w_offset = _mm_max_epi32(in1_w_offset, *clamp_lo);
245 in1_w_offset = _mm_min_epi32(in1_w_offset, *clamp_hi);
246
247 *in0 = in0_w_offset;
248 *in1 = in1_w_offset;
249}
250
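
/* Annotation (editorial note): half_btf_sse4_1, from
   highbd_txfm_utility_sse4.h, produces one output of a butterfly rotation;
   per 32-bit lane it computes roughly
     (w0 * n0 + w1 * n1 + (1 << (bit - 1))) >> bit,
   a fixed-point multiply-accumulate with round-to-nearest. */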
251static inline void idct32_stage4_sse4_1(
252 __m128i *bf1, const __m128i *cospim8, const __m128i *cospi56,
253 const __m128i *cospi8, const __m128i *cospim56, const __m128i *cospim40,
254 const __m128i *cospi24, const __m128i *cospi40, const __m128i *cospim24,
255 const __m128i *rounding, int bit) {
256 __m128i temp1, temp2;
257 temp1 = half_btf_sse4_1(cospim8, &bf1[17], cospi56, &bf1[30], rounding, bit);
258 bf1[30] = half_btf_sse4_1(cospi56, &bf1[17], cospi8, &bf1[30], rounding, bit);
259 bf1[17] = temp1;
260
261 temp2 = half_btf_sse4_1(cospim56, &bf1[18], cospim8, &bf1[29], rounding, bit);
262 bf1[29] =
263 half_btf_sse4_1(cospim8, &bf1[18], cospi56, &bf1[29], rounding, bit);
264 bf1[18] = temp2;
265
266 temp1 = half_btf_sse4_1(cospim40, &bf1[21], cospi24, &bf1[26], rounding, bit);
267 bf1[26] =
268 half_btf_sse4_1(cospi24, &bf1[21], cospi40, &bf1[26], rounding, bit);
269 bf1[21] = temp1;
270
271 temp2 =
272 half_btf_sse4_1(cospim24, &bf1[22], cospim40, &bf1[25], rounding, bit);
273 bf1[25] =
274 half_btf_sse4_1(cospim40, &bf1[22], cospi24, &bf1[25], rounding, bit);
275 bf1[22] = temp2;
276}
277
278static inline void idct32_stage5_sse4_1(
279 __m128i *bf1, const __m128i *cospim16, const __m128i *cospi48,
280 const __m128i *cospi16, const __m128i *cospim48, const __m128i *clamp_lo,
281 const __m128i *clamp_hi, const __m128i *rounding, int bit) {
282 __m128i temp1, temp2;
283 temp1 = half_btf_sse4_1(cospim16, &bf1[9], cospi48, &bf1[14], rounding, bit);
284 bf1[14] = half_btf_sse4_1(cospi48, &bf1[9], cospi16, &bf1[14], rounding, bit);
285 bf1[9] = temp1;
286
287 temp2 =
288 half_btf_sse4_1(cospim48, &bf1[10], cospim16, &bf1[13], rounding, bit);
289 bf1[13] =
290 half_btf_sse4_1(cospim16, &bf1[10], cospi48, &bf1[13], rounding, bit);
291 bf1[10] = temp2;
292
293 addsub_sse4_1(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi);
294 addsub_sse4_1(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi);
295 addsub_sse4_1(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi);
296 addsub_sse4_1(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi);
297 addsub_sse4_1(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi);
298 addsub_sse4_1(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi);
299 addsub_sse4_1(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi);
300 addsub_sse4_1(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi);
301}
302
303static inline void idct32_stage6_sse4_1(
304 __m128i *bf1, const __m128i *cospim32, const __m128i *cospi32,
305 const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16,
306 const __m128i *cospim48, const __m128i *clamp_lo, const __m128i *clamp_hi,
307 const __m128i *rounding, int bit) {
308 __m128i temp1, temp2;
309 temp1 = half_btf_sse4_1(cospim32, &bf1[5], cospi32, &bf1[6], rounding, bit);
310 bf1[6] = half_btf_sse4_1(cospi32, &bf1[5], cospi32, &bf1[6], rounding, bit);
311 bf1[5] = temp1;
312
313 addsub_sse4_1(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi);
314 addsub_sse4_1(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi);
315 addsub_sse4_1(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi);
316 addsub_sse4_1(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi);
317
318 temp1 = half_btf_sse4_1(cospim16, &bf1[18], cospi48, &bf1[29], rounding, bit);
319 bf1[29] =
320 half_btf_sse4_1(cospi48, &bf1[18], cospi16, &bf1[29], rounding, bit);
321 bf1[18] = temp1;
322 temp2 = half_btf_sse4_1(cospim16, &bf1[19], cospi48, &bf1[28], rounding, bit);
323 bf1[28] =
324 half_btf_sse4_1(cospi48, &bf1[19], cospi16, &bf1[28], rounding, bit);
325 bf1[19] = temp2;
326 temp1 =
327 half_btf_sse4_1(cospim48, &bf1[20], cospim16, &bf1[27], rounding, bit);
328 bf1[27] =
329 half_btf_sse4_1(cospim16, &bf1[20], cospi48, &bf1[27], rounding, bit);
330 bf1[20] = temp1;
331 temp2 =
332 half_btf_sse4_1(cospim48, &bf1[21], cospim16, &bf1[26], rounding, bit);
333 bf1[26] =
334 half_btf_sse4_1(cospim16, &bf1[21], cospi48, &bf1[26], rounding, bit);
335 bf1[21] = temp2;
336}
337
338static inline void idct32_stage7_sse4_1(__m128i *bf1, const __m128i *cospim32,
339 const __m128i *cospi32,
340 const __m128i *clamp_lo,
341 const __m128i *clamp_hi,
342 const __m128i *rounding, int bit) {
343 __m128i temp1, temp2;
344 addsub_sse4_1(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi);
345 addsub_sse4_1(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi);
346 addsub_sse4_1(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi);
347 addsub_sse4_1(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi);
348
349 temp1 = half_btf_sse4_1(cospim32, &bf1[10], cospi32, &bf1[13], rounding, bit);
350 bf1[13] =
351 half_btf_sse4_1(cospi32, &bf1[10], cospi32, &bf1[13], rounding, bit);
352 bf1[10] = temp1;
353 temp2 = half_btf_sse4_1(cospim32, &bf1[11], cospi32, &bf1[12], rounding, bit);
354 bf1[12] =
355 half_btf_sse4_1(cospi32, &bf1[11], cospi32, &bf1[12], rounding, bit);
356 bf1[11] = temp2;
357
358 addsub_sse4_1(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi);
359 addsub_sse4_1(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi);
360 addsub_sse4_1(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi);
361 addsub_sse4_1(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi);
362 addsub_sse4_1(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi);
363 addsub_sse4_1(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi);
364 addsub_sse4_1(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi);
365 addsub_sse4_1(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi);
366}
367
368static inline void idct32_stage8_sse4_1(__m128i *bf1, const __m128i *cospim32,
369 const __m128i *cospi32,
370 const __m128i *clamp_lo,
371 const __m128i *clamp_hi,
372 const __m128i *rounding, int bit) {
373 __m128i temp1, temp2;
374 addsub_sse4_1(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi);
375 addsub_sse4_1(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi);
376 addsub_sse4_1(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi);
377 addsub_sse4_1(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi);
378 addsub_sse4_1(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi);
379 addsub_sse4_1(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi);
380 addsub_sse4_1(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi);
381 addsub_sse4_1(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi);
382
383 temp1 = half_btf_sse4_1(cospim32, &bf1[20], cospi32, &bf1[27], rounding, bit);
384 bf1[27] =
385 half_btf_sse4_1(cospi32, &bf1[20], cospi32, &bf1[27], rounding, bit);
386 bf1[20] = temp1;
387 temp2 = half_btf_sse4_1(cospim32, &bf1[21], cospi32, &bf1[26], rounding, bit);
388 bf1[26] =
389 half_btf_sse4_1(cospi32, &bf1[21], cospi32, &bf1[26], rounding, bit);
390 bf1[21] = temp2;
391 temp1 = half_btf_sse4_1(cospim32, &bf1[22], cospi32, &bf1[25], rounding, bit);
392 bf1[25] =
393 half_btf_sse4_1(cospi32, &bf1[22], cospi32, &bf1[25], rounding, bit);
394 bf1[22] = temp1;
395 temp2 = half_btf_sse4_1(cospim32, &bf1[23], cospi32, &bf1[24], rounding, bit);
396 bf1[24] =
397 half_btf_sse4_1(cospi32, &bf1[23], cospi32, &bf1[24], rounding, bit);
398 bf1[23] = temp2;
399}
400
401static inline void idct32_stage9_sse4_1(__m128i *bf1, __m128i *out,
402 const int do_cols, const int bd,
403 const int out_shift,
404 const __m128i *clamp_lo,
405 const __m128i *clamp_hi) {
406 addsub_sse4_1(bf1[0], bf1[31], out + 0, out + 31, clamp_lo, clamp_hi);
407 addsub_sse4_1(bf1[1], bf1[30], out + 1, out + 30, clamp_lo, clamp_hi);
408 addsub_sse4_1(bf1[2], bf1[29], out + 2, out + 29, clamp_lo, clamp_hi);
409 addsub_sse4_1(bf1[3], bf1[28], out + 3, out + 28, clamp_lo, clamp_hi);
410 addsub_sse4_1(bf1[4], bf1[27], out + 4, out + 27, clamp_lo, clamp_hi);
411 addsub_sse4_1(bf1[5], bf1[26], out + 5, out + 26, clamp_lo, clamp_hi);
412 addsub_sse4_1(bf1[6], bf1[25], out + 6, out + 25, clamp_lo, clamp_hi);
413 addsub_sse4_1(bf1[7], bf1[24], out + 7, out + 24, clamp_lo, clamp_hi);
414 addsub_sse4_1(bf1[8], bf1[23], out + 8, out + 23, clamp_lo, clamp_hi);
415 addsub_sse4_1(bf1[9], bf1[22], out + 9, out + 22, clamp_lo, clamp_hi);
416 addsub_sse4_1(bf1[10], bf1[21], out + 10, out + 21, clamp_lo, clamp_hi);
417 addsub_sse4_1(bf1[11], bf1[20], out + 11, out + 20, clamp_lo, clamp_hi);
418 addsub_sse4_1(bf1[12], bf1[19], out + 12, out + 19, clamp_lo, clamp_hi);
419 addsub_sse4_1(bf1[13], bf1[18], out + 13, out + 18, clamp_lo, clamp_hi);
420 addsub_sse4_1(bf1[14], bf1[17], out + 14, out + 17, clamp_lo, clamp_hi);
421 addsub_sse4_1(bf1[15], bf1[16], out + 15, out + 16, clamp_lo, clamp_hi);
422
423 if (!do_cols) {
424    const int log_range_out = AOMMAX(16, bd + 6);
425 const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
426 const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
427 for (int i = 0; i < 32; i += 8) {
428 round_shift_4x4(out + i, out_shift);
429 round_shift_4x4(out + i + 4, out_shift);
430 }
431 highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 32);
432 }
433}
434
435static void neg_shift_sse4_1(const __m128i in0, const __m128i in1,
436 __m128i *out0, __m128i *out1,
437 const __m128i *clamp_lo, const __m128i *clamp_hi,
438 int shift) {
439 __m128i offset = _mm_set1_epi32((1 << shift) >> 1);
440 __m128i a0 = _mm_add_epi32(offset, in0);
441 __m128i a1 = _mm_sub_epi32(offset, in1);
442
443 a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift));
444 a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift));
445
446 a0 = _mm_max_epi32(a0, *clamp_lo);
447 a0 = _mm_min_epi32(a0, *clamp_hi);
448 a1 = _mm_max_epi32(a1, *clamp_lo);
449 a1 = _mm_min_epi32(a1, *clamp_hi);
450
451 *out0 = a0;
452 *out1 = a1;
453}
454
455static void idct4x4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
456 int bd, int out_shift) {
457 const int32_t *cospi = cospi_arr(bit);
458 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
459 const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
460 const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
461 const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
462 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
463  int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
464 __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
465 __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
466 __m128i u0, u1, u2, u3;
467 __m128i v0, v1, v2, v3, x, y;
468
469 // Stage 0
470 // Stage 1
471 // Stage 2
472 u0 = in[0];
473 u1 = in[1];
474 u2 = in[2];
475 u3 = in[3];
476
477 x = _mm_mullo_epi32(u0, cospi32);
478 y = _mm_mullo_epi32(u2, cospi32);
479 v0 = _mm_add_epi32(x, y);
480 v0 = _mm_add_epi32(v0, rnding);
481 v0 = _mm_srai_epi32(v0, bit);
482
483 v1 = _mm_sub_epi32(x, y);
484 v1 = _mm_add_epi32(v1, rnding);
485 v1 = _mm_srai_epi32(v1, bit);
486
487 x = _mm_mullo_epi32(u1, cospi48);
488 y = _mm_mullo_epi32(u3, cospim16);
489 v2 = _mm_add_epi32(x, y);
490 v2 = _mm_add_epi32(v2, rnding);
491 v2 = _mm_srai_epi32(v2, bit);
492
493 x = _mm_mullo_epi32(u1, cospi16);
494 y = _mm_mullo_epi32(u3, cospi48);
495 v3 = _mm_add_epi32(x, y);
496 v3 = _mm_add_epi32(v3, rnding);
497 v3 = _mm_srai_epi32(v3, bit);
498
499 // Stage 3
500 addsub_sse4_1(v0, v3, out + 0, out + 3, &clamp_lo, &clamp_hi);
501 addsub_sse4_1(v1, v2, out + 1, out + 2, &clamp_lo, &clamp_hi);
502
503 if (!do_cols) {
504    log_range = AOMMAX(16, bd + 6);
505 clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
506 clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
507
508 shift_and_clamp_sse4_1(out + 0, out + 3, &clamp_lo, &clamp_hi, out_shift);
509 shift_and_clamp_sse4_1(out + 1, out + 2, &clamp_lo, &clamp_hi, out_shift);
510 }
511}
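
/* Annotation (editorial note): the stage-2 pairs above are fixed-point
   rotations. Each output is, per lane,
     (x * cospi[a] + y * cospi[b] + (1 << (bit - 1))) >> bit,
   where cospi_arr(bit) holds cosine values scaled by 2^bit. */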
512
513static void iadst4x4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
514 int bd, int out_shift) {
515 const int32_t *sinpi = sinpi_arr(bit);
516 const __m128i zero = _mm_setzero_si128();
517 __m128i rnding = _mm_set1_epi32(1 << (bit + 4 - 1));
518 rnding = _mm_unpacklo_epi32(rnding, zero);
519 const __m128i mul = _mm_set1_epi32(1 << 4);
520 const __m128i sinpi1 = _mm_set1_epi32((int)sinpi[1]);
521 const __m128i sinpi2 = _mm_set1_epi32((int)sinpi[2]);
522 const __m128i sinpi3 = _mm_set1_epi32((int)sinpi[3]);
523 const __m128i sinpi4 = _mm_set1_epi32((int)sinpi[4]);
524 __m128i t;
525 __m128i s0, s1, s2, s3, s4, s5, s6, s7;
526 __m128i x0, x1, x2, x3;
527 __m128i u0, u1, u2, u3;
528 __m128i u0_low, u1_low, u2_low, u3_low;
529 __m128i u0_high, u1_high, u2_high, u3_high;
530
531 x0 = in[0];
532 x1 = in[1];
533 x2 = in[2];
534 x3 = in[3];
535
536 s0 = _mm_mullo_epi32(x0, sinpi1);
537 s1 = _mm_mullo_epi32(x0, sinpi2);
538 s2 = _mm_mullo_epi32(x1, sinpi3);
539 s3 = _mm_mullo_epi32(x2, sinpi4);
540 s4 = _mm_mullo_epi32(x2, sinpi1);
541 s5 = _mm_mullo_epi32(x3, sinpi2);
542 s6 = _mm_mullo_epi32(x3, sinpi4);
543 t = _mm_sub_epi32(x0, x2);
544 s7 = _mm_add_epi32(t, x3);
545
546 t = _mm_add_epi32(s0, s3);
547 s0 = _mm_add_epi32(t, s5);
548 t = _mm_sub_epi32(s1, s4);
549 s1 = _mm_sub_epi32(t, s6);
550 s3 = s2;
551 s2 = _mm_mullo_epi32(s7, sinpi3);
552
553 u0 = _mm_add_epi32(s0, s3);
554 u1 = _mm_add_epi32(s1, s3);
555 u2 = s2;
556 t = _mm_add_epi32(s0, s1);
557 u3 = _mm_sub_epi32(t, s3);
558
559 // u0
560 u0_low = _mm_mul_epi32(u0, mul);
561 u0_low = _mm_add_epi64(u0_low, rnding);
562
563  u0 = _mm_srli_si128(u0, 4);
564 u0_high = _mm_mul_epi32(u0, mul);
565 u0_high = _mm_add_epi64(u0_high, rnding);
566
567  u0_low = _mm_srli_si128(u0_low, 2);
568  u0_high = _mm_srli_si128(u0_high, 2);
569
570 u0 = _mm_unpacklo_epi32(u0_low, u0_high);
571 u0_high = _mm_unpackhi_epi32(u0_low, u0_high);
572 u0 = _mm_unpacklo_epi64(u0, u0_high);
573
574 // u1
575 u1_low = _mm_mul_epi32(u1, mul);
576 u1_low = _mm_add_epi64(u1_low, rnding);
577
578  u1 = _mm_srli_si128(u1, 4);
579 u1_high = _mm_mul_epi32(u1, mul);
580 u1_high = _mm_add_epi64(u1_high, rnding);
581
582  u1_low = _mm_srli_si128(u1_low, 2);
583  u1_high = _mm_srli_si128(u1_high, 2);
584
585 u1 = _mm_unpacklo_epi32(u1_low, u1_high);
586 u1_high = _mm_unpackhi_epi32(u1_low, u1_high);
587 u1 = _mm_unpacklo_epi64(u1, u1_high);
588
589 // u2
590 u2_low = _mm_mul_epi32(u2, mul);
591 u2_low = _mm_add_epi64(u2_low, rnding);
592
593  u2 = _mm_srli_si128(u2, 4);
594 u2_high = _mm_mul_epi32(u2, mul);
595 u2_high = _mm_add_epi64(u2_high, rnding);
596
597  u2_low = _mm_srli_si128(u2_low, 2);
598  u2_high = _mm_srli_si128(u2_high, 2);
599
600 u2 = _mm_unpacklo_epi32(u2_low, u2_high);
601 u2_high = _mm_unpackhi_epi32(u2_low, u2_high);
602 u2 = _mm_unpacklo_epi64(u2, u2_high);
603
604 // u3
605 u3_low = _mm_mul_epi32(u3, mul);
606 u3_low = _mm_add_epi64(u3_low, rnding);
607
608  u3 = _mm_srli_si128(u3, 4);
609 u3_high = _mm_mul_epi32(u3, mul);
610 u3_high = _mm_add_epi64(u3_high, rnding);
611
612  u3_low = _mm_srli_si128(u3_low, 2);
613  u3_high = _mm_srli_si128(u3_high, 2);
614
615 u3 = _mm_unpacklo_epi32(u3_low, u3_high);
616 u3_high = _mm_unpackhi_epi32(u3_low, u3_high);
617 u3 = _mm_unpacklo_epi64(u3, u3_high);
618
619 out[0] = u0;
620 out[1] = u1;
621 out[2] = u2;
622 out[3] = u3;
623
624 if (!do_cols) {
625    const int log_range = AOMMAX(16, bd + 6);
626 const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
627 const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
628 round_shift_4x4(out, out_shift);
629 highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 4);
630 }
631}
632
633static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride,
634 int fliplr, int flipud, int shift, int bd) {
635 const __m128i zero = _mm_setzero_si128();
636 __m128i u0, u1, u2, u3;
637 __m128i v0, v1, v2, v3;
638
639 round_shift_4x4(in, shift);
640
641 v0 = _mm_loadl_epi64((__m128i const *)(output + 0 * stride));
642 v1 = _mm_loadl_epi64((__m128i const *)(output + 1 * stride));
643 v2 = _mm_loadl_epi64((__m128i const *)(output + 2 * stride));
644 v3 = _mm_loadl_epi64((__m128i const *)(output + 3 * stride));
645
646 v0 = _mm_unpacklo_epi16(v0, zero);
647 v1 = _mm_unpacklo_epi16(v1, zero);
648 v2 = _mm_unpacklo_epi16(v2, zero);
649 v3 = _mm_unpacklo_epi16(v3, zero);
650
651 if (fliplr) {
652    in[0] = _mm_shuffle_epi32(in[0], 0x1B);
653    in[1] = _mm_shuffle_epi32(in[1], 0x1B);
654    in[2] = _mm_shuffle_epi32(in[2], 0x1B);
655    in[3] = _mm_shuffle_epi32(in[3], 0x1B);
656 }
657
658 if (flipud) {
659 u0 = _mm_add_epi32(in[3], v0);
660 u1 = _mm_add_epi32(in[2], v1);
661 u2 = _mm_add_epi32(in[1], v2);
662 u3 = _mm_add_epi32(in[0], v3);
663 } else {
664 u0 = _mm_add_epi32(in[0], v0);
665 u1 = _mm_add_epi32(in[1], v1);
666 u2 = _mm_add_epi32(in[2], v2);
667 u3 = _mm_add_epi32(in[3], v3);
668 }
669
670 v0 = _mm_packus_epi32(u0, u1);
671 v2 = _mm_packus_epi32(u2, u3);
672
673 u0 = highbd_clamp_epi16(v0, bd);
674 u2 = highbd_clamp_epi16(v2, bd);
675
676 v0 = _mm_unpacklo_epi64(u0, u0);
677 v1 = _mm_unpackhi_epi64(u0, u0);
678 v2 = _mm_unpacklo_epi64(u2, u2);
679 v3 = _mm_unpackhi_epi64(u2, u2);
680
681 _mm_storel_epi64((__m128i *)(output + 0 * stride), v0);
682 _mm_storel_epi64((__m128i *)(output + 1 * stride), v1);
683 _mm_storel_epi64((__m128i *)(output + 2 * stride), v2);
684 _mm_storel_epi64((__m128i *)(output + 3 * stride), v3);
685}
686
687static void iidentity4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
688 int bd, int out_shift) {
689 (void)bit;
690 __m128i zero = _mm_setzero_si128();
691 __m128i fact = _mm_set1_epi32(NewSqrt2);
692  __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1));
693 __m128i a0_low, a1_low;
694 __m128i a0_high, a1_high;
695
696 offset = _mm_unpacklo_epi32(offset, zero);
697
698 for (int i = 0; i < 4; i++) {
699 a0_low = _mm_mul_epi32(in[i], fact);
700 a0_low = _mm_add_epi32(a0_low, offset);
701    a0_low = _mm_srli_epi64(a0_low, NewSqrt2Bits);
702
703    a0_high = _mm_srli_si128(in[i], 4);
704 a0_high = _mm_mul_epi32(a0_high, fact);
705 a0_high = _mm_add_epi32(a0_high, offset);
706    a0_high = _mm_srli_epi64(a0_high, NewSqrt2Bits);
707
708 a1_low = _mm_unpacklo_epi32(a0_low, a0_high);
709 a1_high = _mm_unpackhi_epi32(a0_low, a0_high);
710 out[i] = _mm_unpacklo_epi64(a1_low, a1_high);
711 }
712
713 if (!do_cols) {
714    const int log_range = AOMMAX(16, bd + 6);
715 const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
716 const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
717 round_shift_4x4(out, out_shift);
718 highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 4);
719 }
720}
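
/* Annotation (editorial note): iidentity4 scales each coefficient by
   sqrt(2) in fixed point; per lane, roughly
     out = (in * NewSqrt2 + (1 << (NewSqrt2Bits - 1))) >> NewSqrt2Bits,
   with NewSqrt2Bits == 12 (per the macro expansion recorded in this
   report), so NewSqrt2 / 4096 approximates sqrt(2). */
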
721void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *input, uint16_t *output,
722 int stride, TX_TYPE tx_type, int bd) {
723 __m128i in[4];
724 const int8_t *shift = av1_inv_txfm_shift_ls[TX_4X4];
725
726 switch (tx_type) {
727 case DCT_DCT:
728 load_buffer_4x4(input, in);
729      idct4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
730      transpose_32bit_4x4(in, in);
731      idct4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
732 write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
733 break;
734 case ADST_DCT:
735 load_buffer_4x4(input, in);
736      idct4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
737      transpose_32bit_4x4(in, in);
738      iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
739 write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
740 break;
741 case DCT_ADST:
742 load_buffer_4x4(input, in);
743      iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
744      transpose_32bit_4x4(in, in);
745      idct4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
746 write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
747 break;
748 case ADST_ADST:
749 load_buffer_4x4(input, in);
750      iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
751      transpose_32bit_4x4(in, in);
752      iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
753 write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
754 break;
755 case FLIPADST_DCT:
756 load_buffer_4x4(input, in);
757      idct4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
758      transpose_32bit_4x4(in, in);
759      iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
760 write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
761 break;
762 case DCT_FLIPADST:
763 load_buffer_4x4(input, in);
764      iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
765      transpose_32bit_4x4(in, in);
766      idct4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
767 write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
768 break;
769 case FLIPADST_FLIPADST:
770 load_buffer_4x4(input, in);
771      iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
772      transpose_32bit_4x4(in, in);
773      iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
774 write_buffer_4x4(in, output, stride, 1, 1, -shift[1], bd);
775 break;
776 case ADST_FLIPADST:
777 load_buffer_4x4(input, in);
778      iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
779      transpose_32bit_4x4(in, in);
780      iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
781 write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
782 break;
783 case FLIPADST_ADST:
784 load_buffer_4x4(input, in);
785      iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
786      transpose_32bit_4x4(in, in);
787      iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
788 write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
789 break;
790 case IDTX:
791 load_buffer_4x4(input, in);
792      iidentity4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
793      transpose_32bit_4x4(in, in);
794      iidentity4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
795 write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
796 break;
797 case V_DCT:
798 load_buffer_4x4(input, in);
799      iidentity4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
800      transpose_32bit_4x4(in, in);
801      idct4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
802 write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
803 break;
804 case H_DCT:
805 load_buffer_4x4(input, in);
806      idct4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
807      transpose_32bit_4x4(in, in);
808      iidentity4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
809 write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
810 break;
811 case V_ADST:
812 load_buffer_4x4(input, in);
813      iidentity4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
814      transpose_32bit_4x4(in, in);
815      iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
816 write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
817 break;
818 case H_ADST:
819 load_buffer_4x4(input, in);
820      iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
821      transpose_32bit_4x4(in, in);
822      iidentity4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
823 write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
824 break;
825 case V_FLIPADST:
826 load_buffer_4x4(input, in);
827      iidentity4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
828      transpose_32bit_4x4(in, in);
829      iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
830 write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
831 break;
832 case H_FLIPADST:
833 load_buffer_4x4(input, in);
834      iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
835      transpose_32bit_4x4(in, in);
836      iidentity4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
837 write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
838 break;
839    default: assert(0);
840 }
841}
842
843// 8x8
844static void load_buffer_8x8(const int32_t *coeff, __m128i *in) {
845 in[0] = _mm_load_si128((const __m128i *)(coeff + 0));
846 in[1] = _mm_load_si128((const __m128i *)(coeff + 4));
847 in[2] = _mm_load_si128((const __m128i *)(coeff + 8));
848 in[3] = _mm_load_si128((const __m128i *)(coeff + 12));
849 in[4] = _mm_load_si128((const __m128i *)(coeff + 16));
850 in[5] = _mm_load_si128((const __m128i *)(coeff + 20));
851 in[6] = _mm_load_si128((const __m128i *)(coeff + 24));
852 in[7] = _mm_load_si128((const __m128i *)(coeff + 28));
853 in[8] = _mm_load_si128((const __m128i *)(coeff + 32));
854 in[9] = _mm_load_si128((const __m128i *)(coeff + 36));
855 in[10] = _mm_load_si128((const __m128i *)(coeff + 40));
856 in[11] = _mm_load_si128((const __m128i *)(coeff + 44));
857 in[12] = _mm_load_si128((const __m128i *)(coeff + 48));
858 in[13] = _mm_load_si128((const __m128i *)(coeff + 52));
859 in[14] = _mm_load_si128((const __m128i *)(coeff + 56));
860 in[15] = _mm_load_si128((const __m128i *)(coeff + 60));
861}
862
863static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
864 int bd, int out_shift) {
865 const int32_t *cospi = cospi_arr(bit);
866 const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
867 const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
868 const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
869 const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
870 const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
871 const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
872 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
873 const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
874 const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
875 const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
876 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
877  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
878 const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
879 const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
880 __m128i u0, u1, u2, u3, u4, u5, u6, u7;
881 __m128i v0, v1, v2, v3, v4, v5, v6, v7;
882 __m128i x, y;
883 int col;
884
885 // Note:
886 // Even column: 0, 2, ..., 14
887 // Odd column: 1, 3, ..., 15
888 // one even column plus one odd column constructs one row (8 coeffs)
889 // total we have 8 rows (8x8).
890 for (col = 0; col < 2; ++col) {
891 // stage 0
892 // stage 1
893 // stage 2
894 u0 = in[0 * 2 + col];
895 u1 = in[4 * 2 + col];
896 u2 = in[2 * 2 + col];
897 u3 = in[6 * 2 + col];
898
899 x = _mm_mullo_epi32(in[1 * 2 + col], cospi56);
900 y = _mm_mullo_epi32(in[7 * 2 + col], cospim8);
901 u4 = _mm_add_epi32(x, y);
902 u4 = _mm_add_epi32(u4, rnding);
903 u4 = _mm_srai_epi32(u4, bit);
904
905 x = _mm_mullo_epi32(in[1 * 2 + col], cospi8);
906 y = _mm_mullo_epi32(in[7 * 2 + col], cospi56);
907 u7 = _mm_add_epi32(x, y);
908 u7 = _mm_add_epi32(u7, rnding);
909 u7 = _mm_srai_epi32(u7, bit);
910
911 x = _mm_mullo_epi32(in[5 * 2 + col], cospi24);
912 y = _mm_mullo_epi32(in[3 * 2 + col], cospim40);
913 u5 = _mm_add_epi32(x, y);
914 u5 = _mm_add_epi32(u5, rnding);
915 u5 = _mm_srai_epi32(u5, bit);
916
917 x = _mm_mullo_epi32(in[5 * 2 + col], cospi40);
918 y = _mm_mullo_epi32(in[3 * 2 + col], cospi24);
919 u6 = _mm_add_epi32(x, y);
920 u6 = _mm_add_epi32(u6, rnding);
921 u6 = _mm_srai_epi32(u6, bit);
922
923 // stage 3
924 x = _mm_mullo_epi32(u0, cospi32);
925 y = _mm_mullo_epi32(u1, cospi32);
926 v0 = _mm_add_epi32(x, y);
927 v0 = _mm_add_epi32(v0, rnding);
928 v0 = _mm_srai_epi32(v0, bit);
929
930 v1 = _mm_sub_epi32(x, y);
931 v1 = _mm_add_epi32(v1, rnding);
932 v1 = _mm_srai_epi32(v1, bit);
933
934 x = _mm_mullo_epi32(u2, cospi48);
935 y = _mm_mullo_epi32(u3, cospim16);
936 v2 = _mm_add_epi32(x, y);
937 v2 = _mm_add_epi32(v2, rnding);
938 v2 = _mm_srai_epi32(v2, bit);
939
940 x = _mm_mullo_epi32(u2, cospi16);
941 y = _mm_mullo_epi32(u3, cospi48);
942 v3 = _mm_add_epi32(x, y);
943 v3 = _mm_add_epi32(v3, rnding);
944 v3 = _mm_srai_epi32(v3, bit);
945
946 addsub_sse4_1(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
947 addsub_sse4_1(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);
948
949 // stage 4
950 addsub_sse4_1(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
951 addsub_sse4_1(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
952 u4 = v4;
953 u7 = v7;
954
955 x = _mm_mullo_epi32(v5, cospi32);
956 y = _mm_mullo_epi32(v6, cospi32);
957 u6 = _mm_add_epi32(y, x);
958 u6 = _mm_add_epi32(u6, rnding);
959 u6 = _mm_srai_epi32(u6, bit);
960
961 u5 = _mm_sub_epi32(y, x);
962 u5 = _mm_add_epi32(u5, rnding);
963 u5 = _mm_srai_epi32(u5, bit);
964
965 // stage 5
966 addsub_sse4_1(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col, &clamp_lo,
967 &clamp_hi);
968 addsub_sse4_1(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col, &clamp_lo,
969 &clamp_hi);
970 addsub_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col, &clamp_lo,
971 &clamp_hi);
972 addsub_sse4_1(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col, &clamp_lo,
973 &clamp_hi);
974 }
975
976 if (!do_cols) {
977    const int log_range_out = AOMMAX(16, bd + 6);
978 const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
979 const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
980 round_shift_8x8(out, out_shift);
981 highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 16);
982 }
983}
984
985static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
986 int bd, int out_shift) {
987 const int32_t *cospi = cospi_arr(bit);
988 const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
989 const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
990 const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
991 const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
992 const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
993 const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
994 const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
995 const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
996 const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
997 const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
998 const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
999 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
1000 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
1001 const __m128i kZero = _mm_setzero_si128();
1002  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
1003 const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
1004 const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
1005 __m128i u[8], v[8], x;
1006
1007 // Even 8 points: 0, 2, ..., 14
1008 // stage 0
1009 // stage 1
1010 // stage 2
1011 // (1)
1012 u[0] = _mm_mullo_epi32(in[14], cospi4);
1013 x = _mm_mullo_epi32(in[0], cospi60);
1014 u[0] = _mm_add_epi32(u[0], x);
1015 u[0] = _mm_add_epi32(u[0], rnding);
1016 u[0] = _mm_srai_epi32(u[0], bit);
1017
1018 u[1] = _mm_mullo_epi32(in[14], cospi60);
1019 x = _mm_mullo_epi32(in[0], cospi4);
1020 u[1] = _mm_sub_epi32(u[1], x);
1021 u[1] = _mm_add_epi32(u[1], rnding);
1022 u[1] = _mm_srai_epi32(u[1], bit);
1023
1024 // (2)
1025 u[2] = _mm_mullo_epi32(in[10], cospi20);
1026 x = _mm_mullo_epi32(in[4], cospi44);
1027 u[2] = _mm_add_epi32(u[2], x);
1028 u[2] = _mm_add_epi32(u[2], rnding);
1029 u[2] = _mm_srai_epi32(u[2], bit);
1030
1031 u[3] = _mm_mullo_epi32(in[10], cospi44);
1032 x = _mm_mullo_epi32(in[4], cospi20);
1033 u[3] = _mm_sub_epi32(u[3], x);
1034 u[3] = _mm_add_epi32(u[3], rnding);
1035 u[3] = _mm_srai_epi32(u[3], bit);
1036
1037 // (3)
1038 u[4] = _mm_mullo_epi32(in[6], cospi36);
1039 x = _mm_mullo_epi32(in[8], cospi28);
1040 u[4] = _mm_add_epi32(u[4], x);
1041 u[4] = _mm_add_epi32(u[4], rnding);
1042 u[4] = _mm_srai_epi32(u[4], bit);
1043
1044 u[5] = _mm_mullo_epi32(in[6], cospi28);
1045 x = _mm_mullo_epi32(in[8], cospi36);
1046 u[5] = _mm_sub_epi32(u[5], x);
1047 u[5] = _mm_add_epi32(u[5], rnding);
1048 u[5] = _mm_srai_epi32(u[5], bit);
1049
1050 // (4)
1051 u[6] = _mm_mullo_epi32(in[2], cospi52);
1052 x = _mm_mullo_epi32(in[12], cospi12);
1053 u[6] = _mm_add_epi32(u[6], x);
1054 u[6] = _mm_add_epi32(u[6], rnding);
1055 u[6] = _mm_srai_epi32(u[6], bit);
1056
1057 u[7] = _mm_mullo_epi32(in[2], cospi12);
1058 x = _mm_mullo_epi32(in[12], cospi52);
1059 u[7] = _mm_sub_epi32(u[7], x);
1060 u[7] = _mm_add_epi32(u[7], rnding);
1061 u[7] = _mm_srai_epi32(u[7], bit);
1062
1063 // stage 3
1064 addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
1065 addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
1066 addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
1067 addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
1068
1069 // stage 4
1070 u[0] = v[0];
1071 u[1] = v[1];
1072 u[2] = v[2];
1073 u[3] = v[3];
1074
1075 u[4] = _mm_mullo_epi32(v[4], cospi16);
1076 x = _mm_mullo_epi32(v[5], cospi48);
1077 u[4] = _mm_add_epi32(u[4], x);
1078 u[4] = _mm_add_epi32(u[4], rnding);
1079 u[4] = _mm_srai_epi32(u[4], bit);
1080
1081 u[5] = _mm_mullo_epi32(v[4], cospi48);
1082 x = _mm_mullo_epi32(v[5], cospi16);
1083 u[5] = _mm_sub_epi32(u[5], x);
1084 u[5] = _mm_add_epi32(u[5], rnding);
1085 u[5] = _mm_srai_epi32(u[5], bit);
1086
1087 u[6] = _mm_mullo_epi32(v[6], cospim48);
1088 x = _mm_mullo_epi32(v[7], cospi16);
1089 u[6] = _mm_add_epi32(u[6], x);
1090 u[6] = _mm_add_epi32(u[6], rnding);
1091 u[6] = _mm_srai_epi32(u[6], bit);
1092
1093 u[7] = _mm_mullo_epi32(v[6], cospi16);
1094 x = _mm_mullo_epi32(v[7], cospim48);
1095 u[7] = _mm_sub_epi32(u[7], x);
1096 u[7] = _mm_add_epi32(u[7], rnding);
1097 u[7] = _mm_srai_epi32(u[7], bit);
1098
1099 // stage 5
1100 addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
1101 addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
1102 addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
1103 addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
1104
1105 // stage 6
1106 u[0] = v[0];
1107 u[1] = v[1];
1108 u[4] = v[4];
1109 u[5] = v[5];
1110
1111 v[0] = _mm_mullo_epi32(v[2], cospi32);
1112 x = _mm_mullo_epi32(v[3], cospi32);
1113 u[2] = _mm_add_epi32(v[0], x);
1114 u[2] = _mm_add_epi32(u[2], rnding);
1115 u[2] = _mm_srai_epi32(u[2], bit);
1116
1117 u[3] = _mm_sub_epi32(v[0], x);
1118 u[3] = _mm_add_epi32(u[3], rnding);
1119 u[3] = _mm_srai_epi32(u[3], bit);
1120
1121 v[0] = _mm_mullo_epi32(v[6], cospi32);
1122 x = _mm_mullo_epi32(v[7], cospi32);
1123 u[6] = _mm_add_epi32(v[0], x);
1124 u[6] = _mm_add_epi32(u[6], rnding);
1125 u[6] = _mm_srai_epi32(u[6], bit);
1126
1127 u[7] = _mm_sub_epi32(v[0], x);
1128 u[7] = _mm_add_epi32(u[7], rnding);
1129 u[7] = _mm_srai_epi32(u[7], bit);
1130
1131 // stage 7
1132 if (do_cols) {
1133 out[0] = u[0];
1134 out[2] = _mm_sub_epi32(kZero, u[4]);
1135 out[4] = u[6];
1136 out[6] = _mm_sub_epi32(kZero, u[2]);
1137 out[8] = u[3];
1138 out[10] = _mm_sub_epi32(kZero, u[7]);
1139 out[12] = u[5];
1140 out[14] = _mm_sub_epi32(kZero, u[1]);
1141 } else {
1142    const int log_range_out = AOMMAX(16, bd + 6);
1143 const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
1144 const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
1145
1146 neg_shift_sse4_1(u[0], u[4], out + 0, out + 2, &clamp_lo_out, &clamp_hi_out,
1147 out_shift);
1148 neg_shift_sse4_1(u[6], u[2], out + 4, out + 6, &clamp_lo_out, &clamp_hi_out,
1149 out_shift);
1150 neg_shift_sse4_1(u[3], u[7], out + 8, out + 10, &clamp_lo_out,
1151 &clamp_hi_out, out_shift);
1152 neg_shift_sse4_1(u[5], u[1], out + 12, out + 14, &clamp_lo_out,
1153 &clamp_hi_out, out_shift);
1154 }
1155
1156 // Odd 8 points: 1, 3, ..., 15
1157 // stage 0
1158 // stage 1
1159 // stage 2
1160 // (1)
1161 u[0] = _mm_mullo_epi32(in[15], cospi4);
1162 x = _mm_mullo_epi32(in[1], cospi60);
1163 u[0] = _mm_add_epi32(u[0], x);
1164 u[0] = _mm_add_epi32(u[0], rnding);
1165 u[0] = _mm_srai_epi32(u[0], bit);
1166
1167 u[1] = _mm_mullo_epi32(in[15], cospi60);
1168 x = _mm_mullo_epi32(in[1], cospi4);
1169 u[1] = _mm_sub_epi32(u[1], x);
1170 u[1] = _mm_add_epi32(u[1], rnding);
1171 u[1] = _mm_srai_epi32(u[1], bit);
1172
1173 // (2)
1174 u[2] = _mm_mullo_epi32(in[11], cospi20);
1175 x = _mm_mullo_epi32(in[5], cospi44);
1176 u[2] = _mm_add_epi32(u[2], x);
1177 u[2] = _mm_add_epi32(u[2], rnding);
1178 u[2] = _mm_srai_epi32(u[2], bit);
1179
1180 u[3] = _mm_mullo_epi32(in[11], cospi44);
1181 x = _mm_mullo_epi32(in[5], cospi20);
1182 u[3] = _mm_sub_epi32(u[3], x);
1183 u[3] = _mm_add_epi32(u[3], rnding);
1184 u[3] = _mm_srai_epi32(u[3], bit);
1185
1186 // (3)
1187 u[4] = _mm_mullo_epi32(in[7], cospi36);
1188 x = _mm_mullo_epi32(in[9], cospi28);
1189 u[4] = _mm_add_epi32(u[4], x);
1190 u[4] = _mm_add_epi32(u[4], rnding);
1191 u[4] = _mm_srai_epi32(u[4], bit);
1192
1193 u[5] = _mm_mullo_epi32(in[7], cospi28);
1194 x = _mm_mullo_epi32(in[9], cospi36);
1195 u[5] = _mm_sub_epi32(u[5], x);
1196 u[5] = _mm_add_epi32(u[5], rnding);
1197 u[5] = _mm_srai_epi32(u[5], bit);
1198
1199 // (4)
1200 u[6] = _mm_mullo_epi32(in[3], cospi52);
1201 x = _mm_mullo_epi32(in[13], cospi12);
1202 u[6] = _mm_add_epi32(u[6], x);
1203 u[6] = _mm_add_epi32(u[6], rnding);
1204 u[6] = _mm_srai_epi32(u[6], bit);
1205
1206 u[7] = _mm_mullo_epi32(in[3], cospi12);
1207 x = _mm_mullo_epi32(in[13], cospi52);
1208 u[7] = _mm_sub_epi32(u[7], x);
1209 u[7] = _mm_add_epi32(u[7], rnding);
1210 u[7] = _mm_srai_epi32(u[7], bit);
1211
1212 // stage 3
1213 addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
1214 addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
1215 addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
1216 addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
1217
1218 // stage 4
1219 u[0] = v[0];
1220 u[1] = v[1];
1221 u[2] = v[2];
1222 u[3] = v[3];
1223
1224 u[4] = _mm_mullo_epi32(v[4], cospi16);
1225 x = _mm_mullo_epi32(v[5], cospi48);
1226 u[4] = _mm_add_epi32(u[4], x);
1227 u[4] = _mm_add_epi32(u[4], rnding);
1228 u[4] = _mm_srai_epi32(u[4], bit);
1229
1230 u[5] = _mm_mullo_epi32(v[4], cospi48);
1231 x = _mm_mullo_epi32(v[5], cospi16);
1232 u[5] = _mm_sub_epi32(u[5], x);
1233 u[5] = _mm_add_epi32(u[5], rnding);
1234 u[5] = _mm_srai_epi32(u[5], bit);
1235
1236 u[6] = _mm_mullo_epi32(v[6], cospim48);
1237 x = _mm_mullo_epi32(v[7], cospi16);
1238 u[6] = _mm_add_epi32(u[6], x);
1239 u[6] = _mm_add_epi32(u[6], rnding);
1240 u[6] = _mm_srai_epi32(u[6], bit);
1241
1242 u[7] = _mm_mullo_epi32(v[6], cospi16);
1243 x = _mm_mullo_epi32(v[7], cospim48);
1244 u[7] = _mm_sub_epi32(u[7], x);
1245 u[7] = _mm_add_epi32(u[7], rnding);
1246 u[7] = _mm_srai_epi32(u[7], bit);
1247
1248 // stage 5
1249 addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
1250 addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
1251 addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
1252 addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
1253
1254 // stage 6
1255 u[0] = v[0];
1256 u[1] = v[1];
1257 u[4] = v[4];
1258 u[5] = v[5];
1259
1260 v[0] = _mm_mullo_epi32(v[2], cospi32);
1261 x = _mm_mullo_epi32(v[3], cospi32);
1262 u[2] = _mm_add_epi32(v[0], x);
1263 u[2] = _mm_add_epi32(u[2], rnding);
1264 u[2] = _mm_srai_epi32(u[2], bit);
1265
1266 u[3] = _mm_sub_epi32(v[0], x);
1267 u[3] = _mm_add_epi32(u[3], rnding);
1268 u[3] = _mm_srai_epi32(u[3], bit);
1269
1270 v[0] = _mm_mullo_epi32(v[6], cospi32);
1271 x = _mm_mullo_epi32(v[7], cospi32);
1272 u[6] = _mm_add_epi32(v[0], x);
1273 u[6] = _mm_add_epi32(u[6], rnding);
1274 u[6] = _mm_srai_epi32(u[6], bit);
1275
1276 u[7] = _mm_sub_epi32(v[0], x);
1277 u[7] = _mm_add_epi32(u[7], rnding);
1278 u[7] = _mm_srai_epi32(u[7], bit);
1279
1280 // stage 7
1281 if (do_cols) {
1282 out[1] = u[0];
1283 out[3] = _mm_sub_epi32(kZero, u[4]);
1284 out[5] = u[6];
1285 out[7] = _mm_sub_epi32(kZero, u[2]);
1286 out[9] = u[3];
1287 out[11] = _mm_sub_epi32(kZero, u[7]);
1288 out[13] = u[5];
1289 out[15] = _mm_sub_epi32(kZero, u[1]);
1290 } else {
1291    const int log_range_out = AOMMAX(16, bd + 6);
1292 const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
1293 const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
1294
1295 neg_shift_sse4_1(u[0], u[4], out + 1, out + 3, &clamp_lo_out, &clamp_hi_out,
1296 out_shift);
1297 neg_shift_sse4_1(u[6], u[2], out + 5, out + 7, &clamp_lo_out, &clamp_hi_out,
1298 out_shift);
1299 neg_shift_sse4_1(u[3], u[7], out + 9, out + 11, &clamp_lo_out,
1300 &clamp_hi_out, out_shift);
1301 neg_shift_sse4_1(u[5], u[1], out + 13, out + 15, &clamp_lo_out,
1302 &clamp_hi_out, out_shift);
1303 }
1304}
1305
1306static void iidentity8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
1307 int bd, int out_shift) {
1308 (void)bit;
1309 out[0] = _mm_add_epi32(in[0], in[0]);
1310 out[1] = _mm_add_epi32(in[1], in[1]);
1311 out[2] = _mm_add_epi32(in[2], in[2]);
1312 out[3] = _mm_add_epi32(in[3], in[3]);
1313 out[4] = _mm_add_epi32(in[4], in[4]);
1314 out[5] = _mm_add_epi32(in[5], in[5]);
1315 out[6] = _mm_add_epi32(in[6], in[6]);
1316 out[7] = _mm_add_epi32(in[7], in[7]);
1317
1318 if (!do_cols) {
1319    const int log_range = AOMMAX(16, bd + 6);
1320 const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
1321 const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
1322 round_shift_4x4(out, out_shift);
1323 round_shift_4x4(out + 4, out_shift);
1324 highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 8);
1325 }
1326}
1327
1328static __m128i get_recon_8x8(const __m128i pred, __m128i res_lo, __m128i res_hi,
1329 int fliplr, int bd) {
1330 __m128i x0, x1;
1331 const __m128i zero = _mm_setzero_si128();
1332
1333 x0 = _mm_unpacklo_epi16(pred, zero);
1334 x1 = _mm_unpackhi_epi16(pred, zero);
1335
1336 if (fliplr) {
1337    res_lo = _mm_shuffle_epi32(res_lo, 0x1B);
1338    res_hi = _mm_shuffle_epi32(res_hi, 0x1B);
1339 x0 = _mm_add_epi32(res_hi, x0);
1340 x1 = _mm_add_epi32(res_lo, x1);
1341
1342 } else {
1343 x0 = _mm_add_epi32(res_lo, x0);
1344 x1 = _mm_add_epi32(res_hi, x1);
1345 }
1346
1347 x0 = _mm_packus_epi32(x0, x1);
1348 return highbd_clamp_epi16(x0, bd);
1349}
1350
1351static void write_buffer_8x8(__m128i *in, uint16_t *output, int stride,
1352 int fliplr, int flipud, int shift, int bd) {
1353 __m128i u0, u1, u2, u3, u4, u5, u6, u7;
1354 __m128i v0, v1, v2, v3, v4, v5, v6, v7;
1355
1356 round_shift_8x8(in, shift);
1357
1358 v0 = _mm_load_si128((__m128i const *)(output + 0 * stride));
1359 v1 = _mm_load_si128((__m128i const *)(output + 1 * stride));
1360 v2 = _mm_load_si128((__m128i const *)(output + 2 * stride));
1361 v3 = _mm_load_si128((__m128i const *)(output + 3 * stride));
1362 v4 = _mm_load_si128((__m128i const *)(output + 4 * stride));
1363 v5 = _mm_load_si128((__m128i const *)(output + 5 * stride));
1364 v6 = _mm_load_si128((__m128i const *)(output + 6 * stride));
1365 v7 = _mm_load_si128((__m128i const *)(output + 7 * stride));
1366
1367 if (flipud) {
1368 u0 = get_recon_8x8(v0, in[14], in[15], fliplr, bd);
1369 u1 = get_recon_8x8(v1, in[12], in[13], fliplr, bd);
1370 u2 = get_recon_8x8(v2, in[10], in[11], fliplr, bd);
1371 u3 = get_recon_8x8(v3, in[8], in[9], fliplr, bd);
1372 u4 = get_recon_8x8(v4, in[6], in[7], fliplr, bd);
1373 u5 = get_recon_8x8(v5, in[4], in[5], fliplr, bd);
1374 u6 = get_recon_8x8(v6, in[2], in[3], fliplr, bd);
1375 u7 = get_recon_8x8(v7, in[0], in[1], fliplr, bd);
1376 } else {
1377 u0 = get_recon_8x8(v0, in[0], in[1], fliplr, bd);
1378 u1 = get_recon_8x8(v1, in[2], in[3], fliplr, bd);
1379 u2 = get_recon_8x8(v2, in[4], in[5], fliplr, bd);
1380 u3 = get_recon_8x8(v3, in[6], in[7], fliplr, bd);
1381 u4 = get_recon_8x8(v4, in[8], in[9], fliplr, bd);
1382 u5 = get_recon_8x8(v5, in[10], in[11], fliplr, bd);
1383 u6 = get_recon_8x8(v6, in[12], in[13], fliplr, bd);
1384 u7 = get_recon_8x8(v7, in[14], in[15], fliplr, bd);
1385 }
1386
1387 _mm_store_si128((__m128i *)(output + 0 * stride), u0);
1388 _mm_store_si128((__m128i *)(output + 1 * stride), u1);
1389 _mm_store_si128((__m128i *)(output + 2 * stride), u2);
1390 _mm_store_si128((__m128i *)(output + 3 * stride), u3);
1391 _mm_store_si128((__m128i *)(output + 4 * stride), u4);
1392 _mm_store_si128((__m128i *)(output + 5 * stride), u5);
1393 _mm_store_si128((__m128i *)(output + 6 * stride), u6);
1394 _mm_store_si128((__m128i *)(output + 7 * stride), u7);
1395}
1396
1397void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *input, uint16_t *output,
1398 int stride, TX_TYPE tx_type, int bd) {
1399 __m128i in[16], out[16];
1400 const int8_t *shift = av1_inv_txfm_shift_ls[TX_8X8];
1401
1402 switch (tx_type) {
1403 case DCT_DCT:
1404 load_buffer_8x8(input, in);
1405 idct8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]);
1406 transpose_8x8(out, in);
1407 idct8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0);
1408 write_buffer_8x8(out, output, stride, 0, 0, -shift[1], bd);
1409 break;
1410 case DCT_ADST:
1411 load_buffer_8x8(input, in);
1412 iadst8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]);
1413 transpose_8x8(out, in);
1414 idct8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0);
1415 write_buffer_8x8(out, output, stride, 0, 0, -shift[1], bd);
1416 break;
1417 case ADST_DCT:
1418 load_buffer_8x8(input, in);
1419 idct8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]);
1420 transpose_8x8(out, in);
1421 iadst8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0);
1422 write_buffer_8x8(out, output, stride, 0, 0, -shift[1], bd);
1423 break;
1424 case ADST_ADST:
1425 load_buffer_8x8(input, in);
1426 iadst8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]);
1427 transpose_8x8(out, in);
1428 iadst8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0);
1429 write_buffer_8x8(out, output, stride, 0, 0, -shift[1], bd);
1430 break;
1431 case FLIPADST_DCT:
1432 load_buffer_8x8(input, in);
1433 idct8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]);
1434 transpose_8x8(out, in);
1435 iadst8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0);
1436 write_buffer_8x8(out, output, stride, 0, 1, -shift[1], bd);
1437 break;
1438 case DCT_FLIPADST:
1439 load_buffer_8x8(input, in);
1440 iadst8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]);
1441 transpose_8x8(out, in);
1442 idct8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0);
1443 write_buffer_8x8(out, output, stride, 1, 0, -shift[1], bd);
1444 break;
1445 case ADST_FLIPADST:
1446 load_buffer_8x8(input, in);
1447 iadst8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]);
1448 transpose_8x8(out, in);
1449 iadst8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0);
1450 write_buffer_8x8(out, output, stride, 1, 0, -shift[1], bd);
1451 break;
1452 case FLIPADST_FLIPADST:
1453 load_buffer_8x8(input, in);
1454 iadst8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]);
1455 transpose_8x8(out, in);
1456 iadst8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0);
1457 write_buffer_8x8(out, output, stride, 1, 1, -shift[1], bd);
1458 break;
1459 case FLIPADST_ADST:
1460 load_buffer_8x8(input, in);
1461 iadst8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]);
1462 transpose_8x8(out, in);
1463 iadst8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0);
1464 write_buffer_8x8(out, output, stride, 0, 1, -shift[1], bd);
1465 break;
1466 default: assert(0);
1467 }
1468}
1469
1470static void idct8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
1471 int bd, int out_shift) {
1472 const int32_t *cospi = cospi_arr(bit);
1473 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
1474 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
1475 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
1476 __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
1477 __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
1478 __m128i x;
1479
1480 // stage 0
1481 // stage 1
1482 // stage 2
1483 // stage 3
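// DC-only fast path: only in[0] can be nonzero, so stages 0-3 collapse to a single scale by cospi[32] (cos(pi/4)), applied below.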
1484 x = _mm_mullo_epi32(in[0], cospi32);
1485 x = _mm_add_epi32(x, rnding);
1486 x = _mm_srai_epi32(x, bit);
1487
1488 // stage 4
1489 // stage 5
1490 if (!do_cols) {
1491 const int log_range_out = AOMMAX(16, bd + 6);
1492 clamp_lo = _mm_set1_epi32(-(1 << (log_range_out - 1)));
1493 clamp_hi = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
1494
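// Round to nearest: add 2^(out_shift - 1) before the arithmetic right shift. Note that (1 << out_shift) is well-defined in C only while out_shift < 31.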
1495 __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
1496 x = _mm_add_epi32(x, offset);
1497 x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
1498 }
1499
1500 x = _mm_max_epi32(x, clamp_lo);
1501 x = _mm_min_epi32(x, clamp_hi);
1502 out[0] = x;
1503 out[1] = x;
1504 out[2] = x;
1505 out[3] = x;
1506 out[4] = x;
1507 out[5] = x;
1508 out[6] = x;
1509 out[7] = x;
1510}
1511
1512static void idct8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
1513 int bd, int out_shift) {
1514 const int32_t *cospi = cospi_arr(bit);
1515 const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
1516 const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
1517 const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
1518 const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
1519 const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
1520 const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
1521 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
1522 const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
1523 const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
1524 const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
1525 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
1526 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
1527 const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
1528 const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
1529 __m128i u0, u1, u2, u3, u4, u5, u6, u7;
1530 __m128i v0, v1, v2, v3, v4, v5, v6, v7;
1531 __m128i x, y;
1532
1533 // stage 0
1534 // stage 1
1535 // stage 2
1536 u0 = in[0];
1537 u1 = in[4];
1538 u2 = in[2];
1539 u3 = in[6];
1540
1541 x = _mm_mullo_epi32(in[1], cospi56);
1542 y = _mm_mullo_epi32(in[7], cospim8);
1543 u4 = _mm_add_epi32(x, y);
1544 u4 = _mm_add_epi32(u4, rnding);
1545 u4 = _mm_srai_epi32(u4, bit);
1546
1547 x = _mm_mullo_epi32(in[1], cospi8);
1548 y = _mm_mullo_epi32(in[7], cospi56);
1549 u7 = _mm_add_epi32(x, y);
1550 u7 = _mm_add_epi32(u7, rnding);
1551 u7 = _mm_srai_epi32(u7, bit);
1552
1553 x = _mm_mullo_epi32(in[5], cospi24);
1554 y = _mm_mullo_epi32(in[3], cospim40);
1555 u5 = _mm_add_epi32(x, y);
1556 u5 = _mm_add_epi32(u5, rnding);
1557 u5 = _mm_srai_epi32(u5, bit);
1558
1559 x = _mm_mullo_epi32(in[5], cospi40);
1560 y = _mm_mullo_epi32(in[3], cospi24);
1561 u6 = _mm_add_epi32(x, y);
1562 u6 = _mm_add_epi32(u6, rnding);
1563 u6 = _mm_srai_epi32(u6, bit);
1564
1565 // stage 3
1566 x = _mm_mullo_epi32(u0, cospi32);
1567 y = _mm_mullo_epi32(u1, cospi32);
1568 v0 = _mm_add_epi32(x, y);
1569 v0 = _mm_add_epi32(v0, rnding);
1570 v0 = _mm_srai_epi32(v0, bit);
1571
1572 v1 = _mm_sub_epi32(x, y);
1573 v1 = _mm_add_epi32(v1, rnding);
1574 v1 = _mm_srai_epi32(v1, bit);
1575
1576 x = _mm_mullo_epi32(u2, cospi48);
1577 y = _mm_mullo_epi32(u3, cospim16);
1578 v2 = _mm_add_epi32(x, y);
1579 v2 = _mm_add_epi32(v2, rnding);
1580 v2 = _mm_srai_epi32(v2, bit);
1581
1582 x = _mm_mullo_epi32(u2, cospi16);
1583 y = _mm_mullo_epi32(u3, cospi48);
1584 v3 = _mm_add_epi32(x, y);
1585 v3 = _mm_add_epi32(v3, rnding);
1586 v3 = _mm_srai_epi32(v3, bit);
1587
1588 addsub_sse4_1(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
1589 addsub_sse4_1(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);
1590
1591 // stage 4
1592 addsub_sse4_1(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
1593 addsub_sse4_1(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
1594 u4 = v4;
1595 u7 = v7;
1596
1597 x = _mm_mullo_epi32(v5, cospi32);
1598 y = _mm_mullo_epi32(v6, cospi32);
1599 u6 = _mm_add_epi32(y, x);
1600 u6 = _mm_add_epi32(u6, rnding);
1601 u6 = _mm_srai_epi32(u6, bit);
1602
1603 u5 = _mm_sub_epi32(y, x);
1604 u5 = _mm_add_epi32(u5, rnding);
1605 u5 = _mm_srai_epi32(u5, bit);
1606
1607 // stage 5
1608 addsub_sse4_1(u0, u7, out + 0, out + 7, &clamp_lo, &clamp_hi);
1609 addsub_sse4_1(u1, u6, out + 1, out + 6, &clamp_lo, &clamp_hi);
1610 addsub_sse4_1(u2, u5, out + 2, out + 5, &clamp_lo, &clamp_hi);
1611 addsub_sse4_1(u3, u4, out + 3, out + 4, &clamp_lo, &clamp_hi);
1612
1613 if (!do_cols) {
1614 const int log_range_out = AOMMAX(16, bd + 6);
1615 const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
1616 const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
1617
1618 round_shift_4x4(out, out_shift);
1619 round_shift_4x4(out + 4, out_shift);
1620 highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 8);
1621 }
1622}
1623
1624static void iadst8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit,
1625 int do_cols, int bd, int out_shift) {
1626 const int32_t *cospi = cospi_arr(bit);
1627 const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
1628 const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
1629 const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
1630 const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
1631 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
1632 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
1633 const __m128i kZero = _mm_setzero_si128();
1634 __m128i u[8], x;
1635
1636 // stage 0
1637 // stage 1
1638 // stage 2
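// Only in[0] can be nonzero here, so stage 2 reduces to scaling in[0] by cospi60 and -cospi4.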
1639
1640 x = _mm_mullo_epi32(in[0], cospi60);
1641 u[0] = _mm_add_epi32(x, rnding);
1642 u[0] = _mm_srai_epi32(u[0], bit);
1643
1644 x = _mm_mullo_epi32(in[0], cospi4);
1645 u[1] = _mm_sub_epi32(kZero, x);
1646 u[1] = _mm_add_epi32(u[1], rnding);
1647 u[1] = _mm_srai_epi32(u[1], bit);
1648
1649 // stage 3
1650 // stage 4
1651 __m128i temp1, temp2;
1652 temp1 = _mm_mullo_epi32(u[0], cospi16);
1653 x = _mm_mullo_epi32(u[1], cospi48);
1654 temp1 = _mm_add_epi32(temp1, x);
1655 temp1 = _mm_add_epi32(temp1, rnding);
1656 temp1 = _mm_srai_epi32(temp1, bit);
1657 u[4] = temp1;
1658
1659 temp2 = _mm_mullo_epi32(u[0], cospi48);
1660 x = _mm_mullo_epi32(u[1], cospi16);
1661 u[5] = _mm_sub_epi32(temp2, x);
1662 u[5] = _mm_add_epi32(u[5], rnding);
1663 u[5] = _mm_srai_epi32(u[5], bit);
1664
1665 // stage 5
1666 // stage 6
1667 temp1 = _mm_mullo_epi32(u[0], cospi32);
1668 x = _mm_mullo_epi32(u[1], cospi32);
1669 u[2] = _mm_add_epi32(temp1, x);
1670 u[2] = _mm_add_epi32(u[2], rnding);
1671 u[2] = _mm_srai_epi32(u[2], bit);
1672
1673 u[3] = _mm_sub_epi32(temp1, x);
1674 u[3] = _mm_add_epi32(u[3], rnding);
1675 u[3] = _mm_srai_epi32(u[3], bit);
1676
1677 temp1 = _mm_mullo_epi32(u[4], cospi32);
1678 x = _mm_mullo_epi32(u[5], cospi32);
1679 u[6] = _mm_add_epi32(temp1, x);
1680 u[6] = _mm_add_epi32(u[6], rnding);
1681 u[6] = _mm_srai_epi32(u[6], bit);
1682
1683 u[7] = _mm_sub_epi32(temp1, x);
1684 u[7] = _mm_add_epi32(u[7], rnding);
1685 u[7] = _mm_srai_epi32(u[7], bit);
1686
1687 // stage 7
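// The odd-indexed outputs are negated. On the final column pass (do_cols) the negation is explicit; on the row pass, neg_shift_sse4_1 (defined earlier in this file) folds the negation of its second argument into the rounding shift and clamp.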
1688 if (do_cols) {
1689 out[0] = u[0];
1690 out[1] = _mm_sub_epi32(kZero, u[4]);
1691 out[2] = u[6];
1692 out[3] = _mm_sub_epi32(kZero, u[2]);
1693 out[4] = u[3];
1694 out[5] = _mm_sub_epi32(kZero, u[7]);
1695 out[6] = u[5];
1696 out[7] = _mm_sub_epi32(kZero, u[1]);
1697 } else {
1698 const int log_range_out = AOMMAX(16, bd + 6);
1699 const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
1700 const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
1701
1702 neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
1703 out_shift);
1704 neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
1705 out_shift);
1706 neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
1707 out_shift);
1708 neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
1709 out_shift);
1710 }
1711}
1712
1713static void iadst8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
1714 int bd, int out_shift) {
1715 const int32_t *cospi = cospi_arr(bit);
1716 const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
1717 const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
1718 const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
1719 const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
1720 const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
1721 const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
1722 const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
1723 const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
1724 const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
1725 const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
1726 const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
1727 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
1728 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
1729 const __m128i kZero = _mm_setzero_si128();
1730 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
1731 const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
1732 const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
1733 __m128i u[8], v[8], x;
1734
1735 // stage 0
1736 // stage 1
1737 // stage 2
1738
1739 u[0] = _mm_mullo_epi32(in[7], cospi4);
1740 x = _mm_mullo_epi32(in[0], cospi60);
1741 u[0] = _mm_add_epi32(u[0], x);
1742 u[0] = _mm_add_epi32(u[0], rnding);
1743 u[0] = _mm_srai_epi32(u[0], bit);
1744
1745 u[1] = _mm_mullo_epi32(in[7], cospi60);
1746 x = _mm_mullo_epi32(in[0], cospi4);
1747 u[1] = _mm_sub_epi32(u[1], x);
1748 u[1] = _mm_add_epi32(u[1], rnding);
1749 u[1] = _mm_srai_epi32(u[1], bit);
1750
1751 // (2)
1752 u[2] = _mm_mullo_epi32(in[5], cospi20);
1753 x = _mm_mullo_epi32(in[2], cospi44);
1754 u[2] = _mm_add_epi32(u[2], x);
1755 u[2] = _mm_add_epi32(u[2], rnding);
1756 u[2] = _mm_srai_epi32(u[2], bit);
1757
1758 u[3] = _mm_mullo_epi32(in[5], cospi44);
1759 x = _mm_mullo_epi32(in[2], cospi20);
1760 u[3] = _mm_sub_epi32(u[3], x);
1761 u[3] = _mm_add_epi32(u[3], rnding);
1762 u[3] = _mm_srai_epi32(u[3], bit);
1763
1764 // (3)
1765 u[4] = _mm_mullo_epi32(in[3], cospi36);
1766 x = _mm_mullo_epi32(in[4], cospi28);
1767 u[4] = _mm_add_epi32(u[4], x);
1768 u[4] = _mm_add_epi32(u[4], rnding);
1769 u[4] = _mm_srai_epi32(u[4], bit);
1770
1771 u[5] = _mm_mullo_epi32(in[3], cospi28);
1772 x = _mm_mullo_epi32(in[4], cospi36);
1773 u[5] = _mm_sub_epi32(u[5], x);
1774 u[5] = _mm_add_epi32(u[5], rnding);
1775 u[5] = _mm_srai_epi32(u[5], bit);
1776
1777 // (4)
1778 u[6] = _mm_mullo_epi32(in[1], cospi52);
1779 x = _mm_mullo_epi32(in[6], cospi12);
1780 u[6] = _mm_add_epi32(u[6], x);
1781 u[6] = _mm_add_epi32(u[6], rnding);
1782 u[6] = _mm_srai_epi32(u[6], bit);
1783
1784 u[7] = _mm_mullo_epi32(in[1], cospi12);
1785 x = _mm_mullo_epi32(in[6], cospi52);
1786 u[7] = _mm_sub_epi32(u[7], x);
1787 u[7] = _mm_add_epi32(u[7], rnding);
1788 u[7] = _mm_srai_epi32(u[7], bit);
1789
1790 // stage 3
1791 addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
1792 addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
1793 addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
1794 addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
1795
1796 // stage 4
1797 u[0] = v[0];
1798 u[1] = v[1];
1799 u[2] = v[2];
1800 u[3] = v[3];
1801
1802 u[4] = _mm_mullo_epi32(v[4], cospi16);
1803 x = _mm_mullo_epi32(v[5], cospi48);
1804 u[4] = _mm_add_epi32(u[4], x);
1805 u[4] = _mm_add_epi32(u[4], rnding);
1806 u[4] = _mm_srai_epi32(u[4], bit);
1807
1808 u[5] = _mm_mullo_epi32(v[4], cospi48);
1809 x = _mm_mullo_epi32(v[5], cospi16);
1810 u[5] = _mm_sub_epi32(u[5], x);
1811 u[5] = _mm_add_epi32(u[5], rnding);
1812 u[5] = _mm_srai_epi32(u[5], bit);
1813
1814 u[6] = _mm_mullo_epi32(v[6], cospim48);
1815 x = _mm_mullo_epi32(v[7], cospi16);
1816 u[6] = _mm_add_epi32(u[6], x);
1817 u[6] = _mm_add_epi32(u[6], rnding);
1818 u[6] = _mm_srai_epi32(u[6], bit);
1819
1820 u[7] = _mm_mullo_epi32(v[6], cospi16);
1821 x = _mm_mullo_epi32(v[7], cospim48);
1822 u[7] = _mm_sub_epi32(u[7], x);
1823 u[7] = _mm_add_epi32(u[7], rnding);
1824 u[7] = _mm_srai_epi32(u[7], bit);
1825
1826 // stage 5
1827 addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
1828 addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
1829 addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
1830 addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
1831
1832 // stage 6
1833 u[0] = v[0];
1834 u[1] = v[1];
1835 u[4] = v[4];
1836 u[5] = v[5];
1837
1838 v[0] = _mm_mullo_epi32(v[2], cospi32);
1839 x = _mm_mullo_epi32(v[3], cospi32);
1840 u[2] = _mm_add_epi32(v[0], x);
1841 u[2] = _mm_add_epi32(u[2], rnding);
1842 u[2] = _mm_srai_epi32(u[2], bit);
1843
1844 u[3] = _mm_sub_epi32(v[0], x);
1845 u[3] = _mm_add_epi32(u[3], rnding);
1846 u[3] = _mm_srai_epi32(u[3], bit);
1847
1848 v[0] = _mm_mullo_epi32(v[6], cospi32);
1849 x = _mm_mullo_epi32(v[7], cospi32);
1850 u[6] = _mm_add_epi32(v[0], x);
1851 u[6] = _mm_add_epi32(u[6], rnding);
1852 u[6] = _mm_srai_epi32(u[6], bit);
1853
1854 u[7] = _mm_sub_epi32(v[0], x);
1855 u[7] = _mm_add_epi32(u[7], rnding);
1856 u[7] = _mm_srai_epi32(u[7], bit);
1857
1858 // stage 7
1859 if (do_cols) {
1860 out[0] = u[0];
1861 out[1] = _mm_sub_epi32(kZero, u[4]);
1862 out[2] = u[6];
1863 out[3] = _mm_sub_epi32(kZero, u[2]);
1864 out[4] = u[3];
1865 out[5] = _mm_sub_epi32(kZero, u[7]);
1866 out[6] = u[5];
1867 out[7] = _mm_sub_epi32(kZero, u[1]);
1868 } else {
1869 const int log_range_out = AOMMAX(16, bd + 6);
1870 const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
1871 const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
1872
1873 neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
1874 out_shift);
1875 neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
1876 out_shift);
1877 neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
1878 out_shift);
1879 neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
1880 out_shift);
1881 }
1882}
1883
1884static void idct16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit,
1885 int do_cols, int bd, int out_shift) {
1886 const int32_t *cospi = cospi_arr(bit);
1887 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
1888 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
1889 int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
1890 __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
1891 __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
1892 // stage 0
1893 // stage 1
1894 // stage 2
1895 // stage 3
1896 // stage 4
1897 in[0] = _mm_mullo_epi32(in[0], cospi32);
1898 in[0] = _mm_add_epi32(in[0], rnding);
1899 in[0] = _mm_srai_epi32(in[0], bit);
1900
1901 // stage 5
1902 // stage 6
1903 // stage 7
1904 if (!do_cols) {
1905 log_range = AOMMAX(16, bd + 6);
1906 clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
1907 clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
1908 if (out_shift != 0) {
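// Unlike idct8x8_low1 above, the rounding add/shift pair is skipped entirely when out_shift is zero.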
1909 __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
1910 in[0] = _mm_add_epi32(in[0], offset);
1911 in[0] = _mm_sra_epi32(in[0], _mm_cvtsi32_si128(out_shift));
1912 }
1913 }
1914
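// Clamp the DC value once, then broadcast it to all 16 output vectors.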
1915 in[0] = _mm_max_epi32(in[0], clamp_lo);
1916 in[0] = _mm_min_epi32(in[0], clamp_hi);
1917 out[0] = in[0];
1918 out[1] = in[0];
1919 out[2] = in[0];
1920 out[3] = in[0];
1921 out[4] = in[0];
1922 out[5] = in[0];
1923 out[6] = in[0];
1924 out[7] = in[0];
1925 out[8] = in[0];
1926 out[9] = in[0];
1927 out[10] = in[0];
1928 out[11] = in[0];
1929 out[12] = in[0];
1930 out[13] = in[0];
1931 out[14] = in[0];
1932 out[15] = in[0];
1933}
1934
1935static void idct16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit,
1936 int do_cols, int bd, int out_shift) {
1937 const int32_t *cospi = cospi_arr(bit);
1938 const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
1939 const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
1940 const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
1941 const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
1942 const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
1943 const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
1944 const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
1945 const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
1946 const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
1947 const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
1948 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
1949 const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
1950 const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
1951 const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
1952 const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
1953 const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
1954 const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
1955 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
1956 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
1957 const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
1958 const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
1959 __m128i u[16], x, y;
1960 // stage 0
1961 // stage 1
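// low8 variant: only coefficients in[0..7] can be nonzero; scatter them to their butterfly input positions.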
1962 u[0] = in[0];
1963 u[2] = in[4];
1964 u[4] = in[2];
1965 u[6] = in[6];
1966 u[8] = in[1];
1967 u[10] = in[5];
1968 u[12] = in[3];
1969 u[14] = in[7];
1970
1971 // stage 2
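// half_btf_0_sse4_1 forms one half of a butterfly with a single input, (w * x + rnding) >> bit; its definition appears earlier in this file.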
1972 u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
1973 u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
1974
1975 u[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit);
1976 u[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit);
1977
1978 u[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit);
1979 u[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit);
1980
1981 u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
1982 u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);
1983
1984 // stage 3
1985 u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit);
1986 u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit);
1987 u[5] = half_btf_0_sse4_1(&cospim40, &u[6], &rnding, bit);
1988 u[6] = half_btf_0_sse4_1(&cospi24, &u[6], &rnding, bit);
1989
1990 addsub_sse4_1(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
1991 addsub_sse4_1(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
1992 addsub_sse4_1(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
1993 addsub_sse4_1(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
1994
1995 // stage 4
1996 x = _mm_mullo_epi32(u[0], cospi32);
1997 u[0] = _mm_add_epi32(x, rnding);
1998 u[0] = _mm_srai_epi32(u[0], bit);
1999 u[1] = u[0];
2000
2001 u[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit);
2002 u[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit);
2003
2004 addsub_sse4_1(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi);
2005 addsub_sse4_1(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi);
2006
2007 x = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
2008 u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
2009 u[9] = x;
2010 y = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
2011 u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
2012 u[10] = y;
2013
2014 // stage 5
2015 addsub_sse4_1(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
2016 addsub_sse4_1(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
2017
2018 x = _mm_mullo_epi32(u[5], cospi32);
2019 y = _mm_mullo_epi32(u[6], cospi32);
2020 u[5] = _mm_sub_epi32(y, x);
2021 u[5] = _mm_add_epi32(u[5], rnding);
2022 u[5] = _mm_srai_epi32(u[5], bit);
2023
2024 u[6] = _mm_add_epi32(y, x);
2025 u[6] = _mm_add_epi32(u[6], rnding);
2026 u[6] = _mm_srai_epi32(u[6], bit);
2027
2028 addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
2029 addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
2030 addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
2031 addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
2032
2033 // stage 6
2034 addsub_sse4_1(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi);
2035 addsub_sse4_1(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi);
2036 addsub_sse4_1(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi);
2037 addsub_sse4_1(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi);
2038
2039 x = _mm_mullo_epi32(u[10], cospi32);
2040 y = _mm_mullo_epi32(u[13], cospi32);
2041 u[10] = _mm_sub_epi32(y, x);
2042 u[10] = _mm_add_epi32(u[10], rnding);
2043 u[10] = _mm_srai_epi32(u[10], bit);
2044
2045 u[13] = _mm_add_epi32(x, y);
2046 u[13] = _mm_add_epi32(u[13], rnding);
2047 u[13] = _mm_srai_epi32(u[13], bit);
2048
2049 x = _mm_mullo_epi32(u[11], cospi32);
2050 y = _mm_mullo_epi32(u[12], cospi32);
2051 u[11] = _mm_sub_epi32(y, x);
2052 u[11] = _mm_add_epi32(u[11], rnding);
2053 u[11] = _mm_srai_epi32(u[11], bit);
2054
2055 u[12] = _mm_add_epi32(x, y);
2056 u[12] = _mm_add_epi32(u[12], rnding);
2057 u[12] = _mm_srai_epi32(u[12], bit);
2058 // stage 7
2059 addsub_sse4_1(u[0], u[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
2060 addsub_sse4_1(u[1], u[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
2061 addsub_sse4_1(u[2], u[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
2062 addsub_sse4_1(u[3], u[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
2063 addsub_sse4_1(u[4], u[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
2064 addsub_sse4_1(u[5], u[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
2065 addsub_sse4_1(u[6], u[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
2066 addsub_sse4_1(u[7], u[8], out + 7, out + 8, &clamp_lo, &clamp_hi);
2067
2068 if (!do_cols) {
2069 const int log_range_out = AOMMAX(16, bd + 6);
2070 const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
2071 const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
2072 round_shift_8x8(out, out_shift);
2073 highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 16);
2074 }
2075}
2076
2077static void iadst16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit,
2078 int do_cols, int bd, int out_shift) {
2079 const int32_t *cospi = cospi_arr(bit);
2080 const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
2081 const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
2082 const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
2083 const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
2084 const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
2085 const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
2086 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
2087 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
2088 const __m128i zero = _mm_setzero_si128();
2089 __m128i v[16], x, y, temp1, temp2;
2090 // stage 0
2091 // stage 1
2092 // stage 2
2093 x = _mm_mullo_epi32(in[0], cospi62);
2094 v[0] = _mm_add_epi32(x, rnding);
2095 v[0] = _mm_srai_epi32(v[0], bit);
2096
2097 x = _mm_mullo_epi32(in[0], cospi2);
2098 v[1] = _mm_sub_epi32(zero, x);
2099 v[1] = _mm_add_epi32(v[1], rnding);
2100 v[1] = _mm_srai_epi32(v[1], bit);
2101
2102 // stage 3
2103 v[8] = v[0];
2104 v[9] = v[1];
2105
2106 // stage 4
2107 temp1 = _mm_mullo_epi32(v[8], cospi8);
2108 x = _mm_mullo_epi32(v[9], cospi56);
2109 temp1 = _mm_add_epi32(temp1, x);
2110 temp1 = _mm_add_epi32(temp1, rnding);
2111 temp1 = _mm_srai_epi32(temp1, bit);
2112
2113 temp2 = _mm_mullo_epi32(v[8], cospi56);
2114 x = _mm_mullo_epi32(v[9], cospi8);
2115 temp2 = _mm_sub_epi32(temp2, x);
2116 temp2 = _mm_add_epi32(temp2, rnding);
2117 temp2 = _mm_srai_epi32(temp2, bit);
2118 v[8] = temp1;
2119 v[9] = temp2;
2120
2121 // stage 5
2122 v[4] = v[0];
2123 v[5] = v[1];
2124 v[12] = v[8];
2125 v[13] = v[9];
2126
2127 // stage 6
2128 temp1 = _mm_mullo_epi32(v[4], cospi16);
2129 x = _mm_mullo_epi32(v[5], cospi48);
2130 temp1 = _mm_add_epi32(temp1, x);
2131 temp1 = _mm_add_epi32(temp1, rnding);
2132 temp1 = _mm_srai_epi32(temp1, bit);
2133
2134 temp2 = _mm_mullo_epi32(v[4], cospi48);
2135 x = _mm_mullo_epi32(v[5], cospi16);
2136 temp2 = _mm_sub_epi32(temp2, x);
2137 temp2 = _mm_add_epi32(temp2, rnding);
2138 temp2 = _mm_srai_epi32(temp2, bit);
2139 v[4] = temp1;
2140 v[5] = temp2;
2141
2142 temp1 = _mm_mullo_epi32(v[12], cospi16);
2143 x = _mm_mullo_epi32(v[13], cospi48);
2144 temp1 = _mm_add_epi32(temp1, x);
2145 temp1 = _mm_add_epi32(temp1, rnding);
2146 temp1 = _mm_srai_epi32(temp1, bit);
2147
2148 temp2 = _mm_mullo_epi32(v[12], cospi48);
2149 x = _mm_mullo_epi32(v[13], cospi16);
2150 temp2 = _mm_sub_epi32(temp2, x);
2151 temp2 = _mm_add_epi32(temp2, rnding);
2152 temp2 = _mm_srai_epi32(temp2, bit);
2153 v[12] = temp1;
2154 v[13] = temp2;
2155
2156 // stage 7
2157 v[2] = v[0];
2158 v[3] = v[1];
2159 v[6] = v[4];
2160 v[7] = v[5];
2161 v[10] = v[8];
2162 v[11] = v[9];
2163 v[14] = v[12];
2164 v[15] = v[13];
2165
2166 // stage 8
2167 y = _mm_mullo_epi32(v[2], cospi32);
2168 x = _mm_mullo_epi32(v[3], cospi32);
2169 v[2] = _mm_add_epi32(y, x);
2170 v[2] = _mm_add_epi32(v[2], rnding);
2171 v[2] = _mm_srai_epi32(v[2], bit);
2172
2173 v[3] = _mm_sub_epi32(y, x);
2174 v[3] = _mm_add_epi32(v[3], rnding);
2175 v[3] = _mm_srai_epi32(v[3], bit);
2176
2177 y = _mm_mullo_epi32(v[6], cospi32);
2178 x = _mm_mullo_epi32(v[7], cospi32);
2179 v[6] = _mm_add_epi32(y, x);
2180 v[6] = _mm_add_epi32(v[6], rnding);
2181 v[6] = _mm_srai_epi32(v[6], bit);
2182
2183 v[7] = _mm_sub_epi32(y, x);
2184 v[7] = _mm_add_epi32(v[7], rnding);
2185 v[7] = _mm_srai_epi32(v[7], bit);
2186
2187 y = _mm_mullo_epi32(v[10], cospi32);
2188 x = _mm_mullo_epi32(v[11], cospi32);
2189 v[10] = _mm_add_epi32(y, x);
2190 v[10] = _mm_add_epi32(v[10], rnding);
2191 v[10] = _mm_srai_epi32(v[10], bit);
2192
2193 v[11] = _mm_sub_epi32(y, x);
2194 v[11] = _mm_add_epi32(v[11], rnding);
2195 v[11] = _mm_srai_epi32(v[11], bit);
2196
2197 y = _mm_mullo_epi32(v[14], cospi32);
2198 x = _mm_mullo_epi32(v[15], cospi32);
2199 v[14] = _mm_add_epi32(y, x);
2200 v[14] = _mm_add_epi32(v[14], rnding);
2201 v[14] = _mm_srai_epi32(v[14], bit);
2202
2203 v[15] = _mm_sub_epi32(y, x);
2204 v[15] = _mm_add_epi32(v[15], rnding);
2205 v[15] = _mm_srai_epi32(v[15], bit);
2206
2207 // stage 9
2208 if (do_cols) {
2209 out[0] = v[0];
2210 out[1] = _mm_sub_epi32(zero, v[8]);
2211 out[2] = v[12];
2212 out[3] = _mm_sub_epi32(zero, v[4]);
2213 out[4] = v[6];
2214 out[5] = _mm_sub_epi32(zero, v[14]);
2215 out[6] = v[10];
2216 out[7] = _mm_sub_epi32(zero, v[2]);
2217 out[8] = v[3];
2218 out[9] = _mm_sub_epi32(zero, v[11]);
2219 out[10] = v[15];
2220 out[11] = _mm_sub_epi32(zero, v[7]);
2221 out[12] = v[5];
2222 out[13] = _mm_sub_epi32(zero, v[13]);
2223 out[14] = v[9];
2224 out[15] = _mm_sub_epi32(zero, v[1]);
2225 } else {
2226 const int log_range_out = AOMMAX(16, bd + 6);
2227 const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
2228 const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
2229
2230 neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
2231 out_shift);
2232 neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
2233 &clamp_hi_out, out_shift);
2234 neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
2235 &clamp_hi_out, out_shift);
2236 neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
2237 &clamp_hi_out, out_shift);
2238 neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
2239 &clamp_hi_out, out_shift);
2240 neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
2241 &clamp_hi_out, out_shift);
2242 neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
2243 &clamp_hi_out, out_shift);
2244 neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
2245 &clamp_hi_out, out_shift);
2246 }
2247}
2248
2249static void iadst16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit,
2250 int do_cols, int bd, int out_shift) {
2251 const int32_t *cospi = cospi_arr(bit);
2252 const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
2253 const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
2254 const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
2255 const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
2256 const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
2257 const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
2258 const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
2259 const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
2260 const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
2261 const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
2262 const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
2263 const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
2264 const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
2265 const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
2266 const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
2267 const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
2268 const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
2269 const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
2270 const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
2271 const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
2272 const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
2273 const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
2274 const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
2275 const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
2276 const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
2277 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
2278 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
2279 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2280 const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
2281 const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
2282 __m128i zero = _mm_setzero_si128();
2283 __m128i u[16], x, y;
2284
2285 // stage 0
2286 // stage 1
2287 // stage 2
2288 x = _mm_mullo_epi32(in[0], cospi62);
2289 u[0] = _mm_add_epi32(x, rnding);
2290 u[0] = _mm_srai_epi32(u[0], bit);
2291
2292 x = _mm_mullo_epi32(in[0], cospi2);
2293 u[1] = _mm_sub_epi32(zero, x);
2294 u[1] = _mm_add_epi32(u[1], rnding);
2295 u[1] = _mm_srai_epi32(u[1], bit);
2296
2297 x = _mm_mullo_epi32(in[2], cospi54);
2298 u[2] = _mm_add_epi32(x, rnding);
2299 u[2] = _mm_srai_epi32(u[2], bit);
2300
2301 x = _mm_mullo_epi32(in[2], cospi10);
2302 u[3] = _mm_sub_epi32(zero, x);
2303 u[3] = _mm_add_epi32(u[3], rnding);
2304 u[3] = _mm_srai_epi32(u[3], bit);
2305
2306 x = _mm_mullo_epi32(in[4], cospi46);
2307 u[4] = _mm_add_epi32(x, rnding);
2308 u[4] = _mm_srai_epi32(u[4], bit);
2309
2310 x = _mm_mullo_epi32(in[4], cospi18);
2311 u[5] = _mm_sub_epi32(zero, x);
2312 u[5] = _mm_add_epi32(u[5], rnding);
2313 u[5] = _mm_srai_epi32(u[5], bit);
2314
2315 x = _mm_mullo_epi32(in[6], cospi38);
2316 u[6] = _mm_add_epi32(x, rnding);
2317 u[6] = _mm_srai_epi32(u[6], bit);
2318
2319 x = _mm_mullo_epi32(in[6], cospi26);
2320 u[7] = _mm_sub_epi32(zero, x);
2321 u[7] = _mm_add_epi32(u[7], rnding);
2322 u[7] = _mm_srai_epi32(u[7], bit);
2323
2324 u[8] = _mm_mullo_epi32(in[7], cospi34);
2325 u[8] = _mm_add_epi32(u[8], rnding);
2326 u[8] = _mm_srai_epi32(u[8], bit);
2327
2328 u[9] = _mm_mullo_epi32(in[7], cospi30);
2329 u[9] = _mm_add_epi32(u[9], rnding);
2330 u[9] = _mm_srai_epi32(u[9], bit);
2331
2332 u[10] = _mm_mullo_epi32(in[5], cospi42);
2333 u[10] = _mm_add_epi32(u[10], rnding);
2334 u[10] = _mm_srai_epi32(u[10], bit);
2335
2336 u[11] = _mm_mullo_epi32(in[5], cospi22);
2337 u[11] = _mm_add_epi32(u[11], rnding);
2338 u[11] = _mm_srai_epi32(u[11], bit);
2339
2340 u[12] = _mm_mullo_epi32(in[3], cospi50);
2341 u[12] = _mm_add_epi32(u[12], rnding);
2342 u[12] = _mm_srai_epi32(u[12], bit);
2343
2344 u[13] = _mm_mullo_epi32(in[3], cospi14);
2345 u[13] = _mm_add_epi32(u[13], rnding);
2346 u[13] = _mm_srai_epi32(u[13], bit);
2347
2348 u[14] = _mm_mullo_epi32(in[1], cospi58);
2349 u[14] = _mm_add_epi32(u[14], rnding);
2350 u[14] = _mm_srai_epi32(u[14], bit);
2351
2352 u[15] = _mm_mullo_epi32(in[1], cospi6);
2353 u[15] = _mm_add_epi32(u[15], rnding);
2354 u[15] = _mm_srai_epi32(u[15], bit);
2355
2356 // stage 3
2357 addsub_sse4_1(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
2358 addsub_sse4_1(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
2359 addsub_sse4_1(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
2360 addsub_sse4_1(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
2361 addsub_sse4_1(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
2362 addsub_sse4_1(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
2363 addsub_sse4_1(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
2364 addsub_sse4_1(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
2365
2366 // stage 4
2367 y = _mm_mullo_epi32(u[8], cospi56);
2368 x = _mm_mullo_epi32(u[9], cospi56);
2369 u[8] = _mm_mullo_epi32(u[8], cospi8);
2370 u[8] = _mm_add_epi32(u[8], x);
2371 u[8] = _mm_add_epi32(u[8], rnding);
2372 u[8] = _mm_srai_epi32(u[8], bit);
2373
2374 x = _mm_mullo_epi32(u[9], cospi8);
2375 u[9] = _mm_sub_epi32(y, x);
2376 u[9] = _mm_add_epi32(u[9], rnding);
2377 u[9] = _mm_srai_epi32(u[9], bit);
2378
2379 x = _mm_mullo_epi32(u[11], cospi24);
2380 y = _mm_mullo_epi32(u[10], cospi24);
2381 u[10] = _mm_mullo_epi32(u[10], cospi40);
2382 u[10] = _mm_add_epi32(u[10], x);
2383 u[10] = _mm_add_epi32(u[10], rnding);
2384 u[10] = _mm_srai_epi32(u[10], bit);
2385
2386 x = _mm_mullo_epi32(u[11], cospi40);
2387 u[11] = _mm_sub_epi32(y, x);
2388 u[11] = _mm_add_epi32(u[11], rnding);
2389 u[11] = _mm_srai_epi32(u[11], bit);
2390
2391 x = _mm_mullo_epi32(u[13], cospi8);
2392 y = _mm_mullo_epi32(u[12], cospi8);
2393 u[12] = _mm_mullo_epi32(u[12], cospim56);
2394 u[12] = _mm_add_epi32(u[12], x);
2395 u[12] = _mm_add_epi32(u[12], rnding);
2396 u[12] = _mm_srai_epi32(u[12], bit);
2397
2398 x = _mm_mullo_epi32(u[13], cospim56);
2399 u[13] = _mm_sub_epi32(y, x);
2400 u[13] = _mm_add_epi32(u[13], rnding);
2401 u[13] = _mm_srai_epi32(u[13], bit);
2402
2403 x = _mm_mullo_epi32(u[15], cospi40);
2404 y = _mm_mullo_epi32(u[14], cospi40);
2405 u[14] = _mm_mullo_epi32(u[14], cospim24);
2406 u[14] = _mm_add_epi32(u[14], x);
2407 u[14] = _mm_add_epi32(u[14], rnding);
2408 u[14] = _mm_srai_epi32(u[14], bit);
2409
2410 x = _mm_mullo_epi32(u[15], cospim24);
2411 u[15] = _mm_sub_epi32(y, x);
2412 u[15] = _mm_add_epi32(u[15], rnding);
2413 u[15] = _mm_srai_epi32(u[15], bit);
2414
2415 // stage 5
2416 addsub_sse4_1(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
2417 addsub_sse4_1(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
2418 addsub_sse4_1(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
2419 addsub_sse4_1(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
2420 addsub_sse4_1(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
2421 addsub_sse4_1(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
2422 addsub_sse4_1(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
2423 addsub_sse4_1(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
2424
2425 // stage 6
2426 x = _mm_mullo_epi32(u[5], cospi48);
2427 y = _mm_mullo_epi32(u[4], cospi48);
2428 u[4] = _mm_mullo_epi32(u[4], cospi16);
2429 u[4] = _mm_add_epi32(u[4], x);
2430 u[4] = _mm_add_epi32(u[4], rnding);
2431 u[4] = _mm_srai_epi32(u[4], bit);
2432
2433 x = _mm_mullo_epi32(u[5], cospi16);
2434 u[5] = _mm_sub_epi32(y, x);
2435 u[5] = _mm_add_epi32(u[5], rnding);
2436 u[5] = _mm_srai_epi32(u[5], bit);
2437
2438 x = _mm_mullo_epi32(u[7], cospi16);
2439 y = _mm_mullo_epi32(u[6], cospi16);
2440 u[6] = _mm_mullo_epi32(u[6], cospim48);
2441 u[6] = _mm_add_epi32(u[6], x);
2442 u[6] = _mm_add_epi32(u[6], rnding);
2443 u[6] = _mm_srai_epi32(u[6], bit);
2444
2445 x = _mm_mullo_epi32(u[7], cospim48);
2446 u[7] = _mm_sub_epi32(y, x);
2447 u[7] = _mm_add_epi32(u[7], rnding);
2448 u[7] = _mm_srai_epi32(u[7], bit);
2449
2450 x = _mm_mullo_epi32(u[13], cospi48);
2451 y = _mm_mullo_epi32(u[12], cospi48);
2452 u[12] = _mm_mullo_epi32(u[12], cospi16);
2453 u[12] = _mm_add_epi32(u[12], x);
2454 u[12] = _mm_add_epi32(u[12], rnding);
2455 u[12] = _mm_srai_epi32(u[12], bit);
2456
2457 x = _mm_mullo_epi32(u[13], cospi16);
2458 u[13] = _mm_sub_epi32(y, x);
2459 u[13] = _mm_add_epi32(u[13], rnding);
2460 u[13] = _mm_srai_epi32(u[13], bit);
2461
2462 x = _mm_mullo_epi32(u[15], cospi16);
2463 y = _mm_mullo_epi32(u[14], cospi16);
2464 u[14] = _mm_mullo_epi32(u[14], cospim48);
2465 u[14] = _mm_add_epi32(u[14], x);
2466 u[14] = _mm_add_epi32(u[14], rnding);
2467 u[14] = _mm_srai_epi32(u[14], bit);
2468
2469 x = _mm_mullo_epi32(u[15], cospim48);
2470 u[15] = _mm_sub_epi32(y, x);
2471 u[15] = _mm_add_epi32(u[15], rnding);
2472 u[15] = _mm_srai_epi32(u[15], bit);
2473
2474 // stage 7
2475 addsub_sse4_1(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
2476 addsub_sse4_1(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
2477 addsub_sse4_1(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
2478 addsub_sse4_1(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
2479 addsub_sse4_1(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
2480 addsub_sse4_1(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
2481 addsub_sse4_1(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
2482 addsub_sse4_1(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
2483
2484 // stage 8
2485 y = _mm_mullo_epi32(u[2], cospi32);
2486 x = _mm_mullo_epi32(u[3], cospi32);
2487 u[2] = _mm_add_epi32(y, x);
2488 u[2] = _mm_add_epi32(u[2], rnding);
2489 u[2] = _mm_srai_epi32(u[2], bit);
2490
2491 u[3] = _mm_sub_epi32(y, x);
2492 u[3] = _mm_add_epi32(u[3], rnding);
2493 u[3] = _mm_srai_epi32(u[3], bit);
2494 y = _mm_mullo_epi32(u[6], cospi32);
2495 x = _mm_mullo_epi32(u[7], cospi32);
2496 u[6] = _mm_add_epi32(y, x);
2497 u[6] = _mm_add_epi32(u[6], rnding);
2498 u[6] = _mm_srai_epi32(u[6], bit);
2499
2500 u[7] = _mm_sub_epi32(y, x);
2501 u[7] = _mm_add_epi32(u[7], rnding);
2502 u[7] = _mm_srai_epi32(u[7], bit);
2503
2504 y = _mm_mullo_epi32(u[10], cospi32);
2505 x = _mm_mullo_epi32(u[11], cospi32);
2506 u[10] = _mm_add_epi32(y, x);
2507 u[10] = _mm_add_epi32(u[10], rnding);
2508 u[10] = _mm_srai_epi32(u[10], bit);
2509
2510 u[11] = _mm_sub_epi32(y, x);
2511 u[11] = _mm_add_epi32(u[11], rnding);
2512 u[11] = _mm_srai_epi32(u[11], bit);
2513
2514 y = _mm_mullo_epi32(u[14], cospi32);
2515 x = _mm_mullo_epi32(u[15], cospi32);
2516 u[14] = _mm_add_epi32(y, x);
2517 u[14] = _mm_add_epi32(u[14], rnding);
2518 u[14] = _mm_srai_epi32(u[14], bit);
2519
2520 u[15] = _mm_sub_epi32(y, x);
2521 u[15] = _mm_add_epi32(u[15], rnding);
2522 u[15] = _mm_srai_epi32(u[15], bit);
2523
2524 // stage 9
2525 if (do_cols) {
2526 out[0] = u[0];
2527 out[1] = _mm_sub_epi32(zero, u[8]);
2528 out[2] = u[12];
2529 out[3] = _mm_sub_epi32(zero, u[4]);
2530 out[4] = u[6];
2531 out[5] = _mm_sub_epi32(zero, u[14]);
2532 out[6] = u[10];
2533 out[7] = _mm_sub_epi32(zero, u[2]);
2534 out[8] = u[3];
2535 out[9] = _mm_sub_epi32(zero, u[11]);
2536 out[10] = u[15];
2537 out[11] = _mm_sub_epi32(zero, u[7]);
2538 out[12] = u[5];
2539 out[13] = _mm_sub_epi32(zero, u[13]);
2540 out[14] = u[9];
2541 out[15] = _mm_sub_epi32(zero, u[1]);
2542 } else {
2543 const int log_range_out = AOMMAX(16, bd + 6);
2544 const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
2545 const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
2546
2547 neg_shift_sse4_1(u[0], u[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
2548 out_shift);
2549 neg_shift_sse4_1(u[12], u[4], out + 2, out + 3, &clamp_lo_out,
2550 &clamp_hi_out, out_shift);
2551 neg_shift_sse4_1(u[6], u[14], out + 4, out + 5, &clamp_lo_out,
2552 &clamp_hi_out, out_shift);
2553 neg_shift_sse4_1(u[10], u[2], out + 6, out + 7, &clamp_lo_out,
2554 &clamp_hi_out, out_shift);
2555 neg_shift_sse4_1(u[3], u[11], out + 8, out + 9, &clamp_lo_out,
2556 &clamp_hi_out, out_shift);
2557 neg_shift_sse4_1(u[15], u[7], out + 10, out + 11, &clamp_lo_out,
2558 &clamp_hi_out, out_shift);
2559 neg_shift_sse4_1(u[5], u[13], out + 12, out + 13, &clamp_lo_out,
2560 &clamp_hi_out, out_shift);
2561 neg_shift_sse4_1(u[9], u[1], out + 14, out + 15, &clamp_lo_out,
2562 &clamp_hi_out, out_shift);
2563 }
2564}
2565
2566static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
2567 int bd, int out_shift) {
2568 const int32_t *cospi = cospi_arr(bit);
2569 const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
2570 const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
2571 const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
2572 const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
2573 const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
2574 const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
2575 const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
2576 const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
2577 const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
2578 const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
2579 const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
2580 const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
2581 const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
2582 const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
2583 const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
2584 const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
2585 const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
2586 const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
2587 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
2588 const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
2589 const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
2590 const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
2591 const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
2592 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
2593 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2594 const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
2595 const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
2596 __m128i u[16], v[16], x, y;
2597
2598 {
2599 // stage 0
2600 // stage 1
2601 u[0] = in[0];
2602 u[1] = in[8];
2603 u[2] = in[4];
2604 u[3] = in[12];
2605 u[4] = in[2];
2606 u[5] = in[10];
2607 u[6] = in[6];
2608 u[7] = in[14];
2609 u[8] = in[1];
2610 u[9] = in[9];
2611 u[10] = in[5];
2612 u[11] = in[13];
2613 u[12] = in[3];
2614 u[13] = in[11];
2615 u[14] = in[7];
2616 u[15] = in[15];
2617
2618 // stage 2
2619 v[0] = u[0];
2620 v[1] = u[1];
2621 v[2] = u[2];
2622 v[3] = u[3];
2623 v[4] = u[4];
2624 v[5] = u[5];
2625 v[6] = u[6];
2626 v[7] = u[7];
2627
2628 v[8] = half_btf_sse4_1(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit);
2629 v[9] = half_btf_sse4_1(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit);
2630 v[10] = half_btf_sse4_1(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit);
2631 v[11] = half_btf_sse4_1(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit);
2632 v[12] = half_btf_sse4_1(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit);
2633 v[13] = half_btf_sse4_1(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit);
2634 v[14] = half_btf_sse4_1(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit);
2635 v[15] = half_btf_sse4_1(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit);
2636
2637 // stage 3
2638 u[0] = v[0];
2639 u[1] = v[1];
2640 u[2] = v[2];
2641 u[3] = v[3];
2642 u[4] = half_btf_sse4_1(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit);
2643 u[5] = half_btf_sse4_1(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit);
2644 u[6] = half_btf_sse4_1(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit);
2645 u[7] = half_btf_sse4_1(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit);
2646 addsub_sse4_1(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
2647 addsub_sse4_1(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
2648 addsub_sse4_1(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
2649 addsub_sse4_1(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
2650
2651 // stage 4
2652 x = _mm_mullo_epi32(u[0], cospi32);
2653 y = _mm_mullo_epi32(u[1], cospi32);
2654 v[0] = _mm_add_epi32(x, y);
2655 v[0] = _mm_add_epi32(v[0], rnding);
2656 v[0] = _mm_srai_epi32(v[0], bit);
2657
2658 v[1] = _mm_sub_epi32(x, y);
2659 v[1] = _mm_add_epi32(v[1], rnding);
2660 v[1] = _mm_srai_epi32(v[1], bit);
2661
2662 v[2] = half_btf_sse4_1(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit);
2663 v[3] = half_btf_sse4_1(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit);
2664 addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
2665 addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
2666 v[8] = u[8];
2667 v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
2668 v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
2669 v[11] = u[11];
2670 v[12] = u[12];
2671 v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
2672 v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
2673 v[15] = u[15];
2674
2675 // stage 5
2676 addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
2677 addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
2678 u[4] = v[4];
2679
2680 x = _mm_mullo_epi32(v[5], cospi32);
2681 y = _mm_mullo_epi32(v[6], cospi32);
2682 u[5] = _mm_sub_epi32(y, x);
2683 u[5] = _mm_add_epi32(u[5], rnding);
2684 u[5] = _mm_srai_epi32(u[5], bit);
2685
2686 u[6] = _mm_add_epi32(y, x);
2687 u[6] = _mm_add_epi32(u[6], rnding);
2688 u[6] = _mm_srai_epi32(u[6], bit);
2689
2690 u[7] = v[7];
2691 addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
2692 addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
2693 addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
2694 addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
2695
2696 // stage 6
2697 addsub_sse4_1(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi);
2698 addsub_sse4_1(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi);
2699 addsub_sse4_1(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi);
2700 addsub_sse4_1(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi);
2701 v[8] = u[8];
2702 v[9] = u[9];
2703
2704 x = _mm_mullo_epi32(u[10], cospi32);
2705 y = _mm_mullo_epi32(u[13], cospi32);
2706 v[10] = _mm_sub_epi32(y, x);
2707 v[10] = _mm_add_epi32(v[10], rnding);
2708 v[10] = _mm_srai_epi32(v[10], bit);
2709
2710 v[13] = _mm_add_epi32(x, y);
2711 v[13] = _mm_add_epi32(v[13], rnding);
2712 v[13] = _mm_srai_epi32(v[13], bit);
2713
2714 x = _mm_mullo_epi32(u[11], cospi32);
2715 y = _mm_mullo_epi32(u[12], cospi32);
2716 v[11] = _mm_sub_epi32(y, x);
2717 v[11] = _mm_add_epi32(v[11], rnding);
2718 v[11] = _mm_srai_epi32(v[11], bit);
2719
2720 v[12] = _mm_add_epi32(x, y);
2721 v[12] = _mm_add_epi32(v[12], rnding);
2722 v[12] = _mm_srai_epi32(v[12], bit);
2723
2724 v[14] = u[14];
2725 v[15] = u[15];
2726
2727 // stage 7
2728 addsub_sse4_1(v[0], v[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
2729 addsub_sse4_1(v[1], v[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
2730 addsub_sse4_1(v[2], v[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
2731 addsub_sse4_1(v[3], v[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
2732 addsub_sse4_1(v[4], v[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
2733 addsub_sse4_1(v[5], v[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
2734 addsub_sse4_1(v[6], v[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
2735 addsub_sse4_1(v[7], v[8], out + 7, out + 8, &clamp_lo, &clamp_hi);
2736
2737 if (!do_cols) {
2738 const int log_range_out = AOMMAX(16, bd + 6);
2739 const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
2740 const __m128i clamp_hi_out =
2741 _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
2742 round_shift_8x8(out, out_shift);
2743 highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 16);
2744 }
2745 }
2746}
2747
2748static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
2749 int bd, int out_shift) {
2750 const int32_t *cospi = cospi_arr(bit);
2751 const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
2752 const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
2753 const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
2754 const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
2755 const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
2756 const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
2757 const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
2758 const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
2759 const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
2760 const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
2761 const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
2762 const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
2763 const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
2764 const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
2765 const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
2766 const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
2767 const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
2768 const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
2769 const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
2770 const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
2771 const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
2772 const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
2773 const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
2774 const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
2775 const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
2776 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
2777 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
2778 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2779 const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
2780 const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
2781 const __m128i zero = _mm_setzero_si128();
2782 __m128i u[16], v[16], x, y;
2783 // Calculate columns 0, 1, 2 and 3.
2784 // stage 0
2785 // stage 1
2786 // stage 2
2787 v[0] = _mm_mullo_epi32(in[15], cospi2);
2788 x = _mm_mullo_epi32(in[0], cospi62);
2789 v[0] = _mm_add_epi32(v[0], x);
2790 v[0] = _mm_add_epi32(v[0], rnding);
2791 v[0] = _mm_srai_epi32(v[0], bit);
2792
2793 v[1] = _mm_mullo_epi32(in[15], cospi62);
2794 x = _mm_mullo_epi32(in[0], cospi2);
2795 v[1] = _mm_sub_epi32(v[1], x);
2796 v[1] = _mm_add_epi32(v[1], rnding);
2797 v[1] = _mm_srai_epi32(v[1], bit);
2798
2799 v[2] = _mm_mullo_epi32(in[13], cospi10);
2800 x = _mm_mullo_epi32(in[2], cospi54);
2801 v[2] = _mm_add_epi32(v[2], x);
2802 v[2] = _mm_add_epi32(v[2], rnding);
2803 v[2] = _mm_srai_epi32(v[2], bit);
2804
2805 v[3] = _mm_mullo_epi32(in[13], cospi54);
2806 x = _mm_mullo_epi32(in[2], cospi10);
2807 v[3] = _mm_sub_epi32(v[3], x);
2808 v[3] = _mm_add_epi32(v[3], rnding);
2809 v[3] = _mm_srai_epi32(v[3], bit);
2810
2811 v[4] = _mm_mullo_epi32(in[11], cospi18);
2812 x = _mm_mullo_epi32(in[4], cospi46);
2813 v[4] = _mm_add_epi32(v[4], x);
2814 v[4] = _mm_add_epi32(v[4], rnding);
2815 v[4] = _mm_srai_epi32(v[4], bit);
2816
2817 v[5] = _mm_mullo_epi32(in[11], cospi46);
2818 x = _mm_mullo_epi32(in[4], cospi18);
2819 v[5] = _mm_sub_epi32(v[5], x);
2820 v[5] = _mm_add_epi32(v[5], rnding);
2821 v[5] = _mm_srai_epi32(v[5], bit);
2822
2823 v[6] = _mm_mullo_epi32(in[9], cospi26);
2824 x = _mm_mullo_epi32(in[6], cospi38);
2825 v[6] = _mm_add_epi32(v[6], x);
2826 v[6] = _mm_add_epi32(v[6], rnding);
2827 v[6] = _mm_srai_epi32(v[6], bit);
2828
2829 v[7] = _mm_mullo_epi32(in[9], cospi38);
2830 x = _mm_mullo_epi32(in[6], cospi26);
2831 v[7] = _mm_sub_epi32(v[7], x);
2832 v[7] = _mm_add_epi32(v[7], rnding);
2833 v[7] = _mm_srai_epi32(v[7], bit);
2834
2835 v[8] = _mm_mullo_epi32(in[7], cospi34);
2836 x = _mm_mullo_epi32(in[8], cospi30);
2837 v[8] = _mm_add_epi32(v[8], x);
2838 v[8] = _mm_add_epi32(v[8], rnding);
2839 v[8] = _mm_srai_epi32(v[8], bit);
2840
2841 v[9] = _mm_mullo_epi32(in[7], cospi30);
2842 x = _mm_mullo_epi32(in[8], cospi34);
2843 v[9] = _mm_sub_epi32(v[9], x);
2844 v[9] = _mm_add_epi32(v[9], rnding);
2845 v[9] = _mm_srai_epi32(v[9], bit);
2846
2847 v[10] = _mm_mullo_epi32(in[5], cospi42);
2848 x = _mm_mullo_epi32(in[10], cospi22);
2849 v[10] = _mm_add_epi32(v[10], x);
2850 v[10] = _mm_add_epi32(v[10], rnding);
2851 v[10] = _mm_srai_epi32(v[10], bit);
2852
2853 v[11] = _mm_mullo_epi32(in[5], cospi22);
2854 x = _mm_mullo_epi32(in[10], cospi42);
2855 v[11] = _mm_sub_epi32(v[11], x);
2856 v[11] = _mm_add_epi32(v[11], rnding);
2857 v[11] = _mm_srai_epi32(v[11], bit);
2858
2859 v[12] = _mm_mullo_epi32(in[3], cospi50);
2860 x = _mm_mullo_epi32(in[12], cospi14);
2861 v[12] = _mm_add_epi32(v[12], x);
2862 v[12] = _mm_add_epi32(v[12], rnding);
2863 v[12] = _mm_srai_epi32(v[12], bit);
2864
2865 v[13] = _mm_mullo_epi32(in[3], cospi14);
2866 x = _mm_mullo_epi32(in[12], cospi50);
2867 v[13] = _mm_sub_epi32(v[13], x);
2868 v[13] = _mm_add_epi32(v[13], rnding);
2869 v[13] = _mm_srai_epi32(v[13], bit);
2870
2871 v[14] = _mm_mullo_epi32(in[1], cospi58);
2872 x = _mm_mullo_epi32(in[14], cospi6);
2873 v[14] = _mm_add_epi32(v[14], x);
2874 v[14] = _mm_add_epi32(v[14], rnding);
2875 v[14] = _mm_srai_epi32(v[14], bit);
2876
2877 v[15] = _mm_mullo_epi32(in[1], cospi6);
2878 x = _mm_mullo_epi32(in[14], cospi58);
2879 v[15] = _mm_sub_epi32(v[15], x);
2880 v[15] = _mm_add_epi32(v[15], rnding);
2881 v[15] = _mm_srai_epi32(v[15], bit);
2882
2883 // stage 3
2884 addsub_sse4_1(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
2885 addsub_sse4_1(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
2886 addsub_sse4_1(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
2887 addsub_sse4_1(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
2888 addsub_sse4_1(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
2889 addsub_sse4_1(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
2890 addsub_sse4_1(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
2891 addsub_sse4_1(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
2892
2893 // stage 4
2894 v[0] = u[0];
2895 v[1] = u[1];
2896 v[2] = u[2];
2897 v[3] = u[3];
2898 v[4] = u[4];
2899 v[5] = u[5];
2900 v[6] = u[6];
2901 v[7] = u[7];
2902
2903 v[8] = _mm_mullo_epi32(u[8], cospi8);
2904 x = _mm_mullo_epi32(u[9], cospi56);
2905 v[8] = _mm_add_epi32(v[8], x);
2906 v[8] = _mm_add_epi32(v[8], rnding);
2907 v[8] = _mm_srai_epi32(v[8], bit);
2908
2909 v[9] = _mm_mullo_epi32(u[8], cospi56);
2910 x = _mm_mullo_epi32(u[9], cospi8);
2911 v[9] = _mm_sub_epi32(v[9], x);
2912 v[9] = _mm_add_epi32(v[9], rnding);
2913 v[9] = _mm_srai_epi32(v[9], bit);
2914
2915 v[10] = _mm_mullo_epi32(u[10], cospi40);
2916 x = _mm_mullo_epi32(u[11], cospi24);
2917 v[10] = _mm_add_epi32(v[10], x);
2918 v[10] = _mm_add_epi32(v[10], rnding);
2919 v[10] = _mm_srai_epi32(v[10], bit);
2920
2921 v[11] = _mm_mullo_epi32(u[10], cospi24);
2922 x = _mm_mullo_epi32(u[11], cospi40);
2923 v[11] = _mm_sub_epi32(v[11], x);
2924 v[11] = _mm_add_epi32(v[11], rnding);
2925 v[11] = _mm_srai_epi32(v[11], bit);
2926
2927 v[12] = _mm_mullo_epi32(u[12], cospim56);
2928 x = _mm_mullo_epi32(u[13], cospi8);
2929 v[12] = _mm_add_epi32(v[12], x);
2930 v[12] = _mm_add_epi32(v[12], rnding);
2931 v[12] = _mm_srai_epi32(v[12], bit);
2932
2933 v[13] = _mm_mullo_epi32(u[12], cospi8);
2934 x = _mm_mullo_epi32(u[13], cospim56);
2935 v[13] = _mm_sub_epi32(v[13], x);
2936 v[13] = _mm_add_epi32(v[13], rnding);
2937 v[13] = _mm_srai_epi32(v[13], bit);
2938
2939 v[14] = _mm_mullo_epi32(u[14], cospim24);
2940 x = _mm_mullo_epi32(u[15], cospi40);
2941 v[14] = _mm_add_epi32(v[14], x);
2942 v[14] = _mm_add_epi32(v[14], rnding);
2943 v[14] = _mm_srai_epi32(v[14], bit);
2944
2945 v[15] = _mm_mullo_epi32(u[14], cospi40);
2946 x = _mm_mullo_epi32(u[15], cospim24);
2947 v[15] = _mm_sub_epi32(v[15], x);
2948 v[15] = _mm_add_epi32(v[15], rnding);
2949 v[15] = _mm_srai_epi32(v[15], bit);
2950
2951 // stage 5
2952 addsub_sse4_1(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
2953 addsub_sse4_1(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
2954 addsub_sse4_1(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
2955 addsub_sse4_1(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
2956 addsub_sse4_1(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
2957 addsub_sse4_1(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
2958 addsub_sse4_1(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
2959 addsub_sse4_1(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
2960
2961 // stage 6
2962 v[0] = u[0];
2963 v[1] = u[1];
2964 v[2] = u[2];
2965 v[3] = u[3];
2966
2967 v[4] = _mm_mullo_epi32(u[4], cospi16);
2968 x = _mm_mullo_epi32(u[5], cospi48);
2969 v[4] = _mm_add_epi32(v[4], x);
2970 v[4] = _mm_add_epi32(v[4], rnding);
2971 v[4] = _mm_srai_epi32(v[4], bit);
2972
2973 v[5] = _mm_mullo_epi32(u[4], cospi48);
2974 x = _mm_mullo_epi32(u[5], cospi16);
2975 v[5] = _mm_sub_epi32(v[5], x);
2976 v[5] = _mm_add_epi32(v[5], rnding);
2977 v[5] = _mm_srai_epi32(v[5], bit);
2978
2979 v[6] = _mm_mullo_epi32(u[6], cospim48);
2980 x = _mm_mullo_epi32(u[7], cospi16);
2981 v[6] = _mm_add_epi32(v[6], x);
2982 v[6] = _mm_add_epi32(v[6], rnding);
2983 v[6] = _mm_srai_epi32(v[6], bit);
2984
2985 v[7] = _mm_mullo_epi32(u[6], cospi16);
2986 x = _mm_mullo_epi32(u[7], cospim48);
2987 v[7] = _mm_sub_epi32(v[7], x);
2988 v[7] = _mm_add_epi32(v[7], rnding);
2989 v[7] = _mm_srai_epi32(v[7], bit);
2990
2991 v[8] = u[8];
2992 v[9] = u[9];
2993 v[10] = u[10];
2994 v[11] = u[11];
2995
2996 v[12] = _mm_mullo_epi32(u[12], cospi16);
2997 x = _mm_mullo_epi32(u[13], cospi48);
2998 v[12] = _mm_add_epi32(v[12], x);
2999 v[12] = _mm_add_epi32(v[12], rnding);
3000 v[12] = _mm_srai_epi32(v[12], bit);
3001
3002 v[13] = _mm_mullo_epi32(u[12], cospi48);
3003 x = _mm_mullo_epi32(u[13], cospi16);
3004 v[13] = _mm_sub_epi32(v[13], x);
3005 v[13] = _mm_add_epi32(v[13], rnding);
3006 v[13] = _mm_srai_epi32(v[13], bit);
3007
3008 v[14] = _mm_mullo_epi32(u[14], cospim48);
3009 x = _mm_mullo_epi32(u[15], cospi16);
3010 v[14] = _mm_add_epi32(v[14], x);
3011 v[14] = _mm_add_epi32(v[14], rnding);
3012 v[14] = _mm_srai_epi32(v[14], bit);
3013
3014 v[15] = _mm_mullo_epi32(u[14], cospi16);
3015 x = _mm_mullo_epi32(u[15], cospim48);
3016 v[15] = _mm_sub_epi32(v[15], x);
3017 v[15] = _mm_add_epi32(v[15], rnding);
3018 v[15] = _mm_srai_epi32(v[15], bit);
3019
3020 // stage 7
3021 addsub_sse4_1(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
3022 addsub_sse4_1(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
3023 addsub_sse4_1(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
3024 addsub_sse4_1(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
3025 addsub_sse4_1(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
3026 addsub_sse4_1(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
3027 addsub_sse4_1(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
3028 addsub_sse4_1(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
3029
3030 // stage 8
3031 v[0] = u[0];
3032 v[1] = u[1];
3033
3034 y = _mm_mullo_epi32(u[2], cospi32);
3035 x = _mm_mullo_epi32(u[3], cospi32);
3036 v[2] = _mm_add_epi32(y, x);
3037 v[2] = _mm_add_epi32(v[2], rnding);
3038 v[2] = _mm_srai_epi32(v[2], bit);
3039
3040 v[3] = _mm_sub_epi32(y, x);
3041 v[3] = _mm_add_epi32(v[3], rnding);
3042 v[3] = _mm_srai_epi32(v[3], bit);
3043
3044 v[4] = u[4];
3045 v[5] = u[5];
3046
3047 y = _mm_mullo_epi32(u[6], cospi32);
3048 x = _mm_mullo_epi32(u[7], cospi32);
3049 v[6] = _mm_add_epi32(y, x);
3050 v[6] = _mm_add_epi32(v[6], rnding);
3051 v[6] = _mm_srai_epi32(v[6], bit);
3052
3053 v[7] = _mm_sub_epi32(y, x);
3054 v[7] = _mm_add_epi32(v[7], rnding);
3055 v[7] = _mm_srai_epi32(v[7], bit);
3056
3057 v[8] = u[8];
3058 v[9] = u[9];
3059
3060 y = _mm_mullo_epi32(u[10], cospi32);
3061 x = _mm_mullo_epi32(u[11], cospi32);
3062 v[10] = _mm_add_epi32(y, x);
3063 v[10] = _mm_add_epi32(v[10], rnding);
3064 v[10] = _mm_srai_epi32(v[10], bit);
3065
3066 v[11] = _mm_sub_epi32(y, x);
3067 v[11] = _mm_add_epi32(v[11], rnding);
3068 v[11] = _mm_srai_epi32(v[11], bit);
3069
3070 v[12] = u[12];
3071 v[13] = u[13];
3072
3073 y = _mm_mullo_epi32(u[14], cospi32);
3074 x = _mm_mullo_epi32(u[15], cospi32);
3075 v[14] = _mm_add_epi32(y, x);
3076 v[14] = _mm_add_epi32(v[14], rnding);
3077 v[14] = _mm_srai_epi32(v[14], bit);
3078
3079 v[15] = _mm_sub_epi32(y, x);
3080 v[15] = _mm_add_epi32(v[15], rnding);
3081 v[15] = _mm_srai_epi32(v[15], bit);
3082
3083 // stage 9
3084 if (do_cols) {
3085 out[0] = v[0];
3086 out[1] = _mm_sub_epi32(zero, v[8]);
3087 out[2] = v[12];
3088 out[3] = _mm_sub_epi32(zero, v[4]);
3089 out[4] = v[6];
3090 out[5] = _mm_sub_epi32(zero, v[14]);
3091 out[6] = v[10];
3092 out[7] = _mm_sub_epi32(zero, v[2]);
3093 out[8] = v[3];
3094 out[9] = _mm_sub_epi32(zero, v[11]);
3095 out[10] = v[15];
3096 out[11] = _mm_sub_epi32(zero, v[7]);
3097 out[12] = v[5];
3098 out[13] = _mm_sub_epi32(zero, v[13]);
3099 out[14] = v[9];
3100 out[15] = _mm_sub_epi32(zero, v[1]);
3101 } else {
3102 const int log_range_out = AOMMAX(16, bd + 6);
3103 const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
3104 const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
3105
3106 neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
3107 out_shift);
3108 neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
3109 &clamp_hi_out, out_shift);
3110 neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
3111 &clamp_hi_out, out_shift);
3112 neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
3113 &clamp_hi_out, out_shift);
3114 neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
3115 &clamp_hi_out, out_shift);
3116 neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
3117 &clamp_hi_out, out_shift);
3118 neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
3119 &clamp_hi_out, out_shift);
3120 neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
3121 &clamp_hi_out, out_shift);
3122 }
3123}
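
The mullo/add/srai sequences throughout this function compute aom's fixed-point half butterfly per 32-bit lane: w0*in0 + w1*in1, rounded by 1 << (bit - 1), then arithmetic-shifted right by bit. A minimal scalar model of one lane (half_btf_scalar is a hypothetical name; the 64-bit intermediate is an assumption for safety, whereas the SIMD path relies on the clamped operand ranges to keep the 32-bit products from overflowing):

#include <stdint.h>

// Scalar model of one lane of the SSE4.1 half butterfly:
// (w0 * in0 + w1 * in1 + (1 << (bit - 1))) >> bit, with 0 < bit < 32.
static int32_t half_btf_scalar(int32_t w0, int32_t in0, int32_t w1,
                               int32_t in1, int bit) {
  const int64_t rounding = (int64_t)1 << (bit - 1);
  const int64_t sum = (int64_t)w0 * in0 + (int64_t)w1 * in1 + rounding;
  return (int32_t)(sum >> bit);
}

The subtracting variants (v[1], v[3], ...) are the same formula with the second weight negated.
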
3124static void iidentity16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
3125 int bd, int out_shift) {
3126 (void)bit;
3127 __m128i fact = _mm_set1_epi32(2 * NewSqrt2);
3128 __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1));
3129 __m128i a0_low, a0_high, a1_low, a1_high;
3130 __m128i zero = _mm_setzero_si128();
3131 offset = _mm_unpacklo_epi32(offset, zero);
3132
3133 for (int i = 0; i < 16; i++) {
3134 a0_low = _mm_mul_epi32(in[i], fact);
3135 a0_low = _mm_add_epi32(a0_low, offset);
3136 a0_low = _mm_srli_epi64(a0_low, NewSqrt2Bits);
3137
3138 a0_high = _mm_srli_si128(in[i], 4);
3139 a0_high = _mm_mul_epi32(a0_high, fact);
3140 a0_high = _mm_add_epi32(a0_high, offset);
3141 a0_high = _mm_srli_epi64(a0_high, NewSqrt2Bits);
3142
3143 a1_low = _mm_unpacklo_epi32(a0_low, a0_high);
3144 a1_high = _mm_unpackhi_epi32(a0_low, a0_high);
3145 out[i] = _mm_unpacklo_epi64(a1_low, a1_high);
3146 }
3147
3148 if (!do_cols) {
3149 const int log_range = AOMMAX(16, bd + 6);
3150 const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
3151 const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
3152 round_shift_8x8(out, out_shift);
3153 highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 16);
3154 }
3155}
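
iidentity16 scales every coefficient by 2*sqrt(2) in Q12 fixed point: _mm_mul_epi32 forms a 64-bit product per even lane, the unpacked offset adds 1 << (NewSqrt2Bits - 1) for rounding, and the 64-bit right shift drops the fraction bits (the unpack/shuffle dance just re-interleaves the even/odd lanes). A scalar sketch of one coefficient, assuming aom's constants NewSqrt2 == 5793 and NewSqrt2Bits == 12:

#include <stdint.h>

// One coefficient of iidentity16: out = round(in * 2 * sqrt(2)) in Q12.
static int32_t identity16_scale_scalar(int32_t in) {
  const int64_t kNewSqrt2 = 5793;  // round(sqrt(2) * 2^12); assumption
  const int kNewSqrt2Bits = 12;    // matches the expansion shown above
  int64_t p = (int64_t)in * (2 * kNewSqrt2);
  p += (int64_t)1 << (kNewSqrt2Bits - 1);  // rounding offset
  return (int32_t)(p >> kNewSqrt2Bits);    // arithmetic shift models intent
}

The vector code uses _mm_srli_epi64 (a logical shift) on the packed products; for in-range values the low 32 bits it keeps agree with the arithmetic shift above.
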
3156static inline void idct64_stage8_sse4_1(
3157 __m128i *u, const __m128i *cospim32, const __m128i *cospi32,
3158 const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16,
3159 const __m128i *cospim48, const __m128i *clamp_lo, const __m128i *clamp_hi,
3160 const __m128i *rnding, int bit) {
3161 int i;
3162 __m128i temp1, temp2, temp3, temp4;
3163 temp1 = half_btf_sse4_1(cospim32, &u[10], cospi32, &u[13], rnding, bit);
3164 u[13] = half_btf_sse4_1(cospi32, &u[10], cospi32, &u[13], rnding, bit);
3165 u[10] = temp1;
3166 temp2 = half_btf_sse4_1(cospim32, &u[11], cospi32, &u[12], rnding, bit);
3167 u[12] = half_btf_sse4_1(cospi32, &u[11], cospi32, &u[12], rnding, bit);
3168 u[11] = temp2;
3169
3170 for (i = 16; i < 20; ++i) {
3171 addsub_sse4_1(u[i], u[i ^ 7], &u[i], &u[i ^ 7], clamp_lo, clamp_hi);
3172 addsub_sse4_1(u[i ^ 15], u[i ^ 8], &u[i ^ 15], &u[i ^ 8], clamp_lo,
3173 clamp_hi);
3174 }
3175
3176 temp1 = half_btf_sse4_1(cospim16, &u[36], cospi48, &u[59], rnding, bit);
3177 temp2 = half_btf_sse4_1(cospim16, &u[37], cospi48, &u[58], rnding, bit);
3178 temp3 = half_btf_sse4_1(cospim16, &u[38], cospi48, &u[57], rnding, bit);
3179 temp4 = half_btf_sse4_1(cospim16, &u[39], cospi48, &u[56], rnding, bit);
3180 u[56] = half_btf_sse4_1(cospi48, &u[39], cospi16, &u[56], rnding, bit);
3181 u[57] = half_btf_sse4_1(cospi48, &u[38], cospi16, &u[57], rnding, bit);
3182 u[58] = half_btf_sse4_1(cospi48, &u[37], cospi16, &u[58], rnding, bit);
3183 u[59] = half_btf_sse4_1(cospi48, &u[36], cospi16, &u[59], rnding, bit);
3184 u[36] = temp1;
3185 u[37] = temp2;
3186 u[38] = temp3;
3187 u[39] = temp4;
3188
3189 temp1 = half_btf_sse4_1(cospim48, &u[40], cospim16, &u[55], rnding, bit);
3190 temp2 = half_btf_sse4_1(cospim48, &u[41], cospim16, &u[54], rnding, bit);
3191 temp3 = half_btf_sse4_1(cospim48, &u[42], cospim16, &u[53], rnding, bit);
3192 temp4 = half_btf_sse4_1(cospim48, &u[43], cospim16, &u[52], rnding, bit);
3193 u[52] = half_btf_sse4_1(cospim16, &u[43], cospi48, &u[52], rnding, bit);
3194 u[53] = half_btf_sse4_1(cospim16, &u[42], cospi48, &u[53], rnding, bit);
3195 u[54] = half_btf_sse4_1(cospim16, &u[41], cospi48, &u[54], rnding, bit);
3196 u[55] = half_btf_sse4_1(cospim16, &u[40], cospi48, &u[55], rnding, bit);
3197 u[40] = temp1;
3198 u[41] = temp2;
3199 u[42] = temp3;
3200 u[43] = temp4;
3201}
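
The XOR indexing in the loop above pairs butterfly partners symmetrically around the 16..31 block: for i = 16..19 it generates addsub on (16,23), (17,22), (18,21), (19,20) and on (31,24), (30,25), (29,26), (28,27). A throwaway check, illustration only:

#include <stdio.h>

// Prints the partner indices produced by the i^7 / i^15 / i^8 pattern.
int main(void) {
  for (int i = 16; i < 20; ++i) {
    printf("addsub(u[%d], u[%d])   addsub(u[%d], u[%d])\n",
           i, i ^ 7, i ^ 15, i ^ 8);
  }
  return 0;
}
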
3202
3203static inline void idct64_stage9_sse4_1(__m128i *u, const __m128i *cospim32,
3204 const __m128i *cospi32,
3205 const __m128i *clamp_lo,
3206 const __m128i *clamp_hi,
3207 const __m128i *rnding, int bit) {
3208 int i;
3209 __m128i temp1, temp2, temp3, temp4;
3210 for (i = 0; i < 8; ++i) {
3211 addsub_sse4_1(u[i], u[15 - i], &u[i], &u[15 - i], clamp_lo, clamp_hi);
3212 }
3213
3214 temp1 = half_btf_sse4_1(cospim32, &u[20], cospi32, &u[27], rnding, bit);
3215 temp2 = half_btf_sse4_1(cospim32, &u[21], cospi32, &u[26], rnding, bit);
3216 temp3 = half_btf_sse4_1(cospim32, &u[22], cospi32, &u[25], rnding, bit);
3217 temp4 = half_btf_sse4_1(cospim32, &u[23], cospi32, &u[24], rnding, bit);
3218 u[24] = half_btf_sse4_1(cospi32, &u[23], cospi32, &u[24], rnding, bit);
3219 u[25] = half_btf_sse4_1(cospi32, &u[22], cospi32, &u[25], rnding, bit);
3220 u[26] = half_btf_sse4_1(cospi32, &u[21], cospi32, &u[26], rnding, bit);
3221 u[27] = half_btf_sse4_1(cospi32, &u[20], cospi32, &u[27], rnding, bit);
3222 u[20] = temp1;
3223 u[21] = temp2;
3224 u[22] = temp3;
3225 u[23] = temp4;
3226 for (i = 32; i < 40; i++) {
3227 addsub_sse4_1(u[i], u[i ^ 15], &u[i], &u[i ^ 15], clamp_lo, clamp_hi);
3228 }
3229
3230 for (i = 48; i < 56; i++) {
3231 addsub_sse4_1(u[i ^ 15], u[i], &u[i ^ 15], &u[i], clamp_lo, clamp_hi);
3232 }
3233}
3234
3235static inline void idct64_stage10_sse4_1(__m128i *u, const __m128i *cospim32,
3236 const __m128i *cospi32,
3237 const __m128i *clamp_lo,
3238 const __m128i *clamp_hi,
3239 const __m128i *rnding, int bit) {
3240 __m128i temp1, temp2, temp3, temp4;
3241 for (int i = 0; i < 16; i++) {
3242 addsub_sse4_1(u[i], u[31 - i], &u[i], &u[31 - i], clamp_lo, clamp_hi);
3243 }
3244
3245 temp1 = half_btf_sse4_1(cospim32, &u[40], cospi32, &u[55], rnding, bit);
3246 temp2 = half_btf_sse4_1(cospim32, &u[41], cospi32, &u[54], rnding, bit);
3247 temp3 = half_btf_sse4_1(cospim32, &u[42], cospi32, &u[53], rnding, bit);
3248 temp4 = half_btf_sse4_1(cospim32, &u[43], cospi32, &u[52], rnding, bit);
3249 u[52] = half_btf_sse4_1(cospi32, &u[43], cospi32, &u[52], rnding, bit);
3250 u[53] = half_btf_sse4_1(cospi32, &u[42], cospi32, &u[53], rnding, bit);
3251 u[54] = half_btf_sse4_1(cospi32, &u[41], cospi32, &u[54], rnding, bit);
3252 u[55] = half_btf_sse4_1(cospi32, &u[40], cospi32, &u[55], rnding, bit);
3253 u[40] = temp1;
3254 u[41] = temp2;
3255 u[42] = temp3;
3256 u[43] = temp4;
3257
3258 temp1 = half_btf_sse4_1(cospim32, &u[44], cospi32, &u[51], rnding, bit);
3259 temp2 = half_btf_sse4_1(cospim32, &u[45], cospi32, &u[50], rnding, bit);
3260 temp3 = half_btf_sse4_1(cospim32, &u[46], cospi32, &u[49], rnding, bit);
3261 temp4 = half_btf_sse4_1(cospim32, &u[47], cospi32, &u[48], rnding, bit);
3262 u[48] = half_btf_sse4_1(cospi32, &u[47], cospi32, &u[48], rnding, bit);
3263 u[49] = half_btf_sse4_1(cospi32, &u[46], cospi32, &u[49], rnding, bit);
3264 u[50] = half_btf_sse4_1(cospi32, &u[45], cospi32, &u[50], rnding, bit);
3265 u[51] = half_btf_sse4_1(cospi32, &u[44], cospi32, &u[51], rnding, bit);
3266 u[44] = temp1;
3267 u[45] = temp2;
3268 u[46] = temp3;
3269 u[47] = temp4;
3270}
3271
3272static inline void idct64_stage11_sse4_1(__m128i *u, __m128i *out, int do_cols,
3273 int bd, int out_shift,
3274 const __m128i *clamp_lo,
3275 const __m128i *clamp_hi) {
3276 for (int i = 0; i < 32; i++) {
3277 addsub_sse4_1(u[i], u[63 - i], out + i, out + 63 - i, clamp_lo, clamp_hi);
3278 }
3279
3280 if (!do_cols) {
3281 const int log_range_out = AOMMAX(16, bd + 6);
3282 const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
3283 const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
3284
3285 for (int i = 0; i < 64; i += 4) {
3286 round_shift_4x4(out + i, out_shift);
3287 highbd_clamp_epi32_sse4_1(out + i, out + i, &clamp_lo_out, &clamp_hi_out,
3288 4);
3289 }
3290 }
3291}
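
In the !do_cols branch, every output lane gets a round-and-shift by out_shift followed by a clamp to the log_range_out-bit signed range. A per-lane scalar model of round_shift_4x4 plus highbd_clamp_epi32_sse4_1 (round_shift_clamp_scalar is a hypothetical name; assumes shift > 0 and bd <= 12, so all shifts stay well-defined):

#include <stdint.h>

// Per-lane model of round_shift_4x4 followed by highbd_clamp_epi32_sse4_1.
static int32_t round_shift_clamp_scalar(int32_t x, int shift, int bd) {
  const int log_range_out = (16 > bd + 6) ? 16 : (bd + 6);  // AOMMAX(16, bd + 6)
  const int32_t lo = -(1 << (log_range_out - 1));
  const int32_t hi = (1 << (log_range_out - 1)) - 1;
  x = (x + (1 << (shift - 1))) >> shift;  // round to nearest
  return (x < lo) ? lo : (x > hi) ? hi : x;
}
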
3292
3293static void idct64x64_low1_sse4_1(__m128i *in, __m128i *out, int bit,
3294 int do_cols, int bd, int out_shift) {
3295 const int32_t *cospi = cospi_arr(bit);
3296 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
3297 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
3298 __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
3299 __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
3300
3301 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
3302
3303 {
3304 __m128i x;
3305
3306 // stage 1
3307 // stage 2
3308 // stage 3
3309 // stage 4
3310 // stage 5
3311 // stage 6
3312 x = half_btf_0_sse4_1(&cospi32, &in[0], &rnding, bit);
3313
3314 // stage 8
3315 // stage 9
3316 // stage 10
3317 // stage 11
3318 if (!do_cols) {
3319 const int log_range_out = AOMMAX(16, bd + 6);
3320 clamp_lo = _mm_set1_epi32(-(1 << (log_range_out - 1)));
3321 clamp_hi = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
3322 if (out_shift != 0) {
3323 __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
3324 x = _mm_add_epi32(x, offset);
3325 x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
3326 }
3327 }
3328 x = _mm_max_epi32(x, clamp_lo);
3329 x = _mm_min_epi32(x, clamp_hi);
3330 out[0] = x;
3331 out[1] = x;
3332 out[2] = x;
3333 out[3] = x;
3334 out[4] = x;
3335 out[5] = x;
3336 out[6] = x;
3337 out[7] = x;
3338 out[8] = x;
3339 out[9] = x;
3340 out[10] = x;
3341 out[11] = x;
3342 out[12] = x;
3343 out[13] = x;
3344 out[14] = x;
3345 out[15] = x;
3346 out[16] = x;
3347 out[17] = x;
3348 out[18] = x;
3349 out[19] = x;
3350 out[20] = x;
3351 out[21] = x;
3352 out[22] = x;
3353 out[23] = x;
3354 out[24] = x;
3355 out[25] = x;
3356 out[26] = x;
3357 out[27] = x;
3358 out[28] = x;
3359 out[29] = x;
3360 out[30] = x;
3361 out[31] = x;
3362 out[32] = x;
3363 out[33] = x;
3364 out[34] = x;
3365 out[35] = x;
3366 out[36] = x;
3367 out[37] = x;
3368 out[38] = x;
3369 out[39] = x;
3370 out[40] = x;
3371 out[41] = x;
3372 out[42] = x;
3373 out[43] = x;
3374 out[44] = x;
3375 out[45] = x;
3376 out[46] = x;
3377 out[47] = x;
3378 out[48] = x;
3379 out[49] = x;
3380 out[50] = x;
3381 out[51] = x;
3382 out[52] = x;
3383 out[53] = x;
3384 out[54] = x;
3385 out[55] = x;
3386 out[56] = x;
3387 out[57] = x;
3388 out[58] = x;
3389 out[59] = x;
3390 out[60] = x;
3391 out[61] = x;
3392 out[62] = x;
3393 out[63] = x;
3394 }
3395}
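
With only the DC coefficient nonzero, the 64-point IDCT degenerates to a single half butterfly: x = round_shift(in[0] * cospi[32], bit), an optional extra round-and-shift by out_shift for the row pass, a clamp, and a 64-way broadcast. A scalar equivalent of the function above (assumes 0 < bit < 32 and out_shift >= 0):

#include <stdint.h>

// Scalar equivalent of idct64x64_low1: one multiply, then broadcast.
static void idct64_dc_scalar(int32_t in0, int32_t out[64], int bit,
                             int do_cols, int bd, int out_shift,
                             int32_t cospi32) {
  int64_t x = ((int64_t)in0 * cospi32 + ((int64_t)1 << (bit - 1))) >> bit;
  if (!do_cols && out_shift != 0)
    x = (x + ((1 << out_shift) >> 1)) >> out_shift;
  const int log_range = (16 > bd + 6) ? 16 : (bd + 6);  // final clamp in both passes
  const int64_t lo = -((int64_t)1 << (log_range - 1));
  const int64_t hi = ((int64_t)1 << (log_range - 1)) - 1;
  if (x < lo) x = lo;
  if (x > hi) x = hi;
  for (int i = 0; i < 64; ++i) out[i] = (int32_t)x;
}
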
3396
3397static void idct64x64_low8_sse4_1(__m128i *in, __m128i *out, int bit,
3398 int do_cols, int bd, int out_shift) {
3399 int i, j;
3400 const int32_t *cospi = cospi_arr(bit);
3401 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
3402 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
3403 const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
3404 const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
3405
3406 const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
3407 const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
3408 const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
3409 const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
3410 const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
3411 const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
3412 const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
3413 const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
3414 const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
3415 const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
3416 const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
3417 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
3418 const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
3419 const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
3420 const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
3421 const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
3422 const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
3423 const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
3424 const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
3425 const __m128i cospim12 = _mm_set1_epi32(-cospi[12]);
3426 const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
3427 const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
3428 const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
3429 const __m128i cospim28 = _mm_set1_epi32(-cospi[28]);
3430 const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
3431 const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
3432 const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
3433 const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
3434 const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
3435 const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
3436 const __m128i cospi63 = _mm_set1_epi32(cospi[63]);
3437 const __m128i cospim57 = _mm_set1_epi32(-cospi[57]);
3438 const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
3439 const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
3440 const __m128i cospi59 = _mm_set1_epi32(cospi[59]);
3441 const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);
3442 const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
3443 const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
3444
3445 {
3446 __m128i u[64];
3447
3448 // stage 1
3449 u[0] = in[0];
3450 u[8] = in[4];
3451 u[16] = in[2];
3452 u[24] = in[6];
3453 u[32] = in[1];
3454 u[40] = in[5];
3455 u[48] = in[3];
3456 u[56] = in[7];
3457
3458 // stage 2
3459 u[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit);
3460 u[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
3461 u[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit);
3462 u[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit);
3463 u[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit);
3464 u[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit);
3465 u[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit);
3466 u[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit);
3467
3468 // stage 3
3469 u[31] = half_btf_0_sse4_1(&cospi2, &u[16], &rnding, bit);
3470 u[16] = half_btf_0_sse4_1(&cospi62, &u[16], &rnding, bit);
3471 u[23] = half_btf_0_sse4_1(&cospim58, &u[24], &rnding, bit);
3472 u[24] = half_btf_0_sse4_1(&cospi6, &u[24], &rnding, bit);
3473 u[33] = u[32];
3474 u[38] = u[39];
3475 u[41] = u[40];
3476 u[46] = u[47];
3477 u[49] = u[48];
3478 u[54] = u[55];
3479 u[57] = u[56];
3480 u[62] = u[63];
3481
3482 // stage 4
3483 __m128i temp1, temp2;
3484 u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
3485 u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
3486 u[17] = u[16];
3487 u[22] = u[23];
3488 u[25] = u[24];
3489 u[30] = u[31];
3490
3491 temp1 = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
3492 u[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
3493 u[33] = temp1;
3494
3495 temp2 = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
3496 u[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
3497 u[57] = temp2;
3498
3499 temp1 = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
3500 u[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
3501 u[41] = temp1;
3502
3503 temp2 = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
3504 u[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
3505 u[46] = temp2;
3506
3507 // stage 5
3508 u[9] = u[8];
3509 u[14] = u[15];
3510
3511 temp1 = half_btf_sse4_1(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
3512 u[30] = half_btf_sse4_1(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
3513 u[17] = temp1;
3514
3515 temp2 = half_btf_sse4_1(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
3516 u[25] = half_btf_sse4_1(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
3517 u[22] = temp2;
3518
3519 u[35] = u[32];
3520 u[34] = u[33];
3521 u[36] = u[39];
3522 u[37] = u[38];
3523 u[43] = u[40];
3524 u[42] = u[41];
3525 u[44] = u[47];
3526 u[45] = u[46];
3527 u[51] = u[48];
3528 u[50] = u[49];
3529 u[52] = u[55];
3530 u[53] = u[54];
3531 u[59] = u[56];
3532 u[58] = u[57];
3533 u[60] = u[63];
3534 u[61] = u[62];
3535
3536 // stage 6
3537 temp1 = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
3538 u[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
3539 u[0] = temp1;
3540
3541 temp2 = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
3542 u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
3543 u[9] = temp2;
3544 u[19] = u[16];
3545 u[18] = u[17];
3546 u[20] = u[23];
3547 u[21] = u[22];
3548 u[27] = u[24];
3549 u[26] = u[25];
3550 u[28] = u[31];
3551 u[29] = u[30];
3552
3553 temp1 = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
3554 u[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
3555 u[34] = temp1;
3556 temp2 = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
3557 u[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
3558 u[35] = temp2;
3559 temp1 = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
3560 u[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
3561 u[36] = temp1;
3562 temp2 = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
3563 u[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
3564 u[37] = temp2;
3565 temp1 = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
3566 u[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
3567 u[42] = temp1;
3568 temp2 = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
3569 u[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
3570 u[43] = temp2;
3571 temp1 = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
3572 u[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
3573 u[44] = temp1;
3574 temp2 = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
3575 u[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
3576 u[45] = temp2;
3577
3578 // stage 7
3579 u[3] = u[0];
3580 u[2] = u[1];
3581 u[11] = u[8];
3582 u[10] = u[9];
3583 u[12] = u[15];
3584 u[13] = u[14];
3585
3586 temp1 = half_btf_sse4_1(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
3587 u[29] = half_btf_sse4_1(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
3588 u[18] = temp1;
3589 temp2 = half_btf_sse4_1(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
3590 u[28] = half_btf_sse4_1(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
3591 u[19] = temp2;
3592 temp1 = half_btf_sse4_1(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
3593 u[27] = half_btf_sse4_1(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
3594 u[20] = temp1;
3595 temp2 = half_btf_sse4_1(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
3596 u[26] = half_btf_sse4_1(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
3597 u[21] = temp2;
3598 for (i = 32; i < 64; i += 16) {
3599 for (j = i; j < i + 4; j++) {
3600 addsub_sse4_1(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
3601 addsub_sse4_1(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
3602 &clamp_hi);
3603 }
3604 }
3605
3606 // stage 8
3607 u[7] = u[0];
3608 u[6] = u[1];
3609 u[5] = u[2];
3610 u[4] = u[3];
3611
3612 idct64_stage8_sse4_1(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
3613 &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);
3614
3615 // stage 9
3616 idct64_stage9_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
3617 bit);
3618
3619 // stage 10
3620 idct64_stage10_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
3621 bit);
3622
3623 // stage 11
3624 idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
3625 }
3626}
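
The low8 variant assumes only the first eight coefficients per column are nonzero, so stage 1 just scatters in[0..7] into the 64-entry array at the slots a full stage 1 would fill: in[k] lands at u[8 * bitrev3(k)], i.e. destinations {0, 32, 16, 48, 8, 40, 24, 56}. Written out for reference (illustration only):

// Stage-1 scatter used by idct64x64_low8_sse4_1: in[k] -> u[kLow8Dst[k]].
// Each destination is 8 times the 3-bit bit-reversal of k.
static const int kLow8Dst[8] = { 0, 32, 16, 48, 8, 40, 24, 56 };
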
3627
3628static void idct64x64_low16_sse4_1(__m128i *in, __m128i *out, int bit,
3629 int do_cols, int bd, int out_shift) {
3630 int i, j;
3631 const int32_t *cospi = cospi_arr(bit);
3632 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
3633 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
3634 const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
3635 const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
3636
3637 const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
3638 const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
3639 const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
3640 const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
3641 const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
3642 const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
3643 const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
3644 const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
3645 const __m128i cospi9 = _mm_set1_epi32(cospi[9]);
3646 const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
3647 const __m128i cospi11 = _mm_set1_epi32(cospi[11]);
3648 const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
3649 const __m128i cospi13 = _mm_set1_epi32(cospi[13]);
3650 const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
3651 const __m128i cospi15 = _mm_set1_epi32(cospi[15]);
3652 const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
3653 const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
3654 const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
3655 const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
3656 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
3657 const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
3658 const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
3659 const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
3660 const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
3661 const __m128i cospi51 = _mm_set1_epi32(cospi[51]);
3662 const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
3663 const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
3664 const __m128i cospi55 = _mm_set1_epi32(cospi[55]);
3665 const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
3666 const __m128i cospi59 = _mm_set1_epi32(cospi[59]);
3667 const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
3668 const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
3669 const __m128i cospi63 = _mm_set1_epi32(cospi[63]);
3670
3671 const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
3672 const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
3673 const __m128i cospim12 = _mm_set1_epi32(-cospi[12]);
3674 const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
3675 const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
3676 const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
3677 const __m128i cospim28 = _mm_set1_epi32(-cospi[28]);
3678 const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
3679 const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
3680 const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
3681 const __m128i cospim44 = _mm_set1_epi32(-cospi[44]);
3682 const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
3683 const __m128i cospim49 = _mm_set1_epi32(-cospi[49]);
3684 const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
3685 const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
3686 const __m128i cospim53 = _mm_set1_epi32(-cospi[53]);
3687 const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
3688 const __m128i cospim57 = _mm_set1_epi32(-cospi[57]);
3689 const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
3690 const __m128i cospim60 = _mm_set1_epi32(-cospi[60]);
3691 const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);
3692
3693 {
3694 __m128i u[64];
3695 __m128i tmp1, tmp2, tmp3, tmp4;
3696 // stage 1
3697 u[0] = in[0];
3698 u[32] = in[1];
3699 u[36] = in[9];
3700 u[40] = in[5];
3701 u[44] = in[13];
3702 u[48] = in[3];
3703 u[52] = in[11];
3704 u[56] = in[7];
3705 u[60] = in[15];
3706 u[16] = in[2];
3707 u[20] = in[10];
3708 u[24] = in[6];
3709 u[28] = in[14];
3710 u[4] = in[8];
3711 u[8] = in[4];
3712 u[12] = in[12];
3713
3714 // stage 2
3715 u[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit);
3716 u[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
3717 u[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit);
3718 u[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit);
3719 u[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit);
3720 u[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit);
3721 u[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit);
3722 u[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit);
3723 u[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit);
3724 u[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit);
3725 u[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit);
3726 u[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit);
3727 u[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit);
3728 u[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit);
3729 u[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit);
3730 u[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit);
3731
3732 // stage 3
3733 u[31] = half_btf_0_sse4_1(&cospi2, &u[16], &rnding, bit);
3734 u[16] = half_btf_0_sse4_1(&cospi62, &u[16], &rnding, bit);
3735 u[19] = half_btf_0_sse4_1(&cospim50, &u[28], &rnding, bit);
3736 u[28] = half_btf_0_sse4_1(&cospi14, &u[28], &rnding, bit);
3737 u[27] = half_btf_0_sse4_1(&cospi10, &u[20], &rnding, bit);
3738 u[20] = half_btf_0_sse4_1(&cospi54, &u[20], &rnding, bit);
3739 u[23] = half_btf_0_sse4_1(&cospim58, &u[24], &rnding, bit);
3740 u[24] = half_btf_0_sse4_1(&cospi6, &u[24], &rnding, bit);
3741 u[33] = u[32];
3742 u[34] = u[35];
3743 u[37] = u[36];
3744 u[38] = u[39];
3745 u[41] = u[40];
3746 u[42] = u[43];
3747 u[45] = u[44];
3748 u[46] = u[47];
3749 u[49] = u[48];
3750 u[50] = u[51];
3751 u[53] = u[52];
3752 u[54] = u[55];
3753 u[57] = u[56];
3754 u[58] = u[59];
3755 u[61] = u[60];
3756 u[62] = u[63];
3757
3758 // stage 4
3759 u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
3760 u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
3761 u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
3762 u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);
3763
3764 u[17] = u[16];
3765 u[18] = u[19];
3766 u[21] = u[20];
3767 u[22] = u[23];
3768 u[25] = u[24];
3769 u[26] = u[27];
3770 u[29] = u[28];
3771 u[30] = u[31];
3772
3773 tmp1 = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
3774 tmp2 = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
3775 tmp3 = half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
3776 tmp4 = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
3777 u[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
3778 u[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
3779 u[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
3780 u[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
3781 u[33] = tmp1;
3782 u[34] = tmp2;
3783 u[37] = tmp3;
3784 u[38] = tmp4;
3785
3786 tmp1 = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
3787 tmp2 = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
3788 tmp3 = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
3789 tmp4 = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
3790 u[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
3791 u[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
3792 u[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
3793 u[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
3794 u[41] = tmp1;
3795 u[42] = tmp2;
3796 u[45] = tmp3;
3797 u[46] = tmp4;
3798
3799 // stage 5
3800 u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit);
3801 u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit);
3802
3803 u[9] = u[8];
3804 u[10] = u[11];
3805 u[13] = u[12];
3806 u[14] = u[15];
3807
3808 tmp1 = half_btf_sse4_1(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
3809 tmp2 = half_btf_sse4_1(&cospim56, &u[18], &cospim8, &u[29], &rnding, bit);
3810 tmp3 = half_btf_sse4_1(&cospim40, &u[21], &cospi24, &u[26], &rnding, bit);
3811 tmp4 = half_btf_sse4_1(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
3812 u[25] = half_btf_sse4_1(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
3813 u[26] = half_btf_sse4_1(&cospi24, &u[21], &cospi40, &u[26], &rnding, bit);
3814 u[29] = half_btf_sse4_1(&cospim8, &u[18], &cospi56, &u[29], &rnding, bit);
3815 u[30] = half_btf_sse4_1(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
3816 u[17] = tmp1;
3817 u[18] = tmp2;
3818 u[21] = tmp3;
3819 u[22] = tmp4;
3820
3821 for (i = 32; i < 64; i += 8) {
3822 addsub_sse4_1(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
3823 &clamp_hi);
3824 addsub_sse4_1(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
3825 &clamp_hi);
3826
3827 addsub_sse4_1(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
3828 &clamp_hi);
3829 addsub_sse4_1(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
3830 &clamp_hi);
3831 }
3832
3833 // stage 6
3834 tmp1 = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
3835 u[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
3836 u[0] = tmp1;
3837 u[5] = u[4];
3838 u[6] = u[7];
3839
3840 tmp1 = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
3841 u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
3842 u[9] = tmp1;
3843 tmp2 = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
3844 u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
3845 u[10] = tmp2;
3846
3847 for (i = 16; i < 32; i += 8) {
3848 addsub_sse4_1(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
3849 &clamp_hi);
3850 addsub_sse4_1(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
3851 &clamp_hi);
3852
3853 addsub_sse4_1(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
3854 &clamp_hi);
3855 addsub_sse4_1(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
3856 &clamp_hi);
3857 }
3858
3859 tmp1 = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
3860 tmp2 = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
3861 tmp3 = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
3862 tmp4 = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
3863 u[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
3864 u[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
3865 u[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
3866 u[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
3867 u[34] = tmp1;
3868 u[35] = tmp2;
3869 u[36] = tmp3;
3870 u[37] = tmp4;
3871
3872 tmp1 = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
3873 tmp2 = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
3874 tmp3 = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
3875 tmp4 = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
3876 u[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
3877 u[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
3878 u[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
3879 u[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
3880 u[42] = tmp1;
3881 u[43] = tmp2;
3882 u[44] = tmp3;
3883 u[45] = tmp4;
3884
3885 // stage 7
3886 u[3] = u[0];
3887 u[2] = u[1];
3888 tmp1 = half_btf_sse4_1(&cospim32, &u[5], &cospi32, &u[6], &rnding, bit);
3889 u[6] = half_btf_sse4_1(&cospi32, &u[5], &cospi32, &u[6], &rnding, bit);
3890 u[5] = tmp1;
3891 addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
3892 addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
3893 addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
3894 addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
3895
3896 tmp1 = half_btf_sse4_1(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
3897 tmp2 = half_btf_sse4_1(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
3898 tmp3 = half_btf_sse4_1(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
3899 tmp4 = half_btf_sse4_1(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
3900 u[26] = half_btf_sse4_1(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
3901 u[27] = half_btf_sse4_1(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
3902 u[28] = half_btf_sse4_1(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
3903 u[29] = half_btf_sse4_1(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
3904 u[18] = tmp1;
3905 u[19] = tmp2;
3906 u[20] = tmp3;
3907 u[21] = tmp4;
3908
3909 for (i = 32; i < 64; i += 16) {
3910 for (j = i; j < i + 4; j++) {
3911 addsub_sse4_1(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
3912 addsub_sse4_1(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
3913 &clamp_hi);
3914 }
3915 }
3916
3917 // stage 8
3918 for (i = 0; i < 4; ++i) {
3919 addsub_sse4_1(u[i], u[7 - i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi);
3920 }
3921
3922 idct64_stage8_sse4_1(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
3923 &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);
3924
3925 // stage 9
3926 idct64_stage9_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
3927 bit);
3928
3929 // stage 10
3930 idct64_stage10_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
3931 bit);
3932
3933 // stage 11
3934 idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
3935 }
3936}
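
All of these variants build their clamp constants as 1 << (log_range - 1) on a plain int, which is the shift shape behind the report's warning at line 4445: the analyzer believes the right operand can reach 32 or more, making the shift undefined behavior. With the bit depths aom actually supports (bd <= 12), log_range - 1 never exceeds 19 in the functions listed here, so these particular sites are fine in practice; a defensive form that encodes the bound explicitly, sketched rather than taken from aom:

#include <assert.h>
#include <stdint.h>

// Clamp bounds for a log_range-bit signed intermediate, with the shift
// amount explicitly bounded so 1 << (log_range - 1) is always defined.
static void clamp_bounds(int log_range, int32_t *lo, int32_t *hi) {
  assert(log_range >= 1 && log_range <= 31);  // keeps the shift at or below 30
  *lo = -(1 << (log_range - 1));
  *hi = (1 << (log_range - 1)) - 1;
}
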
3937
3938static void idct64x64_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
3939 int bd, int out_shift) {
3940 int i, j;
3941 const int32_t *cospi = cospi_arr(bit);
3942 const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
3943 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
3944 const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
3945 const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
3946
3947 const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
3948 const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
3949 const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
3950 const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
3951 const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
3952 const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
3953 const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
3954 const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
3955 const __m128i cospi9 = _mm_set1_epi32(cospi[9]);
3956 const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
3957 const __m128i cospi11 = _mm_set1_epi32(cospi[11]);
3958 const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
3959 const __m128i cospi13 = _mm_set1_epi32(cospi[13]);
3960 const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
3961 const __m128i cospi15 = _mm_set1_epi32(cospi[15]);
3962 const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
3963 const __m128i cospi17 = _mm_set1_epi32(cospi[17]);
3964 const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
3965 const __m128i cospi19 = _mm_set1_epi32(cospi[19]);
3966 const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
3967 const __m128i cospi21 = _mm_set1_epi32(cospi[21]);
3968 const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
3969 const __m128i cospi23 = _mm_set1_epi32(cospi[23]);
3970 const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
3971 const __m128i cospi25 = _mm_set1_epi32(cospi[25]);
3972 const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
3973 const __m128i cospi27 = _mm_set1_epi32(cospi[27]);
3974 const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
3975 const __m128i cospi29 = _mm_set1_epi32(cospi[29]);
3976 const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
3977 const __m128i cospi31 = _mm_set1_epi32(cospi[31]);
3978 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
3979 const __m128i cospi35 = _mm_set1_epi32(cospi[35]);
3980 const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
3981 const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
3982 const __m128i cospi39 = _mm_set1_epi32(cospi[39]);
3983 const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
3984 const __m128i cospi43 = _mm_set1_epi32(cospi[43]);
3985 const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
3986 const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
3987 const __m128i cospi47 = _mm_set1_epi32(cospi[47]);
3988 const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
3989 const __m128i cospi51 = _mm_set1_epi32(cospi[51]);
3990 const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
3991 const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
3992 const __m128i cospi55 = _mm_set1_epi32(cospi[55]);
3993 const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
3994 const __m128i cospi59 = _mm_set1_epi32(cospi[59]);
3995 const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
3996 const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
3997 const __m128i cospi63 = _mm_set1_epi32(cospi[63]);
3998
3999 const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
4000 const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
4001 const __m128i cospim12 = _mm_set1_epi32(-cospi[12]);
4002 const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
4003 const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
4004 const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
4005 const __m128i cospim28 = _mm_set1_epi32(-cospi[28]);
4006 const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
4007 const __m128i cospim33 = _mm_set1_epi32(-cospi[33]);
4008 const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
4009 const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
4010 const __m128i cospim37 = _mm_set1_epi32(-cospi[37]);
4011 const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
4012 const __m128i cospim41 = _mm_set1_epi32(-cospi[41]);
4013 const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
4014 const __m128i cospim44 = _mm_set1_epi32(-cospi[44]);
4015 const __m128i cospim45 = _mm_set1_epi32(-cospi[45]);
4016 const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
4017 const __m128i cospim49 = _mm_set1_epi32(-cospi[49]);
4018 const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
4019 const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
4020 const __m128i cospim53 = _mm_set1_epi32(-cospi[53]);
4021 const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
4022 const __m128i cospim57 = _mm_set1_epi32(-cospi[57]);
4023 const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
4024 const __m128i cospim60 = _mm_set1_epi32(-cospi[60]);
4025 const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);
4026
4027 {
4028 __m128i u[64], v[64];
4029
4030 // stage 1
4031 u[32] = in[1];
4032 u[34] = in[17];
4033 u[36] = in[9];
4034 u[38] = in[25];
4035 u[40] = in[5];
4036 u[42] = in[21];
4037 u[44] = in[13];
4038 u[46] = in[29];
4039 u[48] = in[3];
4040 u[50] = in[19];
4041 u[52] = in[11];
4042 u[54] = in[27];
4043 u[56] = in[7];
4044 u[58] = in[23];
4045 u[60] = in[15];
4046 u[62] = in[31];
4047
4048 v[16] = in[2];
4049 v[18] = in[18];
4050 v[20] = in[10];
4051 v[22] = in[26];
4052 v[24] = in[6];
4053 v[26] = in[22];
4054 v[28] = in[14];
4055 v[30] = in[30];
4056
4057 u[8] = in[4];
4058 u[10] = in[20];
4059 u[12] = in[12];
4060 u[14] = in[28];
4061
4062 v[4] = in[8];
4063 v[6] = in[24];
4064
4065 u[0] = in[0];
4066 u[2] = in[16];
4067
4068 // stage 2
4069 v[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
4070 v[33] = half_btf_0_sse4_1(&cospim33, &u[62], &rnding, bit);
4071 v[34] = half_btf_0_sse4_1(&cospi47, &u[34], &rnding, bit);
4072 v[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit);
4073 v[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit);
4074 v[37] = half_btf_0_sse4_1(&cospim41, &u[58], &rnding, bit);
4075 v[38] = half_btf_0_sse4_1(&cospi39, &u[38], &rnding, bit);
4076 v[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit);
4077 v[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit);
4078 v[41] = half_btf_0_sse4_1(&cospim37, &u[54], &rnding, bit);
4079 v[42] = half_btf_0_sse4_1(&cospi43, &u[42], &rnding, bit);
4080 v[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit);
4081 v[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit);
4082 v[45] = half_btf_0_sse4_1(&cospim45, &u[50], &rnding, bit);
4083 v[46] = half_btf_0_sse4_1(&cospi35, &u[46], &rnding, bit);
4084 v[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit);
4085 v[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit);
4086 v[49] = half_btf_0_sse4_1(&cospi29, &u[46], &rnding, bit);
4087 v[50] = half_btf_0_sse4_1(&cospi19, &u[50], &rnding, bit);
4088 v[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit);
4089 v[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit);
4090 v[53] = half_btf_0_sse4_1(&cospi21, &u[42], &rnding, bit);
4091 v[54] = half_btf_0_sse4_1(&cospi27, &u[54], &rnding, bit);
4092 v[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit);
4093 v[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit);
4094 v[57] = half_btf_0_sse4_1(&cospi25, &u[38], &rnding, bit);
4095 v[58] = half_btf_0_sse4_1(&cospi23, &u[58], &rnding, bit);
4096 v[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit);
4097 v[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit);
4098 v[61] = half_btf_0_sse4_1(&cospi17, &u[34], &rnding, bit);
4099 v[62] = half_btf_0_sse4_1(&cospi31, &u[62], &rnding, bit);
4100 v[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit);
4101
4102 // stage 3
4103 u[16] = half_btf_0_sse4_1(&cospi62, &v[16], &rnding, bit);
4104 u[17] = half_btf_0_sse4_1(&cospim34, &v[30], &rnding, bit);
4105 u[18] = half_btf_0_sse4_1(&cospi46, &v[18], &rnding, bit);
4106 u[19] = half_btf_0_sse4_1(&cospim50, &v[28], &rnding, bit);
4107 u[20] = half_btf_0_sse4_1(&cospi54, &v[20], &rnding, bit);
4108 u[21] = half_btf_0_sse4_1(&cospim42, &v[26], &rnding, bit);
4109 u[22] = half_btf_0_sse4_1(&cospi38, &v[22], &rnding, bit);
4110 u[23] = half_btf_0_sse4_1(&cospim58, &v[24], &rnding, bit);
4111 u[24] = half_btf_0_sse4_1(&cospi6, &v[24], &rnding, bit);
4112 u[25] = half_btf_0_sse4_1(&cospi26, &v[22], &rnding, bit);
4113 u[26] = half_btf_0_sse4_1(&cospi22, &v[26], &rnding, bit);
4114 u[27] = half_btf_0_sse4_1(&cospi10, &v[20], &rnding, bit);
4115 u[28] = half_btf_0_sse4_1(&cospi14, &v[28], &rnding, bit);
4116 u[29] = half_btf_0_sse4_1(&cospi18, &v[18], &rnding, bit);
4117 u[30] = half_btf_0_sse4_1(&cospi30, &v[30], &rnding, bit);
4118 u[31] = half_btf_0_sse4_1(&cospi2, &v[16], &rnding, bit);
4119
4120 for (i = 32; i < 64; i += 4) {
4121 addsub_sse4_1(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
4122 &clamp_hi);
4123 addsub_sse4_1(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
4124 &clamp_hi);
4125 }
4126
4127 // stage 4
4128 v[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
4129 v[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit);
4130 v[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit);
4131 v[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
4132 v[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);
4133 v[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit);
4134 v[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit);
4135 v[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
4136
4137 for (i = 16; i < 32; i += 4) {
4138 addsub_sse4_1(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo,
4139 &clamp_hi);
4140 addsub_sse4_1(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo,
4141 &clamp_hi);
4142 }
4143
4144 for (i = 32; i < 64; i += 4) {
4145 v[i + 0] = u[i + 0];
4146 v[i + 3] = u[i + 3];
4147 }
4148
4149 v[33] = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
4150 v[34] = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
4151 v[37] = half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
4152 v[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
4153 v[41] = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
4154 v[42] = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
4155 v[45] = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
4156 v[46] = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
4157 v[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
4158 v[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
4159 v[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
4160 v[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
4161 v[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
4162 v[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
4163 v[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
4164 v[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
4165
4166 // stage 5
4167 u[4] = half_btf_0_sse4_1(&cospi56, &v[4], &rnding, bit);
4168 u[5] = half_btf_0_sse4_1(&cospim40, &v[6], &rnding, bit);
4169 u[6] = half_btf_0_sse4_1(&cospi24, &v[6], &rnding, bit);
4170 u[7] = half_btf_0_sse4_1(&cospi8, &v[4], &rnding, bit);
4171
4172 for (i = 8; i < 16; i += 4) {
4173 addsub_sse4_1(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
4174 &clamp_hi);
4175 addsub_sse4_1(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
4176 &clamp_hi);
4177 }
4178
4179 for (i = 16; i < 32; i += 4) {
4180 u[i + 0] = v[i + 0];
4181 u[i + 3] = v[i + 3];
4182 }
4183
4184 u[17] = half_btf_sse4_1(&cospim8, &v[17], &cospi56, &v[30], &rnding, bit);
4185 u[18] = half_btf_sse4_1(&cospim56, &v[18], &cospim8, &v[29], &rnding, bit);
4186 u[21] = half_btf_sse4_1(&cospim40, &v[21], &cospi24, &v[26], &rnding, bit);
4187 u[22] = half_btf_sse4_1(&cospim24, &v[22], &cospim40, &v[25], &rnding, bit);
4188 u[25] = half_btf_sse4_1(&cospim40, &v[22], &cospi24, &v[25], &rnding, bit);
4189 u[26] = half_btf_sse4_1(&cospi24, &v[21], &cospi40, &v[26], &rnding, bit);
4190 u[29] = half_btf_sse4_1(&cospim8, &v[18], &cospi56, &v[29], &rnding, bit);
4191 u[30] = half_btf_sse4_1(&cospi56, &v[17], &cospi8, &v[30], &rnding, bit);
4192
4193 for (i = 32; i < 64; i += 8) {
4194 addsub_sse4_1(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
4195 &clamp_hi);
4196 addsub_sse4_1(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
4197 &clamp_hi);
4198
4199 addsub_sse4_1(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
4200 &clamp_hi);
4201 addsub_sse4_1(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
4202 &clamp_hi);
4203 }
4204
4205 // stage 6
4206 v[0] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
4207 v[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
4208 v[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit);
4209 v[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit);
4210
4211 addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
4212 addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
4213
4214 for (i = 8; i < 16; i += 4) {
4215 v[i + 0] = u[i + 0];
4216 v[i + 3] = u[i + 3];
4217 }
4218
4219 v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
4220 v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
4221 v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
4222 v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
4223
4224 for (i = 16; i < 32; i += 8) {
4225 addsub_sse4_1(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo,
4226 &clamp_hi);
4227 addsub_sse4_1(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo,
4228 &clamp_hi);
4229
4230 addsub_sse4_1(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo,
4231 &clamp_hi);
4232 addsub_sse4_1(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo,
4233 &clamp_hi);
4234 }
4235
4236 for (i = 32; i < 64; i += 8) {
4237 v[i + 0] = u[i + 0];
4238 v[i + 1] = u[i + 1];
4239 v[i + 6] = u[i + 6];
4240 v[i + 7] = u[i + 7];
4241 }
4242
4243 v[34] = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
4244 v[35] = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
4245 v[36] = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
4246 v[37] = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
4247 v[42] = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
4248 v[43] = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
4249 v[44] = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
4250 v[45] = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
4251 v[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
4252 v[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
4253 v[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
4254 v[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
4255 v[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
4256 v[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
4257 v[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
4258 v[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
4259
4260 // stage 7
4261 addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
4262 addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
4263
4264 u[4] = v[4];
4265 u[7] = v[7];
4266 u[5] = half_btf_sse4_1(&cospim32, &v[5], &cospi32, &v[6], &rnding, bit);
4267 u[6] = half_btf_sse4_1(&cospi32, &v[5], &cospi32, &v[6], &rnding, bit);
4268
4269 addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
4270 addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
4271 addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
4272 addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
4273
4274 for (i = 16; i < 32; i += 8) {
4275 u[i + 0] = v[i + 0];
4276 u[i + 1] = v[i + 1];
4277 u[i + 6] = v[i + 6];
4278 u[i + 7] = v[i + 7];
4279 }
4280
4281 u[18] = half_btf_sse4_1(&cospim16, &v[18], &cospi48, &v[29], &rnding, bit);
4282 u[19] = half_btf_sse4_1(&cospim16, &v[19], &cospi48, &v[28], &rnding, bit);
4283 u[20] = half_btf_sse4_1(&cospim48, &v[20], &cospim16, &v[27], &rnding, bit);
4284 u[21] = half_btf_sse4_1(&cospim48, &v[21], &cospim16, &v[26], &rnding, bit);
4285 u[26] = half_btf_sse4_1(&cospim16, &v[21], &cospi48, &v[26], &rnding, bit);
4286 u[27] = half_btf_sse4_1(&cospim16, &v[20], &cospi48, &v[27], &rnding, bit);
4287 u[28] = half_btf_sse4_1(&cospi48, &v[19], &cospi16, &v[28], &rnding, bit);
4288 u[29] = half_btf_sse4_1(&cospi48, &v[18], &cospi16, &v[29], &rnding, bit);
4289
4290 for (i = 32; i < 64; i += 16) {
4291 for (j = i; j < i + 4; j++) {
4292 addsub_sse4_1(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
4293 addsub_sse4_1(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
4294 &clamp_hi);
4295 }
4296 }
4297
4298 // stage 8
4299 for (i = 0; i < 4; ++i) {
4300 addsub_sse4_1(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi);
4301 }
4302
4303 v[8] = u[8];
4304 v[9] = u[9];
4305 v[14] = u[14];
4306 v[15] = u[15];
4307
4308 v[10] = half_btf_sse4_1(&cospim32, &u[10], &cospi32, &u[13], &rnding, bit);
4309 v[11] = half_btf_sse4_1(&cospim32, &u[11], &cospi32, &u[12], &rnding, bit);
4310 v[12] = half_btf_sse4_1(&cospi32, &u[11], &cospi32, &u[12], &rnding, bit);
4311 v[13] = half_btf_sse4_1(&cospi32, &u[10], &cospi32, &u[13], &rnding, bit);
4312
4313 for (i = 16; i < 20; ++i) {
4314 addsub_sse4_1(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi);
4315 addsub_sse4_1(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo,
4316 &clamp_hi);
4317 }
4318
4319 for (i = 32; i < 36; ++i) {
4320 v[i] = u[i];
4321 v[i + 12] = u[i + 12];
4322 v[i + 16] = u[i + 16];
4323 v[i + 28] = u[i + 28];
4324 }
4325
4326 v[36] = half_btf_sse4_1(&cospim16, &u[36], &cospi48, &u[59], &rnding, bit);
4327 v[37] = half_btf_sse4_1(&cospim16, &u[37], &cospi48, &u[58], &rnding, bit);
4328 v[38] = half_btf_sse4_1(&cospim16, &u[38], &cospi48, &u[57], &rnding, bit);
4329 v[39] = half_btf_sse4_1(&cospim16, &u[39], &cospi48, &u[56], &rnding, bit);
4330 v[40] = half_btf_sse4_1(&cospim48, &u[40], &cospim16, &u[55], &rnding, bit);
4331 v[41] = half_btf_sse4_1(&cospim48, &u[41], &cospim16, &u[54], &rnding, bit);
4332 v[42] = half_btf_sse4_1(&cospim48, &u[42], &cospim16, &u[53], &rnding, bit);
4333 v[43] = half_btf_sse4_1(&cospim48, &u[43], &cospim16, &u[52], &rnding, bit);
4334 v[52] = half_btf_sse4_1(&cospim16, &u[43], &cospi48, &u[52], &rnding, bit);
4335 v[53] = half_btf_sse4_1(&cospim16, &u[42], &cospi48, &u[53], &rnding, bit);
4336 v[54] = half_btf_sse4_1(&cospim16, &u[41], &cospi48, &u[54], &rnding, bit);
4337 v[55] = half_btf_sse4_1(&cospim16, &u[40], &cospi48, &u[55], &rnding, bit);
4338 v[56] = half_btf_sse4_1(&cospi48, &u[39], &cospi16, &u[56], &rnding, bit);
4339 v[57] = half_btf_sse4_1(&cospi48, &u[38], &cospi16, &u[57], &rnding, bit);
4340 v[58] = half_btf_sse4_1(&cospi48, &u[37], &cospi16, &u[58], &rnding, bit);
4341 v[59] = half_btf_sse4_1(&cospi48, &u[36], &cospi16, &u[59], &rnding, bit);
4342
4343 // stage 9
4344 for (i = 0; i < 8; ++i) {
4345 addsub_sse4_1(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi);
4346 }
4347
4348 for (i = 16; i < 20; ++i) {
4349 u[i] = v[i];
4350 u[i + 12] = v[i + 12];
4351 }
4352
4353 u[20] = half_btf_sse4_1(&cospim32, &v[20], &cospi32, &v[27], &rnding, bit);
4354 u[21] = half_btf_sse4_1(&cospim32, &v[21], &cospi32, &v[26], &rnding, bit);
4355 u[22] = half_btf_sse4_1(&cospim32, &v[22], &cospi32, &v[25], &rnding, bit);
4356 u[23] = half_btf_sse4_1(&cospim32, &v[23], &cospi32, &v[24], &rnding, bit);
4357 u[24] = half_btf_sse4_1(&cospi32, &v[23], &cospi32, &v[24], &rnding, bit);
4358 u[25] = half_btf_sse4_1(&cospi32, &v[22], &cospi32, &v[25], &rnding, bit);
4359 u[26] = half_btf_sse4_1(&cospi32, &v[21], &cospi32, &v[26], &rnding, bit);
4360 u[27] = half_btf_sse4_1(&cospi32, &v[20], &cospi32, &v[27], &rnding, bit);
4361
4362 for (i = 32; i < 40; i++) {
4363 addsub_sse4_1(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi);
4364 }
4365
4366 for (i = 48; i < 56; i++) {
4367 addsub_sse4_1(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi);
4368 }
4369
4370 // stage 10
4371 for (i = 0; i < 16; i++) {
4372 addsub_sse4_1(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi);
4373 }
4374
4375 for (i = 32; i < 40; i++) v[i] = u[i];
4376
4377 v[40] = half_btf_sse4_1(&cospim32, &u[40], &cospi32, &u[55], &rnding, bit);
4378 v[41] = half_btf_sse4_1(&cospim32, &u[41], &cospi32, &u[54], &rnding, bit);
4379 v[42] = half_btf_sse4_1(&cospim32, &u[42], &cospi32, &u[53], &rnding, bit);
4380 v[43] = half_btf_sse4_1(&cospim32, &u[43], &cospi32, &u[52], &rnding, bit);
4381 v[44] = half_btf_sse4_1(&cospim32, &u[44], &cospi32, &u[51], &rnding, bit);
4382 v[45] = half_btf_sse4_1(&cospim32, &u[45], &cospi32, &u[50], &rnding, bit);
4383 v[46] = half_btf_sse4_1(&cospim32, &u[46], &cospi32, &u[49], &rnding, bit);
4384 v[47] = half_btf_sse4_1(&cospim32, &u[47], &cospi32, &u[48], &rnding, bit);
4385 v[48] = half_btf_sse4_1(&cospi32, &u[47], &cospi32, &u[48], &rnding, bit);
4386 v[49] = half_btf_sse4_1(&cospi32, &u[46], &cospi32, &u[49], &rnding, bit);
4387 v[50] = half_btf_sse4_1(&cospi32, &u[45], &cospi32, &u[50], &rnding, bit);
4388 v[51] = half_btf_sse4_1(&cospi32, &u[44], &cospi32, &u[51], &rnding, bit);
4389 v[52] = half_btf_sse4_1(&cospi32, &u[43], &cospi32, &u[52], &rnding, bit);
4390 v[53] = half_btf_sse4_1(&cospi32, &u[42], &cospi32, &u[53], &rnding, bit);
4391 v[54] = half_btf_sse4_1(&cospi32, &u[41], &cospi32, &u[54], &rnding, bit);
4392 v[55] = half_btf_sse4_1(&cospi32, &u[40], &cospi32, &u[55], &rnding, bit);
4393
4394 for (i = 56; i < 64; i++) v[i] = u[i];
4395
4396 // stage 11
4397 for (i = 0; i < 32; i++) {
4398 addsub_sse4_1(v[i], v[63 - i], &out[(i)], &out[(63 - i)], &clamp_lo,
4399 &clamp_hi);
4400 }
4401
4402 if (!do_cols) {
4404 const int log_range_out = AOMMAX(16, bd + 6);
4404 const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
4405 const __m128i clamp_hi_out =
4406 _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
4407 for (i = 0; i < 64; i += 4) {
4408 round_shift_4x4(out + i, out_shift);
4409 highbd_clamp_epi32_sse4_1(out + i, out + i, &clamp_lo_out,
4410 &clamp_hi_out, 4);
4411 }
4412 }
4413 }
4414}
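
Scalar picture of the out_shift epilogue above (a sketch of the helpers' behavior; round_shift_4x4 and highbd_clamp_epi32_sse4_1 do this four lanes at a time):

  /* per 32-bit coefficient x */
  x = (x + ((1 << out_shift) >> 1)) >> out_shift;  /* round, then shift */
  const int32_t lo = -(1 << (log_range_out - 1));
  const int32_t hi = (1 << (log_range_out - 1)) - 1;
  x = x < lo ? lo : (x > hi ? hi : x);             /* clamp to output range */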
4415
4416static void idct32x32_low1_sse4_1(__m128i *in, __m128i *out, int bit,
4417 int do_cols, int bd, int out_shift) {
4418 const int32_t *cospi = cospi_arr(bit);
4419 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
4420 const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
Step 1: Assuming right operand of bit shift is non-negative but less than 32
4421 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
Step 2: Assuming 'do_cols' is 0
Step 3: '?' condition is false
Step 4: Assuming the condition is true
Step 5: '?' condition is true
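
A worked instance of this range computation (values illustrative): with do_cols == 0 and bd = 8 the second operand is 16, so

  const int log_range = AOMMAX(16, 8 + 8);           /* 16 */
  __m128i clamp_lo = _mm_set1_epi32(-(1 << 15));     /* -32768 */
  __m128i clamp_hi = _mm_set1_epi32((1 << 15) - 1);  /*  32767 */

Here log_range - 1 = 15 < 32 and both shifts are well-defined; the report below concerns a path where bd is not bounded this way.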
4422 __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
4423 __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
4424 __m128i bf1;
4425
4426 // stage 0
4427 // stage 1
4428 bf1 = in[0];
4429
4430 // stage 2
4431 // stage 3
4432 // stage 4
4433 // stage 5
4434 bf1 = half_btf_0_sse4_1(&cospi32, &bf1, &rounding, bit);
4435
4436 // stage 6
4437 // stage 7
4438 // stage 8
4439 // stage 9
4440 if (do_cols) {
Step 5.1: 'do_cols' is 0
4441 bf1 = _mm_max_epi32(bf1, clamp_lo);
4442 bf1 = _mm_min_epi32(bf1, clamp_hi);
4443 } else {
4444 const int log_range_out = AOMMAX(16, bd + 6);
Step 6: Taking false branch
Step 7: Assuming the condition is false
Step 8: '?' condition is false
4445 clamp_lo = _mm_set1_epi32(-(1 << (log_range_out - 1)));
Step 9: The result of left shift is undefined because the right operand is not smaller than 32, the capacity of 'int'
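
A minimal scalar sketch of the flagged path (the bd value is illustrative; nothing on this path constrains it): step 7 assumed 16 > bd + 6 is false, so log_range_out == bd + 6, giving

  int bd = 27;                           /* unconstrained on this path */
  int log_range_out = bd + 6;            /* 33 */
  int lo = -(1 << (log_range_out - 1));  /* 1 << 32 is undefined for int */

In practice AV1 bit depths are 8, 10, or 12, so log_range_out is at most 18 and the shift is well-defined; that bound is simply not visible to the analyzer in this translation unit.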
4446 clamp_hi = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
4447 if (out_shift != 0) {
4448 __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
4449 bf1 = _mm_add_epi32(bf1, offset);
4450 bf1 = _mm_sra_epi32(bf1, _mm_cvtsi32_si128(out_shift));
4451 }
4452 }
4453
4454 bf1 = _mm_max_epi32(bf1, clamp_lo);
4455 bf1 = _mm_min_epi32(bf1, clamp_hi);
4456 out[0] = bf1;
4457 out[1] = bf1;
4458 out[2] = bf1;
4459 out[3] = bf1;
4460 out[4] = bf1;
4461 out[5] = bf1;
4462 out[6] = bf1;
4463 out[7] = bf1;
4464 out[8] = bf1;
4465 out[9] = bf1;
4466 out[10] = bf1;
4467 out[11] = bf1;
4468 out[12] = bf1;
4469 out[13] = bf1;
4470 out[14] = bf1;
4471 out[15] = bf1;
4472 out[16] = bf1;
4473 out[17] = bf1;
4474 out[18] = bf1;
4475 out[19] = bf1;
4476 out[20] = bf1;
4477 out[21] = bf1;
4478 out[22] = bf1;
4479 out[23] = bf1;
4480 out[24] = bf1;
4481 out[25] = bf1;
4482 out[26] = bf1;
4483 out[27] = bf1;
4484 out[28] = bf1;
4485 out[29] = bf1;
4486 out[30] = bf1;
4487 out[31] = bf1;
4488}
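
One way to make such a bound explicit, sketched as a hypothetical rewrite rather than the project's actual fix (clamp_bound_lo and the assert are assumptions introduced here): assert the bit depth and widen the shift so the analyzer can prove it defined.

  #include <assert.h>
  #include <stdint.h>

  static int32_t clamp_bound_lo(int bd) {
    assert(bd <= 12);  /* AV1 bit depths are 8, 10, or 12 */
    const int log_range_out = AOMMAX(16, bd + 6);            /* at most 18 */
    return (int32_t)(-((int64_t)1 << (log_range_out - 1)));  /* defined shift */
  }

The real code computes the bound inline and splats it with _mm_set1_epi32.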
4489
4490static void idct32x32_low8_sse4_1(__m128i *in, __m128i *out, int bit,
4491 int do_cols, int bd, int out_shift) {
4492 const int32_t *cospi = cospi_arr(bit);
4493 const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
4494 const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
4495 const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
4496 const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
4497 const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
4498 const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
4499 const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
4500 const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
4501 const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
4502 const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
4503 const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
4504 const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
4505 const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
4506 const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
4507 const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
4508 const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
4509 const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
4510 const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
4511 const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
4512 const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
4513 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
4514 const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
4515 const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
4516 const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
4517 const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
4518 const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
4519 const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
4520 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
4521 const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
4522 const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
4523 __m128i bf1[32];
4524
4525 // stage 0
4526 // stage 1
4527 bf1[0] = in[0];
4528 bf1[4] = in[4];
4529 bf1[8] = in[2];
4530 bf1[12] = in[6];
4531 bf1[16] = in[1];
4532 bf1[20] = in[5];
4533 bf1[24] = in[3];
4534 bf1[28] = in[7];
4535
4536 // stage 2
4537 bf1[31] = half_btf_0_sse4_1(&cospi2, &bf1[16], &rounding, bit);
4538 bf1[16] = half_btf_0_sse4_1(&cospi62, &bf1[16], &rounding, bit);
4539 bf1[19] = half_btf_0_sse4_1(&cospim50, &bf1[28], &rounding, bit);
4540 bf1[28] = half_btf_0_sse4_1(&cospi14, &bf1[28], &rounding, bit);
4541 bf1[27] = half_btf_0_sse4_1(&cospi10, &bf1[20], &rounding, bit);
4542 bf1[20] = half_btf_0_sse4_1(&cospi54, &bf1[20], &rounding, bit);
4543 bf1[23] = half_btf_0_sse4_1(&cospim58, &bf1[24], &rounding, bit);
4544 bf1[24] = half_btf_0_sse4_1(&cospi6, &bf1[24], &rounding, bit);
4545
4546 // stage 3
4547 bf1[15] = half_btf_0_sse4_1(&cospi4, &bf1[8], &rounding, bit);
4548 bf1[8] = half_btf_0_sse4_1(&cospi60, &bf1[8], &rounding, bit);
4549
4550 bf1[11] = half_btf_0_sse4_1(&cospim52, &bf1[12], &rounding, bit);
4551 bf1[12] = half_btf_0_sse4_1(&cospi12, &bf1[12], &rounding, bit);
4552 bf1[17] = bf1[16];
4553 bf1[18] = bf1[19];
4554 bf1[21] = bf1[20];
4555 bf1[22] = bf1[23];
4556 bf1[25] = bf1[24];
4557 bf1[26] = bf1[27];
4558 bf1[29] = bf1[28];
4559 bf1[30] = bf1[31];
4560
4561 // stage 4
4562 bf1[7] = half_btf_0_sse4_1(&cospi8, &bf1[4], &rounding, bit);
4563 bf1[4] = half_btf_0_sse4_1(&cospi56, &bf1[4], &rounding, bit);
4564
4565 bf1[9] = bf1[8];
4566 bf1[10] = bf1[11];
4567 bf1[13] = bf1[12];
4568 bf1[14] = bf1[15];
4569
4570 idct32_stage4_sse4_1(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
4571 &cospi24, &cospi40, &cospim24, &rounding, bit);
4572
4573 // stage 5
4574 bf1[0] = half_btf_0_sse4_1(&cospi32, &bf1[0], &rounding, bit);
4575 bf1[1] = bf1[0];
4576 bf1[5] = bf1[4];
4577 bf1[6] = bf1[7];
4578
4579 idct32_stage5_sse4_1(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
4580 &clamp_hi, &rounding, bit);
4581
4582 // stage 6
4583 bf1[3] = bf1[0];
4584 bf1[2] = bf1[1];
4585
4586 idct32_stage6_sse4_1(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
4587 &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
4588
4589 // stage 7
4590 idct32_stage7_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
4591 &rounding, bit);
4592
4593 // stage 8
4594 idct32_stage8_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
4595 &rounding, bit);
4596
4597 // stage 9
4598 idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
4599}
4600
4601static void idct32x32_low16_sse4_1(__m128i *in, __m128i *out, int bit,
4602 int do_cols, int bd, int out_shift) {
4603 const int32_t *cospi = cospi_arr(bit);
4604 const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
4605 const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
4606 const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
4607 const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
4608 const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
4609 const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
4610 const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
4611 const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
4612 const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
4613 const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
4614 const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
4615 const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
4616 const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
4617 const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
4618 const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
4619 const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
4620 const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
4621 const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
4622 const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
4623 const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
4624 const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
4625 const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
4626 const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
4627 const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
4628 const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
4629 const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
4630 const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
4631 const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
4632 const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
4633 const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
4634 const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
4635 const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
4636 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
4637 const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
4638 const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
4639 const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
4640 const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
4641 const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
4642 const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
4643 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
4644 const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
4645 const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
4646 __m128i bf1[32];
4647
4648 // stage 0
4649 // stage 1
4650
4651 bf1[0] = in[0];
4652 bf1[2] = in[8];
4653 bf1[4] = in[4];
4654 bf1[6] = in[12];
4655 bf1[8] = in[2];
4656 bf1[10] = in[10];
4657 bf1[12] = in[6];
4658 bf1[14] = in[14];
4659 bf1[16] = in[1];
4660 bf1[18] = in[9];
4661 bf1[20] = in[5];
4662 bf1[22] = in[13];
4663 bf1[24] = in[3];
4664 bf1[26] = in[11];
4665 bf1[28] = in[7];
4666 bf1[30] = in[15];
4667
4668 // stage 2
4669 bf1[31] = half_btf_0_sse4_1(&cospi2, &bf1[16], &rounding, bit);
4670 bf1[16] = half_btf_0_sse4_1(&cospi62, &bf1[16], &rounding, bit);
4671 bf1[17] = half_btf_0_sse4_1(&cospim34, &bf1[30], &rounding, bit);
4672 bf1[30] = half_btf_0_sse4_1(&cospi30, &bf1[30], &rounding, bit);
4673 bf1[29] = half_btf_0_sse4_1(&cospi18, &bf1[18], &rounding, bit);
4674 bf1[18] = half_btf_0_sse4_1(&cospi46, &bf1[18], &rounding, bit);
4675 bf1[19] = half_btf_0_sse4_1(&cospim50, &bf1[28], &rounding, bit);
4676 bf1[28] = half_btf_0_sse4_1(&cospi14, &bf1[28], &rounding, bit);
4677 bf1[27] = half_btf_0_sse4_1(&cospi10, &bf1[20], &rounding, bit);
4678 bf1[20] = half_btf_0_sse4_1(&cospi54, &bf1[20], &rounding, bit);
4679 bf1[21] = half_btf_0_sse4_1(&cospim42, &bf1[26], &rounding, bit);
4680 bf1[26] = half_btf_0_sse4_1(&cospi22, &bf1[26], &rounding, bit);
4681 bf1[25] = half_btf_0_sse4_1(&cospi26, &bf1[22], &rounding, bit);
4682 bf1[22] = half_btf_0_sse4_1(&cospi38, &bf1[22], &rounding, bit);
4683 bf1[23] = half_btf_0_sse4_1(&cospim58, &bf1[24], &rounding, bit);
4684 bf1[24] = half_btf_0_sse4_1(&cospi6, &bf1[24], &rounding, bit);
4685
4686 // stage 3
4687 bf1[15] = half_btf_0_sse4_1(&cospi4, &bf1[8], &rounding, bit);
4688 bf1[8] = half_btf_0_sse4_1(&cospi60, &bf1[8], &rounding, bit);
4689 bf1[9] = half_btf_0_sse4_1(&cospim36, &bf1[14], &rounding, bit);
4690 bf1[14] = half_btf_0_sse4_1(&cospi28, &bf1[14], &rounding, bit);
4691 bf1[13] = half_btf_0_sse4_1(&cospi20, &bf1[10], &rounding, bit);
4692 bf1[10] = half_btf_0_sse4_1(&cospi44, &bf1[10], &rounding, bit);
4693 bf1[11] = half_btf_0_sse4_1(&cospim52, &bf1[12], &rounding, bit);
4694 bf1[12] = half_btf_0_sse4_1(&cospi12, &bf1[12], &rounding, bit);
4695
4696 addsub_sse4_1(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
4697 addsub_sse4_1(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
4698 addsub_sse4_1(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
4699 addsub_sse4_1(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
4700 addsub_sse4_1(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
4701 addsub_sse4_1(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
4702 addsub_sse4_1(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
4703 addsub_sse4_1(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
4704 // stage 4
4705 bf1[7] = half_btf_0_sse4_1(&cospi8, &bf1[4], &rounding, bit);
4706 bf1[4] = half_btf_0_sse4_1(&cospi56, &bf1[4], &rounding, bit);
4707 bf1[5] = half_btf_0_sse4_1(&cospim40, &bf1[6], &rounding, bit);
4708 bf1[6] = half_btf_0_sse4_1(&cospi24, &bf1[6], &rounding, bit);
4709
4710 addsub_sse4_1(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi);
4711 addsub_sse4_1(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi);
4712 addsub_sse4_1(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi);
4713 addsub_sse4_1(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi);
4714
4715 idct32_stage4_sse4_1(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
4716 &cospi24, &cospi40, &cospim24, &rounding, bit);
4717
4718 // stage 5
4719 bf1[0] = half_btf_0_sse4_1(&cospi32, &bf1[0], &rounding, bit);
4720 bf1[1] = bf1[0];
4721 bf1[3] = half_btf_0_sse4_1(&cospi16, &bf1[2], &rounding, bit);
4722 bf1[2] = half_btf_0_sse4_1(&cospi48, &bf1[2], &rounding, bit);
4723
4724 addsub_sse4_1(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
4725 addsub_sse4_1(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
4726
4727 idct32_stage5_sse4_1(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
4728 &clamp_hi, &rounding, bit);
4729
4730 // stage 6
4731 addsub_sse4_1(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi);
4732 addsub_sse4_1(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi);
4733
4734 idct32_stage6_sse4_1(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
4735 &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
4736
4737 // stage 7
4738 idct32_stage7_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
4739 &rounding, bit);
4740
4741 // stage 8
4742 idct32_stage8_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
4743 &rounding, bit);
4744 // stage 9
4745 idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
4746}
4747
4748static void idct32x32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
4749 int bd, int out_shift) {
4750 const int32_t *cospi = cospi_arr(bit);
4751 const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
4752 const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
4753 const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
4754 const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
4755 const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
4756 const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
4757 const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
4758 const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
4759 const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
4760 const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
4761 const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
4762 const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
4763 const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
4764 const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
4765 const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
4766 const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
4767 const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
4768 const __m128i cospim26 = _mm_set1_epi32(-cospi[26]);
4769 const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
4770 const __m128i cospim10 = _mm_set1_epi32(-cospi[10]);
4771 const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
4772 const __m128i cospim18 = _mm_set1_epi32(-cospi[18]);
4773 const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
4774 const __m128i cospim2 = _mm_set1_epi32(-cospi[2]);
4775 const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
4776 const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
4777 const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
4778 const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
4779 const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
4780 const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
4781 const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
4782 const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
4783 const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
4784 const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
4785 const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
4786 const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
4787 const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
4788 const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
4789 const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
4790 const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
4791 const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
4792 const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
4793 const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
4794 const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
4795 const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
4796 const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
4797 const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
4798 const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
4799 const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
4800 const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
4801 const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
4802 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
4803 const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
4804 const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
4805 __m128i bf1[32], bf0[32];
4806
4807 // stage 0
4808 // stage 1
4809 bf1[0] = in[0];
4810 bf1[1] = in[16];
4811 bf1[2] = in[8];
4812 bf1[3] = in[24];
4813 bf1[4] = in[4];
4814 bf1[5] = in[20];
4815 bf1[6] = in[12];
4816 bf1[7] = in[28];
4817 bf1[8] = in[2];
4818 bf1[9] = in[18];
4819 bf1[10] = in[10];
4820 bf1[11] = in[26];
4821 bf1[12] = in[6];
4822 bf1[13] = in[22];
4823 bf1[14] = in[14];
4824 bf1[15] = in[30];
4825 bf1[16] = in[1];
4826 bf1[17] = in[17];
4827 bf1[18] = in[9];
4828 bf1[19] = in[25];
4829 bf1[20] = in[5];
4830 bf1[21] = in[21];
4831 bf1[22] = in[13];
4832 bf1[23] = in[29];
4833 bf1[24] = in[3];
4834 bf1[25] = in[19];
4835 bf1[26] = in[11];
4836 bf1[27] = in[27];
4837 bf1[28] = in[7];
4838 bf1[29] = in[23];
4839 bf1[30] = in[15];
4840 bf1[31] = in[31];
4841
4842 // stage 2
4843 bf0[0] = bf1[0];
4844 bf0[1] = bf1[1];
4845 bf0[2] = bf1[2];
4846 bf0[3] = bf1[3];
4847 bf0[4] = bf1[4];
4848 bf0[5] = bf1[5];
4849 bf0[6] = bf1[6];
4850 bf0[7] = bf1[7];
4851 bf0[8] = bf1[8];
4852 bf0[9] = bf1[9];
4853 bf0[10] = bf1[10];
4854 bf0[11] = bf1[11];
4855 bf0[12] = bf1[12];
4856 bf0[13] = bf1[13];
4857 bf0[14] = bf1[14];
4858 bf0[15] = bf1[15];
4859 bf0[16] =
4860 half_btf_sse4_1(&cospi62, &bf1[16], &cospim2, &bf1[31], &rounding, bit);
4861 bf0[17] =
4862 half_btf_sse4_1(&cospi30, &bf1[17], &cospim34, &bf1[30], &rounding, bit);
4863 bf0[18] =
4864 half_btf_sse4_1(&cospi46, &bf1[18], &cospim18, &bf1[29], &rounding, bit);
4865 bf0[19] =
4866 half_btf_sse4_1(&cospi14, &bf1[19], &cospim50, &bf1[28], &rounding, bit);
4867 bf0[20] =
4868 half_btf_sse4_1(&cospi54, &bf1[20], &cospim10, &bf1[27], &rounding, bit);
4869 bf0[21] =
4870 half_btf_sse4_1(&cospi22, &bf1[21], &cospim42, &bf1[26], &rounding, bit);
4871 bf0[22] =
4872 half_btf_sse4_1(&cospi38, &bf1[22], &cospim26, &bf1[25], &rounding, bit);
4873 bf0[23] =
4874 half_btf_sse4_1(&cospi6, &bf1[23], &cospim58, &bf1[24], &rounding, bit);
4875 bf0[24] =
4876 half_btf_sse4_1(&cospi58, &bf1[23], &cospi6, &bf1[24], &rounding, bit);
4877 bf0[25] =
4878 half_btf_sse4_1(&cospi26, &bf1[22], &cospi38, &bf1[25], &rounding, bit);
4879 bf0[26] =
4880 half_btf_sse4_1(&cospi42, &bf1[21], &cospi22, &bf1[26], &rounding, bit);
4881 bf0[27] =
4882 half_btf_sse4_1(&cospi10, &bf1[20], &cospi54, &bf1[27], &rounding, bit);
4883 bf0[28] =
4884 half_btf_sse4_1(&cospi50, &bf1[19], &cospi14, &bf1[28], &rounding, bit);
4885 bf0[29] =
4886 half_btf_sse4_1(&cospi18, &bf1[18], &cospi46, &bf1[29], &rounding, bit);
4887 bf0[30] =
4888 half_btf_sse4_1(&cospi34, &bf1[17], &cospi30, &bf1[30], &rounding, bit);
4889 bf0[31] =
4890 half_btf_sse4_1(&cospi2, &bf1[16], &cospi62, &bf1[31], &rounding, bit);
4891
4892 // stage 3
4893 bf1[0] = bf0[0];
4894 bf1[1] = bf0[1];
4895 bf1[2] = bf0[2];
4896 bf1[3] = bf0[3];
4897 bf1[4] = bf0[4];
4898 bf1[5] = bf0[5];
4899 bf1[6] = bf0[6];
4900 bf1[7] = bf0[7];
4901 bf1[8] =
4902 half_btf_sse4_1(&cospi60, &bf0[8], &cospim4, &bf0[15], &rounding, bit);
4903 bf1[9] =
4904 half_btf_sse4_1(&cospi28, &bf0[9], &cospim36, &bf0[14], &rounding, bit);
4905 bf1[10] =
4906 half_btf_sse4_1(&cospi44, &bf0[10], &cospim20, &bf0[13], &rounding, bit);
4907 bf1[11] =
4908 half_btf_sse4_1(&cospi12, &bf0[11], &cospim52, &bf0[12], &rounding, bit);
4909 bf1[12] =
4910 half_btf_sse4_1(&cospi52, &bf0[11], &cospi12, &bf0[12], &rounding, bit);
4911 bf1[13] =
4912 half_btf_sse4_1(&cospi20, &bf0[10], &cospi44, &bf0[13], &rounding, bit);
4913 bf1[14] =
4914 half_btf_sse4_1(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit);
4915 bf1[15] =
4916 half_btf_sse4_1(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit);
4917
4918 addsub_sse4_1(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
4919 addsub_sse4_1(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
4920 addsub_sse4_1(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
4921 addsub_sse4_1(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
4922 addsub_sse4_1(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
4923 addsub_sse4_1(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
4924 addsub_sse4_1(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
4925 addsub_sse4_1(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
4926
4927 // stage 4
4928 bf0[0] = bf1[0];
4929 bf0[1] = bf1[1];
4930 bf0[2] = bf1[2];
4931 bf0[3] = bf1[3];
4932 bf0[4] =
4933 half_btf_sse4_1(&cospi56, &bf1[4], &cospim8, &bf1[7], &rounding, bit);
4934 bf0[5] =
4935 half_btf_sse4_1(&cospi24, &bf1[5], &cospim40, &bf1[6], &rounding, bit);
4936 bf0[6] =
4937 half_btf_sse4_1(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, bit);
4938 bf0[7] = half_btf_sse4_1(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit);
4939
4940 addsub_sse4_1(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi);
4941 addsub_sse4_1(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi);
4942 addsub_sse4_1(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi);
4943 addsub_sse4_1(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi);
4944
4945 bf0[16] = bf1[16];
4946 bf0[17] =
4947 half_btf_sse4_1(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit);
4948 bf0[18] =
4949 half_btf_sse4_1(&cospim56, &bf1[18], &cospim8, &bf1[29], &rounding, bit);
4950 bf0[19] = bf1[19];
4951 bf0[20] = bf1[20];
4952 bf0[21] =
4953 half_btf_sse4_1(&cospim40, &bf1[21], &cospi24, &bf1[26], &rounding, bit);
4954 bf0[22] =
4955 half_btf_sse4_1(&cospim24, &bf1[22], &cospim40, &bf1[25], &rounding, bit);
4956 bf0[23] = bf1[23];
4957 bf0[24] = bf1[24];
4958 bf0[25] =
4959 half_btf_sse4_1(&cospim40, &bf1[22], &cospi24, &bf1[25], &rounding, bit);
4960 bf0[26] =
4961 half_btf_sse4_1(&cospi24, &bf1[21], &cospi40, &bf1[26], &rounding, bit);
4962 bf0[27] = bf1[27];
4963 bf0[28] = bf1[28];
4964 bf0[29] =
4965 half_btf_sse4_1(&cospim8, &bf1[18], &cospi56, &bf1[29], &rounding, bit);
4966 bf0[30] =
4967 half_btf_sse4_1(&cospi56, &bf1[17], &cospi8, &bf1[30], &rounding, bit);
4968 bf0[31] = bf1[31];
4969
4970 // stage 5
4971 bf1[0] =
4972 half_btf_sse4_1(&cospi32, &bf0[0], &cospi32, &bf0[1], &rounding, bit);
4973 bf1[1] =
4974 half_btf_sse4_1(&cospi32, &bf0[0], &cospim32, &bf0[1], &rounding, bit);
4975 bf1[2] =
4976 half_btf_sse4_1(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit);
4977 bf1[3] =
4978 half_btf_sse4_1(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit);
4979 addsub_sse4_1(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
4980 addsub_sse4_1(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
4981 bf1[8] = bf0[8];
4982 bf1[9] =
4983 half_btf_sse4_1(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit);
4984 bf1[10] =
4985 half_btf_sse4_1(&cospim48, &bf0[10], &cospim16, &bf0[13], &rounding, bit);
4986 bf1[11] = bf0[11];
4987 bf1[12] = bf0[12];
4988 bf1[13] =
4989 half_btf_sse4_1(&cospim16, &bf0[10], &cospi48, &bf0[13], &rounding, bit);
4990 bf1[14] =
4991 half_btf_sse4_1(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit);
4992 bf1[15] = bf0[15];
4993 addsub_sse4_1(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi);
4994 addsub_sse4_1(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi);
4995 addsub_sse4_1(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi);
4996 addsub_sse4_1(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi);
4997 addsub_sse4_1(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi);
4998 addsub_sse4_1(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi);
4999 addsub_sse4_1(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi);
5000 addsub_sse4_1(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi);
5001
5002 // stage 6
5003 addsub_sse4_1(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi);
5004 addsub_sse4_1(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi);
5005 bf0[4] = bf1[4];
5006 bf0[5] =
5007 half_btf_sse4_1(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
5008 bf0[6] =
5009 half_btf_sse4_1(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
5010 bf0[7] = bf1[7];
5011 addsub_sse4_1(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi);
5012 addsub_sse4_1(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi);
5013 addsub_sse4_1(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi);
5014 addsub_sse4_1(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi);
5015 bf0[16] = bf1[16];
5016 bf0[17] = bf1[17];
5017 bf0[18] =
5018 half_btf_sse4_1(&cospim16, &bf1[18], &cospi48, &bf1[29], &rounding, bit);
5019 bf0[19] =
5020 half_btf_sse4_1(&cospim16, &bf1[19], &cospi48, &bf1[28], &rounding, bit);
5021 bf0[20] =
5022 half_btf_sse4_1(&cospim48, &bf1[20], &cospim16, &bf1[27], &rounding, bit);
5023 bf0[21] =
5024 half_btf_sse4_1(&cospim48, &bf1[21], &cospim16, &bf1[26], &rounding, bit);
5025 bf0[22] = bf1[22];
5026 bf0[23] = bf1[23];
5027 bf0[24] = bf1[24];
5028 bf0[25] = bf1[25];
5029 bf0[26] =
5030 half_btf_sse4_1(&cospim16, &bf1[21], &cospi48, &bf1[26], &rounding, bit);
5031 bf0[27] =
5032 half_btf_sse4_1(&cospim16, &bf1[20], &cospi48, &bf1[27], &rounding, bit);
5033 bf0[28] =
5034 half_btf_sse4_1(&cospi48, &bf1[19], &cospi16, &bf1[28], &rounding, bit);
5035 bf0[29] =
5036 half_btf_sse4_1(&cospi48, &bf1[18], &cospi16, &bf1[29], &rounding, bit);
5037 bf0[30] = bf1[30];
5038 bf0[31] = bf1[31];
5039
5040 // stage 7
5041 addsub_sse4_1(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi);
5042 addsub_sse4_1(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi);
5043 addsub_sse4_1(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi);
5044 addsub_sse4_1(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi);
5045 bf1[8] = bf0[8];
5046 bf1[9] = bf0[9];
5047 bf1[10] =
5048 half_btf_sse4_1(&cospim32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
5049 bf1[11] =
5050 half_btf_sse4_1(&cospim32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
5051 bf1[12] =
5052 half_btf_sse4_1(&cospi32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
5053 bf1[13] =
5054 half_btf_sse4_1(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
5055 bf1[14] = bf0[14];
5056 bf1[15] = bf0[15];
5057 addsub_sse4_1(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi);
5058 addsub_sse4_1(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi);
5059 addsub_sse4_1(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi);
5060 addsub_sse4_1(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi);
5061 addsub_sse4_1(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi);
5062 addsub_sse4_1(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi);
5063 addsub_sse4_1(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi);
5064 addsub_sse4_1(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi);
5065
5066 // stage 8
5067 addsub_sse4_1(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi);
5068 addsub_sse4_1(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi);
5069 addsub_sse4_1(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi);
5070 addsub_sse4_1(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi);
5071 addsub_sse4_1(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi);
5072 addsub_sse4_1(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi);
5073 addsub_sse4_1(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi);
5074 addsub_sse4_1(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi);
5075 bf0[16] = bf1[16];
5076 bf0[17] = bf1[17];
5077 bf0[18] = bf1[18];
5078 bf0[19] = bf1[19];
5079 bf0[20] =
5080 half_btf_sse4_1(&cospim32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
5081 bf0[21] =
5082 half_btf_sse4_1(&cospim32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
5083 bf0[22] =
5084 half_btf_sse4_1(&cospim32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
5085 bf0[23] =
5086 half_btf_sse4_1(&cospim32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
5087 bf0[24] =
5088 half_btf_sse4_1(&cospi32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
5089 bf0[25] =
5090 half_btf_sse4_1(&cospi32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
5091 bf0[26] =
5092 half_btf_sse4_1(&cospi32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
5093 bf0[27] =
5094 half_btf_sse4_1(&cospi32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
5095 bf0[28] = bf1[28];
5096 bf0[29] = bf1[29];
5097 bf0[30] = bf1[30];
5098 bf0[31] = bf1[31];
5099
5100 // stage 9
5101 addsub_sse4_1(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo, &clamp_hi);
5102 addsub_sse4_1(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo, &clamp_hi);
5103 addsub_sse4_1(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo, &clamp_hi);
5104 addsub_sse4_1(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo, &clamp_hi);
5105 addsub_sse4_1(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo, &clamp_hi);
5106 addsub_sse4_1(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo, &clamp_hi);
5107 addsub_sse4_1(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo, &clamp_hi);
5108 addsub_sse4_1(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo, &clamp_hi);
5109 addsub_sse4_1(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo, &clamp_hi);
5110 addsub_sse4_1(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo, &clamp_hi);
5111 addsub_sse4_1(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo, &clamp_hi);
5112 addsub_sse4_1(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo, &clamp_hi);
5113 addsub_sse4_1(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo, &clamp_hi);
5114 addsub_sse4_1(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo, &clamp_hi);
5115 addsub_sse4_1(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo, &clamp_hi);
5116 addsub_sse4_1(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo, &clamp_hi);
5117
5118 if (!do_cols) {
5119 const int log_range_out = AOMMAX(16, bd + 6);
5120 const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
5121 const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
5122 round_shift_8x8(out, out_shift);
5123 round_shift_8x8(out + 16, out_shift);
5124 highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 32);
5125 }
5126}
5127
5128static void av1_highbd_inv_txfm_add_8x8_sse4_1(const tran_low_t *input,
5129 uint8_t *dest, int stride,
5130 const TxfmParam *txfm_param) {
5131 int bd = txfm_param->bd;
5132 const TX_TYPE tx_type = txfm_param->tx_type;
5133 const int32_t *src = cast_to_int32(input);
5134 switch (tx_type) {
5135 case IDTX:
5136 case H_DCT:
5137 case H_ADST:
5138 case H_FLIPADST:
5139 case V_DCT:
5140 case V_ADST:
5141 case V_FLIPADST:
5142 av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
5143 txfm_param->tx_size,
5144 txfm_param->eob, bd);
5145 break;
5146 default:
5147 av1_inv_txfm2d_add_8x8_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride,
5148 tx_type, bd);
5149 break;
5150 }
5151}
5152static void av1_highbd_inv_txfm_add_4x4_sse4_1(const tran_low_t *input,
5153 uint8_t *dest, int stride,
5154 const TxfmParam *txfm_param) {
5155 assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
5156 int eob = txfm_param->eob;
5157 int bd = txfm_param->bd;
5158 int lossless = txfm_param->lossless;
5159 const int32_t *src = cast_to_int32(input);
5160 const TX_TYPE tx_type = txfm_param->tx_type;
5161 if (lossless) {
5162 assert(tx_type == DCT_DCT);
5163 av1_highbd_iwht4x4_add(input, dest, stride, eob, bd);
5164 return;
5165 }
5166 av1_inv_txfm2d_add_4x4_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
5167 bd);
5168}
5169static void iidentity32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
5170 int bd, int out_shift) {
5171 (void)bit;
5172 for (int i = 0; i < 32; i += 16) {
5173 out[i] = _mm_slli_epi32(in[i], 2);
5174 out[i + 1] = _mm_slli_epi32(in[i + 1], 2);
5175 out[i + 2] = _mm_slli_epi32(in[i + 2], 2);
5176 out[i + 3] = _mm_slli_epi32(in[i + 3], 2);
5177 out[i + 4] = _mm_slli_epi32(in[i + 4], 2);
5178 out[i + 5] = _mm_slli_epi32(in[i + 5], 2);
5179 out[i + 6] = _mm_slli_epi32(in[i + 6], 2);
5180 out[i + 7] = _mm_slli_epi32(in[i + 7], 2);
5181 out[i + 8] = _mm_slli_epi32(in[i + 8], 2);
5182 out[i + 9] = _mm_slli_epi32(in[i + 9], 2);
5183 out[i + 10] = _mm_slli_epi32(in[i + 10], 2);
5184 out[i + 11] = _mm_slli_epi32(in[i + 11], 2);
5185 out[i + 12] = _mm_slli_epi32(in[i + 12], 2);
5186 out[i + 13] = _mm_slli_epi32(in[i + 13], 2);
5187 out[i + 14] = _mm_slli_epi32(in[i + 14], 2);
5188 out[i + 15] = _mm_slli_epi32(in[i + 15], 2);
5189 }
5190
5191 if (!do_cols) {
5192 const int log_range_out = AOMMAX(16, bd + 6);
5193 const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
5194 const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
5195 round_shift_8x8(out, out_shift);
5196 round_shift_8x8(out + 16, out_shift);
5197 highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 32);
5198 }
5199}
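
Scalar reference for this kernel (a sketch, not code from this file): the 32-point inverse identity transform is a pure scale by 4, which is what the shift-by-2 above implements and why no cospi constants appear here.

  for (int i = 0; i < 32; ++i) out[i] = in[i] * 4;  /* Identity32: x4 */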
5200static const transform_1d_sse4_1
5201 highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
5202 {
5203 { idct4x4_sse4_1, NULL, NULL, NULL },
5204 { iadst4x4_sse4_1, NULL, NULL, NULL },
5205 { iidentity4_sse4_1, iidentity4_sse4_1, iidentity4_sse4_1, NULL },
5206 },
5207 { { idct8x8_low1_sse4_1, idct8x8_new_sse4_1, NULL, NULL },
5208 { iadst8x8_low1_sse4_1, iadst8x8_new_sse4_1, NULL, NULL },
5209 { iidentity8_sse4_1, iidentity8_sse4_1, NULL, NULL } },
5210 {
5211 { idct16x16_low1_sse4_1, idct16x16_low8_sse4_1, idct16x16_sse4_1,
5212 NULL },
5213 { iadst16x16_low1_sse4_1, iadst16x16_low8_sse4_1, iadst16x16_sse4_1,
5214 NULL },
5215 { iidentity16_sse4_1, NULL, iidentity16_sse4_1, NULL },
5216 },
5217 { { idct32x32_low1_sse4_1, idct32x32_low8_sse4_1, idct32x32_low16_sse4_1,
5218 idct32x32_sse4_1 },
5219 { NULL, NULL, NULL, NULL },
5220 { iidentity32_sse4_1, NULL, NULL, NULL } },
5221 { { idct64x64_low1_sse4_1, idct64x64_low8_sse4_1, idct64x64_low16_sse4_1,
5222 idct64x64_sse4_1 },
5223 { NULL, NULL, NULL, NULL },
5224 { NULL, NULL, NULL, NULL } }
5225 };
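
The table is indexed by 1-D transform size, 1-D transform family (DCT, ADST, identity), and an eob-derived class that picks a reduced-input variant. A hedged sketch of the lookup the callers below perform (the tx_size, tx_type, and eoby values are illustrative):

  const TX_SIZE tx_size = TX_32X32;
  const TX_TYPE tx_type = DCT_DCT;
  const int eoby = 5;  /* last nonzero row index from the eob scan */
  const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby];
  const transform_1d_sse4_1 col_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[get_txh_idx(tx_size)]
                                     [vitx_1d_tab[tx_type]][fun_idx];
  /* with few nonzero rows this resolves to e.g. idct32x32_low8_sse4_1 */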
5226static void highbd_inv_txfm2d_add_h_identity_ssse41(const int32_t *input,
5227 uint16_t *output,
5228 int stride, TX_TYPE tx_type,
5229 TX_SIZE tx_size, int eob,
5230 const int bd) {
5231 __m128i buf1[64];
5232 int eobx, eoby;
5233 get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
5234 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
5235 const int txw_idx = get_txw_idx(tx_size);
5236 const int txh_idx = get_txh_idx(tx_size);
5237 const int txfm_size_col = tx_size_wide[tx_size];
5238 const int txfm_size_row = tx_size_high[tx_size];
5239 const int buf_size_w = AOMMIN(32, txfm_size_col);
5240 const int buf_size_w_div4 = buf_size_w >> 2;
5241 const int buf_size_h_div8 = (eoby + 8) >> 3;
5242 const int row_max = AOMMIN(32, txfm_size_row);
5243 const int input_stride = row_max;
5244 const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
5245 const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby];
5246 const transform_1d_sse4_1 row_txfm =
5247 highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
5248 const transform_1d_sse4_1 col_txfm =
5249 highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx];
5250 int ud_flip, lr_flip;
5251 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
5252
5253 for (int i = 0; i < (buf_size_h_div8 << 1); ++i) {
5254 __m128i buf0[16];
5255 load_buffer_32bit_input(input + i * 4, input_stride, buf0, buf_size_w);
5256 if (rect_type == 1 || rect_type == -1) {
5257 av1_round_shift_rect_array_32_sse4_1(buf0, buf0, buf_size_w, 0,
5258 NewInvSqrt2);
5259 }
5260 row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
5261
5262 __m128i *_buf1 = buf1 + i * 4;
5263
5264 for (int j = 0; j < buf_size_w_div4; ++j) {
5265 __m128i *buf0_cur = buf0 + j * 4;
5266 TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3],
5267 buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]);
5268 _buf1[j * txfm_size_row + 0] = buf0_cur[0];
5269 _buf1[j * txfm_size_row + 1] = buf0_cur[1];
5270 _buf1[j * txfm_size_row + 2] = buf0_cur[2];
5271 _buf1[j * txfm_size_row + 3] = buf0_cur[3];
5272 }
5273 }
5274 for (int i = 0; i < buf_size_w_div4; i++) {
5275 col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
5276 bd, 0);
5277
5278 av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
5279 buf1 + i * txfm_size_row, txfm_size_row,
5280 -shift[1]);
5281 }
5282
5283 // write to buffer
5284 for (int i = 0; i < (txfm_size_col >> 3); i++) {
5285 highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2, output + 8 * i,
5286 stride, ud_flip, txfm_size_row, bd);
5287 }
5288}
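
TRANSPOSE_4X4, used in the row passes here and below, is the usual 4x4 transpose of 32-bit lanes built from unpack instructions; schematically (r0..r3 are the input rows, c0..c3 the output columns):

  __m128i u0 = _mm_unpacklo_epi32(r0, r1), u1 = _mm_unpackhi_epi32(r0, r1);
  __m128i u2 = _mm_unpacklo_epi32(r2, r3), u3 = _mm_unpackhi_epi32(r2, r3);
  c0 = _mm_unpacklo_epi64(u0, u2); c1 = _mm_unpackhi_epi64(u0, u2);
  c2 = _mm_unpacklo_epi64(u1, u3); c3 = _mm_unpackhi_epi64(u1, u3);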
5289static void highbd_inv_txfm2d_add_v_identity_ssse41(const int32_t *input,
5290 uint16_t *output,
5291 int stride, TX_TYPE tx_type,
5292 TX_SIZE tx_size, int eob,
5293 const int bd) {
5294 __m128i buf1[64];
5295 int eobx, eoby;
5296 get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
5297 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
5298 const int txw_idx = get_txw_idx(tx_size);
5299 const int txh_idx = get_txh_idx(tx_size);
5300 const int txfm_size_col = tx_size_wide[tx_size];
5301 const int txfm_size_row = tx_size_high[tx_size];
5302 const int buf_size_w_div4 = AOMMIN(32, txfm_size_col) >> 2;
5303 const int row_max = AOMMIN(32, txfm_size_row);
5304 const int input_stride = row_max;
5305 const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
5306 const int buf_size_nonzero_w = buf_size_nonzero_w_div8 << 3;
5307 const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
5308 const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx];
5309 const transform_1d_sse4_1 row_txfm =
5310 highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx];
5311 const transform_1d_sse4_1 col_txfm =
5312 highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
5313 int ud_flip, lr_flip;
5314 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
5315
5316 for (int i = 0; i < (row_max >> 2); ++i) {
5317 __m128i buf0[16];
5318 load_buffer_32bit_input(input + i * 4, input_stride, buf0,
5319 buf_size_nonzero_w);
5320 if (rect_type == 1 || rect_type == -1) {
5321 av1_round_shift_rect_array_32_sse4_1(buf0, buf0, buf_size_nonzero_w, 0,
5322 NewInvSqrt2);
5323 }
5324 row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
5325
5326 __m128i *_buf1 = buf1 + i * 4;
5327 if (lr_flip) {
5328 for (int j = 0; j < buf_size_w_div4; ++j) {
5329 TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
5330 buf0[4 * j],
5331 _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 0],
5332 _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 1],
5333 _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 2],
5334 _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 3]);
5335 }
5336 } else {
5337 for (int j = 0; j < buf_size_w_div4; ++j) {
5338 TRANSPOSE_4X4(
5339 buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3],
5340 _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1],
5341 _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]);
5342 }
5343 }
5344 }
5345 for (int i = 0; i < buf_size_w_div4; i++) {
5346 col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
5347 bd, 0);
5348
5349 av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
5350 buf1 + i * txfm_size_row, txfm_size_row,
5351 -shift[1]);
5352 }
5353
5354 // write to buffer
5355 {
5356 for (int i = 0; i < (txfm_size_col >> 3); i++) {
5357 highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2,
5358 output + 8 * i, stride, ud_flip,
5359 txfm_size_row, bd);
5360 }
5361 }
5362}
5363static void highbd_inv_txfm2d_add_idtx_ssse41(const int32_t *input,
5364 uint16_t *output, int stride,
5365 TX_TYPE tx_type, TX_SIZE tx_size,
5366 int eob, const int bd) {
5367 (void)eob;
5368 __m128i buf1[64 * 4];
5369 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
5370 const int txw_idx = get_txw_idx(tx_size);
5371 const int txh_idx = get_txh_idx(tx_size);
5372 const int txfm_size_col = tx_size_wide[tx_size];
5373 const int txfm_size_row = tx_size_high[tx_size];
5374 const int row_max = AOMMIN(32, txfm_size_row);
5375 const int input_stride = row_max;
5376 const int buf_size_w = AOMMIN(32, txfm_size_col);
5377 const int buf_size_w_div4 = buf_size_w >> 2;
5378 const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
5379 const transform_1d_sse4_1 row_txfm =
5380 highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
5381 const transform_1d_sse4_1 col_txfm =
5382 highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
5383
5384 for (int i = 0; i < (row_max >> 2); ++i) {
5385 __m128i buf0[32];
5386 load_buffer_32bit_input(input + i * 4, input_stride, buf0, buf_size_w);
5387 if (rect_type == 1 || rect_type == -1) {
5388 av1_round_shift_rect_array_32_sse4_1(buf0, buf0, buf_size_w, 0,
5389 NewInvSqrt2);
5390 }
5391 row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
5392
5393 __m128i *_buf1 = buf1 + i * 4;
5394 for (int j = 0; j < buf_size_w_div4; ++j) {
5395 __m128i *buf0_cur = buf0 + j * 4;
5396 TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3],
5397 buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]);
5398 _buf1[j * txfm_size_row + 0] = buf0_cur[0];
5399 _buf1[j * txfm_size_row + 1] = buf0_cur[1];
5400 _buf1[j * txfm_size_row + 2] = buf0_cur[2];
5401 _buf1[j * txfm_size_row + 3] = buf0_cur[3];
5402 }
5403 }
5404 for (int i = 0; i < buf_size_w_div4; i++) {
5405 col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
5406 bd, 0);
5407
5408 av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
5409 buf1 + i * txfm_size_row, txfm_size_row,
5410 -shift[1]);
5411 }
5412
5413 // write to buffer
5414 {
5415 for (int i = 0; i < (txfm_size_col >> 3); i++) {
5416 highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2,
5417 output + 8 * i, stride, 0, txfm_size_row,
5418 bd);
5419 }
5420 }
5421}
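
When get_rect_tx_log_ratio reports a 2:1 aspect ratio (rect_type of 1 or -1), the coefficients are pre-scaled by 1/sqrt(2) so the rectangular 2-D transform keeps roughly unit gain. A scalar sketch of what av1_round_shift_rect_array_32_sse4_1 computes per coefficient, assuming libaom's NewInvSqrt2 = 2896 and a 12-bit fixed-point convention (NewSqrt2Bits); both constants are assumptions about definitions outside this file:

    #include <stdint.h>

    // Rounding right shift: (x + 2^(n-1)) >> n, computed in 64 bits.
    static int32_t round_shift(int64_t x, int n) {
      return (int32_t)((x + ((int64_t)1 << (n - 1))) >> n);
    }

    // Scalar model of the rectangular rescale. With bit == 0, as in the
    // calls above, this is round_shift(x * 2896, 12) ~= x / sqrt(2).
    static int32_t round_shift_rect(int32_t x, int bit, int32_t val) {
      const int kNewSqrt2Bits = 12;  // assumed fixed-point precision of val
      const int32_t r = (bit > 0) ? round_shift(x, bit) : x;
      return round_shift((int64_t)r * val, kNewSqrt2Bits);
    }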
5422static void highbd_inv_txfm2d_add_no_identity_sse41(const int32_t *input,
5423 uint16_t *output,
5424 int stride, TX_TYPE tx_type,
5425 TX_SIZE tx_size, int eob,
5426 const int bd) {
5427 __m128i buf1[64 * 16];
5428 int eobx, eoby;
5429 get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
5430 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
5431 const int txw_idx = get_txw_idx(tx_size);
5432 const int txh_idx = get_txh_idx(tx_size);
5433 const int txfm_size_col = tx_size_wide[tx_size];
5434 const int txfm_size_row = tx_size_high[tx_size];
5435 const int buf_size_w_div4 = txfm_size_col >> 2;
5436 const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3;
5437 const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
5438 const int input_stride = AOMMIN(32, txfm_size_row);
5439 const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
5440
5441 const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
5442 const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
5443 const transform_1d_sse4_1 row_txfm =
5444 highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
5445 const transform_1d_sse4_1 col_txfm =
5446 highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
5447
5448 assert(col_txfm != NULL);
5449 assert(row_txfm != NULL);
5450 int ud_flip, lr_flip;
5451 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
5452
5453 // 1st stage: row transform
5454 for (int i = 0; i < buf_size_nonzero_h_div8 << 1; i++) {
5455 __m128i buf0[64];
5456 load_buffer_32bit_input(input + i * 4, input_stride, buf0,
5457 buf_size_nonzero_w);
5458 if (rect_type == 1 || rect_type == -1) {
5459 av1_round_shift_rect_array_32_sse4_1(buf0, buf0, buf_size_nonzero_w, 0,
5460 NewInvSqrt2);
5461 }
5462 row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
5463
5464 __m128i *_buf1 = buf1 + i * 4;
5465 if (lr_flip) {
5466 for (int j = 0; j < buf_size_w_div4; ++j) {
5467 TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
5468 buf0[4 * j],
5469 _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 0],
5470 _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 1],
5471 _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 2],
5472 _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 3]);
5473 }
5474 } else {
5475 for (int j = 0; j < buf_size_w_div4; ++j) {
5476 TRANSPOSE_4X4(
5477 buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3],
5478 _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1],
5479 _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]);
5480 }
5481 }
5482 }
5483 // 2nd stage: column transform
5484 for (int i = 0; i < buf_size_w_div4; i++) {
5485 col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
5486 bd, 0);
5487
5488 av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
5489 buf1 + i * txfm_size_row, txfm_size_row,
5490 -shift[1]);
5491 }
5492
5493 // write to buffer
5494 {
5495 for (int i = 0; i < (txfm_size_col >> 3); i++) {
5496 highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2,
5497 output + 8 * i, stride, ud_flip,
5498 txfm_size_row, bd);
5499 }
5500 }
5501}
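
The no-identity path prunes work using the end-of-block position: buf_size_nonzero_w rounds the last nonzero column index up to a whole group of 8, and buf_size_nonzero_h_div8 counts 8-row groups, so regions that are provably zero are never transformed. A small self-contained check of the width expression used above:

    #include <assert.h>

    // Mirror of the bound used above: (eobx + 8) >> 3 << 3 rounds the last
    // nonzero column index eobx up to the next multiple of 8.
    static int nonzero_width(int eobx) { return (eobx + 8) >> 3 << 3; }

    int main(void) {
      assert(nonzero_width(0) == 8);   // DC-only block: one 8-wide group
      assert(nonzero_width(7) == 8);   // still inside the first group
      assert(nonzero_width(8) == 16);  // spills into a second group
      return 0;
    }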
5502
5503static void highbd_inv_txfm2d_add_4x8_sse41(const int32_t *input,
5504 uint16_t *output, int stride,
5505 TX_TYPE tx_type, TX_SIZE tx_size,
5506 int eob, const int bd) {
5507 (void)eob;
5508 __m128i buf1[8];
5509 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
5510 const int txw_idx = get_txw_idx(tx_size);
5511 const int txh_idx = get_txh_idx(tx_size);
5512 const int txfm_size_col = tx_size_wide[tx_size];
5513 const int txfm_size_row = tx_size_high[tx_size];
5514 const transform_1d_sse4_1 row_txfm =
5515 highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
5516 const transform_1d_sse4_1 col_txfm =
5517 highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][1];
5518 const int input_stride = AOMMIN(32, txfm_size_row);
5519
5520 assert(col_txfm != NULL);
5521 assert(row_txfm != NULL);
5522 int ud_flip, lr_flip;
5523 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
5524
5525 // 1st stage: row transform
5526 __m128i buf0[8];
5527 load_buffer_32bit_input(input, input_stride, buf0, txfm_size_col);
5528 load_buffer_32bit_input(input + 4, input_stride, buf0 + 4, txfm_size_col);
5529 av1_round_shift_rect_array_32_sse4_1(buf0, buf0, txfm_size_row, 0,
5530 NewInvSqrt2);
5531 row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
5532 row_txfm(buf0 + 4, buf0 + 4, INV_COS_BIT, 0, bd, -shift[0]);
5533
5534 if (lr_flip) {
5535 TRANSPOSE_4X4(buf0[3], buf0[2], buf0[1], buf0[0], buf1[0], buf1[1], buf1[2],
5536 buf1[3]);
5537
5538 TRANSPOSE_4X4(buf0[7], buf0[6], buf0[5], buf0[4], buf1[4], buf1[5], buf1[6],
5539 buf1[7]);
5540 } else {
5541 TRANSPOSE_4X4(buf0[0], buf0[1], buf0[2], buf0[3], buf1[0], buf1[1], buf1[2],
5542 buf1[3]);
5543
5544 TRANSPOSE_4X4(buf0[4], buf0[5], buf0[6], buf0[7], buf1[4], buf1[5], buf1[6],
5545 buf1[7]);
5546 }
5547
5548 // 2nd stage: column transform
5549 col_txfm(buf1, buf1, INV_COS_BIT, 1, bd, 0);
5550
5551 av1_round_shift_array_32_sse4_1(buf1, buf1, txfm_size_row, -shift[1]);
5552
5553 // write to buffer
5554 highbd_write_buffer_4xn_sse4_1(buf1, output, stride, ud_flip, txfm_size_row,
5555 bd);
5556}
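
Each path finishes by handing the rounded residue to a highbd write helper. A scalar model of that final stage, under the assumption that highbd_write_buffer_4xn_sse4_1 and the 8xn variant add the residue to the prediction already in output and clamp to the bit depth:

    #include <stdint.h>

    // Clamp a reconstructed sample to [0, (1 << bd) - 1].
    static uint16_t clip_pixel_highbd(int32_t v, int bd) {
      const int32_t hi = (1 << bd) - 1;
      return (uint16_t)(v < 0 ? 0 : (v > hi ? hi : v));
    }

    // Scalar sketch of one output row: prediction + residue, then clamp.
    static void add_residue_row(uint16_t *dst, const int32_t *res, int n,
                                int bd) {
      for (int i = 0; i < n; ++i) dst[i] = clip_pixel_highbd(dst[i] + res[i], bd);
    }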
5557
5558static void highbd_inv_txfm2d_add_8x4_sse41(const int32_t *input,
5559 uint16_t *output, int stride,
5560 TX_TYPE tx_type, TX_SIZE tx_size,
5561 int eob, const int bd) {
5562 (void)eob;
5563 __m128i buf1[8];
5564 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
5565 const int txw_idx = get_txw_idx(tx_size);
5566 const int txh_idx = get_txh_idx(tx_size);
5567 const int txfm_size_col = tx_size_wide[tx_size];
5568 const int txfm_size_row = tx_size_high[tx_size];
5569 const transform_1d_sse4_1 row_txfm =
5570 highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][1];
5571 const transform_1d_sse4_1 col_txfm =
5572 highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
5573
5574 assert(col_txfm != NULL);
5575 assert(row_txfm != NULL);
5576 int ud_flip, lr_flip;
5577 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
5578
5579 // 1st stage: row transform
5580 __m128i buf0[8];
5581 const int32_t *input_row = input;
5582 load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col);
5583
5584 av1_round_shift_rect_array_32_sse4_1(buf0, buf0, txfm_size_col, 0,
5585 NewInvSqrt2);
5586 row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
5587
5588 __m128i *buf1_ptr;
5589 if (lr_flip) {
5590 flip_buf_sse2(buf0, buf1, txfm_size_col);
5591 buf1_ptr = buf1;
5592 } else {
5593 buf1_ptr = buf0;
5594 }
5595
5596 // 2nd stage: column transform
5597 for (int i = 0; i < 2; i++) {
5598 __m128i *buf1_cur = buf1_ptr + i * txfm_size_row;
5599 transpose_32bit_4x4(buf1_cur, buf1_cur);
5600 col_txfm(buf1_cur, buf1_cur, INV_COS_BIT, 1, bd, 0);
5601 }
5602 av1_round_shift_array_32_sse4_1(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]);
5603 // write to buffer
5604 highbd_write_buffer_8xn_sse4_1(buf1_ptr, output, stride, ud_flip,
5605 txfm_size_row, bd);
5606}
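
The per-stage shifts come from av1_inv_txfm_shift_ls[tx_size] and are stored as non-positive values, which is why every call site negates them (-shift[0], -shift[1]). A scalar model of the rounding shift that av1_round_shift_array_32_sse4_1 is assumed to apply for those positive (right-shift) amounts; the SSE4.1 version does the same four lanes at a time:

    #include <stdint.h>

    // Rounding right shift of one coefficient by bit (bit > 0).
    static int32_t round_shift_32(int32_t x, int bit) {
      return (int32_t)(((int64_t)x + ((int64_t)1 << (bit - 1))) >> bit);
    }

    // Scalar equivalent of the array round-shift for positive shift amounts.
    static void round_shift_array_32(int32_t *arr, int size, int bit) {
      for (int i = 0; i < size; ++i) arr[i] = round_shift_32(arr[i], bit);
    }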
5607
5608static void highbd_inv_txfm2d_add_4x16_sse4_1(const int32_t *input,
5609 uint16_t *output, int stride,
5610 TX_TYPE tx_type, TX_SIZE tx_size,
5611 int eob, const int bd) {
5612 (void)eob;
5613 __m128i buf1[16];
5614 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
5615 const int txw_idx = get_txw_idx(tx_size);
5616 const int txh_idx = get_txh_idx(tx_size);
5617 const int txfm_size_col = tx_size_wide[tx_size];
5618 const int txfm_size_row = tx_size_high[tx_size];
5619 const int buf_size_h_div8 = txfm_size_row >> 2;
5620 const transform_1d_sse4_1 row_txfm =
5621 highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
5622 const transform_1d_sse4_1 col_txfm =
5623 highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][2];
5624 const int input_stride = AOMMIN(32, txfm_size_row);
5625
5626 assert(col_txfm != NULL);
5627 assert(row_txfm != NULL);
5628 int ud_flip, lr_flip;
5629 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
5630
5631 // 1st stage: row transform
5632 __m128i buf0[16];
5633 for (int i = 0; i < (txfm_size_row >> 2); i++) {
5634 const int32_t *input_row = input + i * 4;
5635 __m128i *buf0_cur = buf0 + i * 4;
5636 load_buffer_32bit_input(input_row, input_stride, buf0_cur, txfm_size_col);
5637 row_txfm(buf0_cur, buf0_cur, INV_COS_BIT, 0, bd, -shift[0]);
5638 }
5639
5640 if (lr_flip) {
5641 for (int j = 0; j < buf_size_h_div8; ++j) {
5642 TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
5643 buf0[4 * j], buf1[4 * j], buf1[4 * j + 1], buf1[4 * j + 2],
5644 buf1[4 * j + 3]);
5645 }
5646 } else {
5647 for (int j = 0; j < buf_size_h_div8; ++j) {
5648 TRANSPOSE_4X4(buf0[4 * j], buf0[4 * j + 1], buf0[4 * j + 2],
5649 buf0[4 * j + 3], buf1[4 * j], buf1[4 * j + 1],
5650 buf1[4 * j + 2], buf1[4 * j + 3]);
5651 }
5652 }
5653
5654 // 2nd stage: column transform
5655 col_txfm(buf1, buf1, INV_COS_BIT, 1, bd, 0);
5656
5657 av1_round_shift_array_32_sse4_1(buf1, buf1, txfm_size_row, -shift[1]);
5658
5659 // write to buffer
5660 highbd_write_buffer_4xn_sse4_1(buf1, output, stride, ud_flip, txfm_size_row,
5661 bd);
5662}
5663
5664static void highbd_inv_txfm2d_add_16x4_sse4_1(const int32_t *input,
5665 uint16_t *output, int stride,
5666 TX_TYPE tx_type, TX_SIZE tx_size,
5667 int eob, const int bd) {
5668 (void)eob;
5669 __m128i buf1[16];
5670 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
5671 const int txw_idx = get_txw_idx(tx_size);
5672 const int txh_idx = get_txh_idx(tx_size);
5673 const int txfm_size_col = tx_size_wide[tx_size];
5674 const int txfm_size_row = tx_size_high[tx_size];
5675 const int buf_size_w_div8 = txfm_size_col >> 2;
5676 const transform_1d_sse4_1 row_txfm =
5677 highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][2];
5678 const transform_1d_sse4_1 col_txfm =
5679 highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
5680
5681 assert(col_txfm != NULL);
5682 assert(row_txfm != NULL);
5683 int ud_flip, lr_flip;
5684 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
5685
5686 // 1st stage: row transform
5687 __m128i buf0[16];
5688 const int32_t *input_row = input;
5689 load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col);
5690
5691 row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
5692
5693 __m128i *buf1_ptr;
5694 if (lr_flip) {
5695 flip_buf_sse2(buf0, buf1, txfm_size_col);
5696 buf1_ptr = buf1;
5697 } else {
5698 buf1_ptr = buf0;
5699 }
5700
5701 // 2nd stage: column transform
5702 for (int i = 0; i < buf_size_w_div8; i++) {
5703 __m128i *buf1_cur = buf1_ptr + i * txfm_size_row;
5704 transpose_32bit_4x4(buf1_cur, buf1_cur);
5705 col_txfm(buf1_cur, buf1_cur, INV_COS_BIT, 1, bd, 0);
5706 }
5707 av1_round_shift_array_32_sse4_1(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]);
5708
5709 // write to buffer
5710 for (int i = 0; i < (txfm_size_col >> 3); i++) {
5711 highbd_write_buffer_8xn_sse4_1(buf1_ptr + i * txfm_size_row * 2,
5712 output + 8 * i, stride, ud_flip,
5713 txfm_size_row, bd);
5714 }
5715}
5716
5717void av1_highbd_inv_txfm2d_add_universe_sse4_1(const int32_t *input,
5718 uint8_t *output, int stride,
5719 TX_TYPE tx_type, TX_SIZE tx_size,
5720 int eob, const int bd) {
5721 switch (tx_type) {
5722 case DCT_DCT:
5723 case ADST_DCT:
5724 case DCT_ADST:
5725 case ADST_ADST:
5726 case FLIPADST_DCT:
5727 case DCT_FLIPADST:
5728 case FLIPADST_FLIPADST:
5729 case ADST_FLIPADST:
5730 case FLIPADST_ADST:
5731 highbd_inv_txfm2d_add_no_identity_sse41(
5732 input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob,
5733 bd);
5734 break;
5735 case V_DCT:
5736 case V_ADST:
5737 case V_FLIPADST:
5738 highbd_inv_txfm2d_add_h_identity_ssse41(
5739 input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob,
5740 bd);
5741 break;
5742 case H_DCT:
5743 case H_ADST:
5744 case H_FLIPADST:
5745 highbd_inv_txfm2d_add_v_identity_ssse41(
5746 input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob,
5747 bd);
5748 break;
5749 case IDTX:
5750 highbd_inv_txfm2d_add_idtx_ssse41(input, CONVERT_TO_SHORTPTR(output),
5751 stride, tx_type, tx_size, eob, bd);
5752 break;
5753 default: assert(0); break;
5754 }
5755}
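
CONVERT_TO_SHORTPTR, which the analyzer expands to ((uint16_t *)(((uintptr_t)(output)) << 1)), is the aom_dsp convention for passing a uint16_t frame pointer through uint8_t * interfaces: the tagged pointer is the real address halved, and doubling it restores the uint16_t buffer. A sketch of the pair, assuming the matching CONVERT_TO_BYTEPTR performs the inverse shift:

    #include <stdint.h>

    // Tag a high-bitdepth buffer so it can travel through uint8_t * APIs.
    // Assumption: this mirrors aom_dsp's CONVERT_TO_BYTEPTR definition.
    #define CONVERT_TO_BYTEPTR(p) ((uint8_t *)(((uintptr_t)(p)) >> 1))
    // Recover the uint16_t buffer, exactly as expanded in the listing above.
    #define CONVERT_TO_SHORTPTR(p) ((uint16_t *)(((uintptr_t)(p)) << 1))

    // Round trip: halving then doubling the address is the identity for the
    // 2-byte-aligned uint16_t buffers these macros are used with.
    static uint16_t *round_trip(uint16_t *buf) {
      return CONVERT_TO_SHORTPTR(CONVERT_TO_BYTEPTR(buf));
    }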
5756
5757static void av1_highbd_inv_txfm_add_4x8_sse4_1(const tran_low_t *input,
5758 uint8_t *dest, int stride,
5759 const TxfmParam *txfm_param) {
5760 int bd = txfm_param->bd;
5761 const TX_TYPE tx_type = txfm_param->tx_type;
5762 const TX_SIZE tx_size = txfm_param->tx_size;
5763 int eob = txfm_param->eob;
5764 highbd_inv_txfm2d_add_4x8_sse41(input, CONVERT_TO_SHORTPTR(dest), stride,
5765 tx_type, tx_size, eob, bd);
5766}
5767
5768static void av1_highbd_inv_txfm_add_8x4_sse4_1(const tran_low_t *input,
5769 uint8_t *dest, int stride,
5770 const TxfmParam *txfm_param) {
5771 int bd = txfm_param->bd;
5772 const TX_TYPE tx_type = txfm_param->tx_type;
5773 const TX_SIZE tx_size = txfm_param->tx_size;
5774 int eob = txfm_param->eob;
5775 highbd_inv_txfm2d_add_8x4_sse41(input, CONVERT_TO_SHORTPTR(dest), stride,
5776 tx_type, tx_size, eob, bd);
5777}
5778
5779static void av1_highbd_inv_txfm_add_4x16_sse4_1(const tran_low_t *input,
5780 uint8_t *dest, int stride,
5781 const TxfmParam *txfm_param) {
5782 int bd = txfm_param->bd;
5783 const TX_TYPE tx_type = txfm_param->tx_type;
5784 const TX_SIZE tx_size = txfm_param->tx_size;
5785 int eob = txfm_param->eob;
5786 highbd_inv_txfm2d_add_4x16_sse4_1(input, CONVERT_TO_SHORTPTR(dest), stride,
5787 tx_type, tx_size, eob, bd);
5788}
5789
5790static void av1_highbd_inv_txfm_add_16x4_sse4_1(const tran_low_t *input,
5791 uint8_t *dest, int stride,
5792 const TxfmParam *txfm_param) {
5793 int bd = txfm_param->bd;
5794 const TX_TYPE tx_type = txfm_param->tx_type;
5795 const TX_SIZE tx_size = txfm_param->tx_size;
5796 int eob = txfm_param->eob;
5797 highbd_inv_txfm2d_add_16x4_sse4_1(input, CONVERT_TO_SHORTPTR(dest), stride,
5798 tx_type, tx_size, eob, bd);
5799}
5800
5801void av1_highbd_inv_txfm_add_sse4_1(const tran_low_t *input, uint8_t *dest,
5802 int stride, const TxfmParam *txfm_param) {
5803 assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
5804 const TX_SIZE tx_size = txfm_param->tx_size;
5805 switch (tx_size) {
5806 case TX_8X8:
5807 av1_highbd_inv_txfm_add_8x8_sse4_1(input, dest, stride, txfm_param);
5808 break;
5809 case TX_4X8:
5810 av1_highbd_inv_txfm_add_4x8_sse4_1(input, dest, stride, txfm_param);
5811 break;
5812 case TX_8X4:
5813 av1_highbd_inv_txfm_add_8x4_sse4_1(input, dest, stride, txfm_param);
5814 break;
5815 case TX_4X4:
5816 av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param);
5817 break;
5818 case TX_16X4:
5819 av1_highbd_inv_txfm_add_16x4_sse4_1(input, dest, stride, txfm_param);
5820 break;
5821 case TX_4X16:
5822 av1_highbd_inv_txfm_add_4x16_sse4_1(input, dest, stride, txfm_param);
5823 break;
5824 default:
5825 av1_highbd_inv_txfm2d_add_universe_sse4_1(
5826 input, dest, stride, txfm_param->tx_type, tx_size, txfm_param->eob,
5827 txfm_param->bd);
5828 break;
5829 }
5830}
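
For context, a hypothetical call site for the entry point above. The TxfmParam field names match the ones this listing reads (bd, tx_type, tx_size, eob, tx_set_type); the header path, the EXT_TX_SET_ALL16 set, and the requirement that dest be a CONVERT_TO_BYTEPTR-tagged high-bitdepth pointer are assumptions about the rest of the library:

    #include "av1/common/av1_txfm.h"  // assumed to declare TxfmParam and enums

    // Hypothetical: reconstruct one 8x8 DCT_DCT block of 10-bit content.
    // dest must be a tagged pointer as produced by CONVERT_TO_BYTEPTR.
    static void example_add_8x8(const tran_low_t *coeffs, uint8_t *dest,
                                int stride) {
      TxfmParam param;
      param.bd = 10;
      param.tx_type = DCT_DCT;
      param.tx_size = TX_8X8;                // routes to the TX_8X8 case above
      param.eob = 64;                        // all coefficients may be nonzero
      param.tx_set_type = EXT_TX_SET_ALL16;  // a set that permits DCT_DCT
      av1_highbd_inv_txfm_add_sse4_1(coeffs, dest, stride, &param);
    }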