Bug Summary

File: /root/firefox-clang/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c
Warning: line 1343, column 33
The result of left shift is undefined because the right operand is not smaller than 32, the capacity of 'int'
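
The core of the complaint is a shift amount the analyzer cannot bound below 32. In this file the rounding and clamping constants are built as 1 << (bit - 1) and 1 << (log_range - 1), where log_range is derived from the bit depth bd via AOMMAX(16, bd + (do_cols ? 6 : 8)); if bd is treated as unconstrained, log_range - 1 can reach 32, and a left shift of a 32-bit int by that amount is undefined. The sketch below is a minimal, hypothetical illustration of that pattern (not the aom code); the assert reflects the AV1 bit depths of 8, 10 and 12, which keep the shift amount well below 32.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define AOMMAX(a, b) (((a) > (b)) ? (a) : (b))

/* Illustrative only: if bd were unconstrained, log_range - 1 could reach 32,
 * and shifting a 32-bit int by 32 or more is undefined behaviour. */
static int32_t clamp_hi_value(int bd, int do_cols) {
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  assert(bd == 8 || bd == 10 || bd == 12); /* keeps log_range - 1 <= 19 */
  return (int32_t)((1 << (log_range - 1)) - 1);
}

int main(void) {
  printf("%d\n", clamp_hi_value(12, 0)); /* prints 524287, i.e. (1 << 19) - 1 */
  return 0;
}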

Annotated Source Code

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name highbd_inv_txfm_avx2.c -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -fhalf-no-semantic-interposition -mframe-pointer=all -relaxed-aliasing -ffp-contract=off -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -target-feature +avx2 -tune-cpu generic -debugger-tuning=gdb -fdebug-compilation-dir=/root/firefox-clang/obj-x86_64-pc-linux-gnu/media/libaom -fcoverage-compilation-dir=/root/firefox-clang/obj-x86_64-pc-linux-gnu/media/libaom -resource-dir /usr/lib/llvm-21/lib/clang/21 -include /root/firefox-clang/obj-x86_64-pc-linux-gnu/mozilla-config.h -U _FORTIFY_SOURCE -D _FORTIFY_SOURCE=2 -D _GLIBCXX_ASSERTIONS -D DEBUG=1 -D MOZ_HAS_MOZGLUE -I /root/firefox-clang/media/libaom -I /root/firefox-clang/obj-x86_64-pc-linux-gnu/media/libaom -I /root/firefox-clang/media/libaom/config/linux/x64 -I /root/firefox-clang/media/libaom/config -I /root/firefox-clang/third_party/aom -I /root/firefox-clang/obj-x86_64-pc-linux-gnu/dist/include -I /root/firefox-clang/obj-x86_64-pc-linux-gnu/dist/include/nspr -I /root/firefox-clang/obj-x86_64-pc-linux-gnu/dist/include/nss -D MOZILLA_CLIENT -internal-isystem /usr/lib/llvm-21/lib/clang/21/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/14/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-error=tautological-type-limit-compare -Wno-range-loop-analysis -Wno-error=deprecated-declarations -Wno-error=array-bounds -Wno-error=free-nonheap-object -Wno-error=atomic-alignment -Wno-error=deprecated-builtins -Wno-psabi -Wno-error=builtin-macro-redefined -Wno-unknown-warning-option -Wno-sign-compare -Wno-unused-function -Wno-unreachable-code -Wno-unneeded-internal-declaration -ferror-limit 19 -fstrict-flex-arrays=1 -stack-protector 2 -fstack-clash-protection -ftrivial-auto-var-init=pattern -fgnuc-version=4.2.1 -fskip-odr-check-in-gmf -vectorize-loops -vectorize-slp -analyzer-checker optin.performance.Padding -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2025-06-27-100320-3286336-1 -x c /root/firefox-clang/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c
1/*
2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11#include <assert.h>
12#include <immintrin.h>
13
14#include "config/aom_config.h"
15#include "config/av1_rtcd.h"
16
17#include "av1/common/av1_inv_txfm1d_cfg.h"
18#include "av1/common/idct.h"
19#include "av1/common/x86/av1_inv_txfm_ssse3.h"
20#include "av1/common/x86/highbd_txfm_utility_sse4.h"
21#include "aom_dsp/x86/txfm_common_avx2.h"
22
23// Note:
24 // In total, 32x4 registers represent the 32x32 block coefficients.
25 // For high bit depth, each coefficient is 4 bytes.
26 // Each __m256i register holds 8 coefficients.
27 // So each "row" needs 4 registers; 32 rows in total.
28// Register layout:
29// v0, v1, v2, v3,
30// v4, v5, v6, v7,
31// ... ...
32// v124, v125, v126, v127
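// Illustrative note (not part of the upstream source): with this layout the
// coefficient at (row, col) of the 32x32 block lives in register
// v[4 * row + col / 8], in 32-bit lane col % 8; e.g. row 3, col 17 -> v14, lane 1.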
33
34static inline __m256i highbd_clamp_epi16_avx2(__m256i u, int bd) {
35 const __m256i zero = _mm256_setzero_si256();
36 const __m256i one = _mm256_set1_epi16(1);
37 const __m256i max = _mm256_sub_epi16(_mm256_slli_epi16(one, bd), one);
38 __m256i clamped, mask;
39
40 mask = _mm256_cmpgt_epi16(u, max);
41 clamped = _mm256_andnot_si256(mask, u);
42 mask = _mm256_and_si256(mask, max);
43 clamped = _mm256_or_si256(mask, clamped);
44 mask = _mm256_cmpgt_epi16(clamped, zero);
45 clamped = _mm256_and_si256(clamped, mask);
46
47 return clamped;
48}
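// Illustrative note (not part of the upstream source): per 16-bit lane the
// function above computes clamp(v, 0, (1 << bd) - 1), i.e. it saturates each
// reconstructed sample to the valid range for bit depth bd.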
49
50static inline void round_shift_4x4_avx2(__m256i *in, int shift) {
51 if (shift != 0) {
52 __m256i rnding = _mm256_set1_epi32(1 << (shift - 1));
53 in[0] = _mm256_add_epi32(in[0], rnding);
54 in[1] = _mm256_add_epi32(in[1], rnding);
55 in[2] = _mm256_add_epi32(in[2], rnding);
56 in[3] = _mm256_add_epi32(in[3], rnding);
57
58 in[0] = _mm256_srai_epi32(in[0], shift);
59 in[1] = _mm256_srai_epi32(in[1], shift);
60 in[2] = _mm256_srai_epi32(in[2], shift);
61 in[3] = _mm256_srai_epi32(in[3], shift);
62 }
63}
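// Illustrative note (not part of the upstream source): per 32-bit lane this is
// the usual rounding shift x = (x + (1 << (shift - 1))) >> shift, skipped when
// shift == 0; the 1 << (shift - 1) rounding constant is the same kind of
// 1 << (n - 1) idiom the analyzer's path notes further down refer to.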
64
65static inline void round_shift_8x8_avx2(__m256i *in, int shift) {
66 round_shift_4x4_avx2(in, shift);
67 round_shift_4x4_avx2(in + 4, shift);
68 round_shift_4x4_avx2(in + 8, shift);
69 round_shift_4x4_avx2(in + 12, shift);
70}
71
72static void highbd_clamp_epi32_avx2(__m256i *in, __m256i *out,
73 const __m256i *clamp_lo,
74 const __m256i *clamp_hi, int size) {
75 __m256i a0, a1;
76 for (int i = 0; i < size; i += 4) {
77 a0 = _mm256_max_epi32(in[i], *clamp_lo);
78 out[i] = _mm256_min_epi32(a0, *clamp_hi);
79
80 a1 = _mm256_max_epi32(in[i + 1], *clamp_lo);
81 out[i + 1] = _mm256_min_epi32(a1, *clamp_hi);
82
83 a0 = _mm256_max_epi32(in[i + 2], *clamp_lo);
84 out[i + 2] = _mm256_min_epi32(a0, *clamp_hi);
85
86 a1 = _mm256_max_epi32(in[i + 3], *clamp_lo);
87 out[i + 3] = _mm256_min_epi32(a1, *clamp_hi);
88 }
89}
90
91static inline __m256i highbd_get_recon_16x8_avx2(const __m256i pred,
92 __m256i res0, __m256i res1,
93 const int bd) {
94 __m256i x0 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(pred));
95 __m256i x1 = _mm256_cvtepi16_epi32(_mm256_extractf128_si256(pred, 1));
96
97 x0 = _mm256_add_epi32(res0, x0);
98 x1 = _mm256_add_epi32(res1, x1);
99 x0 = _mm256_packus_epi32(x0, x1);
100 x0 = _mm256_permute4x64_epi64(x0, 0xd8);
101 x0 = highbd_clamp_epi16_avx2(x0, bd);
102 return x0;
103}
104
105static inline void highbd_write_buffer_16xn_avx2(__m256i *in, uint16_t *output,
106 int stride, int flipud,
107 int height, const int bd) {
108 int j = flipud ? (height - 1) : 0;
109 const int step = flipud ? -1 : 1;
110 for (int i = 0; i < height; ++i, j += step) {
111 __m256i v = _mm256_loadu_si256((__m256i const *)(output + i * stride));
112 __m256i u = highbd_get_recon_16x8_avx2(v, in[j], in[j + height], bd);
113
114 _mm256_storeu_si256((__m256i *)(output + i * stride), u);
115 }
116}
117static inline __m256i highbd_get_recon_8x8_avx2(const __m256i pred, __m256i res,
118 const int bd) {
119 __m256i x0 = pred;
120 x0 = _mm256_add_epi32(res, x0);
121 x0 = _mm256_packus_epi32(x0, x0);
122 x0 = _mm256_permute4x64_epi64(x0, 0xd8);
123 x0 = highbd_clamp_epi16_avx2(x0, bd);
124 return x0;
125}
126
127static inline void highbd_write_buffer_8xn_avx2(__m256i *in, uint16_t *output,
128 int stride, int flipud,
129 int height, const int bd) {
130 int j = flipud ? (height - 1) : 0;
131 __m128i temp;
132 const int step = flipud ? -1 : 1;
133 for (int i = 0; i < height; ++i, j += step) {
134 temp = _mm_loadu_si128((__m128i const *)(output + i * stride));
135 __m256i v = _mm256_cvtepi16_epi32(temp);
136 __m256i u = highbd_get_recon_8x8_avx2(v, in[j], bd);
137 __m128i u1 = _mm256_castsi256_si128(u);
138 _mm_storeu_si128((__m128i *)(output + i * stride), u1);
139 }
140}
141static void neg_shift_avx2(const __m256i in0, const __m256i in1, __m256i *out0,
142 __m256i *out1, const __m256i *clamp_lo,
143 const __m256i *clamp_hi, int shift) {
144 __m256i offset = _mm256_set1_epi32((1 << shift) >> 1);
145 __m256i a0 = _mm256_add_epi32(offset, in0);
146 __m256i a1 = _mm256_sub_epi32(offset, in1);
147
148 a0 = _mm256_sra_epi32(a0, _mm_cvtsi32_si128(shift));
149 a1 = _mm256_sra_epi32(a1, _mm_cvtsi32_si128(shift));
150
151 a0 = _mm256_max_epi32(a0, *clamp_lo);
152 a0 = _mm256_min_epi32(a0, *clamp_hi);
153 a1 = _mm256_max_epi32(a1, *clamp_lo);
154 a1 = _mm256_min_epi32(a1, *clamp_hi);
155
156 *out0 = a0;
157 *out1 = a1;
158}
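// Illustrative note (not part of the upstream source): per lane,
//   *out0 = clamp((in0 + (1 << shift) / 2) >> shift)
//   *out1 = clamp(((1 << shift) / 2 - in1) >> shift)
// i.e. a rounding shift of in0 and of the negated in1, clamped to
// [clamp_lo, clamp_hi].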
159
160static void transpose_8x8_avx2(const __m256i *in, __m256i *out) {
161 __m256i u0, u1, u2, u3, u4, u5, u6, u7;
162 __m256i x0, x1;
163
164 u0 = _mm256_unpacklo_epi32(in[0], in[1]);
165 u1 = _mm256_unpackhi_epi32(in[0], in[1]);
166
167 u2 = _mm256_unpacklo_epi32(in[2], in[3]);
168 u3 = _mm256_unpackhi_epi32(in[2], in[3]);
169
170 u4 = _mm256_unpacklo_epi32(in[4], in[5]);
171 u5 = _mm256_unpackhi_epi32(in[4], in[5]);
172
173 u6 = _mm256_unpacklo_epi32(in[6], in[7]);
174 u7 = _mm256_unpackhi_epi32(in[6], in[7]);
175
176 x0 = _mm256_unpacklo_epi64(u0, u2);
177 x1 = _mm256_unpacklo_epi64(u4, u6);
178 out[0] = _mm256_permute2f128_si256(x0, x1, 0x20);
179 out[4] = _mm256_permute2f128_si256(x0, x1, 0x31);
180
181 x0 = _mm256_unpackhi_epi64(u0, u2);
182 x1 = _mm256_unpackhi_epi64(u4, u6);
183 out[1] = _mm256_permute2f128_si256(x0, x1, 0x20);
184 out[5] = _mm256_permute2f128_si256(x0, x1, 0x31);
185
186 x0 = _mm256_unpacklo_epi64(u1, u3);
187 x1 = _mm256_unpacklo_epi64(u5, u7);
188 out[2] = _mm256_permute2f128_si256(x0, x1, 0x20);
189 out[6] = _mm256_permute2f128_si256(x0, x1, 0x31);
190
191 x0 = _mm256_unpackhi_epi64(u1, u3);
192 x1 = _mm256_unpackhi_epi64(u5, u7);
193 out[3] = _mm256_permute2f128_si256(x0, x1, 0x20);
194 out[7] = _mm256_permute2f128_si256(x0, x1, 0x31);
195}
196
197static void transpose_8x8_flip_avx2(const __m256i *in, __m256i *out) {
198 __m256i u0, u1, u2, u3, u4, u5, u6, u7;
199 __m256i x0, x1;
200
201 u0 = _mm256_unpacklo_epi32(in[7], in[6]);
202 u1 = _mm256_unpackhi_epi32(in[7], in[6]);
203
204 u2 = _mm256_unpacklo_epi32(in[5], in[4]);
205 u3 = _mm256_unpackhi_epi32(in[5], in[4]);
206
207 u4 = _mm256_unpacklo_epi32(in[3], in[2]);
208 u5 = _mm256_unpackhi_epi32(in[3], in[2]);
209
210 u6 = _mm256_unpacklo_epi32(in[1], in[0]);
211 u7 = _mm256_unpackhi_epi32(in[1], in[0]);
212
213 x0 = _mm256_unpacklo_epi64(u0, u2);
214 x1 = _mm256_unpacklo_epi64(u4, u6);
215 out[0] = _mm256_permute2f128_si256(x0, x1, 0x20);
216 out[4] = _mm256_permute2f128_si256(x0, x1, 0x31);
217
218 x0 = _mm256_unpackhi_epi64(u0, u2);
219 x1 = _mm256_unpackhi_epi64(u4, u6);
220 out[1] = _mm256_permute2f128_si256(x0, x1, 0x20);
221 out[5] = _mm256_permute2f128_si256(x0, x1, 0x31);
222
223 x0 = _mm256_unpacklo_epi64(u1, u3);
224 x1 = _mm256_unpacklo_epi64(u5, u7);
225 out[2] = _mm256_permute2f128_si256(x0, x1, 0x20);
226 out[6] = _mm256_permute2f128_si256(x0, x1, 0x31);
227
228 x0 = _mm256_unpackhi_epi64(u1, u3);
229 x1 = _mm256_unpackhi_epi64(u5, u7);
230 out[3] = _mm256_permute2f128_si256(x0, x1, 0x20);
231 out[7] = _mm256_permute2f128_si256(x0, x1, 0x31);
232}
233
234static inline void load_buffer_32bit_input(const int32_t *in, int stride,
235 __m256i *out, int out_size) {
236 for (int i = 0; i < out_size; ++i) {
237 out[i] = _mm256_loadu_si256((const __m256i *)(in + i * stride));
238 }
239}
240
241static inline __m256i half_btf_0_avx2(const __m256i *w0, const __m256i *n0,
242 const __m256i *rounding, int bit) {
243 __m256i x;
244 x = _mm256_mullo_epi32(*w0, *n0);
245 x = _mm256_add_epi32(x, *rounding);
246 x = _mm256_srai_epi32(x, bit);
247 return x;
248}
249
250static inline __m256i half_btf_avx2(const __m256i *w0, const __m256i *n0,
251 const __m256i *w1, const __m256i *n1,
252 const __m256i *rounding, int bit) {
253 __m256i x, y;
254
255 x = _mm256_mullo_epi32(*w0, *n0);
256 y = _mm256_mullo_epi32(*w1, *n1);
257 x = _mm256_add_epi32(x, y);
258 x = _mm256_add_epi32(x, *rounding);
259 x = _mm256_srai_epi32(x, bit);
260 return x;
261}
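// Illustrative note (not part of the upstream source): per 32-bit lane these
// helpers are the half-butterfly steps of the transform,
//   half_btf_0: (w0 * n0 + rounding) >> bit
//   half_btf:   (w0 * n0 + w1 * n1 + rounding) >> bit
// where callers pass rounding = 1 << (bit - 1).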
262
263static void addsub_avx2(const __m256i in0, const __m256i in1, __m256i *out0,
264 __m256i *out1, const __m256i *clamp_lo,
265 const __m256i *clamp_hi) {
266 __m256i a0 = _mm256_add_epi32(in0, in1);
267 __m256i a1 = _mm256_sub_epi32(in0, in1);
268
269 a0 = _mm256_max_epi32(a0, *clamp_lo);
270 a0 = _mm256_min_epi32(a0, *clamp_hi);
271 a1 = _mm256_max_epi32(a1, *clamp_lo);
272 a1 = _mm256_min_epi32(a1, *clamp_hi);
273
274 *out0 = a0;
275 *out1 = a1;
276}
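// Illustrative note (not part of the upstream source): the butterfly add/sub
// used throughout the stages below,
//   *out0 = clamp(in0 + in1), *out1 = clamp(in0 - in1),
// clamped element-wise to [clamp_lo, clamp_hi].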
277
278static inline void idct32_stage4_avx2(
279 __m256i *bf1, const __m256i *cospim8, const __m256i *cospi56,
280 const __m256i *cospi8, const __m256i *cospim56, const __m256i *cospim40,
281 const __m256i *cospi24, const __m256i *cospi40, const __m256i *cospim24,
282 const __m256i *rounding, int bit) {
283 __m256i temp1, temp2;
284 temp1 = half_btf_avx2(cospim8, &bf1[17], cospi56, &bf1[30], rounding, bit);
285 bf1[30] = half_btf_avx2(cospi56, &bf1[17], cospi8, &bf1[30], rounding, bit);
286 bf1[17] = temp1;
287
288 temp2 = half_btf_avx2(cospim56, &bf1[18], cospim8, &bf1[29], rounding, bit);
289 bf1[29] = half_btf_avx2(cospim8, &bf1[18], cospi56, &bf1[29], rounding, bit);
290 bf1[18] = temp2;
291
292 temp1 = half_btf_avx2(cospim40, &bf1[21], cospi24, &bf1[26], rounding, bit);
293 bf1[26] = half_btf_avx2(cospi24, &bf1[21], cospi40, &bf1[26], rounding, bit);
294 bf1[21] = temp1;
295
296 temp2 = half_btf_avx2(cospim24, &bf1[22], cospim40, &bf1[25], rounding, bit);
297 bf1[25] = half_btf_avx2(cospim40, &bf1[22], cospi24, &bf1[25], rounding, bit);
298 bf1[22] = temp2;
299}
300
301static inline void idct32_stage5_avx2(
302 __m256i *bf1, const __m256i *cospim16, const __m256i *cospi48,
303 const __m256i *cospi16, const __m256i *cospim48, const __m256i *clamp_lo,
304 const __m256i *clamp_hi, const __m256i *rounding, int bit) {
305 __m256i temp1, temp2;
306 temp1 = half_btf_avx2(cospim16, &bf1[9], cospi48, &bf1[14], rounding, bit);
307 bf1[14] = half_btf_avx2(cospi48, &bf1[9], cospi16, &bf1[14], rounding, bit);
308 bf1[9] = temp1;
309
310 temp2 = half_btf_avx2(cospim48, &bf1[10], cospim16, &bf1[13], rounding, bit);
311 bf1[13] = half_btf_avx2(cospim16, &bf1[10], cospi48, &bf1[13], rounding, bit);
312 bf1[10] = temp2;
313
314 addsub_avx2(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi);
315 addsub_avx2(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi);
316 addsub_avx2(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi);
317 addsub_avx2(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi);
318 addsub_avx2(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi);
319 addsub_avx2(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi);
320 addsub_avx2(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi);
321 addsub_avx2(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi);
322}
323
324static inline void idct32_stage6_avx2(
325 __m256i *bf1, const __m256i *cospim32, const __m256i *cospi32,
326 const __m256i *cospim16, const __m256i *cospi48, const __m256i *cospi16,
327 const __m256i *cospim48, const __m256i *clamp_lo, const __m256i *clamp_hi,
328 const __m256i *rounding, int bit) {
329 __m256i temp1, temp2;
330 temp1 = half_btf_avx2(cospim32, &bf1[5], cospi32, &bf1[6], rounding, bit);
331 bf1[6] = half_btf_avx2(cospi32, &bf1[5], cospi32, &bf1[6], rounding, bit);
332 bf1[5] = temp1;
333
334 addsub_avx2(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi);
335 addsub_avx2(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi);
336 addsub_avx2(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi);
337 addsub_avx2(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi);
338
339 temp1 = half_btf_avx2(cospim16, &bf1[18], cospi48, &bf1[29], rounding, bit);
340 bf1[29] = half_btf_avx2(cospi48, &bf1[18], cospi16, &bf1[29], rounding, bit);
341 bf1[18] = temp1;
342 temp2 = half_btf_avx2(cospim16, &bf1[19], cospi48, &bf1[28], rounding, bit);
343 bf1[28] = half_btf_avx2(cospi48, &bf1[19], cospi16, &bf1[28], rounding, bit);
344 bf1[19] = temp2;
345 temp1 = half_btf_avx2(cospim48, &bf1[20], cospim16, &bf1[27], rounding, bit);
346 bf1[27] = half_btf_avx2(cospim16, &bf1[20], cospi48, &bf1[27], rounding, bit);
347 bf1[20] = temp1;
348 temp2 = half_btf_avx2(cospim48, &bf1[21], cospim16, &bf1[26], rounding, bit);
349 bf1[26] = half_btf_avx2(cospim16, &bf1[21], cospi48, &bf1[26], rounding, bit);
350 bf1[21] = temp2;
351}
352
353static inline void idct32_stage7_avx2(__m256i *bf1, const __m256i *cospim32,
354 const __m256i *cospi32,
355 const __m256i *clamp_lo,
356 const __m256i *clamp_hi,
357 const __m256i *rounding, int bit) {
358 __m256i temp1, temp2;
359 addsub_avx2(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi);
360 addsub_avx2(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi);
361 addsub_avx2(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi);
362 addsub_avx2(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi);
363
364 temp1 = half_btf_avx2(cospim32, &bf1[10], cospi32, &bf1[13], rounding, bit);
365 bf1[13] = half_btf_avx2(cospi32, &bf1[10], cospi32, &bf1[13], rounding, bit);
366 bf1[10] = temp1;
367 temp2 = half_btf_avx2(cospim32, &bf1[11], cospi32, &bf1[12], rounding, bit);
368 bf1[12] = half_btf_avx2(cospi32, &bf1[11], cospi32, &bf1[12], rounding, bit);
369 bf1[11] = temp2;
370
371 addsub_avx2(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi);
372 addsub_avx2(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi);
373 addsub_avx2(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi);
374 addsub_avx2(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi);
375 addsub_avx2(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi);
376 addsub_avx2(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi);
377 addsub_avx2(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi);
378 addsub_avx2(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi);
379}
380
381static inline void idct32_stage8_avx2(__m256i *bf1, const __m256i *cospim32,
382 const __m256i *cospi32,
383 const __m256i *clamp_lo,
384 const __m256i *clamp_hi,
385 const __m256i *rounding, int bit) {
386 __m256i temp1, temp2;
387 addsub_avx2(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi);
388 addsub_avx2(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi);
389 addsub_avx2(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi);
390 addsub_avx2(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi);
391 addsub_avx2(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi);
392 addsub_avx2(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi);
393 addsub_avx2(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi);
394 addsub_avx2(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi);
395
396 temp1 = half_btf_avx2(cospim32, &bf1[20], cospi32, &bf1[27], rounding, bit);
397 bf1[27] = half_btf_avx2(cospi32, &bf1[20], cospi32, &bf1[27], rounding, bit);
398 bf1[20] = temp1;
399 temp2 = half_btf_avx2(cospim32, &bf1[21], cospi32, &bf1[26], rounding, bit);
400 bf1[26] = half_btf_avx2(cospi32, &bf1[21], cospi32, &bf1[26], rounding, bit);
401 bf1[21] = temp2;
402 temp1 = half_btf_avx2(cospim32, &bf1[22], cospi32, &bf1[25], rounding, bit);
403 bf1[25] = half_btf_avx2(cospi32, &bf1[22], cospi32, &bf1[25], rounding, bit);
404 bf1[22] = temp1;
405 temp2 = half_btf_avx2(cospim32, &bf1[23], cospi32, &bf1[24], rounding, bit);
406 bf1[24] = half_btf_avx2(cospi32, &bf1[23], cospi32, &bf1[24], rounding, bit);
407 bf1[23] = temp2;
408}
409
410static inline void idct32_stage9_avx2(__m256i *bf1, __m256i *out,
411 const int do_cols, const int bd,
412 const int out_shift,
413 const __m256i *clamp_lo,
414 const __m256i *clamp_hi) {
415 addsub_avx2(bf1[0], bf1[31], out + 0, out + 31, clamp_lo, clamp_hi);
416 addsub_avx2(bf1[1], bf1[30], out + 1, out + 30, clamp_lo, clamp_hi);
417 addsub_avx2(bf1[2], bf1[29], out + 2, out + 29, clamp_lo, clamp_hi);
418 addsub_avx2(bf1[3], bf1[28], out + 3, out + 28, clamp_lo, clamp_hi);
419 addsub_avx2(bf1[4], bf1[27], out + 4, out + 27, clamp_lo, clamp_hi);
420 addsub_avx2(bf1[5], bf1[26], out + 5, out + 26, clamp_lo, clamp_hi);
421 addsub_avx2(bf1[6], bf1[25], out + 6, out + 25, clamp_lo, clamp_hi);
422 addsub_avx2(bf1[7], bf1[24], out + 7, out + 24, clamp_lo, clamp_hi);
423 addsub_avx2(bf1[8], bf1[23], out + 8, out + 23, clamp_lo, clamp_hi);
424 addsub_avx2(bf1[9], bf1[22], out + 9, out + 22, clamp_lo, clamp_hi);
425 addsub_avx2(bf1[10], bf1[21], out + 10, out + 21, clamp_lo, clamp_hi);
426 addsub_avx2(bf1[11], bf1[20], out + 11, out + 20, clamp_lo, clamp_hi);
427 addsub_avx2(bf1[12], bf1[19], out + 12, out + 19, clamp_lo, clamp_hi);
428 addsub_avx2(bf1[13], bf1[18], out + 13, out + 18, clamp_lo, clamp_hi);
429 addsub_avx2(bf1[14], bf1[17], out + 14, out + 17, clamp_lo, clamp_hi);
430 addsub_avx2(bf1[15], bf1[16], out + 15, out + 16, clamp_lo, clamp_hi);
431 if (!do_cols) {
432 const int log_range_out = AOMMAX(16, bd + 6);
433 const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
434 const __m256i clamp_hi_out =
435 _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
436 round_shift_8x8_avx2(out, out_shift);
437 round_shift_8x8_avx2(out + 16, out_shift);
438 highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 32);
439 }
440}
441
442static void idct32_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
443 int bd, int out_shift) {
444 const int32_t *cospi = cospi_arr(bit);
445 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
446 const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
447 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
448 __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
449 __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
450 __m256i x;
451 // stage 0
452 // stage 1
453 // stage 2
454 // stage 3
455 // stage 4
456 // stage 5
457 x = _mm256_mullo_epi32(in[0], cospi32);
458 x = _mm256_add_epi32(x, rounding);
459 x = _mm256_srai_epi32(x, bit);
460
461 // stage 6
462 // stage 7
463 // stage 8
464 // stage 9
465 if (!do_cols) {
466 const int log_range_out = AOMMAX(16, bd + 6);
467 __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
468 clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
469 clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
470 x = _mm256_add_epi32(offset, x);
471 x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
472 }
473 x = _mm256_max_epi32(x, clamp_lo);
474 x = _mm256_min_epi32(x, clamp_hi);
475 out[0] = x;
476 out[1] = x;
477 out[2] = x;
478 out[3] = x;
479 out[4] = x;
480 out[5] = x;
481 out[6] = x;
482 out[7] = x;
483 out[8] = x;
484 out[9] = x;
485 out[10] = x;
486 out[11] = x;
487 out[12] = x;
488 out[13] = x;
489 out[14] = x;
490 out[15] = x;
491 out[16] = x;
492 out[17] = x;
493 out[18] = x;
494 out[19] = x;
495 out[20] = x;
496 out[21] = x;
497 out[22] = x;
498 out[23] = x;
499 out[24] = x;
500 out[25] = x;
501 out[26] = x;
502 out[27] = x;
503 out[28] = x;
504 out[29] = x;
505 out[30] = x;
506 out[31] = x;
507}
508
509static void idct32_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
510 int bd, int out_shift) {
511 const int32_t *cospi = cospi_arr(bit);
512 const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
513 const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
514 const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
515 const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
516 const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
517 const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
518 const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
519 const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
520 const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
521 const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
522 const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
523 const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
524 const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
525 const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
526 const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
527 const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
528 const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
529 const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
530 const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
531 const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
532 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
533 const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
534 const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
535 const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
536 const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
537 const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
538 const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
539 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
540 const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
541 const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
542 __m256i bf1[32];
543
544 {
545 // stage 0
546 // stage 1
547 bf1[0] = in[0];
548 bf1[4] = in[4];
549 bf1[8] = in[2];
550 bf1[12] = in[6];
551 bf1[16] = in[1];
552 bf1[20] = in[5];
553 bf1[24] = in[3];
554 bf1[28] = in[7];
555
556 // stage 2
557 bf1[31] = half_btf_0_avx2(&cospi2, &bf1[16], &rounding, bit);
558 bf1[16] = half_btf_0_avx2(&cospi62, &bf1[16], &rounding, bit);
559 bf1[19] = half_btf_0_avx2(&cospim50, &bf1[28], &rounding, bit);
560 bf1[28] = half_btf_0_avx2(&cospi14, &bf1[28], &rounding, bit);
561 bf1[27] = half_btf_0_avx2(&cospi10, &bf1[20], &rounding, bit);
562 bf1[20] = half_btf_0_avx2(&cospi54, &bf1[20], &rounding, bit);
563 bf1[23] = half_btf_0_avx2(&cospim58, &bf1[24], &rounding, bit);
564 bf1[24] = half_btf_0_avx2(&cospi6, &bf1[24], &rounding, bit);
565
566 // stage 3
567 bf1[15] = half_btf_0_avx2(&cospi4, &bf1[8], &rounding, bit);
568 bf1[8] = half_btf_0_avx2(&cospi60, &bf1[8], &rounding, bit);
569
570 bf1[11] = half_btf_0_avx2(&cospim52, &bf1[12], &rounding, bit);
571 bf1[12] = half_btf_0_avx2(&cospi12, &bf1[12], &rounding, bit);
572 bf1[17] = bf1[16];
573 bf1[18] = bf1[19];
574 bf1[21] = bf1[20];
575 bf1[22] = bf1[23];
576 bf1[25] = bf1[24];
577 bf1[26] = bf1[27];
578 bf1[29] = bf1[28];
579 bf1[30] = bf1[31];
580
581 // stage 4
582 bf1[7] = half_btf_0_avx2(&cospi8, &bf1[4], &rounding, bit);
583 bf1[4] = half_btf_0_avx2(&cospi56, &bf1[4], &rounding, bit);
584
585 bf1[9] = bf1[8];
586 bf1[10] = bf1[11];
587 bf1[13] = bf1[12];
588 bf1[14] = bf1[15];
589
590 idct32_stage4_avx2(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
591 &cospi24, &cospi40, &cospim24, &rounding, bit);
592
593 // stage 5
594 bf1[0] = half_btf_0_avx2(&cospi32, &bf1[0], &rounding, bit);
595 bf1[1] = bf1[0];
596 bf1[5] = bf1[4];
597 bf1[6] = bf1[7];
598
599 idct32_stage5_avx2(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
600 &clamp_hi, &rounding, bit);
601
602 // stage 6
603 bf1[3] = bf1[0];
604 bf1[2] = bf1[1];
605
606 idct32_stage6_avx2(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
607 &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
608
609 // stage 7
610 idct32_stage7_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
611 &rounding, bit);
612
613 // stage 8
614 idct32_stage8_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
615 &rounding, bit);
616
617 // stage 9
618 idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
619 }
620}
621
622static void idct32_low16_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
623 int bd, int out_shift) {
624 const int32_t *cospi = cospi_arr(bit);
625 const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
626 const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
627 const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
628 const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
629 const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
630 const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
631 const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
632 const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
633 const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
634 const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
635 const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
636 const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
637 const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
638 const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
639 const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
640 const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
641 const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
642 const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
643 const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
644 const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
645 const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
646 const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
647 const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
648 const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
649 const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
650 const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
651 const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
652 const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
653 const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
654 const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
655 const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
656 const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
657 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
658 const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
659 const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
660 const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
661 const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
662 const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
663 const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
664 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
665 const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
666 const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
667 __m256i bf1[32];
668
669 {
670 // stage 0
671 // stage 1
672 bf1[0] = in[0];
673 bf1[2] = in[8];
674 bf1[4] = in[4];
675 bf1[6] = in[12];
676 bf1[8] = in[2];
677 bf1[10] = in[10];
678 bf1[12] = in[6];
679 bf1[14] = in[14];
680 bf1[16] = in[1];
681 bf1[18] = in[9];
682 bf1[20] = in[5];
683 bf1[22] = in[13];
684 bf1[24] = in[3];
685 bf1[26] = in[11];
686 bf1[28] = in[7];
687 bf1[30] = in[15];
688
689 // stage 2
690 bf1[31] = half_btf_0_avx2(&cospi2, &bf1[16], &rounding, bit);
691 bf1[16] = half_btf_0_avx2(&cospi62, &bf1[16], &rounding, bit);
692 bf1[17] = half_btf_0_avx2(&cospim34, &bf1[30], &rounding, bit);
693 bf1[30] = half_btf_0_avx2(&cospi30, &bf1[30], &rounding, bit);
694 bf1[29] = half_btf_0_avx2(&cospi18, &bf1[18], &rounding, bit);
695 bf1[18] = half_btf_0_avx2(&cospi46, &bf1[18], &rounding, bit);
696 bf1[19] = half_btf_0_avx2(&cospim50, &bf1[28], &rounding, bit);
697 bf1[28] = half_btf_0_avx2(&cospi14, &bf1[28], &rounding, bit);
698 bf1[27] = half_btf_0_avx2(&cospi10, &bf1[20], &rounding, bit);
699 bf1[20] = half_btf_0_avx2(&cospi54, &bf1[20], &rounding, bit);
700 bf1[21] = half_btf_0_avx2(&cospim42, &bf1[26], &rounding, bit);
701 bf1[26] = half_btf_0_avx2(&cospi22, &bf1[26], &rounding, bit);
702 bf1[25] = half_btf_0_avx2(&cospi26, &bf1[22], &rounding, bit);
703 bf1[22] = half_btf_0_avx2(&cospi38, &bf1[22], &rounding, bit);
704 bf1[23] = half_btf_0_avx2(&cospim58, &bf1[24], &rounding, bit);
705 bf1[24] = half_btf_0_avx2(&cospi6, &bf1[24], &rounding, bit);
706
707 // stage 3
708 bf1[15] = half_btf_0_avx2(&cospi4, &bf1[8], &rounding, bit);
709 bf1[8] = half_btf_0_avx2(&cospi60, &bf1[8], &rounding, bit);
710 bf1[9] = half_btf_0_avx2(&cospim36, &bf1[14], &rounding, bit);
711 bf1[14] = half_btf_0_avx2(&cospi28, &bf1[14], &rounding, bit);
712 bf1[13] = half_btf_0_avx2(&cospi20, &bf1[10], &rounding, bit);
713 bf1[10] = half_btf_0_avx2(&cospi44, &bf1[10], &rounding, bit);
714 bf1[11] = half_btf_0_avx2(&cospim52, &bf1[12], &rounding, bit);
715 bf1[12] = half_btf_0_avx2(&cospi12, &bf1[12], &rounding, bit);
716
717 addsub_avx2(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
718 addsub_avx2(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
719 addsub_avx2(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
720 addsub_avx2(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
721 addsub_avx2(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
722 addsub_avx2(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
723 addsub_avx2(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
724 addsub_avx2(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
725
726 // stage 4
727 bf1[7] = half_btf_0_avx2(&cospi8, &bf1[4], &rounding, bit);
728 bf1[4] = half_btf_0_avx2(&cospi56, &bf1[4], &rounding, bit);
729 bf1[5] = half_btf_0_avx2(&cospim40, &bf1[6], &rounding, bit);
730 bf1[6] = half_btf_0_avx2(&cospi24, &bf1[6], &rounding, bit);
731
732 addsub_avx2(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi);
733 addsub_avx2(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi);
734 addsub_avx2(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi);
735 addsub_avx2(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi);
736
737 idct32_stage4_avx2(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
738 &cospi24, &cospi40, &cospim24, &rounding, bit);
739
740 // stage 5
741 bf1[0] = half_btf_0_avx2(&cospi32, &bf1[0], &rounding, bit);
742 bf1[1] = bf1[0];
743 bf1[3] = half_btf_0_avx2(&cospi16, &bf1[2], &rounding, bit);
744 bf1[2] = half_btf_0_avx2(&cospi48, &bf1[2], &rounding, bit);
745
746 addsub_avx2(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
747 addsub_avx2(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
748
749 idct32_stage5_avx2(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
750 &clamp_hi, &rounding, bit);
751
752 // stage 6
753 addsub_avx2(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi);
754 addsub_avx2(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi);
755
756 idct32_stage6_avx2(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
757 &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
758
759 // stage 7
760 idct32_stage7_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
761 &rounding, bit);
762
763 // stage 8
764 idct32_stage8_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
765 &rounding, bit);
766
767 // stage 9
768 idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
769 }
770}
771
772static void idct32_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd,
773 int out_shift) {
774 const int32_t *cospi = cospi_arr(bit);
775 const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
776 const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
777 const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
778 const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
779 const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
780 const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
781 const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
782 const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
783 const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
784 const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
785 const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
786 const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
787 const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
788 const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
789 const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
790 const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
791 const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
792 const __m256i cospim26 = _mm256_set1_epi32(-cospi[26]);
793 const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
794 const __m256i cospim10 = _mm256_set1_epi32(-cospi[10]);
795 const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
796 const __m256i cospim18 = _mm256_set1_epi32(-cospi[18]);
797 const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
798 const __m256i cospim2 = _mm256_set1_epi32(-cospi[2]);
799 const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
800 const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
801 const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
802 const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
803 const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
804 const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
805 const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
806 const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
807 const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
808 const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
809 const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
810 const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
811 const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
812 const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
813 const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
814 const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
815 const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
816 const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
817 const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
818 const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
819 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
820 const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
821 const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
822 const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
823 const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
824 const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
825 const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
826 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
827 const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
828 const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
829 __m256i bf1[32], bf0[32];
830
831 {
832 // stage 0
833 // stage 1
834 bf1[0] = in[0];
835 bf1[1] = in[16];
836 bf1[2] = in[8];
837 bf1[3] = in[24];
838 bf1[4] = in[4];
839 bf1[5] = in[20];
840 bf1[6] = in[12];
841 bf1[7] = in[28];
842 bf1[8] = in[2];
843 bf1[9] = in[18];
844 bf1[10] = in[10];
845 bf1[11] = in[26];
846 bf1[12] = in[6];
847 bf1[13] = in[22];
848 bf1[14] = in[14];
849 bf1[15] = in[30];
850 bf1[16] = in[1];
851 bf1[17] = in[17];
852 bf1[18] = in[9];
853 bf1[19] = in[25];
854 bf1[20] = in[5];
855 bf1[21] = in[21];
856 bf1[22] = in[13];
857 bf1[23] = in[29];
858 bf1[24] = in[3];
859 bf1[25] = in[19];
860 bf1[26] = in[11];
861 bf1[27] = in[27];
862 bf1[28] = in[7];
863 bf1[29] = in[23];
864 bf1[30] = in[15];
865 bf1[31] = in[31];
866
867 // stage 2
868 bf0[0] = bf1[0];
869 bf0[1] = bf1[1];
870 bf0[2] = bf1[2];
871 bf0[3] = bf1[3];
872 bf0[4] = bf1[4];
873 bf0[5] = bf1[5];
874 bf0[6] = bf1[6];
875 bf0[7] = bf1[7];
876 bf0[8] = bf1[8];
877 bf0[9] = bf1[9];
878 bf0[10] = bf1[10];
879 bf0[11] = bf1[11];
880 bf0[12] = bf1[12];
881 bf0[13] = bf1[13];
882 bf0[14] = bf1[14];
883 bf0[15] = bf1[15];
884 bf0[16] =
885 half_btf_avx2(&cospi62, &bf1[16], &cospim2, &bf1[31], &rounding, bit);
886 bf0[17] =
887 half_btf_avx2(&cospi30, &bf1[17], &cospim34, &bf1[30], &rounding, bit);
888 bf0[18] =
889 half_btf_avx2(&cospi46, &bf1[18], &cospim18, &bf1[29], &rounding, bit);
890 bf0[19] =
891 half_btf_avx2(&cospi14, &bf1[19], &cospim50, &bf1[28], &rounding, bit);
892 bf0[20] =
893 half_btf_avx2(&cospi54, &bf1[20], &cospim10, &bf1[27], &rounding, bit);
894 bf0[21] =
895 half_btf_avx2(&cospi22, &bf1[21], &cospim42, &bf1[26], &rounding, bit);
896 bf0[22] =
897 half_btf_avx2(&cospi38, &bf1[22], &cospim26, &bf1[25], &rounding, bit);
898 bf0[23] =
899 half_btf_avx2(&cospi6, &bf1[23], &cospim58, &bf1[24], &rounding, bit);
900 bf0[24] =
901 half_btf_avx2(&cospi58, &bf1[23], &cospi6, &bf1[24], &rounding, bit);
902 bf0[25] =
903 half_btf_avx2(&cospi26, &bf1[22], &cospi38, &bf1[25], &rounding, bit);
904 bf0[26] =
905 half_btf_avx2(&cospi42, &bf1[21], &cospi22, &bf1[26], &rounding, bit);
906 bf0[27] =
907 half_btf_avx2(&cospi10, &bf1[20], &cospi54, &bf1[27], &rounding, bit);
908 bf0[28] =
909 half_btf_avx2(&cospi50, &bf1[19], &cospi14, &bf1[28], &rounding, bit);
910 bf0[29] =
911 half_btf_avx2(&cospi18, &bf1[18], &cospi46, &bf1[29], &rounding, bit);
912 bf0[30] =
913 half_btf_avx2(&cospi34, &bf1[17], &cospi30, &bf1[30], &rounding, bit);
914 bf0[31] =
915 half_btf_avx2(&cospi2, &bf1[16], &cospi62, &bf1[31], &rounding, bit);
916
917 // stage 3
918 bf1[0] = bf0[0];
919 bf1[1] = bf0[1];
920 bf1[2] = bf0[2];
921 bf1[3] = bf0[3];
922 bf1[4] = bf0[4];
923 bf1[5] = bf0[5];
924 bf1[6] = bf0[6];
925 bf1[7] = bf0[7];
926 bf1[8] =
927 half_btf_avx2(&cospi60, &bf0[8], &cospim4, &bf0[15], &rounding, bit);
928 bf1[9] =
929 half_btf_avx2(&cospi28, &bf0[9], &cospim36, &bf0[14], &rounding, bit);
930 bf1[10] =
931 half_btf_avx2(&cospi44, &bf0[10], &cospim20, &bf0[13], &rounding, bit);
932 bf1[11] =
933 half_btf_avx2(&cospi12, &bf0[11], &cospim52, &bf0[12], &rounding, bit);
934 bf1[12] =
935 half_btf_avx2(&cospi52, &bf0[11], &cospi12, &bf0[12], &rounding, bit);
936 bf1[13] =
937 half_btf_avx2(&cospi20, &bf0[10], &cospi44, &bf0[13], &rounding, bit);
938 bf1[14] =
939 half_btf_avx2(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit);
940 bf1[15] =
941 half_btf_avx2(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit);
942
943 addsub_avx2(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
944 addsub_avx2(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
945 addsub_avx2(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
946 addsub_avx2(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
947 addsub_avx2(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
948 addsub_avx2(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
949 addsub_avx2(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
950 addsub_avx2(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
951
952 // stage 4
953 bf0[0] = bf1[0];
954 bf0[1] = bf1[1];
955 bf0[2] = bf1[2];
956 bf0[3] = bf1[3];
957 bf0[4] =
958 half_btf_avx2(&cospi56, &bf1[4], &cospim8, &bf1[7], &rounding, bit);
959 bf0[5] =
960 half_btf_avx2(&cospi24, &bf1[5], &cospim40, &bf1[6], &rounding, bit);
961 bf0[6] =
962 half_btf_avx2(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, bit);
963 bf0[7] = half_btf_avx2(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit);
964
965 addsub_avx2(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi);
966 addsub_avx2(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi);
967 addsub_avx2(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi);
968 addsub_avx2(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi);
969
970 bf0[16] = bf1[16];
971 bf0[17] =
972 half_btf_avx2(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit);
973 bf0[18] =
974 half_btf_avx2(&cospim56, &bf1[18], &cospim8, &bf1[29], &rounding, bit);
975 bf0[19] = bf1[19];
976 bf0[20] = bf1[20];
977 bf0[21] =
978 half_btf_avx2(&cospim40, &bf1[21], &cospi24, &bf1[26], &rounding, bit);
979 bf0[22] =
980 half_btf_avx2(&cospim24, &bf1[22], &cospim40, &bf1[25], &rounding, bit);
981 bf0[23] = bf1[23];
982 bf0[24] = bf1[24];
983 bf0[25] =
984 half_btf_avx2(&cospim40, &bf1[22], &cospi24, &bf1[25], &rounding, bit);
985 bf0[26] =
986 half_btf_avx2(&cospi24, &bf1[21], &cospi40, &bf1[26], &rounding, bit);
987 bf0[27] = bf1[27];
988 bf0[28] = bf1[28];
989 bf0[29] =
990 half_btf_avx2(&cospim8, &bf1[18], &cospi56, &bf1[29], &rounding, bit);
991 bf0[30] =
992 half_btf_avx2(&cospi56, &bf1[17], &cospi8, &bf1[30], &rounding, bit);
993 bf0[31] = bf1[31];
994
995 // stage 5
996 bf1[0] =
997 half_btf_avx2(&cospi32, &bf0[0], &cospi32, &bf0[1], &rounding, bit);
998 bf1[1] =
999 half_btf_avx2(&cospi32, &bf0[0], &cospim32, &bf0[1], &rounding, bit);
1000 bf1[2] =
1001 half_btf_avx2(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit);
1002 bf1[3] =
1003 half_btf_avx2(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit);
1004 addsub_avx2(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
1005 addsub_avx2(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
1006 bf1[8] = bf0[8];
1007 bf1[9] =
1008 half_btf_avx2(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit);
1009 bf1[10] =
1010 half_btf_avx2(&cospim48, &bf0[10], &cospim16, &bf0[13], &rounding, bit);
1011 bf1[11] = bf0[11];
1012 bf1[12] = bf0[12];
1013 bf1[13] =
1014 half_btf_avx2(&cospim16, &bf0[10], &cospi48, &bf0[13], &rounding, bit);
1015 bf1[14] =
1016 half_btf_avx2(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit);
1017 bf1[15] = bf0[15];
1018 addsub_avx2(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi);
1019 addsub_avx2(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi);
1020 addsub_avx2(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi);
1021 addsub_avx2(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi);
1022 addsub_avx2(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi);
1023 addsub_avx2(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi);
1024 addsub_avx2(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi);
1025 addsub_avx2(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi);
1026
1027 // stage 6
1028 addsub_avx2(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi);
1029 addsub_avx2(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi);
1030 bf0[4] = bf1[4];
1031 bf0[5] =
1032 half_btf_avx2(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
1033 bf0[6] =
1034 half_btf_avx2(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
1035 bf0[7] = bf1[7];
1036 addsub_avx2(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi);
1037 addsub_avx2(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi);
1038 addsub_avx2(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi);
1039 addsub_avx2(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi);
1040 bf0[16] = bf1[16];
1041 bf0[17] = bf1[17];
1042 bf0[18] =
1043 half_btf_avx2(&cospim16, &bf1[18], &cospi48, &bf1[29], &rounding, bit);
1044 bf0[19] =
1045 half_btf_avx2(&cospim16, &bf1[19], &cospi48, &bf1[28], &rounding, bit);
1046 bf0[20] =
1047 half_btf_avx2(&cospim48, &bf1[20], &cospim16, &bf1[27], &rounding, bit);
1048 bf0[21] =
1049 half_btf_avx2(&cospim48, &bf1[21], &cospim16, &bf1[26], &rounding, bit);
1050 bf0[22] = bf1[22];
1051 bf0[23] = bf1[23];
1052 bf0[24] = bf1[24];
1053 bf0[25] = bf1[25];
1054 bf0[26] =
1055 half_btf_avx2(&cospim16, &bf1[21], &cospi48, &bf1[26], &rounding, bit);
1056 bf0[27] =
1057 half_btf_avx2(&cospim16, &bf1[20], &cospi48, &bf1[27], &rounding, bit);
1058 bf0[28] =
1059 half_btf_avx2(&cospi48, &bf1[19], &cospi16, &bf1[28], &rounding, bit);
1060 bf0[29] =
1061 half_btf_avx2(&cospi48, &bf1[18], &cospi16, &bf1[29], &rounding, bit);
1062 bf0[30] = bf1[30];
1063 bf0[31] = bf1[31];
1064
1065 // stage 7
1066 addsub_avx2(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi);
1067 addsub_avx2(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi);
1068 addsub_avx2(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi);
1069 addsub_avx2(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi);
1070 bf1[8] = bf0[8];
1071 bf1[9] = bf0[9];
1072 bf1[10] =
1073 half_btf_avx2(&cospim32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
1074 bf1[11] =
1075 half_btf_avx2(&cospim32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
1076 bf1[12] =
1077 half_btf_avx2(&cospi32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
1078 bf1[13] =
1079 half_btf_avx2(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
1080 bf1[14] = bf0[14];
1081 bf1[15] = bf0[15];
1082 addsub_avx2(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi);
1083 addsub_avx2(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi);
1084 addsub_avx2(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi);
1085 addsub_avx2(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi);
1086 addsub_avx2(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi);
1087 addsub_avx2(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi);
1088 addsub_avx2(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi);
1089 addsub_avx2(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi);
1090
1091 // stage 8
1092 addsub_avx2(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi);
1093 addsub_avx2(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi);
1094 addsub_avx2(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi);
1095 addsub_avx2(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi);
1096 addsub_avx2(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi);
1097 addsub_avx2(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi);
1098 addsub_avx2(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi);
1099 addsub_avx2(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi);
1100 bf0[16] = bf1[16];
1101 bf0[17] = bf1[17];
1102 bf0[18] = bf1[18];
1103 bf0[19] = bf1[19];
1104 bf0[20] =
1105 half_btf_avx2(&cospim32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
1106 bf0[21] =
1107 half_btf_avx2(&cospim32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
1108 bf0[22] =
1109 half_btf_avx2(&cospim32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
1110 bf0[23] =
1111 half_btf_avx2(&cospim32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
1112 bf0[24] =
1113 half_btf_avx2(&cospi32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
1114 bf0[25] =
1115 half_btf_avx2(&cospi32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
1116 bf0[26] =
1117 half_btf_avx2(&cospi32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
1118 bf0[27] =
1119 half_btf_avx2(&cospi32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
1120 bf0[28] = bf1[28];
1121 bf0[29] = bf1[29];
1122 bf0[30] = bf1[30];
1123 bf0[31] = bf1[31];
1124
1125 // stage 9
1126 addsub_avx2(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo, &clamp_hi);
1127 addsub_avx2(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo, &clamp_hi);
1128 addsub_avx2(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo, &clamp_hi);
1129 addsub_avx2(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo, &clamp_hi);
1130 addsub_avx2(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo, &clamp_hi);
1131 addsub_avx2(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo, &clamp_hi);
1132 addsub_avx2(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo, &clamp_hi);
1133 addsub_avx2(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo, &clamp_hi);
1134 addsub_avx2(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo, &clamp_hi);
1135 addsub_avx2(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo, &clamp_hi);
1136 addsub_avx2(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo, &clamp_hi);
1137 addsub_avx2(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo, &clamp_hi);
1138 addsub_avx2(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo, &clamp_hi);
1139 addsub_avx2(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo, &clamp_hi);
1140 addsub_avx2(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo, &clamp_hi);
1141 addsub_avx2(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo, &clamp_hi);
1142 if (!do_cols) {
1143 const int log_range_out = AOMMAX(16, bd + 6);
1144 const __m256i clamp_lo_out =
1145 _mm256_set1_epi32(-(1 << (log_range_out - 1)));
1146 const __m256i clamp_hi_out =
1147 _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
1148 round_shift_8x8_avx2(out, out_shift);
1149 round_shift_8x8_avx2(out + 16, out_shift);
1150 highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 32);
1151 }
1152 }
1153}
1154static void idct16_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
1155 int bd, int out_shift) {
1156 const int32_t *cospi = cospi_arr(bit);
1157 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
1158 const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
1159 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
1160 __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
1161 __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
1162
1163 {
1164 // stage 0
1165 // stage 1
1166 // stage 2
1167 // stage 3
1168 // stage 4
1169 in[0] = _mm256_mullo_epi32(in[0], cospi32);
1170 in[0] = _mm256_add_epi32(in[0], rnding);
1171 in[0] = _mm256_srai_epi32(in[0], bit);
1172
1173 // stage 5
1174 // stage 6
1175 // stage 7
1176 if (!do_cols) {
1177 const int log_range_out = AOMMAX(16, bd + 6);
1178 clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
1179 clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
1180 __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
1181 in[0] = _mm256_add_epi32(in[0], offset);
1182 in[0] = _mm256_sra_epi32(in[0], _mm_cvtsi32_si128(out_shift));
1183 }
1184 in[0] = _mm256_max_epi32(in[0], clamp_lo);
1185 in[0] = _mm256_min_epi32(in[0], clamp_hi);
1186 out[0] = in[0];
1187 out[1] = in[0];
1188 out[2] = in[0];
1189 out[3] = in[0];
1190 out[4] = in[0];
1191 out[5] = in[0];
1192 out[6] = in[0];
1193 out[7] = in[0];
1194 out[8] = in[0];
1195 out[9] = in[0];
1196 out[10] = in[0];
1197 out[11] = in[0];
1198 out[12] = in[0];
1199 out[13] = in[0];
1200 out[14] = in[0];
1201 out[15] = in[0];
1202 }
1203}
1204
1205static void idct16_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
1206 int bd, int out_shift) {
1207 const int32_t *cospi = cospi_arr(bit);
1208 const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
1209 const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
1210 const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
1211 const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
1212 const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
1213 const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
1214 const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
1215 const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
1216 const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
1217 const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
1218 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
1219 const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
1220 const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
1221 const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
1222 const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
1223 const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
1224 const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
1225 const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
1: Assuming right operand of bit shift is non-negative but less than 32
1226 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2: Assuming 'do_cols' is 0
3: '?' condition is false
4: Assuming the condition is true
5: '?' condition is true
1227 const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
1228 const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
1229 __m256i u[16], x, y;
1230
1231 {
1232 // stage 0
1233 // stage 1
1234 u[0] = in[0];
1235 u[2] = in[4];
1236 u[4] = in[2];
1237 u[6] = in[6];
1238 u[8] = in[1];
1239 u[10] = in[5];
1240 u[12] = in[3];
1241 u[14] = in[7];
1242
1243 // stage 2
1244 u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
1245 u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
1246
1247 u[9] = half_btf_0_avx2(&cospim36, &u[14], &rnding, bit);
1248 u[14] = half_btf_0_avx2(&cospi28, &u[14], &rnding, bit);
1249
1250 u[13] = half_btf_0_avx2(&cospi20, &u[10], &rnding, bit);
1251 u[10] = half_btf_0_avx2(&cospi44, &u[10], &rnding, bit);
1252
1253 u[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit);
1254 u[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit);
1255
1256 // stage 3
1257 u[7] = half_btf_0_avx2(&cospi8, &u[4], &rnding, bit);
1258 u[4] = half_btf_0_avx2(&cospi56, &u[4], &rnding, bit);
1259 u[5] = half_btf_0_avx2(&cospim40, &u[6], &rnding, bit);
1260 u[6] = half_btf_0_avx2(&cospi24, &u[6], &rnding, bit);
1261
1262 addsub_avx2(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
1263 addsub_avx2(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
1264 addsub_avx2(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
1265 addsub_avx2(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
1266
1267 // stage 4
1268 x = _mm256_mullo_epi32(u[0], cospi32);
1269 u[0] = _mm256_add_epi32(x, rnding);
1270 u[0] = _mm256_srai_epi32(u[0], bit);
1271 u[1] = u[0];
1272
1273 u[3] = half_btf_0_avx2(&cospi16, &u[2], &rnding, bit);
1274 u[2] = half_btf_0_avx2(&cospi48, &u[2], &rnding, bit);
1275
1276 addsub_avx2(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi);
1277 addsub_avx2(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi);
1278
1279 x = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
1280 u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
1281 u[9] = x;
1282 y = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
1283 u[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
1284 u[10] = y;
1285
1286 // stage 5
1287 addsub_avx2(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
1288 addsub_avx2(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
1289
1290 x = _mm256_mullo_epi32(u[5], cospi32);
1291 y = _mm256_mullo_epi32(u[6], cospi32);
1292 u[5] = _mm256_sub_epi32(y, x);
1293 u[5] = _mm256_add_epi32(u[5], rnding);
1294 u[5] = _mm256_srai_epi32(u[5], bit);
1295
1296 u[6] = _mm256_add_epi32(y, x);
1297 u[6] = _mm256_add_epi32(u[6], rnding);
1298 u[6] = _mm256_srai_epi32(u[6], bit);
1299
1300 addsub_avx2(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
1301 addsub_avx2(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
1302 addsub_avx2(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
1303 addsub_avx2(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
1304
1305 // stage 6
1306 addsub_avx2(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi);
1307 addsub_avx2(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi);
1308 addsub_avx2(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi);
1309 addsub_avx2(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi);
1310
1311 x = _mm256_mullo_epi32(u[10], cospi32);
1312 y = _mm256_mullo_epi32(u[13], cospi32);
1313 u[10] = _mm256_sub_epi32(y, x);
1314 u[10] = _mm256_add_epi32(u[10], rnding);
1315 u[10] = _mm256_srai_epi32(u[10], bit);
1316
1317 u[13] = _mm256_add_epi32(x, y);
1318 u[13] = _mm256_add_epi32(u[13], rnding);
1319 u[13] = _mm256_srai_epi32(u[13], bit);
1320
1321 x = _mm256_mullo_epi32(u[11], cospi32);
1322 y = _mm256_mullo_epi32(u[12], cospi32);
1323 u[11] = _mm256_sub_epi32(y, x);
1324 u[11] = _mm256_add_epi32(u[11], rnding);
1325 u[11] = _mm256_srai_epi32(u[11], bit);
1326
1327 u[12] = _mm256_add_epi32(x, y);
1328 u[12] = _mm256_add_epi32(u[12], rnding);
1329 u[12] = _mm256_srai_epi32(u[12], bit);
1330 // stage 7
1331 addsub_avx2(u[0], u[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
1332 addsub_avx2(u[1], u[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
1333 addsub_avx2(u[2], u[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
1334 addsub_avx2(u[3], u[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
1335 addsub_avx2(u[4], u[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
1336 addsub_avx2(u[5], u[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
1337 addsub_avx2(u[6], u[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
1338 addsub_avx2(u[7], u[8], out + 7, out + 8, &clamp_lo, &clamp_hi);
1339
1340 if (!do_cols) {
5.1: 'do_cols' is 0
1341 const int log_range_out = AOMMAX(16, bd + 6);
6: Taking true branch
7: Assuming the condition is false
8: '?' condition is false
1342 const __m256i clamp_lo_out =
1343 _mm256_set1_epi32(-(1 << (log_range_out - 1)));
9: The result of left shift is undefined because the right operand is not smaller than 32, the capacity of 'int'
1344 const __m256i clamp_hi_out =
1345 _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
1346 round_shift_8x8_avx2(out, out_shift);
1347 highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 16);
1348 }
1349 }
1350}
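Note on the warning at line 1343: the issue is the shift count, not the shifted value. On this path do_cols is 0 and the analyzer assumes AOMMAX(16, bd + 6) evaluates to bd + 6, so log_range_out - 1 can reach 32 or more if bd is unconstrained, and shifting a 32-bit int by 32 is undefined. Below is a minimal standalone sketch of that arithmetic (not part of the report); the asserted bit depths of 8, 10, and 12 are the assumption the analyzer cannot see.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative sketch of the clamp-bound arithmetic from lines 1341-1345.
 * The assert documents the assumed AV1 bit depths (8, 10, 12); without such
 * a bound, bd >= 27 gives log_range_out >= 33 and a shift count of 32 or
 * more, which is undefined behavior for a 32-bit int. */
static int32_t clamp_hi_bound(int bd) {
  assert(bd == 8 || bd == 10 || bd == 12); /* assumed bound, not in the report */
  const int log_range_out = (16 > bd + 6) ? 16 : bd + 6; /* AOMMAX(16, bd + 6) */
  return (int32_t)((1 << (log_range_out - 1)) - 1); /* shift count is at most 17 */
}

int main(void) {
  printf("%d\n", clamp_hi_bound(12)); /* prints 131071, i.e. (1 << 17) - 1 */
  return 0;
}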
1351
1352static void idct16_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd,
1353 int out_shift) {
1354 const int32_t *cospi = cospi_arr(bit);
1355 const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
1356 const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
1357 const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
1358 const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
1359 const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
1360 const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
1361 const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
1362 const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
1363 const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
1364 const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
1365 const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
1366 const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
1367 const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
1368 const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
1369 const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
1370 const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
1371 const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
1372 const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
1373 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
1374 const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
1375 const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
1376 const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
1377 const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
1378 const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
1379 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
1380 const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
1381 const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
1382 __m256i u[16], v[16], x, y;
1383
1384 {
1385 // stage 0
1386 // stage 1
1387 u[0] = in[0];
1388 u[1] = in[8];
1389 u[2] = in[4];
1390 u[3] = in[12];
1391 u[4] = in[2];
1392 u[5] = in[10];
1393 u[6] = in[6];
1394 u[7] = in[14];
1395 u[8] = in[1];
1396 u[9] = in[9];
1397 u[10] = in[5];
1398 u[11] = in[13];
1399 u[12] = in[3];
1400 u[13] = in[11];
1401 u[14] = in[7];
1402 u[15] = in[15];
1403
1404 // stage 2
1405 v[0] = u[0];
1406 v[1] = u[1];
1407 v[2] = u[2];
1408 v[3] = u[3];
1409 v[4] = u[4];
1410 v[5] = u[5];
1411 v[6] = u[6];
1412 v[7] = u[7];
1413
1414 v[8] = half_btf_avx2(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit);
1415 v[9] = half_btf_avx2(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit);
1416 v[10] = half_btf_avx2(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit);
1417 v[11] = half_btf_avx2(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit);
1418 v[12] = half_btf_avx2(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit);
1419 v[13] = half_btf_avx2(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit);
1420 v[14] = half_btf_avx2(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit);
1421 v[15] = half_btf_avx2(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit);
1422
1423 // stage 3
1424 u[0] = v[0];
1425 u[1] = v[1];
1426 u[2] = v[2];
1427 u[3] = v[3];
1428 u[4] = half_btf_avx2(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit);
1429 u[5] = half_btf_avx2(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit);
1430 u[6] = half_btf_avx2(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit);
1431 u[7] = half_btf_avx2(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit);
1432 addsub_avx2(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
1433 addsub_avx2(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
1434 addsub_avx2(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
1435 addsub_avx2(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
1436
1437 // stage 4
1438 x = _mm256_mullo_epi32(u[0], cospi32);
1439 y = _mm256_mullo_epi32(u[1], cospi32);
1440 v[0] = _mm256_add_epi32(x, y);
1441 v[0] = _mm256_add_epi32(v[0], rnding);
1442 v[0] = _mm256_srai_epi32(v[0], bit);
1443
1444 v[1] = _mm256_sub_epi32(x, y);
1445 v[1] = _mm256_add_epi32(v[1], rnding);
1446 v[1] = _mm256_srai_epi32(v[1], bit);
1447
1448 v[2] = half_btf_avx2(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit);
1449 v[3] = half_btf_avx2(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit);
1450 addsub_avx2(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
1451 addsub_avx2(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
1452 v[8] = u[8];
1453 v[9] = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
1454 v[10] = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
1455 v[11] = u[11];
1456 v[12] = u[12];
1457 v[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
1458 v[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
1459 v[15] = u[15];
1460
1461 // stage 5
1462 addsub_avx2(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
1463 addsub_avx2(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
1464 u[4] = v[4];
1465
1466 x = _mm256_mullo_epi32(v[5], cospi32);
1467 y = _mm256_mullo_epi32(v[6], cospi32);
1468 u[5] = _mm256_sub_epi32(y, x);
1469 u[5] = _mm256_add_epi32(u[5], rnding);
1470 u[5] = _mm256_srai_epi32(u[5], bit);
1471
1472 u[6] = _mm256_add_epi32(y, x);
1473 u[6] = _mm256_add_epi32(u[6], rnding);
1474 u[6] = _mm256_srai_epi32(u[6], bit);
1475
1476 u[7] = v[7];
1477 addsub_avx2(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
1478 addsub_avx2(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
1479 addsub_avx2(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
1480 addsub_avx2(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
1481
1482 // stage 6
1483 addsub_avx2(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi);
1484 addsub_avx2(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi);
1485 addsub_avx2(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi);
1486 addsub_avx2(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi);
1487 v[8] = u[8];
1488 v[9] = u[9];
1489
1490 x = _mm256_mullo_epi32(u[10], cospi32);
1491 y = _mm256_mullo_epi32(u[13], cospi32);
1492 v[10] = _mm256_sub_epi32(y, x);
1493 v[10] = _mm256_add_epi32(v[10], rnding);
1494 v[10] = _mm256_srai_epi32(v[10], bit);
1495
1496 v[13] = _mm256_add_epi32(x, y);
1497 v[13] = _mm256_add_epi32(v[13], rnding);
1498 v[13] = _mm256_srai_epi32(v[13], bit);
1499
1500 x = _mm256_mullo_epi32(u[11], cospi32);
1501 y = _mm256_mullo_epi32(u[12], cospi32);
1502 v[11] = _mm256_sub_epi32(y, x);
1503 v[11] = _mm256_add_epi32(v[11], rnding);
1504 v[11] = _mm256_srai_epi32(v[11], bit);
1505
1506 v[12] = _mm256_add_epi32(x, y);
1507 v[12] = _mm256_add_epi32(v[12], rnding);
1508 v[12] = _mm256_srai_epi32(v[12], bit);
1509
1510 v[14] = u[14];
1511 v[15] = u[15];
1512
1513 // stage 7
1514 addsub_avx2(v[0], v[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
1515 addsub_avx2(v[1], v[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
1516 addsub_avx2(v[2], v[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
1517 addsub_avx2(v[3], v[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
1518 addsub_avx2(v[4], v[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
1519 addsub_avx2(v[5], v[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
1520 addsub_avx2(v[6], v[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
1521 addsub_avx2(v[7], v[8], out + 7, out + 8, &clamp_lo, &clamp_hi);
1522
1523 if (!do_cols) {
1524 const int log_range_out = AOMMAX(16, bd + 6);
1525 const __m256i clamp_lo_out =
1526 _mm256_set1_epi32(-(1 << (log_range_out - 1)));
1527 const __m256i clamp_hi_out =
1528 _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
1529 round_shift_8x8_avx2(out, out_shift);
1530 highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 16);
1531 }
1532 }
1533}
1534
1535static void iadst16_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
1536 int bd, int out_shift) {
1537 const int32_t *cospi = cospi_arr(bit);
1538 const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
1539 const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
1540 const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
1541 const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
1542 const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
1543 const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
1544 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
1545 const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
1546 const __m256i zero = _mm256_setzero_si256();
1547 __m256i v[16], x, y, temp1, temp2;
1548
1549 // Calculate the column 0, 1, 2, 3
1550 {
1551 // stage 0
1552 // stage 1
1553 // stage 2
1554 x = _mm256_mullo_epi32(in[0], cospi62);
1555 v[0] = _mm256_add_epi32(x, rnding);
1556 v[0] = _mm256_srai_epi32(v[0], bit);
1557
1558 x = _mm256_mullo_epi32(in[0], cospi2);
1559 v[1] = _mm256_sub_epi32(zero, x);
1560 v[1] = _mm256_add_epi32(v[1], rnding);
1561 v[1] = _mm256_srai_epi32(v[1], bit);
1562
1563 // stage 3
1564 v[8] = v[0];
1565 v[9] = v[1];
1566
1567 // stage 4
1568 temp1 = _mm256_mullo_epi32(v[8], cospi8);
1569 x = _mm256_mullo_epi32(v[9], cospi56);
1570 temp1 = _mm256_add_epi32(temp1, x);
1571 temp1 = _mm256_add_epi32(temp1, rnding);
1572 temp1 = _mm256_srai_epi32(temp1, bit);
1573
1574 temp2 = _mm256_mullo_epi32(v[8], cospi56);
1575 x = _mm256_mullo_epi32(v[9], cospi8);
1576 temp2 = _mm256_sub_epi32(temp2, x);
1577 temp2 = _mm256_add_epi32(temp2, rnding);
1578 temp2 = _mm256_srai_epi32(temp2, bit);
1579 v[8] = temp1;
1580 v[9] = temp2;
1581
1582 // stage 5
1583 v[4] = v[0];
1584 v[5] = v[1];
1585 v[12] = v[8];
1586 v[13] = v[9];
1587
1588 // stage 6
1589 temp1 = _mm256_mullo_epi32(v[4], cospi16);
1590 x = _mm256_mullo_epi32(v[5], cospi48);
1591 temp1 = _mm256_add_epi32(temp1, x);
1592 temp1 = _mm256_add_epi32(temp1, rnding);
1593 temp1 = _mm256_srai_epi32(temp1, bit);
1594
1595 temp2 = _mm256_mullo_epi32(v[4], cospi48);
1596 x = _mm256_mullo_epi32(v[5], cospi16);
1597 temp2 = _mm256_sub_epi32(temp2, x);
1598 temp2 = _mm256_add_epi32(temp2, rnding);
1599 temp2 = _mm256_srai_epi32(temp2, bit);
1600 v[4] = temp1;
1601 v[5] = temp2;
1602
1603 temp1 = _mm256_mullo_epi32(v[12], cospi16);
1604 x = _mm256_mullo_epi32(v[13], cospi48);
1605 temp1 = _mm256_add_epi32(temp1, x);
1606 temp1 = _mm256_add_epi32(temp1, rnding);
1607 temp1 = _mm256_srai_epi32(temp1, bit);
1608
1609 temp2 = _mm256_mullo_epi32(v[12], cospi48);
1610 x = _mm256_mullo_epi32(v[13], cospi16);
1611 temp2 = _mm256_sub_epi32(temp2, x);
1612 temp2 = _mm256_add_epi32(temp2, rnding);
1613 temp2 = _mm256_srai_epi32(temp2, bit);
1614 v[12] = temp1;
1615 v[13] = temp2;
1616
1617 // stage 7
1618 v[2] = v[0];
1619 v[3] = v[1];
1620 v[6] = v[4];
1621 v[7] = v[5];
1622 v[10] = v[8];
1623 v[11] = v[9];
1624 v[14] = v[12];
1625 v[15] = v[13];
1626
1627 // stage 8
1628 y = _mm256_mullo_epi32(v[2], cospi32);
1629 x = _mm256_mullo_epi32(v[3], cospi32);
1630 v[2] = _mm256_add_epi32(y, x);
1631 v[2] = _mm256_add_epi32(v[2], rnding);
1632 v[2] = _mm256_srai_epi32(v[2], bit);
1633
1634 v[3] = _mm256_sub_epi32(y, x);
1635 v[3] = _mm256_add_epi32(v[3], rnding);
1636 v[3] = _mm256_srai_epi32(v[3], bit);
1637
1638 y = _mm256_mullo_epi32(v[6], cospi32);
1639 x = _mm256_mullo_epi32(v[7], cospi32);
1640 v[6] = _mm256_add_epi32(y, x);
1641 v[6] = _mm256_add_epi32(v[6], rnding);
1642 v[6] = _mm256_srai_epi32(v[6], bit);
1643
1644 v[7] = _mm256_sub_epi32(y, x);
1645 v[7] = _mm256_add_epi32(v[7], rnding);
1646 v[7] = _mm256_srai_epi32(v[7], bit);
1647
1648 y = _mm256_mullo_epi32(v[10], cospi32);
1649 x = _mm256_mullo_epi32(v[11], cospi32);
1650 v[10] = _mm256_add_epi32(y, x);
1651 v[10] = _mm256_add_epi32(v[10], rnding);
1652 v[10] = _mm256_srai_epi32(v[10], bit);
1653
1654 v[11] = _mm256_sub_epi32(y, x);
1655 v[11] = _mm256_add_epi32(v[11], rnding);
1656 v[11] = _mm256_srai_epi32(v[11], bit);
1657
1658 y = _mm256_mullo_epi32(v[14], cospi32);
1659 x = _mm256_mullo_epi32(v[15], cospi32);
1660 v[14] = _mm256_add_epi32(y, x);
1661 v[14] = _mm256_add_epi32(v[14], rnding);
1662 v[14] = _mm256_srai_epi32(v[14], bit);
1663
1664 v[15] = _mm256_sub_epi32(y, x);
1665 v[15] = _mm256_add_epi32(v[15], rnding);
1666 v[15] = _mm256_srai_epi32(v[15], bit);
1667
1668 // stage 9
1669 if (do_cols) {
1670 out[0] = v[0];
1671 out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), v[8]);
1672 out[2] = v[12];
1673 out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), v[4]);
1674 out[4] = v[6];
1675 out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), v[14]);
1676 out[6] = v[10];
1677 out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), v[2]);
1678 out[8] = v[3];
1679 out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), v[11]);
1680 out[10] = v[15];
1681 out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), v[7]);
1682 out[12] = v[5];
1683 out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), v[13]);
1684 out[14] = v[9];
1685 out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), v[1]);
1686 } else {
1687 const int log_range_out = AOMMAX(16, bd + 6);
1688 const __m256i clamp_lo_out =
1689 _mm256_set1_epi32(-(1 << (log_range_out - 1)));
1690 const __m256i clamp_hi_out =
1691 _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
1692
1693 neg_shift_avx2(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
1694 out_shift);
1695 neg_shift_avx2(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
1696 &clamp_hi_out, out_shift);
1697 neg_shift_avx2(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
1698 &clamp_hi_out, out_shift);
1699 neg_shift_avx2(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
1700 &clamp_hi_out, out_shift);
1701 neg_shift_avx2(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
1702 &clamp_hi_out, out_shift);
1703 neg_shift_avx2(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
1704 &clamp_hi_out, out_shift);
1705 neg_shift_avx2(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
1706 &clamp_hi_out, out_shift);
1707 neg_shift_avx2(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
1708 &clamp_hi_out, out_shift);
1709 }
1710 }
1711}
1712
1713static void iadst16_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
1714 int bd, int out_shift) {
1715 const int32_t *cospi = cospi_arr(bit);
1716 const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
1717 const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
1718 const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
1719 const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
1720 const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
1721 const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
1722 const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
1723 const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
1724 const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
1725 const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
1726 const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
1727 const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
1728 const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
1729 const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
1730 const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
1731 const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
1732 const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
1733 const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
1734 const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
1735 const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
1736 const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
1737 const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
1738 const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
1739 const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
1740 const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
1741 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
1742 const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
1743 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
1744 const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
1745 const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
1746 __m256i u[16], x, y;
1747
1748 {
1749 // stage 0
1750 // stage 1
1751 // stage 2
1752 __m256i zero = _mm256_setzero_si256();
1753 x = _mm256_mullo_epi32(in[0], cospi62);
1754 u[0] = _mm256_add_epi32(x, rnding);
1755 u[0] = _mm256_srai_epi32(u[0], bit);
1756
1757 x = _mm256_mullo_epi32(in[0], cospi2);
1758 u[1] = _mm256_sub_epi32(zero, x);
1759 u[1] = _mm256_add_epi32(u[1], rnding);
1760 u[1] = _mm256_srai_epi32(u[1], bit);
1761
1762 x = _mm256_mullo_epi32(in[2], cospi54);
1763 u[2] = _mm256_add_epi32(x, rnding);
1764 u[2] = _mm256_srai_epi32(u[2], bit);
1765
1766 x = _mm256_mullo_epi32(in[2], cospi10);
1767 u[3] = _mm256_sub_epi32(zero, x);
1768 u[3] = _mm256_add_epi32(u[3], rnding);
1769 u[3] = _mm256_srai_epi32(u[3], bit);
1770
1771 x = _mm256_mullo_epi32(in[4], cospi46);
1772 u[4] = _mm256_add_epi32(x, rnding);
1773 u[4] = _mm256_srai_epi32(u[4], bit);
1774
1775 x = _mm256_mullo_epi32(in[4], cospi18);
1776 u[5] = _mm256_sub_epi32(zero, x);
1777 u[5] = _mm256_add_epi32(u[5], rnding);
1778 u[5] = _mm256_srai_epi32(u[5], bit);
1779
1780 x = _mm256_mullo_epi32(in[6], cospi38);
1781 u[6] = _mm256_add_epi32(x, rnding);
1782 u[6] = _mm256_srai_epi32(u[6], bit);
1783
1784 x = _mm256_mullo_epi32(in[6], cospi26);
1785 u[7] = _mm256_sub_epi32(zero, x);
1786 u[7] = _mm256_add_epi32(u[7], rnding);
1787 u[7] = _mm256_srai_epi32(u[7], bit);
1788
1789 u[8] = _mm256_mullo_epi32(in[7], cospi34);
1790 u[8] = _mm256_add_epi32(u[8], rnding);
1791 u[8] = _mm256_srai_epi32(u[8], bit);
1792
1793 u[9] = _mm256_mullo_epi32(in[7], cospi30);
1794 u[9] = _mm256_add_epi32(u[9], rnding);
1795 u[9] = _mm256_srai_epi32(u[9], bit);
1796
1797 u[10] = _mm256_mullo_epi32(in[5], cospi42);
1798 u[10] = _mm256_add_epi32(u[10], rnding);
1799 u[10] = _mm256_srai_epi32(u[10], bit);
1800
1801 u[11] = _mm256_mullo_epi32(in[5], cospi22);
1802 u[11] = _mm256_add_epi32(u[11], rnding);
1803 u[11] = _mm256_srai_epi32(u[11], bit);
1804
1805 u[12] = _mm256_mullo_epi32(in[3], cospi50);
1806 u[12] = _mm256_add_epi32(u[12], rnding);
1807 u[12] = _mm256_srai_epi32(u[12], bit);
1808
1809 u[13] = _mm256_mullo_epi32(in[3], cospi14);
1810 u[13] = _mm256_add_epi32(u[13], rnding);
1811 u[13] = _mm256_srai_epi32(u[13], bit);
1812
1813 u[14] = _mm256_mullo_epi32(in[1], cospi58);
1814 u[14] = _mm256_add_epi32(u[14], rnding);
1815 u[14] = _mm256_srai_epi32(u[14], bit);
1816
1817 u[15] = _mm256_mullo_epi32(in[1], cospi6);
1818 u[15] = _mm256_add_epi32(u[15], rnding);
1819 u[15] = _mm256_srai_epi32(u[15], bit);
1820
1821 // stage 3
1822 addsub_avx2(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
1823 addsub_avx2(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
1824 addsub_avx2(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
1825 addsub_avx2(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
1826 addsub_avx2(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
1827 addsub_avx2(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
1828 addsub_avx2(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
1829 addsub_avx2(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
1830
1831 // stage 4
1832 y = _mm256_mullo_epi32(u[8], cospi56);
1833 x = _mm256_mullo_epi32(u[9], cospi56);
1834 u[8] = _mm256_mullo_epi32(u[8], cospi8);
1835 u[8] = _mm256_add_epi32(u[8], x);
1836 u[8] = _mm256_add_epi32(u[8], rnding);
1837 u[8] = _mm256_srai_epi32(u[8], bit);
1838
1839 x = _mm256_mullo_epi32(u[9], cospi8);
1840 u[9] = _mm256_sub_epi32(y, x);
1841 u[9] = _mm256_add_epi32(u[9], rnding);
1842 u[9] = _mm256_srai_epi32(u[9], bit);
1843
1844 x = _mm256_mullo_epi32(u[11], cospi24);
1845 y = _mm256_mullo_epi32(u[10], cospi24);
1846 u[10] = _mm256_mullo_epi32(u[10], cospi40);
1847 u[10] = _mm256_add_epi32(u[10], x);
1848 u[10] = _mm256_add_epi32(u[10], rnding);
1849 u[10] = _mm256_srai_epi32(u[10], bit);
1850
1851 x = _mm256_mullo_epi32(u[11], cospi40);
1852 u[11] = _mm256_sub_epi32(y, x);
1853 u[11] = _mm256_add_epi32(u[11], rnding);
1854 u[11] = _mm256_srai_epi32(u[11], bit);
1855
1856 x = _mm256_mullo_epi32(u[13], cospi8);
1857 y = _mm256_mullo_epi32(u[12], cospi8);
1858 u[12] = _mm256_mullo_epi32(u[12], cospim56);
1859 u[12] = _mm256_add_epi32(u[12], x);
1860 u[12] = _mm256_add_epi32(u[12], rnding);
1861 u[12] = _mm256_srai_epi32(u[12], bit);
1862
1863 x = _mm256_mullo_epi32(u[13], cospim56);
1864 u[13] = _mm256_sub_epi32(y, x);
1865 u[13] = _mm256_add_epi32(u[13], rnding);
1866 u[13] = _mm256_srai_epi32(u[13], bit);
1867
1868 x = _mm256_mullo_epi32(u[15], cospi40);
1869 y = _mm256_mullo_epi32(u[14], cospi40);
1870 u[14] = _mm256_mullo_epi32(u[14], cospim24);
1871 u[14] = _mm256_add_epi32(u[14], x);
1872 u[14] = _mm256_add_epi32(u[14], rnding);
1873 u[14] = _mm256_srai_epi32(u[14], bit);
1874
1875 x = _mm256_mullo_epi32(u[15], cospim24);
1876 u[15] = _mm256_sub_epi32(y, x);
1877 u[15] = _mm256_add_epi32(u[15], rnding);
1878 u[15] = _mm256_srai_epi32(u[15], bit);
1879
1880 // stage 5
1881 addsub_avx2(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
1882 addsub_avx2(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
1883 addsub_avx2(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
1884 addsub_avx2(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
1885 addsub_avx2(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
1886 addsub_avx2(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
1887 addsub_avx2(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
1888 addsub_avx2(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
1889
1890 // stage 6
1891 x = _mm256_mullo_epi32(u[5], cospi48);
1892 y = _mm256_mullo_epi32(u[4], cospi48);
1893 u[4] = _mm256_mullo_epi32(u[4], cospi16);
1894 u[4] = _mm256_add_epi32(u[4], x);
1895 u[4] = _mm256_add_epi32(u[4], rnding);
1896 u[4] = _mm256_srai_epi32(u[4], bit);
1897
1898 x = _mm256_mullo_epi32(u[5], cospi16);
1899 u[5] = _mm256_sub_epi32(y, x);
1900 u[5] = _mm256_add_epi32(u[5], rnding);
1901 u[5] = _mm256_srai_epi32(u[5], bit);
1902
1903 x = _mm256_mullo_epi32(u[7], cospi16);
1904 y = _mm256_mullo_epi32(u[6], cospi16);
1905 u[6] = _mm256_mullo_epi32(u[6], cospim48);
1906 u[6] = _mm256_add_epi32(u[6], x);
1907 u[6] = _mm256_add_epi32(u[6], rnding);
1908 u[6] = _mm256_srai_epi32(u[6], bit);
1909
1910 x = _mm256_mullo_epi32(u[7], cospim48);
1911 u[7] = _mm256_sub_epi32(y, x);
1912 u[7] = _mm256_add_epi32(u[7], rnding);
1913 u[7] = _mm256_srai_epi32(u[7], bit);
1914
1915 x = _mm256_mullo_epi32(u[13], cospi48);
1916 y = _mm256_mullo_epi32(u[12], cospi48);
1917 u[12] = _mm256_mullo_epi32(u[12], cospi16);
1918 u[12] = _mm256_add_epi32(u[12], x);
1919 u[12] = _mm256_add_epi32(u[12], rnding);
1920 u[12] = _mm256_srai_epi32(u[12], bit);
1921
1922 x = _mm256_mullo_epi32(u[13], cospi16);
1923 u[13] = _mm256_sub_epi32(y, x);
1924 u[13] = _mm256_add_epi32(u[13], rnding);
1925 u[13] = _mm256_srai_epi32(u[13], bit);
1926
1927 x = _mm256_mullo_epi32(u[15], cospi16);
1928 y = _mm256_mullo_epi32(u[14], cospi16);
1929 u[14] = _mm256_mullo_epi32(u[14], cospim48);
1930 u[14] = _mm256_add_epi32(u[14], x);
1931 u[14] = _mm256_add_epi32(u[14], rnding);
1932 u[14] = _mm256_srai_epi32(u[14], bit);
1933
1934 x = _mm256_mullo_epi32(u[15], cospim48);
1935 u[15] = _mm256_sub_epi32(y, x);
1936 u[15] = _mm256_add_epi32(u[15], rnding);
1937 u[15] = _mm256_srai_epi32(u[15], bit);
1938
1939 // stage 7
1940 addsub_avx2(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
1941 addsub_avx2(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
1942 addsub_avx2(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
1943 addsub_avx2(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
1944 addsub_avx2(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
1945 addsub_avx2(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
1946 addsub_avx2(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
1947 addsub_avx2(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
1948
1949 // stage 8
1950 y = _mm256_mullo_epi32(u[2], cospi32);
1951 x = _mm256_mullo_epi32(u[3], cospi32);
1952 u[2] = _mm256_add_epi32(y, x);
1953 u[2] = _mm256_add_epi32(u[2], rnding);
1954 u[2] = _mm256_srai_epi32(u[2], bit);
1955
1956 u[3] = _mm256_sub_epi32(y, x);
1957 u[3] = _mm256_add_epi32(u[3], rnding);
1958 u[3] = _mm256_srai_epi32(u[3], bit);
1959 y = _mm256_mullo_epi32(u[6], cospi32);
1960 x = _mm256_mullo_epi32(u[7], cospi32);
1961 u[6] = _mm256_add_epi32(y, x);
1962 u[6] = _mm256_add_epi32(u[6], rnding);
1963 u[6] = _mm256_srai_epi32(u[6], bit);
1964
1965 u[7] = _mm256_sub_epi32(y, x);
1966 u[7] = _mm256_add_epi32(u[7], rnding);
1967 u[7] = _mm256_srai_epi32(u[7], bit);
1968
1969 y = _mm256_mullo_epi32(u[10], cospi32);
1970 x = _mm256_mullo_epi32(u[11], cospi32);
1971 u[10] = _mm256_add_epi32(y, x);
1972 u[10] = _mm256_add_epi32(u[10], rnding);
1973 u[10] = _mm256_srai_epi32(u[10], bit);
1974
1975 u[11] = _mm256_sub_epi32(y, x);
1976 u[11] = _mm256_add_epi32(u[11], rnding);
1977 u[11] = _mm256_srai_epi32(u[11], bit);
1978
1979 y = _mm256_mullo_epi32(u[14], cospi32);
1980 x = _mm256_mullo_epi32(u[15], cospi32);
1981 u[14] = _mm256_add_epi32(y, x);
1982 u[14] = _mm256_add_epi32(u[14], rnding);
1983 u[14] = _mm256_srai_epi32(u[14], bit);
1984
1985 u[15] = _mm256_sub_epi32(y, x);
1986 u[15] = _mm256_add_epi32(u[15], rnding);
1987 u[15] = _mm256_srai_epi32(u[15], bit);
1988
1989 // stage 9
1990 if (do_cols) {
1991 out[0] = u[0];
1992 out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), u[8]);
1993 out[2] = u[12];
1994 out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), u[4]);
1995 out[4] = u[6];
1996 out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), u[14]);
1997 out[6] = u[10];
1998 out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), u[2]);
1999 out[8] = u[3];
2000 out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), u[11]);
2001 out[10] = u[15];
2002 out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), u[7]);
2003 out[12] = u[5];
2004 out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), u[13]);
2005 out[14] = u[9];
2006 out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), u[1]);
2007 } else {
2008 const int log_range_out = AOMMAX(16, bd + 6);
2009 const __m256i clamp_lo_out =
2010 _mm256_set1_epi32(-(1 << (log_range_out - 1)));
2011 const __m256i clamp_hi_out =
2012 _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
2013
2014 neg_shift_avx2(u[0], u[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
2015 out_shift);
2016 neg_shift_avx2(u[12], u[4], out + 2, out + 3, &clamp_lo_out,
2017 &clamp_hi_out, out_shift);
2018 neg_shift_avx2(u[6], u[14], out + 4, out + 5, &clamp_lo_out,
2019 &clamp_hi_out, out_shift);
2020 neg_shift_avx2(u[10], u[2], out + 6, out + 7, &clamp_lo_out,
2021 &clamp_hi_out, out_shift);
2022 neg_shift_avx2(u[3], u[11], out + 8, out + 9, &clamp_lo_out,
2023 &clamp_hi_out, out_shift);
2024 neg_shift_avx2(u[15], u[7], out + 10, out + 11, &clamp_lo_out,
2025 &clamp_hi_out, out_shift);
2026 neg_shift_avx2(u[5], u[13], out + 12, out + 13, &clamp_lo_out,
2027 &clamp_hi_out, out_shift);
2028 neg_shift_avx2(u[9], u[1], out + 14, out + 15, &clamp_lo_out,
2029 &clamp_hi_out, out_shift);
2030 }
2031 }
2032}
2033
2034static void iadst16_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
2035 int bd, int out_shift) {
2036 const int32_t *cospi = cospi_arr(bit);
2037 const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
2038 const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
2039 const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
2040 const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
2041 const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
2042 const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
2043 const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
2044 const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
2045 const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
2046 const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
2047 const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
2048 const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
2049 const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
2050 const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
2051 const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
2052 const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
2053 const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
2054 const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
2055 const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
2056 const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
2057 const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
2058 const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
2059 const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
2060 const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
2061 const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
2062 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
2063 const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
2064 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2065 const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
2066 const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
2067 __m256i u[16], v[16], x, y;
2068
2069 {
2070 // stage 0
2071 // stage 1
2072 // stage 2
2073 v[0] = _mm256_mullo_epi32(in[15], cospi2);
2074 x = _mm256_mullo_epi32(in[0], cospi62);
2075 v[0] = _mm256_add_epi32(v[0], x);
2076 v[0] = _mm256_add_epi32(v[0], rnding);
2077 v[0] = _mm256_srai_epi32(v[0], bit);
2078
2079 v[1] = _mm256_mullo_epi32(in[15], cospi62);
2080 x = _mm256_mullo_epi32(in[0], cospi2);
2081 v[1] = _mm256_sub_epi32(v[1], x);
2082 v[1] = _mm256_add_epi32(v[1], rnding);
2083 v[1] = _mm256_srai_epi32(v[1], bit);
2084
2085 v[2] = _mm256_mullo_epi32(in[13], cospi10);
2086 x = _mm256_mullo_epi32(in[2], cospi54);
2087 v[2] = _mm256_add_epi32(v[2], x);
2088 v[2] = _mm256_add_epi32(v[2], rnding);
2089 v[2] = _mm256_srai_epi32(v[2], bit);
2090
2091 v[3] = _mm256_mullo_epi32(in[13], cospi54);
2092 x = _mm256_mullo_epi32(in[2], cospi10);
2093 v[3] = _mm256_sub_epi32(v[3], x);
2094 v[3] = _mm256_add_epi32(v[3], rnding);
2095 v[3] = _mm256_srai_epi32(v[3], bit);
2096
2097 v[4] = _mm256_mullo_epi32(in[11], cospi18);
2098 x = _mm256_mullo_epi32(in[4], cospi46);
2099 v[4] = _mm256_add_epi32(v[4], x);
2100 v[4] = _mm256_add_epi32(v[4], rnding);
2101 v[4] = _mm256_srai_epi32(v[4], bit);
2102
2103 v[5] = _mm256_mullo_epi32(in[11], cospi46);
2104 x = _mm256_mullo_epi32(in[4], cospi18);
2105 v[5] = _mm256_sub_epi32(v[5], x);
2106 v[5] = _mm256_add_epi32(v[5], rnding);
2107 v[5] = _mm256_srai_epi32(v[5], bit);
2108
2109 v[6] = _mm256_mullo_epi32(in[9], cospi26);
2110 x = _mm256_mullo_epi32(in[6], cospi38);
2111 v[6] = _mm256_add_epi32(v[6], x);
2112 v[6] = _mm256_add_epi32(v[6], rnding);
2113 v[6] = _mm256_srai_epi32(v[6], bit);
2114
2115 v[7] = _mm256_mullo_epi32(in[9], cospi38);
2116 x = _mm256_mullo_epi32(in[6], cospi26);
2117 v[7] = _mm256_sub_epi32(v[7], x);
2118 v[7] = _mm256_add_epi32(v[7], rnding);
2119 v[7] = _mm256_srai_epi32(v[7], bit);
2120
2121 v[8] = _mm256_mullo_epi32(in[7], cospi34);
2122 x = _mm256_mullo_epi32(in[8], cospi30);
2123 v[8] = _mm256_add_epi32(v[8], x);
2124 v[8] = _mm256_add_epi32(v[8], rnding);
2125 v[8] = _mm256_srai_epi32(v[8], bit);
2126
2127 v[9] = _mm256_mullo_epi32(in[7], cospi30);
2128 x = _mm256_mullo_epi32(in[8], cospi34);
2129 v[9] = _mm256_sub_epi32(v[9], x);
2130 v[9] = _mm256_add_epi32(v[9], rnding);
2131 v[9] = _mm256_srai_epi32(v[9], bit);
2132
2133 v[10] = _mm256_mullo_epi32(in[5], cospi42);
2134 x = _mm256_mullo_epi32(in[10], cospi22);
2135 v[10] = _mm256_add_epi32(v[10], x);
2136 v[10] = _mm256_add_epi32(v[10], rnding);
2137 v[10] = _mm256_srai_epi32(v[10], bit);
2138
2139 v[11] = _mm256_mullo_epi32(in[5], cospi22);
2140 x = _mm256_mullo_epi32(in[10], cospi42);
2141 v[11] = _mm256_sub_epi32(v[11], x);
2142 v[11] = _mm256_add_epi32(v[11], rnding);
2143 v[11] = _mm256_srai_epi32(v[11], bit);
2144
2145 v[12] = _mm256_mullo_epi32(in[3], cospi50);
2146 x = _mm256_mullo_epi32(in[12], cospi14);
2147 v[12] = _mm256_add_epi32(v[12], x);
2148 v[12] = _mm256_add_epi32(v[12], rnding);
2149 v[12] = _mm256_srai_epi32(v[12], bit);
2150
2151 v[13] = _mm256_mullo_epi32(in[3], cospi14);
2152 x = _mm256_mullo_epi32(in[12], cospi50);
2153 v[13] = _mm256_sub_epi32(v[13], x);
2154 v[13] = _mm256_add_epi32(v[13], rnding);
2155 v[13] = _mm256_srai_epi32(v[13], bit);
2156
2157 v[14] = _mm256_mullo_epi32(in[1], cospi58);
2158 x = _mm256_mullo_epi32(in[14], cospi6);
2159 v[14] = _mm256_add_epi32(v[14], x);
2160 v[14] = _mm256_add_epi32(v[14], rnding);
2161 v[14] = _mm256_srai_epi32(v[14], bit);
2162
2163 v[15] = _mm256_mullo_epi32(in[1], cospi6);
2164 x = _mm256_mullo_epi32(in[14], cospi58);
2165 v[15] = _mm256_sub_epi32(v[15], x);
2166 v[15] = _mm256_add_epi32(v[15], rnding);
2167 v[15] = _mm256_srai_epi32(v[15], bit);
2168
2169 // stage 3
2170 addsub_avx2(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
2171 addsub_avx2(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
2172 addsub_avx2(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
2173 addsub_avx2(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
2174 addsub_avx2(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
2175 addsub_avx2(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
2176 addsub_avx2(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
2177 addsub_avx2(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
2178
2179 // stage 4
2180 v[0] = u[0];
2181 v[1] = u[1];
2182 v[2] = u[2];
2183 v[3] = u[3];
2184 v[4] = u[4];
2185 v[5] = u[5];
2186 v[6] = u[6];
2187 v[7] = u[7];
2188
2189 v[8] = _mm256_mullo_epi32(u[8], cospi8);
2190 x = _mm256_mullo_epi32(u[9], cospi56);
2191 v[8] = _mm256_add_epi32(v[8], x);
2192 v[8] = _mm256_add_epi32(v[8], rnding);
2193 v[8] = _mm256_srai_epi32(v[8], bit);
2194
2195 v[9] = _mm256_mullo_epi32(u[8], cospi56);
2196 x = _mm256_mullo_epi32(u[9], cospi8);
2197 v[9] = _mm256_sub_epi32(v[9], x);
2198 v[9] = _mm256_add_epi32(v[9], rnding);
2199 v[9] = _mm256_srai_epi32(v[9], bit);
2200
2201 v[10] = _mm256_mullo_epi32(u[10], cospi40);
2202 x = _mm256_mullo_epi32(u[11], cospi24);
2203 v[10] = _mm256_add_epi32(v[10], x);
2204 v[10] = _mm256_add_epi32(v[10], rnding);
2205 v[10] = _mm256_srai_epi32(v[10], bit);
2206
2207 v[11] = _mm256_mullo_epi32(u[10], cospi24);
2208 x = _mm256_mullo_epi32(u[11], cospi40);
2209 v[11] = _mm256_sub_epi32(v[11], x);
2210 v[11] = _mm256_add_epi32(v[11], rnding);
2211 v[11] = _mm256_srai_epi32(v[11], bit);
2212
2213 v[12] = _mm256_mullo_epi32(u[12], cospim56);
2214 x = _mm256_mullo_epi32(u[13], cospi8);
2215 v[12] = _mm256_add_epi32(v[12], x);
2216 v[12] = _mm256_add_epi32(v[12], rnding);
2217 v[12] = _mm256_srai_epi32(v[12], bit);
2218
2219 v[13] = _mm256_mullo_epi32(u[12], cospi8);
2220 x = _mm256_mullo_epi32(u[13], cospim56);
2221 v[13] = _mm256_sub_epi32(v[13], x);
2222 v[13] = _mm256_add_epi32(v[13], rnding);
2223 v[13] = _mm256_srai_epi32(v[13], bit);
2224
2225 v[14] = _mm256_mullo_epi32(u[14], cospim24);
2226 x = _mm256_mullo_epi32(u[15], cospi40);
2227 v[14] = _mm256_add_epi32(v[14], x);
2228 v[14] = _mm256_add_epi32(v[14], rnding);
2229 v[14] = _mm256_srai_epi32(v[14], bit);
2230
2231 v[15] = _mm256_mullo_epi32(u[14], cospi40);
2232 x = _mm256_mullo_epi32(u[15], cospim24);
2233 v[15] = _mm256_sub_epi32(v[15], x);
2234 v[15] = _mm256_add_epi32(v[15], rnding);
2235 v[15] = _mm256_srai_epi32(v[15], bit);
2236
2237 // stage 5
2238 addsub_avx2(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
2239 addsub_avx2(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
2240 addsub_avx2(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
2241 addsub_avx2(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
2242 addsub_avx2(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
2243 addsub_avx2(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
2244 addsub_avx2(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
2245 addsub_avx2(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
2246
2247 // stage 6
2248 v[0] = u[0];
2249 v[1] = u[1];
2250 v[2] = u[2];
2251 v[3] = u[3];
2252
2253 v[4] = _mm256_mullo_epi32(u[4], cospi16);
2254 x = _mm256_mullo_epi32(u[5], cospi48);
2255 v[4] = _mm256_add_epi32(v[4], x);
2256 v[4] = _mm256_add_epi32(v[4], rnding);
2257 v[4] = _mm256_srai_epi32(v[4], bit);
2258
2259 v[5] = _mm256_mullo_epi32(u[4], cospi48);
2260 x = _mm256_mullo_epi32(u[5], cospi16);
2261 v[5] = _mm256_sub_epi32(v[5], x);
2262 v[5] = _mm256_add_epi32(v[5], rnding);
2263 v[5] = _mm256_srai_epi32(v[5], bit);
2264
2265 v[6] = _mm256_mullo_epi32(u[6], cospim48);
2266 x = _mm256_mullo_epi32(u[7], cospi16);
2267 v[6] = _mm256_add_epi32(v[6], x);
2268 v[6] = _mm256_add_epi32(v[6], rnding);
2269 v[6] = _mm256_srai_epi32(v[6], bit);
2270
2271 v[7] = _mm256_mullo_epi32(u[6], cospi16);
2272 x = _mm256_mullo_epi32(u[7], cospim48);
2273 v[7] = _mm256_sub_epi32(v[7], x);
2274 v[7] = _mm256_add_epi32(v[7], rnding);
2275 v[7] = _mm256_srai_epi32(v[7], bit);
2276
2277 v[8] = u[8];
2278 v[9] = u[9];
2279 v[10] = u[10];
2280 v[11] = u[11];
2281
2282 v[12] = _mm256_mullo_epi32(u[12], cospi16);
2283 x = _mm256_mullo_epi32(u[13], cospi48);
2284 v[12] = _mm256_add_epi32(v[12], x);
2285 v[12] = _mm256_add_epi32(v[12], rnding);
2286 v[12] = _mm256_srai_epi32(v[12], bit);
2287
2288 v[13] = _mm256_mullo_epi32(u[12], cospi48);
2289 x = _mm256_mullo_epi32(u[13], cospi16);
2290 v[13] = _mm256_sub_epi32(v[13], x);
2291 v[13] = _mm256_add_epi32(v[13], rnding);
2292 v[13] = _mm256_srai_epi32(v[13], bit);
2293
2294 v[14] = _mm256_mullo_epi32(u[14], cospim48);
2295 x = _mm256_mullo_epi32(u[15], cospi16);
2296 v[14] = _mm256_add_epi32(v[14], x);
2297 v[14] = _mm256_add_epi32(v[14], rnding);
2298 v[14] = _mm256_srai_epi32(v[14], bit);
2299
2300 v[15] = _mm256_mullo_epi32(u[14], cospi16);
2301 x = _mm256_mullo_epi32(u[15], cospim48);
2302 v[15] = _mm256_sub_epi32(v[15], x);
2303 v[15] = _mm256_add_epi32(v[15], rnding);
2304 v[15] = _mm256_srai_epi32(v[15], bit);
2305
2306 // stage 7
2307 addsub_avx2(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
2308 addsub_avx2(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
2309 addsub_avx2(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
2310 addsub_avx2(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
2311 addsub_avx2(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
2312 addsub_avx2(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
2313 addsub_avx2(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
2314 addsub_avx2(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
2315
2316 // stage 8
2317 v[0] = u[0];
2318 v[1] = u[1];
2319
2320 y = _mm256_mullo_epi32(u[2], cospi32);
2321 x = _mm256_mullo_epi32(u[3], cospi32);
2322 v[2] = _mm256_add_epi32(y, x);
2323 v[2] = _mm256_add_epi32(v[2], rnding);
2324 v[2] = _mm256_srai_epi32(v[2], bit);
2325
2326 v[3] = _mm256_sub_epi32(y, x);
2327 v[3] = _mm256_add_epi32(v[3], rnding);
2328 v[3] = _mm256_srai_epi32(v[3], bit);
2329
2330 v[4] = u[4];
2331 v[5] = u[5];
2332
2333 y = _mm256_mullo_epi32(u[6], cospi32);
2334 x = _mm256_mullo_epi32(u[7], cospi32);
2335 v[6] = _mm256_add_epi32(y, x);
2336 v[6] = _mm256_add_epi32(v[6], rnding);
2337 v[6] = _mm256_srai_epi32(v[6], bit);
2338
2339 v[7] = _mm256_sub_epi32(y, x);
2340 v[7] = _mm256_add_epi32(v[7], rnding);
2341 v[7] = _mm256_srai_epi32(v[7], bit);
2342
2343 v[8] = u[8];
2344 v[9] = u[9];
2345
2346 y = _mm256_mullo_epi32(u[10], cospi32);
2347 x = _mm256_mullo_epi32(u[11], cospi32);
2348 v[10] = _mm256_add_epi32(y, x);
2349 v[10] = _mm256_add_epi32(v[10], rnding);
2350 v[10] = _mm256_srai_epi32(v[10], bit);
2351
2352 v[11] = _mm256_sub_epi32(y, x);
2353 v[11] = _mm256_add_epi32(v[11], rnding);
2354 v[11] = _mm256_srai_epi32(v[11], bit);
2355
2356 v[12] = u[12];
2357 v[13] = u[13];
2358
2359 y = _mm256_mullo_epi32(u[14], cospi32);
2360 x = _mm256_mullo_epi32(u[15], cospi32);
2361 v[14] = _mm256_add_epi32(y, x);
2362 v[14] = _mm256_add_epi32(v[14], rnding);
2363 v[14] = _mm256_srai_epi32(v[14], bit);
2364
2365 v[15] = _mm256_sub_epi32(y, x);
2366 v[15] = _mm256_add_epi32(v[15], rnding);
2367 v[15] = _mm256_srai_epi32(v[15], bit);
2368
2369 // stage 9
2370 if (do_cols) {
2371 out[0] = v[0];
2372 out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), v[8]);
2373 out[2] = v[12];
2374 out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), v[4]);
2375 out[4] = v[6];
2376 out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), v[14]);
2377 out[6] = v[10];
2378 out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), v[2]);
2379 out[8] = v[3];
2380 out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), v[11]);
2381 out[10] = v[15];
2382 out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), v[7]);
2383 out[12] = v[5];
2384 out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), v[13]);
2385 out[14] = v[9];
2386 out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), v[1]);
2387 } else {
2388 const int log_range_out = AOMMAX(16, bd + 6);
2389 const __m256i clamp_lo_out =
2390 _mm256_set1_epi32(-(1 << (log_range_out - 1)));
2391 const __m256i clamp_hi_out =
2392 _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
2393
2394 neg_shift_avx2(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
2395 out_shift);
2396 neg_shift_avx2(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
2397 &clamp_hi_out, out_shift);
2398 neg_shift_avx2(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
2399 &clamp_hi_out, out_shift);
2400 neg_shift_avx2(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
2401 &clamp_hi_out, out_shift);
2402 neg_shift_avx2(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
2403 &clamp_hi_out, out_shift);
2404 neg_shift_avx2(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
2405 &clamp_hi_out, out_shift);
2406 neg_shift_avx2(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
2407 &clamp_hi_out, out_shift);
2408 neg_shift_avx2(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
2409 &clamp_hi_out, out_shift);
2410 }
2411 }
2412}
2413static void idct8x8_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
2414 int bd, int out_shift) {
2415 const int32_t *cospi = cospi_arr(bit);
2416 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
2417 const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
2418 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2419 __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
2420 __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
2421 __m256i x;
2422
2423 // stage 0
2424 // stage 1
2425 // stage 2
2426 // stage 3
2427 x = _mm256_mullo_epi32(in[0], cospi32);
2428 x = _mm256_add_epi32(x, rnding);
2429 x = _mm256_srai_epi32(x, bit);
2430
2431 // stage 4
2432 // stage 5
2433 if (!do_cols) {
2434 const int log_range_out = AOMMAX(16, bd + 6);
2435 __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
2436 clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
2437 clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
2438 x = _mm256_add_epi32(x, offset);
2439 x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
2440 }
2441 x = _mm256_max_epi32(x, clamp_lo);
2442 x = _mm256_min_epi32(x, clamp_hi);
2443 out[0] = x;
2444 out[1] = x;
2445 out[2] = x;
2446 out[3] = x;
2447 out[4] = x;
2448 out[5] = x;
2449 out[6] = x;
2450 out[7] = x;
2451}
2452static void idct8x8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
2453 int bd, int out_shift) {
2454 const int32_t *cospi = cospi_arr(bit);
2455 const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
2456 const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
2457 const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
2458 const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
2459 const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
2460 const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
2461 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
2462 const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
2463 const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
2464 const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
2465 const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
2466 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2467 const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
2468 const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
2469 __m256i u0, u1, u2, u3, u4, u5, u6, u7;
2470 __m256i v0, v1, v2, v3, v4, v5, v6, v7;
2471 __m256i x, y;
2472
2473 // stage 0
2474 // stage 1
2475 // stage 2
2476 u0 = in[0];
2477 u1 = in[4];
2478 u2 = in[2];
2479 u3 = in[6];
2480
2481 x = _mm256_mullo_epi32(in[1], cospi56);
2482 y = _mm256_mullo_epi32(in[7], cospim8);
2483 u4 = _mm256_add_epi32(x, y);
2484 u4 = _mm256_add_epi32(u4, rnding);
2485 u4 = _mm256_srai_epi32(u4, bit);
2486
2487 x = _mm256_mullo_epi32(in[1], cospi8);
2488 y = _mm256_mullo_epi32(in[7], cospi56);
2489 u7 = _mm256_add_epi32(x, y);
2490 u7 = _mm256_add_epi32(u7, rnding);
2491 u7 = _mm256_srai_epi32(u7, bit);
2492
2493 x = _mm256_mullo_epi32(in[5], cospi24);
2494 y = _mm256_mullo_epi32(in[3], cospim40);
2495 u5 = _mm256_add_epi32(x, y);
2496 u5 = _mm256_add_epi32(u5, rnding);
2497 u5 = _mm256_srai_epi32(u5, bit);
2498
2499 x = _mm256_mullo_epi32(in[5], cospi40);
2500 y = _mm256_mullo_epi32(in[3], cospi24);
2501 u6 = _mm256_add_epi32(x, y);
2502 u6 = _mm256_add_epi32(u6, rnding);
2503 u6 = _mm256_srai_epi32(u6, bit);
2504
2505 // stage 3
2506 x = _mm256_mullo_epi32(u0, cospi32);
2507 y = _mm256_mullo_epi32(u1, cospi32);
2508 v0 = _mm256_add_epi32(x, y);
2509 v0 = _mm256_add_epi32(v0, rnding);
2510 v0 = _mm256_srai_epi32(v0, bit);
2511
2512 v1 = _mm256_sub_epi32(x, y);
2513 v1 = _mm256_add_epi32(v1, rnding);
2514 v1 = _mm256_srai_epi32(v1, bit);
2515
2516 x = _mm256_mullo_epi32(u2, cospi48);
2517 y = _mm256_mullo_epi32(u3, cospim16);
2518 v2 = _mm256_add_epi32(x, y);
2519 v2 = _mm256_add_epi32(v2, rnding);
2520 v2 = _mm256_srai_epi32(v2, bit);
2521
2522 x = _mm256_mullo_epi32(u2, cospi16);
2523 y = _mm256_mullo_epi32(u3, cospi48);
2524 v3 = _mm256_add_epi32(x, y);
2525 v3 = _mm256_add_epi32(v3, rnding);
2526 v3 = _mm256_srai_epi32(v3, bit);
2527
2528 addsub_avx2(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
2529 addsub_avx2(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);
2530
2531 // stage 4
2532 addsub_avx2(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
2533 addsub_avx2(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
2534 u4 = v4;
2535 u7 = v7;
2536
2537 x = _mm256_mullo_epi32(v5, cospi32);
2538 y = _mm256_mullo_epi32(v6, cospi32);
2539 u6 = _mm256_add_epi32(y, x);
2540 u6 = _mm256_add_epi32(u6, rnding);
2541 u6 = _mm256_srai_epi32(u6, bit);
2542
2543 u5 = _mm256_sub_epi32(y, x);
2544 u5 = _mm256_add_epi32(u5, rnding);
2545 u5 = _mm256_srai_epi32(u5, bit);
2546
2547 addsub_avx2(u0, u7, out + 0, out + 7, &clamp_lo, &clamp_hi);
2548 addsub_avx2(u1, u6, out + 1, out + 6, &clamp_lo, &clamp_hi);
2549 addsub_avx2(u2, u5, out + 2, out + 5, &clamp_lo, &clamp_hi);
2550 addsub_avx2(u3, u4, out + 3, out + 4, &clamp_lo, &clamp_hi);
2551 // stage 5
2552 if (!do_cols) {
2553 const int log_range_out = AOMMAX(16, bd + 6);
2554 const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
2555 const __m256i clamp_hi_out =
2556 _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
2557
2558 round_shift_4x4_avx2(out, out_shift);
2559 round_shift_4x4_avx2(out + 4, out_shift);
2560 highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 8);
2561 }
2562}
2563static void iadst8x8_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
2564 int bd, int out_shift) {
2565 const int32_t *cospi = cospi_arr(bit);
2566 const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
2567 const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
2568 const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
2569 const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
2570 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
2571 const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
2572 const __m256i kZero = _mm256_setzero_si256();
2573 __m256i u[8], x;
2574
2575 // stage 0
2576 // stage 1
2577 // stage 2
2578
2579 x = _mm256_mullo_epi32(in[0], cospi60);
2580 u[0] = _mm256_add_epi32(x, rnding);
2581 u[0] = _mm256_srai_epi32(u[0], bit);
2582
2583 x = _mm256_mullo_epi32(in[0], cospi4);
2584 u[1] = _mm256_sub_epi32(kZero, x);
2585 u[1] = _mm256_add_epi32(u[1], rnding);
2586 u[1] = _mm256_srai_epi32(u[1], bit);
2587
2588 // stage 3
2589 // stage 4
2590 __m256i temp1, temp2;
2591 temp1 = _mm256_mullo_epi32(u[0], cospi16);
2592 x = _mm256_mullo_epi32(u[1], cospi48);
2593 temp1 = _mm256_add_epi32(temp1, x);
2594 temp1 = _mm256_add_epi32(temp1, rnding);
2595 temp1 = _mm256_srai_epi32(temp1, bit);
2596 u[4] = temp1;
2597
2598 temp2 = _mm256_mullo_epi32(u[0], cospi48);
2599 x = _mm256_mullo_epi32(u[1], cospi16);
2600 u[5] = _mm256_sub_epi32(temp2, x);
2601 u[5] = _mm256_add_epi32(u[5], rnding);
2602 u[5] = _mm256_srai_epi32(u[5], bit);
2603
2604 // stage 5
2605 // stage 6
2606 temp1 = _mm256_mullo_epi32(u[0], cospi32);
2607 x = _mm256_mullo_epi32(u[1], cospi32);
2608 u[2] = _mm256_add_epi32(temp1, x);
2609 u[2] = _mm256_add_epi32(u[2], rnding);
2610 u[2] = _mm256_srai_epi32(u[2], bit);
2611
2612 u[3] = _mm256_sub_epi32(temp1, x);
2613 u[3] = _mm256_add_epi32(u[3], rnding);
2614 u[3] = _mm256_srai_epi32(u[3], bit);
2615
2616 temp1 = _mm256_mullo_epi32(u[4], cospi32);
2617 x = _mm256_mullo_epi32(u[5], cospi32);
2618 u[6] = _mm256_add_epi32(temp1, x);
2619 u[6] = _mm256_add_epi32(u[6], rnding);
2620 u[6] = _mm256_srai_epi32(u[6], bit);
2621
2622 u[7] = _mm256_sub_epi32(temp1, x);
2623 u[7] = _mm256_add_epi32(u[7], rnding);
2624 u[7] = _mm256_srai_epi32(u[7], bit);
2625
2626 // stage 7
2627 if (do_cols) {
2628 out[0] = u[0];
2629 out[1] = _mm256_sub_epi32(kZero, u[4]);
2630 out[2] = u[6];
2631 out[3] = _mm256_sub_epi32(kZero, u[2]);
2632 out[4] = u[3];
2633 out[5] = _mm256_sub_epi32(kZero, u[7]);
2634 out[6] = u[5];
2635 out[7] = _mm256_sub_epi32(kZero, u[1]);
2636 } else {
2637 const int log_range_out = AOMMAX(16, bd + 6);
2638 const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
2639 const __m256i clamp_hi_out =
2640 _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
2641
2642 neg_shift_avx2(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
2643 out_shift);
2644 neg_shift_avx2(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
2645 out_shift);
2646 neg_shift_avx2(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
2647 out_shift);
2648 neg_shift_avx2(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
2649 out_shift);
2650 }
2651}
2652
2653static void iadst8x8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
2654 int bd, int out_shift) {
2655 const int32_t *cospi = cospi_arr(bit);
2656 const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
2657 const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
2658 const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
2659 const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
2660 const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
2661 const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
2662 const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
2663 const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
2664 const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
2665 const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
2666 const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
2667 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
2668 const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
2669 const __m256i kZero = _mm256_setzero_si256();
2670 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2671 const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
2672 const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
2673 __m256i u[8], v[8], x;
2674
2675 // stage 0
2676 // stage 1
2677 // stage 2
2678
2679 u[0] = _mm256_mullo_epi32(in[7], cospi4);
2680 x = _mm256_mullo_epi32(in[0], cospi60);
2681 u[0] = _mm256_add_epi32(u[0], x);
2682 u[0] = _mm256_add_epi32(u[0], rnding);
2683 u[0] = _mm256_srai_epi32(u[0], bit);
2684
2685 u[1] = _mm256_mullo_epi32(in[7], cospi60);
2686 x = _mm256_mullo_epi32(in[0], cospi4);
2687 u[1] = _mm256_sub_epi32(u[1], x);
2688 u[1] = _mm256_add_epi32(u[1], rnding);
2689 u[1] = _mm256_srai_epi32(u[1], bit);
2690
2691 u[2] = _mm256_mullo_epi32(in[5], cospi20);
2692 x = _mm256_mullo_epi32(in[2], cospi44);
2693 u[2] = _mm256_add_epi32(u[2], x);
2694 u[2] = _mm256_add_epi32(u[2], rnding);
2695 u[2] = _mm256_srai_epi32(u[2], bit);
2696
2697 u[3] = _mm256_mullo_epi32(in[5], cospi44);
2698 x = _mm256_mullo_epi32(in[2], cospi20);
2699 u[3] = _mm256_sub_epi32(u[3], x);
2700 u[3] = _mm256_add_epi32(u[3], rnding);
2701 u[3] = _mm256_srai_epi32(u[3], bit);
2702
2703 u[4] = _mm256_mullo_epi32(in[3], cospi36);
2704 x = _mm256_mullo_epi32(in[4], cospi28);
2705 u[4] = _mm256_add_epi32(u[4], x);
2706 u[4] = _mm256_add_epi32(u[4], rnding);
2707 u[4] = _mm256_srai_epi32(u[4], bit);
2708
2709 u[5] = _mm256_mullo_epi32(in[3], cospi28);
2710 x = _mm256_mullo_epi32(in[4], cospi36);
2711 u[5] = _mm256_sub_epi32(u[5], x);
2712 u[5] = _mm256_add_epi32(u[5], rnding);
2713 u[5] = _mm256_srai_epi32(u[5], bit);
2714
2715 u[6] = _mm256_mullo_epi32(in[1], cospi52);
2716 x = _mm256_mullo_epi32(in[6], cospi12);
2717 u[6] = _mm256_add_epi32(u[6], x);
2718 u[6] = _mm256_add_epi32(u[6], rnding);
2719 u[6] = _mm256_srai_epi32(u[6], bit);
2720
2721 u[7] = _mm256_mullo_epi32(in[1], cospi12);
2722 x = _mm256_mullo_epi32(in[6], cospi52);
2723 u[7] = _mm256_sub_epi32(u[7], x);
2724 u[7] = _mm256_add_epi32(u[7], rnding);
2725 u[7] = _mm256_srai_epi32(u[7], bit);
2726
2727 // stage 3
2728 addsub_avx2(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
2729 addsub_avx2(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
2730 addsub_avx2(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
2731 addsub_avx2(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
2732
2733 // stage 4
2734 u[0] = v[0];
2735 u[1] = v[1];
2736 u[2] = v[2];
2737 u[3] = v[3];
2738
2739 u[4] = _mm256_mullo_epi32(v[4], cospi16);
2740 x = _mm256_mullo_epi32(v[5], cospi48);
2741 u[4] = _mm256_add_epi32(u[4], x);
2742 u[4] = _mm256_add_epi32(u[4], rnding);
2743 u[4] = _mm256_srai_epi32(u[4], bit);
2744
2745 u[5] = _mm256_mullo_epi32(v[4], cospi48);
2746 x = _mm256_mullo_epi32(v[5], cospi16);
2747 u[5] = _mm256_sub_epi32(u[5], x);
2748 u[5] = _mm256_add_epi32(u[5], rnding);
2749 u[5] = _mm256_srai_epi32(u[5], bit);
2750
2751 u[6] = _mm256_mullo_epi32(v[6], cospim48);
2752 x = _mm256_mullo_epi32(v[7], cospi16);
2753 u[6] = _mm256_add_epi32(u[6], x);
2754 u[6] = _mm256_add_epi32(u[6], rnding);
2755 u[6] = _mm256_srai_epi32(u[6], bit);
2756
2757 u[7] = _mm256_mullo_epi32(v[6], cospi16);
2758 x = _mm256_mullo_epi32(v[7], cospim48);
2759 u[7] = _mm256_sub_epi32(u[7], x);
2760 u[7] = _mm256_add_epi32(u[7], rnding);
2761 u[7] = _mm256_srai_epi32(u[7], bit);
2762
2763 // stage 5
2764 addsub_avx2(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
2765 addsub_avx2(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
2766 addsub_avx2(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
2767 addsub_avx2(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
2768
2769 // stage 6
2770 u[0] = v[0];
2771 u[1] = v[1];
2772 u[4] = v[4];
2773 u[5] = v[5];
2774
2775 v[0] = _mm256_mullo_epi32(v[2], cospi32);
2776 x = _mm256_mullo_epi32(v[3], cospi32);
2777 u[2] = _mm256_add_epi32(v[0], x);
2778 u[2] = _mm256_add_epi32(u[2], rnding);
2779 u[2] = _mm256_srai_epi32(u[2], bit);
2780
2781 u[3] = _mm256_sub_epi32(v[0], x);
2782 u[3] = _mm256_add_epi32(u[3], rnding);
2783 u[3] = _mm256_srai_epi32(u[3], bit);
2784
2785 v[0] = _mm256_mullo_epi32(v[6], cospi32);
2786 x = _mm256_mullo_epi32(v[7], cospi32);
2787 u[6] = _mm256_add_epi32(v[0], x);
2788 u[6] = _mm256_add_epi32(u[6], rnding);
2789 u[6] = _mm256_srai_epi32(u[6], bit);
2790
2791 u[7] = _mm256_sub_epi32(v[0], x);
2792 u[7] = _mm256_add_epi32(u[7], rnding);
2793 u[7] = _mm256_srai_epi32(u[7], bit);
2794
2795 // stage 7
2796 if (do_cols) {
2797 out[0] = u[0];
2798 out[1] = _mm256_sub_epi32(kZero, u[4]);
2799 out[2] = u[6];
2800 out[3] = _mm256_sub_epi32(kZero, u[2]);
2801 out[4] = u[3];
2802 out[5] = _mm256_sub_epi32(kZero, u[7]);
2803 out[6] = u[5];
2804 out[7] = _mm256_sub_epi32(kZero, u[1]);
2805 } else {
2806 const int log_range_out = AOMMAX(16, bd + 6);
2807 const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
2808 const __m256i clamp_hi_out =
2809 _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
2810
2811 neg_shift_avx2(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
2812 out_shift);
2813 neg_shift_avx2(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
2814 out_shift);
2815 neg_shift_avx2(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
2816 out_shift);
2817 neg_shift_avx2(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
2818 out_shift);
2819 }
2820}
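The addsub_avx2() calls above implement a clamped butterfly. A per-lane scalar sketch of the assumed behaviour (the vector helper processes eight 32-bit lanes at once; names and widening here are illustrative):

    #include <stdint.h>

    /* Assumed behaviour of addsub_avx2(in0, in1, out0, out1, lo, hi):
     * butterfly add/subtract, then clamp both results into [lo, hi]. */
    static void addsub_clamp(int32_t in0, int32_t in1, int32_t *out0,
                             int32_t *out1, int32_t lo, int32_t hi) {
      int64_t sum = (int64_t)in0 + in1;
      int64_t diff = (int64_t)in0 - in1;
      if (sum < lo) sum = lo;
      if (sum > hi) sum = hi;
      if (diff < lo) diff = lo;
      if (diff > hi) diff = hi;
      *out0 = (int32_t)sum;
      *out1 = (int32_t)diff;
    }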
2821static inline void idct64_stage8_avx2(
2822 __m256i *u, const __m256i *cospim32, const __m256i *cospi32,
2823 const __m256i *cospim16, const __m256i *cospi48, const __m256i *cospi16,
2824 const __m256i *cospim48, const __m256i *clamp_lo, const __m256i *clamp_hi,
2825 const __m256i *rnding, int bit) {
2826 int i;
2827 __m256i temp1, temp2, temp3, temp4;
2828 temp1 = half_btf_avx2(cospim32, &u[10], cospi32, &u[13], rnding, bit);
2829 u[13] = half_btf_avx2(cospi32, &u[10], cospi32, &u[13], rnding, bit);
2830 u[10] = temp1;
2831 temp2 = half_btf_avx2(cospim32, &u[11], cospi32, &u[12], rnding, bit);
2832 u[12] = half_btf_avx2(cospi32, &u[11], cospi32, &u[12], rnding, bit);
2833 u[11] = temp2;
2834
2835 for (i = 16; i < 20; ++i) {
2836 addsub_avx2(u[i], u[i ^ 7], &u[i], &u[i ^ 7], clamp_lo, clamp_hi);
2837 addsub_avx2(u[i ^ 15], u[i ^ 8], &u[i ^ 15], &u[i ^ 8], clamp_lo, clamp_hi);
2838 }
2839
2840 temp1 = half_btf_avx2(cospim16, &u[36], cospi48, &u[59], rnding, bit);
2841 temp2 = half_btf_avx2(cospim16, &u[37], cospi48, &u[58], rnding, bit);
2842 temp3 = half_btf_avx2(cospim16, &u[38], cospi48, &u[57], rnding, bit);
2843 temp4 = half_btf_avx2(cospim16, &u[39], cospi48, &u[56], rnding, bit);
2844 u[56] = half_btf_avx2(cospi48, &u[39], cospi16, &u[56], rnding, bit);
2845 u[57] = half_btf_avx2(cospi48, &u[38], cospi16, &u[57], rnding, bit);
2846 u[58] = half_btf_avx2(cospi48, &u[37], cospi16, &u[58], rnding, bit);
2847 u[59] = half_btf_avx2(cospi48, &u[36], cospi16, &u[59], rnding, bit);
2848 u[36] = temp1;
2849 u[37] = temp2;
2850 u[38] = temp3;
2851 u[39] = temp4;
2852
2853 temp1 = half_btf_avx2(cospim48, &u[40], cospim16, &u[55], rnding, bit);
2854 temp2 = half_btf_avx2(cospim48, &u[41], cospim16, &u[54], rnding, bit);
2855 temp3 = half_btf_avx2(cospim48, &u[42], cospim16, &u[53], rnding, bit);
2856 temp4 = half_btf_avx2(cospim48, &u[43], cospim16, &u[52], rnding, bit);
2857 u[52] = half_btf_avx2(cospim16, &u[43], cospi48, &u[52], rnding, bit);
2858 u[53] = half_btf_avx2(cospim16, &u[42], cospi48, &u[53], rnding, bit);
2859 u[54] = half_btf_avx2(cospim16, &u[41], cospi48, &u[54], rnding, bit);
2860 u[55] = half_btf_avx2(cospim16, &u[40], cospi48, &u[55], rnding, bit);
2861 u[40] = temp1;
2862 u[41] = temp2;
2863 u[42] = temp3;
2864 u[43] = temp4;
2865}
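The XOR indexing in the stage-8 loop above is compact but opaque; this throwaway snippet (not part of the file) prints the element pairs it actually touches:

    #include <stdio.h>

    /* For i = 16..19 the loop pairs (i, i ^ 7) and (i ^ 15, i ^ 8), i.e.
     * (16,23) (31,24), (17,22) (30,25), (18,21) (29,26), (19,20) (28,27). */
    int main(void) {
      for (int i = 16; i < 20; ++i)
        printf("addsub(u[%d], u[%d]);  addsub(u[%d], u[%d]);\n", i, i ^ 7,
               i ^ 15, i ^ 8);
      return 0;
    }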
2866
2867static inline void idct64_stage9_avx2(__m256i *u, const __m256i *cospim32,
2868 const __m256i *cospi32,
2869 const __m256i *clamp_lo,
2870 const __m256i *clamp_hi,
2871 const __m256i *rnding, int bit) {
2872 int i;
2873 __m256i temp1, temp2, temp3, temp4;
2874 for (i = 0; i < 8; ++i) {
2875 addsub_avx2(u[i], u[15 - i], &u[i], &u[15 - i], clamp_lo, clamp_hi);
2876 }
2877
2878 temp1 = half_btf_avx2(cospim32, &u[20], cospi32, &u[27], rnding, bit);
2879 temp2 = half_btf_avx2(cospim32, &u[21], cospi32, &u[26], rnding, bit);
2880 temp3 = half_btf_avx2(cospim32, &u[22], cospi32, &u[25], rnding, bit);
2881 temp4 = half_btf_avx2(cospim32, &u[23], cospi32, &u[24], rnding, bit);
2882 u[24] = half_btf_avx2(cospi32, &u[23], cospi32, &u[24], rnding, bit);
2883 u[25] = half_btf_avx2(cospi32, &u[22], cospi32, &u[25], rnding, bit);
2884 u[26] = half_btf_avx2(cospi32, &u[21], cospi32, &u[26], rnding, bit);
2885 u[27] = half_btf_avx2(cospi32, &u[20], cospi32, &u[27], rnding, bit);
2886 u[20] = temp1;
2887 u[21] = temp2;
2888 u[22] = temp3;
2889 u[23] = temp4;
2890 for (i = 32; i < 40; i++) {
2891 addsub_avx2(u[i], u[i ^ 15], &u[i], &u[i ^ 15], clamp_lo, clamp_hi);
2892 }
2893
2894 for (i = 48; i < 56; i++) {
2895 addsub_avx2(u[i ^ 15], u[i], &u[i ^ 15], &u[i], clamp_lo, clamp_hi);
2896 }
2897}
2898
2899static inline void idct64_stage10_avx2(__m256i *u, const __m256i *cospim32,
2900 const __m256i *cospi32,
2901 const __m256i *clamp_lo,
2902 const __m256i *clamp_hi,
2903 const __m256i *rnding, int bit) {
2904 __m256i temp1, temp2, temp3, temp4;
2905 for (int i = 0; i < 16; i++) {
2906 addsub_avx2(u[i], u[31 - i], &u[i], &u[31 - i], clamp_lo, clamp_hi);
2907 }
2908
2909 temp1 = half_btf_avx2(cospim32, &u[40], cospi32, &u[55], rnding, bit);
2910 temp2 = half_btf_avx2(cospim32, &u[41], cospi32, &u[54], rnding, bit);
2911 temp3 = half_btf_avx2(cospim32, &u[42], cospi32, &u[53], rnding, bit);
2912 temp4 = half_btf_avx2(cospim32, &u[43], cospi32, &u[52], rnding, bit);
2913 u[52] = half_btf_avx2(cospi32, &u[43], cospi32, &u[52], rnding, bit);
2914 u[53] = half_btf_avx2(cospi32, &u[42], cospi32, &u[53], rnding, bit);
2915 u[54] = half_btf_avx2(cospi32, &u[41], cospi32, &u[54], rnding, bit);
2916 u[55] = half_btf_avx2(cospi32, &u[40], cospi32, &u[55], rnding, bit);
2917 u[40] = temp1;
2918 u[41] = temp2;
2919 u[42] = temp3;
2920 u[43] = temp4;
2921
2922 temp1 = half_btf_avx2(cospim32, &u[44], cospi32, &u[51], rnding, bit);
2923 temp2 = half_btf_avx2(cospim32, &u[45], cospi32, &u[50], rnding, bit);
2924 temp3 = half_btf_avx2(cospim32, &u[46], cospi32, &u[49], rnding, bit);
2925 temp4 = half_btf_avx2(cospim32, &u[47], cospi32, &u[48], rnding, bit);
2926 u[48] = half_btf_avx2(cospi32, &u[47], cospi32, &u[48], rnding, bit);
2927 u[49] = half_btf_avx2(cospi32, &u[46], cospi32, &u[49], rnding, bit);
2928 u[50] = half_btf_avx2(cospi32, &u[45], cospi32, &u[50], rnding, bit);
2929 u[51] = half_btf_avx2(cospi32, &u[44], cospi32, &u[51], rnding, bit);
2930 u[44] = temp1;
2931 u[45] = temp2;
2932 u[46] = temp3;
2933 u[47] = temp4;
2934}
2935
2936static inline void idct64_stage11_avx2(__m256i *u, __m256i *out, int do_cols,
2937 int bd, int out_shift,
2938 const __m256i *clamp_lo,
2939 const __m256i *clamp_hi) {
2940 for (int i = 0; i < 32; i++) {
2941 addsub_avx2(u[i], u[63 - i], &out[(i)], &out[(63 - i)], clamp_lo, clamp_hi);
2942 }
2943
2944 if (!do_cols) {
2945 const int log_range_out = AOMMAX(16, bd + 6);
2946 const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
2947 const __m256i clamp_hi_out =
2948 _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
2949
2950 round_shift_8x8_avx2(out, out_shift);
2951 round_shift_8x8_avx2(out + 16, out_shift);
2952 round_shift_8x8_avx2(out + 32, out_shift);
2953 round_shift_8x8_avx2(out + 48, out_shift);
2954 highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 64);
2955 }
2956}
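For the !do_cols path above, the final clamp range is derived from the bit depth. A standalone sketch of that computation (AOMMAX written out; for AV1 high-bitdepth streams bd is 8, 10 or 12, so log_range_out is at most 18 and both shifts stay well defined):

    #include <stdint.h>

    /* Sketch of the output clamp bounds applied after round_shift_8x8_avx2(). */
    static void output_clamp_bounds(int bd, int32_t *lo, int32_t *hi) {
      const int log_range_out = (16 > bd + 6) ? 16 : (bd + 6); /* AOMMAX(16, bd + 6) */
      *lo = -(1 << (log_range_out - 1));
      *hi = (1 << (log_range_out - 1)) - 1;
    }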
2957
2958static void idct64_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
2959 int bd, int out_shift) {
2960 const int32_t *cospi = cospi_arr(bit);
2961 const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
2962 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2963 __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
2964 __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
2965
2966 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
2967
2968 {
2969 __m256i x;
2970
2971 // stage 1
2972 // stage 2
2973 // stage 3
2974 // stage 4
2975 // stage 5
2976 // stage 6
2977 x = half_btf_0_avx2(&cospi32, &in[0], &rnding, bit);
2978
2979 // stage 8
2980 // stage 9
2981 // stage 10
2982 // stage 11
2983 if (!do_cols) {
2984 const int log_range_out = AOMMAX(16, bd + 6);
2985 clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
2986 clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
2987 if (out_shift != 0) {
2988 __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
2989 x = _mm256_add_epi32(x, offset);
2990 x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
2991 }
2992 }
2993 x = _mm256_max_epi32(x, clamp_lo);
2994 x = _mm256_min_epi32(x, clamp_hi);
2995 out[0] = x;
2996 out[1] = x;
2997 out[2] = x;
2998 out[3] = x;
2999 out[4] = x;
3000 out[5] = x;
3001 out[6] = x;
3002 out[7] = x;
3003 out[8] = x;
3004 out[9] = x;
3005 out[10] = x;
3006 out[11] = x;
3007 out[12] = x;
3008 out[13] = x;
3009 out[14] = x;
3010 out[15] = x;
3011 out[16] = x;
3012 out[17] = x;
3013 out[18] = x;
3014 out[19] = x;
3015 out[20] = x;
3016 out[21] = x;
3017 out[22] = x;
3018 out[23] = x;
3019 out[24] = x;
3020 out[25] = x;
3021 out[26] = x;
3022 out[27] = x;
3023 out[28] = x;
3024 out[29] = x;
3025 out[30] = x;
3026 out[31] = x;
3027 out[32] = x;
3028 out[33] = x;
3029 out[34] = x;
3030 out[35] = x;
3031 out[36] = x;
3032 out[37] = x;
3033 out[38] = x;
3034 out[39] = x;
3035 out[40] = x;
3036 out[41] = x;
3037 out[42] = x;
3038 out[43] = x;
3039 out[44] = x;
3040 out[45] = x;
3041 out[46] = x;
3042 out[47] = x;
3043 out[48] = x;
3044 out[49] = x;
3045 out[50] = x;
3046 out[51] = x;
3047 out[52] = x;
3048 out[53] = x;
3049 out[54] = x;
3050 out[55] = x;
3051 out[56] = x;
3052 out[57] = x;
3053 out[58] = x;
3054 out[59] = x;
3055 out[60] = x;
3056 out[61] = x;
3057 out[62] = x;
3058 out[63] = x;
3059 }
3060}
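idct64_low1_avx2() handles the DC-only case: a single half butterfly, an optional rounded shift for the row pass, a clamp, and a broadcast of the result to all 64 outputs. A per-lane scalar model of the optional shift step shown above (sketch only):

    #include <stdint.h>

    /* Model of: offset = (1 << out_shift) >> 1;
     *           x = _mm256_add_epi32(x, offset);
     *           x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift)); */
    static int32_t round_shift_out(int32_t x, int out_shift) {
      return (x + ((1 << out_shift) >> 1)) >> out_shift;
    }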
3061static void idct64_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
3062 int bd, int out_shift) {
3063 int i, j;
3064 const int32_t *cospi = cospi_arr(bit);
3065 const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
3066 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
3067 const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
3068 const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
3069
3070 const __m256i cospi1 = _mm256_set1_epi32(cospi[1]);
3071 const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
3072 const __m256i cospi3 = _mm256_set1_epi32(cospi[3]);
3073 const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
3074 const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
3075 const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
3076 const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
3077 const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
3078 const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
3079 const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
3080 const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
3081 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
3082 const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
3083 const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
3084 const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
3085 const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
3086 const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
3087 const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
3088 const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
3089 const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]);
3090 const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
3091 const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
3092 const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
3093 const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]);
3094 const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
3095 const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
3096 const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
3097 const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
3098 const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
3099 const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
3100 const __m256i cospi63 = _mm256_set1_epi32(cospi[63]);
3101 const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]);
3102 const __m256i cospi7 = _mm256_set1_epi32(cospi[7]);
3103 const __m256i cospi5 = _mm256_set1_epi32(cospi[5]);
3104 const __m256i cospi59 = _mm256_set1_epi32(cospi[59]);
3105 const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]);
3106 const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
3107 const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
3108
3109 {
3110 __m256i u[64];
3111
3112 // stage 1
3113 u[0] = in[0];
3114 u[8] = in[4];
3115 u[16] = in[2];
3116 u[24] = in[6];
3117 u[32] = in[1];
3118 u[40] = in[5];
3119 u[48] = in[3];
3120 u[56] = in[7];
3121
3122 // stage 2
3123 u[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit);
3124 u[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit);
3125 u[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit);
3126 u[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit);
3127 u[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit);
3128 u[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit);
3129 u[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit);
3130 u[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit);
3131
3132 // stage 3
3133 u[31] = half_btf_0_avx2(&cospi2, &u[16], &rnding, bit);
3134 u[16] = half_btf_0_avx2(&cospi62, &u[16], &rnding, bit);
3135 u[23] = half_btf_0_avx2(&cospim58, &u[24], &rnding, bit);
3136 u[24] = half_btf_0_avx2(&cospi6, &u[24], &rnding, bit);
3137 u[33] = u[32];
3138 u[38] = u[39];
3139 u[41] = u[40];
3140 u[46] = u[47];
3141 u[49] = u[48];
3142 u[54] = u[55];
3143 u[57] = u[56];
3144 u[62] = u[63];
3145
3146 // stage 4
3147 __m256i temp1, temp2;
3148 u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
3149 u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
3150 u[17] = u[16];
3151 u[22] = u[23];
3152 u[25] = u[24];
3153 u[30] = u[31];
3154
3155 temp1 = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
3156 u[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
3157 u[33] = temp1;
3158
3159 temp2 = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
3160 u[38] = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
3161 u[57] = temp2;
3162
3163 temp1 = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
3164 u[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
3165 u[41] = temp1;
3166
3167 temp2 = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
3168 u[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
3169 u[46] = temp2;
3170
3171 // stage 5
3172 u[9] = u[8];
3173 u[14] = u[15];
3174
3175 temp1 = half_btf_avx2(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
3176 u[30] = half_btf_avx2(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
3177 u[17] = temp1;
3178
3179 temp2 = half_btf_avx2(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
3180 u[25] = half_btf_avx2(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
3181 u[22] = temp2;
3182
3183 u[35] = u[32];
3184 u[34] = u[33];
3185 u[36] = u[39];
3186 u[37] = u[38];
3187 u[43] = u[40];
3188 u[42] = u[41];
3189 u[44] = u[47];
3190 u[45] = u[46];
3191 u[51] = u[48];
3192 u[50] = u[49];
3193 u[52] = u[55];
3194 u[53] = u[54];
3195 u[59] = u[56];
3196 u[58] = u[57];
3197 u[60] = u[63];
3198 u[61] = u[62];
3199
3200 // stage 6
3201 temp1 = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
3202 u[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
3203 u[0] = temp1;
3204
3205 temp2 = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
3206 u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
3207 u[9] = temp2;
3208 u[19] = u[16];
3209 u[18] = u[17];
3210 u[20] = u[23];
3211 u[21] = u[22];
3212 u[27] = u[24];
3213 u[26] = u[25];
3214 u[28] = u[31];
3215 u[29] = u[30];
3216
3217 temp1 = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
3218 u[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
3219 u[34] = temp1;
3220 temp2 = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
3221 u[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
3222 u[35] = temp2;
3223 temp1 = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
3224 u[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
3225 u[36] = temp1;
3226 temp2 = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
3227 u[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
3228 u[37] = temp2;
3229 temp1 = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
3230 u[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
3231 u[42] = temp1;
3232 temp2 = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
3233 u[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
3234 u[43] = temp2;
3235 temp1 = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
3236 u[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
3237 u[44] = temp1;
3238 temp2 = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
3239 u[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
3240 u[45] = temp2;
3241
3242 // stage 7
3243 u[3] = u[0];
3244 u[2] = u[1];
3245 u[11] = u[8];
3246 u[10] = u[9];
3247 u[12] = u[15];
3248 u[13] = u[14];
3249
3250 temp1 = half_btf_avx2(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
3251 u[29] = half_btf_avx2(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
3252 u[18] = temp1;
3253 temp2 = half_btf_avx2(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
3254 u[28] = half_btf_avx2(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
3255 u[19] = temp2;
3256 temp1 = half_btf_avx2(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
3257 u[27] = half_btf_avx2(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
3258 u[20] = temp1;
3259 temp2 = half_btf_avx2(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
3260 u[26] = half_btf_avx2(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
3261 u[21] = temp2;
3262 for (i = 32; i < 64; i += 16) {
3263 for (j = i; j < i + 4; j++) {
3264 addsub_avx2(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
3265 addsub_avx2(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
3266 &clamp_hi);
3267 }
3268 }
3269
3270 // stage 8
3271 u[7] = u[0];
3272 u[6] = u[1];
3273 u[5] = u[2];
3274 u[4] = u[3];
3275
3276 idct64_stage8_avx2(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
3277 &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);
3278
3279 // stage 9
3280 idct64_stage9_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
3281 bit);
3282
3283 // stage 10
3284 idct64_stage10_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
3285 bit);
3286
3287 // stage 11
3288 idct64_stage11_avx2(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
3289 }
3290}
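The half_btf_0_avx2()/half_btf_avx2() helpers used throughout the stages above are one- and two-input half butterflies. Assumed per-lane scalar equivalents (a sketch; the real vector helpers keep everything in 32-bit lanes):

    #include <stdint.h>

    /* half_btf_0_avx2(&w0, &in0, &rnding, bit)          ~ round_shift(w0*in0, bit)
     * half_btf_avx2(&w0, &in0, &w1, &in1, &rnding, bit) ~ round_shift(w0*in0 + w1*in1, bit) */
    static int32_t half_btf_0_scalar(int32_t w0, int32_t in0, int bit) {
      return (int32_t)(((int64_t)w0 * in0 + ((int64_t)1 << (bit - 1))) >> bit);
    }

    static int32_t half_btf_scalar(int32_t w0, int32_t in0, int32_t w1,
                                   int32_t in1, int bit) {
      return (int32_t)(((int64_t)w0 * in0 + (int64_t)w1 * in1 +
                        ((int64_t)1 << (bit - 1))) >> bit);
    }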
3291static void idct64_low16_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
3292 int bd, int out_shift) {
3293 int i, j;
3294 const int32_t *cospi = cospi_arr(bit);
3295 const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
3296 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
3297 const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
3298 const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
3299
3300 const __m256i cospi1 = _mm256_set1_epi32(cospi[1]);
3301 const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
3302 const __m256i cospi3 = _mm256_set1_epi32(cospi[3]);
3303 const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
3304 const __m256i cospi5 = _mm256_set1_epi32(cospi[5]);
3305 const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
3306 const __m256i cospi7 = _mm256_set1_epi32(cospi[7]);
3307 const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
3308 const __m256i cospi9 = _mm256_set1_epi32(cospi[9]);
3309 const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
3310 const __m256i cospi11 = _mm256_set1_epi32(cospi[11]);
3311 const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
3312 const __m256i cospi13 = _mm256_set1_epi32(cospi[13]);
3313 const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
3314 const __m256i cospi15 = _mm256_set1_epi32(cospi[15]);
3315 const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
3316 const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
3317 const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
3318 const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
3319 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
3320 const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
3321 const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
3322 const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
3323 const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
3324 const __m256i cospi51 = _mm256_set1_epi32(cospi[51]);
3325 const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
3326 const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
3327 const __m256i cospi55 = _mm256_set1_epi32(cospi[55]);
3328 const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
3329 const __m256i cospi59 = _mm256_set1_epi32(cospi[59]);
3330 const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
3331 const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
3332 const __m256i cospi63 = _mm256_set1_epi32(cospi[63]);
3333
3334 const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
3335 const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
3336 const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]);
3337 const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
3338 const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
3339 const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
3340 const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]);
3341 const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
3342 const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
3343 const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
3344 const __m256i cospim44 = _mm256_set1_epi32(-cospi[44]);
3345 const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
3346 const __m256i cospim49 = _mm256_set1_epi32(-cospi[49]);
3347 const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
3348 const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
3349 const __m256i cospim53 = _mm256_set1_epi32(-cospi[53]);
3350 const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
3351 const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]);
3352 const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
3353 const __m256i cospim60 = _mm256_set1_epi32(-cospi[60]);
3354 const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]);
3355
3356 {
3357 __m256i u[64];
3358 __m256i tmp1, tmp2, tmp3, tmp4;
3359 // stage 1
3360 u[0] = in[0];
3361 u[32] = in[1];
3362 u[36] = in[9];
3363 u[40] = in[5];
3364 u[44] = in[13];
3365 u[48] = in[3];
3366 u[52] = in[11];
3367 u[56] = in[7];
3368 u[60] = in[15];
3369 u[16] = in[2];
3370 u[20] = in[10];
3371 u[24] = in[6];
3372 u[28] = in[14];
3373 u[4] = in[8];
3374 u[8] = in[4];
3375 u[12] = in[12];
3376
3377 // stage 2
3378 u[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit);
3379 u[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit);
3380 u[35] = half_btf_0_avx2(&cospim49, &u[60], &rnding, bit);
3381 u[60] = half_btf_0_avx2(&cospi15, &u[60], &rnding, bit);
3382 u[59] = half_btf_0_avx2(&cospi9, &u[36], &rnding, bit);
3383 u[36] = half_btf_0_avx2(&cospi55, &u[36], &rnding, bit);
3384 u[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit);
3385 u[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit);
3386 u[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit);
3387 u[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit);
3388 u[43] = half_btf_0_avx2(&cospim53, &u[52], &rnding, bit);
3389 u[52] = half_btf_0_avx2(&cospi11, &u[52], &rnding, bit);
3390 u[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit);
3391 u[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit);
3392 u[51] = half_btf_0_avx2(&cospi13, &u[44], &rnding, bit);
3393 u[44] = half_btf_0_avx2(&cospi51, &u[44], &rnding, bit);
3394
3395 // stage 3
3396 u[31] = half_btf_0_avx2(&cospi2, &u[16], &rnding, bit);
3397 u[16] = half_btf_0_avx2(&cospi62, &u[16], &rnding, bit);
3398 u[19] = half_btf_0_avx2(&cospim50, &u[28], &rnding, bit);
3399 u[28] = half_btf_0_avx2(&cospi14, &u[28], &rnding, bit);
3400 u[27] = half_btf_0_avx2(&cospi10, &u[20], &rnding, bit);
3401 u[20] = half_btf_0_avx2(&cospi54, &u[20], &rnding, bit);
3402 u[23] = half_btf_0_avx2(&cospim58, &u[24], &rnding, bit);
3403 u[24] = half_btf_0_avx2(&cospi6, &u[24], &rnding, bit);
3404 u[33] = u[32];
3405 u[34] = u[35];
3406 u[37] = u[36];
3407 u[38] = u[39];
3408 u[41] = u[40];
3409 u[42] = u[43];
3410 u[45] = u[44];
3411 u[46] = u[47];
3412 u[49] = u[48];
3413 u[50] = u[51];
3414 u[53] = u[52];
3415 u[54] = u[55];
3416 u[57] = u[56];
3417 u[58] = u[59];
3418 u[61] = u[60];
3419 u[62] = u[63];
3420
3421 // stage 4
3422 u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
3423 u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
3424 u[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit);
3425 u[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit);
3426
3427 u[17] = u[16];
3428 u[18] = u[19];
3429 u[21] = u[20];
3430 u[22] = u[23];
3431 u[25] = u[24];
3432 u[26] = u[27];
3433 u[29] = u[28];
3434 u[30] = u[31];
3435
3436 tmp1 = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
3437 tmp2 = half_btf_avx2(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
3438 tmp3 = half_btf_avx2(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
3439 tmp4 = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
3440 u[57] = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
3441 u[58] = half_btf_avx2(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
3442 u[61] = half_btf_avx2(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
3443 u[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
3444 u[33] = tmp1;
3445 u[34] = tmp2;
3446 u[37] = tmp3;
3447 u[38] = tmp4;
3448
3449 tmp1 = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
3450 tmp2 = half_btf_avx2(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
3451 tmp3 = half_btf_avx2(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
3452 tmp4 = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
3453 u[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
3454 u[50] = half_btf_avx2(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
3455 u[53] = half_btf_avx2(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
3456 u[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
3457 u[41] = tmp1;
3458 u[42] = tmp2;
3459 u[45] = tmp3;
3460 u[46] = tmp4;
3461
3462 // stage 5
3463 u[7] = half_btf_0_avx2(&cospi8, &u[4], &rnding, bit);
3464 u[4] = half_btf_0_avx2(&cospi56, &u[4], &rnding, bit);
3465
3466 u[9] = u[8];
3467 u[10] = u[11];
3468 u[13] = u[12];
3469 u[14] = u[15];
3470
3471 tmp1 = half_btf_avx2(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
3472 tmp2 = half_btf_avx2(&cospim56, &u[18], &cospim8, &u[29], &rnding, bit);
3473 tmp3 = half_btf_avx2(&cospim40, &u[21], &cospi24, &u[26], &rnding, bit);
3474 tmp4 = half_btf_avx2(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
3475 u[25] = half_btf_avx2(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
3476 u[26] = half_btf_avx2(&cospi24, &u[21], &cospi40, &u[26], &rnding, bit);
3477 u[29] = half_btf_avx2(&cospim8, &u[18], &cospi56, &u[29], &rnding, bit);
3478 u[30] = half_btf_avx2(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
3479 u[17] = tmp1;
3480 u[18] = tmp2;
3481 u[21] = tmp3;
3482 u[22] = tmp4;
3483
3484 for (i = 32; i < 64; i += 8) {
3485 addsub_avx2(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
3486 &clamp_hi);
3487 addsub_avx2(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
3488 &clamp_hi);
3489
3490 addsub_avx2(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
3491 &clamp_hi);
3492 addsub_avx2(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
3493 &clamp_hi);
3494 }
3495
3496 // stage 6
3497 tmp1 = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
3498 u[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
3499 u[0] = tmp1;
3500 u[5] = u[4];
3501 u[6] = u[7];
3502
3503 tmp1 = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
3504 u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
3505 u[9] = tmp1;
3506 tmp2 = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
3507 u[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
3508 u[10] = tmp2;
3509
3510 for (i = 16; i < 32; i += 8) {
3511 addsub_avx2(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
3512 &clamp_hi);
3513 addsub_avx2(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
3514 &clamp_hi);
3515
3516 addsub_avx2(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
3517 &clamp_hi);
3518 addsub_avx2(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
3519 &clamp_hi);
3520 }
3521
3522 tmp1 = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
3523 tmp2 = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
3524 tmp3 = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
3525 tmp4 = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
3526 u[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
3527 u[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
3528 u[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
3529 u[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
3530 u[34] = tmp1;
3531 u[35] = tmp2;
3532 u[36] = tmp3;
3533 u[37] = tmp4;
3534
3535 tmp1 = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
3536 tmp2 = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
3537 tmp3 = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
3538 tmp4 = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
3539 u[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
3540 u[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
3541 u[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
3542 u[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
3543 u[42] = tmp1;
3544 u[43] = tmp2;
3545 u[44] = tmp3;
3546 u[45] = tmp4;
3547
3548 // stage 7
3549 u[3] = u[0];
3550 u[2] = u[1];
3551 tmp1 = half_btf_avx2(&cospim32, &u[5], &cospi32, &u[6], &rnding, bit);
3552 u[6] = half_btf_avx2(&cospi32, &u[5], &cospi32, &u[6], &rnding, bit);
3553 u[5] = tmp1;
3554 addsub_avx2(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
3555 addsub_avx2(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
3556 addsub_avx2(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
3557 addsub_avx2(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
3558
3559 tmp1 = half_btf_avx2(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
3560 tmp2 = half_btf_avx2(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
3561 tmp3 = half_btf_avx2(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
3562 tmp4 = half_btf_avx2(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
3563 u[26] = half_btf_avx2(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
3564 u[27] = half_btf_avx2(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
3565 u[28] = half_btf_avx2(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
3566 u[29] = half_btf_avx2(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
3567 u[18] = tmp1;
3568 u[19] = tmp2;
3569 u[20] = tmp3;
3570 u[21] = tmp4;
3571
3572 for (i = 32; i < 64; i += 16) {
3573 for (j = i; j < i + 4; j++) {
3574 addsub_avx2(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
3575 addsub_avx2(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
3576 &clamp_hi);
3577 }
3578 }
3579
3580 // stage 8
3581 for (i = 0; i < 4; ++i) {
3582 addsub_avx2(u[i], u[7 - i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi);
3583 }
3584
3585 idct64_stage8_avx2(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
3586 &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);
3587
3588 // stage 9
3589 idct64_stage9_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
3590 bit);
3591
3592 // stage 10
3593 idct64_stage10_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
3594 bit);
3595
3596 // stage 11
3597 idct64_stage11_avx2(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
3598 }
3599}
3600static void idct64_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd,
3601 int out_shift) {
3602 int i, j;
3603 const int32_t *cospi = cospi_arr(bit);
3604 const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
3605 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
3606 const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
3607 const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
3608
3609 const __m256i cospi1 = _mm256_set1_epi32(cospi[1]);
3610 const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
3611 const __m256i cospi3 = _mm256_set1_epi32(cospi[3]);
3612 const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
3613 const __m256i cospi5 = _mm256_set1_epi32(cospi[5]);
3614 const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
3615 const __m256i cospi7 = _mm256_set1_epi32(cospi[7]);
3616 const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
3617 const __m256i cospi9 = _mm256_set1_epi32(cospi[9]);
3618 const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
3619 const __m256i cospi11 = _mm256_set1_epi32(cospi[11]);
3620 const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
3621 const __m256i cospi13 = _mm256_set1_epi32(cospi[13]);
3622 const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
3623 const __m256i cospi15 = _mm256_set1_epi32(cospi[15]);
3624 const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
3625 const __m256i cospi17 = _mm256_set1_epi32(cospi[17]);
3626 const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
3627 const __m256i cospi19 = _mm256_set1_epi32(cospi[19]);
3628 const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
3629 const __m256i cospi21 = _mm256_set1_epi32(cospi[21]);
3630 const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
3631 const __m256i cospi23 = _mm256_set1_epi32(cospi[23]);
3632 const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
3633 const __m256i cospi25 = _mm256_set1_epi32(cospi[25]);
3634 const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
3635 const __m256i cospi27 = _mm256_set1_epi32(cospi[27]);
3636 const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
3637 const __m256i cospi29 = _mm256_set1_epi32(cospi[29]);
3638 const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
3639 const __m256i cospi31 = _mm256_set1_epi32(cospi[31]);
3640 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
3641 const __m256i cospi35 = _mm256_set1_epi32(cospi[35]);
3642 const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
3643 const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
3644 const __m256i cospi39 = _mm256_set1_epi32(cospi[39]);
3645 const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
3646 const __m256i cospi43 = _mm256_set1_epi32(cospi[43]);
3647 const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
3648 const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
3649 const __m256i cospi47 = _mm256_set1_epi32(cospi[47]);
3650 const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
3651 const __m256i cospi51 = _mm256_set1_epi32(cospi[51]);
3652 const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
3653 const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
3654 const __m256i cospi55 = _mm256_set1_epi32(cospi[55]);
3655 const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
3656 const __m256i cospi59 = _mm256_set1_epi32(cospi[59]);
3657 const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
3658 const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
3659 const __m256i cospi63 = _mm256_set1_epi32(cospi[63]);
3660
3661 const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
3662 const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
3663 const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]);
3664 const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
3665 const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
3666 const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
3667 const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]);
3668 const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
3669 const __m256i cospim33 = _mm256_set1_epi32(-cospi[33]);
3670 const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
3671 const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
3672 const __m256i cospim37 = _mm256_set1_epi32(-cospi[37]);
3673 const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
3674 const __m256i cospim41 = _mm256_set1_epi32(-cospi[41]);
3675 const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
3676 const __m256i cospim44 = _mm256_set1_epi32(-cospi[44]);
3677 const __m256i cospim45 = _mm256_set1_epi32(-cospi[45]);
3678 const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
3679 const __m256i cospim49 = _mm256_set1_epi32(-cospi[49]);
3680 const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
3681 const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
3682 const __m256i cospim53 = _mm256_set1_epi32(-cospi[53]);
3683 const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
3684 const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]);
3685 const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
3686 const __m256i cospim60 = _mm256_set1_epi32(-cospi[60]);
3687 const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]);
3688
3689 {
3690 __m256i u[64], v[64];
3691
3692 // stage 1
3693 u[32] = in[1];
3694 u[34] = in[17];
3695 u[36] = in[9];
3696 u[38] = in[25];
3697 u[40] = in[5];
3698 u[42] = in[21];
3699 u[44] = in[13];
3700 u[46] = in[29];
3701 u[48] = in[3];
3702 u[50] = in[19];
3703 u[52] = in[11];
3704 u[54] = in[27];
3705 u[56] = in[7];
3706 u[58] = in[23];
3707 u[60] = in[15];
3708 u[62] = in[31];
3709
3710 v[16] = in[2];
3711 v[18] = in[18];
3712 v[20] = in[10];
3713 v[22] = in[26];
3714 v[24] = in[6];
3715 v[26] = in[22];
3716 v[28] = in[14];
3717 v[30] = in[30];
3718
3719 u[8] = in[4];
3720 u[10] = in[20];
3721 u[12] = in[12];
3722 u[14] = in[28];
3723
3724 v[4] = in[8];
3725 v[6] = in[24];
3726
3727 u[0] = in[0];
3728 u[2] = in[16];
3729
3730 // stage 2
3731 v[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit);
3732 v[33] = half_btf_0_avx2(&cospim33, &u[62], &rnding, bit);
3733 v[34] = half_btf_0_avx2(&cospi47, &u[34], &rnding, bit);
3734 v[35] = half_btf_0_avx2(&cospim49, &u[60], &rnding, bit);
3735 v[36] = half_btf_0_avx2(&cospi55, &u[36], &rnding, bit);
3736 v[37] = half_btf_0_avx2(&cospim41, &u[58], &rnding, bit);
3737 v[38] = half_btf_0_avx2(&cospi39, &u[38], &rnding, bit);
3738 v[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit);
3739 v[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit);
3740 v[41] = half_btf_0_avx2(&cospim37, &u[54], &rnding, bit);
3741 v[42] = half_btf_0_avx2(&cospi43, &u[42], &rnding, bit);
3742 v[43] = half_btf_0_avx2(&cospim53, &u[52], &rnding, bit);
3743 v[44] = half_btf_0_avx2(&cospi51, &u[44], &rnding, bit);
3744 v[45] = half_btf_0_avx2(&cospim45, &u[50], &rnding, bit);
3745 v[46] = half_btf_0_avx2(&cospi35, &u[46], &rnding, bit);
3746 v[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit);
3747 v[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit);
3748 v[49] = half_btf_0_avx2(&cospi29, &u[46], &rnding, bit);
3749 v[50] = half_btf_0_avx2(&cospi19, &u[50], &rnding, bit);
3750 v[51] = half_btf_0_avx2(&cospi13, &u[44], &rnding, bit);
3751 v[52] = half_btf_0_avx2(&cospi11, &u[52], &rnding, bit);
3752 v[53] = half_btf_0_avx2(&cospi21, &u[42], &rnding, bit);
3753 v[54] = half_btf_0_avx2(&cospi27, &u[54], &rnding, bit);
3754 v[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit);
3755 v[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit);
3756 v[57] = half_btf_0_avx2(&cospi25, &u[38], &rnding, bit);
3757 v[58] = half_btf_0_avx2(&cospi23, &u[58], &rnding, bit);
3758 v[59] = half_btf_0_avx2(&cospi9, &u[36], &rnding, bit);
3759 v[60] = half_btf_0_avx2(&cospi15, &u[60], &rnding, bit);
3760 v[61] = half_btf_0_avx2(&cospi17, &u[34], &rnding, bit);
3761 v[62] = half_btf_0_avx2(&cospi31, &u[62], &rnding, bit);
3762 v[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit);
3763
3764 // stage 3
3765 u[16] = half_btf_0_avx2(&cospi62, &v[16], &rnding, bit);
3766 u[17] = half_btf_0_avx2(&cospim34, &v[30], &rnding, bit);
3767 u[18] = half_btf_0_avx2(&cospi46, &v[18], &rnding, bit);
3768 u[19] = half_btf_0_avx2(&cospim50, &v[28], &rnding, bit);
3769 u[20] = half_btf_0_avx2(&cospi54, &v[20], &rnding, bit);
3770 u[21] = half_btf_0_avx2(&cospim42, &v[26], &rnding, bit);
3771 u[22] = half_btf_0_avx2(&cospi38, &v[22], &rnding, bit);
3772 u[23] = half_btf_0_avx2(&cospim58, &v[24], &rnding, bit);
3773 u[24] = half_btf_0_avx2(&cospi6, &v[24], &rnding, bit);
3774 u[25] = half_btf_0_avx2(&cospi26, &v[22], &rnding, bit);
3775 u[26] = half_btf_0_avx2(&cospi22, &v[26], &rnding, bit);
3776 u[27] = half_btf_0_avx2(&cospi10, &v[20], &rnding, bit);
3777 u[28] = half_btf_0_avx2(&cospi14, &v[28], &rnding, bit);
3778 u[29] = half_btf_0_avx2(&cospi18, &v[18], &rnding, bit);
3779 u[30] = half_btf_0_avx2(&cospi30, &v[30], &rnding, bit);
3780 u[31] = half_btf_0_avx2(&cospi2, &v[16], &rnding, bit);
3781
3782 for (i = 32; i < 64; i += 4) {
3783 addsub_avx2(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
3784 &clamp_hi);
3785 addsub_avx2(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
3786 &clamp_hi);
3787 }
3788
3789 // stage 4
3790 v[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
3791 v[9] = half_btf_0_avx2(&cospim36, &u[14], &rnding, bit);
3792 v[10] = half_btf_0_avx2(&cospi44, &u[10], &rnding, bit);
3793 v[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit);
3794 v[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit);
3795 v[13] = half_btf_0_avx2(&cospi20, &u[10], &rnding, bit);
3796 v[14] = half_btf_0_avx2(&cospi28, &u[14], &rnding, bit);
3797 v[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
3798
3799 for (i = 16; i < 32; i += 4) {
3800 addsub_avx2(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo,
3801 &clamp_hi);
3802 addsub_avx2(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo,
3803 &clamp_hi);
3804 }
3805
3806 for (i = 32; i < 64; i += 4) {
3807 v[i + 0] = u[i + 0];
3808 v[i + 3] = u[i + 3];
3809 }
3810
3811 v[33] = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
3812 v[34] = half_btf_avx2(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
3813 v[37] = half_btf_avx2(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
3814 v[38] = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
3815 v[41] = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
3816 v[42] = half_btf_avx2(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
3817 v[45] = half_btf_avx2(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
3818 v[46] = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
3819 v[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
3820 v[50] = half_btf_avx2(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
3821 v[53] = half_btf_avx2(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
3822 v[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
3823 v[57] = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
3824 v[58] = half_btf_avx2(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
3825 v[61] = half_btf_avx2(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
3826 v[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
3827
3828 // stage 5
3829 u[4] = half_btf_0_avx2(&cospi56, &v[4], &rnding, bit);
3830 u[5] = half_btf_0_avx2(&cospim40, &v[6], &rnding, bit);
3831 u[6] = half_btf_0_avx2(&cospi24, &v[6], &rnding, bit);
3832 u[7] = half_btf_0_avx2(&cospi8, &v[4], &rnding, bit);
3833
3834 for (i = 8; i < 16; i += 4) {
3835 addsub_avx2(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
3836 &clamp_hi);
3837 addsub_avx2(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
3838 &clamp_hi);
3839 }
3840
3841 for (i = 16; i < 32; i += 4) {
3842 u[i + 0] = v[i + 0];
3843 u[i + 3] = v[i + 3];
3844 }
3845
3846 u[17] = half_btf_avx2(&cospim8, &v[17], &cospi56, &v[30], &rnding, bit);
3847 u[18] = half_btf_avx2(&cospim56, &v[18], &cospim8, &v[29], &rnding, bit);
3848 u[21] = half_btf_avx2(&cospim40, &v[21], &cospi24, &v[26], &rnding, bit);
3849 u[22] = half_btf_avx2(&cospim24, &v[22], &cospim40, &v[25], &rnding, bit);
3850 u[25] = half_btf_avx2(&cospim40, &v[22], &cospi24, &v[25], &rnding, bit);
3851 u[26] = half_btf_avx2(&cospi24, &v[21], &cospi40, &v[26], &rnding, bit);
3852 u[29] = half_btf_avx2(&cospim8, &v[18], &cospi56, &v[29], &rnding, bit);
3853 u[30] = half_btf_avx2(&cospi56, &v[17], &cospi8, &v[30], &rnding, bit);
3854
3855 for (i = 32; i < 64; i += 8) {
3856 addsub_avx2(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
3857 &clamp_hi);
3858 addsub_avx2(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
3859 &clamp_hi);
3860
3861 addsub_avx2(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
3862 &clamp_hi);
3863 addsub_avx2(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
3864 &clamp_hi);
3865 }
3866
3867 // stage 6
3868 v[0] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
3869 v[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
3870 v[2] = half_btf_0_avx2(&cospi48, &u[2], &rnding, bit);
3871 v[3] = half_btf_0_avx2(&cospi16, &u[2], &rnding, bit);
3872
3873 addsub_avx2(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
3874 addsub_avx2(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
3875
3876 for (i = 8; i < 16; i += 4) {
3877 v[i + 0] = u[i + 0];
3878 v[i + 3] = u[i + 3];
3879 }
3880
3881 v[9] = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
3882 v[10] = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
3883 v[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
3884 v[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
3885
3886 for (i = 16; i < 32; i += 8) {
3887 addsub_avx2(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo,
3888 &clamp_hi);
3889 addsub_avx2(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo,
3890 &clamp_hi);
3891
3892 addsub_avx2(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo,
3893 &clamp_hi);
3894 addsub_avx2(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo,
3895 &clamp_hi);
3896 }
3897
3898 for (i = 32; i < 64; i += 8) {
3899 v[i + 0] = u[i + 0];
3900 v[i + 1] = u[i + 1];
3901 v[i + 6] = u[i + 6];
3902 v[i + 7] = u[i + 7];
3903 }
3904
3905 v[34] = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
3906 v[35] = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
3907 v[36] = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
3908 v[37] = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
3909 v[42] = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
3910 v[43] = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
3911 v[44] = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
3912 v[45] = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
3913 v[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
3914 v[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
3915 v[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
3916 v[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
3917 v[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
3918 v[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
3919 v[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
3920 v[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
3921
3922 // stage 7
3923 addsub_avx2(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
3924 addsub_avx2(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
3925
3926 u[4] = v[4];
3927 u[7] = v[7];
3928 u[5] = half_btf_avx2(&cospim32, &v[5], &cospi32, &v[6], &rnding, bit);
3929 u[6] = half_btf_avx2(&cospi32, &v[5], &cospi32, &v[6], &rnding, bit);
3930
3931 addsub_avx2(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
3932 addsub_avx2(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
3933 addsub_avx2(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
3934 addsub_avx2(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
3935
3936 for (i = 16; i < 32; i += 8) {
3937 u[i + 0] = v[i + 0];
3938 u[i + 1] = v[i + 1];
3939 u[i + 6] = v[i + 6];
3940 u[i + 7] = v[i + 7];
3941 }
3942
3943 u[18] = half_btf_avx2(&cospim16, &v[18], &cospi48, &v[29], &rnding, bit);
3944 u[19] = half_btf_avx2(&cospim16, &v[19], &cospi48, &v[28], &rnding, bit);
3945 u[20] = half_btf_avx2(&cospim48, &v[20], &cospim16, &v[27], &rnding, bit);
3946 u[21] = half_btf_avx2(&cospim48, &v[21], &cospim16, &v[26], &rnding, bit);
3947 u[26] = half_btf_avx2(&cospim16, &v[21], &cospi48, &v[26], &rnding, bit);
3948 u[27] = half_btf_avx2(&cospim16, &v[20], &cospi48, &v[27], &rnding, bit);
3949 u[28] = half_btf_avx2(&cospi48, &v[19], &cospi16, &v[28], &rnding, bit);
3950 u[29] = half_btf_avx2(&cospi48, &v[18], &cospi16, &v[29], &rnding, bit);
3951
3952 for (i = 32; i < 64; i += 16) {
3953 for (j = i; j < i + 4; j++) {
3954 addsub_avx2(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
3955 addsub_avx2(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
3956 &clamp_hi);
3957 }
3958 }
3959
3960 // stage 8
3961 for (i = 0; i < 4; ++i) {
3962 addsub_avx2(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi);
3963 }
3964
3965 v[8] = u[8];
3966 v[9] = u[9];
3967 v[14] = u[14];
3968 v[15] = u[15];
3969
3970 v[10] = half_btf_avx2(&cospim32, &u[10], &cospi32, &u[13], &rnding, bit);
3971 v[11] = half_btf_avx2(&cospim32, &u[11], &cospi32, &u[12], &rnding, bit);
3972 v[12] = half_btf_avx2(&cospi32, &u[11], &cospi32, &u[12], &rnding, bit);
3973 v[13] = half_btf_avx2(&cospi32, &u[10], &cospi32, &u[13], &rnding, bit);
3974
3975 for (i = 16; i < 20; ++i) {
3976 addsub_avx2(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi);
3977 addsub_avx2(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo,
3978 &clamp_hi);
3979 }
3980
3981 for (i = 32; i < 36; ++i) {
3982 v[i] = u[i];
3983 v[i + 12] = u[i + 12];
3984 v[i + 16] = u[i + 16];
3985 v[i + 28] = u[i + 28];
3986 }
3987
3988 v[36] = half_btf_avx2(&cospim16, &u[36], &cospi48, &u[59], &rnding, bit);
3989 v[37] = half_btf_avx2(&cospim16, &u[37], &cospi48, &u[58], &rnding, bit);
3990 v[38] = half_btf_avx2(&cospim16, &u[38], &cospi48, &u[57], &rnding, bit);
3991 v[39] = half_btf_avx2(&cospim16, &u[39], &cospi48, &u[56], &rnding, bit);
3992 v[40] = half_btf_avx2(&cospim48, &u[40], &cospim16, &u[55], &rnding, bit);
3993 v[41] = half_btf_avx2(&cospim48, &u[41], &cospim16, &u[54], &rnding, bit);
3994 v[42] = half_btf_avx2(&cospim48, &u[42], &cospim16, &u[53], &rnding, bit);
3995 v[43] = half_btf_avx2(&cospim48, &u[43], &cospim16, &u[52], &rnding, bit);
3996 v[52] = half_btf_avx2(&cospim16, &u[43], &cospi48, &u[52], &rnding, bit);
3997 v[53] = half_btf_avx2(&cospim16, &u[42], &cospi48, &u[53], &rnding, bit);
3998 v[54] = half_btf_avx2(&cospim16, &u[41], &cospi48, &u[54], &rnding, bit);
3999 v[55] = half_btf_avx2(&cospim16, &u[40], &cospi48, &u[55], &rnding, bit);
4000 v[56] = half_btf_avx2(&cospi48, &u[39], &cospi16, &u[56], &rnding, bit);
4001 v[57] = half_btf_avx2(&cospi48, &u[38], &cospi16, &u[57], &rnding, bit);
4002 v[58] = half_btf_avx2(&cospi48, &u[37], &cospi16, &u[58], &rnding, bit);
4003 v[59] = half_btf_avx2(&cospi48, &u[36], &cospi16, &u[59], &rnding, bit);
4004
4005 // stage 9
4006 for (i = 0; i < 8; ++i) {
4007 addsub_avx2(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi);
4008 }
4009
4010 for (i = 16; i < 20; ++i) {
4011 u[i] = v[i];
4012 u[i + 12] = v[i + 12];
4013 }
4014
4015 u[20] = half_btf_avx2(&cospim32, &v[20], &cospi32, &v[27], &rnding, bit);
4016 u[21] = half_btf_avx2(&cospim32, &v[21], &cospi32, &v[26], &rnding, bit);
4017 u[22] = half_btf_avx2(&cospim32, &v[22], &cospi32, &v[25], &rnding, bit);
4018 u[23] = half_btf_avx2(&cospim32, &v[23], &cospi32, &v[24], &rnding, bit);
4019 u[24] = half_btf_avx2(&cospi32, &v[23], &cospi32, &v[24], &rnding, bit);
4020 u[25] = half_btf_avx2(&cospi32, &v[22], &cospi32, &v[25], &rnding, bit);
4021 u[26] = half_btf_avx2(&cospi32, &v[21], &cospi32, &v[26], &rnding, bit);
4022 u[27] = half_btf_avx2(&cospi32, &v[20], &cospi32, &v[27], &rnding, bit);
4023
4024 for (i = 32; i < 40; i++) {
4025 addsub_avx2(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi);
4026 }
4027
4028 for (i = 48; i < 56; i++) {
4029 addsub_avx2(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi);
4030 }
4031
4032 // stage 10
4033 for (i = 0; i < 16; i++) {
4034 addsub_avx2(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi);
4035 }
4036
4037 for (i = 32; i < 40; i++) v[i] = u[i];
4038
4039 v[40] = half_btf_avx2(&cospim32, &u[40], &cospi32, &u[55], &rnding, bit);
4040 v[41] = half_btf_avx2(&cospim32, &u[41], &cospi32, &u[54], &rnding, bit);
4041 v[42] = half_btf_avx2(&cospim32, &u[42], &cospi32, &u[53], &rnding, bit);
4042 v[43] = half_btf_avx2(&cospim32, &u[43], &cospi32, &u[52], &rnding, bit);
4043 v[44] = half_btf_avx2(&cospim32, &u[44], &cospi32, &u[51], &rnding, bit);
4044 v[45] = half_btf_avx2(&cospim32, &u[45], &cospi32, &u[50], &rnding, bit);
4045 v[46] = half_btf_avx2(&cospim32, &u[46], &cospi32, &u[49], &rnding, bit);
4046 v[47] = half_btf_avx2(&cospim32, &u[47], &cospi32, &u[48], &rnding, bit);
4047 v[48] = half_btf_avx2(&cospi32, &u[47], &cospi32, &u[48], &rnding, bit);
4048 v[49] = half_btf_avx2(&cospi32, &u[46], &cospi32, &u[49], &rnding, bit);
4049 v[50] = half_btf_avx2(&cospi32, &u[45], &cospi32, &u[50], &rnding, bit);
4050 v[51] = half_btf_avx2(&cospi32, &u[44], &cospi32, &u[51], &rnding, bit);
4051 v[52] = half_btf_avx2(&cospi32, &u[43], &cospi32, &u[52], &rnding, bit);
4052 v[53] = half_btf_avx2(&cospi32, &u[42], &cospi32, &u[53], &rnding, bit);
4053 v[54] = half_btf_avx2(&cospi32, &u[41], &cospi32, &u[54], &rnding, bit);
4054 v[55] = half_btf_avx2(&cospi32, &u[40], &cospi32, &u[55], &rnding, bit);
4055
4056 for (i = 56; i < 64; i++) v[i] = u[i];
4057
4058 // stage 11
4059 for (i = 0; i < 32; i++) {
4060 addsub_avx2(v[i], v[63 - i], &out[(i)], &out[(63 - i)], &clamp_lo,
4061 &clamp_hi);
4062 }
4063 if (!do_cols) {
4064 const int log_range_out = AOMMAX(16, bd + 6);
4065 const __m256i clamp_lo_out =
4066 _mm256_set1_epi32(-(1 << (log_range_out - 1)));
4067 const __m256i clamp_hi_out =
4068 _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
4069
4070 round_shift_8x8_avx2(out, out_shift);
4071 round_shift_8x8_avx2(out + 16, out_shift);
4072 round_shift_8x8_avx2(out + 32, out_shift);
4073 round_shift_8x8_avx2(out + 48, out_shift);
4074 highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 64);
4075 }
4076 }
4077}
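
The clamp bounds just above are built as -(1 << (log_range_out - 1)) and (1 << (log_range_out - 1)) - 1 with log_range_out = AOMMAX(16, bd + 6), so for the 8-, 10- and 12-bit paths the shift count never exceeds 17 and the shifts stay well defined. Below is a minimal scalar sketch of the same bound computation, written so the shift count is provably smaller than 32 for any bd the caller might pass; clamp_bounds_safe and its defensive cap are illustrative assumptions, not libaom code.

#include <assert.h>
#include <stdint.h>

/* Hypothetical helper (not part of libaom): computes the signed clamp
 * bounds [-2^(log_range-1), 2^(log_range-1) - 1] while keeping the
 * shift count strictly below 32. */
static void clamp_bounds_safe(int bd, int32_t *lo, int32_t *hi) {
  int log_range = bd + 6;
  if (log_range < 16) log_range = 16;  /* mirrors AOMMAX(16, bd + 6) */
  if (log_range > 31) log_range = 31;  /* defensive cap: shift count < 32 */
  const int32_t half = (int32_t)1 << (log_range - 1);
  *lo = -half;
  *hi = half - 1;
}

int main(void) {
  int32_t lo, hi;
  clamp_bounds_safe(10, &lo, &hi);  /* 10-bit path: log_range = 16 */
  assert(lo == -(1 << 15) && hi == (1 << 15) - 1);
  clamp_bounds_safe(12, &lo, &hi);  /* 12-bit path: log_range = 18 */
  assert(lo == -(1 << 17) && hi == (1 << 17) - 1);
  return 0;
}
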
4078typedef void (*transform_1d_avx2)(__m256i *in, __m256i *out, int bit,
4079 int do_cols, int bd, int out_shift);
4080
4081static const transform_1d_avx2
4082 highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
4083 {
4084 { NULL, NULL, NULL, NULL },
4085 { NULL, NULL, NULL, NULL },
4086 { NULL, NULL, NULL, NULL },
4087 },
4088 {
4089 { idct8x8_low1_avx2, idct8x8_avx2, NULL, NULL },
4090 { iadst8x8_low1_avx2, iadst8x8_avx2, NULL, NULL },
4091 { NULL, NULL, NULL, NULL },
4092 },
4093 {
4094 { idct16_low1_avx2, idct16_low8_avx2, idct16_avx2, NULL },
4095 { iadst16_low1_avx2, iadst16_low8_avx2, iadst16_avx2, NULL },
4096 { NULL, NULL, NULL, NULL },
4097 },
4098 { { idct32_low1_avx2, idct32_low8_avx2, idct32_low16_avx2, idct32_avx2 },
4099 { NULL, NULL, NULL, NULL },
4100 { NULL, NULL, NULL, NULL } },
4101
4102 { { idct64_low1_avx2, idct64_low8_avx2, idct64_low16_avx2, idct64_avx2 },
4103 { NULL, NULL, NULL, NULL },
4104 { NULL, NULL, NULL, NULL } }
4105 };
4106
4107static void highbd_inv_txfm2d_add_no_identity_avx2(const int32_t *input,
4108 uint16_t *output, int stride,
4109 TX_TYPE tx_type,
4110 TX_SIZE tx_size, int eob,
4111 const int bd) {
4112 __m256i buf1[64 * 8];
4113 int eobx, eoby;
4114 get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
4115 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
4116 const int txw_idx = get_txw_idx(tx_size);
4117 const int txh_idx = get_txh_idx(tx_size);
4118 const int txfm_size_col = tx_size_wide[tx_size];
4119 const int txfm_size_row = tx_size_high[tx_size];
4120 const int buf_size_w_div8 = txfm_size_col >> 3;
4121 const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3;
4122 const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
4123 const int input_stride = AOMMIN(32, txfm_size_row);
4124 const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
4125 const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
4126 const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
4127 const transform_1d_avx2 row_txfm =
4128 highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
4129 const transform_1d_avx2 col_txfm =
4130 highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
4131
4132 assert(col_txfm != NULL);
4133 assert(row_txfm != NULL);
4134 int ud_flip, lr_flip;
4135 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
4136
4137 // 1st stage: row transform
4138 for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
4139 __m256i buf0[64];
4140 load_buffer_32bit_input(input + i * 8, input_stride, buf0,
4141 buf_size_nonzero_w);
4142 if (rect_type == 1 || rect_type == -1) {
4143 round_shift_rect_array_32_avx2(buf0, buf0, buf_size_nonzero_w, 0,
4144 NewInvSqrt2);
4145 }
4146 row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
4147
4148 __m256i *_buf1 = buf1 + i * 8;
4149 if (lr_flip) {
4150 for (int j = 0; j < buf_size_w_div8; ++j) {
4151 transpose_8x8_flip_avx2(
4152 &buf0[j * 8], &_buf1[(buf_size_w_div8 - 1 - j) * txfm_size_row]);
4153 }
4154 } else {
4155 for (int j = 0; j < buf_size_w_div8; ++j) {
4156 transpose_8x8_avx2(&buf0[j * 8], &_buf1[j * txfm_size_row]);
4157 }
4158 }
4159 }
4160 // 2nd stage: column transform
4161 for (int i = 0; i < buf_size_w_div8; i++) {
4162 col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
4163 bd, 0);
4164
4165 round_shift_array_32_avx2(buf1 + i * txfm_size_row,
4166 buf1 + i * txfm_size_row, txfm_size_row,
4167 -shift[1]);
4168 }
4169
4170 // write to buffer
4171 if (txfm_size_col >= 16) {
4172 for (int i = 0; i < (txfm_size_col >> 4); i++) {
4173 highbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row * 2,
4174 output + 16 * i, stride, ud_flip,
4175 txfm_size_row, bd);
4176 }
4177 } else if (txfm_size_col == 8) {
4178 highbd_write_buffer_8xn_avx2(buf1, output, stride, ud_flip, txfm_size_row,
4179 bd);
4180 }
4181}
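
For orientation, the two loops above implement the standard separable 2-D inverse transform: the first pass runs the 1-D row kernel over 8-row strips and transposes each 8x8 block into buf1, and the second pass runs the 1-D column kernel in place on buf1 before the rounded result is written to the frame buffer. Below is a minimal scalar sketch of that data flow, with identity placeholders standing in for the AVX2 kernels; inv_txfm2d_schematic, txfm_1d_row and txfm_1d_col are illustrative names, not libaom API.

#include <string.h>

#define N 8  /* one 8x8 tile, for illustration only */

/* Placeholder 1-D kernels (identity copies): stand-ins for the AVX2
 * row/column transforms, used only to show the data flow. */
static void txfm_1d_row(const int *in, int *out) { memcpy(out, in, N * sizeof(*in)); }
static void txfm_1d_col(const int *in, int *out) { memcpy(out, in, N * sizeof(*in)); }

static void inv_txfm2d_schematic(int in[N][N], int out[N][N]) {
  int tmp[N][N];
  /* 1st stage: transform each row, then transpose into tmp so the next
   * stage reads contiguous data (the role played by transpose_8x8_avx2). */
  for (int r = 0; r < N; ++r) {
    int row_out[N];
    txfm_1d_row(in[r], row_out);
    for (int c = 0; c < N; ++c) tmp[c][r] = row_out[c];
  }
  /* 2nd stage: transform each (now contiguous) column; the result stays
   * column-major here, the way buf1 is laid out before write-back. */
  for (int c = 0; c < N; ++c) txfm_1d_col(tmp[c], out[c]);
}

int main(void) {
  int in[N][N] = { { 0 } }, out[N][N];
  in[0][0] = 7;
  inv_txfm2d_schematic(in, out);
  return out[0][0] == 7 ? 0 : 1;  /* identity kernels: DC passes through */
}
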
4182
4183static void av1_highbd_inv_txfm2d_add_universe_avx2(const int32_t *input,
4184 uint8_t *output, int stride,
4185 TX_TYPE tx_type,
4186 TX_SIZE tx_size, int eob,
4187 const int bd) {
4188 switch (tx_type) {
4189 case DCT_DCT:
4190 case ADST_DCT:
4191 case DCT_ADST:
4192 case ADST_ADST:
4193 case FLIPADST_DCT:
4194 case DCT_FLIPADST:
4195 case FLIPADST_FLIPADST:
4196 case ADST_FLIPADST:
4197 case FLIPADST_ADST:
4198 highbd_inv_txfm2d_add_no_identity_avx2(input, CONVERT_TO_SHORTPTR(output),
4199 stride, tx_type, tx_size, eob, bd);
4200 break;
4201 case IDTX:
4202 case H_DCT:
4203 case H_ADST:
4204 case H_FLIPADST:
4205 case V_DCT:
4206 case V_ADST:
4207 case V_FLIPADST:
4208 av1_highbd_inv_txfm2d_add_universe_sse4_1(input, output, stride, tx_type,
4209 tx_size, eob, bd);
4210 break;
4211 default: assert(0); break;
4212 }
4213}
4214void av1_highbd_inv_txfm_add_avx2(const tran_low_t *input, uint8_t *dest,
4215 int stride, const TxfmParam *txfm_param) {
4216 assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
4217 const TX_SIZE tx_size = txfm_param->tx_size;
4218 switch (tx_size) {
4219 case TX_4X8:
4220 case TX_8X4:
4221 case TX_4X4:
4222 case TX_16X4:
4223 case TX_4X16:
4224 av1_highbd_inv_txfm_add_sse4_1(input, dest, stride, txfm_param);
4225 break;
4226 default:
4227 av1_highbd_inv_txfm2d_add_universe_avx2(
4228 input, dest, stride, txfm_param->tx_type, txfm_param->tx_size,
4229 txfm_param->eob, txfm_param->bd);
4230 break;
4231 }
4232}