summary refs log tree commit diff
path: root/media/libjxl/src/lib/jxl/fast_dct64-inl.h
diff options
context:
space:
mode:
Diffstat (limited to 'media/libjxl/src/lib/jxl/fast_dct64-inl.h')
-rw-r--r-- media/libjxl/src/lib/jxl/fast_dct64-inl.h 985
1 files changed, 985 insertions, 0 deletions
diff --git a/media/libjxl/src/lib/jxl/fast_dct64-inl.h b/media/libjxl/src/lib/jxl/fast_dct64-inl.h
new file mode 100644
index 0000000000..400da1a9de
--- /dev/null
+++ b/media/libjxl/src/lib/jxl/fast_dct64-inl.h
@@ -0,0 +1,985 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+/* This file is automatically generated. Do not modify it directly. */
+#if HWY_TARGET != HWY_NEON
+#error "only include this file from fast_dct-inl.h"
+#endif
+
+constexpr size_t FastIDCTIntegerBits(FastDCTTag<64>) { return 1; }  // 64-point overload; always yields 1. NOTE(review): how callers consume this value is not visible here.
+
+void FastIDCT(FastDCTTag<64>, const int16_t* in, size_t in_stride, int16_t* out,
+ size_t out_stride, size_t count) {
+ JXL_ASSERT(count % 8 == 0);
+ for (size_t i = 0; i < count; i += 8) {
+ int16x8_t v0 = vld1q_s16(in + in_stride * 0 + i);
+ int16x8_t v1 = vld1q_s16(in + in_stride * 32 + i);
+ int16x8_t v2 = vaddq_s16(v0, v1);
+ int16x8_t v3 = vld1q_s16(in + in_stride * 16 + i);
+ int16x8_t v4_tmp = vqrdmulhq_n_s16(v3, 13573);
+ int16x8_t v4 = vaddq_s16(v4_tmp, v3);
+ int16x8_t v5 = vld1q_s16(in + in_stride * 48 + i);
+ int16x8_t v6 = vaddq_s16(v5, v3);
+ int16x8_t v7 = vaddq_s16(v4, v6);
+ int16x8_t v8 = vqrdmulhq_n_s16(v7, 17734);
+ int16x8_t v9 = vaddq_s16(v2, v8);
+ int16x8_t v10 = vld1q_s16(in + in_stride * 8 + i);
+ int16x8_t v11_tmp = vqrdmulhq_n_s16(v10, 13573);
+ int16x8_t v11 = vaddq_s16(v11_tmp, v10);
+ int16x8_t v12 = vld1q_s16(in + in_stride * 40 + i);
+ int16x8_t v13 = vld1q_s16(in + in_stride * 24 + i);
+ int16x8_t v14 = vaddq_s16(v12, v13);
+ int16x8_t v15 = vaddq_s16(v11, v14);
+ int16x8_t v16 = vld1q_s16(in + in_stride * 56 + i);
+ int16x8_t v17 = vaddq_s16(v16, v12);
+ int16x8_t v18 = vaddq_s16(v13, v10);
+ int16x8_t v19 = vaddq_s16(v17, v18);
+ int16x8_t v20 = vqrdmulhq_n_s16(v19, 17734);
+ int16x8_t v21 = vqrdmulhq_n_s16(v18, 25080);
+ int16x8_t v22 = vaddq_s16(v20, v21);
+ int16x8_t v23 = vaddq_s16(v15, v22);
+ int16x8_t v24 = vqrdmulhq_n_s16(v23, 16705);
+ int16x8_t v25 = vaddq_s16(v9, v24);
+ int16x8_t v26 = vld1q_s16(in + in_stride * 4 + i);
+ int16x8_t v27_tmp = vqrdmulhq_n_s16(v26, 13573);
+ int16x8_t v27 = vaddq_s16(v27_tmp, v26);
+ int16x8_t v28 = vld1q_s16(in + in_stride * 36 + i);
+ int16x8_t v29 = vld1q_s16(in + in_stride * 28 + i);
+ int16x8_t v30 = vaddq_s16(v28, v29);
+ int16x8_t v31 = vaddq_s16(v27, v30);
+ int16x8_t v32 = vld1q_s16(in + in_stride * 20 + i);
+ int16x8_t v33 = vld1q_s16(in + in_stride * 12 + i);
+ int16x8_t v34 = vaddq_s16(v32, v33);
+ int16x8_t v35 = vqrdmulhq_n_s16(v34, 25080);
+ int16x8_t v36 = vld1q_s16(in + in_stride * 52 + i);
+ int16x8_t v37 = vld1q_s16(in + in_stride * 44 + i);
+ int16x8_t v38 = vaddq_s16(v36, v37);
+ int16x8_t v39 = vaddq_s16(v38, v34);
+ int16x8_t v40 = vqrdmulhq_n_s16(v39, 17734);
+ int16x8_t v41 = vaddq_s16(v35, v40);
+ int16x8_t v42 = vaddq_s16(v31, v41);
+ int16x8_t v43 = vaddq_s16(v33, v26);
+ int16x8_t v44_tmp = vqrdmulhq_n_s16(v43, 13573);
+ int16x8_t v44 = vaddq_s16(v44_tmp, v43);
+ int16x8_t v45 = vaddq_s16(v37, v28);
+ int16x8_t v46 = vaddq_s16(v29, v32);
+ int16x8_t v47 = vaddq_s16(v45, v46);
+ int16x8_t v48 = vaddq_s16(v44, v47);
+ int16x8_t v49 = vqrdmulhq_n_s16(v48, 16705);
+ int16x8_t v50 = vaddq_s16(v46, v43);
+ int16x8_t v51_tmp = vqrdmulhq_n_s16(v50, 10045);
+ int16x8_t v51 = vaddq_s16(v51_tmp, v50);
+ int16x8_t v52 = vld1q_s16(in + in_stride * 60 + i);
+ int16x8_t v53 = vaddq_s16(v52, v36);
+ int16x8_t v54 = vaddq_s16(v53, v45);
+ int16x8_t v55 = vqrdmulhq_n_s16(v54, 17734);
+ int16x8_t v56 = vaddq_s16(v51, v55);
+ int16x8_t v57 = vqrdmulhq_n_s16(v56, 16705);
+ int16x8_t v58 = vaddq_s16(v49, v57);
+ int16x8_t v59 = vaddq_s16(v42, v58);
+ int16x8_t v60 = vqrdmulhq_n_s16(v59, 16463);
+ int16x8_t v61 = vaddq_s16(v25, v60);
+ int16x8_t v62 = vld1q_s16(in + in_stride * 2 + i);
+ int16x8_t v63_tmp = vqrdmulhq_n_s16(v62, 13573);
+ int16x8_t v63 = vaddq_s16(v63_tmp, v62);
+ int16x8_t v64 = vld1q_s16(in + in_stride * 34 + i);
+ int16x8_t v65 = vld1q_s16(in + in_stride * 30 + i);
+ int16x8_t v66 = vaddq_s16(v64, v65);
+ int16x8_t v67 = vaddq_s16(v63, v66);
+ int16x8_t v68 = vld1q_s16(in + in_stride * 18 + i);
+ int16x8_t v69 = vld1q_s16(in + in_stride * 14 + i);
+ int16x8_t v70 = vaddq_s16(v68, v69);
+ int16x8_t v71 = vqrdmulhq_n_s16(v70, 25080);
+ int16x8_t v72 = vld1q_s16(in + in_stride * 50 + i);
+ int16x8_t v73 = vld1q_s16(in + in_stride * 46 + i);
+ int16x8_t v74 = vaddq_s16(v72, v73);
+ int16x8_t v75 = vaddq_s16(v74, v70);
+ int16x8_t v76 = vqrdmulhq_n_s16(v75, 17734);
+ int16x8_t v77 = vaddq_s16(v71, v76);
+ int16x8_t v78 = vaddq_s16(v67, v77);
+ int16x8_t v79 = vld1q_s16(in + in_stride * 10 + i);
+ int16x8_t v80 = vld1q_s16(in + in_stride * 6 + i);
+ int16x8_t v81 = vaddq_s16(v79, v80);
+ int16x8_t v82_tmp = vqrdmulhq_n_s16(v81, 13573);
+ int16x8_t v82 = vaddq_s16(v82_tmp, v81);
+ int16x8_t v83 = vld1q_s16(in + in_stride * 42 + i);
+ int16x8_t v84 = vld1q_s16(in + in_stride * 38 + i);
+ int16x8_t v85 = vaddq_s16(v83, v84);
+ int16x8_t v86 = vld1q_s16(in + in_stride * 26 + i);
+ int16x8_t v87 = vld1q_s16(in + in_stride * 22 + i);
+ int16x8_t v88 = vaddq_s16(v86, v87);
+ int16x8_t v89 = vaddq_s16(v85, v88);
+ int16x8_t v90 = vaddq_s16(v82, v89);
+ int16x8_t v91 = vqrdmulhq_n_s16(v90, 16705);
+ int16x8_t v92 = vaddq_s16(v88, v81);
+ int16x8_t v93_tmp = vqrdmulhq_n_s16(v92, 10045);
+ int16x8_t v93 = vaddq_s16(v93_tmp, v92);
+ int16x8_t v94 = vld1q_s16(in + in_stride * 58 + i);
+ int16x8_t v95 = vld1q_s16(in + in_stride * 54 + i);
+ int16x8_t v96 = vaddq_s16(v94, v95);
+ int16x8_t v97 = vaddq_s16(v96, v85);
+ int16x8_t v98 = vqrdmulhq_n_s16(v97, 17734);
+ int16x8_t v99 = vaddq_s16(v93, v98);
+ int16x8_t v100 = vqrdmulhq_n_s16(v99, 16705);
+ int16x8_t v101 = vaddq_s16(v91, v100);
+ int16x8_t v102 = vaddq_s16(v78, v101);
+ int16x8_t v103 = vaddq_s16(v69, v79);
+ int16x8_t v104 = vaddq_s16(v80, v62);
+ int16x8_t v105 = vaddq_s16(v103, v104);
+ int16x8_t v106_tmp = vqrdmulhq_n_s16(v105, 13573);
+ int16x8_t v106 = vaddq_s16(v106_tmp, v105);
+ int16x8_t v107 = vaddq_s16(v73, v83);
+ int16x8_t v108 = vaddq_s16(v84, v64);
+ int16x8_t v109 = vaddq_s16(v107, v108);
+ int16x8_t v110 = vaddq_s16(v65, v86);
+ int16x8_t v111 = vaddq_s16(v87, v68);
+ int16x8_t v112 = vaddq_s16(v110, v111);
+ int16x8_t v113 = vaddq_s16(v109, v112);
+ int16x8_t v114 = vaddq_s16(v106, v113);
+ int16x8_t v115 = vqrdmulhq_n_s16(v114, 16705);
+ int16x8_t v116 = vaddq_s16(v112, v105);
+ int16x8_t v117 = vqrdmulhq_n_s16(v116, 25080);
+ int16x8_t v118 = vqrdmulhq_n_s16(v116, 17734);
+ int16x8_t v119 = vld1q_s16(in + in_stride * 62 + i);
+ int16x8_t v120 = vaddq_s16(v119, v94);
+ int16x8_t v121 = vaddq_s16(v95, v72);
+ int16x8_t v122 = vaddq_s16(v120, v121);
+ int16x8_t v123 = vaddq_s16(v122, v109);
+ int16x8_t v124 = vqrdmulhq_n_s16(v123, 17734);
+ int16x8_t v125 = vaddq_s16(v118, v124);
+ int16x8_t v126 = vaddq_s16(v117, v125);
+ int16x8_t v127 = vqrdmulhq_n_s16(v126, 16705);
+ int16x8_t v128 = vaddq_s16(v115, v127);
+ int16x8_t v129 = vqrdmulhq_n_s16(v128, 16463);
+ int16x8_t v130_tmp = vqrdmulhq_n_s16(v104, 13573);
+ int16x8_t v130 = vaddq_s16(v130_tmp, v104);
+ int16x8_t v131 = vaddq_s16(v108, v110);
+ int16x8_t v132 = vaddq_s16(v130, v131);
+ int16x8_t v133 = vaddq_s16(v111, v103);
+ int16x8_t v134_tmp = vqrdmulhq_n_s16(v133, 10045);
+ int16x8_t v134 = vaddq_s16(v134_tmp, v133);
+ int16x8_t v135 = vaddq_s16(v121, v107);
+ int16x8_t v136 = vqrdmulhq_n_s16(v135, 17734);
+ int16x8_t v137 = vaddq_s16(v134, v136);
+ int16x8_t v138 = vaddq_s16(v132, v137);
+ int16x8_t v139 = vqrdmulhq_n_s16(v138, 16463);
+ int16x8_t v140 = vaddq_s16(v129, v139);
+ int16x8_t v141 = vaddq_s16(v102, v140);
+ int16x8_t v142 = vqrdmulhq_n_s16(v141, 16404);
+ int16x8_t v143 = vaddq_s16(v61, v142);
+ int16x8_t v144 = vld1q_s16(in + in_stride * 1 + i);
+ int16x8_t v145_tmp = vqrdmulhq_n_s16(v144, 13573);
+ int16x8_t v145 = vaddq_s16(v145_tmp, v144);
+ int16x8_t v146 = vld1q_s16(in + in_stride * 33 + i);
+ int16x8_t v147 = vld1q_s16(in + in_stride * 31 + i);
+ int16x8_t v148 = vaddq_s16(v146, v147);
+ int16x8_t v149 = vaddq_s16(v145, v148);
+ int16x8_t v150 = vld1q_s16(in + in_stride * 17 + i);
+ int16x8_t v151 = vld1q_s16(in + in_stride * 15 + i);
+ int16x8_t v152 = vaddq_s16(v150, v151);
+ int16x8_t v153 = vqrdmulhq_n_s16(v152, 25080);
+ int16x8_t v154 = vld1q_s16(in + in_stride * 49 + i);
+ int16x8_t v155 = vld1q_s16(in + in_stride * 47 + i);
+ int16x8_t v156 = vaddq_s16(v154, v155);
+ int16x8_t v157 = vaddq_s16(v156, v152);
+ int16x8_t v158 = vqrdmulhq_n_s16(v157, 17734);
+ int16x8_t v159 = vaddq_s16(v153, v158);
+ int16x8_t v160 = vaddq_s16(v149, v159);
+ int16x8_t v161 = vld1q_s16(in + in_stride * 9 + i);
+ int16x8_t v162 = vld1q_s16(in + in_stride * 7 + i);
+ int16x8_t v163 = vaddq_s16(v161, v162);
+ int16x8_t v164_tmp = vqrdmulhq_n_s16(v163, 13573);
+ int16x8_t v164 = vaddq_s16(v164_tmp, v163);
+ int16x8_t v165 = vld1q_s16(in + in_stride * 41 + i);
+ int16x8_t v166 = vld1q_s16(in + in_stride * 39 + i);
+ int16x8_t v167 = vaddq_s16(v165, v166);
+ int16x8_t v168 = vld1q_s16(in + in_stride * 25 + i);
+ int16x8_t v169 = vld1q_s16(in + in_stride * 23 + i);
+ int16x8_t v170 = vaddq_s16(v168, v169);
+ int16x8_t v171 = vaddq_s16(v167, v170);
+ int16x8_t v172 = vaddq_s16(v164, v171);
+ int16x8_t v173 = vqrdmulhq_n_s16(v172, 16705);
+ int16x8_t v174 = vaddq_s16(v170, v163);
+ int16x8_t v175_tmp = vqrdmulhq_n_s16(v174, 10045);
+ int16x8_t v175 = vaddq_s16(v175_tmp, v174);
+ int16x8_t v176 = vld1q_s16(in + in_stride * 57 + i);
+ int16x8_t v177 = vld1q_s16(in + in_stride * 55 + i);
+ int16x8_t v178 = vaddq_s16(v176, v177);
+ int16x8_t v179 = vaddq_s16(v178, v167);
+ int16x8_t v180 = vqrdmulhq_n_s16(v179, 17734);
+ int16x8_t v181 = vaddq_s16(v175, v180);
+ int16x8_t v182 = vqrdmulhq_n_s16(v181, 16705);
+ int16x8_t v183 = vaddq_s16(v173, v182);
+ int16x8_t v184 = vaddq_s16(v160, v183);
+ int16x8_t v185 = vld1q_s16(in + in_stride * 37 + i);
+ int16x8_t v186 = vld1q_s16(in + in_stride * 35 + i);
+ int16x8_t v187 = vaddq_s16(v185, v186);
+ int16x8_t v188 = vld1q_s16(in + in_stride * 45 + i);
+ int16x8_t v189 = vld1q_s16(in + in_stride * 43 + i);
+ int16x8_t v190 = vaddq_s16(v188, v189);
+ int16x8_t v191 = vaddq_s16(v187, v190);
+ int16x8_t v192 = vld1q_s16(in + in_stride * 29 + i);
+ int16x8_t v193 = vld1q_s16(in + in_stride * 27 + i);
+ int16x8_t v194 = vaddq_s16(v192, v193);
+ int16x8_t v195 = vld1q_s16(in + in_stride * 21 + i);
+ int16x8_t v196 = vld1q_s16(in + in_stride * 19 + i);
+ int16x8_t v197 = vaddq_s16(v195, v196);
+ int16x8_t v198 = vaddq_s16(v194, v197);
+ int16x8_t v199 = vaddq_s16(v191, v198);
+ int16x8_t v200 = vld1q_s16(in + in_stride * 5 + i);
+ int16x8_t v201 = vld1q_s16(in + in_stride * 3 + i);
+ int16x8_t v202 = vaddq_s16(v200, v201);
+ int16x8_t v203 = vld1q_s16(in + in_stride * 13 + i);
+ int16x8_t v204 = vld1q_s16(in + in_stride * 11 + i);
+ int16x8_t v205 = vaddq_s16(v203, v204);
+ int16x8_t v206 = vaddq_s16(v202, v205);
+ int16x8_t v207_tmp = vqrdmulhq_n_s16(v206, 13573);
+ int16x8_t v207 = vaddq_s16(v207_tmp, v206);
+ int16x8_t v208 = vaddq_s16(v199, v207);
+ int16x8_t v209 = vqrdmulhq_n_s16(v208, 16705);
+ int16x8_t v210 = vaddq_s16(v198, v206);
+ int16x8_t v211 = vqrdmulhq_n_s16(v210, 25080);
+ int16x8_t v212 = vqrdmulhq_n_s16(v210, 17734);
+ int16x8_t v213 = vld1q_s16(in + in_stride * 53 + i);
+ int16x8_t v214 = vld1q_s16(in + in_stride * 51 + i);
+ int16x8_t v215 = vaddq_s16(v213, v214);
+ int16x8_t v216 = vld1q_s16(in + in_stride * 61 + i);
+ int16x8_t v217 = vld1q_s16(in + in_stride * 59 + i);
+ int16x8_t v218 = vaddq_s16(v216, v217);
+ int16x8_t v219 = vaddq_s16(v215, v218);
+ int16x8_t v220 = vaddq_s16(v219, v191);
+ int16x8_t v221 = vqrdmulhq_n_s16(v220, 17734);
+ int16x8_t v222 = vaddq_s16(v212, v221);
+ int16x8_t v223 = vaddq_s16(v211, v222);
+ int16x8_t v224 = vqrdmulhq_n_s16(v223, 16705);
+ int16x8_t v225 = vaddq_s16(v209, v224);
+ int16x8_t v226 = vqrdmulhq_n_s16(v225, 16463);
+ int16x8_t v227_tmp = vqrdmulhq_n_s16(v202, 13573);
+ int16x8_t v227 = vaddq_s16(v227_tmp, v202);
+ int16x8_t v228 = vaddq_s16(v187, v194);
+ int16x8_t v229 = vaddq_s16(v227, v228);
+ int16x8_t v230 = vaddq_s16(v215, v190);
+ int16x8_t v231 = vqrdmulhq_n_s16(v230, 17734);
+ int16x8_t v232 = vaddq_s16(v197, v205);
+ int16x8_t v233_tmp = vqrdmulhq_n_s16(v232, 10045);
+ int16x8_t v233 = vaddq_s16(v233_tmp, v232);
+ int16x8_t v234 = vaddq_s16(v231, v233);
+ int16x8_t v235 = vaddq_s16(v229, v234);
+ int16x8_t v236 = vqrdmulhq_n_s16(v235, 16463);
+ int16x8_t v237 = vaddq_s16(v226, v236);
+ int16x8_t v238 = vaddq_s16(v184, v237);
+ int16x8_t v239 = vaddq_s16(v201, v144);
+ int16x8_t v240_tmp = vqrdmulhq_n_s16(v239, 13573);
+ int16x8_t v240 = vaddq_s16(v240_tmp, v239);
+ int16x8_t v241 = vaddq_s16(v186, v146);
+ int16x8_t v242 = vaddq_s16(v147, v192);
+ int16x8_t v243 = vaddq_s16(v241, v242);
+ int16x8_t v244 = vaddq_s16(v240, v243);
+ int16x8_t v245 = vaddq_s16(v196, v150);
+ int16x8_t v246 = vaddq_s16(v151, v203);
+ int16x8_t v247 = vaddq_s16(v245, v246);
+ int16x8_t v248_tmp = vqrdmulhq_n_s16(v247, 10045);
+ int16x8_t v248 = vaddq_s16(v248_tmp, v247);
+ int16x8_t v249 = vaddq_s16(v155, v188);
+ int16x8_t v250 = vaddq_s16(v214, v154);
+ int16x8_t v251 = vaddq_s16(v249, v250);
+ int16x8_t v252 = vqrdmulhq_n_s16(v251, 17734);
+ int16x8_t v253 = vaddq_s16(v248, v252);
+ int16x8_t v254 = vaddq_s16(v244, v253);
+ int16x8_t v255 = vaddq_s16(v204, v161);
+ int16x8_t v256 = vaddq_s16(v162, v200);
+ int16x8_t v257 = vaddq_s16(v255, v256);
+ int16x8_t v258_tmp = vqrdmulhq_n_s16(v257, 13573);
+ int16x8_t v258 = vaddq_s16(v258_tmp, v257);
+ int16x8_t v259 = vaddq_s16(v189, v165);
+ int16x8_t v260 = vaddq_s16(v166, v185);
+ int16x8_t v261 = vaddq_s16(v259, v260);
+ int16x8_t v262 = vaddq_s16(v169, v195);
+ int16x8_t v263 = vaddq_s16(v193, v168);
+ int16x8_t v264 = vaddq_s16(v262, v263);
+ int16x8_t v265 = vaddq_s16(v261, v264);
+ int16x8_t v266 = vaddq_s16(v258, v265);
+ int16x8_t v267 = vqrdmulhq_n_s16(v266, 16705);
+ int16x8_t v268 = vaddq_s16(v264, v257);
+ int16x8_t v269 = vqrdmulhq_n_s16(v268, 25080);
+ int16x8_t v270 = vaddq_s16(v217, v176);
+ int16x8_t v271 = vaddq_s16(v177, v213);
+ int16x8_t v272 = vaddq_s16(v270, v271);
+ int16x8_t v273 = vaddq_s16(v272, v261);
+ int16x8_t v274 = vqrdmulhq_n_s16(v273, 17734);
+ int16x8_t v275 = vqrdmulhq_n_s16(v268, 17734);
+ int16x8_t v276 = vaddq_s16(v274, v275);
+ int16x8_t v277 = vaddq_s16(v269, v276);
+ int16x8_t v278 = vqrdmulhq_n_s16(v277, 16705);
+ int16x8_t v279 = vaddq_s16(v267, v278);
+ int16x8_t v280 = vaddq_s16(v254, v279);
+ int16x8_t v281 = vqrdmulhq_n_s16(v280, 16404);
+ int16x8_t v282 = vaddq_s16(v256, v239);
+ int16x8_t v283_tmp = vqrdmulhq_n_s16(v282, 13573);
+ int16x8_t v283 = vaddq_s16(v283_tmp, v282);
+ int16x8_t v284 = vaddq_s16(v260, v241);
+ int16x8_t v285 = vaddq_s16(v242, v263);
+ int16x8_t v286 = vaddq_s16(v284, v285);
+ int16x8_t v287 = vaddq_s16(v283, v286);
+ int16x8_t v288 = vaddq_s16(v262, v245);
+ int16x8_t v289 = vaddq_s16(v246, v255);
+ int16x8_t v290 = vaddq_s16(v288, v289);
+ int16x8_t v291 = vqrdmulhq_n_s16(v290, 25080);
+ int16x8_t v292 = vqrdmulhq_n_s16(v290, 17734);
+ int16x8_t v293 = vaddq_s16(v271, v250);
+ int16x8_t v294 = vaddq_s16(v249, v259);
+ int16x8_t v295 = vaddq_s16(v293, v294);
+ int16x8_t v296 = vqrdmulhq_n_s16(v295, 17734);
+ int16x8_t v297 = vaddq_s16(v292, v296);
+ int16x8_t v298 = vaddq_s16(v291, v297);
+ int16x8_t v299 = vaddq_s16(v287, v298);
+ int16x8_t v300 = vqrdmulhq_n_s16(v299, 16463);
+ int16x8_t v301 = vaddq_s16(v289, v282);
+ int16x8_t v302 = vqrdmulhq_n_s16(v301, 23624);
+ int16x8_t v303 = vaddq_s16(v294, v284);
+ int16x8_t v304 = vqrdmulhq_n_s16(v303, 19705);
+ int16x8_t v305 = vaddq_s16(v285, v288);
+ int16x8_t v306 = vqrdmulhq_n_s16(v305, 19705);
+ int16x8_t v307 = vaddq_s16(v304, v306);
+ int16x8_t v308 = vqrdmulhq_n_s16(v307, 27779);
+ int16x8_t v309 = vaddq_s16(v302, v308);
+ int16x8_t v310 = vaddq_s16(v305, v301);
+ int16x8_t v311 = vqrdmulhq_n_s16(v310, 25080);
+ int16x8_t v312 = vqrdmulhq_n_s16(v310, 17734);
+ int16x8_t v313 = vld1q_s16(in + in_stride * 63 + i);
+ int16x8_t v314 = vaddq_s16(v313, v216);
+ int16x8_t v315 = vaddq_s16(v314, v270);
+ int16x8_t v316 = vaddq_s16(v315, v293);
+ int16x8_t v317 = vqrdmulhq_n_s16(v316, 25746);
+ int16x8_t v318 = vqrdmulhq_n_s16(v303, 25746);
+ int16x8_t v319 = vaddq_s16(v317, v318);
+ int16x8_t v320 = vqrdmulhq_n_s16(v319, 22571);
+ int16x8_t v321 = vaddq_s16(v312, v320);
+ int16x8_t v322 = vaddq_s16(v311, v321);
+ int16x8_t v323 = vqrdmulhq_n_s16(v322, 16705);
+ int16x8_t v324 = vaddq_s16(v309, v323);
+ int16x8_t v325 = vqrdmulhq_n_s16(v324, 16463);
+ int16x8_t v326 = vaddq_s16(v300, v325);
+ int16x8_t v327 = vqrdmulhq_n_s16(v326, 16404);
+ int16x8_t v328 = vaddq_s16(v281, v327);
+ int16x8_t v329 = vaddq_s16(v238, v328);
+ int16x8_t v330 = vqrdmulhq_n_s16(v329, 16389);
+ int16x8_t v331 = vaddq_s16(v143, v330);
+ int16x8_t v332 = vsubq_s16(v82, v89);
+ int16x8_t v333 = vqrdmulhq_n_s16(v332, 19705);
+ int16x8_t v334 = vqrdmulhq_n_s16(v92, 13573);
+ int16x8_t v335 = vsubq_s16(v334, v97);
+ int16x8_t v336 = vqrdmulhq_n_s16(v335, 25746);
+ int16x8_t v337 = vaddq_s16(v333, v336);
+ int16x8_t v338 = vsubq_s16(v63, v66);
+ int16x8_t v339 = vqrdmulhq_n_s16(v70, 17734);
+ int16x8_t v340_tmp = vqrdmulhq_n_s16(v74, 10045);
+ int16x8_t v340 = vaddq_s16(v340_tmp, v74);
+ int16x8_t v341 = vsubq_s16(v339, v340);
+ int16x8_t v342 = vaddq_s16(v338, v341);
+ int16x8_t v343 = vaddq_s16(v337, v342);
+ int16x8_t v344 = vsubq_s16(v130, v131);
+ int16x8_t v345 = vqrdmulhq_n_s16(v133, 13573);
+ int16x8_t v346 = vsubq_s16(v345, v135);
+ int16x8_t v347_tmp = vqrdmulhq_n_s16(v346, 10045);
+ int16x8_t v347 = vaddq_s16(v347_tmp, v346);
+ int16x8_t v348 = vaddq_s16(v344, v347);
+ int16x8_t v349 = vqrdmulhq_n_s16(v348, 17121);
+ int16x8_t v350 = vqrdmulhq_n_s16(v105, 27867);
+ int16x8_t v351 = vqrdmulhq_n_s16(v113, 19705);
+ int16x8_t v352 = vsubq_s16(v350, v351);
+ int16x8_t v353 = vqrdmulhq_n_s16(v116, 13573);
+ int16x8_t v354 = vsubq_s16(v353, v123);
+ int16x8_t v355 = vqrdmulhq_n_s16(v354, 25746);
+ int16x8_t v356 = vaddq_s16(v352, v355);
+ int16x8_t v357 = vqrdmulhq_n_s16(v356, 17121);
+ int16x8_t v358 = vaddq_s16(v349, v357);
+ int16x8_t v359 = vaddq_s16(v343, v358);
+ int16x8_t v360 = vqrdmulhq_n_s16(v359, 16563);
+ int16x8_t v361 = vsubq_s16(v27, v30);
+ int16x8_t v362 = vqrdmulhq_n_s16(v34, 17734);
+ int16x8_t v363_tmp = vqrdmulhq_n_s16(v38, 10045);
+ int16x8_t v363 = vaddq_s16(v363_tmp, v38);
+ int16x8_t v364 = vsubq_s16(v362, v363);
+ int16x8_t v365 = vaddq_s16(v361, v364);
+ int16x8_t v366 = vsubq_s16(v44, v47);
+ int16x8_t v367 = vqrdmulhq_n_s16(v366, 19705);
+ int16x8_t v368 = vqrdmulhq_n_s16(v50, 13573);
+ int16x8_t v369 = vsubq_s16(v368, v54);
+ int16x8_t v370 = vqrdmulhq_n_s16(v369, 25746);
+ int16x8_t v371 = vaddq_s16(v367, v370);
+ int16x8_t v372 = vaddq_s16(v365, v371);
+ int16x8_t v373 = vqrdmulhq_n_s16(v372, 17121);
+ int16x8_t v374 = vsubq_s16(v0, v1);
+ int16x8_t v375 = vsubq_s16(v4, v6);
+ int16x8_t v376_tmp = vqrdmulhq_n_s16(v375, 10045);
+ int16x8_t v376 = vaddq_s16(v376_tmp, v375);
+ int16x8_t v377 = vaddq_s16(v374, v376);
+ int16x8_t v378 = vsubq_s16(v11, v14);
+ int16x8_t v379 = vqrdmulhq_n_s16(v18, 17734);
+ int16x8_t v380_tmp = vqrdmulhq_n_s16(v17, 10045);
+ int16x8_t v380 = vaddq_s16(v380_tmp, v17);
+ int16x8_t v381 = vsubq_s16(v379, v380);
+ int16x8_t v382 = vaddq_s16(v378, v381);
+ int16x8_t v383 = vqrdmulhq_n_s16(v382, 19705);
+ int16x8_t v384 = vaddq_s16(v377, v383);
+ int16x8_t v385 = vaddq_s16(v373, v384);
+ int16x8_t v386 = vaddq_s16(v360, v385);
+ int16x8_t v387 = vsubq_s16(v145, v148);
+ int16x8_t v388 = vqrdmulhq_n_s16(v152, 17734);
+ int16x8_t v389_tmp = vqrdmulhq_n_s16(v156, 10045);
+ int16x8_t v389 = vaddq_s16(v389_tmp, v156);
+ int16x8_t v390 = vsubq_s16(v388, v389);
+ int16x8_t v391 = vaddq_s16(v387, v390);
+ int16x8_t v392 = vsubq_s16(v164, v171);
+ int16x8_t v393 = vqrdmulhq_n_s16(v392, 19705);
+ int16x8_t v394 = vqrdmulhq_n_s16(v174, 13573);
+ int16x8_t v395 = vsubq_s16(v394, v179);
+ int16x8_t v396 = vqrdmulhq_n_s16(v395, 25746);
+ int16x8_t v397 = vaddq_s16(v393, v396);
+ int16x8_t v398 = vaddq_s16(v391, v397);
+ int16x8_t v399 = vsubq_s16(v227, v228);
+ int16x8_t v400 = vqrdmulhq_n_s16(v232, 13573);
+ int16x8_t v401 = vsubq_s16(v400, v230);
+ int16x8_t v402_tmp = vqrdmulhq_n_s16(v401, 10045);
+ int16x8_t v402 = vaddq_s16(v402_tmp, v401);
+ int16x8_t v403 = vaddq_s16(v399, v402);
+ int16x8_t v404 = vqrdmulhq_n_s16(v403, 17121);
+ int16x8_t v405 = vqrdmulhq_n_s16(v206, 27867);
+ int16x8_t v406 = vqrdmulhq_n_s16(v199, 19705);
+ int16x8_t v407 = vsubq_s16(v405, v406);
+ int16x8_t v408 = vqrdmulhq_n_s16(v210, 13573);
+ int16x8_t v409 = vsubq_s16(v408, v220);
+ int16x8_t v410 = vqrdmulhq_n_s16(v409, 25746);
+ int16x8_t v411 = vaddq_s16(v407, v410);
+ int16x8_t v412 = vqrdmulhq_n_s16(v411, 17121);
+ int16x8_t v413 = vaddq_s16(v404, v412);
+ int16x8_t v414 = vaddq_s16(v398, v413);
+ int16x8_t v415 = vsubq_s16(v240, v243);
+ int16x8_t v416 = vqrdmulhq_n_s16(v247, 13573);
+ int16x8_t v417 = vsubq_s16(v416, v251);
+ int16x8_t v418_tmp = vqrdmulhq_n_s16(v417, 10045);
+ int16x8_t v418 = vaddq_s16(v418_tmp, v417);
+ int16x8_t v419 = vaddq_s16(v415, v418);
+ int16x8_t v420 = vqrdmulhq_n_s16(v257, 27867);
+ int16x8_t v421 = vqrdmulhq_n_s16(v265, 19705);
+ int16x8_t v422 = vsubq_s16(v420, v421);
+ int16x8_t v423 = vqrdmulhq_n_s16(v268, 13573);
+ int16x8_t v424 = vsubq_s16(v423, v273);
+ int16x8_t v425 = vqrdmulhq_n_s16(v424, 25746);
+ int16x8_t v426 = vaddq_s16(v422, v425);
+ int16x8_t v427 = vaddq_s16(v419, v426);
+ int16x8_t v428 = vqrdmulhq_n_s16(v427, 16563);
+ int16x8_t v429 = vqrdmulhq_n_s16(v301, 27867);
+ int16x8_t v430 = vsubq_s16(v429, v307);
+ int16x8_t v431 = vqrdmulhq_n_s16(v310, 10664);
+ int16x8_t v432 = vsubq_s16(v431, v319);
+ int16x8_t v433 = vaddq_s16(v430, v432);
+ int16x8_t v434 = vqrdmulhq_n_s16(v433, 17121);
+ int16x8_t v435 = vsubq_s16(v283, v286);
+ int16x8_t v436 = vqrdmulhq_n_s16(v290, 13573);
+ int16x8_t v437 = vsubq_s16(v436, v295);
+ int16x8_t v438_tmp = vqrdmulhq_n_s16(v437, 10045);
+ int16x8_t v438 = vaddq_s16(v438_tmp, v437);
+ int16x8_t v439 = vaddq_s16(v435, v438);
+ int16x8_t v440 = vqrdmulhq_n_s16(v439, 17121);
+ int16x8_t v441 = vaddq_s16(v434, v440);
+ int16x8_t v442 = vqrdmulhq_n_s16(v441, 16563);
+ int16x8_t v443 = vaddq_s16(v428, v442);
+ int16x8_t v444 = vaddq_s16(v414, v443);
+ int16x8_t v445 = vqrdmulhq_n_s16(v444, 16429);
+ int16x8_t v446 = vaddq_s16(v386, v445);
+ int16x8_t v447 = vsubq_s16(v374, v376);
+ int16x8_t v448 = vsubq_s16(v378, v381);
+ int16x8_t v449 = vqrdmulhq_n_s16(v448, 29490);
+ int16x8_t v450 = vaddq_s16(v447, v449);
+ int16x8_t v451 = vsubq_s16(v361, v364);
+ int16x8_t v452 = vqrdmulhq_n_s16(v366, 29490);
+ int16x8_t v453_tmp = vqrdmulhq_n_s16(v369, 5763);
+ int16x8_t v453 = vaddq_s16(v453_tmp, v369);
+ int16x8_t v454 = vsubq_s16(v452, v453);
+ int16x8_t v455 = vaddq_s16(v451, v454);
+ int16x8_t v456 = vqrdmulhq_n_s16(v455, 18578);
+ int16x8_t v457 = vaddq_s16(v450, v456);
+ int16x8_t v458 = vsubq_s16(v338, v341);
+ int16x8_t v459 = vqrdmulhq_n_s16(v332, 29490);
+ int16x8_t v460_tmp = vqrdmulhq_n_s16(v335, 5763);
+ int16x8_t v460 = vaddq_s16(v460_tmp, v335);
+ int16x8_t v461 = vsubq_s16(v459, v460);
+ int16x8_t v462 = vaddq_s16(v458, v461);
+ int16x8_t v463 = vqrdmulhq_n_s16(v352, 27803);
+ int16x8_t v464 = vqrdmulhq_n_s16(v354, 21845);
+ int16x8_t v465 = vsubq_s16(v463, v464);
+ int16x8_t v466 = vsubq_s16(v344, v347);
+ int16x8_t v467 = vqrdmulhq_n_s16(v466, 18578);
+ int16x8_t v468 = vaddq_s16(v465, v467);
+ int16x8_t v469 = vaddq_s16(v462, v468);
+ int16x8_t v470 = vqrdmulhq_n_s16(v469, 16890);
+ int16x8_t v471 = vaddq_s16(v457, v470);
+ int16x8_t v472 = vsubq_s16(v415, v418);
+ int16x8_t v473_tmp = vqrdmulhq_n_s16(v422, 16273);
+ int16x8_t v473 = vaddq_s16(v473_tmp, v422);
+ int16x8_t v474_tmp = vqrdmulhq_n_s16(v424, 5763);
+ int16x8_t v474 = vaddq_s16(v474_tmp, v424);
+ int16x8_t v475 = vsubq_s16(v473, v474);
+ int16x8_t v476 = vaddq_s16(v472, v475);
+ int16x8_t v477 = vqrdmulhq_n_s16(v476, 16890);
+ int16x8_t v478 = vqrdmulhq_n_s16(v435, 20261);
+ int16x8_t v479 = vqrdmulhq_n_s16(v437, 26472);
+ int16x8_t v480 = vsubq_s16(v478, v479);
+ int16x8_t v481 = vqrdmulhq_n_s16(v480, 30046);
+ int16x8_t v482 = vqrdmulhq_n_s16(v430, 30322);
+ int16x8_t v483 = vqrdmulhq_n_s16(v432, 30322);
+ int16x8_t v484 = vsubq_s16(v482, v483);
+ int16x8_t v485 = vqrdmulhq_n_s16(v484, 30046);
+ int16x8_t v486 = vaddq_s16(v481, v485);
+ int16x8_t v487 = vqrdmulhq_n_s16(v486, 16890);
+ int16x8_t v488 = vaddq_s16(v477, v487);
+ int16x8_t v489 = vsubq_s16(v387, v390);
+ int16x8_t v490 = vqrdmulhq_n_s16(v392, 29490);
+ int16x8_t v491_tmp = vqrdmulhq_n_s16(v395, 5763);
+ int16x8_t v491 = vaddq_s16(v491_tmp, v395);
+ int16x8_t v492 = vsubq_s16(v490, v491);
+ int16x8_t v493 = vaddq_s16(v489, v492);
+ int16x8_t v494 = vsubq_s16(v399, v402);
+ int16x8_t v495 = vqrdmulhq_n_s16(v494, 18578);
+ int16x8_t v496 = vqrdmulhq_n_s16(v407, 27803);
+ int16x8_t v497 = vqrdmulhq_n_s16(v409, 21845);
+ int16x8_t v498 = vsubq_s16(v496, v497);
+ int16x8_t v499 = vaddq_s16(v495, v498);
+ int16x8_t v500 = vaddq_s16(v493, v499);
+ int16x8_t v501 = vaddq_s16(v488, v500);
+ int16x8_t v502 = vqrdmulhq_n_s16(v501, 16508);
+ int16x8_t v503 = vaddq_s16(v471, v502);
+ int16x8_t v504 = vsubq_s16(v2, v8);
+ int16x8_t v505 = vsubq_s16(v15, v22);
+ int16x8_t v506_tmp = vqrdmulhq_n_s16(v505, 18446);
+ int16x8_t v506 = vmlaq_n_s16(v506_tmp, v505, 2);
+ int16x8_t v507 = vaddq_s16(v504, v506);
+ int16x8_t v508 = vsubq_s16(v31, v41);
+ int16x8_t v509 = vsubq_s16(v48, v56);
+ int16x8_t v510_tmp = vqrdmulhq_n_s16(v509, 18446);
+ int16x8_t v510 = vmlaq_n_s16(v510_tmp, v509, 2);
+ int16x8_t v511 = vaddq_s16(v508, v510);
+ int16x8_t v512 = vqrdmulhq_n_s16(v511, 21195);
+ int16x8_t v513 = vaddq_s16(v507, v512);
+ int16x8_t v514 = vsubq_s16(v67, v77);
+ int16x8_t v515 = vsubq_s16(v90, v99);
+ int16x8_t v516_tmp = vqrdmulhq_n_s16(v515, 18446);
+ int16x8_t v516 = vmlaq_n_s16(v516_tmp, v515, 2);
+ int16x8_t v517 = vaddq_s16(v514, v516);
+ int16x8_t v518 = vsubq_s16(v114, v126);
+ int16x8_t v519_tmp = vqrdmulhq_n_s16(v518, 18446);
+ int16x8_t v519 = vmlaq_n_s16(v519_tmp, v518, 2);
+ int16x8_t v520 = vsubq_s16(v132, v137);
+ int16x8_t v521 = vaddq_s16(v519, v520);
+ int16x8_t v522 = vqrdmulhq_n_s16(v521, 21195);
+ int16x8_t v523 = vaddq_s16(v517, v522);
+ int16x8_t v524 = vqrdmulhq_n_s16(v523, 17401);
+ int16x8_t v525 = vaddq_s16(v513, v524);
+ int16x8_t v526 = vsubq_s16(v172, v181);
+ int16x8_t v527_tmp = vqrdmulhq_n_s16(v526, 18446);
+ int16x8_t v527 = vmlaq_n_s16(v527_tmp, v526, 2);
+ int16x8_t v528 = vsubq_s16(v149, v159);
+ int16x8_t v529 = vaddq_s16(v527, v528);
+ int16x8_t v530 = vsubq_s16(v229, v234);
+ int16x8_t v531 = vsubq_s16(v208, v223);
+ int16x8_t v532_tmp = vqrdmulhq_n_s16(v531, 18446);
+ int16x8_t v532 = vmlaq_n_s16(v532_tmp, v531, 2);
+ int16x8_t v533 = vaddq_s16(v530, v532);
+ int16x8_t v534 = vqrdmulhq_n_s16(v533, 21195);
+ int16x8_t v535 = vaddq_s16(v529, v534);
+ int16x8_t v536 = vsubq_s16(v244, v253);
+ int16x8_t v537 = vsubq_s16(v266, v277);
+ int16x8_t v538_tmp = vqrdmulhq_n_s16(v537, 18446);
+ int16x8_t v538 = vmlaq_n_s16(v538_tmp, v537, 2);
+ int16x8_t v539 = vaddq_s16(v536, v538);
+ int16x8_t v540 = vqrdmulhq_n_s16(v539, 17401);
+ int16x8_t v541 = vqrdmulhq_n_s16(v287, 25826);
+ int16x8_t v542 = vqrdmulhq_n_s16(v298, 25826);
+ int16x8_t v543 = vsubq_s16(v541, v542);
+ int16x8_t v544 = vqrdmulhq_n_s16(v543, 14281);
+ int16x8_t v545_tmp = vqrdmulhq_n_s16(v309, 31509);
+ int16x8_t v545 = vaddq_s16(v545_tmp, v309);
+ int16x8_t v546 = vsubq_s16(v545, v322);
+ int16x8_t v547 = vqrdmulhq_n_s16(v546, 28847);
+ int16x8_t v548 = vaddq_s16(v544, v547);
+ int16x8_t v549 = vaddq_s16(v540, v548);
+ int16x8_t v550 = vaddq_s16(v535, v549);
+ int16x8_t v551 = vqrdmulhq_n_s16(v550, 16629);
+ int16x8_t v552 = vaddq_s16(v525, v551);
+ int16x8_t v553 = vsubq_s16(v504, v506);
+ int16x8_t v554 = vsubq_s16(v508, v510);
+ int16x8_t v555 = vqrdmulhq_n_s16(v554, 25826);
+ int16x8_t v556 = vaddq_s16(v553, v555);
+ int16x8_t v557 = vsubq_s16(v514, v516);
+ int16x8_t v558 = vsubq_s16(v520, v519);
+ int16x8_t v559 = vqrdmulhq_n_s16(v558, 25826);
+ int16x8_t v560 = vaddq_s16(v557, v559);
+ int16x8_t v561 = vqrdmulhq_n_s16(v560, 18124);
+ int16x8_t v562 = vaddq_s16(v556, v561);
+ int16x8_t v563 = vsubq_s16(v528, v527);
+ int16x8_t v564 = vsubq_s16(v530, v532);
+ int16x8_t v565 = vqrdmulhq_n_s16(v564, 25826);
+ int16x8_t v566 = vaddq_s16(v563, v565);
+ int16x8_t v567 = vsubq_s16(v536, v538);
+ int16x8_t v568 = vqrdmulhq_n_s16(v567, 18124);
+ int16x8_t v569_tmp = vqrdmulhq_n_s16(v546, 654);
+ int16x8_t v569 = vmlaq_n_s16(v569_tmp, v546, 2);
+ int16x8_t v570 = vsubq_s16(v543, v569);
+ int16x8_t v571 = vqrdmulhq_n_s16(v570, 18124);
+ int16x8_t v572 = vaddq_s16(v568, v571);
+ int16x8_t v573 = vaddq_s16(v566, v572);
+ int16x8_t v574 = vqrdmulhq_n_s16(v573, 16792);
+ int16x8_t v575 = vaddq_s16(v562, v574);
+ int16x8_t v576 = vsubq_s16(v458, v461);
+ int16x8_t v577_tmp = vqrdmulhq_n_s16(v465, 25030);
+ int16x8_t v577 = vaddq_s16(v577_tmp, v465);
+ int16x8_t v578 = vsubq_s16(v466, v577);
+ int16x8_t v579_tmp = vqrdmulhq_n_s16(v578, 1988);
+ int16x8_t v579 = vaddq_s16(v579_tmp, v578);
+ int16x8_t v580 = vaddq_s16(v576, v579);
+ int16x8_t v581 = vqrdmulhq_n_s16(v580, 19102);
+ int16x8_t v582 = vsubq_s16(v447, v449);
+ int16x8_t v583 = vsubq_s16(v451, v454);
+ int16x8_t v584_tmp = vqrdmulhq_n_s16(v583, 1988);
+ int16x8_t v584 = vaddq_s16(v584_tmp, v583);
+ int16x8_t v585 = vaddq_s16(v582, v584);
+ int16x8_t v586 = vaddq_s16(v581, v585);
+ int16x8_t v587 = vsubq_s16(v489, v492);
+ int16x8_t v588_tmp = vqrdmulhq_n_s16(v498, 25030);
+ int16x8_t v588 = vaddq_s16(v588_tmp, v498);
+ int16x8_t v589 = vsubq_s16(v494, v588);
+ int16x8_t v590_tmp = vqrdmulhq_n_s16(v589, 1988);
+ int16x8_t v590 = vaddq_s16(v590_tmp, v589);
+ int16x8_t v591 = vaddq_s16(v587, v590);
+ int16x8_t v592 = vsubq_s16(v472, v475);
+ int16x8_t v593 = vqrdmulhq_n_s16(v592, 19102);
+ int16x8_t v594 = vsubq_s16(v480, v484);
+ int16x8_t v595 = vaddq_s16(v593, v594);
+ int16x8_t v596 = vaddq_s16(v591, v595);
+ int16x8_t v597 = vqrdmulhq_n_s16(v596, 17000);
+ int16x8_t v598 = vaddq_s16(v586, v597);
+ int16x8_t v599 = vsubq_s16(v365, v371);
+ int16x8_t v600_tmp = vqrdmulhq_n_s16(v599, 23673);
+ int16x8_t v600 = vaddq_s16(v600_tmp, v599);
+ int16x8_t v601 = vsubq_s16(v377, v383);
+ int16x8_t v602 = vaddq_s16(v600, v601);
+ int16x8_t v603 = vsubq_s16(v348, v356);
+ int16x8_t v604_tmp = vqrdmulhq_n_s16(v603, 23673);
+ int16x8_t v604 = vaddq_s16(v604_tmp, v603);
+ int16x8_t v605 = vsubq_s16(v342, v337);
+ int16x8_t v606 = vaddq_s16(v604, v605);
+ int16x8_t v607 = vqrdmulhq_n_s16(v606, 20398);
+ int16x8_t v608 = vaddq_s16(v602, v607);
+ int16x8_t v609 = vsubq_s16(v391, v397);
+ int16x8_t v610 = vsubq_s16(v403, v411);
+ int16x8_t v611_tmp = vqrdmulhq_n_s16(v610, 23673);
+ int16x8_t v611 = vaddq_s16(v611_tmp, v610);
+ int16x8_t v612 = vaddq_s16(v609, v611);
+ int16x8_t v613 = vsubq_s16(v419, v426);
+ int16x8_t v614 = vqrdmulhq_n_s16(v613, 20398);
+ int16x8_t v615 = vsubq_s16(v439, v433);
+ int16x8_t v616_tmp = vqrdmulhq_n_s16(v615, 2367);
+ int16x8_t v616 = vaddq_s16(v616_tmp, v615);
+ int16x8_t v617 = vaddq_s16(v614, v616);
+ int16x8_t v618 = vaddq_s16(v612, v617);
+ int16x8_t v619 = vqrdmulhq_n_s16(v618, 17255);
+ int16x8_t v620 = vaddq_s16(v608, v619);
+ int16x8_t v621 = vsubq_s16(v160, v183);
+ int16x8_t v622 = vsubq_s16(v235, v225);
+ int16x8_t v623_tmp = vqrdmulhq_n_s16(v622, 3314);
+ int16x8_t v623 = vmlaq_n_s16(v623_tmp, v622, 5);
+ int16x8_t v624 = vaddq_s16(v621, v623);
+ int16x8_t v625 = vsubq_s16(v254, v279);
+ int16x8_t v626 = vsubq_s16(v299, v324);
+ int16x8_t v627_tmp = vqrdmulhq_n_s16(v626, 3314);
+ int16x8_t v627 = vmlaq_n_s16(v627_tmp, v626, 5);
+ int16x8_t v628 = vaddq_s16(v625, v627);
+ int16x8_t v629 = vqrdmulhq_n_s16(v628, 22112);
+ int16x8_t v630 = vaddq_s16(v624, v629);
+ int16x8_t v631 = vqrdmulhq_n_s16(v630, 17561);
+ int16x8_t v632 = vsubq_s16(v9, v24);
+ int16x8_t v633 = vsubq_s16(v42, v58);
+ int16x8_t v634_tmp = vqrdmulhq_n_s16(v633, 3314);
+ int16x8_t v634 = vmlaq_n_s16(v634_tmp, v633, 5);
+ int16x8_t v635 = vaddq_s16(v632, v634);
+ int16x8_t v636 = vsubq_s16(v78, v101);
+ int16x8_t v637 = vsubq_s16(v138, v128);
+ int16x8_t v638_tmp = vqrdmulhq_n_s16(v637, 3314);
+ int16x8_t v638 = vmlaq_n_s16(v638_tmp, v637, 5);
+ int16x8_t v639 = vaddq_s16(v636, v638);
+ int16x8_t v640 = vqrdmulhq_n_s16(v639, 22112);
+ int16x8_t v641 = vaddq_s16(v635, v640);
+ int16x8_t v642 = vaddq_s16(v631, v641);
+ int16x8_t v643 = vsubq_s16(v632, v634);
+ int16x8_t v644 = vsubq_s16(v636, v638);
+ int16x8_t v645 = vqrdmulhq_n_s16(v644, 24397);
+ int16x8_t v646 = vaddq_s16(v643, v645);
+ int16x8_t v647 = vsubq_s16(v621, v623);
+ int16x8_t v648 = vsubq_s16(v625, v627);
+ int16x8_t v649 = vqrdmulhq_n_s16(v648, 24397);
+ int16x8_t v650 = vaddq_s16(v647, v649);
+ int16x8_t v651 = vqrdmulhq_n_s16(v650, 17921);
+ int16x8_t v652 = vaddq_s16(v646, v651);
+ int16x8_t v653 = vsubq_s16(v601, v600);
+ int16x8_t v654 = vsubq_s16(v605, v604);
+ int16x8_t v655 = vqrdmulhq_n_s16(v654, 27504);
+ int16x8_t v656 = vaddq_s16(v653, v655);
+ int16x8_t v657 = vsubq_s16(v609, v611);
+ int16x8_t v658 = vqrdmulhq_n_s16(v613, 27504);
+ int16x8_t v659_tmp = vqrdmulhq_n_s16(v615, 14606);
+ int16x8_t v659 = vaddq_s16(v659_tmp, v615);
+ int16x8_t v660 = vsubq_s16(v658, v659);
+ int16x8_t v661 = vaddq_s16(v657, v660);
+ int16x8_t v662 = vqrdmulhq_n_s16(v661, 18343);
+ int16x8_t v663 = vaddq_s16(v656, v662);
+ int16x8_t v664 = vsubq_s16(v582, v584);
+ int16x8_t v665 = vsubq_s16(v576, v579);
+ int16x8_t v666 = vqrdmulhq_n_s16(v665, 31869);
+ int16x8_t v667 = vaddq_s16(v664, v666);
+ int16x8_t v668 = vsubq_s16(v587, v590);
+ int16x8_t v669_tmp = vqrdmulhq_n_s16(v594, 23444);
+ int16x8_t v669 = vaddq_s16(v669_tmp, v594);
+ int16x8_t v670 = vsubq_s16(v592, v669);
+ int16x8_t v671 = vqrdmulhq_n_s16(v670, 31869);
+ int16x8_t v672 = vaddq_s16(v668, v671);
+ int16x8_t v673 = vqrdmulhq_n_s16(v672, 18830);
+ int16x8_t v674 = vaddq_s16(v667, v673);
+ int16x8_t v675 = vsubq_s16(v553, v555);
+ int16x8_t v676 = vsubq_s16(v557, v559);
+ int16x8_t v677_tmp = vqrdmulhq_n_s16(v676, 5552);
+ int16x8_t v677 = vaddq_s16(v677_tmp, v676);
+ int16x8_t v678 = vaddq_s16(v675, v677);
+ int16x8_t v679 = vsubq_s16(v563, v565);
+ int16x8_t v680 = vsubq_s16(v567, v570);
+ int16x8_t v681_tmp = vqrdmulhq_n_s16(v680, 5552);
+ int16x8_t v681 = vaddq_s16(v681_tmp, v680);
+ int16x8_t v682 = vaddq_s16(v679, v681);
+ int16x8_t v683 = vqrdmulhq_n_s16(v682, 19393);
+ int16x8_t v684 = vaddq_s16(v678, v683);
+ int16x8_t v685 = vsubq_s16(v507, v512);
+ int16x8_t v686 = vsubq_s16(v517, v522);
+ int16x8_t v687_tmp = vqrdmulhq_n_s16(v686, 15865);
+ int16x8_t v687 = vaddq_s16(v687_tmp, v686);
+ int16x8_t v688 = vaddq_s16(v685, v687);
+ int16x8_t v689 = vsubq_s16(v529, v534);
+ int16x8_t v690_tmp = vqrdmulhq_n_s16(v548, 28937);
+ int16x8_t v690 = vaddq_s16(v690_tmp, v548);
+ int16x8_t v691 = vsubq_s16(v539, v690);
+ int16x8_t v692_tmp = vqrdmulhq_n_s16(v691, 15865);
+ int16x8_t v692 = vaddq_s16(v692_tmp, v691);
+ int16x8_t v693 = vaddq_s16(v689, v692);
+ int16x8_t v694 = vqrdmulhq_n_s16(v693, 20040);
+ int16x8_t v695 = vaddq_s16(v688, v694);
+ int16x8_t v696 = vsubq_s16(v476, v486);
+ int16x8_t v697_tmp = vqrdmulhq_n_s16(v696, 1893);
+ int16x8_t v697 = vmlaq_n_s16(v697_tmp, v696, 2);
+ int16x8_t v698 = vsubq_s16(v493, v499);
+ int16x8_t v699 = vaddq_s16(v697, v698);
+ int16x8_t v700 = vqrdmulhq_n_s16(v699, 20783);
+ int16x8_t v701 = vsubq_s16(v450, v456);
+ int16x8_t v702 = vsubq_s16(v462, v468);
+ int16x8_t v703_tmp = vqrdmulhq_n_s16(v702, 1893);
+ int16x8_t v703 = vmlaq_n_s16(v703_tmp, v702, 2);
+ int16x8_t v704 = vaddq_s16(v701, v703);
+ int16x8_t v705 = vaddq_s16(v700, v704);
+ int16x8_t v706 = vsubq_s16(v384, v373);
+ int16x8_t v707 = vsubq_s16(v343, v358);
+ int16x8_t v708_tmp = vqrdmulhq_n_s16(v707, 13357);
+ int16x8_t v708 = vmlaq_n_s16(v708_tmp, v707, 3);
+ int16x8_t v709 = vaddq_s16(v706, v708);
+ int16x8_t v710 = vsubq_s16(v398, v413);
+ int16x8_t v711 = vsubq_s16(v427, v441);
+ int16x8_t v712_tmp = vqrdmulhq_n_s16(v711, 13357);
+ int16x8_t v712 = vmlaq_n_s16(v712_tmp, v711, 3);
+ int16x8_t v713 = vaddq_s16(v710, v712);
+ int16x8_t v714 = vqrdmulhq_n_s16(v713, 21637);
+ int16x8_t v715 = vaddq_s16(v709, v714);
+ int16x8_t v716 = vsubq_s16(v25, v60);
+ int16x8_t v717 = vsubq_s16(v102, v140);
+ int16x8_t v718_tmp = vqrdmulhq_n_s16(v717, 6226);
+ int16x8_t v718 = vmlaq_n_s16(v718_tmp, v717, 10);
+ int16x8_t v719 = vaddq_s16(v716, v718);
+ int16x8_t v720 = vsubq_s16(v280, v326);
+ int16x8_t v721_tmp = vqrdmulhq_n_s16(v720, 6226);
+ int16x8_t v721 = vmlaq_n_s16(v721_tmp, v720, 10);
+ int16x8_t v722 = vsubq_s16(v184, v237);
+ int16x8_t v723 = vaddq_s16(v721, v722);
+ int16x8_t v724 = vqrdmulhq_n_s16(v723, 22622);
+ int16x8_t v725 = vaddq_s16(v719, v724);
+ int16x8_t v726 = vsubq_s16(v716, v718);
+ int16x8_t v727 = vsubq_s16(v722, v721);
+ int16x8_t v728 = vqrdmulhq_n_s16(v727, 23761);
+ int16x8_t v729 = vaddq_s16(v726, v728);
+ int16x8_t v730 = vsubq_s16(v706, v708);
+ int16x8_t v731 = vsubq_s16(v710, v712);
+ int16x8_t v732 = vqrdmulhq_n_s16(v731, 25084);
+ int16x8_t v733 = vaddq_s16(v730, v732);
+ int16x8_t v734 = vsubq_s16(v701, v703);
+ int16x8_t v735 = vsubq_s16(v698, v697);
+ int16x8_t v736 = vqrdmulhq_n_s16(v735, 26631);
+ int16x8_t v737 = vaddq_s16(v734, v736);
+ int16x8_t v738 = vsubq_s16(v685, v687);
+ int16x8_t v739 = vsubq_s16(v689, v692);
+ int16x8_t v740 = vqrdmulhq_n_s16(v739, 28454);
+ int16x8_t v741 = vaddq_s16(v738, v740);
+ int16x8_t v742 = vsubq_s16(v675, v677);
+ int16x8_t v743 = vsubq_s16(v679, v681);
+ int16x8_t v744 = vqrdmulhq_n_s16(v743, 30624);
+ int16x8_t v745 = vaddq_s16(v742, v744);
+ int16x8_t v746 = vsubq_s16(v664, v666);
+ int16x8_t v747 = vsubq_s16(v668, v671);
+ int16x8_t v748_tmp = vqrdmulhq_n_s16(v747, 472);
+ int16x8_t v748 = vaddq_s16(v748_tmp, v747);
+ int16x8_t v749 = vaddq_s16(v746, v748);
+ int16x8_t v750 = vsubq_s16(v653, v655);
+ int16x8_t v751 = vsubq_s16(v657, v660);
+ int16x8_t v752_tmp = vqrdmulhq_n_s16(v751, 3672);
+ int16x8_t v752 = vaddq_s16(v752_tmp, v751);
+ int16x8_t v753 = vaddq_s16(v750, v752);
+ int16x8_t v754 = vsubq_s16(v643, v645);
+ int16x8_t v755 = vsubq_s16(v647, v649);
+ int16x8_t v756_tmp = vqrdmulhq_n_s16(v755, 7662);
+ int16x8_t v756 = vaddq_s16(v756_tmp, v755);
+ int16x8_t v757 = vaddq_s16(v754, v756);
+ int16x8_t v758 = vsubq_s16(v635, v640);
+ int16x8_t v759 = vsubq_s16(v624, v629);
+ int16x8_t v760_tmp = vqrdmulhq_n_s16(v759, 12756);
+ int16x8_t v760 = vaddq_s16(v760_tmp, v759);
+ int16x8_t v761 = vaddq_s16(v758, v760);
+ int16x8_t v762 = vsubq_s16(v602, v607);
+ int16x8_t v763 = vsubq_s16(v612, v617);
+ int16x8_t v764_tmp = vqrdmulhq_n_s16(v763, 19463);
+ int16x8_t v764 = vaddq_s16(v764_tmp, v763);
+ int16x8_t v765 = vaddq_s16(v762, v764);
+ int16x8_t v766 = vsubq_s16(v585, v581);
+ int16x8_t v767 = vsubq_s16(v591, v595);
+ int16x8_t v768_tmp = vqrdmulhq_n_s16(v767, 28661);
+ int16x8_t v768 = vaddq_s16(v768_tmp, v767);
+ int16x8_t v769 = vaddq_s16(v766, v768);
+ int16x8_t v770 = vsubq_s16(v556, v561);
+ int16x8_t v771 = vsubq_s16(v566, v572);
+ int16x8_t v772_tmp = vqrdmulhq_n_s16(v771, 9242);
+ int16x8_t v772 = vmlaq_n_s16(v772_tmp, v771, 2);
+ int16x8_t v773 = vaddq_s16(v770, v772);
+ int16x8_t v774 = vsubq_s16(v513, v524);
+ int16x8_t v775 = vsubq_s16(v535, v549);
+ int16x8_t v776_tmp = vqrdmulhq_n_s16(v775, 30298);
+ int16x8_t v776 = vmlaq_n_s16(v776_tmp, v775, 2);
+ int16x8_t v777 = vaddq_s16(v774, v776);
+ int16x8_t v778 = vsubq_s16(v457, v470);
+ int16x8_t v779 = vsubq_s16(v500, v488);
+ int16x8_t v780_tmp = vqrdmulhq_n_s16(v779, 2773);
+ int16x8_t v780 = vmlaq_n_s16(v780_tmp, v779, 4);
+ int16x8_t v781 = vaddq_s16(v778, v780);
+ int16x8_t v782 = vsubq_s16(v385, v360);
+ int16x8_t v783 = vsubq_s16(v414, v443);
+ int16x8_t v784_tmp = vqrdmulhq_n_s16(v783, 26108);
+ int16x8_t v784 = vmlaq_n_s16(v784_tmp, v783, 6);
+ int16x8_t v785 = vaddq_s16(v782, v784);
+ int16x8_t v786 = vsubq_s16(v61, v142);
+ int16x8_t v787 = vsubq_s16(v238, v328);
+ int16x8_t v788_tmp = vqrdmulhq_n_s16(v787, 12251);
+ int16x8_t v788 = vmlaq_n_s16(v788_tmp, v787, 20);
+ int16x8_t v789 = vaddq_s16(v786, v788);
+ int16x8_t v790 = vsubq_s16(v786, v788);
+ int16x8_t v791 = vsubq_s16(v782, v784);
+ int16x8_t v792 = vsubq_s16(v778, v780);
+ int16x8_t v793 = vsubq_s16(v774, v776);
+ int16x8_t v794 = vsubq_s16(v770, v772);
+ int16x8_t v795 = vsubq_s16(v766, v768);
+ int16x8_t v796 = vsubq_s16(v762, v764);
+ int16x8_t v797 = vsubq_s16(v758, v760);
+ int16x8_t v798 = vsubq_s16(v754, v756);
+ int16x8_t v799 = vsubq_s16(v750, v752);
+ int16x8_t v800 = vsubq_s16(v746, v748);
+ int16x8_t v801 = vsubq_s16(v742, v744);
+ int16x8_t v802 = vsubq_s16(v738, v740);
+ int16x8_t v803 = vsubq_s16(v734, v736);
+ int16x8_t v804 = vsubq_s16(v730, v732);
+ int16x8_t v805 = vsubq_s16(v726, v728);
+ int16x8_t v806 = vsubq_s16(v719, v724);
+ int16x8_t v807 = vsubq_s16(v709, v714);
+ int16x8_t v808 = vsubq_s16(v704, v700);
+ int16x8_t v809 = vsubq_s16(v688, v694);
+ int16x8_t v810 = vsubq_s16(v678, v683);
+ int16x8_t v811 = vsubq_s16(v667, v673);
+ int16x8_t v812 = vsubq_s16(v656, v662);
+ int16x8_t v813 = vsubq_s16(v646, v651);
+ int16x8_t v814 = vsubq_s16(v641, v631);
+ int16x8_t v815 = vsubq_s16(v608, v619);
+ int16x8_t v816 = vsubq_s16(v586, v597);
+ int16x8_t v817 = vsubq_s16(v562, v574);
+ int16x8_t v818 = vsubq_s16(v525, v551);
+ int16x8_t v819 = vsubq_s16(v471, v502);
+ int16x8_t v820 = vsubq_s16(v386, v445);
+ int16x8_t v821 = vsubq_s16(v143, v330);
+ vst1q_s16(out + out_stride * 0 + i, v331);
+ vst1q_s16(out + out_stride * 1 + i, v446);
+ vst1q_s16(out + out_stride * 2 + i, v503);
+ vst1q_s16(out + out_stride * 3 + i, v552);
+ vst1q_s16(out + out_stride * 4 + i, v575);
+ vst1q_s16(out + out_stride * 5 + i, v598);
+ vst1q_s16(out + out_stride * 6 + i, v620);
+ vst1q_s16(out + out_stride * 7 + i, v642);
+ vst1q_s16(out + out_stride * 8 + i, v652);
+ vst1q_s16(out + out_stride * 9 + i, v663);
+ vst1q_s16(out + out_stride * 10 + i, v674);
+ vst1q_s16(out + out_stride * 11 + i, v684);
+ vst1q_s16(out + out_stride * 12 + i, v695);
+ vst1q_s16(out + out_stride * 13 + i, v705);
+ vst1q_s16(out + out_stride * 14 + i, v715);
+ vst1q_s16(out + out_stride * 15 + i, v725);
+ vst1q_s16(out + out_stride * 16 + i, v729);
+ vst1q_s16(out + out_stride * 17 + i, v733);
+ vst1q_s16(out + out_stride * 18 + i, v737);
+ vst1q_s16(out + out_stride * 19 + i, v741);
+ vst1q_s16(out + out_stride * 20 + i, v745);
+ vst1q_s16(out + out_stride * 21 + i, v749);
+ vst1q_s16(out + out_stride * 22 + i, v753);
+ vst1q_s16(out + out_stride * 23 + i, v757);
+ vst1q_s16(out + out_stride * 24 + i, v761);
+ vst1q_s16(out + out_stride * 25 + i, v765);
+ vst1q_s16(out + out_stride * 26 + i, v769);
+ vst1q_s16(out + out_stride * 27 + i, v773);
+ vst1q_s16(out + out_stride * 28 + i, v777);
+ vst1q_s16(out + out_stride * 29 + i, v781);
+ vst1q_s16(out + out_stride * 30 + i, v785);
+ vst1q_s16(out + out_stride * 31 + i, v789);
+ vst1q_s16(out + out_stride * 32 + i, v790);
+ vst1q_s16(out + out_stride * 33 + i, v791);
+ vst1q_s16(out + out_stride * 34 + i, v792);
+ vst1q_s16(out + out_stride * 35 + i, v793);
+ vst1q_s16(out + out_stride * 36 + i, v794);
+ vst1q_s16(out + out_stride * 37 + i, v795);
+ vst1q_s16(out + out_stride * 38 + i, v796);
+ vst1q_s16(out + out_stride * 39 + i, v797);
+ vst1q_s16(out + out_stride * 40 + i, v798);
+ vst1q_s16(out + out_stride * 41 + i, v799);
+ vst1q_s16(out + out_stride * 42 + i, v800);
+ vst1q_s16(out + out_stride * 43 + i, v801);
+ vst1q_s16(out + out_stride * 44 + i, v802);
+ vst1q_s16(out + out_stride * 45 + i, v803);
+ vst1q_s16(out + out_stride * 46 + i, v804);
+ vst1q_s16(out + out_stride * 47 + i, v805);
+ vst1q_s16(out + out_stride * 48 + i, v806);
+ vst1q_s16(out + out_stride * 49 + i, v807);
+ vst1q_s16(out + out_stride * 50 + i, v808);
+ vst1q_s16(out + out_stride * 51 + i, v809);
+ vst1q_s16(out + out_stride * 52 + i, v810);
+ vst1q_s16(out + out_stride * 53 + i, v811);
+ vst1q_s16(out + out_stride * 54 + i, v812);
+ vst1q_s16(out + out_stride * 55 + i, v813);
+ vst1q_s16(out + out_stride * 56 + i, v814);
+ vst1q_s16(out + out_stride * 57 + i, v815);
+ vst1q_s16(out + out_stride * 58 + i, v816);
+ vst1q_s16(out + out_stride * 59 + i, v817);
+ vst1q_s16(out + out_stride * 60 + i, v818);
+ vst1q_s16(out + out_stride * 61 + i, v819);
+ vst1q_s16(out + out_stride * 62 + i, v820);
+ vst1q_s16(out + out_stride * 63 + i, v821);
+ }
+}