Diffstat (limited to 'media/libaom/src/aom_dsp/x86')
75 files changed, 14360 insertions(+), 2318 deletions(-)
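For orientation before the per-file diffs: the largest additions are AVX2/SSE2 implementations of the adaptive quantizers (aom_quantize_b_adaptive and its 32x32/64x64 variants) plus new 4-tap SSE2 subpixel filters. Each quantize kernel follows the same per-coefficient model: compare |coeff| against the zbin dead-zone, run the round/quant/quant_shift pipeline, multiply by dequant for the reconstruction value, and use a prescan threshold of zbin * (1 << AOM_QM_BITS) + ROUND_POWER_OF_TWO(dequant * EOB_FACTOR, 7) - 1 to trim trailing near-zero coefficients before the eob is computed. The following is a minimal scalar sketch of that quantize step only (hypothetical helper name, 32-bit scalars instead of the kernels' 16-bit lanes, and ignoring the log_scale handling of the 32x32/64x64 variants); it is an illustration, not the library's reference C path.

#include <stdint.h>
#include <stdlib.h>

/* Hypothetical scalar reference for one 16-bit coefficient; the kernels in
 * this diff do the same math 8 or 16 lanes at a time. */
static int16_t quantize_one(int16_t coeff, int16_t zbin, int16_t round,
                            int16_t quant, int16_t shift, int16_t dequant,
                            int32_t *dqcoeff) {
  const int32_t abs_coeff = abs((int32_t)coeff);
  if (abs_coeff < zbin) {               /* dead-zone: coefficient is zeroed */
    *dqcoeff = 0;
    return 0;
  }
  int32_t tmp = abs_coeff + round;      /* adds_epi16 in the vector code */
  if (tmp > INT16_MAX) tmp = INT16_MAX;
  tmp += (tmp * quant) >> 16;           /* mulhi_epi16(qcoeff, quant) + add */
  int32_t q = (tmp * shift) >> 16;      /* mulhi_epi16(qcoeff, shift) */
  if (coeff < 0) q = -q;                /* reinsert the sign */
  *dqcoeff = q * dequant;               /* mullo_epi16(qcoeff, dequant) */
  return (int16_t)q;
}

In the SIMD versions below, the two >> 16 steps are _mm_mulhi_epi16 / _mm256_mulhi_epi16, the sign is reinserted with invert_sign_sse2 or _mm256_sign_epi16, and the zbin test appears as a cmpgt against zbin - 1 whose mask also zeroes the stored qcoeff/dqcoeff lanes.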
diff --git a/media/libaom/src/aom_dsp/x86/adaptive_quantize_avx2.c b/media/libaom/src/aom_dsp/x86/adaptive_quantize_avx2.c new file mode 100644 index 0000000000..e33dff20c2 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/adaptive_quantize_avx2.c @@ -0,0 +1,244 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <immintrin.h> +#include "config/aom_dsp_rtcd.h" +#include "aom/aom_integer.h" +#include "av1/encoder/av1_quantize.h" +#include "aom_dsp/x86/quantize_x86.h" + +static INLINE void load_b_values_avx2(const int16_t *zbin_ptr, __m256i *zbin, + const int16_t *round_ptr, __m256i *round, + const int16_t *quant_ptr, __m256i *quant, + const int16_t *dequant_ptr, + __m256i *dequant, + const int16_t *shift_ptr, + __m256i *shift) { + *zbin = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)zbin_ptr)); + *zbin = _mm256_permute4x64_epi64(*zbin, 0x54); + *zbin = _mm256_sub_epi16(*zbin, _mm256_set1_epi16(1)); + *round = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)round_ptr)); + *round = _mm256_permute4x64_epi64(*round, 0x54); + *quant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)quant_ptr)); + *quant = _mm256_permute4x64_epi64(*quant, 0x54); + *dequant = + _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr)); + *dequant = _mm256_permute4x64_epi64(*dequant, 0x54); + *shift = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)shift_ptr)); + *shift = _mm256_permute4x64_epi64(*shift, 0x54); +} + +static INLINE __m256i load_coefficients_avx2(const tran_low_t *coeff_ptr) { + const __m256i coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr)); + const __m256i coeff2 = _mm256_load_si256((__m256i *)(coeff_ptr + 8)); + return _mm256_packs_epi32(coeff1, coeff2); +} + +static INLINE void update_mask1_avx2(__m256i *cmp_mask, + const int16_t *iscan_ptr, int *is_found, + __m256i *mask) { + __m256i temp_mask = _mm256_setzero_si256(); + if (_mm256_movemask_epi8(*cmp_mask)) { + __m256i iscan = _mm256_loadu_si256((const __m256i *)(iscan_ptr)); + temp_mask = _mm256_and_si256(*cmp_mask, iscan); + *is_found = 1; + } + *mask = _mm256_max_epi16(temp_mask, *mask); +} + +static INLINE void update_mask0_avx2(__m256i *qcoeff, __m256i *threshold, + const int16_t *iscan_ptr, int *is_found, + __m256i *mask) { + __m256i zero = _mm256_setzero_si256(); + __m256i coeff[2], cmp_mask0, cmp_mask1; + coeff[0] = _mm256_unpacklo_epi16(*qcoeff, zero); + coeff[1] = _mm256_unpackhi_epi16(*qcoeff, zero); + coeff[0] = _mm256_slli_epi32(coeff[0], AOM_QM_BITS); + cmp_mask0 = _mm256_cmpgt_epi32(coeff[0], threshold[0]); + coeff[1] = _mm256_slli_epi32(coeff[1], AOM_QM_BITS); + cmp_mask1 = _mm256_cmpgt_epi32(coeff[1], threshold[1]); + cmp_mask0 = + _mm256_permute4x64_epi64(_mm256_packs_epi32(cmp_mask0, cmp_mask1), 0xd8); + update_mask1_avx2(&cmp_mask0, iscan_ptr, is_found, mask); +} + +static INLINE void calculate_qcoeff_avx2(__m256i *coeff, const __m256i *round, + const __m256i *quant, + const __m256i *shift) { + __m256i tmp, qcoeff; + qcoeff = _mm256_adds_epi16(*coeff, *round); + tmp = 
_mm256_mulhi_epi16(qcoeff, *quant); + qcoeff = _mm256_add_epi16(tmp, qcoeff); + *coeff = _mm256_mulhi_epi16(qcoeff, *shift); +} + +static INLINE __m256i calculate_dqcoeff_avx2(__m256i qcoeff, __m256i dequant) { + return _mm256_mullo_epi16(qcoeff, dequant); +} + +static INLINE void store_coefficients_avx2(__m256i coeff_vals, + tran_low_t *coeff_ptr) { + __m256i coeff_sign = _mm256_srai_epi16(coeff_vals, 15); + __m256i coeff_vals_lo = _mm256_unpacklo_epi16(coeff_vals, coeff_sign); + __m256i coeff_vals_hi = _mm256_unpackhi_epi16(coeff_vals, coeff_sign); + _mm256_store_si256((__m256i *)(coeff_ptr), coeff_vals_lo); + _mm256_store_si256((__m256i *)(coeff_ptr + 8), coeff_vals_hi); +} + +void aom_quantize_b_adaptive_avx2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int index = 16; + int non_zero_count = 0; + int non_zero_count_prescan_add_zero = 0; + int is_found0 = 0, is_found1 = 0; + int eob = -1; + const __m256i zero = _mm256_setzero_si256(); + __m256i zbin, round, quant, dequant, shift; + __m256i coeff, qcoeff; + __m256i cmp_mask, mask0 = zero, mask1 = zero; + __m128i temp_mask0, temp_mask1; + int prescan_add[2]; + int thresh[2]; + const qm_val_t wt = (1 << AOM_QM_BITS); + for (int i = 0; i < 2; ++i) { + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + thresh[i] = (zbin_ptr[i] * wt + prescan_add[i]) - 1; + } + __m256i threshold[2]; + threshold[0] = _mm256_set1_epi32(thresh[0]); + threshold[1] = _mm256_set1_epi32(thresh[1]); + threshold[0] = _mm256_blend_epi32(threshold[0], threshold[1], 0xfe); + +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif + + // Setup global values. + load_b_values_avx2(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, + dequant_ptr, &dequant, quant_shift_ptr, &shift); + + // Do DC and first 15 AC. + coeff = load_coefficients_avx2(coeff_ptr); + qcoeff = _mm256_abs_epi16(coeff); + update_mask0_avx2(&qcoeff, threshold, iscan, &is_found0, &mask0); + __m256i temp0 = _mm256_cmpgt_epi16(qcoeff, zbin); + zbin = _mm256_unpackhi_epi64(zbin, zbin); + cmp_mask = _mm256_permute4x64_epi64(temp0, 0xd8); + update_mask1_avx2(&cmp_mask, iscan, &is_found1, &mask1); + threshold[0] = threshold[1]; + if (_mm256_movemask_epi8(cmp_mask) == 0) { + _mm256_store_si256((__m256i *)(qcoeff_ptr), zero); + _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), zero); + round = _mm256_unpackhi_epi64(round, round); + quant = _mm256_unpackhi_epi64(quant, quant); + shift = _mm256_unpackhi_epi64(shift, shift); + dequant = _mm256_unpackhi_epi64(dequant, dequant); + } else { + calculate_qcoeff_avx2(&qcoeff, &round, &quant, &shift); + round = _mm256_unpackhi_epi64(round, round); + quant = _mm256_unpackhi_epi64(quant, quant); + shift = _mm256_unpackhi_epi64(shift, shift); + // Reinsert signs + qcoeff = _mm256_sign_epi16(qcoeff, coeff); + // Mask out zbin threshold coeffs + qcoeff = _mm256_and_si256(qcoeff, temp0); + store_coefficients_avx2(qcoeff, qcoeff_ptr); + coeff = calculate_dqcoeff_avx2(qcoeff, dequant); + dequant = _mm256_unpackhi_epi64(dequant, dequant); + store_coefficients_avx2(coeff, dqcoeff_ptr); + } + + // AC only loop. 
+ while (index < n_coeffs) { + coeff = load_coefficients_avx2(coeff_ptr + index); + qcoeff = _mm256_abs_epi16(coeff); + update_mask0_avx2(&qcoeff, threshold, iscan + index, &is_found0, &mask0); + temp0 = _mm256_cmpgt_epi16(qcoeff, zbin); + cmp_mask = _mm256_permute4x64_epi64(temp0, 0xd8); + update_mask1_avx2(&cmp_mask, iscan + index, &is_found1, &mask1); + if (_mm256_movemask_epi8(cmp_mask) == 0) { + _mm256_store_si256((__m256i *)(qcoeff_ptr + index), zero); + _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), zero); + index += 16; + continue; + } + calculate_qcoeff_avx2(&qcoeff, &round, &quant, &shift); + qcoeff = _mm256_sign_epi16(qcoeff, coeff); + qcoeff = _mm256_and_si256(qcoeff, temp0); + store_coefficients_avx2(qcoeff, qcoeff_ptr + index); + coeff = calculate_dqcoeff_avx2(qcoeff, dequant); + store_coefficients_avx2(coeff, dqcoeff_ptr + index); + index += 16; + } + if (is_found0) { + temp_mask0 = _mm_max_epi16(_mm256_castsi256_si128(mask0), + _mm256_extracti128_si256(mask0, 1)); + non_zero_count = calculate_non_zero_count(temp_mask0); + } + if (is_found1) { + temp_mask1 = _mm_max_epi16(_mm256_castsi256_si128(mask1), + _mm256_extracti128_si256(mask1, 1)); + non_zero_count_prescan_add_zero = calculate_non_zero_count(temp_mask1); + } + + for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { + const int rc = scan[i]; + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + + for (int i = non_zero_count - 1; i >= 0; i--) { + const int rc = scan[i]; + if (qcoeff_ptr[rc]) { + eob = i; + break; + } + } + + *eob_ptr = eob + 1; +#if SKIP_EOB_FACTOR_ADJUST + // TODO(Aniket): Experiment the following loop with intrinsic by combining + // with the quantization loop above + for (int i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int qcoeff0 = qcoeff_ptr[rc]; + if (qcoeff0) { + first = i; + break; + } + } + if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { + const int rc = scan[(*eob_ptr - 1)]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const int coeff0 = coeff_ptr[rc] * wt; + const int coeff_sign = AOMSIGN(coeff0); + const int abs_coeff = (coeff0 ^ coeff_sign) - coeff_sign; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (abs_coeff < + (zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + *eob_ptr = 0; + } + } + } +#endif +} diff --git a/media/libaom/src/aom_dsp/x86/adaptive_quantize_sse2.c b/media/libaom/src/aom_dsp/x86/adaptive_quantize_sse2.c new file mode 100644 index 0000000000..584cd671f1 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/adaptive_quantize_sse2.c @@ -0,0 +1,633 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <assert.h> +#include <emmintrin.h> +#include "config/aom_dsp_rtcd.h" +#include "aom/aom_integer.h" +#include "av1/encoder/av1_quantize.h" +#include "aom_dsp/x86/quantize_x86.h" + +void aom_quantize_b_adaptive_sse2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int index = 16; + int non_zero_count = 0; + int non_zero_count_prescan_add_zero = 0; + int is_found0 = 0, is_found1 = 0; + int eob = -1; + const __m128i zero = _mm_setzero_si128(); + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i all_zero; + __m128i mask0 = zero, mask1 = zero; + + int prescan_add[2]; + int thresh[4]; + const qm_val_t wt = (1 << AOM_QM_BITS); + for (int i = 0; i < 2; ++i) { + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + thresh[i] = (zbin_ptr[i] * wt + prescan_add[i]) - 1; + } + thresh[2] = thresh[3] = thresh[1]; + __m128i threshold[2]; + threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]); + threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]); + +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif + // Setup global values. + load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, + dequant_ptr, &dequant, quant_shift_ptr, &shift); + + // Do DC and first 15 AC. + coeff0 = load_coefficients(coeff_ptr); + coeff1 = load_coefficients(coeff_ptr + 8); + + // Poor man's abs(). + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + update_mask1(&cmp_mask0, &cmp_mask1, iscan, &is_found1, &mask1); + + threshold[0] = threshold[1]; + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + calculate_qcoeff(&qcoeff0, round, quant, shift); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + + calculate_qcoeff(&qcoeff1, round, quant, shift); + + // Reinsert signs + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr); + store_coefficients(qcoeff1, qcoeff_ptr + 8); + + coeff0 = 
calculate_dqcoeff(qcoeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = calculate_dqcoeff(qcoeff1, dequant); + + store_coefficients(coeff0, dqcoeff_ptr); + store_coefficients(coeff1, dqcoeff_ptr + 8); + } + + // AC only loop. + while (index < n_coeffs) { + coeff0 = load_coefficients(coeff_ptr + index); + coeff1 = load_coefficients(coeff_ptr + index + 8); + + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, &is_found0, + &mask0); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + update_mask1(&cmp_mask0, &cmp_mask1, iscan + index, &is_found1, &mask1); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero); + index += 16; + continue; + } + calculate_qcoeff(&qcoeff0, round, quant, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr + index); + store_coefficients(qcoeff1, qcoeff_ptr + index + 8); + + coeff0 = calculate_dqcoeff(qcoeff0, dequant); + coeff1 = calculate_dqcoeff(qcoeff1, dequant); + + store_coefficients(coeff0, dqcoeff_ptr + index); + store_coefficients(coeff1, dqcoeff_ptr + index + 8); + + index += 16; + } + if (is_found0) non_zero_count = calculate_non_zero_count(mask0); + if (is_found1) + non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1); + + for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { + const int rc = scan[i]; + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + + for (int i = non_zero_count - 1; i >= 0; i--) { + const int rc = scan[i]; + if (qcoeff_ptr[rc]) { + eob = i; + break; + } + } + + *eob_ptr = eob + 1; +#if SKIP_EOB_FACTOR_ADJUST + // TODO(Aniket): Experiment the following loop with intrinsic by combining + // with the quantization loop above + for (int i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int qcoeff = qcoeff_ptr[rc]; + if (qcoeff) { + first = i; + break; + } + } + if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { + const int rc = scan[(*eob_ptr - 1)]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const int coeff = coeff_ptr[rc] * wt; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (abs_coeff < + (zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + *eob_ptr = 0; + } + } + } +#endif +} + +void aom_quantize_b_32x32_adaptive_sse2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const 
int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int index = 16; + const int log_scale = 1; + int non_zero_count = 0; + int non_zero_count_prescan_add_zero = 0; + int is_found0 = 0, is_found1 = 0; + int eob = -1; + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m128i log_scale_vec = _mm_set1_epi16(log_scale); + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i all_zero; + __m128i mask0 = zero, mask1 = zero; + + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), + ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; + int prescan_add[2]; + int thresh[4]; + const qm_val_t wt = (1 << AOM_QM_BITS); + for (int i = 0; i < 2; ++i) { + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1; + } + thresh[2] = thresh[3] = thresh[1]; + __m128i threshold[2]; + threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]); + threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]); + +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif + // Setup global values. + zbin = _mm_load_si128((const __m128i *)zbin_ptr); + round = _mm_load_si128((const __m128i *)round_ptr); + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + shift = _mm_load_si128((const __m128i *)quant_shift_ptr); + + // Shift with rounding. + zbin = _mm_add_epi16(zbin, log_scale_vec); + round = _mm_add_epi16(round, log_scale_vec); + zbin = _mm_srli_epi16(zbin, log_scale); + round = _mm_srli_epi16(round, log_scale); + zbin = _mm_sub_epi16(zbin, one); + + // Do DC and first 15 AC. 
+ coeff0 = load_coefficients(coeff_ptr); + coeff1 = load_coefficients(coeff_ptr + 8); + + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + update_mask1(&cmp_mask0, &cmp_mask1, iscan, &is_found1, &mask1); + + threshold[0] = threshold[1]; + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale); + + // Reinsert signs + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr); + store_coefficients(qcoeff1, qcoeff_ptr + 8); + + calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero, dqcoeff_ptr, + &log_scale); + dequant = _mm_unpackhi_epi64(dequant, dequant); + calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero, + dqcoeff_ptr + 8, &log_scale); + } + + // AC only loop. 
+ while (index < n_coeffs) { + coeff0 = load_coefficients(coeff_ptr + index); + coeff1 = load_coefficients(coeff_ptr + index + 8); + + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, &is_found0, + &mask0); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + update_mask1(&cmp_mask0, &cmp_mask1, iscan + index, &is_found1, &mask1); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero); + index += 16; + continue; + } + calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale); + calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale); + + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr + index); + store_coefficients(qcoeff1, qcoeff_ptr + index + 8); + + calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero, + dqcoeff_ptr + index, &log_scale); + calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero, + dqcoeff_ptr + index + 8, &log_scale); + index += 16; + } + if (is_found0) non_zero_count = calculate_non_zero_count(mask0); + if (is_found1) + non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1); + + for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { + const int rc = scan[i]; + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + + for (int i = non_zero_count - 1; i >= 0; i--) { + const int rc = scan[i]; + if (qcoeff_ptr[rc]) { + eob = i; + break; + } + } + + *eob_ptr = eob + 1; +#if SKIP_EOB_FACTOR_ADJUST + // TODO(Aniket): Experiment the following loop with intrinsic by combining + // with the quantization loop above + for (int i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int qcoeff = qcoeff_ptr[rc]; + if (qcoeff) { + first = i; + break; + } + } + if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { + const int rc = scan[(*eob_ptr - 1)]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const int coeff = coeff_ptr[rc] * wt; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + *eob_ptr = 0; + } + } + } +#endif +} + +void aom_quantize_b_64x64_adaptive_sse2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const 
int16_t *scan, const int16_t *iscan) { + int index = 16; + const int log_scale = 2; + int non_zero_count = 0; + int non_zero_count_prescan_add_zero = 0; + int is_found0 = 0, is_found1 = 0; + int eob = -1; + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m128i log_scale_vec = _mm_set1_epi16(log_scale); + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i all_zero; + __m128i mask0 = zero, mask1 = zero; + + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), + ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; + int prescan_add[2]; + int thresh[4]; + const qm_val_t wt = (1 << AOM_QM_BITS); + for (int i = 0; i < 2; ++i) { + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1; + } + thresh[2] = thresh[3] = thresh[1]; + __m128i threshold[2]; + threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]); + threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]); + +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif + // Setup global values. + zbin = _mm_load_si128((const __m128i *)zbin_ptr); + round = _mm_load_si128((const __m128i *)round_ptr); + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + shift = _mm_load_si128((const __m128i *)quant_shift_ptr); + + // Shift with rounding. + zbin = _mm_add_epi16(zbin, log_scale_vec); + round = _mm_add_epi16(round, log_scale_vec); + zbin = _mm_srli_epi16(zbin, log_scale); + round = _mm_srli_epi16(round, log_scale); + zbin = _mm_sub_epi16(zbin, one); + + // Do DC and first 15 AC. + coeff0 = load_coefficients(coeff_ptr); + coeff1 = load_coefficients(coeff_ptr + 8); + + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + update_mask1(&cmp_mask0, &cmp_mask1, iscan, &is_found1, &mask1); + + threshold[0] = threshold[1]; + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale); + + // Reinsert signs + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + 
qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr); + store_coefficients(qcoeff1, qcoeff_ptr + 8); + + calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero, dqcoeff_ptr, + &log_scale); + dequant = _mm_unpackhi_epi64(dequant, dequant); + calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero, + dqcoeff_ptr + 8, &log_scale); + } + + // AC only loop. + while (index < n_coeffs) { + coeff0 = load_coefficients(coeff_ptr + index); + coeff1 = load_coefficients(coeff_ptr + index + 8); + + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, &is_found0, + &mask0); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + update_mask1(&cmp_mask0, &cmp_mask1, iscan + index, &is_found1, &mask1); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero); + index += 16; + continue; + } + calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale); + calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale); + + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr + index); + store_coefficients(qcoeff1, qcoeff_ptr + index + 8); + + calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero, + dqcoeff_ptr + index, &log_scale); + calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero, + dqcoeff_ptr + index + 8, &log_scale); + index += 16; + } + if (is_found0) non_zero_count = calculate_non_zero_count(mask0); + if (is_found1) + non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1); + + for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { + const int rc = scan[i]; + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + + for (int i = non_zero_count - 1; i >= 0; i--) { + const int rc = scan[i]; + if (qcoeff_ptr[rc]) { + eob = i; + break; + } + } + + *eob_ptr = eob + 1; +#if SKIP_EOB_FACTOR_ADJUST + // TODO(Aniket): Experiment the following loop with intrinsic by combining + // with the quantization loop above + for (int i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int qcoeff = qcoeff_ptr[rc]; + if (qcoeff) { + first = i; + break; + } + } + if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { + const int rc = scan[(*eob_ptr - 1)]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const int coeff = coeff_ptr[rc] * wt; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { + 
qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + *eob_ptr = 0; + } + } + } +#endif +} diff --git a/media/libaom/src/aom_dsp/x86/aom_asm_stubs.c b/media/libaom/src/aom_dsp/x86/aom_asm_stubs.c index 5f5bf5f14e..ce8285e43d 100644 --- a/media/libaom/src/aom_dsp/x86/aom_asm_stubs.c +++ b/media/libaom/src/aom_dsp/x86/aom_asm_stubs.c @@ -21,13 +21,13 @@ filter8_1dfunction aom_filter_block1d8_v8_sse2; filter8_1dfunction aom_filter_block1d8_h8_sse2; filter8_1dfunction aom_filter_block1d4_v8_sse2; filter8_1dfunction aom_filter_block1d4_h8_sse2; +filter8_1dfunction aom_filter_block1d16_v4_sse2; +filter8_1dfunction aom_filter_block1d16_h4_sse2; -#define aom_filter_block1d16_h4_sse2 aom_filter_block1d16_h8_sse2 -#define aom_filter_block1d16_v4_sse2 aom_filter_block1d16_v8_sse2 -#define aom_filter_block1d8_h4_sse2 aom_filter_block1d8_h8_sse2 -#define aom_filter_block1d8_v4_sse2 aom_filter_block1d8_v8_sse2 -#define aom_filter_block1d4_h4_sse2 aom_filter_block1d4_h8_sse2 -#define aom_filter_block1d4_v4_sse2 aom_filter_block1d4_v8_sse2 +filter8_1dfunction aom_filter_block1d8_h4_sse2; +filter8_1dfunction aom_filter_block1d8_v4_sse2; +filter8_1dfunction aom_filter_block1d4_h4_sse2; +filter8_1dfunction aom_filter_block1d4_v4_sse2; filter8_1dfunction aom_filter_block1d16_v2_sse2; filter8_1dfunction aom_filter_block1d16_h2_sse2; @@ -49,7 +49,7 @@ filter8_1dfunction aom_filter_block1d4_h2_sse2; FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); -#if ARCH_X86_64 +#if CONFIG_AV1_HIGHBITDEPTH highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2; highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2; highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_sse2; @@ -57,6 +57,13 @@ highbd_filter8_1dfunction aom_highbd_filter_block1d8_h8_sse2; highbd_filter8_1dfunction aom_highbd_filter_block1d4_v8_sse2; highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d16_v4_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d16_h4_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d8_v4_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d8_h4_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d4_v4_sse2; +highbd_filter8_1dfunction aom_highbd_filter_block1d4_h4_sse2; + highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_sse2; highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_sse2; highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_sse2; @@ -84,6 +91,5 @@ highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_sse2; // int w, int h, int bd); HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); - -#endif // ARCH_X86_64 +#endif #endif // HAVE_SSE2 diff --git a/media/libaom/src/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm b/media/libaom/src/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm index 7b3fe6419a..a7152be57c 100644 --- a/media/libaom/src/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm +++ b/media/libaom/src/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm @@ -67,7 +67,6 @@ dec rcx %endm -%if ARCH_X86_64 %macro HIGH_GET_PARAM 0 mov rdx, arg(5) ;filter ptr mov rsi, arg(0) ;src_ptr @@ -86,14 +85,17 @@ mov rdx, 0x00010001 movsxd rcx, DWORD PTR arg(6) ;bps - movq xmm8, rdx + movq xmm3, rdx movq xmm5, rcx - pshufd xmm8, xmm8, 0b - movdqa xmm1, xmm8 - psllw xmm8, xmm5 - psubw xmm8, xmm1 ;max value (for clamping) + pshufd xmm3, xmm3, 0b + movdqa xmm1, xmm3 + 
psllw xmm3, xmm5 + psubw xmm3, xmm1 ;max value (for clamping) pxor xmm5, xmm5 ;min value (for clamping) + movdqa max, xmm3 + movdqa min, xmm5 + movsxd rax, DWORD PTR arg(1) ;pixels_per_line movsxd rdx, DWORD PTR arg(3) ;out_pitch movsxd rcx, DWORD PTR arg(4) ;output_height @@ -113,8 +115,8 @@ packssdw xmm0, xmm6 ;pack back to word ;clamp the values - pminsw xmm0, xmm8 - pmaxsw xmm0, xmm5 + pminsw xmm0, max + pmaxsw xmm0, min %if %1 movdqu xmm1, [rdi] @@ -128,36 +130,36 @@ %endm %macro HIGH_APPLY_FILTER_16 1 - movdqa xmm9, xmm0 + movdqa xmm5, xmm0 movdqa xmm6, xmm2 - punpckhwd xmm9, xmm1 + punpckhwd xmm5, xmm1 punpckhwd xmm6, xmm3 punpcklwd xmm0, xmm1 punpcklwd xmm2, xmm3 - pmaddwd xmm9, xmm7 + pmaddwd xmm5, xmm7 pmaddwd xmm6, xmm7 pmaddwd xmm0, xmm7 pmaddwd xmm2, xmm7 - paddd xmm9, xmm4 ;rounding + paddd xmm5, xmm4 ;rounding paddd xmm6, xmm4 paddd xmm0, xmm4 paddd xmm2, xmm4 - psrad xmm9, 7 ;shift + psrad xmm5, 7 ;shift psrad xmm6, 7 psrad xmm0, 7 psrad xmm2, 7 - packssdw xmm0, xmm9 ;pack back to word + packssdw xmm0, xmm5 ;pack back to word packssdw xmm2, xmm6 ;pack back to word ;clamp the values - pminsw xmm0, xmm8 - pmaxsw xmm0, xmm5 - pminsw xmm2, xmm8 - pmaxsw xmm2, xmm5 + pminsw xmm0, max + pmaxsw xmm0, min + pminsw xmm2, max + pmaxsw xmm2, min %if %1 movdqu xmm1, [rdi] @@ -172,7 +174,6 @@ lea rdi, [rdi + 2*rdx] dec rcx %endm -%endif SECTION .text @@ -200,7 +201,6 @@ sym(aom_highbd_filter_block1d4_v2_sse2): pop rbp ret -%if ARCH_X86_64 global sym(aom_highbd_filter_block1d8_v2_sse2) PRIVATE sym(aom_highbd_filter_block1d8_v2_sse2): push rbp @@ -211,6 +211,11 @@ sym(aom_highbd_filter_block1d8_v2_sse2): push rdi ; end prolog + ALIGN_STACK 16, rax + sub rsp, 16 * 2 + %define max [rsp + 16 * 0] + %define min [rsp + 16 * 1] + HIGH_GET_PARAM .loop: movdqu xmm0, [rsi] ;0 @@ -219,6 +224,9 @@ sym(aom_highbd_filter_block1d8_v2_sse2): HIGH_APPLY_FILTER_8 0 jnz .loop + add rsp, 16 * 2 + pop rsp + ; begin epilog pop rdi pop rsi @@ -237,6 +245,11 @@ sym(aom_highbd_filter_block1d16_v2_sse2): push rdi ; end prolog + ALIGN_STACK 16, rax + sub rsp, 16 * 2 + %define max [rsp + 16 * 0] + %define min [rsp + 16 * 1] + HIGH_GET_PARAM .loop: movdqu xmm0, [rsi] ;0 @@ -247,6 +260,9 @@ sym(aom_highbd_filter_block1d16_v2_sse2): HIGH_APPLY_FILTER_16 0 jnz .loop + add rsp, 16 * 2 + pop rsp + ; begin epilog pop rdi pop rsi @@ -254,7 +270,6 @@ sym(aom_highbd_filter_block1d16_v2_sse2): UNSHADOW_ARGS pop rbp ret -%endif global sym(aom_highbd_filter_block1d4_h2_sse2) PRIVATE sym(aom_highbd_filter_block1d4_h2_sse2): @@ -281,7 +296,6 @@ sym(aom_highbd_filter_block1d4_h2_sse2): pop rbp ret -%if ARCH_X86_64 global sym(aom_highbd_filter_block1d8_h2_sse2) PRIVATE sym(aom_highbd_filter_block1d8_h2_sse2): push rbp @@ -292,6 +306,11 @@ sym(aom_highbd_filter_block1d8_h2_sse2): push rdi ; end prolog + ALIGN_STACK 16, rax + sub rsp, 16 * 2 + %define max [rsp + 16 * 0] + %define min [rsp + 16 * 1] + HIGH_GET_PARAM .loop: movdqu xmm0, [rsi] ;load src @@ -300,6 +319,9 @@ sym(aom_highbd_filter_block1d8_h2_sse2): HIGH_APPLY_FILTER_8 0 jnz .loop + add rsp, 16 * 2 + pop rsp + ; begin epilog pop rdi pop rsi @@ -318,6 +340,11 @@ sym(aom_highbd_filter_block1d16_h2_sse2): push rdi ; end prolog + ALIGN_STACK 16, rax + sub rsp, 16 * 2 + %define max [rsp + 16 * 0] + %define min [rsp + 16 * 1] + HIGH_GET_PARAM .loop: movdqu xmm0, [rsi] ;load src @@ -328,6 +355,9 @@ sym(aom_highbd_filter_block1d16_h2_sse2): HIGH_APPLY_FILTER_16 0 jnz .loop + add rsp, 16 * 2 + pop rsp + ; begin epilog pop rdi pop rsi @@ -335,4 +365,3 @@ 
sym(aom_highbd_filter_block1d16_h2_sse2): UNSHADOW_ARGS pop rbp ret -%endif diff --git a/media/libaom/src/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c b/media/libaom/src/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c new file mode 100644 index 0000000000..cff7f43eee --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c @@ -0,0 +1,569 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <emmintrin.h> // SSE2 + +#include "config/aom_dsp_rtcd.h" +#include "aom_dsp/x86/convolve.h" +#include "aom_ports/mem.h" + +void aom_filter_block1d16_h4_sse2(const uint8_t *src_ptr, + ptrdiff_t src_pixels_per_line, + uint8_t *output_ptr, ptrdiff_t output_pitch, + uint32_t output_height, + const int16_t *filter) { + __m128i filtersReg; + __m128i addFilterReg32; + __m128i secondFilters, thirdFilters; + __m128i srcRegFilt32b1_1, srcRegFilt32b1_2, srcRegFilt32b2_1, + srcRegFilt32b2_2; + __m128i srcReg32b1, srcReg32b2; + unsigned int i; + src_ptr -= 3; + addFilterReg32 = _mm_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg); + + secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 + thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5 + + for (i = output_height; i > 0; i -= 1) { + srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr); + + __m128i ss_2 = _mm_srli_si128(srcReg32b1, 2); + __m128i ss_4 = _mm_srli_si128(srcReg32b1, 4); + __m128i ss_1_1 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128()); + __m128i ss_2_1 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128()); + __m128i d1 = _mm_madd_epi16(ss_1_1, secondFilters); + __m128i d2 = _mm_madd_epi16(ss_2_1, thirdFilters); + srcRegFilt32b1_1 = _mm_add_epi32(d1, d2); + + __m128i ss_1 = _mm_srli_si128(srcReg32b1, 3); + __m128i ss_3 = _mm_srli_si128(srcReg32b1, 5); + __m128i ss_1_2 = _mm_unpacklo_epi8(ss_1, _mm_setzero_si128()); + __m128i ss_2_2 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128()); + d1 = _mm_madd_epi16(ss_1_2, secondFilters); + d2 = _mm_madd_epi16(ss_2_2, thirdFilters); + srcRegFilt32b1_2 = _mm_add_epi32(d1, d2); + + __m128i res_lo = _mm_unpacklo_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2); + __m128i res_hi = _mm_unpackhi_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2); + srcRegFilt32b1_1 = _mm_packs_epi32(res_lo, res_hi); + + // reading stride of the next 16 bytes + // (part of it was being read by earlier read) + srcReg32b2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8)); + + ss_2 = _mm_srli_si128(srcReg32b2, 2); + ss_4 = _mm_srli_si128(srcReg32b2, 4); + ss_1_1 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128()); + ss_2_1 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128()); + d1 = _mm_madd_epi16(ss_1_1, secondFilters); + d2 = _mm_madd_epi16(ss_2_1, thirdFilters); + srcRegFilt32b2_1 = _mm_add_epi32(d1, d2); + + ss_1 = _mm_srli_si128(srcReg32b2, 3); + ss_3 = 
_mm_srli_si128(srcReg32b2, 5); + ss_1_2 = _mm_unpacklo_epi8(ss_1, _mm_setzero_si128()); + ss_2_2 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128()); + d1 = _mm_madd_epi16(ss_1_2, secondFilters); + d2 = _mm_madd_epi16(ss_2_2, thirdFilters); + srcRegFilt32b2_2 = _mm_add_epi32(d1, d2); + + res_lo = _mm_unpacklo_epi32(srcRegFilt32b2_1, srcRegFilt32b2_2); + res_hi = _mm_unpackhi_epi32(srcRegFilt32b2_1, srcRegFilt32b2_2); + srcRegFilt32b2_1 = _mm_packs_epi32(res_lo, res_hi); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b2_1 = _mm_adds_epi16(srcRegFilt32b2_1, addFilterReg32); + srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6); + srcRegFilt32b2_1 = _mm_srai_epi16(srcRegFilt32b2_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1); + + src_ptr += src_pixels_per_line; + + _mm_store_si128((__m128i *)output_ptr, srcRegFilt32b1_1); + + output_ptr += output_pitch; + } +} + +void aom_filter_block1d16_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch, + uint8_t *output_ptr, ptrdiff_t out_pitch, + uint32_t output_height, + const int16_t *filter) { + __m128i filtersReg; + __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; + __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi; + __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi; + __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo; + __m128i resReg23_hi, resReg34_hi, resReg45_hi, resReg56_hi; + __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi; + __m128i resReg23_45, resReg34_56; + __m128i addFilterReg32, secondFilters, thirdFilters; + __m128i tmp_0, tmp_1; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg); + + secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3 + thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5 + + // multiply the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); + srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); + srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3); + srcReg23_hi = _mm_unpackhi_epi8(srcReg2, srcReg3); + __m128i resReg23_lo_1 = _mm_unpacklo_epi8(srcReg23_lo, _mm_setzero_si128()); + __m128i resReg23_lo_2 = _mm_unpackhi_epi8(srcReg23_lo, _mm_setzero_si128()); + __m128i resReg23_hi_1 = _mm_unpacklo_epi8(srcReg23_hi, _mm_setzero_si128()); + __m128i resReg23_hi_2 = _mm_unpackhi_epi8(srcReg23_hi, _mm_setzero_si128()); + + srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); + srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4); + srcReg34_hi = _mm_unpackhi_epi8(srcReg3, srcReg4); + __m128i resReg34_lo_1 = _mm_unpacklo_epi8(srcReg34_lo, _mm_setzero_si128()); + __m128i resReg34_lo_2 = _mm_unpackhi_epi8(srcReg34_lo, _mm_setzero_si128()); + __m128i resReg34_hi_1 = _mm_unpacklo_epi8(srcReg34_hi, _mm_setzero_si128()); + __m128i resReg34_hi_2 = _mm_unpackhi_epi8(srcReg34_hi, _mm_setzero_si128()); 
+ + for (i = output_height; i > 1; i -= 2) { + srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); + + srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5); + srcReg45_hi = _mm_unpackhi_epi8(srcReg4, srcReg5); + + srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); + + srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6); + srcReg56_hi = _mm_unpackhi_epi8(srcReg5, srcReg6); + + // multiply 2 adjacent elements with the filter and add the result + + tmp_0 = _mm_madd_epi16(resReg23_lo_1, secondFilters); + tmp_1 = _mm_madd_epi16(resReg23_lo_2, secondFilters); + resReg23_lo = _mm_packs_epi32(tmp_0, tmp_1); + + tmp_0 = _mm_madd_epi16(resReg34_lo_1, secondFilters); + tmp_1 = _mm_madd_epi16(resReg34_lo_2, secondFilters); + resReg34_lo = _mm_packs_epi32(tmp_0, tmp_1); + + __m128i resReg45_lo_1 = _mm_unpacklo_epi8(srcReg45_lo, _mm_setzero_si128()); + __m128i resReg45_lo_2 = _mm_unpackhi_epi8(srcReg45_lo, _mm_setzero_si128()); + tmp_0 = _mm_madd_epi16(resReg45_lo_1, thirdFilters); + tmp_1 = _mm_madd_epi16(resReg45_lo_2, thirdFilters); + resReg45_lo = _mm_packs_epi32(tmp_0, tmp_1); + + __m128i resReg56_lo_1 = _mm_unpacklo_epi8(srcReg56_lo, _mm_setzero_si128()); + __m128i resReg56_lo_2 = _mm_unpackhi_epi8(srcReg56_lo, _mm_setzero_si128()); + tmp_0 = _mm_madd_epi16(resReg56_lo_1, thirdFilters); + tmp_1 = _mm_madd_epi16(resReg56_lo_2, thirdFilters); + resReg56_lo = _mm_packs_epi32(tmp_0, tmp_1); + + // add and saturate the results together + resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo); + resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo); + + // multiply 2 adjacent elements with the filter and add the result + + tmp_0 = _mm_madd_epi16(resReg23_hi_1, secondFilters); + tmp_1 = _mm_madd_epi16(resReg23_hi_2, secondFilters); + resReg23_hi = _mm_packs_epi32(tmp_0, tmp_1); + + tmp_0 = _mm_madd_epi16(resReg34_hi_1, secondFilters); + tmp_1 = _mm_madd_epi16(resReg34_hi_2, secondFilters); + resReg34_hi = _mm_packs_epi32(tmp_0, tmp_1); + + __m128i resReg45_hi_1 = _mm_unpacklo_epi8(srcReg45_hi, _mm_setzero_si128()); + __m128i resReg45_hi_2 = _mm_unpackhi_epi8(srcReg45_hi, _mm_setzero_si128()); + tmp_0 = _mm_madd_epi16(resReg45_hi_1, thirdFilters); + tmp_1 = _mm_madd_epi16(resReg45_hi_2, thirdFilters); + resReg45_hi = _mm_packs_epi32(tmp_0, tmp_1); + + __m128i resReg56_hi_1 = _mm_unpacklo_epi8(srcReg56_hi, _mm_setzero_si128()); + __m128i resReg56_hi_2 = _mm_unpackhi_epi8(srcReg56_hi, _mm_setzero_si128()); + tmp_0 = _mm_madd_epi16(resReg56_hi_1, thirdFilters); + tmp_1 = _mm_madd_epi16(resReg56_hi_2, thirdFilters); + resReg56_hi = _mm_packs_epi32(tmp_0, tmp_1); + + // add and saturate the results together + resReg23_45_hi = _mm_adds_epi16(resReg23_hi, resReg45_hi); + resReg34_56_hi = _mm_adds_epi16(resReg34_hi, resReg56_hi); + + // shift by 6 bit each 16 bit + resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32); + resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32); + resReg23_45_hi = _mm_adds_epi16(resReg23_45_hi, addFilterReg32); + resReg34_56_hi = _mm_adds_epi16(resReg34_56_hi, addFilterReg32); + resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6); + resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6); + resReg23_45_hi = _mm_srai_epi16(resReg23_45_hi, 6); + resReg34_56_hi = _mm_srai_epi16(resReg34_56_hi, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + resReg23_45 = _mm_packus_epi16(resReg23_45_lo, resReg23_45_hi); + resReg34_56 = 
_mm_packus_epi16(resReg34_56_lo, resReg34_56_hi); + + src_ptr += src_stride; + + _mm_store_si128((__m128i *)output_ptr, (resReg23_45)); + _mm_store_si128((__m128i *)(output_ptr + out_pitch), (resReg34_56)); + + output_ptr += dst_stride; + + // save part of the registers for next strides + resReg23_lo_1 = resReg45_lo_1; + resReg23_lo_2 = resReg45_lo_2; + resReg23_hi_1 = resReg45_hi_1; + resReg23_hi_2 = resReg45_hi_2; + resReg34_lo_1 = resReg56_lo_1; + resReg34_lo_2 = resReg56_lo_2; + resReg34_hi_1 = resReg56_hi_1; + resReg34_hi_2 = resReg56_hi_2; + srcReg4 = srcReg6; + } +} + +void aom_filter_block1d8_h4_sse2(const uint8_t *src_ptr, + ptrdiff_t src_pixels_per_line, + uint8_t *output_ptr, ptrdiff_t output_pitch, + uint32_t output_height, + const int16_t *filter) { + __m128i filtersReg; + __m128i addFilterReg32; + __m128i secondFilters, thirdFilters; + __m128i srcRegFilt32b1_1, srcRegFilt32b1_2; + __m128i srcReg32b1; + unsigned int i; + src_ptr -= 3; + addFilterReg32 = _mm_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg); + + secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 + thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5 + + for (i = output_height; i > 0; i -= 1) { + srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr); + + __m128i ss_2 = _mm_srli_si128(srcReg32b1, 2); + __m128i ss_4 = _mm_srli_si128(srcReg32b1, 4); + ss_2 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128()); + ss_4 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128()); + __m128i d1 = _mm_madd_epi16(ss_2, secondFilters); + __m128i d2 = _mm_madd_epi16(ss_4, thirdFilters); + srcRegFilt32b1_1 = _mm_add_epi32(d1, d2); + + __m128i ss_3 = _mm_srli_si128(srcReg32b1, 3); + __m128i ss_5 = _mm_srli_si128(srcReg32b1, 5); + ss_3 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128()); + ss_5 = _mm_unpacklo_epi8(ss_5, _mm_setzero_si128()); + d1 = _mm_madd_epi16(ss_3, secondFilters); + d2 = _mm_madd_epi16(ss_5, thirdFilters); + srcRegFilt32b1_2 = _mm_add_epi32(d1, d2); + + __m128i res_lo = _mm_unpacklo_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2); + __m128i res_hi = _mm_unpackhi_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2); + srcRegFilt32b1_1 = _mm_packs_epi32(res_lo, res_hi); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); + + src_ptr += src_pixels_per_line; + + _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt32b1_1); + + output_ptr += output_pitch; + } +} + +void aom_filter_block1d8_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch, + uint8_t *output_ptr, ptrdiff_t out_pitch, + uint32_t output_height, + const int16_t *filter) { + __m128i filtersReg; + __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; + __m128i srcReg23_lo, srcReg34_lo; + __m128i srcReg45_lo, srcReg56_lo; + __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo; + __m128i resReg23_45_lo, resReg34_56_lo; + __m128i resReg23_45, resReg34_56; + __m128i addFilterReg32, secondFilters, thirdFilters; + __m128i tmp_0, tmp_1; + unsigned int i; + 
ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg); + + secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3 + thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5 + + // multiply the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); + srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); + srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3); + __m128i resReg23_lo_1 = _mm_unpacklo_epi8(srcReg23_lo, _mm_setzero_si128()); + __m128i resReg23_lo_2 = _mm_unpackhi_epi8(srcReg23_lo, _mm_setzero_si128()); + + srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); + srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4); + __m128i resReg34_lo_1 = _mm_unpacklo_epi8(srcReg34_lo, _mm_setzero_si128()); + __m128i resReg34_lo_2 = _mm_unpackhi_epi8(srcReg34_lo, _mm_setzero_si128()); + + for (i = output_height; i > 1; i -= 2) { + srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); + srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5); + + srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); + srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6); + + // multiply 2 adjacent elements with the filter and add the result + + tmp_0 = _mm_madd_epi16(resReg23_lo_1, secondFilters); + tmp_1 = _mm_madd_epi16(resReg23_lo_2, secondFilters); + resReg23_lo = _mm_packs_epi32(tmp_0, tmp_1); + + tmp_0 = _mm_madd_epi16(resReg34_lo_1, secondFilters); + tmp_1 = _mm_madd_epi16(resReg34_lo_2, secondFilters); + resReg34_lo = _mm_packs_epi32(tmp_0, tmp_1); + + __m128i resReg45_lo_1 = _mm_unpacklo_epi8(srcReg45_lo, _mm_setzero_si128()); + __m128i resReg45_lo_2 = _mm_unpackhi_epi8(srcReg45_lo, _mm_setzero_si128()); + tmp_0 = _mm_madd_epi16(resReg45_lo_1, thirdFilters); + tmp_1 = _mm_madd_epi16(resReg45_lo_2, thirdFilters); + resReg45_lo = _mm_packs_epi32(tmp_0, tmp_1); + + __m128i resReg56_lo_1 = _mm_unpacklo_epi8(srcReg56_lo, _mm_setzero_si128()); + __m128i resReg56_lo_2 = _mm_unpackhi_epi8(srcReg56_lo, _mm_setzero_si128()); + tmp_0 = _mm_madd_epi16(resReg56_lo_1, thirdFilters); + tmp_1 = _mm_madd_epi16(resReg56_lo_2, thirdFilters); + resReg56_lo = _mm_packs_epi32(tmp_0, tmp_1); + + // add and saturate the results together + resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo); + resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo); + + // shift by 6 bit each 16 bit + resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32); + resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32); + resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6); + resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + resReg23_45 = _mm_packus_epi16(resReg23_45_lo, _mm_setzero_si128()); + resReg34_56 = _mm_packus_epi16(resReg34_56_lo, _mm_setzero_si128()); + + src_ptr += src_stride; + + _mm_storel_epi64((__m128i *)output_ptr, (resReg23_45)); + _mm_storel_epi64((__m128i *)(output_ptr + out_pitch), (resReg34_56)); + + output_ptr += dst_stride; + + 
// save part of the registers for next strides + resReg23_lo_1 = resReg45_lo_1; + resReg23_lo_2 = resReg45_lo_2; + resReg34_lo_1 = resReg56_lo_1; + resReg34_lo_2 = resReg56_lo_2; + srcReg4 = srcReg6; + } +} + +void aom_filter_block1d4_h4_sse2(const uint8_t *src_ptr, + ptrdiff_t src_pixels_per_line, + uint8_t *output_ptr, ptrdiff_t output_pitch, + uint32_t output_height, + const int16_t *filter) { + __m128i filtersReg; + __m128i addFilterReg32; + __m128i secondFilters, thirdFilters; + __m128i srcRegFilt32b1_1; + __m128i srcReg32b1; + unsigned int i; + src_ptr -= 3; + addFilterReg32 = _mm_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg); + + secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 + thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5 + + for (i = output_height; i > 0; i -= 1) { + srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr); + + __m128i ss_2 = _mm_srli_si128(srcReg32b1, 2); + __m128i ss_3 = _mm_srli_si128(srcReg32b1, 3); + __m128i ss_4 = _mm_srli_si128(srcReg32b1, 4); + __m128i ss_5 = _mm_srli_si128(srcReg32b1, 5); + + ss_2 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128()); + ss_3 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128()); + ss_4 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128()); + ss_5 = _mm_unpacklo_epi8(ss_5, _mm_setzero_si128()); + + __m128i ss_1_1 = _mm_unpacklo_epi32(ss_2, ss_3); + __m128i ss_1_2 = _mm_unpacklo_epi32(ss_4, ss_5); + + __m128i d1 = _mm_madd_epi16(ss_1_1, secondFilters); + __m128i d2 = _mm_madd_epi16(ss_1_2, thirdFilters); + srcRegFilt32b1_1 = _mm_add_epi32(d1, d2); + + srcRegFilt32b1_1 = _mm_packs_epi32(srcRegFilt32b1_1, _mm_setzero_si128()); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); + + src_ptr += src_pixels_per_line; + + *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt32b1_1); + + output_ptr += output_pitch; + } +} + +void aom_filter_block1d4_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch, + uint8_t *output_ptr, ptrdiff_t out_pitch, + uint32_t output_height, + const int16_t *filter) { + __m128i filtersReg; + __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; + __m128i srcReg23, srcReg34, srcReg45, srcReg56; + __m128i resReg23_34, resReg45_56; + __m128i resReg23_34_45_56; + __m128i addFilterReg32, secondFilters, thirdFilters; + __m128i tmp_0, tmp_1; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg); + + secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3 + thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5 + + // multiply the size of the source and destination stride by two + src_stride = src_pitch << 1; + 
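  /* Editor's note: two output rows are produced per loop iteration below,
     which is why both the source and destination strides are doubled here. */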
dst_stride = out_pitch << 1; + + srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); + srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); + srcReg23 = _mm_unpacklo_epi8(srcReg2, srcReg3); + __m128i resReg23 = _mm_unpacklo_epi8(srcReg23, _mm_setzero_si128()); + + srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); + srcReg34 = _mm_unpacklo_epi8(srcReg3, srcReg4); + __m128i resReg34 = _mm_unpacklo_epi8(srcReg34, _mm_setzero_si128()); + + for (i = output_height; i > 1; i -= 2) { + srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); + srcReg45 = _mm_unpacklo_epi8(srcReg4, srcReg5); + srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); + srcReg56 = _mm_unpacklo_epi8(srcReg5, srcReg6); + + // multiply 2 adjacent elements with the filter and add the result + tmp_0 = _mm_madd_epi16(resReg23, secondFilters); + tmp_1 = _mm_madd_epi16(resReg34, secondFilters); + resReg23_34 = _mm_packs_epi32(tmp_0, tmp_1); + + __m128i resReg45 = _mm_unpacklo_epi8(srcReg45, _mm_setzero_si128()); + __m128i resReg56 = _mm_unpacklo_epi8(srcReg56, _mm_setzero_si128()); + + tmp_0 = _mm_madd_epi16(resReg45, thirdFilters); + tmp_1 = _mm_madd_epi16(resReg56, thirdFilters); + resReg45_56 = _mm_packs_epi32(tmp_0, tmp_1); + + // add and saturate the results together + resReg23_34_45_56 = _mm_adds_epi16(resReg23_34, resReg45_56); + + // shift by 6 bit each 16 bit + resReg23_34_45_56 = _mm_adds_epi16(resReg23_34_45_56, addFilterReg32); + resReg23_34_45_56 = _mm_srai_epi16(resReg23_34_45_56, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + resReg23_34_45_56 = + _mm_packus_epi16(resReg23_34_45_56, _mm_setzero_si128()); + + src_ptr += src_stride; + + *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(resReg23_34_45_56); + *((uint32_t *)(output_ptr + out_pitch)) = + _mm_cvtsi128_si32(_mm_srli_si128(resReg23_34_45_56, 4)); + + output_ptr += dst_stride; + + // save part of the registers for next strides + resReg23 = resReg45; + resReg34 = resReg56; + srcReg4 = srcReg6; + } +} diff --git a/media/libaom/src/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c b/media/libaom/src/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c index 325a21b761..f64b821ea4 100644 --- a/media/libaom/src/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c +++ b/media/libaom/src/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c @@ -20,29 +20,44 @@ #include "aom_ports/emmintrin_compat.h" // filters only for the 4_h8 convolution -DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = { - 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6 -}; +DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = { 0, 1, 1, 2, 2, 3, + 3, 4, 2, 3, 3, 4, + 4, 5, 5, 6 }; -DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = { - 4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10 -}; +DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = { 4, 5, 5, 6, 6, 7, + 7, 8, 6, 7, 7, 8, + 8, 9, 9, 10 }; // filters for 8_h8 and 16_h8 -DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = { - 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 -}; - -DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = { - 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 -}; - -DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = { - 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 +DECLARE_ALIGNED(16, static const uint8_t, + filt1_global[16]) = { 0, 1, 1, 2, 2, 3, 3, 4, + 4, 5, 5, 6, 6, 7, 7, 8 }; + 
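/*
 * Editor's sketch (not part of the upstream change): judging from the mask
 * contents and the 4-tap kernels added below, each filtN_global table feeds
 * _mm_shuffle_epi8 so that a single _mm_maddubs_epi16 evaluates one pair of
 * adjacent taps for eight consecutive output pixels of the 8-tap horizontal
 * kernels.  The helper below is a scalar model only (maddubs saturation
 * omitted); `taps` is assumed to hold the packed 8-bit filter coefficients.
 */
static inline int filt_pair_model(const uint8_t *src, const int8_t *taps,
                                  int x, int pair) {
  /* filt1_global gathers bytes (x, x+1), filt2_global (x+2, x+3),
     filt3_global (x+4, x+5), filt4_global (x+6, x+7):
     offsets 2*pair and 2*pair + 1. */
  return src[x + 2 * pair] * taps[2 * pair] +
         src[x + 2 * pair + 1] * taps[2 * pair + 1];
}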
+DECLARE_ALIGNED(16, static const uint8_t, + filt2_global[16]) = { 2, 3, 3, 4, 4, 5, 5, 6, + 6, 7, 7, 8, 8, 9, 9, 10 }; + +DECLARE_ALIGNED(16, static const uint8_t, + filt3_global[16]) = { 4, 5, 5, 6, 6, 7, 7, 8, + 8, 9, 9, 10, 10, 11, 11, 12 }; + +DECLARE_ALIGNED(16, static const uint8_t, + filt4_global[16]) = { 6, 7, 7, 8, 8, 9, 9, 10, + 10, 11, 11, 12, 12, 13, 13, 14 }; + +DECLARE_ALIGNED(32, static const uint8_t, filt_h4[]) = { + 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, + 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 2, 3, 3, 4, 4, 5, + 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, 5, 6, 6, + 7, 7, 8, 8, 9, 9, 10, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, + 10, 11, 11, 12, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, + 12, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 6, 7, + 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 }; -DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = { - 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 +DECLARE_ALIGNED(32, static const uint8_t, filtd4[]) = { + 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, + 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, }; // These are reused by the avx2 intrinsics. @@ -50,6 +65,133 @@ filter8_1dfunction aom_filter_block1d8_v8_intrin_ssse3; filter8_1dfunction aom_filter_block1d8_h8_intrin_ssse3; filter8_1dfunction aom_filter_block1d4_h8_intrin_ssse3; +static void aom_filter_block1d4_h4_ssse3( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m128i addFilterReg32, filt1Reg, firstFilters, srcReg32b1, srcRegFilt32b1_1; + unsigned int i; + src_ptr -= 3; + addFilterReg32 = _mm_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. 
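  // Editor's note: the _mm_set1_epi32(0x5040302u) shuffle below broadcasts
  // packed taps 2..5 (the only taps this 4-tap kernel uses) to every 4-byte
  // lane, while filtd4 gathers source bytes at offsets 2..5 of the already
  // adjusted src_ptr for each of the four output pixels, so one
  // _mm_maddubs_epi16 plus _mm_hadds_epi16 yields all four 4-tap sums.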
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + + firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi32(0x5040302u)); + filt1Reg = _mm_load_si128((__m128i const *)(filtd4)); + + for (i = output_height; i > 0; i -= 1) { + // load the 2 strides of source + srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr); + + // filter the source buffer + srcRegFilt32b1_1 = _mm_shuffle_epi8(srcReg32b1, filt1Reg); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt32b1_1 = _mm_maddubs_epi16(srcRegFilt32b1_1, firstFilters); + + srcRegFilt32b1_1 = _mm_hadds_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); + + src_ptr += src_pixels_per_line; + + *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt32b1_1); + output_ptr += output_pitch; + } +} + +static void aom_filter_block1d4_v4_ssse3( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m128i addFilterReg32; + __m128i srcReg2, srcReg3, srcReg23, srcReg4, srcReg34, srcReg5, srcReg45, + srcReg6, srcReg56; + __m128i srcReg23_34_lo, srcReg45_56_lo; + __m128i srcReg2345_3456_lo, srcReg2345_3456_hi; + __m128i resReglo, resReghi; + __m128i firstFilters; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the + // same data in both lanes of 128 bit register. 
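  // Editor's note on the interleaves built below: srcReg2345_3456_lo ends up
  // holding, for each of the four pixels of the first output row, the bytes
  // from source rows 2..5, while the _hi half holds rows 3..6 for the second
  // output row; a single maddubs + hadds per half then produces the full
  // 4-tap sums.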
+ filtersReg = _mm_srai_epi16(filtersReg, 1); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + + firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi32(0x5040302u)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); + srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); + srcReg23 = _mm_unpacklo_epi32(srcReg2, srcReg3); + + srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); + + // have consecutive loads on the same 256 register + srcReg34 = _mm_unpacklo_epi32(srcReg3, srcReg4); + + srcReg23_34_lo = _mm_unpacklo_epi8(srcReg23, srcReg34); + + for (i = output_height; i > 1; i -= 2) { + srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); + srcReg45 = _mm_unpacklo_epi32(srcReg4, srcReg5); + + srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); + srcReg56 = _mm_unpacklo_epi32(srcReg5, srcReg6); + + // merge every two consecutive registers + srcReg45_56_lo = _mm_unpacklo_epi8(srcReg45, srcReg56); + + srcReg2345_3456_lo = _mm_unpacklo_epi16(srcReg23_34_lo, srcReg45_56_lo); + srcReg2345_3456_hi = _mm_unpackhi_epi16(srcReg23_34_lo, srcReg45_56_lo); + + // multiply 2 adjacent elements with the filter and add the result + resReglo = _mm_maddubs_epi16(srcReg2345_3456_lo, firstFilters); + resReghi = _mm_maddubs_epi16(srcReg2345_3456_hi, firstFilters); + + resReglo = _mm_hadds_epi16(resReglo, _mm_setzero_si128()); + resReghi = _mm_hadds_epi16(resReghi, _mm_setzero_si128()); + + // shift by 6 bit each 16 bit + resReglo = _mm_adds_epi16(resReglo, addFilterReg32); + resReghi = _mm_adds_epi16(resReghi, addFilterReg32); + resReglo = _mm_srai_epi16(resReglo, 6); + resReghi = _mm_srai_epi16(resReghi, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + resReglo = _mm_packus_epi16(resReglo, resReglo); + resReghi = _mm_packus_epi16(resReghi, resReghi); + + src_ptr += src_stride; + + *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(resReglo); + *((uint32_t *)(output_ptr + out_pitch)) = _mm_cvtsi128_si32(resReghi); + + output_ptr += dst_stride; + + // save part of the registers for next strides + srcReg23_34_lo = srcReg45_56_lo; + srcReg4 = srcReg6; + } +} + void aom_filter_block1d4_h8_intrin_ssse3( const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { @@ -118,6 +260,145 @@ void aom_filter_block1d4_h8_intrin_ssse3( } } +static void aom_filter_block1d8_h4_ssse3( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m128i addFilterReg32, filt2Reg, filt3Reg; + __m128i secondFilters, thirdFilters; + __m128i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3; + __m128i srcReg32b1; + unsigned int i; + src_ptr -= 3; + addFilterReg32 = _mm_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. 
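  // Editor's note: the "across 256 bit register" wording in the comments
  // below appears to be carried over from the AVX2 variant of this kernel;
  // the shuffles here operate on 128-bit registers.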
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); + + filt2Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32)); + filt3Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32 * 2)); + + for (i = output_height; i > 0; i -= 1) { + srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr); + + // filter the source buffer + srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b1, filt2Reg); + srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b1, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6); + + // shrink to 8 bit each 16 bits + srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); + + src_ptr += src_pixels_per_line; + + _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt32b1_1); + + output_ptr += output_pitch; + } +} + +static void aom_filter_block1d8_v4_ssse3( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; + __m128i srcReg23, srcReg34, srcReg45, srcReg56; + __m128i resReg23, resReg34, resReg45, resReg56; + __m128i resReg23_45, resReg34_56; + __m128i addFilterReg32, secondFilters, thirdFilters; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the + // same data in both lanes of 128 bit register. 
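  // Editor's note: each interleaved row pair is filtered directly as bytes:
  // maddubs(srcReg23, {f2,f3}) + maddubs(srcReg45, {f4,f5}) gives the 4-tap
  // sum for the first output row (34/56 likewise for the second), followed
  // by the +32 rounding and >> 6 shift before packing back to bytes.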
+ filtersReg = _mm_srai_epi16(filtersReg, 1); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + + // duplicate only the second 16 bits (third and forth byte) + // across 128 bit register + secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 128 bit register + thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); + srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); + srcReg23 = _mm_unpacklo_epi8(srcReg2, srcReg3); + + srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); + + // have consecutive loads on the same 256 register + srcReg34 = _mm_unpacklo_epi8(srcReg3, srcReg4); + + for (i = output_height; i > 1; i -= 2) { + srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); + + srcReg45 = _mm_unpacklo_epi8(srcReg4, srcReg5); + + srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); + + srcReg56 = _mm_unpacklo_epi8(srcReg5, srcReg6); + + // multiply 2 adjacent elements with the filter and add the result + resReg23 = _mm_maddubs_epi16(srcReg23, secondFilters); + resReg34 = _mm_maddubs_epi16(srcReg34, secondFilters); + resReg45 = _mm_maddubs_epi16(srcReg45, thirdFilters); + resReg56 = _mm_maddubs_epi16(srcReg56, thirdFilters); + + // add and saturate the results together + resReg23_45 = _mm_adds_epi16(resReg23, resReg45); + resReg34_56 = _mm_adds_epi16(resReg34, resReg56); + + // shift by 6 bit each 16 bit + resReg23_45 = _mm_adds_epi16(resReg23_45, addFilterReg32); + resReg34_56 = _mm_adds_epi16(resReg34_56, addFilterReg32); + resReg23_45 = _mm_srai_epi16(resReg23_45, 6); + resReg34_56 = _mm_srai_epi16(resReg34_56, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + resReg23_45 = _mm_packus_epi16(resReg23_45, _mm_setzero_si128()); + resReg34_56 = _mm_packus_epi16(resReg34_56, _mm_setzero_si128()); + + src_ptr += src_stride; + + _mm_storel_epi64((__m128i *)output_ptr, (resReg23_45)); + _mm_storel_epi64((__m128i *)(output_ptr + out_pitch), (resReg34_56)); + + output_ptr += dst_stride; + + // save part of the registers for next strides + srcReg23 = srcReg45; + srcReg34 = srcReg56; + srcReg4 = srcReg6; + } +} + void aom_filter_block1d8_h8_intrin_ssse3( const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { @@ -280,6 +561,187 @@ void aom_filter_block1d8_v8_intrin_ssse3( } } +static void aom_filter_block1d16_h4_ssse3( + const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m128i addFilterReg32, filt2Reg, filt3Reg; + __m128i secondFilters, thirdFilters; + __m128i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; + __m128i srcReg32b1, srcReg32b2; + unsigned int i; + src_ptr -= 3; + addFilterReg32 = _mm_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + filtersReg = _mm_srai_epi16(filtersReg, 1); + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. 
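  // Editor's note: the 16-wide path filters the row as two 8-pixel halves
  // using overlapping 16-byte loads (the second starts at src_ptr + 8) and
  // merges them with a single _mm_packus_epi16 before the aligned 16-byte
  // store.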
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + + // duplicate only the second 16 bits (third and forth byte) + // across 256 bit register + secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 256 bit register + thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); + + filt2Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32)); + filt3Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32 * 2)); + + for (i = output_height; i > 0; i -= 1) { + srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr); + + // filter the source buffer + srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b1, filt2Reg); + srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b1, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); + + // reading stride of the next 16 bytes + // (part of it was being read by earlier read) + srcReg32b2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8)); + + // filter the source buffer + srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b2, filt2Reg); + srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b2, filt3Reg); + + // multiply 2 adjacent elements with the filter and add the result + srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters); + srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters); + + // add and saturate the results together + srcRegFilt32b2_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); + + // shift by 6 bit each 16 bit + srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32); + srcRegFilt32b2_1 = _mm_adds_epi16(srcRegFilt32b2_1, addFilterReg32); + srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6); + srcRegFilt32b2_1 = _mm_srai_epi16(srcRegFilt32b2_1, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1); + + src_ptr += src_pixels_per_line; + + _mm_store_si128((__m128i *)output_ptr, srcRegFilt32b1_1); + + output_ptr += output_pitch; + } +} + +static void aom_filter_block1d16_v4_ssse3( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; + __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi; + __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi; + __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo; + __m128i resReg23_hi, resReg34_hi, resReg45_hi, resReg56_hi; + __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi; + __m128i resReg23_45, resReg34_56; + __m128i addFilterReg32, secondFilters, thirdFilters; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + addFilterReg32 = _mm_set1_epi16(32); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + // converting the 16 bit (short) to 8 bit (byte) and have the + // same data in both lanes of 128 bit register. 
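  // Editor's note: here the low and high 8-pixel halves of each interleaved
  // row pair (_lo/_hi) are filtered separately and recombined by one
  // _mm_packus_epi16 per output row, so every iteration writes two full
  // 16-byte rows.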
+ filtersReg = _mm_srai_epi16(filtersReg, 1); + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + + // duplicate only the second 16 bits (third and forth byte) + // across 128 bit register + secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); + // duplicate only the third 16 bits (fifth and sixth byte) + // across 128 bit register + thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = out_pitch << 1; + + srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); + srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); + srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3); + srcReg23_hi = _mm_unpackhi_epi8(srcReg2, srcReg3); + + srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); + + // have consecutive loads on the same 256 register + srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4); + srcReg34_hi = _mm_unpackhi_epi8(srcReg3, srcReg4); + + for (i = output_height; i > 1; i -= 2) { + srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); + + srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5); + srcReg45_hi = _mm_unpackhi_epi8(srcReg4, srcReg5); + + srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); + + srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6); + srcReg56_hi = _mm_unpackhi_epi8(srcReg5, srcReg6); + + // multiply 2 adjacent elements with the filter and add the result + resReg23_lo = _mm_maddubs_epi16(srcReg23_lo, secondFilters); + resReg34_lo = _mm_maddubs_epi16(srcReg34_lo, secondFilters); + resReg45_lo = _mm_maddubs_epi16(srcReg45_lo, thirdFilters); + resReg56_lo = _mm_maddubs_epi16(srcReg56_lo, thirdFilters); + + // add and saturate the results together + resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo); + resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo); + + // multiply 2 adjacent elements with the filter and add the result + + resReg23_hi = _mm_maddubs_epi16(srcReg23_hi, secondFilters); + resReg34_hi = _mm_maddubs_epi16(srcReg34_hi, secondFilters); + resReg45_hi = _mm_maddubs_epi16(srcReg45_hi, thirdFilters); + resReg56_hi = _mm_maddubs_epi16(srcReg56_hi, thirdFilters); + + // add and saturate the results together + resReg23_45_hi = _mm_adds_epi16(resReg23_hi, resReg45_hi); + resReg34_56_hi = _mm_adds_epi16(resReg34_hi, resReg56_hi); + + // shift by 6 bit each 16 bit + resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32); + resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32); + resReg23_45_hi = _mm_adds_epi16(resReg23_45_hi, addFilterReg32); + resReg34_56_hi = _mm_adds_epi16(resReg34_56_hi, addFilterReg32); + resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6); + resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6); + resReg23_45_hi = _mm_srai_epi16(resReg23_45_hi, 6); + resReg34_56_hi = _mm_srai_epi16(resReg34_56_hi, 6); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + resReg23_45 = _mm_packus_epi16(resReg23_45_lo, resReg23_45_hi); + resReg34_56 = _mm_packus_epi16(resReg34_56_lo, resReg34_56_hi); + + src_ptr += src_stride; + + _mm_store_si128((__m128i *)output_ptr, (resReg23_45)); + _mm_store_si128((__m128i *)(output_ptr + out_pitch), (resReg34_56)); + + output_ptr += dst_stride; + + // save part of the registers for next strides + srcReg23_lo = srcReg45_lo; + srcReg34_lo = srcReg56_lo; + srcReg23_hi = 
srcReg45_hi; + srcReg34_hi = srcReg56_hi; + srcReg4 = srcReg6; + } +} + filter8_1dfunction aom_filter_block1d16_v8_ssse3; filter8_1dfunction aom_filter_block1d16_h8_ssse3; filter8_1dfunction aom_filter_block1d8_v8_ssse3; @@ -287,13 +749,6 @@ filter8_1dfunction aom_filter_block1d8_h8_ssse3; filter8_1dfunction aom_filter_block1d4_v8_ssse3; filter8_1dfunction aom_filter_block1d4_h8_ssse3; -#define aom_filter_block1d16_h4_ssse3 aom_filter_block1d16_h8_ssse3 -#define aom_filter_block1d16_v4_ssse3 aom_filter_block1d16_v8_ssse3 -#define aom_filter_block1d8_h4_ssse3 aom_filter_block1d8_h8_ssse3 -#define aom_filter_block1d8_v4_ssse3 aom_filter_block1d8_v8_ssse3 -#define aom_filter_block1d4_h4_ssse3 aom_filter_block1d4_h8_ssse3 -#define aom_filter_block1d4_v4_ssse3 aom_filter_block1d4_v8_ssse3 - filter8_1dfunction aom_filter_block1d16_v2_ssse3; filter8_1dfunction aom_filter_block1d16_h2_ssse3; filter8_1dfunction aom_filter_block1d8_v2_ssse3; diff --git a/media/libaom/src/aom_dsp/x86/avg_intrin_avx2.c b/media/libaom/src/aom_dsp/x86/avg_intrin_avx2.c new file mode 100644 index 0000000000..3bbffbd805 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/avg_intrin_avx2.c @@ -0,0 +1,504 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <immintrin.h> + +#include "config/aom_dsp_rtcd.h" +#include "aom/aom_integer.h" +#include "aom_dsp/x86/bitdepth_conversion_avx2.h" +#include "aom_ports/mem.h" + +static void hadamard_col8x2_avx2(__m256i *in, int iter) { + __m256i a0 = in[0]; + __m256i a1 = in[1]; + __m256i a2 = in[2]; + __m256i a3 = in[3]; + __m256i a4 = in[4]; + __m256i a5 = in[5]; + __m256i a6 = in[6]; + __m256i a7 = in[7]; + + __m256i b0 = _mm256_add_epi16(a0, a1); + __m256i b1 = _mm256_sub_epi16(a0, a1); + __m256i b2 = _mm256_add_epi16(a2, a3); + __m256i b3 = _mm256_sub_epi16(a2, a3); + __m256i b4 = _mm256_add_epi16(a4, a5); + __m256i b5 = _mm256_sub_epi16(a4, a5); + __m256i b6 = _mm256_add_epi16(a6, a7); + __m256i b7 = _mm256_sub_epi16(a6, a7); + + a0 = _mm256_add_epi16(b0, b2); + a1 = _mm256_add_epi16(b1, b3); + a2 = _mm256_sub_epi16(b0, b2); + a3 = _mm256_sub_epi16(b1, b3); + a4 = _mm256_add_epi16(b4, b6); + a5 = _mm256_add_epi16(b5, b7); + a6 = _mm256_sub_epi16(b4, b6); + a7 = _mm256_sub_epi16(b5, b7); + + if (iter == 0) { + b0 = _mm256_add_epi16(a0, a4); + b7 = _mm256_add_epi16(a1, a5); + b3 = _mm256_add_epi16(a2, a6); + b4 = _mm256_add_epi16(a3, a7); + b2 = _mm256_sub_epi16(a0, a4); + b6 = _mm256_sub_epi16(a1, a5); + b1 = _mm256_sub_epi16(a2, a6); + b5 = _mm256_sub_epi16(a3, a7); + + a0 = _mm256_unpacklo_epi16(b0, b1); + a1 = _mm256_unpacklo_epi16(b2, b3); + a2 = _mm256_unpackhi_epi16(b0, b1); + a3 = _mm256_unpackhi_epi16(b2, b3); + a4 = _mm256_unpacklo_epi16(b4, b5); + a5 = _mm256_unpacklo_epi16(b6, b7); + a6 = _mm256_unpackhi_epi16(b4, b5); + a7 = _mm256_unpackhi_epi16(b6, b7); + + b0 = _mm256_unpacklo_epi32(a0, a1); + b1 = _mm256_unpacklo_epi32(a4, a5); + b2 = _mm256_unpackhi_epi32(a0, a1); + b3 = _mm256_unpackhi_epi32(a4, a5); + b4 = _mm256_unpacklo_epi32(a2, a3); + b5 = _mm256_unpacklo_epi32(a6, a7); 
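    // Editor's note: this unpack chain (epi16 -> epi32 -> epi64) transposes
    // the 8x8 block of 16-bit values within each 128-bit half, so the second
    // call (iter != 0) applies the same column butterflies to what were
    // originally the rows.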
+ b6 = _mm256_unpackhi_epi32(a2, a3); + b7 = _mm256_unpackhi_epi32(a6, a7); + + in[0] = _mm256_unpacklo_epi64(b0, b1); + in[1] = _mm256_unpackhi_epi64(b0, b1); + in[2] = _mm256_unpacklo_epi64(b2, b3); + in[3] = _mm256_unpackhi_epi64(b2, b3); + in[4] = _mm256_unpacklo_epi64(b4, b5); + in[5] = _mm256_unpackhi_epi64(b4, b5); + in[6] = _mm256_unpacklo_epi64(b6, b7); + in[7] = _mm256_unpackhi_epi64(b6, b7); + } else { + in[0] = _mm256_add_epi16(a0, a4); + in[7] = _mm256_add_epi16(a1, a5); + in[3] = _mm256_add_epi16(a2, a6); + in[4] = _mm256_add_epi16(a3, a7); + in[2] = _mm256_sub_epi16(a0, a4); + in[6] = _mm256_sub_epi16(a1, a5); + in[1] = _mm256_sub_epi16(a2, a6); + in[5] = _mm256_sub_epi16(a3, a7); + } +} + +static void hadamard_8x8x2_avx2(const int16_t *src_diff, ptrdiff_t src_stride, + int16_t *coeff) { + __m256i src[8]; + src[0] = _mm256_loadu_si256((const __m256i *)src_diff); + src[1] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[2] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[3] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[4] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[5] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[6] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[7] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + + hadamard_col8x2_avx2(src, 0); + hadamard_col8x2_avx2(src, 1); + + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[0], src[1], 0x20)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[2], src[3], 0x20)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[4], src[5], 0x20)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[6], src[7], 0x20)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[0], src[1], 0x31)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[2], src[3], 0x31)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[4], src[5], 0x31)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[6], src[7], 0x31)); +} + +static INLINE void hadamard_16x16_avx2(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff, + int is_final) { + DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]); + int16_t *t_coeff = temp_coeff; + int16_t *coeff16 = (int16_t *)coeff; + int idx; + for (idx = 0; idx < 2; ++idx) { + const int16_t *src_ptr = src_diff + idx * 8 * src_stride; + hadamard_8x8x2_avx2(src_ptr, src_stride, t_coeff + (idx * 64 * 2)); + } + + for (idx = 0; idx < 64; idx += 16) { + const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff); + const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64)); + const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128)); + const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192)); + + __m256i b0 = _mm256_add_epi16(coeff0, coeff1); + __m256i b1 = _mm256_sub_epi16(coeff0, coeff1); + __m256i b2 = _mm256_add_epi16(coeff2, coeff3); + __m256i b3 = _mm256_sub_epi16(coeff2, coeff3); + + b0 = _mm256_srai_epi16(b0, 1); + b1 = _mm256_srai_epi16(b1, 1); + b2 = _mm256_srai_epi16(b2, 1); + b3 = _mm256_srai_epi16(b3, 1); + if (is_final) { + store_tran_low(_mm256_add_epi16(b0, b2), coeff); + store_tran_low(_mm256_add_epi16(b1, b3), coeff + 
64); + store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 128); + store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 192); + coeff += 16; + } else { + _mm256_storeu_si256((__m256i *)coeff16, _mm256_add_epi16(b0, b2)); + _mm256_storeu_si256((__m256i *)(coeff16 + 64), _mm256_add_epi16(b1, b3)); + _mm256_storeu_si256((__m256i *)(coeff16 + 128), _mm256_sub_epi16(b0, b2)); + _mm256_storeu_si256((__m256i *)(coeff16 + 192), _mm256_sub_epi16(b1, b3)); + coeff16 += 16; + } + t_coeff += 16; + } +} + +void aom_hadamard_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + hadamard_16x16_avx2(src_diff, src_stride, coeff, 1); +} + +void aom_hadamard_lp_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride, + int16_t *coeff) { + int16_t *t_coeff = coeff; + for (int idx = 0; idx < 2; ++idx) { + const int16_t *src_ptr = src_diff + idx * 8 * src_stride; + hadamard_8x8x2_avx2(src_ptr, src_stride, t_coeff + (idx * 64 * 2)); + } + + for (int idx = 0; idx < 64; idx += 16) { + const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff); + const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64)); + const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128)); + const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192)); + + __m256i b0 = _mm256_add_epi16(coeff0, coeff1); + __m256i b1 = _mm256_sub_epi16(coeff0, coeff1); + __m256i b2 = _mm256_add_epi16(coeff2, coeff3); + __m256i b3 = _mm256_sub_epi16(coeff2, coeff3); + + b0 = _mm256_srai_epi16(b0, 1); + b1 = _mm256_srai_epi16(b1, 1); + b2 = _mm256_srai_epi16(b2, 1); + b3 = _mm256_srai_epi16(b3, 1); + _mm256_storeu_si256((__m256i *)coeff, _mm256_add_epi16(b0, b2)); + _mm256_storeu_si256((__m256i *)(coeff + 64), _mm256_add_epi16(b1, b3)); + _mm256_storeu_si256((__m256i *)(coeff + 128), _mm256_sub_epi16(b0, b2)); + _mm256_storeu_si256((__m256i *)(coeff + 192), _mm256_sub_epi16(b1, b3)); + coeff += 16; + t_coeff += 16; + } +} + +void aom_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + // For high bitdepths, it is unnecessary to store_tran_low + // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the + // next stage. Output to an intermediate buffer first, then store_tran_low() + // in the final stage. 
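  // Editor's note: the 32x32 transform is assembled from four 16x16
  // transforms written to temp_coeff; each pass of the loop below
  // cross-combines one group of 16 coefficients from each quarter (offsets
  // 0/256/512/768), shifting the intermediate sums and differences right by
  // 2 before store_tran_low().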
+ DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]); + int16_t *t_coeff = temp_coeff; + int idx; + for (idx = 0; idx < 4; ++idx) { + // src_diff: 9 bit, dynamic range [-255, 255] + const int16_t *src_ptr = + src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; + hadamard_16x16_avx2(src_ptr, src_stride, + (tran_low_t *)(t_coeff + idx * 256), 0); + } + + for (idx = 0; idx < 256; idx += 16) { + const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff); + const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256)); + const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512)); + const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768)); + + __m256i b0 = _mm256_add_epi16(coeff0, coeff1); + __m256i b1 = _mm256_sub_epi16(coeff0, coeff1); + __m256i b2 = _mm256_add_epi16(coeff2, coeff3); + __m256i b3 = _mm256_sub_epi16(coeff2, coeff3); + + b0 = _mm256_srai_epi16(b0, 2); + b1 = _mm256_srai_epi16(b1, 2); + b2 = _mm256_srai_epi16(b2, 2); + b3 = _mm256_srai_epi16(b3, 2); + + store_tran_low(_mm256_add_epi16(b0, b2), coeff); + store_tran_low(_mm256_add_epi16(b1, b3), coeff + 256); + store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 512); + store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 768); + + coeff += 16; + t_coeff += 16; + } +} + +#if CONFIG_AV1_HIGHBITDEPTH +static void highbd_hadamard_col8_avx2(__m256i *in, int iter) { + __m256i a0 = in[0]; + __m256i a1 = in[1]; + __m256i a2 = in[2]; + __m256i a3 = in[3]; + __m256i a4 = in[4]; + __m256i a5 = in[5]; + __m256i a6 = in[6]; + __m256i a7 = in[7]; + + __m256i b0 = _mm256_add_epi32(a0, a1); + __m256i b1 = _mm256_sub_epi32(a0, a1); + __m256i b2 = _mm256_add_epi32(a2, a3); + __m256i b3 = _mm256_sub_epi32(a2, a3); + __m256i b4 = _mm256_add_epi32(a4, a5); + __m256i b5 = _mm256_sub_epi32(a4, a5); + __m256i b6 = _mm256_add_epi32(a6, a7); + __m256i b7 = _mm256_sub_epi32(a6, a7); + + a0 = _mm256_add_epi32(b0, b2); + a1 = _mm256_add_epi32(b1, b3); + a2 = _mm256_sub_epi32(b0, b2); + a3 = _mm256_sub_epi32(b1, b3); + a4 = _mm256_add_epi32(b4, b6); + a5 = _mm256_add_epi32(b5, b7); + a6 = _mm256_sub_epi32(b4, b6); + a7 = _mm256_sub_epi32(b5, b7); + + if (iter == 0) { + b0 = _mm256_add_epi32(a0, a4); + b7 = _mm256_add_epi32(a1, a5); + b3 = _mm256_add_epi32(a2, a6); + b4 = _mm256_add_epi32(a3, a7); + b2 = _mm256_sub_epi32(a0, a4); + b6 = _mm256_sub_epi32(a1, a5); + b1 = _mm256_sub_epi32(a2, a6); + b5 = _mm256_sub_epi32(a3, a7); + + a0 = _mm256_unpacklo_epi32(b0, b1); + a1 = _mm256_unpacklo_epi32(b2, b3); + a2 = _mm256_unpackhi_epi32(b0, b1); + a3 = _mm256_unpackhi_epi32(b2, b3); + a4 = _mm256_unpacklo_epi32(b4, b5); + a5 = _mm256_unpacklo_epi32(b6, b7); + a6 = _mm256_unpackhi_epi32(b4, b5); + a7 = _mm256_unpackhi_epi32(b6, b7); + + b0 = _mm256_unpacklo_epi64(a0, a1); + b1 = _mm256_unpacklo_epi64(a4, a5); + b2 = _mm256_unpackhi_epi64(a0, a1); + b3 = _mm256_unpackhi_epi64(a4, a5); + b4 = _mm256_unpacklo_epi64(a2, a3); + b5 = _mm256_unpacklo_epi64(a6, a7); + b6 = _mm256_unpackhi_epi64(a2, a3); + b7 = _mm256_unpackhi_epi64(a6, a7); + + in[0] = _mm256_permute2x128_si256(b0, b1, 0x20); + in[1] = _mm256_permute2x128_si256(b0, b1, 0x31); + in[2] = _mm256_permute2x128_si256(b2, b3, 0x20); + in[3] = _mm256_permute2x128_si256(b2, b3, 0x31); + in[4] = _mm256_permute2x128_si256(b4, b5, 0x20); + in[5] = _mm256_permute2x128_si256(b4, b5, 0x31); + in[6] = _mm256_permute2x128_si256(b6, b7, 0x20); + in[7] = _mm256_permute2x128_si256(b6, b7, 0x31); + } else { + in[0] = _mm256_add_epi32(a0, a4); + in[7] = 
_mm256_add_epi32(a1, a5); + in[3] = _mm256_add_epi32(a2, a6); + in[4] = _mm256_add_epi32(a3, a7); + in[2] = _mm256_sub_epi32(a0, a4); + in[6] = _mm256_sub_epi32(a1, a5); + in[1] = _mm256_sub_epi32(a2, a6); + in[5] = _mm256_sub_epi32(a3, a7); + } +} + +void aom_highbd_hadamard_8x8_avx2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + __m128i src16[8]; + __m256i src32[8]; + + src16[0] = _mm_loadu_si128((const __m128i *)src_diff); + src16[1] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[2] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[3] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[4] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[5] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[6] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[7] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + + src32[0] = _mm256_cvtepi16_epi32(src16[0]); + src32[1] = _mm256_cvtepi16_epi32(src16[1]); + src32[2] = _mm256_cvtepi16_epi32(src16[2]); + src32[3] = _mm256_cvtepi16_epi32(src16[3]); + src32[4] = _mm256_cvtepi16_epi32(src16[4]); + src32[5] = _mm256_cvtepi16_epi32(src16[5]); + src32[6] = _mm256_cvtepi16_epi32(src16[6]); + src32[7] = _mm256_cvtepi16_epi32(src16[7]); + + highbd_hadamard_col8_avx2(src32, 0); + highbd_hadamard_col8_avx2(src32, 1); + + _mm256_storeu_si256((__m256i *)coeff, src32[0]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[1]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[2]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[3]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[4]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[5]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[6]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[7]); +} + +void aom_highbd_hadamard_16x16_avx2(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff) { + int idx; + tran_low_t *t_coeff = coeff; + for (idx = 0; idx < 4; ++idx) { + const int16_t *src_ptr = + src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8; + aom_highbd_hadamard_8x8_avx2(src_ptr, src_stride, t_coeff + idx * 64); + } + + for (idx = 0; idx < 64; idx += 8) { + __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff); + __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64)); + __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128)); + __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192)); + + __m256i b0 = _mm256_add_epi32(coeff0, coeff1); + __m256i b1 = _mm256_sub_epi32(coeff0, coeff1); + __m256i b2 = _mm256_add_epi32(coeff2, coeff3); + __m256i b3 = _mm256_sub_epi32(coeff2, coeff3); + + b0 = _mm256_srai_epi32(b0, 1); + b1 = _mm256_srai_epi32(b1, 1); + b2 = _mm256_srai_epi32(b2, 1); + b3 = _mm256_srai_epi32(b3, 1); + + coeff0 = _mm256_add_epi32(b0, b2); + coeff1 = _mm256_add_epi32(b1, b3); + coeff2 = _mm256_sub_epi32(b0, b2); + coeff3 = _mm256_sub_epi32(b1, b3); + + _mm256_storeu_si256((__m256i *)coeff, coeff0); + _mm256_storeu_si256((__m256i *)(coeff + 64), coeff1); + _mm256_storeu_si256((__m256i *)(coeff + 128), coeff2); + _mm256_storeu_si256((__m256i *)(coeff + 192), coeff3); + + coeff += 8; + t_coeff += 8; + } +} + +void aom_highbd_hadamard_32x32_avx2(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff) { + int idx; + tran_low_t *t_coeff = coeff; + for (idx = 0; idx < 4; ++idx) { + const int16_t *src_ptr = 
+ src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; + aom_highbd_hadamard_16x16_avx2(src_ptr, src_stride, t_coeff + idx * 256); + } + + for (idx = 0; idx < 256; idx += 8) { + __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff); + __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256)); + __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512)); + __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768)); + + __m256i b0 = _mm256_add_epi32(coeff0, coeff1); + __m256i b1 = _mm256_sub_epi32(coeff0, coeff1); + __m256i b2 = _mm256_add_epi32(coeff2, coeff3); + __m256i b3 = _mm256_sub_epi32(coeff2, coeff3); + + b0 = _mm256_srai_epi32(b0, 2); + b1 = _mm256_srai_epi32(b1, 2); + b2 = _mm256_srai_epi32(b2, 2); + b3 = _mm256_srai_epi32(b3, 2); + + coeff0 = _mm256_add_epi32(b0, b2); + coeff1 = _mm256_add_epi32(b1, b3); + coeff2 = _mm256_sub_epi32(b0, b2); + coeff3 = _mm256_sub_epi32(b1, b3); + + _mm256_storeu_si256((__m256i *)coeff, coeff0); + _mm256_storeu_si256((__m256i *)(coeff + 256), coeff1); + _mm256_storeu_si256((__m256i *)(coeff + 512), coeff2); + _mm256_storeu_si256((__m256i *)(coeff + 768), coeff3); + + coeff += 8; + t_coeff += 8; + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + +int aom_satd_avx2(const tran_low_t *coeff, int length) { + __m256i accum = _mm256_setzero_si256(); + int i; + + for (i = 0; i < length; i += 8, coeff += 8) { + const __m256i src_line = _mm256_loadu_si256((const __m256i *)coeff); + const __m256i abs = _mm256_abs_epi32(src_line); + accum = _mm256_add_epi32(accum, abs); + } + + { // 32 bit horizontal add + const __m256i a = _mm256_srli_si256(accum, 8); + const __m256i b = _mm256_add_epi32(accum, a); + const __m256i c = _mm256_srli_epi64(b, 32); + const __m256i d = _mm256_add_epi32(b, c); + const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d), + _mm256_extractf128_si256(d, 1)); + return _mm_cvtsi128_si32(accum_128); + } +} + +int aom_satd_lp_avx2(const int16_t *coeff, int length) { + const __m256i one = _mm256_set1_epi16(1); + __m256i accum = _mm256_setzero_si256(); + + for (int i = 0; i < length; i += 16) { + const __m256i src_line = _mm256_loadu_si256((const __m256i *)coeff); + const __m256i abs = _mm256_abs_epi16(src_line); + const __m256i sum = _mm256_madd_epi16(abs, one); + accum = _mm256_add_epi32(accum, sum); + coeff += 16; + } + + { // 32 bit horizontal add + const __m256i a = _mm256_srli_si256(accum, 8); + const __m256i b = _mm256_add_epi32(accum, a); + const __m256i c = _mm256_srli_epi64(b, 32); + const __m256i d = _mm256_add_epi32(b, c); + const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d), + _mm256_extractf128_si256(d, 1)); + return _mm_cvtsi128_si32(accum_128); + } +} diff --git a/media/libaom/src/aom_dsp/x86/avg_intrin_sse2.c b/media/libaom/src/aom_dsp/x86/avg_intrin_sse2.c new file mode 100644 index 0000000000..260ca2ad17 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/avg_intrin_sse2.c @@ -0,0 +1,512 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <immintrin.h> + +#include "config/aom_dsp_rtcd.h" +#include "aom/aom_integer.h" +#include "aom_dsp/x86/bitdepth_conversion_sse2.h" +#include "aom_ports/mem.h" + +void aom_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, + int *min, int *max) { + __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff; + u0 = _mm_setzero_si128(); + // Row 0 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff0 = _mm_max_epi16(diff, negdiff); + // Row 1 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + dp)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff = _mm_max_epi16(diff, negdiff); + maxabsdiff = _mm_max_epi16(absdiff0, absdiff); + minabsdiff = _mm_min_epi16(absdiff0, absdiff); + // Row 2 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 2 * dp)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff = _mm_max_epi16(diff, negdiff); + maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); + minabsdiff = _mm_min_epi16(minabsdiff, absdiff); + // Row 3 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 3 * dp)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff = _mm_max_epi16(diff, negdiff); + maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); + minabsdiff = _mm_min_epi16(minabsdiff, absdiff); + // Row 4 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 4 * dp)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff = _mm_max_epi16(diff, negdiff); + maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); + minabsdiff = _mm_min_epi16(minabsdiff, absdiff); + // Row 5 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 5 * dp)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff = _mm_max_epi16(diff, negdiff); + maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); + minabsdiff = _mm_min_epi16(minabsdiff, absdiff); + // Row 6 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 6 * dp)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff = _mm_max_epi16(diff, negdiff); + maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); + minabsdiff = _mm_min_epi16(minabsdiff, absdiff); + // Row 7 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 7 * dp)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff = _mm_max_epi16(diff, negdiff); + maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); + minabsdiff = _mm_min_epi16(minabsdiff, absdiff); + + maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_si128(maxabsdiff, 8)); + maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 32)); + maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 16)); + *max = 
_mm_extract_epi16(maxabsdiff, 0); + + minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_si128(minabsdiff, 8)); + minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 32)); + minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 16)); + *min = _mm_extract_epi16(minabsdiff, 0); +} + +unsigned int aom_avg_8x8_sse2(const uint8_t *s, int p) { + __m128i s0, s1, u0; + unsigned int avg = 0; + u0 = _mm_setzero_si128(); + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + + s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8)); + s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 32)); + s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16)); + avg = _mm_extract_epi16(s0, 0); + return (avg + 32) >> 6; +} + +unsigned int aom_avg_4x4_sse2(const uint8_t *s, int p) { + __m128i s0, s1, u0; + unsigned int avg = 0; + u0 = _mm_setzero_si128(); + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + + s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4)); + s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16)); + avg = _mm_extract_epi16(s0, 0); + return (avg + 8) >> 4; +} + +static INLINE void hadamard_col8_sse2(__m128i *in, int iter) { + __m128i a0 = in[0]; + __m128i a1 = in[1]; + __m128i a2 = in[2]; + __m128i a3 = in[3]; + __m128i a4 = in[4]; + __m128i a5 = in[5]; + __m128i a6 = in[6]; + __m128i a7 = in[7]; + + __m128i b0 = _mm_add_epi16(a0, a1); + __m128i b1 = _mm_sub_epi16(a0, a1); + __m128i b2 = _mm_add_epi16(a2, a3); + __m128i b3 = _mm_sub_epi16(a2, a3); + __m128i b4 = _mm_add_epi16(a4, a5); + __m128i b5 = _mm_sub_epi16(a4, a5); + __m128i b6 = _mm_add_epi16(a6, a7); + __m128i b7 = _mm_sub_epi16(a6, a7); + + a0 = _mm_add_epi16(b0, b2); + a1 = _mm_add_epi16(b1, b3); + a2 = _mm_sub_epi16(b0, b2); + a3 = _mm_sub_epi16(b1, b3); + a4 = _mm_add_epi16(b4, b6); + a5 = _mm_add_epi16(b5, b7); + a6 = _mm_sub_epi16(b4, b6); + a7 = _mm_sub_epi16(b5, b7); + + if (iter == 0) { + b0 = _mm_add_epi16(a0, a4); + b7 = _mm_add_epi16(a1, a5); + b3 = _mm_add_epi16(a2, a6); + b4 = _mm_add_epi16(a3, a7); + b2 = _mm_sub_epi16(a0, a4); + b6 = _mm_sub_epi16(a1, a5); + b1 = _mm_sub_epi16(a2, a6); + b5 = _mm_sub_epi16(a3, a7); + + a0 = _mm_unpacklo_epi16(b0, b1); + a1 = _mm_unpacklo_epi16(b2, b3); + a2 = _mm_unpackhi_epi16(b0, b1); + a3 = _mm_unpackhi_epi16(b2, b3); + a4 = _mm_unpacklo_epi16(b4, b5); + a5 = _mm_unpacklo_epi16(b6, b7); + a6 = _mm_unpackhi_epi16(b4, b5); + a7 = _mm_unpackhi_epi16(b6, b7); + + b0 = _mm_unpacklo_epi32(a0, a1); + b1 = 
_mm_unpacklo_epi32(a4, a5); + b2 = _mm_unpackhi_epi32(a0, a1); + b3 = _mm_unpackhi_epi32(a4, a5); + b4 = _mm_unpacklo_epi32(a2, a3); + b5 = _mm_unpacklo_epi32(a6, a7); + b6 = _mm_unpackhi_epi32(a2, a3); + b7 = _mm_unpackhi_epi32(a6, a7); + + in[0] = _mm_unpacklo_epi64(b0, b1); + in[1] = _mm_unpackhi_epi64(b0, b1); + in[2] = _mm_unpacklo_epi64(b2, b3); + in[3] = _mm_unpackhi_epi64(b2, b3); + in[4] = _mm_unpacklo_epi64(b4, b5); + in[5] = _mm_unpackhi_epi64(b4, b5); + in[6] = _mm_unpacklo_epi64(b6, b7); + in[7] = _mm_unpackhi_epi64(b6, b7); + } else { + in[0] = _mm_add_epi16(a0, a4); + in[7] = _mm_add_epi16(a1, a5); + in[3] = _mm_add_epi16(a2, a6); + in[4] = _mm_add_epi16(a3, a7); + in[2] = _mm_sub_epi16(a0, a4); + in[6] = _mm_sub_epi16(a1, a5); + in[1] = _mm_sub_epi16(a2, a6); + in[5] = _mm_sub_epi16(a3, a7); + } +} + +static INLINE void hadamard_8x8_sse2(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff, + int is_final) { + __m128i src[8]; + src[0] = _mm_load_si128((const __m128i *)src_diff); + src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + + hadamard_col8_sse2(src, 0); + hadamard_col8_sse2(src, 1); + + if (is_final) { + store_tran_low(src[0], coeff); + coeff += 8; + store_tran_low(src[1], coeff); + coeff += 8; + store_tran_low(src[2], coeff); + coeff += 8; + store_tran_low(src[3], coeff); + coeff += 8; + store_tran_low(src[4], coeff); + coeff += 8; + store_tran_low(src[5], coeff); + coeff += 8; + store_tran_low(src[6], coeff); + coeff += 8; + store_tran_low(src[7], coeff); + } else { + int16_t *coeff16 = (int16_t *)coeff; + _mm_store_si128((__m128i *)coeff16, src[0]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[1]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[2]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[3]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[4]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[5]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[6]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[7]); + } +} + +void aom_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + hadamard_8x8_sse2(src_diff, src_stride, coeff, 1); +} + +void aom_hadamard_lp_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride, + int16_t *coeff) { + __m128i src[8]; + src[0] = _mm_load_si128((const __m128i *)src_diff); + src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); + + hadamard_col8_sse2(src, 0); + hadamard_col8_sse2(src, 1); + + _mm_store_si128((__m128i *)coeff, src[0]); + coeff += 8; + _mm_store_si128((__m128i *)coeff, src[1]); + coeff += 8; + _mm_store_si128((__m128i *)coeff, src[2]); + 
coeff += 8; + _mm_store_si128((__m128i *)coeff, src[3]); + coeff += 8; + _mm_store_si128((__m128i *)coeff, src[4]); + coeff += 8; + _mm_store_si128((__m128i *)coeff, src[5]); + coeff += 8; + _mm_store_si128((__m128i *)coeff, src[6]); + coeff += 8; + _mm_store_si128((__m128i *)coeff, src[7]); +} + +static INLINE void hadamard_16x16_sse2(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff, + int is_final) { + // For high bitdepths, it is unnecessary to store_tran_low + // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the + // next stage. Output to an intermediate buffer first, then store_tran_low() + // in the final stage. + DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]); + int16_t *t_coeff = temp_coeff; + int16_t *coeff16 = (int16_t *)coeff; + int idx; + for (idx = 0; idx < 4; ++idx) { + const int16_t *src_ptr = + src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8; + hadamard_8x8_sse2(src_ptr, src_stride, (tran_low_t *)(t_coeff + idx * 64), + 0); + } + + for (idx = 0; idx < 64; idx += 8) { + __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff); + __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 64)); + __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 128)); + __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 192)); + + __m128i b0 = _mm_add_epi16(coeff0, coeff1); + __m128i b1 = _mm_sub_epi16(coeff0, coeff1); + __m128i b2 = _mm_add_epi16(coeff2, coeff3); + __m128i b3 = _mm_sub_epi16(coeff2, coeff3); + + b0 = _mm_srai_epi16(b0, 1); + b1 = _mm_srai_epi16(b1, 1); + b2 = _mm_srai_epi16(b2, 1); + b3 = _mm_srai_epi16(b3, 1); + + coeff0 = _mm_add_epi16(b0, b2); + coeff1 = _mm_add_epi16(b1, b3); + coeff2 = _mm_sub_epi16(b0, b2); + coeff3 = _mm_sub_epi16(b1, b3); + + if (is_final) { + store_tran_low(coeff0, coeff); + store_tran_low(coeff1, coeff + 64); + store_tran_low(coeff2, coeff + 128); + store_tran_low(coeff3, coeff + 192); + coeff += 8; + } else { + _mm_store_si128((__m128i *)coeff16, coeff0); + _mm_store_si128((__m128i *)(coeff16 + 64), coeff1); + _mm_store_si128((__m128i *)(coeff16 + 128), coeff2); + _mm_store_si128((__m128i *)(coeff16 + 192), coeff3); + coeff16 += 8; + } + + t_coeff += 8; + } +} + +void aom_hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + hadamard_16x16_sse2(src_diff, src_stride, coeff, 1); +} + +void aom_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + // For high bitdepths, it is unnecessary to store_tran_low + // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the + // next stage. Output to an intermediate buffer first, then store_tran_low() + // in the final stage. 
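As a plain-C point of reference for the second-stage loop that follows (an illustrative sketch only, not part of the patch; the helper name is ours, and ordinary int arithmetic is used, so it ignores the wraparound and shift semantics of the 16-bit vector operations), the combine of the four 16x16 quadrant outputs can be written as:

#include <stdint.h>

typedef int32_t tran_low_t; /* assumption: matches the high-bitdepth build */

/* Merge one coefficient from each 16x16 quadrant of the int16_t scratch
 * buffer, pre-scale by 1/4 so the values stay within int16 range, and only
 * widen to tran_low_t at the final store. */
static void hadamard_32x32_second_stage(const int16_t *t_coeff,
                                        tran_low_t *coeff) {
  for (int i = 0; i < 256; ++i) {
    const int a0 = t_coeff[i];
    const int a1 = t_coeff[i + 256];
    const int a2 = t_coeff[i + 512];
    const int a3 = t_coeff[i + 768];
    const int b0 = (a0 + a1) >> 2; /* add/sub, then srai by 2 */
    const int b1 = (a0 - a1) >> 2;
    const int b2 = (a2 + a3) >> 2;
    const int b3 = (a2 - a3) >> 2;
    coeff[i] = (tran_low_t)(b0 + b2);
    coeff[i + 256] = (tran_low_t)(b1 + b3);
    coeff[i + 512] = (tran_low_t)(b0 - b2);
    coeff[i + 768] = (tran_low_t)(b1 - b3);
  }
}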
+ DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]); + int16_t *t_coeff = temp_coeff; + int idx; + for (idx = 0; idx < 4; ++idx) { + const int16_t *src_ptr = + src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; + hadamard_16x16_sse2(src_ptr, src_stride, + (tran_low_t *)(t_coeff + idx * 256), 0); + } + + for (idx = 0; idx < 256; idx += 8) { + __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff); + __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 256)); + __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 512)); + __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 768)); + + __m128i b0 = _mm_add_epi16(coeff0, coeff1); + __m128i b1 = _mm_sub_epi16(coeff0, coeff1); + __m128i b2 = _mm_add_epi16(coeff2, coeff3); + __m128i b3 = _mm_sub_epi16(coeff2, coeff3); + + b0 = _mm_srai_epi16(b0, 2); + b1 = _mm_srai_epi16(b1, 2); + b2 = _mm_srai_epi16(b2, 2); + b3 = _mm_srai_epi16(b3, 2); + + coeff0 = _mm_add_epi16(b0, b2); + coeff1 = _mm_add_epi16(b1, b3); + store_tran_low(coeff0, coeff); + store_tran_low(coeff1, coeff + 256); + + coeff2 = _mm_sub_epi16(b0, b2); + coeff3 = _mm_sub_epi16(b1, b3); + store_tran_low(coeff2, coeff + 512); + store_tran_low(coeff3, coeff + 768); + + coeff += 8; + t_coeff += 8; + } +} + +int aom_satd_sse2(const tran_low_t *coeff, int length) { + int i; + const __m128i zero = _mm_setzero_si128(); + __m128i accum = zero; + + for (i = 0; i < length; i += 8) { + const __m128i src_line = load_tran_low(coeff); + const __m128i inv = _mm_sub_epi16(zero, src_line); + const __m128i abs = _mm_max_epi16(src_line, inv); // abs(src_line) + const __m128i abs_lo = _mm_unpacklo_epi16(abs, zero); + const __m128i abs_hi = _mm_unpackhi_epi16(abs, zero); + const __m128i sum = _mm_add_epi32(abs_lo, abs_hi); + accum = _mm_add_epi32(accum, sum); + coeff += 8; + } + + { // cascading summation of accum + __m128i hi = _mm_srli_si128(accum, 8); + accum = _mm_add_epi32(accum, hi); + hi = _mm_srli_epi64(accum, 32); + accum = _mm_add_epi32(accum, hi); + } + + return _mm_cvtsi128_si32(accum); +} + +void aom_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref, + const int ref_stride, const int height) { + int idx = 1; + __m128i zero = _mm_setzero_si128(); + __m128i src_line = _mm_loadu_si128((const __m128i *)ref); + __m128i s0 = _mm_unpacklo_epi8(src_line, zero); + __m128i s1 = _mm_unpackhi_epi8(src_line, zero); + __m128i t0, t1; + int height_1 = height - 1; + ref += ref_stride; + do { + src_line = _mm_loadu_si128((const __m128i *)ref); + t0 = _mm_unpacklo_epi8(src_line, zero); + t1 = _mm_unpackhi_epi8(src_line, zero); + s0 = _mm_adds_epu16(s0, t0); + s1 = _mm_adds_epu16(s1, t1); + ref += ref_stride; + + src_line = _mm_loadu_si128((const __m128i *)ref); + t0 = _mm_unpacklo_epi8(src_line, zero); + t1 = _mm_unpackhi_epi8(src_line, zero); + s0 = _mm_adds_epu16(s0, t0); + s1 = _mm_adds_epu16(s1, t1); + ref += ref_stride; + idx += 2; + } while (idx < height_1); + + src_line = _mm_loadu_si128((const __m128i *)ref); + t0 = _mm_unpacklo_epi8(src_line, zero); + t1 = _mm_unpackhi_epi8(src_line, zero); + s0 = _mm_adds_epu16(s0, t0); + s1 = _mm_adds_epu16(s1, t1); + if (height == 128) { + s0 = _mm_srai_epi16(s0, 6); + s1 = _mm_srai_epi16(s1, 6); + } else if (height == 64) { + s0 = _mm_srai_epi16(s0, 5); + s1 = _mm_srai_epi16(s1, 5); + } else if (height == 32) { + s0 = _mm_srai_epi16(s0, 4); + s1 = _mm_srai_epi16(s1, 4); + } else { + assert(height == 16); + s0 = _mm_srai_epi16(s0, 3); + s1 = _mm_srai_epi16(s1, 3); + } + + _mm_storeu_si128((__m128i *)hbuf, s0); + hbuf += 8; + 
_mm_storeu_si128((__m128i *)hbuf, s1); +} + +int16_t aom_int_pro_col_sse2(const uint8_t *ref, const int width) { + __m128i zero = _mm_setzero_si128(); + __m128i src_line = _mm_loadu_si128((const __m128i *)ref); + __m128i s0 = _mm_sad_epu8(src_line, zero); + __m128i s1; + int i; + + for (i = 16; i < width; i += 16) { + ref += 16; + src_line = _mm_loadu_si128((const __m128i *)ref); + s1 = _mm_sad_epu8(src_line, zero); + s0 = _mm_adds_epu16(s0, s1); + } + + s1 = _mm_srli_si128(s0, 8); + s0 = _mm_adds_epu16(s0, s1); + + return _mm_extract_epi16(s0, 0); +} diff --git a/media/libaom/src/aom_dsp/x86/bitdepth_conversion_avx2.h b/media/libaom/src/aom_dsp/x86/bitdepth_conversion_avx2.h new file mode 100644 index 0000000000..85896e2768 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/bitdepth_conversion_avx2.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <immintrin.h> + +#include "config/aom_config.h" +#include "aom/aom_integer.h" +#include "aom_dsp/aom_dsp_common.h" + +static INLINE __m256i load_tran_low(const tran_low_t *a) { + const __m256i a_low = _mm256_loadu_si256((const __m256i *)a); + const __m256i a_high = _mm256_loadu_si256((const __m256i *)(a + 8)); + return _mm256_packs_epi32(a_low, a_high); +} + +static INLINE void store_tran_low(__m256i a, tran_low_t *b) { + const __m256i one = _mm256_set1_epi16(1); + const __m256i a_hi = _mm256_mulhi_epi16(a, one); + const __m256i a_lo = _mm256_mullo_epi16(a, one); + const __m256i a_1 = _mm256_unpacklo_epi16(a_lo, a_hi); + const __m256i a_2 = _mm256_unpackhi_epi16(a_lo, a_hi); + _mm256_storeu_si256((__m256i *)b, a_1); + _mm256_storeu_si256((__m256i *)(b + 8), a_2); +} diff --git a/media/libaom/src/aom_dsp/x86/bitdepth_conversion_sse2.h b/media/libaom/src/aom_dsp/x86/bitdepth_conversion_sse2.h new file mode 100644 index 0000000000..42bb2d1d32 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/bitdepth_conversion_sse2.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <xmmintrin.h> + +#include "config/aom_config.h" +#include "aom/aom_integer.h" +#include "aom_dsp/aom_dsp_common.h" + +// Load 8 16 bit values. If the source is 32 bits then pack down with +// saturation. +static INLINE __m128i load_tran_low(const tran_low_t *a) { + const __m128i a_low = _mm_load_si128((const __m128i *)a); + return _mm_packs_epi32(a_low, *(const __m128i *)(a + 4)); +} + +// Store 8 16 bit values. If the destination is 32 bits then sign extend the +// values by multiplying by 1. 
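In plain C, the trick that comment describes looks roughly like this (an illustrative sketch, not part of the header; it assumes tran_low_t is a 32-bit type and the helper name is ours). _mm_mulhi_epi16(a, 1) produces the upper 16 bits of each value's signed 32-bit extension, 0xffff for negatives and 0 otherwise, and interleaving that with _mm_mullo_epi16(a, 1), which is just the value itself, reassembles the sign-extended 32-bit result:

#include <stdint.h>

/* Scalar model: widen eight int16_t values to int32_t by pairing each value
 * (the mullo-by-1 result) with its sign word (the mulhi-by-1 result). */
static void store_tran_low_scalar(const int16_t *a, int32_t *b) {
  for (int i = 0; i < 8; ++i) {
    const uint16_t lo = (uint16_t)a[i];            /* _mm_mullo_epi16(a, one) */
    const uint16_t hi = (a[i] < 0) ? 0xffffu : 0u; /* _mm_mulhi_epi16(a, one) */
    b[i] = (int32_t)(((uint32_t)hi << 16) | lo);   /* unpacklo/unpackhi + store */
  }
}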
+static INLINE void store_tran_low(__m128i a, tran_low_t *b) { + const __m128i one = _mm_set1_epi16(1); + const __m128i a_hi = _mm_mulhi_epi16(a, one); + const __m128i a_lo = _mm_mullo_epi16(a, one); + const __m128i a_1 = _mm_unpacklo_epi16(a_lo, a_hi); + const __m128i a_2 = _mm_unpackhi_epi16(a_lo, a_hi); + _mm_store_si128((__m128i *)(b), a_1); + _mm_store_si128((__m128i *)(b + 4), a_2); +} diff --git a/media/libaom/src/aom_dsp/x86/blend_a64_hmask_sse4.c b/media/libaom/src/aom_dsp/x86/blend_a64_hmask_sse4.c index 4f5e3f8c1b..e0289abe12 100644 --- a/media/libaom/src/aom_dsp/x86/blend_a64_hmask_sse4.c +++ b/media/libaom/src/aom_dsp/x86/blend_a64_hmask_sse4.c @@ -24,6 +24,7 @@ void aom_blend_a64_hmask_sse4_1(uint8_t *dst, uint32_t dst_stride, src1_stride, mask, 0, w, h, 0, 0); } +#if CONFIG_AV1_HIGHBITDEPTH void aom_highbd_blend_a64_hmask_sse4_1( uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8, uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride, @@ -32,3 +33,4 @@ void aom_highbd_blend_a64_hmask_sse4_1( src1_8, src1_stride, mask, 0, w, h, 0, 0, bd); } +#endif diff --git a/media/libaom/src/aom_dsp/x86/blend_a64_mask_avx2.c b/media/libaom/src/aom_dsp/x86/blend_a64_mask_avx2.c index 67fb4d32bd..95383d2fd1 100644 --- a/media/libaom/src/aom_dsp/x86/blend_a64_mask_avx2.c +++ b/media/libaom/src/aom_dsp/x86/blend_a64_mask_avx2.c @@ -870,7 +870,7 @@ void aom_blend_a64_mask_avx2(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, - int h, int subx, int suby) { + int h, int subw, int subh) { assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); @@ -881,15 +881,15 @@ void aom_blend_a64_mask_avx2(uint8_t *dst, uint32_t dst_stride, if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride, - mask, mask_stride, w, h, subx, suby); + mask, mask_stride, w, h, subw, subh); } else { - if (subx & suby) { + if (subw & subh) { blend_a64_mask_sx_sy_avx2(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, w, h); - } else if (subx) { + } else if (subw) { blend_a64_mask_sx_avx2(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, w, h); - } else if (suby) { + } else if (subh) { blend_a64_mask_sy_avx2(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, w, h); } else { @@ -898,3 +898,477 @@ void aom_blend_a64_mask_avx2(uint8_t *dst, uint32_t dst_stride, } } } + +#if CONFIG_AV1_HIGHBITDEPTH +////////////////////////////////////////////////////////////////////////////// +// aom_highbd_blend_a64_d16_mask_avx2() +////////////////////////////////////////////////////////////////////////////// + +static INLINE void highbd_blend_a64_d16_mask_w4_avx2( + uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, const __m256i *mask0, + const __m256i *round_offset, int shift, const __m256i *clip_low, + const __m256i *clip_high, const __m256i *mask_max) { + // Load 4x u16 pixels from each of 4 rows from each source + const __m256i s0 = _mm256_set_epi64x(*(uint64_t *)(src0 + 3 * src0_stride), + *(uint64_t *)(src0 + 2 * src0_stride), + *(uint64_t *)(src0 + 1 * src0_stride), + *(uint64_t *)(src0 + 0 * src0_stride)); + const __m256i s1 = _mm256_set_epi64x(*(uint64_t *)(src1 + 3 * src1_stride), + *(uint64_t *)(src1 + 2 * src1_stride), + 
*(uint64_t *)(src1 + 1 * src1_stride), + *(uint64_t *)(src1 + 0 * src1_stride)); + // Generate the inverse mask + const __m256i mask1 = _mm256_sub_epi16(*mask_max, *mask0); + + // Multiply each mask by the respective source + const __m256i mul0_highs = _mm256_mulhi_epu16(*mask0, s0); + const __m256i mul0_lows = _mm256_mullo_epi16(*mask0, s0); + const __m256i mul0h = _mm256_unpackhi_epi16(mul0_lows, mul0_highs); + const __m256i mul0l = _mm256_unpacklo_epi16(mul0_lows, mul0_highs); + // Note that AVX2 unpack orders 64-bit words as [3 1] [2 0] to keep within + // lanes Later, packs does the same again which cancels this out with no need + // for a permute. The intermediate values being reordered makes no difference + + const __m256i mul1_highs = _mm256_mulhi_epu16(mask1, s1); + const __m256i mul1_lows = _mm256_mullo_epi16(mask1, s1); + const __m256i mul1h = _mm256_unpackhi_epi16(mul1_lows, mul1_highs); + const __m256i mul1l = _mm256_unpacklo_epi16(mul1_lows, mul1_highs); + + const __m256i sumh = _mm256_add_epi32(mul0h, mul1h); + const __m256i suml = _mm256_add_epi32(mul0l, mul1l); + + const __m256i roundh = + _mm256_srai_epi32(_mm256_sub_epi32(sumh, *round_offset), shift); + const __m256i roundl = + _mm256_srai_epi32(_mm256_sub_epi32(suml, *round_offset), shift); + + const __m256i pack = _mm256_packs_epi32(roundl, roundh); + const __m256i clip = + _mm256_min_epi16(_mm256_max_epi16(pack, *clip_low), *clip_high); + + // _mm256_extract_epi64 doesn't exist on x86, so do it the old-fashioned way: + const __m128i cliph = _mm256_extracti128_si256(clip, 1); + xx_storel_64(dst + 3 * dst_stride, _mm_srli_si128(cliph, 8)); + xx_storel_64(dst + 2 * dst_stride, cliph); + const __m128i clipl = _mm256_castsi256_si128(clip); + xx_storel_64(dst + 1 * dst_stride, _mm_srli_si128(clipl, 8)); + xx_storel_64(dst + 0 * dst_stride, clipl); +} + +static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w4_avx2( + uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m256i *round_offset, int shift, const __m256i *clip_low, + const __m256i *clip_high, const __m256i *mask_max) { + do { + // Load 8x u8 pixels from each of 4 rows of the mask, pad each to u16 + const __m128i mask08 = _mm_set_epi32(*(uint32_t *)(mask + 3 * mask_stride), + *(uint32_t *)(mask + 2 * mask_stride), + *(uint32_t *)(mask + 1 * mask_stride), + *(uint32_t *)(mask + 0 * mask_stride)); + const __m256i mask0 = _mm256_cvtepu8_epi16(mask08); + + highbd_blend_a64_d16_mask_w4_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, &mask0, round_offset, shift, + clip_low, clip_high, mask_max); + + dst += dst_stride * 4; + src0 += src0_stride * 4; + src1 += src1_stride * 4; + mask += mask_stride * 4; + } while (h -= 4); +} + +static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w4_avx2( + uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m256i *round_offset, int shift, const __m256i *clip_low, + const __m256i *clip_high, const __m256i *mask_max) { + const __m256i one_b = _mm256_set1_epi8(1); + const __m256i two_w = _mm256_set1_epi16(2); + do { + // Load 8 pixels from each of 8 rows of mask, + // (saturating) add together rows then use madd to add adjacent pixels + // Finally, divide each value by 4 (with rounding) + const __m256i m0246 = + 
_mm256_set_epi64x(*(uint64_t *)(mask + 6 * mask_stride), + *(uint64_t *)(mask + 4 * mask_stride), + *(uint64_t *)(mask + 2 * mask_stride), + *(uint64_t *)(mask + 0 * mask_stride)); + const __m256i m1357 = + _mm256_set_epi64x(*(uint64_t *)(mask + 7 * mask_stride), + *(uint64_t *)(mask + 5 * mask_stride), + *(uint64_t *)(mask + 3 * mask_stride), + *(uint64_t *)(mask + 1 * mask_stride)); + const __m256i addrows = _mm256_adds_epu8(m0246, m1357); + const __m256i adjacent = _mm256_maddubs_epi16(addrows, one_b); + const __m256i mask0 = + _mm256_srli_epi16(_mm256_add_epi16(adjacent, two_w), 2); + + highbd_blend_a64_d16_mask_w4_avx2(dst, dst_stride, src0, src0_stride, src1, + src1_stride, &mask0, round_offset, shift, + clip_low, clip_high, mask_max); + + dst += dst_stride * 4; + src0 += src0_stride * 4; + src1 += src1_stride * 4; + mask += mask_stride * 8; + } while (h -= 4); +} + +static INLINE void highbd_blend_a64_d16_mask_w8_avx2( + uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, const __m256i *mask0a, + const __m256i *mask0b, const __m256i *round_offset, int shift, + const __m256i *clip_low, const __m256i *clip_high, + const __m256i *mask_max) { + // Load 8x u16 pixels from each of 4 rows from each source + const __m256i s0a = + yy_loadu2_128(src0 + 0 * src0_stride, src0 + 1 * src0_stride); + const __m256i s0b = + yy_loadu2_128(src0 + 2 * src0_stride, src0 + 3 * src0_stride); + const __m256i s1a = + yy_loadu2_128(src1 + 0 * src1_stride, src1 + 1 * src1_stride); + const __m256i s1b = + yy_loadu2_128(src1 + 2 * src1_stride, src1 + 3 * src1_stride); + + // Generate inverse masks + const __m256i mask1a = _mm256_sub_epi16(*mask_max, *mask0a); + const __m256i mask1b = _mm256_sub_epi16(*mask_max, *mask0b); + + // Multiply sources by respective masks + const __m256i mul0a_highs = _mm256_mulhi_epu16(*mask0a, s0a); + const __m256i mul0a_lows = _mm256_mullo_epi16(*mask0a, s0a); + const __m256i mul0ah = _mm256_unpackhi_epi16(mul0a_lows, mul0a_highs); + const __m256i mul0al = _mm256_unpacklo_epi16(mul0a_lows, mul0a_highs); + // Note that AVX2 unpack orders 64-bit words as [3 1] [2 0] to keep within + // lanes Later, packs does the same again which cancels this out with no need + // for a permute. 
The intermediate values being reordered makes no difference + + const __m256i mul1a_highs = _mm256_mulhi_epu16(mask1a, s1a); + const __m256i mul1a_lows = _mm256_mullo_epi16(mask1a, s1a); + const __m256i mul1ah = _mm256_unpackhi_epi16(mul1a_lows, mul1a_highs); + const __m256i mul1al = _mm256_unpacklo_epi16(mul1a_lows, mul1a_highs); + + const __m256i sumah = _mm256_add_epi32(mul0ah, mul1ah); + const __m256i sumal = _mm256_add_epi32(mul0al, mul1al); + + const __m256i mul0b_highs = _mm256_mulhi_epu16(*mask0b, s0b); + const __m256i mul0b_lows = _mm256_mullo_epi16(*mask0b, s0b); + const __m256i mul0bh = _mm256_unpackhi_epi16(mul0b_lows, mul0b_highs); + const __m256i mul0bl = _mm256_unpacklo_epi16(mul0b_lows, mul0b_highs); + + const __m256i mul1b_highs = _mm256_mulhi_epu16(mask1b, s1b); + const __m256i mul1b_lows = _mm256_mullo_epi16(mask1b, s1b); + const __m256i mul1bh = _mm256_unpackhi_epi16(mul1b_lows, mul1b_highs); + const __m256i mul1bl = _mm256_unpacklo_epi16(mul1b_lows, mul1b_highs); + + const __m256i sumbh = _mm256_add_epi32(mul0bh, mul1bh); + const __m256i sumbl = _mm256_add_epi32(mul0bl, mul1bl); + + // Divide down each result, with rounding + const __m256i roundah = + _mm256_srai_epi32(_mm256_sub_epi32(sumah, *round_offset), shift); + const __m256i roundal = + _mm256_srai_epi32(_mm256_sub_epi32(sumal, *round_offset), shift); + const __m256i roundbh = + _mm256_srai_epi32(_mm256_sub_epi32(sumbh, *round_offset), shift); + const __m256i roundbl = + _mm256_srai_epi32(_mm256_sub_epi32(sumbl, *round_offset), shift); + + // Pack each i32 down to an i16 with saturation, then clip to valid range + const __m256i packa = _mm256_packs_epi32(roundal, roundah); + const __m256i clipa = + _mm256_min_epi16(_mm256_max_epi16(packa, *clip_low), *clip_high); + const __m256i packb = _mm256_packs_epi32(roundbl, roundbh); + const __m256i clipb = + _mm256_min_epi16(_mm256_max_epi16(packb, *clip_low), *clip_high); + + // Store 8x u16 pixels to each of 4 rows in the destination + yy_storeu2_128(dst + 0 * dst_stride, dst + 1 * dst_stride, clipa); + yy_storeu2_128(dst + 2 * dst_stride, dst + 3 * dst_stride, clipb); +} + +static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w8_avx2( + uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, const uint8_t *mask, + int mask_stride, int h, const __m256i *round_offset, int shift, + const __m256i *clip_low, const __m256i *clip_high, + const __m256i *mask_max) { + do { + // Load 8x u8 pixels from each of 4 rows in the mask + const __m128i mask0a8 = + _mm_set_epi64x(*(uint64_t *)mask, *(uint64_t *)(mask + mask_stride)); + const __m128i mask0b8 = + _mm_set_epi64x(*(uint64_t *)(mask + 2 * mask_stride), + *(uint64_t *)(mask + 3 * mask_stride)); + const __m256i mask0a = _mm256_cvtepu8_epi16(mask0a8); + const __m256i mask0b = _mm256_cvtepu8_epi16(mask0b8); + + highbd_blend_a64_d16_mask_w8_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask0a, &mask0b, + round_offset, shift, clip_low, clip_high, mask_max); + + dst += dst_stride * 4; + src0 += src0_stride * 4; + src1 += src1_stride * 4; + mask += mask_stride * 4; + } while (h -= 4); +} + +static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w8_avx2( + uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, const uint8_t *mask, + int mask_stride, int h, const __m256i *round_offset, int shift, + const __m256i *clip_low, const __m256i *clip_high, + const __m256i *mask_max) { + 
const __m256i one_b = _mm256_set1_epi8(1); + const __m256i two_w = _mm256_set1_epi16(2); + do { + // Load 16x u8 pixels from each of 8 rows in the mask, + // (saturating) add together rows then use madd to add adjacent pixels + // Finally, divide each value by 4 (with rounding) + const __m256i m02 = + yy_loadu2_128(mask + 0 * mask_stride, mask + 2 * mask_stride); + const __m256i m13 = + yy_loadu2_128(mask + 1 * mask_stride, mask + 3 * mask_stride); + const __m256i m0123 = + _mm256_maddubs_epi16(_mm256_adds_epu8(m02, m13), one_b); + const __m256i mask_0a = + _mm256_srli_epi16(_mm256_add_epi16(m0123, two_w), 2); + const __m256i m46 = + yy_loadu2_128(mask + 4 * mask_stride, mask + 6 * mask_stride); + const __m256i m57 = + yy_loadu2_128(mask + 5 * mask_stride, mask + 7 * mask_stride); + const __m256i m4567 = + _mm256_maddubs_epi16(_mm256_adds_epu8(m46, m57), one_b); + const __m256i mask_0b = + _mm256_srli_epi16(_mm256_add_epi16(m4567, two_w), 2); + + highbd_blend_a64_d16_mask_w8_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask_0a, + &mask_0b, round_offset, shift, clip_low, clip_high, mask_max); + + dst += dst_stride * 4; + src0 += src0_stride * 4; + src1 += src1_stride * 4; + mask += mask_stride * 8; + } while (h -= 4); +} + +static INLINE void highbd_blend_a64_d16_mask_w16_avx2( + uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, const __m256i *mask0a, + const __m256i *mask0b, const __m256i *round_offset, int shift, + const __m256i *clip_low, const __m256i *clip_high, + const __m256i *mask_max) { + // Load 16x pixels from each of 2 rows from each source + const __m256i s0a = yy_loadu_256(src0); + const __m256i s0b = yy_loadu_256(src0 + src0_stride); + const __m256i s1a = yy_loadu_256(src1); + const __m256i s1b = yy_loadu_256(src1 + src1_stride); + + // Calculate inverse masks + const __m256i mask1a = _mm256_sub_epi16(*mask_max, *mask0a); + const __m256i mask1b = _mm256_sub_epi16(*mask_max, *mask0b); + + // Multiply each source by appropriate mask + const __m256i mul0a_highs = _mm256_mulhi_epu16(*mask0a, s0a); + const __m256i mul0a_lows = _mm256_mullo_epi16(*mask0a, s0a); + const __m256i mul0ah = _mm256_unpackhi_epi16(mul0a_lows, mul0a_highs); + const __m256i mul0al = _mm256_unpacklo_epi16(mul0a_lows, mul0a_highs); + // Note that AVX2 unpack orders 64-bit words as [3 1] [2 0] to keep within + // lanes Later, packs does the same again which cancels this out with no need + // for a permute. 
The intermediate values being reordered makes no difference + + const __m256i mul1a_highs = _mm256_mulhi_epu16(mask1a, s1a); + const __m256i mul1a_lows = _mm256_mullo_epi16(mask1a, s1a); + const __m256i mul1ah = _mm256_unpackhi_epi16(mul1a_lows, mul1a_highs); + const __m256i mul1al = _mm256_unpacklo_epi16(mul1a_lows, mul1a_highs); + + const __m256i mulah = _mm256_add_epi32(mul0ah, mul1ah); + const __m256i mulal = _mm256_add_epi32(mul0al, mul1al); + + const __m256i mul0b_highs = _mm256_mulhi_epu16(*mask0b, s0b); + const __m256i mul0b_lows = _mm256_mullo_epi16(*mask0b, s0b); + const __m256i mul0bh = _mm256_unpackhi_epi16(mul0b_lows, mul0b_highs); + const __m256i mul0bl = _mm256_unpacklo_epi16(mul0b_lows, mul0b_highs); + + const __m256i mul1b_highs = _mm256_mulhi_epu16(mask1b, s1b); + const __m256i mul1b_lows = _mm256_mullo_epi16(mask1b, s1b); + const __m256i mul1bh = _mm256_unpackhi_epi16(mul1b_lows, mul1b_highs); + const __m256i mul1bl = _mm256_unpacklo_epi16(mul1b_lows, mul1b_highs); + + const __m256i mulbh = _mm256_add_epi32(mul0bh, mul1bh); + const __m256i mulbl = _mm256_add_epi32(mul0bl, mul1bl); + + const __m256i resah = + _mm256_srai_epi32(_mm256_sub_epi32(mulah, *round_offset), shift); + const __m256i resal = + _mm256_srai_epi32(_mm256_sub_epi32(mulal, *round_offset), shift); + const __m256i resbh = + _mm256_srai_epi32(_mm256_sub_epi32(mulbh, *round_offset), shift); + const __m256i resbl = + _mm256_srai_epi32(_mm256_sub_epi32(mulbl, *round_offset), shift); + + // Signed saturating pack from i32 to i16: + const __m256i packa = _mm256_packs_epi32(resal, resah); + const __m256i packb = _mm256_packs_epi32(resbl, resbh); + + // Clip the values to the valid range + const __m256i clipa = + _mm256_min_epi16(_mm256_max_epi16(packa, *clip_low), *clip_high); + const __m256i clipb = + _mm256_min_epi16(_mm256_max_epi16(packb, *clip_low), *clip_high); + + // Store 16 pixels + yy_storeu_256(dst, clipa); + yy_storeu_256(dst + dst_stride, clipb); +} + +static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w16_avx2( + uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, const uint8_t *mask, + int mask_stride, int h, int w, const __m256i *round_offset, int shift, + const __m256i *clip_low, const __m256i *clip_high, + const __m256i *mask_max) { + for (int i = 0; i < h; i += 2) { + for (int j = 0; j < w; j += 16) { + // Load 16x u8 alpha-mask values from each of two rows and pad to u16 + const __m128i masks_a8 = xx_loadu_128(mask + j); + const __m128i masks_b8 = xx_loadu_128(mask + mask_stride + j); + const __m256i mask0a = _mm256_cvtepu8_epi16(masks_a8); + const __m256i mask0b = _mm256_cvtepu8_epi16(masks_b8); + + highbd_blend_a64_d16_mask_w16_avx2( + dst + j, dst_stride, src0 + j, src0_stride, src1 + j, src1_stride, + &mask0a, &mask0b, round_offset, shift, clip_low, clip_high, mask_max); + } + dst += dst_stride * 2; + src0 += src0_stride * 2; + src1 += src1_stride * 2; + mask += mask_stride * 2; + } +} + +static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w16_avx2( + uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, const uint8_t *mask, + int mask_stride, int h, int w, const __m256i *round_offset, int shift, + const __m256i *clip_low, const __m256i *clip_high, + const __m256i *mask_max) { + const __m256i one_b = _mm256_set1_epi8(1); + const __m256i two_w = _mm256_set1_epi16(2); + for (int i = 0; i < h; i += 2) { + for (int j = 0; j < w; j += 16) { + // Load 
32x u8 alpha-mask values from each of four rows + // (saturating) add pairs of rows, then use madd to add adjacent values + // Finally, divide down each result with rounding + const __m256i m0 = yy_loadu_256(mask + 0 * mask_stride + 2 * j); + const __m256i m1 = yy_loadu_256(mask + 1 * mask_stride + 2 * j); + const __m256i m2 = yy_loadu_256(mask + 2 * mask_stride + 2 * j); + const __m256i m3 = yy_loadu_256(mask + 3 * mask_stride + 2 * j); + + const __m256i m01_8 = _mm256_adds_epu8(m0, m1); + const __m256i m23_8 = _mm256_adds_epu8(m2, m3); + + const __m256i m01 = _mm256_maddubs_epi16(m01_8, one_b); + const __m256i m23 = _mm256_maddubs_epi16(m23_8, one_b); + + const __m256i mask0a = _mm256_srli_epi16(_mm256_add_epi16(m01, two_w), 2); + const __m256i mask0b = _mm256_srli_epi16(_mm256_add_epi16(m23, two_w), 2); + + highbd_blend_a64_d16_mask_w16_avx2( + dst + j, dst_stride, src0 + j, src0_stride, src1 + j, src1_stride, + &mask0a, &mask0b, round_offset, shift, clip_low, clip_high, mask_max); + } + dst += dst_stride * 2; + src0 += src0_stride * 2; + src1 += src1_stride * 2; + mask += mask_stride * 4; + } +} + +void aom_highbd_blend_a64_d16_mask_avx2( + uint8_t *dst8, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, + ConvolveParams *conv_params, const int bd) { + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int32_t round_offset = + ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) - + (1 << (round_bits - 1))) + << AOM_BLEND_A64_ROUND_BITS; + const __m256i v_round_offset = _mm256_set1_epi32(round_offset); + const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS; + + const __m256i clip_low = _mm256_set1_epi16(0); + const __m256i clip_high = _mm256_set1_epi16((1 << bd) - 1); + const __m256i mask_max = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride)); + + assert(h >= 4); + assert(w >= 4); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + if (subw == 0 && subh == 0) { + switch (w) { + case 4: + highbd_blend_a64_d16_mask_subw0_subh0_w4_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + case 8: + highbd_blend_a64_d16_mask_subw0_subh0_w8_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + default: // >= 16 + highbd_blend_a64_d16_mask_subw0_subh0_w16_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + } + + } else if (subw == 1 && subh == 1) { + switch (w) { + case 4: + highbd_blend_a64_d16_mask_subw1_subh1_w4_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + case 8: + highbd_blend_a64_d16_mask_subw1_subh1_w8_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + default: // >= 16 + highbd_blend_a64_d16_mask_subw1_subh1_w16_avx2( + dst, dst_stride, src0, src0_stride, src1, src1_stride, 
mask, + mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + } + } else { + // Sub-sampling in only one axis doesn't seem to happen very much, so fall + // back to the vanilla C implementation instead of having all the optimised + // code for these. + aom_highbd_blend_a64_d16_mask_c(dst8, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, subw, + subh, conv_params, bd); + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/media/libaom/src/aom_dsp/x86/blend_a64_mask_sse4.c b/media/libaom/src/aom_dsp/x86/blend_a64_mask_sse4.c index 9d6b4c2f74..4a368ef947 100644 --- a/media/libaom/src/aom_dsp/x86/blend_a64_mask_sse4.c +++ b/media/libaom/src/aom_dsp/x86/blend_a64_mask_sse4.c @@ -339,8 +339,8 @@ static void blend_a64_mask_sx_sy_w16n_sse4_1( uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, - 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + const __m128i v_zmask_b = + _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1); const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); do { @@ -386,7 +386,7 @@ void aom_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, - int h, int subx, int suby) { + int h, int subw, int subh) { typedef void (*blend_fn)( uint8_t * dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, @@ -415,14 +415,15 @@ void aom_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride, if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride, - mask, mask_stride, w, h, subx, suby); + mask, mask_stride, w, h, subw, subh); } else { - blend[(w >> 2) & 3][subx != 0][suby != 0](dst, dst_stride, src0, + blend[(w >> 2) & 3][subw != 0][subh != 0](dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, w, h); } } +#if CONFIG_AV1_HIGHBITDEPTH ////////////////////////////////////////////////////////////////////////////// // No sub-sampling ////////////////////////////////////////////////////////////////////////////// @@ -518,8 +519,8 @@ static INLINE void blend_a64_mask_bn_sx_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, - 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + const __m128i v_zmask_b = + _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1); const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); do { @@ -565,8 +566,8 @@ static INLINE void blend_a64_mask_bn_sx_w8n_sse4_1( uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, blend_unit_fn blend) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, - 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + const __m128i v_zmask_b = + _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1); const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); 
do { @@ -710,8 +711,8 @@ static INLINE void blend_a64_mask_bn_sx_sy_w4_sse4_1( uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, - 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + const __m128i v_zmask_b = + _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1); const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); do { @@ -762,8 +763,8 @@ static INLINE void blend_a64_mask_bn_sx_sy_w8n_sse4_1( uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, blend_unit_fn blend) { - const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0, - 0xff, 0, 0xff, 0, 0xff, 0, 0xff); + const __m128i v_zmask_b = + _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1); const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); do { @@ -812,20 +813,19 @@ static void blend_a64_mask_b12_sx_sy_w8n_sse4_1( ////////////////////////////////////////////////////////////////////////////// // Dispatch ////////////////////////////////////////////////////////////////////////////// - void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8, uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, - int subx, int suby, int bd) { + int subw, int subh, int bd) { typedef void (*blend_fn)( uint16_t * dst, uint32_t dst_stride, const uint16_t *src0, uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h); - // Dimensions are: bd_index X width_index X subx X suby + // Dimensions are: bd_index X width_index X subw X subh static const blend_fn blend[2][2][2][2] = { { // bd == 8 or 10 { // w % 8 == 0 @@ -858,18 +858,19 @@ void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride, assert(bd == 8 || bd == 10 || bd == 12); if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) aom_highbd_blend_a64_mask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8, - src1_stride, mask, mask_stride, w, h, subx, - suby, bd); + src1_stride, mask, mask_stride, w, h, subw, + subh, bd); } else { uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8); const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8); const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8); - blend[bd == 12][(w >> 2) & 1][subx != 0][suby != 0]( + blend[bd == 12][(w >> 2) & 1][subw != 0][subh != 0]( dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, mask_stride, w, h); } } +#endif // CONFIG_AV1_HIGHBITDEPTH static INLINE void blend_a64_d16_mask_w16_sse41( uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, @@ -1107,3 +1108,453 @@ void aom_lowbd_blend_a64_d16_mask_sse4_1( } } } + +////////////////////////////////////////////////////////////////////////////// +// aom_highbd_blend_a64_d16_mask_sse4_1() +////////////////////////////////////////////////////////////////////////////// +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE void highbd_blend_a64_d16_mask_w4_sse4_1( + uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, const __m128i *mask0a, + const __m128i *mask0b, const __m128i *round_offset, int shift, + const __m128i *clip_low, const __m128i *clip_high, + const 
__m128i *mask_max) { + // Load 4 pixels from each of 4 rows from each source + const __m128i s0a = + _mm_set_epi64x(*(uint64_t *)src0, *(uint64_t *)(src0 + src0_stride)); + const __m128i s0b = _mm_set_epi64x(*(uint64_t *)(src0 + 2 * src0_stride), + *(uint64_t *)(src0 + 3 * src0_stride)); + const __m128i s1a = + _mm_set_epi64x(*(uint64_t *)(src1), *(uint64_t *)(src1 + src1_stride)); + const __m128i s1b = _mm_set_epi64x(*(uint64_t *)(src1 + 2 * src1_stride), + *(uint64_t *)(src1 + 3 * src1_stride)); + + // Generate the inverse masks + const __m128i mask1a = _mm_sub_epi16(*mask_max, *mask0a); + const __m128i mask1b = _mm_sub_epi16(*mask_max, *mask0b); + + // Multiply each mask by the respective source + const __m128i mul0a_highs = _mm_mulhi_epu16(*mask0a, s0a); + const __m128i mul0a_lows = _mm_mullo_epi16(*mask0a, s0a); + const __m128i mul0ah = _mm_unpackhi_epi16(mul0a_lows, mul0a_highs); + const __m128i mul0al = _mm_unpacklo_epi16(mul0a_lows, mul0a_highs); + const __m128i mul1a_highs = _mm_mulhi_epu16(mask1a, s1a); + const __m128i mul1a_lows = _mm_mullo_epi16(mask1a, s1a); + const __m128i mul1ah = _mm_unpackhi_epi16(mul1a_lows, mul1a_highs); + const __m128i mul1al = _mm_unpacklo_epi16(mul1a_lows, mul1a_highs); + + const __m128i mul0b_highs = _mm_mulhi_epu16(*mask0b, s0b); + const __m128i mul0b_lows = _mm_mullo_epi16(*mask0b, s0b); + const __m128i mul0bh = _mm_unpackhi_epi16(mul0b_lows, mul0b_highs); + const __m128i mul0bl = _mm_unpacklo_epi16(mul0b_lows, mul0b_highs); + const __m128i mul1b_highs = _mm_mulhi_epu16(mask1b, s1b); + const __m128i mul1b_lows = _mm_mullo_epi16(mask1b, s1b); + const __m128i mul1bh = _mm_unpackhi_epi16(mul1b_lows, mul1b_highs); + const __m128i mul1bl = _mm_unpacklo_epi16(mul1b_lows, mul1b_highs); + + const __m128i sumah = _mm_add_epi32(mul0ah, mul1ah); + const __m128i sumal = _mm_add_epi32(mul0al, mul1al); + const __m128i sumbh = _mm_add_epi32(mul0bh, mul1bh); + const __m128i sumbl = _mm_add_epi32(mul0bl, mul1bl); + + const __m128i roundah = + _mm_srai_epi32(_mm_sub_epi32(sumah, *round_offset), shift); + const __m128i roundbh = + _mm_srai_epi32(_mm_sub_epi32(sumbh, *round_offset), shift); + const __m128i roundal = + _mm_srai_epi32(_mm_sub_epi32(sumal, *round_offset), shift); + const __m128i roundbl = + _mm_srai_epi32(_mm_sub_epi32(sumbl, *round_offset), shift); + + const __m128i packa = _mm_packs_epi32(roundal, roundah); + const __m128i packb = _mm_packs_epi32(roundbl, roundbh); + + const __m128i clipa = + _mm_min_epi16(_mm_max_epi16(packa, *clip_low), *clip_high); + const __m128i clipb = + _mm_min_epi16(_mm_max_epi16(packb, *clip_low), *clip_high); + + xx_storel_64(dst, _mm_srli_si128(clipa, 8)); + xx_storel_64(dst + dst_stride, clipa); + xx_storel_64(dst + 2 * dst_stride, _mm_srli_si128(clipb, 8)); + xx_storel_64(dst + 3 * dst_stride, clipb); +} + +static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift, const __m128i *clip_low, + const __m128i *clip_high, const __m128i *mask_max) { + do { + const __m128i mask0a8 = _mm_set_epi32(0, 0, *(uint32_t *)mask, + *(uint32_t *)(mask + mask_stride)); + const __m128i mask0b8 = + _mm_set_epi32(0, 0, *(uint32_t *)(mask + 2 * mask_stride), + *(uint32_t *)(mask + 3 * mask_stride)); + const __m128i mask0a = _mm_cvtepu8_epi16(mask0a8); + const __m128i mask0b = _mm_cvtepu8_epi16(mask0b8); + + 
highbd_blend_a64_d16_mask_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask0a, &mask0b, + round_offset, shift, clip_low, clip_high, mask_max); + + dst += dst_stride * 4; + src0 += src0_stride * 4; + src1 += src1_stride * 4; + mask += mask_stride * 4; + } while (h -= 4); +} + +static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1( + uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift, const __m128i *clip_low, + const __m128i *clip_high, const __m128i *mask_max) { + const __m128i one_b = _mm_set1_epi8(1); + const __m128i two_w = _mm_set1_epi16(2); + do { + // Load 8 pixels from each of 8 rows of mask, + // (saturating) add together rows then use madd to add adjacent pixels + // Finally, divide each value by 4 (with rounding) + const __m128i m02 = _mm_set_epi64x(*(uint64_t *)(mask), + *(uint64_t *)(mask + 2 * mask_stride)); + const __m128i m13 = _mm_set_epi64x(*(uint64_t *)(mask + mask_stride), + *(uint64_t *)(mask + 3 * mask_stride)); + const __m128i m0123 = _mm_maddubs_epi16(_mm_adds_epu8(m02, m13), one_b); + const __m128i mask_0a = _mm_srli_epi16(_mm_add_epi16(m0123, two_w), 2); + const __m128i m46 = _mm_set_epi64x(*(uint64_t *)(mask + 4 * mask_stride), + *(uint64_t *)(mask + 6 * mask_stride)); + const __m128i m57 = _mm_set_epi64x(*(uint64_t *)(mask + 5 * mask_stride), + *(uint64_t *)(mask + 7 * mask_stride)); + const __m128i m4567 = _mm_maddubs_epi16(_mm_adds_epu8(m46, m57), one_b); + const __m128i mask_0b = _mm_srli_epi16(_mm_add_epi16(m4567, two_w), 2); + + highbd_blend_a64_d16_mask_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask_0a, + &mask_0b, round_offset, shift, clip_low, clip_high, mask_max); + + dst += dst_stride * 4; + src0 += src0_stride * 4; + src1 += src1_stride * 4; + mask += mask_stride * 8; + } while (h -= 4); +} + +static INLINE void highbd_blend_a64_d16_mask_w8_sse4_1( + uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, + const CONV_BUF_TYPE *src1, int src1_stride, const __m128i *mask0a, + const __m128i *mask0b, const __m128i *round_offset, int shift, + const __m128i *clip_low, const __m128i *clip_high, + const __m128i *max_mask) { + // Load 8x pixels from each of 2 rows from each source + const __m128i s0a = xx_loadu_128(src0); + const __m128i s0b = xx_loadu_128(src0 + src0_stride); + const __m128i s1a = xx_loadu_128(src1); + const __m128i s1b = xx_loadu_128(src1 + src1_stride); + + // Generate inverse masks + const __m128i mask1a = _mm_sub_epi16(*max_mask, *mask0a); + const __m128i mask1b = _mm_sub_epi16(*max_mask, *mask0b); + + // Multiply sources by respective masks + const __m128i mul0a_highs = _mm_mulhi_epu16(*mask0a, s0a); + const __m128i mul0a_lows = _mm_mullo_epi16(*mask0a, s0a); + const __m128i mul0ah = _mm_unpackhi_epi16(mul0a_lows, mul0a_highs); + const __m128i mul0al = _mm_unpacklo_epi16(mul0a_lows, mul0a_highs); + + const __m128i mul1a_highs = _mm_mulhi_epu16(mask1a, s1a); + const __m128i mul1a_lows = _mm_mullo_epi16(mask1a, s1a); + const __m128i mul1ah = _mm_unpackhi_epi16(mul1a_lows, mul1a_highs); + const __m128i mul1al = _mm_unpacklo_epi16(mul1a_lows, mul1a_highs); + + const __m128i sumah = _mm_add_epi32(mul0ah, mul1ah); + const __m128i sumal = _mm_add_epi32(mul0al, mul1al); + + const __m128i mul0b_highs = _mm_mulhi_epu16(*mask0b, s0b); + const __m128i mul0b_lows = _mm_mullo_epi16(*mask0b, s0b); 
+ const __m128i mul0bh = _mm_unpackhi_epi16(mul0b_lows, mul0b_highs); + const __m128i mul0bl = _mm_unpacklo_epi16(mul0b_lows, mul0b_highs); + + const __m128i mul1b_highs = _mm_mulhi_epu16(mask1b, s1b); + const __m128i mul1b_lows = _mm_mullo_epi16(mask1b, s1b); + const __m128i mul1bh = _mm_unpackhi_epi16(mul1b_lows, mul1b_highs); + const __m128i mul1bl = _mm_unpacklo_epi16(mul1b_lows, mul1b_highs); + + const __m128i sumbh = _mm_add_epi32(mul0bh, mul1bh); + const __m128i sumbl = _mm_add_epi32(mul0bl, mul1bl); + + const __m128i roundah = + _mm_srai_epi32(_mm_sub_epi32(sumah, *round_offset), shift); + const __m128i roundal = + _mm_srai_epi32(_mm_sub_epi32(sumal, *round_offset), shift); + const __m128i roundbh = + _mm_srai_epi32(_mm_sub_epi32(sumbh, *round_offset), shift); + const __m128i roundbl = + _mm_srai_epi32(_mm_sub_epi32(sumbl, *round_offset), shift); + + const __m128i packa = _mm_packs_epi32(roundal, roundah); + const __m128i clipa = + _mm_min_epi16(_mm_max_epi16(packa, *clip_low), *clip_high); + const __m128i packb = _mm_packs_epi32(roundbl, roundbh); + const __m128i clipb = + _mm_min_epi16(_mm_max_epi16(packb, *clip_low), *clip_high); + + xx_storeu_128(dst, clipa); + xx_storeu_128(dst + dst_stride, clipb); +} + +static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1( + uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift, const __m128i *clip_low, + const __m128i *clip_high, const __m128i *max_mask) { + do { + const __m128i mask0a = _mm_cvtepu8_epi16(xx_loadl_64(mask)); + const __m128i mask0b = _mm_cvtepu8_epi16(xx_loadl_64(mask + mask_stride)); + highbd_blend_a64_d16_mask_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask0a, &mask0b, + round_offset, shift, clip_low, clip_high, max_mask); + + dst += dst_stride * 2; + src0 += src0_stride * 2; + src1 += src1_stride * 2; + mask += mask_stride * 2; + } while (h -= 2); +} + +static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1( + uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, + const __m128i *round_offset, int shift, const __m128i *clip_low, + const __m128i *clip_high, const __m128i *max_mask) { + const __m128i one_b = _mm_set1_epi8(1); + const __m128i two_w = _mm_set1_epi16(2); + do { + const __m128i mask_thisrowa = xx_loadu_128(mask); + const __m128i mask_nextrowa = xx_loadu_128(mask + mask_stride); + const __m128i mask_thisrowb = xx_loadu_128(mask + 2 * mask_stride); + const __m128i mask_nextrowb = xx_loadu_128(mask + 3 * mask_stride); + const __m128i mask_bothrowsa = _mm_adds_epu8(mask_thisrowa, mask_nextrowa); + const __m128i mask_bothrowsb = _mm_adds_epu8(mask_thisrowb, mask_nextrowb); + const __m128i mask_16a = _mm_maddubs_epi16(mask_bothrowsa, one_b); + const __m128i mask_16b = _mm_maddubs_epi16(mask_bothrowsb, one_b); + const __m128i mask_sa = _mm_srli_epi16(_mm_add_epi16(mask_16a, two_w), 2); + const __m128i mask_sb = _mm_srli_epi16(_mm_add_epi16(mask_16b, two_w), 2); + + highbd_blend_a64_d16_mask_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask_sa, + &mask_sb, round_offset, shift, clip_low, clip_high, max_mask); + + dst += dst_stride * 2; + src0 += src0_stride * 2; + src1 += src1_stride * 2; + mask += mask_stride * 4; + } while (h -= 2); +} + 
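The w4 and w8 kernels above and the w16 kernel below all reduce to the same per-pixel operation; a scalar model of it (illustrative only, not part of the patch, with the helper name ours and C's right shift standing in for _mm_srai_epi32) is:

#include <stdint.h>

/* Per-pixel model of the highbd d16 mask blend: weight the two CONV_BUF_TYPE
 * samples by a 0..64 alpha, remove the compound round offset, shift down and
 * clamp to the valid pixel range for the given bit depth. */
static uint16_t highbd_d16_blend_pixel(uint16_t s0, uint16_t s1, uint8_t m,
                                       int32_t round_offset, int shift,
                                       int bd) {
  const int32_t sum = m * s0 + (64 - m) * s1;  /* 64 == AOM_BLEND_A64_MAX_ALPHA */
  int32_t res = (sum - round_offset) >> shift; /* _mm_sub_epi32 + _mm_srai_epi32 */
  if (res < 0) res = 0;                        /* clip_low */
  if (res > (1 << bd) - 1) res = (1 << bd) - 1; /* clip_high */
  return (uint16_t)res;
}

The subw1/subh1 variants additionally average each 2x2 block of the alpha mask before this step: rows are added with _mm_adds_epu8, _mm_maddubs_epi16 against a vector of ones sums adjacent columns, and the add of 2 followed by a right shift by 2 rounds the result.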
+static INLINE void highbd_blend_a64_d16_mask_w16_sse4_1( + uint16_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, + const __m128i *round_offset, int shift, const __m128i *mask0l, + const __m128i *mask0h, const __m128i *clip_low, const __m128i *clip_high, + const __m128i *mask_max) { + // Load 16x u16 pixels for this row from each src + const __m128i s0l = xx_loadu_128(src0); + const __m128i s0h = xx_loadu_128(src0 + 8); + const __m128i s1l = xx_loadu_128(src1); + const __m128i s1h = xx_loadu_128(src1 + 8); + + // Calculate inverse masks + const __m128i mask1h = _mm_sub_epi16(*mask_max, *mask0h); + const __m128i mask1l = _mm_sub_epi16(*mask_max, *mask0l); + + const __m128i mul0_highs = _mm_mulhi_epu16(*mask0h, s0h); + const __m128i mul0_lows = _mm_mullo_epi16(*mask0h, s0h); + const __m128i mul0h = _mm_unpackhi_epi16(mul0_lows, mul0_highs); + const __m128i mul0l = _mm_unpacklo_epi16(mul0_lows, mul0_highs); + + const __m128i mul1_highs = _mm_mulhi_epu16(mask1h, s1h); + const __m128i mul1_lows = _mm_mullo_epi16(mask1h, s1h); + const __m128i mul1h = _mm_unpackhi_epi16(mul1_lows, mul1_highs); + const __m128i mul1l = _mm_unpacklo_epi16(mul1_lows, mul1_highs); + + const __m128i mulhh = _mm_add_epi32(mul0h, mul1h); + const __m128i mulhl = _mm_add_epi32(mul0l, mul1l); + + const __m128i mul2_highs = _mm_mulhi_epu16(*mask0l, s0l); + const __m128i mul2_lows = _mm_mullo_epi16(*mask0l, s0l); + const __m128i mul2h = _mm_unpackhi_epi16(mul2_lows, mul2_highs); + const __m128i mul2l = _mm_unpacklo_epi16(mul2_lows, mul2_highs); + + const __m128i mul3_highs = _mm_mulhi_epu16(mask1l, s1l); + const __m128i mul3_lows = _mm_mullo_epi16(mask1l, s1l); + const __m128i mul3h = _mm_unpackhi_epi16(mul3_lows, mul3_highs); + const __m128i mul3l = _mm_unpacklo_epi16(mul3_lows, mul3_highs); + + const __m128i mullh = _mm_add_epi32(mul2h, mul3h); + const __m128i mulll = _mm_add_epi32(mul2l, mul3l); + + const __m128i reshh = + _mm_srai_epi32(_mm_sub_epi32(mulhh, *round_offset), shift); + const __m128i reshl = + _mm_srai_epi32(_mm_sub_epi32(mulhl, *round_offset), shift); + const __m128i reslh = + _mm_srai_epi32(_mm_sub_epi32(mullh, *round_offset), shift); + const __m128i resll = + _mm_srai_epi32(_mm_sub_epi32(mulll, *round_offset), shift); + + // Signed saturating pack from i32 to i16: + const __m128i packh = _mm_packs_epi32(reshl, reshh); + const __m128i packl = _mm_packs_epi32(resll, reslh); + + // Clip the values to the valid range + const __m128i cliph = + _mm_min_epi16(_mm_max_epi16(packh, *clip_low), *clip_high); + const __m128i clipl = + _mm_min_epi16(_mm_max_epi16(packl, *clip_low), *clip_high); + + // Store 16 pixels + xx_storeu_128(dst, clipl); + xx_storeu_128(dst + 8, cliph); +} + +static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1( + uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m128i *round_offset, int shift, const __m128i *clip_low, + const __m128i *clip_high, const __m128i *mask_max) { + for (int i = 0; i < h; i++) { + for (int j = 0; j < w; j += 16) { + // Load 16x u8 alpha-mask values and pad to u16 + const __m128i masks_u8 = xx_loadu_128(mask + j); + const __m128i mask0l = _mm_cvtepu8_epi16(masks_u8); + const __m128i mask0h = _mm_cvtepu8_epi16(_mm_srli_si128(masks_u8, 8)); + + highbd_blend_a64_d16_mask_w16_sse4_1( + dst + j, src0 + j, src1 + j, round_offset, shift, &mask0l, &mask0h, + clip_low, clip_high, mask_max); + } + 
dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride; + } +} + +static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1( + uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int h, int w, + const __m128i *round_offset, int shift, const __m128i *clip_low, + const __m128i *clip_high, const __m128i *mask_max) { + const __m128i one_b = _mm_set1_epi8(1); + const __m128i two_w = _mm_set1_epi16(2); + for (int i = 0; i < h; i++) { + for (int j = 0; j < w; j += 16) { + const __m128i m_i00 = xx_loadu_128(mask + 2 * j); + const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16); + const __m128i m_i10 = xx_loadu_128(mask + mask_stride + 2 * j); + const __m128i m_i11 = xx_loadu_128(mask + mask_stride + 2 * j + 16); + + const __m128i m0_ac = _mm_adds_epu8(m_i00, m_i10); + const __m128i m1_ac = _mm_adds_epu8(m_i01, m_i11); + const __m128i m0_acbd = _mm_maddubs_epi16(m0_ac, one_b); + const __m128i m1_acbd = _mm_maddubs_epi16(m1_ac, one_b); + const __m128i mask_l = _mm_srli_epi16(_mm_add_epi16(m0_acbd, two_w), 2); + const __m128i mask_h = _mm_srli_epi16(_mm_add_epi16(m1_acbd, two_w), 2); + + highbd_blend_a64_d16_mask_w16_sse4_1( + dst + j, src0 + j, src1 + j, round_offset, shift, &mask_l, &mask_h, + clip_low, clip_high, mask_max); + } + dst += dst_stride; + src0 += src0_stride; + src1 += src1_stride; + mask += mask_stride * 2; + } +} + +void aom_highbd_blend_a64_d16_mask_sse4_1( + uint8_t *dst8, uint32_t dst_stride, const CONV_BUF_TYPE *src0, + uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, + const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, + ConvolveParams *conv_params, const int bd) { + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + const int round_bits = + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; + const int32_t round_offset = + ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) - + (1 << (round_bits - 1))) + << AOM_BLEND_A64_ROUND_BITS; + const __m128i v_round_offset = _mm_set1_epi32(round_offset); + const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS; + + const __m128i clip_low = _mm_set1_epi16(0); + const __m128i clip_high = _mm_set1_epi16((1 << bd) - 1); + const __m128i mask_max = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); + + assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride)); + assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride)); + + assert(h >= 4); + assert(w >= 4); + assert(IS_POWER_OF_TWO(h)); + assert(IS_POWER_OF_TWO(w)); + + if (subw == 0 && subh == 0) { + switch (w) { + case 4: + highbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + case 8: + highbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + default: // >=16 + highbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + } + + } else if (subw == 1 && subh == 1) { + switch (w) { + case 4: + highbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, 
shift, &clip_low, &clip_high, + &mask_max); + break; + case 8: + highbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + default: // >=16 + highbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1( + dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, + mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high, + &mask_max); + break; + } + } else { + // Sub-sampling in only one axis doesn't seem to happen very much, so fall + // back to the vanilla C implementation instead of having all the optimised + // code for these. + aom_highbd_blend_a64_d16_mask_c(dst8, dst_stride, src0, src0_stride, src1, + src1_stride, mask, mask_stride, w, h, subw, + subh, conv_params, bd); + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/media/libaom/src/aom_dsp/x86/blend_a64_vmask_sse4.c b/media/libaom/src/aom_dsp/x86/blend_a64_vmask_sse4.c index 0649102325..75fb1c5a94 100644 --- a/media/libaom/src/aom_dsp/x86/blend_a64_vmask_sse4.c +++ b/media/libaom/src/aom_dsp/x86/blend_a64_vmask_sse4.c @@ -143,6 +143,7 @@ void aom_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride, h); } +#if CONFIG_AV1_HIGHBITDEPTH ////////////////////////////////////////////////////////////////////////////// // Implementation - No sub-sampling ////////////////////////////////////////////////////////////////////////////// @@ -281,3 +282,4 @@ void aom_highbd_blend_a64_vmask_sse4_1( src1_stride, mask, w, h); } } +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/media/libaom/src/aom_dsp/x86/blk_sse_sum_avx2.c b/media/libaom/src/aom_dsp/x86/blk_sse_sum_avx2.c new file mode 100644 index 0000000000..f7c0eb0370 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/blk_sse_sum_avx2.c @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <immintrin.h> + +#include "config/aom_dsp_rtcd.h" + +static INLINE void accumulate_sse_sum(__m256i regx_sum, __m256i regx2_sum, + int *x_sum, int64_t *x2_sum) { + __m256i sum_buffer, sse_buffer; + __m128i out_buffer; + + // Accumulate the various elements of register into first element. 
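+  // The permute below swaps the 128-bit halves of the accumulator so adding
+  // them folds the upper half into the lower one; the srli_si256/add steps
+  // then fold the remaining lanes, leaving the complete 32-bit sum in lane 0
+  // and, for the 64-bit path, the complete sum of squares in the low
+  // quadword.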
+ sum_buffer = _mm256_permute2f128_si256(regx_sum, regx_sum, 1); + regx_sum = _mm256_add_epi32(sum_buffer, regx_sum); + regx_sum = _mm256_add_epi32(regx_sum, _mm256_srli_si256(regx_sum, 8)); + regx_sum = _mm256_add_epi32(regx_sum, _mm256_srli_si256(regx_sum, 4)); + + sse_buffer = _mm256_permute2f128_si256(regx2_sum, regx2_sum, 1); + regx2_sum = _mm256_add_epi64(sse_buffer, regx2_sum); + regx2_sum = _mm256_add_epi64(regx2_sum, _mm256_srli_si256(regx2_sum, 8)); + + out_buffer = _mm256_castsi256_si128(regx_sum); + *x_sum += _mm_cvtsi128_si32(out_buffer); + out_buffer = _mm256_castsi256_si128(regx2_sum); +#if ARCH_X86_64 + *x2_sum += _mm_cvtsi128_si64(out_buffer); +#else + { + int64_t tmp; + _mm_storel_epi64((__m128i *)&tmp, out_buffer); + *x2_sum += tmp; + } +#endif +} + +static INLINE void sse_sum_wd4_avx2(const int16_t *data, int stride, int bh, + int *x_sum, int64_t *x2_sum) { + __m128i row1, row2, row3; + __m256i regx_sum, regx2_sum, load_pixels, sum_buffer, sse_buffer, + temp_buffer1, temp_buffer2, row_sum_buffer, row_sse_buffer; + const int16_t *data_tmp = data; + __m256i one = _mm256_set1_epi16(1); + regx_sum = _mm256_setzero_si256(); + regx2_sum = regx_sum; + sum_buffer = _mm256_setzero_si256(); + sse_buffer = sum_buffer; + + for (int j = 0; j < (bh >> 2); ++j) { + // Load 4 rows at a time. + row1 = _mm_loadl_epi64((__m128i const *)(data_tmp)); + row2 = _mm_loadl_epi64((__m128i const *)(data_tmp + stride)); + row1 = _mm_unpacklo_epi64(row1, row2); + row2 = _mm_loadl_epi64((__m128i const *)(data_tmp + 2 * stride)); + row3 = _mm_loadl_epi64((__m128i const *)(data_tmp + 3 * stride)); + row2 = _mm_unpacklo_epi64(row2, row3); + load_pixels = + _mm256_insertf128_si256(_mm256_castsi128_si256(row1), row2, 1); + + row_sum_buffer = _mm256_madd_epi16(load_pixels, one); + row_sse_buffer = _mm256_madd_epi16(load_pixels, load_pixels); + sum_buffer = _mm256_add_epi32(row_sum_buffer, sum_buffer); + sse_buffer = _mm256_add_epi32(row_sse_buffer, sse_buffer); + data_tmp += 4 * stride; + } + + // To prevent 32-bit variable overflow, unpack the elements to 64-bit. + temp_buffer1 = _mm256_unpacklo_epi32(sse_buffer, _mm256_setzero_si256()); + temp_buffer2 = _mm256_unpackhi_epi32(sse_buffer, _mm256_setzero_si256()); + sse_buffer = _mm256_add_epi64(temp_buffer1, temp_buffer2); + regx_sum = _mm256_add_epi32(sum_buffer, regx_sum); + regx2_sum = _mm256_add_epi64(sse_buffer, regx2_sum); + + accumulate_sse_sum(regx_sum, regx2_sum, x_sum, x2_sum); +} + +static INLINE void sse_sum_wd8_avx2(const int16_t *data, int stride, int bh, + int *x_sum, int64_t *x2_sum) { + __m128i load_128bit, load_next_128bit; + __m256i regx_sum, regx2_sum, load_pixels, sum_buffer, sse_buffer, + temp_buffer1, temp_buffer2, row_sum_buffer, row_sse_buffer; + const int16_t *data_tmp = data; + __m256i one = _mm256_set1_epi16(1); + regx_sum = _mm256_setzero_si256(); + regx2_sum = regx_sum; + sum_buffer = _mm256_setzero_si256(); + sse_buffer = sum_buffer; + + for (int j = 0; j < (bh >> 1); ++j) { + // Load 2 rows at a time. 
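+    // As in the width-4 path, _mm256_madd_epi16 against a vector of ones
+    // yields pairwise 32-bit sums of the pixels, while _mm256_madd_epi16 of
+    // the pixels with themselves yields pairwise 32-bit sums of squares.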
+ load_128bit = _mm_loadu_si128((__m128i const *)(data_tmp)); + load_next_128bit = _mm_loadu_si128((__m128i const *)(data_tmp + stride)); + load_pixels = _mm256_insertf128_si256(_mm256_castsi128_si256(load_128bit), + load_next_128bit, 1); + + row_sum_buffer = _mm256_madd_epi16(load_pixels, one); + row_sse_buffer = _mm256_madd_epi16(load_pixels, load_pixels); + sum_buffer = _mm256_add_epi32(row_sum_buffer, sum_buffer); + sse_buffer = _mm256_add_epi32(row_sse_buffer, sse_buffer); + data_tmp += 2 * stride; + } + + temp_buffer1 = _mm256_unpacklo_epi32(sse_buffer, _mm256_setzero_si256()); + temp_buffer2 = _mm256_unpackhi_epi32(sse_buffer, _mm256_setzero_si256()); + sse_buffer = _mm256_add_epi64(temp_buffer1, temp_buffer2); + regx_sum = _mm256_add_epi32(sum_buffer, regx_sum); + regx2_sum = _mm256_add_epi64(sse_buffer, regx2_sum); + + accumulate_sse_sum(regx_sum, regx2_sum, x_sum, x2_sum); +} + +static INLINE void sse_sum_wd16_avx2(const int16_t *data, int stride, int bh, + int *x_sum, int64_t *x2_sum, + int loop_count) { + __m256i regx_sum, regx2_sum, load_pixels, sum_buffer, sse_buffer, + temp_buffer1, temp_buffer2, row_sum_buffer, row_sse_buffer; + const int16_t *data_tmp = data; + __m256i one = _mm256_set1_epi16(1); + regx_sum = _mm256_setzero_si256(); + regx2_sum = regx_sum; + sum_buffer = _mm256_setzero_si256(); + sse_buffer = sum_buffer; + + for (int i = 0; i < loop_count; ++i) { + data_tmp = data + 16 * i; + for (int j = 0; j < bh; ++j) { + load_pixels = _mm256_lddqu_si256((__m256i const *)(data_tmp)); + + row_sum_buffer = _mm256_madd_epi16(load_pixels, one); + row_sse_buffer = _mm256_madd_epi16(load_pixels, load_pixels); + sum_buffer = _mm256_add_epi32(row_sum_buffer, sum_buffer); + sse_buffer = _mm256_add_epi32(row_sse_buffer, sse_buffer); + data_tmp += stride; + } + } + + temp_buffer1 = _mm256_unpacklo_epi32(sse_buffer, _mm256_setzero_si256()); + temp_buffer2 = _mm256_unpackhi_epi32(sse_buffer, _mm256_setzero_si256()); + sse_buffer = _mm256_add_epi64(temp_buffer1, temp_buffer2); + regx_sum = _mm256_add_epi32(sum_buffer, regx_sum); + regx2_sum = _mm256_add_epi64(sse_buffer, regx2_sum); + + accumulate_sse_sum(regx_sum, regx2_sum, x_sum, x2_sum); +} + +void aom_get_blk_sse_sum_avx2(const int16_t *data, int stride, int bw, int bh, + int *x_sum, int64_t *x2_sum) { + *x_sum = 0; + *x2_sum = 0; + + if ((bh & 3) == 0) { + switch (bw) { + // For smaller block widths, compute multiple rows simultaneously. + case 4: sse_sum_wd4_avx2(data, stride, bh, x_sum, x2_sum); break; + case 8: sse_sum_wd8_avx2(data, stride, bh, x_sum, x2_sum); break; + case 16: + case 32: + sse_sum_wd16_avx2(data, stride, bh, x_sum, x2_sum, bw >> 4); + break; + case 64: + // 32-bit variables will overflow for 64 rows at a single time, so + // compute 32 rows at a time. + if (bh <= 32) { + sse_sum_wd16_avx2(data, stride, bh, x_sum, x2_sum, bw >> 4); + } else { + sse_sum_wd16_avx2(data, stride, 32, x_sum, x2_sum, bw >> 4); + sse_sum_wd16_avx2(data + 32 * stride, stride, 32, x_sum, x2_sum, + bw >> 4); + } + break; + + default: aom_get_blk_sse_sum_c(data, stride, bw, bh, x_sum, x2_sum); + } + } else { + aom_get_blk_sse_sum_c(data, stride, bw, bh, x_sum, x2_sum); + } +} diff --git a/media/libaom/src/aom_dsp/x86/blk_sse_sum_sse2.c b/media/libaom/src/aom_dsp/x86/blk_sse_sum_sse2.c new file mode 100644 index 0000000000..ef0a024eeb --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/blk_sse_sum_sse2.c @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <emmintrin.h> + +#include "config/aom_dsp_rtcd.h" + +static INLINE void sse_sum_wd4_sse2(const int16_t *data, int stride, int bh, + int *x_sum, int64_t *x2_sum) { + const int16_t *data_tmp = data; + __m128i temp_buffer1, temp_buffer2; + __m128i load_pixels_low, load_pixels_hi, sum_buffer, sse_buffer; + __m128i one = _mm_set1_epi16(1); + __m128i regx_sum = _mm_setzero_si128(); + __m128i regx2_sum = regx_sum; + + for (int j = 0; j < (bh >> 1); ++j) { + // Load 2 rows (8 pixels) at a time. + load_pixels_low = _mm_loadl_epi64((__m128i const *)(data_tmp)); + load_pixels_hi = _mm_loadl_epi64((__m128i const *)(data_tmp + stride)); + load_pixels_low = _mm_unpacklo_epi64(load_pixels_low, load_pixels_hi); + sum_buffer = _mm_madd_epi16(load_pixels_low, one); + sse_buffer = _mm_madd_epi16(load_pixels_low, load_pixels_low); + regx_sum = _mm_add_epi32(sum_buffer, regx_sum); + regx2_sum = _mm_add_epi32(sse_buffer, regx2_sum); + data_tmp += 2 * stride; + } + + regx_sum = _mm_add_epi32(regx_sum, _mm_srli_si128(regx_sum, 8)); + regx_sum = _mm_add_epi32(regx_sum, _mm_srli_si128(regx_sum, 4)); + *x_sum = _mm_cvtsi128_si32(regx_sum); + temp_buffer1 = _mm_unpacklo_epi32(regx2_sum, _mm_setzero_si128()); + temp_buffer2 = _mm_unpackhi_epi32(regx2_sum, _mm_setzero_si128()); + regx2_sum = _mm_add_epi64(temp_buffer1, temp_buffer2); + regx2_sum = _mm_add_epi64(regx2_sum, _mm_srli_si128(regx2_sum, 8)); +#if ARCH_X86_64 + *x2_sum += _mm_cvtsi128_si64(regx2_sum); +#else + { + int64_t tmp; + _mm_storel_epi64((__m128i *)&tmp, regx2_sum); + *x2_sum += tmp; + } +#endif +} + +static INLINE void sse_sum_wd8_sse2(const int16_t *data, int stride, int bh, + int *x_sum, int64_t *x2_sum, + int loop_cycles) { + const int16_t *data_tmp; + __m128i temp_buffer1, temp_buffer2; + __m128i one = _mm_set1_epi16(1); + __m128i regx_sum = _mm_setzero_si128(); + __m128i regx2_sum = regx_sum; + __m128i load_pixels, sum_buffer, sse_buffer; + + for (int i = 0; i < loop_cycles; ++i) { + data_tmp = data + (8 * i); + for (int j = 0; j < bh; ++j) { + // Load 1 row (8-pixels) at a time. 
+ load_pixels = _mm_loadu_si128((__m128i const *)(data_tmp)); + sum_buffer = _mm_madd_epi16(load_pixels, one); + sse_buffer = _mm_madd_epi16(load_pixels, load_pixels); + regx_sum = _mm_add_epi32(sum_buffer, regx_sum); + regx2_sum = _mm_add_epi32(sse_buffer, regx2_sum); + data_tmp += stride; + } + } + + regx_sum = _mm_add_epi32(regx_sum, _mm_srli_si128(regx_sum, 8)); + regx_sum = _mm_add_epi32(regx_sum, _mm_srli_si128(regx_sum, 4)); + *x_sum += _mm_cvtsi128_si32(regx_sum); + temp_buffer1 = _mm_unpacklo_epi32(regx2_sum, _mm_setzero_si128()); + temp_buffer2 = _mm_unpackhi_epi32(regx2_sum, _mm_setzero_si128()); + regx2_sum = _mm_add_epi64(temp_buffer1, temp_buffer2); + regx2_sum = _mm_add_epi64(regx2_sum, _mm_srli_si128(regx2_sum, 8)); +#if ARCH_X86_64 + *x2_sum += _mm_cvtsi128_si64(regx2_sum); +#else + { + int64_t tmp; + _mm_storel_epi64((__m128i *)&tmp, regx2_sum); + *x2_sum += tmp; + } +#endif +} + +// This functions adds SSE2 Support for the functions 'get_blk_sse_sum_c' +void aom_get_blk_sse_sum_sse2(const int16_t *data, int stride, int bw, int bh, + int *x_sum, int64_t *x2_sum) { + *x_sum = 0; + *x2_sum = 0; + + if ((bh & 3) == 0) { + switch (bw) { + case 4: sse_sum_wd4_sse2(data, stride, bh, x_sum, x2_sum); break; + case 8: + case 16: + sse_sum_wd8_sse2(data, stride, bh, x_sum, x2_sum, bw >> 3); + break; + // For widths 32 and 64, the registers may overflow. So compute + // partial widths at a time. + case 32: + if (bh <= 32) { + sse_sum_wd8_sse2(data, stride, bh, x_sum, x2_sum, bw >> 3); + break; + } else { + sse_sum_wd8_sse2(data, stride, 32, x_sum, x2_sum, bw >> 3); + sse_sum_wd8_sse2(data + 32 * stride, stride, 32, x_sum, x2_sum, + bw >> 3); + break; + } + + case 64: + if (bh <= 16) { + sse_sum_wd8_sse2(data, stride, bh, x_sum, x2_sum, bw >> 3); + break; + } else { + for (int i = 0; i < bh; i += 16) + sse_sum_wd8_sse2(data + i * stride, stride, 16, x_sum, x2_sum, + bw >> 3); + break; + } + + default: aom_get_blk_sse_sum_c(data, stride, bw, bh, x_sum, x2_sum); + } + } else { + aom_get_blk_sse_sum_c(data, stride, bw, bh, x_sum, x2_sum); + } +} diff --git a/media/libaom/src/aom_dsp/x86/convolve.h b/media/libaom/src/aom_dsp/x86/convolve.h index 3e19682cd9..b4ff6975cd 100644 --- a/media/libaom/src/aom_dsp/x86/convolve.h +++ b/media/libaom/src/aom_dsp/x86/convolve.h @@ -107,6 +107,7 @@ typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch, } \ } +#if CONFIG_AV1_HIGHBITDEPTH typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr, const ptrdiff_t src_pitch, uint16_t *output_ptr, @@ -122,7 +123,30 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr, uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ if (step_q4 == 16 && filter[3] != 128) { \ - if (filter[0] | filter[1] | filter[2]) { \ + if (((filter[0] | filter[1] | filter[6] | filter[7]) == 0) && \ + (filter[2] | filter[5])) { \ + while (w >= 16) { \ + aom_highbd_filter_block1d16_##dir##4_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + aom_highbd_filter_block1d8_##dir##4_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + aom_highbd_filter_block1d4_##dir##4_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + } else if (filter[0] | filter[1] | filter[2]) { \ while (w >= 16) { \ 
aom_highbd_filter_block1d16_##dir##8_##avg##opt( \ src_start, src_stride, dst, dst_stride, h, filter, bd); \ @@ -174,5 +198,6 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); \ } \ } +#endif // CONFIG_AV1_HIGHBITDEPTH #endif // AOM_AOM_DSP_X86_CONVOLVE_H_ diff --git a/media/libaom/src/aom_dsp/x86/convolve_avx2.h b/media/libaom/src/aom_dsp/x86/convolve_avx2.h index 30253f65c2..d516de5f2f 100644 --- a/media/libaom/src/aom_dsp/x86/convolve_avx2.h +++ b/media/libaom/src/aom_dsp/x86/convolve_avx2.h @@ -34,6 +34,239 @@ DECLARE_ALIGNED(32, static const uint8_t, filt4_d4_global_avx2[]) = { 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, }; +DECLARE_ALIGNED(32, static const uint8_t, filt_center_global_avx2[32]) = { + 3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 9, 255, 10, 255, + 3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 9, 255, 10, 255 +}; + +DECLARE_ALIGNED(32, static const uint8_t, + filt1_global_avx2[32]) = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, + 6, 6, 7, 7, 8, 0, 1, 1, 2, 2, 3, + 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; + +DECLARE_ALIGNED(32, static const uint8_t, + filt2_global_avx2[32]) = { 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, + 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, + 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 }; + +DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = { + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, + 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 +}; + +DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, + 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 +}; + +#define CONVOLVE_SR_HORIZONTAL_FILTER_8TAP \ + for (i = 0; i < (im_h - 2); i += 2) { \ + __m256i data = _mm256_castsi128_si256( \ + _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \ + data = _mm256_inserti128_si256( \ + data, \ + _mm_loadu_si128( \ + (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]), \ + 1); \ + \ + __m256i res = convolve_lowbd_x(data, coeffs_h, filt); \ + res = \ + _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \ + _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); \ + } \ + \ + __m256i data_1 = _mm256_castsi128_si256( \ + _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \ + \ + __m256i res = convolve_lowbd_x(data_1, coeffs_h, filt); \ + \ + res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \ + \ + _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); + +#define CONVOLVE_SR_VERTICAL_FILTER_8TAP \ + __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \ + __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \ + __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \ + __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \ + __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); \ + __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); \ + \ + __m256i s[8]; \ + s[0] = _mm256_unpacklo_epi16(src_0, src_1); \ + s[1] = _mm256_unpacklo_epi16(src_2, src_3); \ + s[2] = _mm256_unpacklo_epi16(src_4, src_5); \ + \ + s[4] = _mm256_unpackhi_epi16(src_0, src_1); \ + s[5] = _mm256_unpackhi_epi16(src_2, src_3); \ + s[6] = _mm256_unpackhi_epi16(src_4, src_5); \ + \ + for (i = 0; i < h; i += 2) { \ + const int16_t *data = &im_block[i * im_stride]; \ + \ + const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); 
\ + const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); \ + \ + s[3] = _mm256_unpacklo_epi16(s6, s7); \ + s[7] = _mm256_unpackhi_epi16(s6, s7); \ + \ + __m256i res_a = convolve(s, coeffs_v); \ + __m256i res_b = convolve(s + 4, coeffs_v); \ + \ + res_a = \ + _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v); \ + res_b = \ + _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v); \ + \ + const __m256i res_a_round = _mm256_sra_epi32( \ + _mm256_add_epi32(res_a, round_const_v), round_shift_v); \ + const __m256i res_b_round = _mm256_sra_epi32( \ + _mm256_add_epi32(res_b, round_const_v), round_shift_v); \ + \ + const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); \ + const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); \ + \ + const __m128i res_0 = _mm256_castsi256_si128(res_8b); \ + const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); \ + \ + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; \ + __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; \ + if (w - j > 4) { \ + _mm_storel_epi64(p_0, res_0); \ + _mm_storel_epi64(p_1, res_1); \ + } else if (w == 4) { \ + xx_storel_32(p_0, res_0); \ + xx_storel_32(p_1, res_1); \ + } else { \ + *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); \ + *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); \ + } \ + \ + s[0] = s[1]; \ + s[1] = s[2]; \ + s[2] = s[3]; \ + \ + s[4] = s[5]; \ + s[5] = s[6]; \ + s[6] = s[7]; \ + } + +#define DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP \ + for (i = 0; i < im_h; i += 2) { \ + __m256i data = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src_h)); \ + if (i + 1 < im_h) \ + data = _mm256_inserti128_si256( \ + data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1); \ + src_h += (src_stride << 1); \ + __m256i res = convolve_lowbd_x(data, coeffs_x, filt); \ + \ + res = \ + _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \ + \ + _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); \ + } + +#define DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP \ + __m256i s[8]; \ + __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \ + __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \ + __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \ + __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \ + __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); \ + __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); \ + \ + s[0] = _mm256_unpacklo_epi16(s0, s1); \ + s[1] = _mm256_unpacklo_epi16(s2, s3); \ + s[2] = _mm256_unpacklo_epi16(s4, s5); \ + \ + s[4] = _mm256_unpackhi_epi16(s0, s1); \ + s[5] = _mm256_unpackhi_epi16(s2, s3); \ + s[6] = _mm256_unpackhi_epi16(s4, s5); \ + \ + for (i = 0; i < h; i += 2) { \ + const int16_t *data = &im_block[i * im_stride]; \ + \ + const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); \ + const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); \ + \ + s[3] = _mm256_unpacklo_epi16(s6, s7); \ + s[7] = _mm256_unpackhi_epi16(s6, s7); \ + \ + const __m256i res_a = convolve(s, coeffs_y); \ + const __m256i res_a_round = _mm256_sra_epi32( \ + _mm256_add_epi32(res_a, round_const_v), round_shift_v); \ + \ + if (w - j > 4) { \ + const __m256i res_b = convolve(s + 4, coeffs_y); \ + const __m256i res_b_round = _mm256_sra_epi32( \ + _mm256_add_epi32(res_b, round_const_v), round_shift_v); \ + const __m256i res_16b = 
_mm256_packs_epi32(res_a_round, res_b_round); \ + const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); \ + \ + if (do_average) { \ + const __m256i data_ref_0 = load_line2_avx2( \ + &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]); \ + const __m256i comp_avg_res = \ + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); \ + \ + const __m256i round_result = convolve_rounding( \ + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); \ + \ + const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); \ + const __m128i res_0 = _mm256_castsi256_si128(res_8); \ + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); \ + \ + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); \ + _mm_storel_epi64( \ + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); \ + } else { \ + const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); \ + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); \ + \ + const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); \ + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), \ + res_1); \ + } \ + } else { \ + const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round); \ + const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); \ + \ + if (do_average) { \ + const __m256i data_ref_0 = load_line2_avx2( \ + &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]); \ + \ + const __m256i comp_avg_res = \ + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); \ + \ + const __m256i round_result = convolve_rounding( \ + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); \ + \ + const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); \ + const __m128i res_0 = _mm256_castsi256_si128(res_8); \ + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); \ + \ + *(uint32_t *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); \ + *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) = \ + _mm_cvtsi128_si32(res_1); \ + \ + } else { \ + const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); \ + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); \ + \ + const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); \ + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), \ + res_1); \ + } \ + } \ + \ + s[0] = s[1]; \ + s[1] = s[2]; \ + s[2] = s[3]; \ + \ + s[4] = s[5]; \ + s[5] = s[6]; \ + s[6] = s[7]; \ + } static INLINE void prepare_coeffs_lowbd( const InterpFilterParams *const filter_params, const int subpel_q4, __m256i *const coeffs /* [4] */) { @@ -48,7 +281,7 @@ static INLINE void prepare_coeffs_lowbd( // Since all filter co-efficients are even, this change will not affect the // end result assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)), - _mm_set1_epi16(0xffff))); + _mm_set1_epi16((short)0xffff))); const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1); @@ -95,6 +328,17 @@ static INLINE __m256i convolve_lowbd(const __m256i *const s, return res; } +static INLINE __m256i convolve_lowbd_4tap(const __m256i *const s, + const __m256i *const coeffs) { + const __m256i res_23 = _mm256_maddubs_epi16(s[0], coeffs[0]); + const __m256i res_45 = _mm256_maddubs_epi16(s[1], coeffs[1]); + + // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + const __m256i res = _mm256_add_epi16(res_45, res_23); + + return res; +} + static INLINE __m256i convolve(const __m256i *const s, const __m256i *const coeffs) { const __m256i res_0 = 
_mm256_madd_epi16(s[0], coeffs[0]); @@ -108,6 +352,15 @@ static INLINE __m256i convolve(const __m256i *const s, return res; } +static INLINE __m256i convolve_4tap(const __m256i *const s, + const __m256i *const coeffs) { + const __m256i res_1 = _mm256_madd_epi16(s[0], coeffs[0]); + const __m256i res_2 = _mm256_madd_epi16(s[1], coeffs[1]); + + const __m256i res = _mm256_add_epi32(res_1, res_2); + return res; +} + static INLINE __m256i convolve_lowbd_x(const __m256i data, const __m256i *const coeffs, const __m256i *const filt) { @@ -121,6 +374,17 @@ static INLINE __m256i convolve_lowbd_x(const __m256i data, return convolve_lowbd(s, coeffs); } +static INLINE __m256i convolve_lowbd_x_4tap(const __m256i data, + const __m256i *const coeffs, + const __m256i *const filt) { + __m256i s[2]; + + s[0] = _mm256_shuffle_epi8(data, filt[0]); + s[1] = _mm256_shuffle_epi8(data, filt[1]); + + return convolve_lowbd_4tap(s, coeffs); +} + static INLINE void add_store_aligned_256(CONV_BUF_TYPE *const dst, const __m256i *const res, const int do_average) { @@ -138,9 +402,9 @@ static INLINE void add_store_aligned_256(CONV_BUF_TYPE *const dst, static INLINE __m256i comp_avg(const __m256i *const data_ref_0, const __m256i *const res_unsigned, const __m256i *const wt, - const int use_jnt_comp_avg) { + const int use_dist_wtd_comp_avg) { __m256i res; - if (use_jnt_comp_avg) { + if (use_dist_wtd_comp_avg) { const __m256i data_lo = _mm256_unpacklo_epi16(*data_ref_0, *res_unsigned); const __m256i data_hi = _mm256_unpackhi_epi16(*data_ref_0, *res_unsigned); @@ -172,9 +436,9 @@ static INLINE __m256i highbd_comp_avg(const __m256i *const data_ref_0, const __m256i *const res_unsigned, const __m256i *const wt0, const __m256i *const wt1, - const int use_jnt_comp_avg) { + const int use_dist_wtd_comp_avg) { __m256i res; - if (use_jnt_comp_avg) { + if (use_dist_wtd_comp_avg) { const __m256i wt0_res = _mm256_mullo_epi32(*data_ref_0, *wt0); const __m256i wt1_res = _mm256_mullo_epi32(*res_unsigned, *wt1); const __m256i wt_res = _mm256_add_epi32(wt0_res, wt1_res); diff --git a/media/libaom/src/aom_dsp/x86/convolve_sse2.h b/media/libaom/src/aom_dsp/x86/convolve_sse2.h index 445d04b103..385c7c7e18 100644 --- a/media/libaom/src/aom_dsp/x86/convolve_sse2.h +++ b/media/libaom/src/aom_dsp/x86/convolve_sse2.h @@ -78,9 +78,9 @@ static INLINE __m128i convolve_hi_y(const __m128i *const s, static INLINE __m128i comp_avg(const __m128i *const data_ref_0, const __m128i *const res_unsigned, const __m128i *const wt, - const int use_jnt_comp_avg) { + const int use_dist_wtd_avg) { __m128i res; - if (use_jnt_comp_avg) { + if (use_dist_wtd_avg) { const __m128i data_lo = _mm_unpacklo_epi16(*data_ref_0, *res_unsigned); const __m128i data_hi = _mm_unpackhi_epi16(*data_ref_0, *res_unsigned); diff --git a/media/libaom/src/aom_dsp/x86/convolve_sse4_1.h b/media/libaom/src/aom_dsp/x86/convolve_sse4_1.h index 6b8388d84a..b1a3bb4664 100644 --- a/media/libaom/src/aom_dsp/x86/convolve_sse4_1.h +++ b/media/libaom/src/aom_dsp/x86/convolve_sse4_1.h @@ -35,9 +35,9 @@ static INLINE __m128i highbd_comp_avg_sse4_1(const __m128i *const data_ref_0, const __m128i *const res_unsigned, const __m128i *const wt0, const __m128i *const wt1, - const int use_jnt_comp_avg) { + const int use_dist_wtd_avg) { __m128i res; - if (use_jnt_comp_avg) { + if (use_dist_wtd_avg) { const __m128i wt0_res = _mm_mullo_epi32(*data_ref_0, *wt0); const __m128i wt1_res = _mm_mullo_epi32(*res_unsigned, *wt1); diff --git a/media/libaom/src/aom_dsp/x86/fft_avx2.c b/media/libaom/src/aom_dsp/x86/fft_avx2.c 
index 54da022538..4cccc5f00f 100644 --- a/media/libaom/src/aom_dsp/x86/fft_avx2.c +++ b/media/libaom/src/aom_dsp/x86/fft_avx2.c @@ -11,6 +11,7 @@ #include <immintrin.h> +#include "config/aom_dsp_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/fft_common.h" diff --git a/media/libaom/src/aom_dsp/x86/fft_sse2.c b/media/libaom/src/aom_dsp/x86/fft_sse2.c index 12bdc3e185..6f20a3cc01 100644 --- a/media/libaom/src/aom_dsp/x86/fft_sse2.c +++ b/media/libaom/src/aom_dsp/x86/fft_sse2.c @@ -11,6 +11,7 @@ s * PATENTS file, you can obtain it at www.aomedia.org/license/patent. #include <xmmintrin.h> +#include "config/aom_dsp_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/fft_common.h" diff --git a/media/libaom/src/aom_dsp/x86/fwd_txfm_impl_sse2.h b/media/libaom/src/aom_dsp/x86/fwd_txfm_impl_sse2.h index 1e3d13ec85..89fe1899bb 100644 --- a/media/libaom/src/aom_dsp/x86/fwd_txfm_impl_sse2.h +++ b/media/libaom/src/aom_dsp/x86/fwd_txfm_impl_sse2.h @@ -30,6 +30,206 @@ #define SUB_EPI16 _mm_sub_epi16 #endif +static void FDCT4x4_2D_HELPER(const int16_t *input, int stride, __m128i *in0, + __m128i *in1) { + // Constants + // These are the coefficients used for the multiplies. + // In the comments, pN means cos(N pi /64) and mN is -cos(N pi /64), + // where cospi_N_64 = cos(N pi /64) + const __m128i k__cospi_A = + octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64, + cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64); + const __m128i k__cospi_B = + octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64, + cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64); + const __m128i k__cospi_C = + octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64, + cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64); + const __m128i k__cospi_D = + octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64, + cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64); + const __m128i k__cospi_E = + octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64, + cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64); + const __m128i k__cospi_F = + octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64, + cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64); + const __m128i k__cospi_G = + octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64, + -cospi_8_64, -cospi_24_64, -cospi_8_64, -cospi_24_64); + const __m128i k__cospi_H = + octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64, + -cospi_24_64, cospi_8_64, -cospi_24_64, cospi_8_64); + + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + // This second rounding constant saves doing some extra adds at the end + const __m128i k__DCT_CONST_ROUNDING2 = + _mm_set1_epi32(DCT_CONST_ROUNDING + (DCT_CONST_ROUNDING << 1)); + const int DCT_CONST_BITS2 = DCT_CONST_BITS + 2; + const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); + const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); + + // Load inputs. 
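+  // Rows 0 and 3 of the block are packed into in0 and rows 1 and 2 into in1,
+  // so that the unpack/shuffle sequence in stage 1 can pair row 0 with row 3
+  // and row 1 with row 2 column by column, i.e. the usual 4-point DCT
+  // butterflies.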
+ *in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); + *in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); + *in1 = _mm_unpacklo_epi64( + *in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride))); + *in0 = _mm_unpacklo_epi64( + *in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride))); + // in0 = [i0 i1 i2 i3 iC iD iE iF] + // in1 = [i4 i5 i6 i7 i8 i9 iA iB] + // multiply by 16 to give some extra precision + *in0 = _mm_slli_epi16(*in0, 4); + *in1 = _mm_slli_epi16(*in1, 4); + // if (i == 0 && input[0]) input[0] += 1; + // add 1 to the upper left pixel if it is non-zero, which helps reduce + // the round-trip error + { + // The mask will only contain whether the first value is zero, all + // other comparison will fail as something shifted by 4 (above << 4) + // can never be equal to one. To increment in the non-zero case, we + // add the mask and one for the first element: + // - if zero, mask = -1, v = v - 1 + 1 = v + // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1 + __m128i mask = _mm_cmpeq_epi16(*in0, k__nonzero_bias_a); + *in0 = _mm_add_epi16(*in0, mask); + *in0 = _mm_add_epi16(*in0, k__nonzero_bias_b); + } + // There are 4 total stages, alternating between an add/subtract stage + // followed by an multiply-and-add stage. + { + // Stage 1: Add/subtract + + // in0 = [i0 i1 i2 i3 iC iD iE iF] + // in1 = [i4 i5 i6 i7 i8 i9 iA iB] + const __m128i r0 = _mm_unpacklo_epi16(*in0, *in1); + const __m128i r1 = _mm_unpackhi_epi16(*in0, *in1); + // r0 = [i0 i4 i1 i5 i2 i6 i3 i7] + // r1 = [iC i8 iD i9 iE iA iF iB] + const __m128i r2 = _mm_shuffle_epi32(r0, 0xB4); + const __m128i r3 = _mm_shuffle_epi32(r1, 0xB4); + // r2 = [i0 i4 i1 i5 i3 i7 i2 i6] + // r3 = [iC i8 iD i9 iF iB iE iA] + + const __m128i t0 = _mm_add_epi16(r2, r3); + const __m128i t1 = _mm_sub_epi16(r2, r3); + // t0 = [a0 a4 a1 a5 a3 a7 a2 a6] + // t1 = [aC a8 aD a9 aF aB aE aA] + + // Stage 2: multiply by constants (which gets us into 32 bits). + // The constants needed here are: + // k__cospi_A = [p16 p16 p16 p16 p16 m16 p16 m16] + // k__cospi_B = [p16 m16 p16 m16 p16 p16 p16 p16] + // k__cospi_C = [p08 p24 p08 p24 p24 m08 p24 m08] + // k__cospi_D = [p24 m08 p24 m08 p08 p24 p08 p24] + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_A); + const __m128i u2 = _mm_madd_epi16(t0, k__cospi_B); + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_C); + const __m128i u3 = _mm_madd_epi16(t1, k__cospi_D); + // Then add and right-shift to get back to 16-bit range + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + // w0 = [b0 b1 b7 b6] + // w1 = [b8 b9 bF bE] + // w2 = [b4 b5 b3 b2] + // w3 = [bC bD bB bA] + const __m128i x0 = _mm_packs_epi32(w0, w1); + const __m128i x1 = _mm_packs_epi32(w2, w3); + + // x0 = [b0 b1 b7 b6 b8 b9 bF bE] + // x1 = [b4 b5 b3 b2 bC bD bB bA] + *in0 = _mm_shuffle_epi32(x0, 0xD8); + *in1 = _mm_shuffle_epi32(x1, 0x8D); + // in0 = [b0 b1 b8 b9 b7 b6 bF bE] + // in1 = [b3 b2 bB bA b4 b5 bC bD] + } + { + // vertical DCTs finished. Now we do the horizontal DCTs. 
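+    // The 0xD8/0x8D shuffles above left in0/in1 arranged so that adding and
+    // subtracting the two registers pairs each value with its horizontal
+    // butterfly partner (b0 with b3, b1 with b2, and so on for every row),
+    // avoiding an explicit transpose between the two passes.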
+ // Stage 3: Add/subtract + + const __m128i t0 = ADD_EPI16(*in0, *in1); + const __m128i t1 = SUB_EPI16(*in0, *in1); + + // Stage 4: multiply by constants (which gets us into 32 bits). + { + // The constants needed here are: + // k__cospi_E = [p16 p16 p16 p16 p16 p16 p16 p16] + // k__cospi_F = [p16 m16 p16 m16 p16 m16 p16 m16] + // k__cospi_G = [p08 p24 p08 p24 m08 m24 m08 m24] + // k__cospi_H = [p24 m08 p24 m08 m24 p08 m24 p08] + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_E); + const __m128i u1 = _mm_madd_epi16(t0, k__cospi_F); + const __m128i u2 = _mm_madd_epi16(t1, k__cospi_G); + const __m128i u3 = _mm_madd_epi16(t1, k__cospi_H); + // Then add and right-shift to get back to 16-bit range + // but this combines the final right-shift as well to save operations + // This unusual rounding operations is to maintain bit-accurate + // compatibility with the c version of this function which has two + // rounding steps in a row. + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING2); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING2); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING2); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING2); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS2); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS2); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS2); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS2); + // w0 = [o0 o4 o8 oC] + // w1 = [o2 o6 oA oE] + // w2 = [o1 o5 o9 oD] + // w3 = [o3 o7 oB oF] + // remember the o's are numbered according to the correct output location + const __m128i x0 = _mm_packs_epi32(w0, w1); + const __m128i x1 = _mm_packs_epi32(w2, w3); + { + // x0 = [o0 o4 o8 oC o2 o6 oA oE] + // x1 = [o1 o5 o9 oD o3 o7 oB oF] + const __m128i y0 = _mm_unpacklo_epi16(x0, x1); + const __m128i y1 = _mm_unpackhi_epi16(x0, x1); + // y0 = [o0 o1 o4 o5 o8 o9 oC oD] + // y1 = [o2 o3 o6 o7 oA oB oE oF] + *in0 = _mm_unpacklo_epi32(y0, y1); + // in0 = [o0 o1 o2 o3 o4 o5 o6 o7] + *in1 = _mm_unpackhi_epi32(y0, y1); + // in1 = [o8 o9 oA oB oC oD oE oF] + } + } + } +} + +void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) { + // This 2D transform implements 4 vertical 1D transforms followed + // by 4 horizontal 1D transforms. The multiplies and adds are as given + // by Chen, Smith and Fralick ('77). The commands for moving the data + // around have been minimized by hand. + // For the purposes of the comments, the 16 inputs are referred to at i0 + // through iF (in raster order), intermediate variables are a0, b0, c0 + // through f, and correspond to the in-place computations mapped to input + // locations. The outputs, o0 through oF are labeled according to the + // output locations. + __m128i in0, in1; + FDCT4x4_2D_HELPER(input, stride, &in0, &in1); + + // Post-condition (v + 1) >> 2 is now incorporated into previous + // add and right-shift commands. Only 2 store instructions needed + // because we are using the fact that 1/3 are stored just after 0/2. 
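+  // in0 now holds output rows 0 and 1 (o0..o7) and in1 holds rows 2 and 3
+  // (o8..oF), so the two stores below cover the whole 4x4 block.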
+ storeu_output(&in0, output + 0 * 4); + storeu_output(&in1, output + 2 * 4); +} + +void FDCT4x4_2D_LP(const int16_t *input, int16_t *output, int stride) { + __m128i in0, in1; + FDCT4x4_2D_HELPER(input, stride, &in0, &in1); + _mm_storeu_si128((__m128i *)(output + 0 * 4), in0); + _mm_storeu_si128((__m128i *)(output + 2 * 4), in1); +} + void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) { int pass; // Constants diff --git a/media/libaom/src/aom_dsp/x86/fwd_txfm_sse2.c b/media/libaom/src/aom_dsp/x86/fwd_txfm_sse2.c index 2d8f8f71e4..0e4fb80468 100644 --- a/media/libaom/src/aom_dsp/x86/fwd_txfm_sse2.c +++ b/media/libaom/src/aom_dsp/x86/fwd_txfm_sse2.c @@ -17,53 +17,23 @@ #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/x86/fwd_txfm_sse2.h" -void aom_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) { - __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); - __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); - __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); - __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); - __m128i u0, u1, sum; - - u0 = _mm_add_epi16(in0, in1); - u1 = _mm_add_epi16(in2, in3); - - in0 = _mm_load_si128((const __m128i *)(input + 4 * stride)); - in1 = _mm_load_si128((const __m128i *)(input + 5 * stride)); - in2 = _mm_load_si128((const __m128i *)(input + 6 * stride)); - in3 = _mm_load_si128((const __m128i *)(input + 7 * stride)); - - sum = _mm_add_epi16(u0, u1); - - in0 = _mm_add_epi16(in0, in1); - in2 = _mm_add_epi16(in2, in3); - sum = _mm_add_epi16(sum, in0); - - u0 = _mm_setzero_si128(); - sum = _mm_add_epi16(sum, in2); - - in0 = _mm_unpacklo_epi16(u0, sum); - in1 = _mm_unpackhi_epi16(u0, sum); - in0 = _mm_srai_epi32(in0, 16); - in1 = _mm_srai_epi32(in1, 16); - - sum = _mm_add_epi32(in0, in1); - in0 = _mm_unpacklo_epi32(sum, u0); - in1 = _mm_unpackhi_epi32(sum, u0); - - sum = _mm_add_epi32(in0, in1); - in0 = _mm_srli_si128(sum, 8); - - in1 = _mm_add_epi32(sum, in0); - output[0] = (tran_low_t)_mm_cvtsi128_si32(in1); -} - #define DCT_HIGH_BIT_DEPTH 0 +#define FDCT4x4_2D_HELPER fdct4x4_helper +#define FDCT4x4_2D aom_fdct4x4_sse2 +#define FDCT4x4_2D_LP aom_fdct4x4_lp_sse2 #define FDCT8x8_2D aom_fdct8x8_sse2 #include "aom_dsp/x86/fwd_txfm_impl_sse2.h" +#undef FDCT4x4_2D_HELPER +#undef FDCT4x4_2D +#undef FDCT4x4_2D_LP #undef FDCT8x8_2D +#if CONFIG_AV1_HIGHBITDEPTH + #undef DCT_HIGH_BIT_DEPTH #define DCT_HIGH_BIT_DEPTH 1 #define FDCT8x8_2D aom_highbd_fdct8x8_sse2 #include "aom_dsp/x86/fwd_txfm_impl_sse2.h" // NOLINT #undef FDCT8x8_2D + +#endif diff --git a/media/libaom/src/aom_dsp/x86/fwd_txfm_sse2.h b/media/libaom/src/aom_dsp/x86/fwd_txfm_sse2.h index 260d8dd58e..ab3cd91557 100644 --- a/media/libaom/src/aom_dsp/x86/fwd_txfm_sse2.h +++ b/media/libaom/src/aom_dsp/x86/fwd_txfm_sse2.h @@ -136,16 +136,21 @@ static INLINE int check_epi16_overflow_x32( } static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) { - if (sizeof(tran_low_t) == 4) { - const __m128i zero = _mm_setzero_si128(); - const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); - __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); - __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); - _mm_store_si128((__m128i *)(dst_ptr), out0); - _mm_store_si128((__m128i *)(dst_ptr + 4), out1); - } else { - _mm_store_si128((__m128i *)(dst_ptr), *poutput); - } + const __m128i zero = _mm_setzero_si128(); + const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); + __m128i out0 = 
_mm_unpacklo_epi16(*poutput, sign_bits); + __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); + _mm_store_si128((__m128i *)(dst_ptr), out0); + _mm_store_si128((__m128i *)(dst_ptr + 4), out1); +} + +static INLINE void storeu_output(const __m128i *poutput, tran_low_t *dst_ptr) { + const __m128i zero = _mm_setzero_si128(); + const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); + __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); + __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); + _mm_storeu_si128((__m128i *)(dst_ptr), out0); + _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1); } #ifdef __cplusplus diff --git a/media/libaom/src/aom_dsp/x86/highbd_adaptive_quantize_avx2.c b/media/libaom/src/aom_dsp/x86/highbd_adaptive_quantize_avx2.c new file mode 100644 index 0000000000..c500b0a26c --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/highbd_adaptive_quantize_avx2.c @@ -0,0 +1,457 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <immintrin.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/x86/quantize_x86.h" + +#include "av1/encoder/av1_quantize.h" + +static INLINE void highbd_load_b_values_avx2( + const int16_t *zbin_ptr, __m256i *zbin, const int16_t *round_ptr, + __m256i *round, const int16_t *quant_ptr, __m256i *quant, + const int16_t *dequant_ptr, __m256i *dequant, const int16_t *shift_ptr, + __m256i *shift) { + *zbin = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)zbin_ptr)); + *zbin = _mm256_sub_epi32(*zbin, _mm256_set1_epi32(1)); + *round = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)round_ptr)); + *quant = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)quant_ptr)); + *dequant = + _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)dequant_ptr)); + *shift = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)shift_ptr)); +} + +static INLINE void highbd_update_mask1_avx2(__m256i *cmp_mask, + const int16_t *iscan_ptr, + int *is_found, __m256i *mask) { + __m256i temp_mask = _mm256_setzero_si256(); + if (_mm256_movemask_epi8(*cmp_mask)) { + __m256i iscan = _mm256_loadu_si256((const __m256i *)(iscan_ptr)); + temp_mask = _mm256_and_si256(*cmp_mask, iscan); + *is_found = 1; + } + *mask = _mm256_max_epi16(temp_mask, *mask); +} + +static INLINE void highbd_update_mask0_avx2(__m256i *qcoeff0, __m256i *qcoeff1, + __m256i *threshold, + const int16_t *iscan_ptr, + int *is_found, __m256i *mask) { + __m256i coeff[2], cmp_mask0, cmp_mask1; + coeff[0] = _mm256_slli_epi32(*qcoeff0, AOM_QM_BITS); + cmp_mask0 = _mm256_cmpgt_epi32(coeff[0], threshold[0]); + coeff[1] = _mm256_slli_epi32(*qcoeff1, AOM_QM_BITS); + cmp_mask1 = _mm256_cmpgt_epi32(coeff[1], threshold[1]); + cmp_mask0 = + _mm256_permute4x64_epi64(_mm256_packs_epi32(cmp_mask0, cmp_mask1), 0xd8); + highbd_update_mask1_avx2(&cmp_mask0, iscan_ptr, is_found, mask); +} + +static INLINE void highbd_mul_shift_avx2(const __m256i *x, const __m256i *y, + __m256i *p, const int shift) { + __m256i prod_lo = _mm256_mul_epi32(*x, *y); + __m256i prod_hi = _mm256_srli_epi64(*x, 32); + 
const __m256i mult_hi = _mm256_srli_epi64(*y, 32); + prod_hi = _mm256_mul_epi32(prod_hi, mult_hi); + + prod_lo = _mm256_srli_epi64(prod_lo, shift); + prod_hi = _mm256_srli_epi64(prod_hi, shift); + + prod_hi = _mm256_slli_epi64(prod_hi, 32); + *p = _mm256_blend_epi32(prod_lo, prod_hi, 0xaa); +} + +static INLINE void highbd_calculate_qcoeff_avx2(__m256i *coeff, + const __m256i *round, + const __m256i *quant, + const __m256i *shift, + const int *log_scale) { + __m256i tmp, qcoeff; + qcoeff = _mm256_add_epi32(*coeff, *round); + highbd_mul_shift_avx2(&qcoeff, quant, &tmp, 16); + qcoeff = _mm256_add_epi32(tmp, qcoeff); + highbd_mul_shift_avx2(&qcoeff, shift, coeff, 16 - *log_scale); +} + +static INLINE __m256i highbd_calculate_dqcoeff_avx2(__m256i qcoeff, + __m256i dequant) { + return _mm256_mullo_epi32(qcoeff, dequant); +} + +static INLINE __m256i highbd_calculate_dqcoeff_log_scale_avx2( + __m256i qcoeff, __m256i dequant, const int log_scale) { + __m256i abs_coeff = _mm256_abs_epi32(qcoeff); + highbd_mul_shift_avx2(&abs_coeff, &dequant, &abs_coeff, log_scale); + return _mm256_sign_epi32(abs_coeff, qcoeff); +} + +static INLINE void highbd_store_coefficients_avx2(__m256i coeff0, + __m256i coeff1, + tran_low_t *coeff_ptr) { + _mm256_store_si256((__m256i *)(coeff_ptr), coeff0); + _mm256_store_si256((__m256i *)(coeff_ptr + 8), coeff1); +} + +void aom_highbd_quantize_b_adaptive_avx2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int index = 16; + int non_zero_count = 0; + int non_zero_count_prescan_add_zero = 0; + int is_found0 = 0, is_found1 = 0; + int eob = -1; + const __m256i zero = _mm256_setzero_si256(); + __m256i zbin, round, quant, dequant, shift; + __m256i coeff0, qcoeff0, coeff1, qcoeff1; + __m256i cmp_mask, mask0 = zero, mask1 = zero; + __m128i temp_mask0, temp_mask1; + int prescan_add[2]; + int thresh[2]; + const int log_scale = 0; + const qm_val_t wt = (1 << AOM_QM_BITS); + for (int i = 0; i < 2; ++i) { + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + thresh[i] = (zbin_ptr[i] * wt + prescan_add[i]) - 1; + } + __m256i threshold[2]; + threshold[0] = _mm256_set1_epi32(thresh[0]); + threshold[1] = _mm256_set1_epi32(thresh[1]); + threshold[0] = _mm256_blend_epi32(threshold[0], threshold[1], 0xfe); + +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif + + // Setup global values. + highbd_load_b_values_avx2(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, + &quant, dequant_ptr, &dequant, quant_shift_ptr, + &shift); + + // Do DC and first 15 AC. 
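+  // The quantizer vectors are expected to carry the DC entry in lane 0 and
+  // the AC entry in the remaining lanes, so only coefficient 0 sees DC
+  // parameters here; after this first group the _mm256_unpackhi_epi64()
+  // calls broadcast the AC entry across all lanes.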
+ coeff0 = _mm256_load_si256((__m256i *)(coeff_ptr)); + qcoeff0 = _mm256_abs_epi32(coeff0); + coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr + 8)); + qcoeff1 = _mm256_abs_epi32(coeff1); + highbd_update_mask0_avx2(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, + &mask0); + __m256i temp0 = _mm256_cmpgt_epi32(qcoeff0, zbin); + zbin = _mm256_unpackhi_epi64(zbin, zbin); + __m256i temp1 = _mm256_cmpgt_epi32(qcoeff1, zbin); + cmp_mask = _mm256_permute4x64_epi64(_mm256_packs_epi32(temp0, temp1), 0xd8); + highbd_update_mask1_avx2(&cmp_mask, iscan, &is_found1, &mask1); + threshold[0] = threshold[1]; + if (_mm256_movemask_epi8(cmp_mask) == 0) { + _mm256_store_si256((__m256i *)(qcoeff_ptr), zero); + _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), zero); + round = _mm256_unpackhi_epi64(round, round); + quant = _mm256_unpackhi_epi64(quant, quant); + shift = _mm256_unpackhi_epi64(shift, shift); + dequant = _mm256_unpackhi_epi64(dequant, dequant); + } else { + highbd_calculate_qcoeff_avx2(&qcoeff0, &round, &quant, &shift, &log_scale); + round = _mm256_unpackhi_epi64(round, round); + quant = _mm256_unpackhi_epi64(quant, quant); + shift = _mm256_unpackhi_epi64(shift, shift); + highbd_calculate_qcoeff_avx2(&qcoeff1, &round, &quant, &shift, &log_scale); + // Reinsert signs + qcoeff0 = _mm256_sign_epi32(qcoeff0, coeff0); + qcoeff1 = _mm256_sign_epi32(qcoeff1, coeff1); + // Mask out zbin threshold coeffs + qcoeff0 = _mm256_and_si256(qcoeff0, temp0); + qcoeff1 = _mm256_and_si256(qcoeff1, temp1); + highbd_store_coefficients_avx2(qcoeff0, qcoeff1, qcoeff_ptr); + coeff0 = highbd_calculate_dqcoeff_avx2(qcoeff0, dequant); + dequant = _mm256_unpackhi_epi64(dequant, dequant); + coeff1 = highbd_calculate_dqcoeff_avx2(qcoeff1, dequant); + highbd_store_coefficients_avx2(coeff0, coeff1, dqcoeff_ptr); + } + + // AC only loop. 
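+  // From here on round/quant/shift/dequant contain only the AC entry, so
+  // every remaining group of 16 coefficients is quantized with identical
+  // parameters; groups that fail the zbin comparison are zeroed and skipped.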
+ while (index < n_coeffs) { + coeff0 = _mm256_load_si256((__m256i *)(coeff_ptr + index)); + qcoeff0 = _mm256_abs_epi32(coeff0); + coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr + index + 8)); + qcoeff1 = _mm256_abs_epi32(coeff1); + highbd_update_mask0_avx2(&qcoeff0, &qcoeff1, threshold, iscan + index, + &is_found0, &mask0); + temp0 = _mm256_cmpgt_epi32(qcoeff0, zbin); + temp1 = _mm256_cmpgt_epi32(qcoeff1, zbin); + cmp_mask = _mm256_permute4x64_epi64(_mm256_packs_epi32(temp0, temp1), 0xd8); + highbd_update_mask1_avx2(&cmp_mask, iscan + index, &is_found1, &mask1); + if (_mm256_movemask_epi8(cmp_mask) == 0) { + _mm256_store_si256((__m256i *)(qcoeff_ptr + index), zero); + _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), zero); + index += 16; + continue; + } + highbd_calculate_qcoeff_avx2(&qcoeff0, &round, &quant, &shift, &log_scale); + highbd_calculate_qcoeff_avx2(&qcoeff1, &round, &quant, &shift, &log_scale); + qcoeff0 = _mm256_sign_epi32(qcoeff0, coeff0); + qcoeff1 = _mm256_sign_epi32(qcoeff1, coeff1); + qcoeff0 = _mm256_and_si256(qcoeff0, temp0); + qcoeff1 = _mm256_and_si256(qcoeff1, temp1); + highbd_store_coefficients_avx2(qcoeff0, qcoeff1, qcoeff_ptr + index); + coeff0 = highbd_calculate_dqcoeff_avx2(qcoeff0, dequant); + coeff1 = highbd_calculate_dqcoeff_avx2(qcoeff1, dequant); + highbd_store_coefficients_avx2(coeff0, coeff1, dqcoeff_ptr + index); + index += 16; + } + if (is_found0) { + temp_mask0 = _mm_max_epi16(_mm256_castsi256_si128(mask0), + _mm256_extracti128_si256(mask0, 1)); + non_zero_count = calculate_non_zero_count(temp_mask0); + } + if (is_found1) { + temp_mask1 = _mm_max_epi16(_mm256_castsi256_si128(mask1), + _mm256_extracti128_si256(mask1, 1)); + non_zero_count_prescan_add_zero = calculate_non_zero_count(temp_mask1); + } + + for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { + const int rc = scan[i]; + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + + for (int i = non_zero_count - 1; i >= 0; i--) { + const int rc = scan[i]; + if (qcoeff_ptr[rc]) { + eob = i; + break; + } + } + + *eob_ptr = eob + 1; +#if SKIP_EOB_FACTOR_ADJUST + // TODO(Aniket): Experiment the following loop with intrinsic by combining + // with the quantization loop above + for (int i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int qcoeff = qcoeff_ptr[rc]; + if (qcoeff) { + first = i; + break; + } + } + if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { + const int rc = scan[(*eob_ptr - 1)]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const int coeff = coeff_ptr[rc] * wt; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (abs_coeff < + (zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + *eob_ptr = 0; + } + } + } +#endif +} + +void aom_highbd_quantize_b_32x32_adaptive_avx2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int index = 16; + int non_zero_count = 0; + int non_zero_count_prescan_add_zero = 0; 
+ int is_found0 = 0, is_found1 = 0; + int eob = -1; + const int log_scale = 1; + const __m256i zero = _mm256_setzero_si256(); + __m256i zbin, round, quant, dequant, shift; + __m256i coeff0, qcoeff0, coeff1, qcoeff1; + __m256i cmp_mask, mask0 = zero, mask1 = zero; + __m128i temp_mask0, temp_mask1; + const __m256i one = _mm256_set1_epi32(1); + const __m256i log_scale_vec = _mm256_set1_epi32(log_scale); + int prescan_add[2]; + int thresh[2]; + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), + ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; + const qm_val_t wt = (1 << AOM_QM_BITS); + for (int i = 0; i < 2; ++i) { + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1; + } + __m256i threshold[2]; + threshold[0] = _mm256_set1_epi32(thresh[0]); + threshold[1] = _mm256_set1_epi32(thresh[1]); + threshold[0] = _mm256_blend_epi32(threshold[0], threshold[1], 0xfe); + +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif + + // Setup global values. + zbin = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)zbin_ptr)); + round = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)round_ptr)); + quant = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)quant_ptr)); + dequant = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)dequant_ptr)); + shift = + _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)quant_shift_ptr)); + + // Shift with rounding. + zbin = _mm256_add_epi32(zbin, log_scale_vec); + round = _mm256_add_epi32(round, log_scale_vec); + zbin = _mm256_srli_epi32(zbin, log_scale); + round = _mm256_srli_epi32(round, log_scale); + zbin = _mm256_sub_epi32(zbin, one); + + // Do DC and first 15 AC. + coeff0 = _mm256_load_si256((__m256i *)(coeff_ptr)); + qcoeff0 = _mm256_abs_epi32(coeff0); + coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr + 8)); + qcoeff1 = _mm256_abs_epi32(coeff1); + highbd_update_mask0_avx2(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, + &mask0); + __m256i temp0 = _mm256_cmpgt_epi32(qcoeff0, zbin); + zbin = _mm256_permute2x128_si256(zbin, zbin, 0x11); + __m256i temp1 = _mm256_cmpgt_epi32(qcoeff1, zbin); + cmp_mask = _mm256_permute4x64_epi64(_mm256_packs_epi32(temp0, temp1), 0xd8); + highbd_update_mask1_avx2(&cmp_mask, iscan, &is_found1, &mask1); + threshold[0] = threshold[1]; + if (_mm256_movemask_epi8(cmp_mask) == 0) { + _mm256_store_si256((__m256i *)(qcoeff_ptr), zero); + _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), zero); + round = _mm256_permute2x128_si256(round, round, 0x11); + quant = _mm256_permute2x128_si256(quant, quant, 0x11); + shift = _mm256_permute2x128_si256(shift, shift, 0x11); + dequant = _mm256_permute2x128_si256(dequant, dequant, 0x11); + } else { + highbd_calculate_qcoeff_avx2(&qcoeff0, &round, &quant, &shift, &log_scale); + round = _mm256_permute2x128_si256(round, round, 0x11); + quant = _mm256_permute2x128_si256(quant, quant, 0x11); + shift = _mm256_permute2x128_si256(shift, shift, 0x11); + highbd_calculate_qcoeff_avx2(&qcoeff1, &round, &quant, &shift, &log_scale); + // Reinsert signs + qcoeff0 = _mm256_sign_epi32(qcoeff0, coeff0); + qcoeff1 = _mm256_sign_epi32(qcoeff1, coeff1); + // Mask out zbin threshold coeffs + qcoeff0 = _mm256_and_si256(qcoeff0, temp0); + qcoeff1 = _mm256_and_si256(qcoeff1, temp1); + highbd_store_coefficients_avx2(qcoeff0, qcoeff1, qcoeff_ptr); + coeff0 = + highbd_calculate_dqcoeff_log_scale_avx2(qcoeff0, dequant, 
log_scale); + dequant = _mm256_permute2x128_si256(dequant, dequant, 0x11); + coeff1 = + highbd_calculate_dqcoeff_log_scale_avx2(qcoeff1, dequant, log_scale); + highbd_store_coefficients_avx2(coeff0, coeff1, dqcoeff_ptr); + } + + // AC only loop. + while (index < n_coeffs) { + coeff0 = _mm256_load_si256((__m256i *)(coeff_ptr + index)); + qcoeff0 = _mm256_abs_epi32(coeff0); + coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr + index + 8)); + qcoeff1 = _mm256_abs_epi32(coeff1); + highbd_update_mask0_avx2(&qcoeff0, &qcoeff1, threshold, iscan + index, + &is_found0, &mask0); + temp0 = _mm256_cmpgt_epi32(qcoeff0, zbin); + temp1 = _mm256_cmpgt_epi32(qcoeff1, zbin); + cmp_mask = _mm256_permute4x64_epi64(_mm256_packs_epi32(temp0, temp1), 0xd8); + highbd_update_mask1_avx2(&cmp_mask, iscan + index, &is_found1, &mask1); + if (_mm256_movemask_epi8(cmp_mask) == 0) { + _mm256_store_si256((__m256i *)(qcoeff_ptr + index), zero); + _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), zero); + index += 16; + continue; + } + highbd_calculate_qcoeff_avx2(&qcoeff0, &round, &quant, &shift, &log_scale); + highbd_calculate_qcoeff_avx2(&qcoeff1, &round, &quant, &shift, &log_scale); + qcoeff0 = _mm256_sign_epi32(qcoeff0, coeff0); + qcoeff1 = _mm256_sign_epi32(qcoeff1, coeff1); + qcoeff0 = _mm256_and_si256(qcoeff0, temp0); + qcoeff1 = _mm256_and_si256(qcoeff1, temp1); + highbd_store_coefficients_avx2(qcoeff0, qcoeff1, qcoeff_ptr + index); + coeff0 = + highbd_calculate_dqcoeff_log_scale_avx2(qcoeff0, dequant, log_scale); + coeff1 = + highbd_calculate_dqcoeff_log_scale_avx2(qcoeff1, dequant, log_scale); + highbd_store_coefficients_avx2(coeff0, coeff1, dqcoeff_ptr + index); + index += 16; + } + if (is_found0) { + temp_mask0 = _mm_max_epi16(_mm256_castsi256_si128(mask0), + _mm256_extracti128_si256(mask0, 1)); + non_zero_count = calculate_non_zero_count(temp_mask0); + } + if (is_found1) { + temp_mask1 = _mm_max_epi16(_mm256_castsi256_si128(mask1), + _mm256_extracti128_si256(mask1, 1)); + non_zero_count_prescan_add_zero = calculate_non_zero_count(temp_mask1); + } + + for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { + const int rc = scan[i]; + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + + for (int i = non_zero_count - 1; i >= 0; i--) { + const int rc = scan[i]; + if (qcoeff_ptr[rc]) { + eob = i; + break; + } + } + + *eob_ptr = eob + 1; +#if SKIP_EOB_FACTOR_ADJUST + // TODO(Aniket): Experiment the following loop with intrinsic by combining + // with the quantization loop above + for (int i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int qcoeff = qcoeff_ptr[rc]; + if (qcoeff) { + first = i; + break; + } + } + if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { + const int rc = scan[(*eob_ptr - 1)]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const int coeff = coeff_ptr[rc] * wt; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + *eob_ptr = 0; + } + } + } +#endif +} diff --git a/media/libaom/src/aom_dsp/x86/highbd_adaptive_quantize_sse2.c b/media/libaom/src/aom_dsp/x86/highbd_adaptive_quantize_sse2.c new file mode 100644 
index 0000000000..8f31f3596f --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/highbd_adaptive_quantize_sse2.c @@ -0,0 +1,732 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <emmintrin.h> +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/x86/quantize_x86.h" +#include "av1/encoder/av1_quantize.h" + +static INLINE __m128i highbd_invert_sign_64bit_sse2(__m128i a, __m128i sign) { + a = _mm_xor_si128(a, sign); + return _mm_sub_epi64(a, sign); +} + +static INLINE void highbd_mul_shift_sse2(const __m128i *x, const __m128i *y, + __m128i *p, const int shift) { + __m128i sign = _mm_srai_epi32(*y, 31); + __m128i sign_lo = _mm_unpacklo_epi32(sign, sign); + __m128i sign_hi = _mm_unpackhi_epi32(sign, sign); + __m128i abs_y = invert_sign_32_sse2(*y, sign); + __m128i prod_lo = _mm_mul_epu32(*x, abs_y); + __m128i prod_hi = _mm_srli_epi64(*x, 32); + const __m128i mult_hi = _mm_srli_epi64(abs_y, 32); + prod_hi = _mm_mul_epu32(prod_hi, mult_hi); + prod_lo = highbd_invert_sign_64bit_sse2(prod_lo, sign_lo); + prod_hi = highbd_invert_sign_64bit_sse2(prod_hi, sign_hi); + + prod_lo = _mm_srli_epi64(prod_lo, shift); + const __m128i mask = _mm_set_epi32(0, -1, 0, -1); + prod_lo = _mm_and_si128(prod_lo, mask); + prod_hi = _mm_srli_epi64(prod_hi, shift); + + prod_hi = _mm_slli_epi64(prod_hi, 32); + *p = _mm_or_si128(prod_lo, prod_hi); +} + +static INLINE void highbd_calculate_qcoeff(__m128i *coeff, const __m128i *round, + const __m128i *quant, + const __m128i *shift, + const int *log_scale) { + __m128i tmp, qcoeff; + qcoeff = _mm_add_epi32(*coeff, *round); + highbd_mul_shift_sse2(&qcoeff, quant, &tmp, 16); + qcoeff = _mm_add_epi32(tmp, qcoeff); + highbd_mul_shift_sse2(&qcoeff, shift, coeff, 16 - *log_scale); +} + +static INLINE void highbd_update_mask1(__m128i *cmp_mask0, + const int16_t *iscan_ptr, int *is_found, + __m128i *mask) { + __m128i temp_mask = _mm_setzero_si128(); + if (_mm_movemask_epi8(*cmp_mask0)) { + __m128i iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr)); + __m128i mask0 = _mm_and_si128(*cmp_mask0, iscan0); + temp_mask = mask0; + *is_found = 1; + } + *mask = _mm_max_epi16(temp_mask, *mask); +} + +static INLINE void highbd_update_mask0(__m128i *qcoeff0, __m128i *qcoeff1, + __m128i *threshold, + const int16_t *iscan_ptr, int *is_found, + __m128i *mask) { + __m128i coeff[2], cmp_mask0, cmp_mask1; + + coeff[0] = _mm_slli_epi32(*qcoeff0, AOM_QM_BITS); + cmp_mask0 = _mm_cmpgt_epi32(coeff[0], threshold[0]); + coeff[1] = _mm_slli_epi32(*qcoeff1, AOM_QM_BITS); + cmp_mask1 = _mm_cmpgt_epi32(coeff[1], threshold[1]); + + cmp_mask0 = _mm_packs_epi32(cmp_mask0, cmp_mask1); + + highbd_update_mask1(&cmp_mask0, iscan_ptr, is_found, mask); +} + +static INLINE __m128i highbd_calculate_dqcoeff(__m128i qcoeff, __m128i dequant, + const int log_scale) { + __m128i coeff_sign = _mm_srai_epi32(qcoeff, 31); + __m128i abs_coeff = invert_sign_32_sse2(qcoeff, coeff_sign); + highbd_mul_shift_sse2(&abs_coeff, &dequant, &abs_coeff, log_scale); + return invert_sign_32_sse2(abs_coeff, coeff_sign); +} 
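The SSE2 helpers above reproduce, lane by lane, the same per-coefficient arithmetic that the scalar quantizer uses (compare the scalar pre-scan path of aom_highbd_quantize_b_64x64_sse2 added to highbd_quantize_intrin_sse2.c later in this patch). As an editor's illustration only, here is a plain-C sketch of that arithmetic; the names round_pow2 and quantize_coeff_scalar are invented for the sketch, and the constants in main() are arbitrary placeholders rather than real quantizer tables:

/* Editor's sketch, not part of the patch: scalar reference for one
 * coefficient.  round_pow2() matches ROUND_POWER_OF_TWO(); log_scale is 0 for
 * the base quantizer, 1 for the 32x32 variant and 2 for the 64x64 variant. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static int32_t round_pow2(int32_t v, int n) {
  return (v + ((1 << n) >> 1)) >> n;
}

static int32_t quantize_coeff_scalar(int32_t coeff, int is_dc, int log_scale,
                                     const int16_t zbin[2],
                                     const int16_t round[2],
                                     const int16_t quant[2],
                                     const int16_t quant_shift[2],
                                     const int16_t dequant[2],
                                     int32_t *dqcoeff) {
  const int k = is_dc ? 0 : 1;
  const int32_t abs_coeff = abs(coeff);
  const int32_t sign = coeff < 0 ? -1 : 0;

  /* Dead zone: anything below the (scaled) zero-bin quantizes to zero. */
  if (abs_coeff < round_pow2(zbin[k], log_scale)) {
    *dqcoeff = 0;
    return 0;
  }

  /* abs_q = ((t + (t * quant >> 16)) * quant_shift) >> (16 - log_scale) with
   * t = abs_coeff + round; these are the two highbd_mul_shift_sse2() calls
   * inside highbd_calculate_qcoeff(). */
  const int64_t t = abs_coeff + round_pow2(round[k], log_scale);
  const int64_t t2 = ((t * quant[k]) >> 16) + t;
  const int32_t abs_q =
      (int32_t)((t2 * quant_shift[k]) >> (16 - log_scale));

  /* Dequant mirrors highbd_calculate_dqcoeff(): scale back, reapply sign. */
  const int32_t abs_dq =
      (int32_t)(((int64_t)abs_q * dequant[k]) >> log_scale);
  *dqcoeff = (abs_dq ^ sign) - sign;
  return (abs_q ^ sign) - sign;
}

int main(void) {
  /* Arbitrary placeholder constants, only to show the call shape. */
  const int16_t zbin[2] = { 24, 21 }, round[2] = { 20, 16 };
  const int16_t quant[2] = { 16384, 16384 };
  const int16_t quant_shift[2] = { 16384, 16384 };
  const int16_t dequant[2] = { 40, 34 };
  int32_t dq;
  const int32_t q = quantize_coeff_scalar(-150, /*is_dc=*/0, /*log_scale=*/1,
                                          zbin, round, quant, quant_shift,
                                          dequant, &dq);
  printf("qcoeff=%d dqcoeff=%d\n", q, dq);
  return 0;
}

The 64-bit intermediate products in this sketch are why highbd_mul_shift_sse2() widens each vector half with _mm_mul_epu32 before shifting back down: once 12-bit input is scaled by the 16-bit quantizer constants, the products no longer fit in 32 bits.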
+ +void aom_highbd_quantize_b_adaptive_sse2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int index = 8; + const int log_scale = 0; + int non_zero_count = 0; + int non_zero_count_prescan_add_zero = 0; + int is_found0 = 0, is_found1 = 0; + int eob = -1; + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi32(1); + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1, cmp_mask; + __m128i all_zero; + __m128i mask0 = zero, mask1 = zero; + + int prescan_add[2]; + int thresh[4]; + const qm_val_t wt = (1 << AOM_QM_BITS); + for (int i = 0; i < 2; ++i) { + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + thresh[i] = (zbin_ptr[i] * wt + prescan_add[i]) - 1; + } + thresh[2] = thresh[3] = thresh[1]; + __m128i threshold[2]; + threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]); + threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]); + +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif + // Setup global values. + zbin = _mm_load_si128((const __m128i *)zbin_ptr); + round = _mm_load_si128((const __m128i *)round_ptr); + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + shift = _mm_load_si128((const __m128i *)quant_shift_ptr); + + __m128i zbin_sign = _mm_srai_epi16(zbin, 15); + __m128i round_sign = _mm_srai_epi16(round, 15); + __m128i quant_sign = _mm_srai_epi16(quant, 15); + __m128i dequant_sign = _mm_srai_epi16(dequant, 15); + __m128i shift_sign = _mm_srai_epi16(shift, 15); + + zbin = _mm_unpacklo_epi16(zbin, zbin_sign); + round = _mm_unpacklo_epi16(round, round_sign); + quant = _mm_unpacklo_epi16(quant, quant_sign); + dequant = _mm_unpacklo_epi16(dequant, dequant_sign); + shift = _mm_unpacklo_epi16(shift, shift_sign); + zbin = _mm_sub_epi32(zbin, one); + + // Do DC and first 15 AC. 
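+ // In this SSE2 version the two 4-lane loads below cover eight coefficients + // (the DC term plus the first 7 AC); the rest are handled in the AC-only loop.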
+ coeff0 = _mm_load_si128((__m128i *)(coeff_ptr)); + coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + 4)); + + coeff0_sign = _mm_srai_epi32(coeff0, 31); + coeff1_sign = _mm_srai_epi32(coeff1, 31); + qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign); + + highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0); + + cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin); + cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1); + highbd_update_mask1(&cmp_mask, iscan, &is_found1, &mask1); + + threshold[0] = threshold[1]; + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale); + + // Reinsert signs + qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + _mm_store_si128((__m128i *)(qcoeff_ptr), qcoeff0); + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), qcoeff1); + + coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale); + _mm_store_si128((__m128i *)(dqcoeff_ptr), coeff0); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), coeff1); + } + + // AC only loop. 
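+ // Each iteration below quantizes eight more coefficients as two 4-lane + // halves. The movemask early-out stores zeros and skips the multiplies when + // an entire group falls inside the zero-bin. mask0 collects iscan positions + // that clear the prescan threshold and mask1 those that clear the zero-bin; + // they become non_zero_count and non_zero_count_prescan_add_zero, which + // drive the trimming and eob search after the loop.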
+ while (index < n_coeffs) { + coeff0 = _mm_load_si128((__m128i *)(coeff_ptr + index)); + coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + index + 4)); + + coeff0_sign = _mm_srai_epi32(coeff0, 31); + coeff1_sign = _mm_srai_epi32(coeff1, 31); + qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign); + + highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, + &is_found0, &mask0); + + cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin); + cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1); + highbd_update_mask1(&cmp_mask, iscan + index, &is_found1, &mask1); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); + index += 8; + continue; + } + highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale); + highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale); + + qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + _mm_store_si128((__m128i *)(qcoeff_ptr + index), qcoeff0); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), qcoeff1); + + coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale); + coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale); + + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), coeff0); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), coeff1); + + index += 8; + } + if (is_found0) non_zero_count = calculate_non_zero_count(mask0); + if (is_found1) + non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1); + + for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { + const int rc = scan[i]; + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + + for (int i = non_zero_count - 1; i >= 0; i--) { + const int rc = scan[i]; + if (qcoeff_ptr[rc]) { + eob = i; + break; + } + } + + *eob_ptr = eob + 1; +#if SKIP_EOB_FACTOR_ADJUST + // TODO(Aniket): Experiment the following loop with intrinsic by combining + // with the quantization loop above + for (int i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int qcoeff = qcoeff_ptr[rc]; + if (qcoeff) { + first = i; + break; + } + } + if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { + const int rc = scan[(*eob_ptr - 1)]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const int coeff = coeff_ptr[rc] * wt; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (abs_coeff < + (zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + *eob_ptr = 0; + } + } + } +#endif +} + +void aom_highbd_quantize_b_32x32_adaptive_sse2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int index = 8; + const int log_scale = 1; + int 
non_zero_count = 0; + int non_zero_count_prescan_add_zero = 0; + int is_found0 = 0, is_found1 = 0; + int eob = -1; + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi32(1); + const __m128i log_scale_vec = _mm_set1_epi32(log_scale); + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1, cmp_mask; + __m128i all_zero; + __m128i mask0 = zero, mask1 = zero; + + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), + ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; + int prescan_add[2]; + int thresh[4]; + const qm_val_t wt = (1 << AOM_QM_BITS); + for (int i = 0; i < 2; ++i) { + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1; + } + thresh[2] = thresh[3] = thresh[1]; + __m128i threshold[2]; + threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]); + threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]); + +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif + // Setup global values. + zbin = _mm_load_si128((const __m128i *)zbin_ptr); + round = _mm_load_si128((const __m128i *)round_ptr); + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + shift = _mm_load_si128((const __m128i *)quant_shift_ptr); + + __m128i zbin_sign = _mm_srai_epi16(zbin, 15); + __m128i round_sign = _mm_srai_epi16(round, 15); + __m128i quant_sign = _mm_srai_epi16(quant, 15); + __m128i dequant_sign = _mm_srai_epi16(dequant, 15); + __m128i shift_sign = _mm_srai_epi16(shift, 15); + + zbin = _mm_unpacklo_epi16(zbin, zbin_sign); + round = _mm_unpacklo_epi16(round, round_sign); + quant = _mm_unpacklo_epi16(quant, quant_sign); + dequant = _mm_unpacklo_epi16(dequant, dequant_sign); + shift = _mm_unpacklo_epi16(shift, shift_sign); + + // Shift with rounding. + zbin = _mm_add_epi32(zbin, log_scale_vec); + round = _mm_add_epi32(round, log_scale_vec); + zbin = _mm_srli_epi32(zbin, log_scale); + round = _mm_srli_epi32(round, log_scale); + zbin = _mm_sub_epi32(zbin, one); + + // Do DC and first 15 AC. 
+ coeff0 = _mm_load_si128((__m128i *)(coeff_ptr)); + coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + 4)); + + coeff0_sign = _mm_srai_epi32(coeff0, 31); + coeff1_sign = _mm_srai_epi32(coeff1, 31); + qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign); + + highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0); + + cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin); + cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1); + highbd_update_mask1(&cmp_mask, iscan, &is_found1, &mask1); + + threshold[0] = threshold[1]; + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale); + + // Reinsert signs + qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + _mm_store_si128((__m128i *)(qcoeff_ptr), qcoeff0); + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), qcoeff1); + + coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale); + _mm_store_si128((__m128i *)(dqcoeff_ptr), coeff0); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), coeff1); + } + + // AC only loop. 
+ while (index < n_coeffs) { + coeff0 = _mm_load_si128((__m128i *)(coeff_ptr + index)); + coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + index + 4)); + + coeff0_sign = _mm_srai_epi32(coeff0, 31); + coeff1_sign = _mm_srai_epi32(coeff1, 31); + qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign); + + highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, + &is_found0, &mask0); + + cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin); + cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1); + highbd_update_mask1(&cmp_mask, iscan + index, &is_found1, &mask1); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); + index += 8; + continue; + } + highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale); + highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale); + + qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + _mm_store_si128((__m128i *)(qcoeff_ptr + index), qcoeff0); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), qcoeff1); + + coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale); + coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale); + + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), coeff0); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), coeff1); + + index += 8; + } + if (is_found0) non_zero_count = calculate_non_zero_count(mask0); + if (is_found1) + non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1); + + for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { + const int rc = scan[i]; + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + + for (int i = non_zero_count - 1; i >= 0; i--) { + const int rc = scan[i]; + if (qcoeff_ptr[rc]) { + eob = i; + break; + } + } + + *eob_ptr = eob + 1; +#if SKIP_EOB_FACTOR_ADJUST + // TODO(Aniket): Experiment the following loop with intrinsic by combining + // with the quantization loop above + for (int i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int qcoeff = qcoeff_ptr[rc]; + if (qcoeff) { + first = i; + break; + } + } + if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { + const int rc = scan[(*eob_ptr - 1)]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const int coeff = coeff_ptr[rc] * wt; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + *eob_ptr = 0; + } + } + } +#endif +} + +void aom_highbd_quantize_b_64x64_adaptive_sse2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int index = 8; + const int log_scale = 2; + int 
non_zero_count = 0; + int non_zero_count_prescan_add_zero = 0; + int is_found0 = 0, is_found1 = 0; + int eob = -1; + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi32(1); + const __m128i log_scale_vec = _mm_set1_epi32(log_scale); + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1, cmp_mask; + __m128i all_zero; + __m128i mask0 = zero, mask1 = zero; + + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), + ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; + int prescan_add[2]; + int thresh[4]; + const qm_val_t wt = (1 << AOM_QM_BITS); + for (int i = 0; i < 2; ++i) { + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1; + } + thresh[2] = thresh[3] = thresh[1]; + __m128i threshold[2]; + threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]); + threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]); + +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif + // Setup global values. + zbin = _mm_load_si128((const __m128i *)zbin_ptr); + round = _mm_load_si128((const __m128i *)round_ptr); + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + shift = _mm_load_si128((const __m128i *)quant_shift_ptr); + + __m128i zbin_sign = _mm_srai_epi16(zbin, 15); + __m128i round_sign = _mm_srai_epi16(round, 15); + __m128i quant_sign = _mm_srai_epi16(quant, 15); + __m128i dequant_sign = _mm_srai_epi16(dequant, 15); + __m128i shift_sign = _mm_srai_epi16(shift, 15); + + zbin = _mm_unpacklo_epi16(zbin, zbin_sign); + round = _mm_unpacklo_epi16(round, round_sign); + quant = _mm_unpacklo_epi16(quant, quant_sign); + dequant = _mm_unpacklo_epi16(dequant, dequant_sign); + shift = _mm_unpacklo_epi16(shift, shift_sign); + + // Shift with rounding. + zbin = _mm_add_epi32(zbin, log_scale_vec); + round = _mm_add_epi32(round, log_scale_vec); + zbin = _mm_srli_epi32(zbin, log_scale); + round = _mm_srli_epi32(round, log_scale); + zbin = _mm_sub_epi32(zbin, one); + + // Do DC and first 15 AC. 
+ coeff0 = _mm_load_si128((__m128i *)(coeff_ptr)); + coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + 4)); + + coeff0_sign = _mm_srai_epi32(coeff0, 31); + coeff1_sign = _mm_srai_epi32(coeff1, 31); + qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign); + + highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0); + + cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin); + cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1); + highbd_update_mask1(&cmp_mask, iscan, &is_found1, &mask1); + + threshold[0] = threshold[1]; + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale); + + // Reinsert signs + qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + _mm_store_si128((__m128i *)(qcoeff_ptr), qcoeff0); + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), qcoeff1); + + coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale); + _mm_store_si128((__m128i *)(dqcoeff_ptr), coeff0); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), coeff1); + } + + // AC only loop. 
+ while (index < n_coeffs) { + coeff0 = _mm_load_si128((__m128i *)(coeff_ptr + index)); + coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + index + 4)); + + coeff0_sign = _mm_srai_epi32(coeff0, 31); + coeff1_sign = _mm_srai_epi32(coeff1, 31); + qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign); + + highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, + &is_found0, &mask0); + + cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin); + cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1); + highbd_update_mask1(&cmp_mask, iscan + index, &is_found1, &mask1); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); + index += 8; + continue; + } + highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale); + highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale); + + qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + _mm_store_si128((__m128i *)(qcoeff_ptr + index), qcoeff0); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), qcoeff1); + + coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale); + coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale); + + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), coeff0); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), coeff1); + + index += 8; + } + if (is_found0) non_zero_count = calculate_non_zero_count(mask0); + if (is_found1) + non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1); + + for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) { + const int rc = scan[i]; + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } + + for (int i = non_zero_count - 1; i >= 0; i--) { + const int rc = scan[i]; + if (qcoeff_ptr[rc]) { + eob = i; + break; + } + } + + *eob_ptr = eob + 1; +#if SKIP_EOB_FACTOR_ADJUST + // TODO(Aniket): Experiment the following loop with intrinsic by combining + // with the quantization loop above + for (int i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int qcoeff = qcoeff_ptr[rc]; + if (qcoeff) { + first = i; + break; + } + } + if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { + const int rc = scan[(*eob_ptr - 1)]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const int coeff = coeff_ptr[rc] * wt; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + *eob_ptr = 0; + } + } + } +#endif +} diff --git a/media/libaom/src/aom_dsp/x86/highbd_convolve_avx2.c b/media/libaom/src/aom_dsp/x86/highbd_convolve_avx2.c index 099fcf7fc6..b43a7d7b5b 100644 --- a/media/libaom/src/aom_dsp/x86/highbd_convolve_avx2.c +++ b/media/libaom/src/aom_dsp/x86/highbd_convolve_avx2.c @@ -20,6 +20,14 @@ // ----------------------------------------------------------------------------- // Copy and average +static const uint8_t 
ip_shuffle_f2f3[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, + 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3, + 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 }; +static const uint8_t ip_shuffle_f4f5[32] = { 4, 5, 6, 7, 6, 7, 8, 9, + 8, 9, 10, 11, 10, 11, 12, 13, + 4, 5, 6, 7, 6, 7, 8, 9, + 8, 9, 10, 11, 10, 11, 12, 13 }; + void aom_highbd_convolve_copy_avx2(const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, @@ -107,13 +115,13 @@ void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, + const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd) { int i, j; const int fo_vert = filter_params_y->taps / 2 - 1; const uint16_t *const src_ptr = src - fo_vert * src_stride; (void)filter_params_x; - (void)subpel_x_q4; + (void)subpel_x_qn; (void)conv_params; assert(conv_params->round_0 <= FILTER_BITS); @@ -130,7 +138,7 @@ void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride, _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); const __m256i zero = _mm256_setzero_si256(); - prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y); + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); for (j = 0; j < w; j += 8) { const uint16_t *data = &src_ptr[j]; @@ -256,12 +264,12 @@ void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, + const int subpel_x_qn, const int subpel_y_qn, ConvolveParams *conv_params, int bd) { int i, j; const int fo_horiz = filter_params_x->taps / 2 - 1; const uint16_t *const src_ptr = src - fo_horiz; - (void)subpel_y_q4; + (void)subpel_y_qn; (void)filter_params_y; // Check that, even with 12-bit input, the intermediate values will fit @@ -285,7 +293,7 @@ void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride, assert((FILTER_BITS - conv_params->round_1) >= 0 || ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); - prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x); + prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x); for (j = 0; j < w; j += 8) { /* Horizontal filter */ @@ -444,6 +452,17 @@ static INLINE void pack_filters(const int16_t *filter, __m256i *f /*f[4]*/) { f[3] = _mm256_shuffle_epi8(hh, p3); } +static INLINE void pack_filters_4tap(const int16_t *filter, + __m256i *f /*f[4]*/) { + const __m128i h = _mm_loadu_si128((const __m128i *)filter); + const __m256i coeff = _mm256_broadcastsi128_si256(h); + + // coeffs 2 3 2 3 2 3 2 3 + f[0] = _mm256_shuffle_epi32(coeff, 0x55); + // coeffs 4 5 4 5 4 5 4 5 + f[1] = _mm256_shuffle_epi32(coeff, 0xaa); +} + static INLINE void filter_8x1_pixels(const __m256i *sig /*sig[4]*/, const __m256i *fil /*fil[4]*/, __m256i *y) { @@ -544,6 +563,176 @@ static void aom_highbd_filter_block1d16_h8_avx2( } while (height > 0); } +static void aom_highbd_filter_block1d4_h4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + __m256i ff[2], s[2]; + uint32_t i; + const __m256i clip_pixel = + _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + const __m256i zero = _mm256_setzero_si256(); + + static const uint8_t shuffle_mask[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, + 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3, + 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 }; + + __m256i mask = _mm256_loadu_si256((__m256i *)shuffle_mask); + __m256i ip_mask_f2f3 = _mm256_loadu_si256((__m256i *)ip_shuffle_f2f3); + __m256i ip_mask_f4f5 = _mm256_loadu_si256((__m256i *)ip_shuffle_f4f5); + + pack_filters_4tap(filter, ff); + src_ptr -= 3; + for (i = 0; i <= (height - 2); i += 2) { + __m256i row0 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)&src_ptr[i * src_pitch + 2])); + __m256i row1 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)&src_ptr[(i + 1) * src_pitch + 2])); + + s[0] = _mm256_inserti128_si256(row0, _mm256_castsi256_si128(row1), 1); + s[1] = _mm256_alignr_epi8(s[0], s[0], 4); + + s[0] = _mm256_shuffle_epi8(s[0], mask); + s[1] = _mm256_shuffle_epi8(s[1], mask); + + __m256i res = convolve_4tap(s, ff); + res = + _mm256_srai_epi32(_mm256_add_epi32(res, rounding), CONV8_ROUNDING_BITS); + + res = _mm256_packs_epi32(res, res); + res = _mm256_min_epi16(res, clip_pixel); + res = _mm256_max_epi16(res, zero); + + _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch], + _mm256_castsi256_si128(res)); + _mm_storel_epi64((__m128i *)&dst_ptr[(i + 1) * dst_pitch], + _mm256_extracti128_si256(res, 1)); + } + if (height % 2 != 0) { + i = height - 1; + const __m256i row0_0 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)&src_ptr[i * src_pitch + 2])); + const __m256i row0_1 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)&src_ptr[i * src_pitch + 6])); + + const __m256i r0 = + _mm256_inserti128_si256(row0_0, _mm256_castsi256_si128(row0_1), 1); + + s[0] = _mm256_shuffle_epi8(r0, ip_mask_f2f3); + s[1] = _mm256_shuffle_epi8(r0, ip_mask_f4f5); + + __m256i res = convolve_4tap(s, ff); + res = + _mm256_srai_epi32(_mm256_add_epi32(res, rounding), CONV8_ROUNDING_BITS); + + res = _mm256_packs_epi32(res, res); + res = _mm256_min_epi16(res, clip_pixel); + res = _mm256_max_epi16(res, zero); + + _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch], + _mm256_castsi256_si128(res)); + } +} + +static void aom_highbd_filter_block1d8_h4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1)); + __m256i ff[2], s[2]; + uint32_t i = 0; + const __m256i clip_pixel = + _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + const __m256i zero = _mm256_setzero_si256(); + + static const uint8_t shuffle_mask[32] = { 0, 1, 8, 9, 2, 3, 10, 11, + 4, 5, 12, 13, 6, 7, 14, 15, + 0, 1, 8, 9, 2, 3, 10, 11, + 4, 5, 12, 13, 6, 7, 14, 15 }; + + __m256i mask = _mm256_loadu_si256((__m256i *)shuffle_mask); + __m256i ip_mask_f2f3 = _mm256_loadu_si256((__m256i *)ip_shuffle_f2f3); + __m256i ip_mask_f4f5 = _mm256_loadu_si256((__m256i *)ip_shuffle_f4f5); + + pack_filters_4tap(filter, ff); + src_ptr -= 3; + + /* Horizontal filter */ + + for (i = 0; i <= (height - 2); i += 2) { + const __m256i row0 = + _mm256_loadu_si256((__m256i *)&src_ptr[i * src_pitch + 2]); + __m256i row1 = + _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_pitch + 2]); + + const __m256i r0 = + _mm256_inserti128_si256(row0, _mm256_castsi256_si128(row1), 1); + const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31); + + // even pixels + s[0] = r0; + s[1] = _mm256_alignr_epi8(r1, r0, 4); + + __m256i res_even = convolve_4tap(s, ff); + res_even = _mm256_srai_epi32(_mm256_add_epi32(res_even, rounding), + CONV8_ROUNDING_BITS); + + // odd pixels + s[0] = _mm256_alignr_epi8(r1, r0, 2); + s[1] = _mm256_alignr_epi8(r1, r0, 6); + + __m256i res_odd = convolve_4tap(s, ff); + res_odd = _mm256_srai_epi32(_mm256_add_epi32(res_odd, rounding), + CONV8_ROUNDING_BITS); + + __m256i res = _mm256_packs_epi32(res_even, res_odd); + res = _mm256_shuffle_epi8(res, mask); + + res = _mm256_min_epi16(res, clip_pixel); + res = _mm256_max_epi16(res, zero); + + _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch], + _mm256_castsi256_si128(res)); + _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch + dst_pitch], + _mm256_extracti128_si256(res, 1)); + } + + if (height % 2 != 0) { + i = height - 1; + const __m256i row0_0 = + _mm256_loadu_si256((__m256i *)&src_ptr[i * src_pitch + 2]); + const __m256i row0_1 = + _mm256_loadu_si256((__m256i *)&src_ptr[i * src_pitch + 6]); + + const __m256i r0 = + _mm256_inserti128_si256(row0_0, _mm256_castsi256_si128(row0_1), 1); + + s[0] = _mm256_shuffle_epi8(r0, ip_mask_f2f3); + s[1] = _mm256_shuffle_epi8(r0, ip_mask_f4f5); + + __m256i res = convolve_4tap(s, ff); + res = + _mm256_srai_epi32(_mm256_add_epi32(res, rounding), CONV8_ROUNDING_BITS); + + res = _mm256_packs_epi32(res, res); + res = _mm256_min_epi16(res, clip_pixel); + res = _mm256_max_epi16(res, zero); + + _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch], + _mm256_castsi256_si128(res)); + _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch + 4], + _mm256_extracti128_si256(res, 1)); + } +} + +static void aom_highbd_filter_block1d16_h4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + aom_highbd_filter_block1d8_h4_avx2(src_ptr, src_pitch, dst_ptr, dst_pitch, + height, filter, bd); + aom_highbd_filter_block1d8_h4_avx2(src_ptr + 8, src_pitch, dst_ptr + 8, + dst_pitch, height, filter, bd); +} + // ----------------------------------------------------------------------------- // 2-tap horizontal filtering @@ -875,6 +1064,142 @@ static void aom_highbd_filter_block1d16_v8_avx2( } while (height > 0); } +static void aom_highbd_filter_block1d4_v4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + const int bits = FILTER_BITS; + + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1); + const __m256i clip_pixel = + 
_mm256_set1_epi32(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); + const __m256i zero = _mm256_setzero_si256(); + uint32_t i; + __m256i s[2], ff[2]; + + pack_filters_4tap(filter, ff); + + const uint16_t *data = src_ptr; + /* Vertical filter */ + { + __m128i s2 = _mm_loadl_epi64((__m128i *)(data + 2 * src_pitch)); + __m128i s3 = _mm_loadl_epi64((__m128i *)(data + 3 * src_pitch)); + + __m256i s23 = _mm256_inserti128_si256(_mm256_castsi128_si256(s2), s3, 1); + + __m128i s4 = _mm_loadl_epi64((__m128i *)(data + 4 * src_pitch)); + + __m256i s34 = _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s4, 1); + + s[0] = _mm256_unpacklo_epi16(s23, s34); + + for (i = 0; i < height; i += 2) { + data = &src_ptr[i * src_pitch]; + + __m128i s5 = _mm_loadl_epi64((__m128i *)(data + 5 * src_pitch)); + __m128i s6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_pitch)); + + __m256i s45 = _mm256_inserti128_si256(_mm256_castsi128_si256(s4), s5, 1); + __m256i s56 = _mm256_inserti128_si256(_mm256_castsi128_si256(s5), s6, 1); + + s[1] = _mm256_unpacklo_epi16(s45, s56); + + const __m256i res_a = convolve_4tap(s, ff); + + __m256i res_a_round = _mm256_sra_epi32( + _mm256_add_epi32(res_a, round_const_bits), round_shift_bits); + + __m256i res_16bit = _mm256_min_epi32(res_a_round, clip_pixel); + res_16bit = _mm256_max_epi32(res_16bit, zero); + res_16bit = _mm256_packs_epi32(res_16bit, res_16bit); + + _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch], + _mm256_castsi256_si128(res_16bit)); + _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch + dst_pitch], + _mm256_extracti128_si256(res_16bit, 1)); + + s[0] = s[1]; + s4 = s6; + } + } +} + +static void aom_highbd_filter_block1d8_v4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + const int bits = FILTER_BITS; + + const __m128i round_shift_bits = _mm_cvtsi32_si128(bits); + const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1); + const __m256i clip_pixel = + _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 
4095 : 255)); + const __m256i zero = _mm256_setzero_si256(); + __m256i s[4], ff[2]; + uint32_t i; + pack_filters_4tap(filter, ff); + + const uint16_t *data = src_ptr; + /* Vertical filter */ + { + __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_pitch)); + __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_pitch)); + + __m256i s23 = _mm256_inserti128_si256(_mm256_castsi128_si256(s2), s3, 1); + + __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_pitch)); + + __m256i s34 = _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s4, 1); + + s[0] = _mm256_unpacklo_epi16(s23, s34); + s[2] = _mm256_unpackhi_epi16(s23, s34); + + for (i = 0; i < height; i += 2) { + data = &src_ptr[i * src_pitch]; + + __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_pitch)); + __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_pitch)); + + __m256i s45 = _mm256_inserti128_si256(_mm256_castsi128_si256(s4), s5, 1); + __m256i s56 = _mm256_inserti128_si256(_mm256_castsi128_si256(s5), s6, 1); + + s[1] = _mm256_unpacklo_epi16(s45, s56); + s[3] = _mm256_unpackhi_epi16(s45, s56); + + const __m256i res_a = convolve_4tap(s, ff); + + __m256i res_a_round = _mm256_sra_epi32( + _mm256_add_epi32(res_a, round_const_bits), round_shift_bits); + + const __m256i res_b = convolve_4tap(s + 2, ff); + __m256i res_b_round = _mm256_sra_epi32( + _mm256_add_epi32(res_b, round_const_bits), round_shift_bits); + + __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); + res_16bit = _mm256_min_epi16(res_16bit, clip_pixel); + res_16bit = _mm256_max_epi16(res_16bit, zero); + + _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch], + _mm256_castsi256_si128(res_16bit)); + _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch + dst_pitch], + _mm256_extracti128_si256(res_16bit, 1)); + + s[0] = s[1]; + s[2] = s[3]; + s4 = s6; + } + } +} + +static void aom_highbd_filter_block1d16_v4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { + aom_highbd_filter_block1d8_v4_avx2(src_ptr, src_pitch, dst_ptr, dst_pitch, + height, filter, bd); + + aom_highbd_filter_block1d8_v4_avx2(src_ptr + 8, src_pitch, dst_ptr + 8, + dst_pitch, height, filter, bd); +} + // ----------------------------------------------------------------------------- // 2-tap vertical filtering diff --git a/media/libaom/src/aom_dsp/x86/highbd_convolve_sse2.c b/media/libaom/src/aom_dsp/x86/highbd_convolve_sse2.c new file mode 100644 index 0000000000..a2bb283222 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/highbd_convolve_sse2.c @@ -0,0 +1,351 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ +#include <emmintrin.h> + +#include "config/aom_dsp_rtcd.h" +#include "aom_dsp/x86/convolve.h" + +// ----------------------------------------------------------------------------- + +void aom_highbd_filter_block1d4_v4_sse2(const uint16_t *src_ptr, + ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, + const int16_t *filter, int bd) { + __m128i filtersReg; + __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; + __m128i srcReg23_lo, srcReg34_lo; + __m128i srcReg45_lo, srcReg56_lo; + __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo; + __m128i resReg23_45_lo, resReg34_56_lo; + __m128i resReg23_45, resReg34_56; + __m128i addFilterReg64, secondFilters, thirdFilters; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + const __m128i max = _mm_set1_epi16((1 << bd) - 1); + addFilterReg64 = _mm_set1_epi32(64); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg); + + secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3 + thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5 + + // multiply the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = dst_pitch << 1; + + srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); + srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); + srcReg23_lo = _mm_unpacklo_epi16(srcReg2, srcReg3); + + srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); + srcReg34_lo = _mm_unpacklo_epi16(srcReg3, srcReg4); + + for (i = height; i > 1; i -= 2) { + srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); + srcReg45_lo = _mm_unpacklo_epi16(srcReg4, srcReg5); + + srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); + srcReg56_lo = _mm_unpacklo_epi16(srcReg5, srcReg6); + + // multiply 2 adjacent elements with the filter and add the result + + resReg23_lo = _mm_madd_epi16(srcReg23_lo, secondFilters); + resReg34_lo = _mm_madd_epi16(srcReg34_lo, secondFilters); + resReg45_lo = _mm_madd_epi16(srcReg45_lo, thirdFilters); + resReg56_lo = _mm_madd_epi16(srcReg56_lo, thirdFilters); + + resReg23_45_lo = _mm_add_epi32(resReg23_lo, resReg45_lo); + resReg34_56_lo = _mm_add_epi32(resReg34_lo, resReg56_lo); + + // shift by 7 bit each 32 bit + resReg23_45_lo = _mm_add_epi32(resReg23_45_lo, addFilterReg64); + resReg34_56_lo = _mm_add_epi32(resReg34_56_lo, addFilterReg64); + resReg23_45_lo = _mm_srai_epi32(resReg23_45_lo, 7); + resReg34_56_lo = _mm_srai_epi32(resReg34_56_lo, 7); + + // shrink to 16 bit each 32 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + resReg23_45 = _mm_packs_epi32(resReg23_45_lo, _mm_setzero_si128()); + resReg34_56 = _mm_packs_epi32(resReg34_56_lo, _mm_setzero_si128()); + + resReg23_45 = _mm_max_epi16(resReg23_45, _mm_setzero_si128()); + resReg23_45 = _mm_min_epi16(resReg23_45, max); + resReg34_56 = _mm_max_epi16(resReg34_56, _mm_setzero_si128()); + resReg34_56 = _mm_min_epi16(resReg34_56, max); + + src_ptr += src_stride; + + _mm_storel_epi64((__m128i *)dst_ptr, (resReg23_45)); + _mm_storel_epi64((__m128i *)(dst_ptr + dst_pitch), (resReg34_56)); + + dst_ptr += dst_stride; + + // save part of the registers for next strides + srcReg23_lo = srcReg45_lo; + srcReg34_lo = srcReg56_lo; + srcReg4 = 
srcReg6; + } +} + +void aom_highbd_filter_block1d4_h4_sse2(const uint16_t *src_ptr, + ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, + const int16_t *filter, int bd) { + __m128i filtersReg; + __m128i addFilterReg64; + __m128i secondFilters, thirdFilters; + __m128i srcRegFilt32b1_1; + __m128i srcReg32b1; + unsigned int i; + src_ptr -= 3; + addFilterReg64 = _mm_set1_epi32(64); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + const __m128i max = _mm_set1_epi16((1 << bd) - 1); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg); + + secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 + thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5 + + for (i = height; i > 0; i -= 1) { + srcReg32b1 = _mm_loadu_si128((const __m128i *)(src_ptr + 2)); + + __m128i ss_3_1 = _mm_srli_si128(srcReg32b1, 2); + __m128i ss_4_1 = _mm_srli_si128(srcReg32b1, 4); + __m128i ss_5_1 = _mm_srli_si128(srcReg32b1, 6); + __m128i ss_23 = _mm_unpacklo_epi32(srcReg32b1, ss_3_1); + __m128i ss_45 = _mm_unpacklo_epi32(ss_4_1, ss_5_1); + + ss_23 = _mm_madd_epi16(ss_23, secondFilters); + ss_45 = _mm_madd_epi16(ss_45, thirdFilters); + srcRegFilt32b1_1 = _mm_add_epi32(ss_23, ss_45); + + // shift by 7 bit each 32 bit + srcRegFilt32b1_1 = _mm_add_epi32(srcRegFilt32b1_1, addFilterReg64); + srcRegFilt32b1_1 = _mm_srai_epi32(srcRegFilt32b1_1, 7); + + srcRegFilt32b1_1 = _mm_packs_epi32(srcRegFilt32b1_1, _mm_setzero_si128()); + srcRegFilt32b1_1 = _mm_max_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); + srcRegFilt32b1_1 = _mm_min_epi16(srcRegFilt32b1_1, max); + + src_ptr += src_pitch; + + _mm_storel_epi64((__m128i *)dst_ptr, srcRegFilt32b1_1); + + dst_ptr += dst_pitch; + } +} + +void aom_highbd_filter_block1d8_v4_sse2(const uint16_t *src_ptr, + ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, + const int16_t *filter, int bd) { + __m128i filtersReg; + __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; + __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi; + __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi; + __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo; + __m128i resReg23_hi, resReg34_hi, resReg45_hi, resReg56_hi; + __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi; + __m128i resReg23_45, resReg34_56; + __m128i addFilterReg64, secondFilters, thirdFilters; + unsigned int i; + ptrdiff_t src_stride, dst_stride; + + const __m128i max = _mm_set1_epi16((1 << bd) - 1); + addFilterReg64 = _mm_set1_epi32(64); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg); + + secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3 + thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5 + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + dst_stride = dst_pitch << 1; + + srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); + srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); + srcReg23_lo = _mm_unpacklo_epi16(srcReg2, srcReg3); + srcReg23_hi = _mm_unpackhi_epi16(srcReg2, srcReg3); + + srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + 
src_pitch * 4)); + srcReg34_lo = _mm_unpacklo_epi16(srcReg3, srcReg4); + srcReg34_hi = _mm_unpackhi_epi16(srcReg3, srcReg4); + + for (i = height; i > 1; i -= 2) { + srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); + + srcReg45_lo = _mm_unpacklo_epi16(srcReg4, srcReg5); + srcReg45_hi = _mm_unpackhi_epi16(srcReg4, srcReg5); + + srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); + + srcReg56_lo = _mm_unpacklo_epi16(srcReg5, srcReg6); + srcReg56_hi = _mm_unpackhi_epi16(srcReg5, srcReg6); + + // multiply 2 adjacent elements with the filter and add the result + + resReg23_lo = _mm_madd_epi16(srcReg23_lo, secondFilters); + resReg34_lo = _mm_madd_epi16(srcReg34_lo, secondFilters); + resReg45_lo = _mm_madd_epi16(srcReg45_lo, thirdFilters); + resReg56_lo = _mm_madd_epi16(srcReg56_lo, thirdFilters); + + resReg23_45_lo = _mm_add_epi32(resReg23_lo, resReg45_lo); + resReg34_56_lo = _mm_add_epi32(resReg34_lo, resReg56_lo); + + // multiply 2 adjacent elements with the filter and add the result + + resReg23_hi = _mm_madd_epi16(srcReg23_hi, secondFilters); + resReg34_hi = _mm_madd_epi16(srcReg34_hi, secondFilters); + resReg45_hi = _mm_madd_epi16(srcReg45_hi, thirdFilters); + resReg56_hi = _mm_madd_epi16(srcReg56_hi, thirdFilters); + + resReg23_45_hi = _mm_add_epi32(resReg23_hi, resReg45_hi); + resReg34_56_hi = _mm_add_epi32(resReg34_hi, resReg56_hi); + + // shift by 7 bit each 32 bit + resReg23_45_lo = _mm_add_epi32(resReg23_45_lo, addFilterReg64); + resReg34_56_lo = _mm_add_epi32(resReg34_56_lo, addFilterReg64); + resReg23_45_hi = _mm_add_epi32(resReg23_45_hi, addFilterReg64); + resReg34_56_hi = _mm_add_epi32(resReg34_56_hi, addFilterReg64); + resReg23_45_lo = _mm_srai_epi32(resReg23_45_lo, 7); + resReg34_56_lo = _mm_srai_epi32(resReg34_56_lo, 7); + resReg23_45_hi = _mm_srai_epi32(resReg23_45_hi, 7); + resReg34_56_hi = _mm_srai_epi32(resReg34_56_hi, 7); + + // shrink to 16 bit each 32 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve + // result + resReg23_45 = _mm_packs_epi32(resReg23_45_lo, resReg23_45_hi); + resReg34_56 = _mm_packs_epi32(resReg34_56_lo, resReg34_56_hi); + + resReg23_45 = _mm_max_epi16(resReg23_45, _mm_setzero_si128()); + resReg23_45 = _mm_min_epi16(resReg23_45, max); + resReg34_56 = _mm_max_epi16(resReg34_56, _mm_setzero_si128()); + resReg34_56 = _mm_min_epi16(resReg34_56, max); + + src_ptr += src_stride; + + _mm_store_si128((__m128i *)dst_ptr, (resReg23_45)); + _mm_store_si128((__m128i *)(dst_ptr + dst_pitch), (resReg34_56)); + + dst_ptr += dst_stride; + + // save part of the registers for next strides + srcReg23_lo = srcReg45_lo; + srcReg23_hi = srcReg45_hi; + srcReg34_lo = srcReg56_lo; + srcReg34_hi = srcReg56_hi; + srcReg4 = srcReg6; + } +} + +void aom_highbd_filter_block1d8_h4_sse2(const uint16_t *src_ptr, + ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, + const int16_t *filter, int bd) { + __m128i filtersReg; + __m128i addFilterReg64; + __m128i secondFilters, thirdFilters; + __m128i srcRegFilt32b1_1, srcRegFilt32b1_2; + __m128i srcReg32b1, srcReg32b2; + unsigned int i; + src_ptr -= 3; + addFilterReg64 = _mm_set1_epi32(64); + filtersReg = _mm_loadu_si128((const __m128i *)filter); + const __m128i max = _mm_set1_epi16((1 << bd) - 1); + + // coeffs 0 1 0 1 2 3 2 3 + const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg); + // coeffs 4 5 4 5 6 7 6 7 + const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg); + + secondFilters = 
_mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 + thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5 + + for (i = height; i > 0; i -= 1) { + srcReg32b1 = _mm_loadu_si128((const __m128i *)(src_ptr + 2)); + srcReg32b2 = _mm_loadu_si128((const __m128i *)(src_ptr + 6)); + + __m128i ss_4_1 = _mm_srli_si128(srcReg32b1, 4); + __m128i ss_4_2 = _mm_srli_si128(srcReg32b2, 4); + __m128i ss_4 = _mm_unpacklo_epi64(ss_4_1, ss_4_2); + + __m128i d1 = _mm_madd_epi16(srcReg32b1, secondFilters); + __m128i d2 = _mm_madd_epi16(ss_4, thirdFilters); + srcRegFilt32b1_1 = _mm_add_epi32(d1, d2); + + __m128i ss_3_1 = _mm_srli_si128(srcReg32b1, 2); + __m128i ss_5_1 = _mm_srli_si128(srcReg32b1, 6); + __m128i ss_3_2 = _mm_srli_si128(srcReg32b2, 2); + __m128i ss_5_2 = _mm_srli_si128(srcReg32b2, 6); + __m128i ss_3 = _mm_unpacklo_epi64(ss_3_1, ss_3_2); + __m128i ss_5 = _mm_unpacklo_epi64(ss_5_1, ss_5_2); + + d1 = _mm_madd_epi16(ss_3, secondFilters); + d2 = _mm_madd_epi16(ss_5, thirdFilters); + srcRegFilt32b1_2 = _mm_add_epi32(d1, d2); + + __m128i res_lo_1 = _mm_unpacklo_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2); + __m128i res_hi_1 = _mm_unpackhi_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2); + + // shift by 7 bit each 32 bit + res_lo_1 = _mm_add_epi32(res_lo_1, addFilterReg64); + res_hi_1 = _mm_add_epi32(res_hi_1, addFilterReg64); + res_lo_1 = _mm_srai_epi32(res_lo_1, 7); + res_hi_1 = _mm_srai_epi32(res_hi_1, 7); + + srcRegFilt32b1_1 = _mm_packs_epi32(res_lo_1, res_hi_1); + + srcRegFilt32b1_1 = _mm_max_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); + srcRegFilt32b1_1 = _mm_min_epi16(srcRegFilt32b1_1, max); + + src_ptr += src_pitch; + + _mm_store_si128((__m128i *)dst_ptr, srcRegFilt32b1_1); + + dst_ptr += dst_pitch; + } +} + +void aom_highbd_filter_block1d16_v4_sse2(const uint16_t *src_ptr, + ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, + const int16_t *filter, int bd) { + aom_highbd_filter_block1d8_v4_sse2(src_ptr, src_pitch, dst_ptr, dst_pitch, + height, filter, bd); + aom_highbd_filter_block1d8_v4_sse2((src_ptr + 8), src_pitch, (dst_ptr + 8), + dst_pitch, height, filter, bd); +} + +void aom_highbd_filter_block1d16_h4_sse2(const uint16_t *src_ptr, + ptrdiff_t src_pitch, uint16_t *dst_ptr, + ptrdiff_t dst_pitch, uint32_t height, + const int16_t *filter, int bd) { + aom_highbd_filter_block1d8_h4_sse2(src_ptr, src_pitch, dst_ptr, dst_pitch, + height, filter, bd); + aom_highbd_filter_block1d8_h4_sse2((src_ptr + 8), src_pitch, (dst_ptr + 8), + dst_pitch, height, filter, bd); +} diff --git a/media/libaom/src/aom_dsp/x86/highbd_convolve_ssse3.c b/media/libaom/src/aom_dsp/x86/highbd_convolve_ssse3.c index e7b33d1c46..a79350f5a6 100644 --- a/media/libaom/src/aom_dsp/x86/highbd_convolve_ssse3.c +++ b/media/libaom/src/aom_dsp/x86/highbd_convolve_ssse3.c @@ -20,14 +20,14 @@ void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, - const int subpel_x_q4, - const int subpel_y_q4, + const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { int i, j; const int fo_vert = filter_params_y->taps / 2 - 1; const uint16_t *const src_ptr = src - fo_vert * src_stride; (void)filter_params_x; - (void)subpel_x_q4; + (void)subpel_x_qn; (void)conv_params; assert(conv_params->round_0 <= FILTER_BITS); @@ -44,7 +44,7 @@ void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, _mm_set1_epi16(bd == 10 
? 1023 : (bd == 12 ? 4095 : 255)); const __m128i zero = _mm_setzero_si128(); - prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y); + prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y); for (j = 0; j < w; j += 8) { const uint16_t *data = &src_ptr[j]; @@ -168,13 +168,13 @@ void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, - const int subpel_x_q4, - const int subpel_y_q4, + const int subpel_x_qn, + const int subpel_y_qn, ConvolveParams *conv_params, int bd) { int i, j; const int fo_horiz = filter_params_x->taps / 2 - 1; const uint16_t *const src_ptr = src - fo_horiz; - (void)subpel_y_q4; + (void)subpel_y_qn; (void)filter_params_y; // Check that, even with 12-bit input, the intermediate values will fit @@ -195,7 +195,7 @@ void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride, _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); const __m128i zero = _mm_setzero_si128(); - prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x); + prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x); for (j = 0; j < w; j += 8) { /* Horizontal filter */ diff --git a/media/libaom/src/aom_dsp/x86/highbd_intrapred_sse2_asm.asm b/media/libaom/src/aom_dsp/x86/highbd_intrapred_asm_sse2.asm index 91b3d126ca..91b3d126ca 100644 --- a/media/libaom/src/aom_dsp/x86/highbd_intrapred_sse2_asm.asm +++ b/media/libaom/src/aom_dsp/x86/highbd_intrapred_asm_sse2.asm diff --git a/media/libaom/src/aom_dsp/x86/highbd_loopfilter_sse2.c b/media/libaom/src/aom_dsp/x86/highbd_loopfilter_sse2.c index 097e0778ff..ea7dc6a9e5 100644 --- a/media/libaom/src/aom_dsp/x86/highbd_loopfilter_sse2.c +++ b/media/libaom/src/aom_dsp/x86/highbd_loopfilter_sse2.c @@ -90,7 +90,7 @@ static INLINE void highbd_filter_mask_dual(const __m128i *p, const __m128i *q, const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi16(1); - const __m128i ffff = _mm_set1_epi16(0xFFFF); + const __m128i ffff = _mm_set1_epi16((short)0xFFFF); __m128i max = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), *bl); max = _mm_xor_si128(_mm_cmpeq_epi16(max, zero), ffff); @@ -112,7 +112,7 @@ static INLINE void highbd_hev_filter_mask_x_sse2(__m128i *pq, int x, __m128i *hev, __m128i *mask) { const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi16(1); - const __m128i ffff = _mm_set1_epi16(0xFFFF); + const __m128i ffff = _mm_set1_epi16((short)0xFFFF); __m128i abs_p0q0_p1q1, abs_p0q0, abs_p1q1, abs_q1q0; __m128i max, max01, h; @@ -497,8 +497,9 @@ static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2( } void aom_highbd_lpf_horizontal_14_sse2(uint16_t *s, int pitch, - const uint8_t *blt, const uint8_t *lt, - const uint8_t *thr, int bd) { + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { __m128i p[7], q[7], pq[7]; int i; @@ -507,7 +508,7 @@ void aom_highbd_lpf_horizontal_14_sse2(uint16_t *s, int pitch, q[i] = _mm_loadl_epi64((__m128i *)(s + i * pitch)); } - highbd_lpf_internal_14_sse2(p, q, pq, blt, lt, thr, bd); + highbd_lpf_internal_14_sse2(p, q, pq, blimit, limit, thresh, bd); for (i = 0; i < 6; i++) { _mm_storel_epi64((__m128i *)(s - (i + 1) * pitch), pq[i]); @@ -727,8 +728,8 @@ void aom_highbd_lpf_horizontal_14_dual_sse2( _limit1, _thresh1, bd); for (i = 0; i < 6; i++) { - _mm_store_si128((__m128i *)(s - (i + 1) * pitch), p[i]); - _mm_store_si128((__m128i *)(s + i * pitch), q[i]); + _mm_storeu_si128((__m128i *)(s - (i + 1) * 
pitch), p[i]); + _mm_storeu_si128((__m128i *)(s + i * pitch), q[i]); } } diff --git a/media/libaom/src/aom_dsp/x86/highbd_quantize_intrin_sse2.c b/media/libaom/src/aom_dsp/x86/highbd_quantize_intrin_sse2.c index 58e5f98e58..1764a4952a 100644 --- a/media/libaom/src/aom_dsp/x86/highbd_quantize_intrin_sse2.c +++ b/media/libaom/src/aom_dsp/x86/highbd_quantize_intrin_sse2.c @@ -134,7 +134,7 @@ void aom_highbd_quantize_b_32x32_sse2( for (i = 0; i < idx; i++) { const int rc = idx_arr[i]; const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); + const int coeff_sign = AOMSIGN(coeff); const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; @@ -146,3 +146,61 @@ void aom_highbd_quantize_b_32x32_sse2( } *eob_ptr = eob + 1; } + +void aom_highbd_quantize_b_64x64_sse2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + __m128i zbins[2]; + __m128i nzbins[2]; + int idx = 0; + int idx_arr[1024]; + int i, eob = -1; + const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 2); + const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 2); + (void)scan; + zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp); + zbins[1] = _mm_set1_epi32(zbin1_tmp); + + nzbins[0] = _mm_setzero_si128(); + nzbins[1] = _mm_setzero_si128(); + nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]); + nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]); + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + // Pre-scan pass + for (i = 0; i < n_coeffs / 4; i++) { + __m128i coeffs, cmp1, cmp2; + int test; + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); + cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); + cmp1 = _mm_and_si128(cmp1, cmp2); + test = _mm_movemask_epi8(cmp1); + if (!(test & 0xf)) idx_arr[idx++] = i * 4; + if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1; + if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2; + if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3; + } + + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. + for (i = 0; i < idx; i++) { + const int rc = idx_arr[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = AOMSIGN(coeff); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2); + const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; + const uint32_t abs_qcoeff = + (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 14); + qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 4; + if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? 
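/* Hedged scalar sketch of the branchless sign/abs idiom behind the AOMSIGN()
 * uses in the quantizers above, assuming AOMSIGN(x) yields -1 for negative x
 * and 0 otherwise (an all-ones mask). The same mask restores the sign after
 * the magnitude has been quantized. Function name is illustrative only.
 */
static int apply_sign_sketch(int coeff, int abs_quantized) {
  const int sign = (coeff < 0) ? -1 : 0;        /* stand-in for AOMSIGN()    */
  const int abs_coeff = (coeff ^ sign) - sign;  /* |coeff| without a branch  */
  (void)abs_coeff;                              /* quantization happens here */
  return (abs_quantized ^ sign) - sign;         /* signed qcoeff             */
}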
iscan[idx_arr[i]] : eob; + } + *eob_ptr = eob + 1; +} diff --git a/media/libaom/src/aom_dsp/x86/highbd_sad_sse2.asm b/media/libaom/src/aom_dsp/x86/highbd_sad_sse2.asm index 3398d8a2ae..09e64d510e 100644 --- a/media/libaom/src/aom_dsp/x86/highbd_sad_sse2.asm +++ b/media/libaom/src/aom_dsp/x86/highbd_sad_sse2.asm @@ -372,3 +372,71 @@ HIGH_SAD8XN 8, 1 ; highbd_sad8x8_avg_sse2 HIGH_SAD8XN 4, 1 ; highbd_sad8x4_avg_sse2 HIGH_SAD8XN 32 ; highbd_sad_8x32_sse2 HIGH_SAD8XN 32, 1 ; highbd_sad_8x32_avg_sse2 + +; unsigned int aom_highbd_sad4x{4,8,16}_sse2(uint8_t *src, int src_stride, +; uint8_t *ref, int ref_stride); +%macro HIGH_SAD4XN 1-2 0 + HIGH_SAD_FN 4, %1, 7, %2 + mov n_rowsd, %1/4 + pxor m0, m0 + pxor m6, m6 + +.loop: + movq m1, [refq] + movq m2, [refq+ref_strideq*2] + movq m3, [refq+ref_strideq*4] + movq m4, [refq+ref_stride3q*2] + punpcklwd m1, m3 + punpcklwd m2, m4 +%if %2 == 1 + movq m3, [second_predq+8*0] + movq m5, [second_predq+8*2] + punpcklwd m3, m5 + movq m4, [second_predq+8*1] + movq m5, [second_predq+8*3] + punpcklwd m4, m5 + lea second_predq, [second_predq+8*4] + pavgw m1, m3 + pavgw m2, m4 +%endif + movq m5, [srcq] + movq m3, [srcq+src_strideq*4] + punpcklwd m5, m3 + movdqa m3, m1 + psubusw m1, m5 + psubusw m5, m3 + por m1, m5 + movq m5, [srcq+src_strideq*2] + movq m4, [srcq+src_stride3q*2] + punpcklwd m5, m4 + movdqa m4, m2 + psubusw m2, m5 + psubusw m5, m4 + por m2, m5 + paddw m1, m2 + movdqa m2, m1 + punpcklwd m1, m6 + punpckhwd m2, m6 + lea refq, [refq+ref_strideq*8] + paddd m0, m1 + lea srcq, [srcq+src_strideq*8] + paddd m0, m2 + dec n_rowsd + jg .loop + + movhlps m1, m0 + paddd m0, m1 + punpckldq m0, m6 + movhlps m1, m0 + paddd m0, m1 + movd eax, m0 + RET +%endmacro + +INIT_XMM sse2 +HIGH_SAD4XN 16 ; highbd_sad4x16_sse2 +HIGH_SAD4XN 8 ; highbd_sad4x8_sse2 +HIGH_SAD4XN 4 ; highbd_sad4x4_sse2 +HIGH_SAD4XN 16, 1 ; highbd_sad4x16_avg_sse2 +HIGH_SAD4XN 8, 1 ; highbd_sad4x8_avg_sse2 +HIGH_SAD4XN 4, 1 ; highbd_sad4x4_avg_sse2 diff --git a/media/libaom/src/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/media/libaom/src/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm index 61f5b8e865..5c78933df5 100644 --- a/media/libaom/src/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm +++ b/media/libaom/src/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm @@ -114,45 +114,33 @@ SECTION .text cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ x_offset, y_offset, \ dst, dst_stride, \ - sec, sec_stride, height, sse, \ - g_bilin_filter, g_pw_8 + sec, sec_stride, height, sse %define block_height dword heightm %define sec_str sec_stridemp - - ; Store bilin_filter and pw_8 location in stack - %if GET_GOT_DEFINED == 1 - GET_GOT eax - add esp, 4 ; restore esp - %endif - - lea ecx, [GLOBAL(bilin_filter_m)] - mov g_bilin_filterm, ecx - - lea ecx, [GLOBAL(pw_8)] - mov g_pw_8m, ecx - - LOAD_IF_USED 0, 1 ; load eax, ecx back %else cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ x_offset, y_offset, \ - dst, dst_stride, height, sse, \ - g_bilin_filter, g_pw_8 + dst, dst_stride, height, sse %define block_height heightd + %endif - ; Store bilin_filter and pw_8 location in stack - %if GET_GOT_DEFINED == 1 - GET_GOT eax - add esp, 4 ; restore esp - %endif + ; reuse argument stack space + %define g_bilin_filterm x_offsetm + %define g_pw_8m y_offsetm - lea ecx, [GLOBAL(bilin_filter_m)] - mov g_bilin_filterm, ecx + ; Store bilin_filter and pw_8 location in stack + %if GET_GOT_DEFINED == 1 + GET_GOT eax + add esp, 4 ; restore esp + %endif - lea ecx, [GLOBAL(pw_8)] - mov 
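/* Illustrative scalar equivalent of the psubusw/por trick in the HIGH_SAD4XN
 * macro above: unsigned saturating subtraction in both directions leaves the
 * "wrong" direction at zero, so OR-ing the two results gives |a - b| for
 * 16-bit pixels without a dedicated absolute-difference instruction; the
 * word sums are then widened to 32 bits before accumulation.
 */
#include <stdint.h>
static uint16_t absdiff_u16_sketch(uint16_t a, uint16_t b) {
  const uint16_t d0 = (a > b) ? (uint16_t)(a - b) : 0; /* psubusw m1, m5 */
  const uint16_t d1 = (b > a) ? (uint16_t)(b - a) : 0; /* psubusw m5, m3 */
  return (uint16_t)(d0 | d1);                          /* por m1, m5     */
}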
g_pw_8m, ecx + lea ecx, [GLOBAL(bilin_filter_m)] + mov g_bilin_filterm, ecx - LOAD_IF_USED 0, 1 ; load eax, ecx back - %endif + lea ecx, [GLOBAL(pw_8)] + mov g_pw_8m, ecx + + LOAD_IF_USED 0, 1 ; load eax, ecx back %else %if %2 == 1 ; avg cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ diff --git a/media/libaom/src/aom_dsp/x86/highbd_subtract_sse2.c b/media/libaom/src/aom_dsp/x86/highbd_subtract_sse2.c index 18eb03d12c..b72d1cf8ba 100644 --- a/media/libaom/src/aom_dsp/x86/highbd_subtract_sse2.c +++ b/media/libaom/src/aom_dsp/x86/highbd_subtract_sse2.c @@ -29,15 +29,15 @@ static void subtract_4x4(int16_t *diff, ptrdiff_t diff_stride, __m128i x0, x1, x2, x3; int64_t *store_diff = (int64_t *)(diff + 0 * diff_stride); - u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); - u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); - u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride)); - u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride)); + u0 = _mm_loadl_epi64((__m128i const *)(src + 0 * src_stride)); + u1 = _mm_loadl_epi64((__m128i const *)(src + 1 * src_stride)); + u2 = _mm_loadl_epi64((__m128i const *)(src + 2 * src_stride)); + u3 = _mm_loadl_epi64((__m128i const *)(src + 3 * src_stride)); - v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride)); - v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride)); - v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride)); - v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride)); + v0 = _mm_loadl_epi64((__m128i const *)(pred + 0 * pred_stride)); + v1 = _mm_loadl_epi64((__m128i const *)(pred + 1 * pred_stride)); + v2 = _mm_loadl_epi64((__m128i const *)(pred + 2 * pred_stride)); + v3 = _mm_loadl_epi64((__m128i const *)(pred + 3 * pred_stride)); x0 = _mm_sub_epi16(u0, v0); x1 = _mm_sub_epi16(u1, v1); @@ -61,23 +61,23 @@ static void subtract_4x8(int16_t *diff, ptrdiff_t diff_stride, __m128i x0, x1, x2, x3, x4, x5, x6, x7; int64_t *store_diff = (int64_t *)(diff + 0 * diff_stride); - u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); - u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride)); - u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride)); - u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride)); - u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride)); - u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride)); - u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride)); - u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride)); + u0 = _mm_loadl_epi64((__m128i const *)(src + 0 * src_stride)); + u1 = _mm_loadl_epi64((__m128i const *)(src + 1 * src_stride)); + u2 = _mm_loadl_epi64((__m128i const *)(src + 2 * src_stride)); + u3 = _mm_loadl_epi64((__m128i const *)(src + 3 * src_stride)); + u4 = _mm_loadl_epi64((__m128i const *)(src + 4 * src_stride)); + u5 = _mm_loadl_epi64((__m128i const *)(src + 5 * src_stride)); + u6 = _mm_loadl_epi64((__m128i const *)(src + 6 * src_stride)); + u7 = _mm_loadl_epi64((__m128i const *)(src + 7 * src_stride)); - v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride)); - v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride)); - v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride)); - v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride)); - v4 = _mm_loadu_si128((__m128i const *)(pred + 4 * pred_stride)); - v5 = _mm_loadu_si128((__m128i const *)(pred + 5 * pred_stride)); - v6 = _mm_loadu_si128((__m128i const *)(pred + 6 * pred_stride)); - v7 
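/* Sketch of why _mm_loadl_epi64() is the natural width for the 4-wide
 * high bit-depth subtract paths above: one 4-pixel row of uint16_t is
 * exactly 8 bytes, so a 64-bit load reads one row and nothing more, while
 * the previous 16-byte _mm_loadu_si128() fetched 8 extra bytes that were
 * never used (and that may lie beyond the row). Illustrative only.
 */
#include <emmintrin.h>
static __m128i load_4x16bit_row_sketch(const uint16_t *row) {
  return _mm_loadl_epi64((const __m128i *)row); /* low 64 bits, 4 pixels */
}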
= _mm_loadu_si128((__m128i const *)(pred + 7 * pred_stride)); + v0 = _mm_loadl_epi64((__m128i const *)(pred + 0 * pred_stride)); + v1 = _mm_loadl_epi64((__m128i const *)(pred + 1 * pred_stride)); + v2 = _mm_loadl_epi64((__m128i const *)(pred + 2 * pred_stride)); + v3 = _mm_loadl_epi64((__m128i const *)(pred + 3 * pred_stride)); + v4 = _mm_loadl_epi64((__m128i const *)(pred + 4 * pred_stride)); + v5 = _mm_loadl_epi64((__m128i const *)(pred + 5 * pred_stride)); + v6 = _mm_loadl_epi64((__m128i const *)(pred + 6 * pred_stride)); + v7 = _mm_loadl_epi64((__m128i const *)(pred + 7 * pred_stride)); x0 = _mm_sub_epi16(u0, v0); x1 = _mm_sub_epi16(u1, v1); diff --git a/media/libaom/src/aom_dsp/x86/highbd_variance_sse2.c b/media/libaom/src/aom_dsp/x86/highbd_variance_sse2.c index 47b052abc9..b7d15f93ec 100644 --- a/media/libaom/src/aom_dsp/x86/highbd_variance_sse2.c +++ b/media/libaom/src/aom_dsp/x86/highbd_variance_sse2.c @@ -20,9 +20,10 @@ #include "aom_ports/mem.h" +#include "av1/common/av1_common_int.h" #include "av1/common/filter.h" -#include "av1/common/onyxc_int.h" #include "av1/common/reconinter.h" +#include "av1/encoder/reconinter_enc.h" typedef uint32_t (*high_variance_fn_t)(const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, @@ -192,7 +193,6 @@ VAR_FN(16, 16, 16, 8); VAR_FN(16, 8, 8, 7); VAR_FN(8, 16, 8, 7); VAR_FN(8, 8, 8, 6); -VAR_FN(16, 4, 16, 6); VAR_FN(8, 32, 8, 8); VAR_FN(32, 8, 8, 8); VAR_FN(16, 64, 16, 10); @@ -287,30 +287,38 @@ DECLS(sse2); uint32_t aom_highbd_8_sub_pixel_variance##w##x##h##_##opt( \ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ - uint32_t sse; \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ - int se = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ - src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \ - NULL); \ - if (w > wf) { \ - unsigned int sse2; \ + int se = 0; \ + unsigned int sse = 0; \ + unsigned int sse2; \ + int row_rep = (w > 64) ? 
2 : 1; \ + for (int wd_64 = 0; wd_64 < row_rep; wd_64++) { \ + src += wd_64 * 64; \ + dst += wd_64 * 64; \ int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \ - &sse2, NULL, NULL); \ + src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse2, \ + NULL, NULL); \ se += se2; \ sse += sse2; \ - if (w > wf * 2) { \ - se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \ - &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ + if (w > wf) { \ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \ + src + wf, src_stride, x_offset, y_offset, dst + wf, dst_stride, h, \ &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ + if (w > wf * 2) { \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 2 * wf, src_stride, x_offset, y_offset, dst + 2 * wf, \ + dst_stride, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 3 * wf, src_stride, x_offset, y_offset, dst + 3 * wf, \ + dst_stride, h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + } \ } \ } \ *sse_ptr = sse; \ @@ -322,33 +330,42 @@ DECLS(sse2); const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ int64_t var; \ uint32_t sse; \ + uint64_t long_sse = 0; \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ - int se = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ - src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \ - NULL); \ - if (w > wf) { \ - uint32_t sse2; \ + int se = 0; \ + int row_rep = (w > 64) ? 2 : 1; \ + for (int wd_64 = 0; wd_64 < row_rep; wd_64++) { \ + src += wd_64 * 64; \ + dst += wd_64 * 64; \ int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \ - &sse2, NULL, NULL); \ + src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \ + NULL); \ se += se2; \ - sse += sse2; \ - if (w > wf * 2) { \ - se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \ - &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ + long_sse += sse; \ + if (w > wf) { \ + uint32_t sse2; \ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \ + src + wf, src_stride, x_offset, y_offset, dst + wf, dst_stride, h, \ &sse2, NULL, NULL); \ se += se2; \ - sse += sse2; \ + long_sse += sse2; \ + if (w > wf * 2) { \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 2 * wf, src_stride, x_offset, y_offset, dst + 2 * wf, \ + dst_stride, h, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 3 * wf, src_stride, x_offset, y_offset, dst + 3 * wf, \ + dst_stride, h, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + } \ } \ } \ se = ROUND_POWER_OF_TWO(se, 2); \ - sse = ROUND_POWER_OF_TWO(sse, 4); \ + sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 4); \ *sse_ptr = sse; \ var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \ return (var >= 0) ? (uint32_t)var : 0; \ @@ -364,35 +381,38 @@ DECLS(sse2); uint64_t long_sse = 0; \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + int row_rep = (w > 64) ? 
2 : 1; \ for (start_row = 0; start_row < h; start_row += 16) { \ uint32_t sse2; \ int height = h - start_row < 16 ? h - start_row : 16; \ - int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + (start_row * src_stride), src_stride, x_offset, y_offset, \ - dst + (start_row * dst_stride), dst_stride, height, &sse2, NULL, \ - NULL); \ - se += se2; \ - long_sse += sse2; \ - if (w > wf) { \ - se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 16 + (start_row * src_stride), src_stride, x_offset, \ - y_offset, dst + 16 + (start_row * dst_stride), dst_stride, height, \ - &sse2, NULL, NULL); \ + uint16_t *src_tmp = src + (start_row * src_stride); \ + uint16_t *dst_tmp = dst + (start_row * dst_stride); \ + for (int wd_64 = 0; wd_64 < row_rep; wd_64++) { \ + src_tmp += wd_64 * 64; \ + dst_tmp += wd_64 * 64; \ + int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src_tmp, src_stride, x_offset, y_offset, dst_tmp, dst_stride, \ + height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ - if (w > wf * 2) { \ + if (w > wf) { \ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 32 + (start_row * src_stride), src_stride, x_offset, \ - y_offset, dst + 32 + (start_row * dst_stride), dst_stride, \ - height, &sse2, NULL, NULL); \ - se += se2; \ - long_sse += sse2; \ - se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 48 + (start_row * src_stride), src_stride, x_offset, \ - y_offset, dst + 48 + (start_row * dst_stride), dst_stride, \ - height, &sse2, NULL, NULL); \ + src_tmp + wf, src_stride, x_offset, y_offset, dst_tmp + wf, \ + dst_stride, height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ + if (w > wf * 2) { \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src_tmp + 2 * wf, src_stride, x_offset, y_offset, \ + dst_tmp + 2 * wf, dst_stride, height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src_tmp + 3 * wf, src_stride, x_offset, y_offset, \ + dst_tmp + 3 * wf, dst_stride, height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + } \ } \ } \ } \ @@ -403,22 +423,25 @@ DECLS(sse2); return (var >= 0) ? 
(uint32_t)var : 0; \ } -#define FNS(opt) \ - FN(64, 64, 16, 6, 6, opt, (int64_t)); \ - FN(64, 32, 16, 6, 5, opt, (int64_t)); \ - FN(32, 64, 16, 5, 6, opt, (int64_t)); \ - FN(32, 32, 16, 5, 5, opt, (int64_t)); \ - FN(32, 16, 16, 5, 4, opt, (int64_t)); \ - FN(16, 32, 16, 4, 5, opt, (int64_t)); \ - FN(16, 16, 16, 4, 4, opt, (int64_t)); \ - FN(16, 8, 16, 4, 3, opt, (int64_t)); \ - FN(8, 16, 8, 3, 4, opt, (int64_t)); \ - FN(8, 8, 8, 3, 3, opt, (int64_t)); \ - FN(8, 4, 8, 3, 2, opt, (int64_t)); \ - FN(16, 4, 16, 4, 2, opt, (int64_t)); \ - FN(8, 32, 8, 3, 5, opt, (int64_t)); \ - FN(32, 8, 16, 5, 3, opt, (int64_t)); \ - FN(16, 64, 16, 4, 6, opt, (int64_t)); \ +#define FNS(opt) \ + FN(128, 128, 16, 7, 7, opt, (int64_t)); \ + FN(128, 64, 16, 7, 6, opt, (int64_t)); \ + FN(64, 128, 16, 6, 7, opt, (int64_t)); \ + FN(64, 64, 16, 6, 6, opt, (int64_t)); \ + FN(64, 32, 16, 6, 5, opt, (int64_t)); \ + FN(32, 64, 16, 5, 6, opt, (int64_t)); \ + FN(32, 32, 16, 5, 5, opt, (int64_t)); \ + FN(32, 16, 16, 5, 4, opt, (int64_t)); \ + FN(16, 32, 16, 4, 5, opt, (int64_t)); \ + FN(16, 16, 16, 4, 4, opt, (int64_t)); \ + FN(16, 8, 16, 4, 3, opt, (int64_t)); \ + FN(8, 16, 8, 3, 4, opt, (int64_t)); \ + FN(8, 8, 8, 3, 3, opt, (int64_t)); \ + FN(8, 4, 8, 3, 2, opt, (int64_t)); \ + FN(16, 4, 16, 4, 2, opt, (int64_t)); \ + FN(8, 32, 8, 3, 5, opt, (int64_t)); \ + FN(32, 8, 16, 5, 3, opt, (int64_t)); \ + FN(16, 64, 16, 4, 6, opt, (int64_t)); \ FN(64, 16, 16, 6, 4, opt, (int64_t)) FNS(sse2); @@ -456,19 +479,19 @@ DECLS(sse2); if (w > wf) { \ uint32_t sse2; \ int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, \ - sec + 16, w, h, &sse2, NULL, NULL); \ + src + wf, src_stride, x_offset, y_offset, dst + wf, dst_stride, \ + sec + wf, w, h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ if (w > wf * 2) { \ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \ - sec + 32, w, h, &sse2, NULL, NULL); \ + src + 2 * wf, src_stride, x_offset, y_offset, dst + 2 * wf, \ + dst_stride, sec + 2 * wf, w, h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \ - sec + 48, w, h, &sse2, NULL, NULL); \ + src + 3 * wf, src_stride, x_offset, y_offset, dst + 3 * wf, \ + dst_stride, sec + 3 * wf, w, h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ } \ @@ -492,19 +515,19 @@ DECLS(sse2); if (w > wf) { \ uint32_t sse2; \ int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, \ - sec + 16, w, h, &sse2, NULL, NULL); \ + src + wf, src_stride, x_offset, y_offset, dst + wf, dst_stride, \ + sec + wf, w, h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ if (w > wf * 2) { \ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \ - sec + 32, w, h, &sse2, NULL, NULL); \ + src + 2 * wf, src_stride, x_offset, y_offset, dst + 2 * wf, \ + dst_stride, sec + 2 * wf, w, h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \ - sec + 48, w, h, &sse2, NULL, NULL); \ + src + 3 * wf, src_stride, x_offset, y_offset, dst + 3 * wf, \ + dst_stride, sec + 3 * wf, w, h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ } \ @@ -539,22 +562,22 @@ 
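/* Hedged sketch of the accumulation scheme in the rewritten sub-pixel
 * variance wrappers above: the block is covered by wf-wide column strips
 * (plus a second 64-column pass when w > 64, via row_rep), the per-strip
 * sums of squared error are gathered in a 64-bit long_sse so 128-wide
 * high bit-depth blocks cannot overflow 32 bits, and only after the final
 * rounding shift is the result narrowed and turned into a variance.
 */
#include <stdint.h>
static uint32_t variance_from_sums_sketch(uint32_t sse, int64_t se, int wlog2,
                                          int hlog2) {
  const int64_t var = (int64_t)sse - ((se * se) >> (wlog2 + hlog2));
  return (var >= 0) ? (uint32_t)var : 0; /* clamp negative rounding error */
}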
DECLS(sse2); long_sse += sse2; \ if (w > wf) { \ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 16 + (start_row * src_stride), src_stride, x_offset, \ - y_offset, dst + 16 + (start_row * dst_stride), dst_stride, \ - sec + 16 + (start_row * w), w, height, &sse2, NULL, NULL); \ + src + wf + (start_row * src_stride), src_stride, x_offset, \ + y_offset, dst + wf + (start_row * dst_stride), dst_stride, \ + sec + wf + (start_row * w), w, height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ if (w > wf * 2) { \ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 32 + (start_row * src_stride), src_stride, x_offset, \ - y_offset, dst + 32 + (start_row * dst_stride), dst_stride, \ - sec + 32 + (start_row * w), w, height, &sse2, NULL, NULL); \ + src + 2 * wf + (start_row * src_stride), src_stride, x_offset, \ + y_offset, dst + 2 * wf + (start_row * dst_stride), dst_stride, \ + sec + 2 * wf + (start_row * w), w, height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 48 + (start_row * src_stride), src_stride, x_offset, \ - y_offset, dst + 48 + (start_row * dst_stride), dst_stride, \ - sec + 48 + (start_row * w), w, height, &sse2, NULL, NULL); \ + src + 3 * wf + (start_row * src_stride), src_stride, x_offset, \ + y_offset, dst + 3 * wf + (start_row * dst_stride), dst_stride, \ + sec + 3 * wf + (start_row * w), w, height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ } \ @@ -603,85 +626,34 @@ void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd, const int ref_num = 0; const int is_intrabc = is_intrabc_block(mi); const struct scale_factors *const sf = - is_intrabc ? &cm->sf_identity : &xd->block_refs[ref_num]->sf; + is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num]; const int is_scaled = av1_is_scaled(sf); if (is_scaled) { - // Note: This is mostly a copy from the >=8X8 case in - // build_inter_predictors() function, with some small tweaks. - // Some assumptions. - const int plane = 0; - - // Get pre-requisites. + int plane = 0; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; const struct macroblockd_plane *const pd = &xd->plane[plane]; - const int ssx = pd->subsampling_x; - const int ssy = pd->subsampling_y; - assert(ssx == 0 && ssy == 0); const struct buf_2d *const dst_buf = &pd->dst; const struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref_num]; - const int mi_x = mi_col * MI_SIZE; - const int mi_y = mi_row * MI_SIZE; - // Calculate subpel_x/y and x/y_step. - const int row_start = 0; // Because ss_y is 0. - const int col_start = 0; // Because ss_x is 0. 
- const int pre_x = (mi_x + MI_SIZE * col_start) >> ssx; - const int pre_y = (mi_y + MI_SIZE * row_start) >> ssy; - int orig_pos_y = pre_y << SUBPEL_BITS; - orig_pos_y += mv->row * (1 << (1 - ssy)); - int orig_pos_x = pre_x << SUBPEL_BITS; - orig_pos_x += mv->col * (1 << (1 - ssx)); - int pos_y = sf->scale_value_y(orig_pos_y, sf); - int pos_x = sf->scale_value_x(orig_pos_x, sf); - pos_x += SCALE_EXTRA_OFF; - pos_y += SCALE_EXTRA_OFF; - - const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy); - const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx); - const int bottom = (pre_buf->height + AOM_INTERP_EXTEND) - << SCALE_SUBPEL_BITS; - const int right = (pre_buf->width + AOM_INTERP_EXTEND) - << SCALE_SUBPEL_BITS; - pos_y = clamp(pos_y, top, bottom); - pos_x = clamp(pos_x, left, right); - - const uint8_t *const pre = - pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride + - (pos_x >> SCALE_SUBPEL_BITS); - const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4, - pos_x & SCALE_SUBPEL_MASK, - pos_y & SCALE_SUBPEL_MASK }; - - // Get warp types. - const WarpedMotionParams *const wm = - &xd->global_motion[mi->ref_frame[ref_num]]; - const int is_global = is_global_mv_block(mi, wm->wmtype); - WarpTypesAllowed warp_types; - warp_types.global_warp_allowed = is_global; - warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL; - - // Get convolve parameters. - ConvolveParams conv_params = get_conv_params(0, plane, xd->bd); - const InterpFilters filters = + InterPredParams inter_pred_params; + inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd); + const int_interpfilters filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR); - - // Get the inter predictor. - const int build_for_obmc = 0; - av1_make_inter_predictor(pre, pre_buf->stride, comp_pred8, width, - &subpel_params, sf, width, height, &conv_params, - filters, &warp_types, mi_x >> pd->subsampling_x, - mi_y >> pd->subsampling_y, plane, ref_num, mi, - build_for_obmc, xd, cm->allow_warped_motion); + av1_init_inter_params( + &inter_pred_params, width, height, mi_y >> pd->subsampling_y, + mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y, + xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters); + av1_enc_build_one_inter_predictor(comp_pred8, width, mv, + &inter_pred_params); return; } } - const InterpFilterParams *filter = - (subpel_search == 1) - ? av1_get_4tap_interp_filter_params(EIGHTTAP_REGULAR) - : av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8); - + const InterpFilterParams *filter = av1_get_filter(subpel_search); + int filter_taps = (subpel_search <= USE_4_TAPS) ? 4 : SUBPEL_TAPS; if (!subpel_x_q3 && !subpel_y_q3) { uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8); @@ -729,17 +701,20 @@ void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd, av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1); const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); + const uint8_t *ref_start = ref8 - ref_stride * ((filter_taps >> 1) - 1); + uint16_t *temp_start_horiz = (subpel_search <= USE_4_TAPS) + ? 
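/* Hedged sketch of the intermediate-buffer sizing used by the two-pass
 * (horizontal, then vertical) convolve in this function: producing `height`
 * output rows with a T-tap vertical filter needs height + T - 1 filtered
 * rows from the horizontal pass, plus the sub-pixel carry, which is what
 * (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps evaluates to.
 * Example: height = 16, subpel_y_q3 = 4 and 8 taps gives
 * ((15 * 8 + 4) >> 3) + 8 = 23 intermediate rows.
 */
static int intermediate_height_sketch(int height, int subpel_y_q3, int taps) {
  return (((height - 1) * 8 + subpel_y_q3) >> 3) + taps;
}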
temp + (filter_taps >> 1) * MAX_SB_SIZE + : temp; + uint16_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1); const int intermediate_height = - (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps; + (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps; assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16); - aom_highbd_convolve8_horiz(ref8 - ref_stride * ((filter->taps >> 1) - 1), - ref_stride, CONVERT_TO_BYTEPTR(temp), - MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, - intermediate_height, bd); - aom_highbd_convolve8_vert( - CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)), - MAX_SB_SIZE, comp_pred8, width, NULL, -1, kernel_y, 16, width, height, - bd); + aom_highbd_convolve8_horiz( + ref_start, ref_stride, CONVERT_TO_BYTEPTR(temp_start_horiz), + MAX_SB_SIZE, kernel_x, 16, NULL, -1, width, intermediate_height, bd); + aom_highbd_convolve8_vert(CONVERT_TO_BYTEPTR(temp_start_vert), MAX_SB_SIZE, + comp_pred8, width, NULL, -1, kernel_y, 16, width, + height, bd); } } @@ -765,11 +740,11 @@ void aom_highbd_comp_avg_upsampled_pred_sse2( } } -static INLINE void highbd_compute_jnt_comp_avg(__m128i *p0, __m128i *p1, - const __m128i *w0, - const __m128i *w1, - const __m128i *r, - void *const result) { +static INLINE void highbd_compute_dist_wtd_comp_avg(__m128i *p0, __m128i *p1, + const __m128i *w0, + const __m128i *w1, + const __m128i *r, + void *const result) { assert(DIST_PRECISION_BITS <= 4); __m128i mult0 = _mm_mullo_epi16(*p0, *w0); __m128i mult1 = _mm_mullo_epi16(*p1, *w1); @@ -780,11 +755,10 @@ static INLINE void highbd_compute_jnt_comp_avg(__m128i *p0, __m128i *p1, xx_storeu_128(result, shift); } -void aom_highbd_jnt_comp_avg_pred_sse2(uint8_t *comp_pred8, - const uint8_t *pred8, int width, - int height, const uint8_t *ref8, - int ref_stride, - const JNT_COMP_PARAMS *jcp_param) { +void aom_highbd_dist_wtd_comp_avg_pred_sse2( + uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, + const uint8_t *ref8, int ref_stride, + const DIST_WTD_COMP_PARAMS *jcp_param) { int i; const uint16_t wt0 = (uint16_t)jcp_param->fwd_offset; const uint16_t wt1 = (uint16_t)jcp_param->bck_offset; @@ -806,7 +780,7 @@ void aom_highbd_jnt_comp_avg_pred_sse2(uint8_t *comp_pred8, __m128i p0 = xx_loadu_128(ref); __m128i p1 = xx_loadu_128(pred); - highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred); + highbd_compute_dist_wtd_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred); comp_pred += 8; pred += 8; @@ -823,7 +797,7 @@ void aom_highbd_jnt_comp_avg_pred_sse2(uint8_t *comp_pred8, __m128i p0 = _mm_unpacklo_epi64(p0_0, p0_1); __m128i p1 = xx_loadu_128(pred); - highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred); + highbd_compute_dist_wtd_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred); comp_pred += 8; pred += 8; @@ -832,11 +806,11 @@ void aom_highbd_jnt_comp_avg_pred_sse2(uint8_t *comp_pred8, } } -void aom_highbd_jnt_comp_avg_upsampled_pred_sse2( +void aom_highbd_dist_wtd_comp_avg_upsampled_pred_sse2( MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, - int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param, + int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search) { uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); int n; @@ -860,7 +834,7 @@ void aom_highbd_jnt_comp_avg_upsampled_pred_sse2( __m128i p0 = xx_loadu_128(comp_pred16); __m128i p1 = xx_loadu_128(pred); - 
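/* Hedged scalar sketch of the distance-weighted compound average that
 * highbd_compute_dist_wtd_comp_avg() vectorizes: the two predictions are
 * blended as (p0*w0 + p1*w1 + r) >> DIST_PRECISION_BITS. The code above only
 * asserts DIST_PRECISION_BITS <= 4; the value 4 and the rounding constant
 * r = 1 << (DIST_PRECISION_BITS - 1) are assumptions of this sketch.
 */
#include <stdint.h>
static uint16_t dist_wtd_avg_sketch(uint16_t p0, uint16_t p1, uint16_t w0,
                                    uint16_t w1) {
  enum { kDistPrecisionBitsAssumed = 4 };            /* assumed value     */
  const int r = 1 << (kDistPrecisionBitsAssumed - 1); /* assumed rounding */
  return (uint16_t)((p0 * w0 + p1 * w1 + r) >> kDistPrecisionBitsAssumed);
}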
highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred16); + highbd_compute_dist_wtd_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred16); comp_pred16 += 8; pred += 8; diff --git a/media/libaom/src/aom_dsp/x86/intrapred_sse2_asm.asm b/media/libaom/src/aom_dsp/x86/intrapred_asm_sse2.asm index 9aece27beb..0eb632326b 100644 --- a/media/libaom/src/aom_dsp/x86/intrapred_sse2_asm.asm +++ b/media/libaom/src/aom_dsp/x86/intrapred_asm_sse2.asm @@ -27,23 +27,6 @@ pw2_32: times 8 dw 16 SECTION .text -; ------------------------------------------ -; input: x, y, z, result -; -; trick from pascal -; (x+2y+z+2)>>2 can be calculated as: -; result = avg(x,z) -; result -= xor(x,z) & 1 -; result = avg(result,y) -; ------------------------------------------ -%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4 - pavgb %4, %1, %3 - pxor %3, %1 - pand %3, [GLOBAL(pb_1)] - psubb %4, %3 - pavgb %4, %2 -%endmacro - INIT_XMM sse2 cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset GET_GOT goffsetq diff --git a/media/libaom/src/aom_dsp/x86/intrapred_avx2.c b/media/libaom/src/aom_dsp/x86/intrapred_avx2.c index 1e67d392e8..546ee74bb3 100644 --- a/media/libaom/src/aom_dsp/x86/intrapred_avx2.c +++ b/media/libaom/src/aom_dsp/x86/intrapred_avx2.c @@ -12,6 +12,8 @@ #include <immintrin.h> #include "config/aom_dsp_rtcd.h" +#include "aom_dsp/x86/intrapred_x86.h" +#include "aom_dsp/x86/lpf_common_sse2.h" static INLINE __m256i dc_sum_64(const uint8_t *ref) { const __m256i x0 = _mm256_loadu_si256((const __m256i *)ref); @@ -63,6 +65,255 @@ static INLINE void row_store_64xh(const __m256i *r, int height, uint8_t *dst, } } +static DECLARE_ALIGNED(16, uint8_t, HighbdLoadMaskx[8][16]) = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 }, + { 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 }, +}; + +static DECLARE_ALIGNED(16, uint8_t, HighbdEvenOddMaskx4[4][16]) = { + { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 }, + { 0, 1, 2, 3, 6, 7, 10, 11, 14, 15, 4, 5, 8, 9, 12, 13 }, + { 0, 1, 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 6, 7, 10, 11 }, + { 0, 1, 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 0, 1, 8, 9 } +}; + +static DECLARE_ALIGNED(16, uint8_t, HighbdEvenOddMaskx[8][32]) = { + { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29, + 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 }, + { 0, 1, 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, + 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 }, + { 0, 1, 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, + 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27 }, + { 0, 1, 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, + 0, 1, 0, 1, 0, 1, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 12, 13, 16, 17, 20, 21, + 0, 1, 0, 1, 0, 1, 0, 1, 10, 11, 14, 15, 18, 19, 22, 23 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 10, 11, 14, 15, 18, 19, + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 12, 13, 16, 17, 20, 21 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 12, 13, 16, 17, + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 14, 15, 18, 19 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 14, 15, + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 16, 17 } +}; + +static DECLARE_ALIGNED(32, uint16_t, HighbdBaseMask[17][16]) = { + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0 }, + { 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, + 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, + 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, + 0, 0, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0, 0, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0xffff, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff } +}; + +static INLINE void highbd_transpose16x4_8x8_sse2(__m128i *x, __m128i *d) { + __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; + + r0 = _mm_unpacklo_epi16(x[0], x[1]); + r1 = _mm_unpacklo_epi16(x[2], x[3]); + r2 = _mm_unpacklo_epi16(x[4], x[5]); + r3 = _mm_unpacklo_epi16(x[6], x[7]); + + r4 = _mm_unpacklo_epi16(x[8], x[9]); + r5 = _mm_unpacklo_epi16(x[10], x[11]); + r6 = _mm_unpacklo_epi16(x[12], x[13]); + r7 = _mm_unpacklo_epi16(x[14], x[15]); + + r8 = _mm_unpacklo_epi32(r0, r1); + r9 = _mm_unpackhi_epi32(r0, r1); + r10 = _mm_unpacklo_epi32(r2, r3); + r11 = _mm_unpackhi_epi32(r2, r3); + + r12 = _mm_unpacklo_epi32(r4, r5); + r13 = _mm_unpackhi_epi32(r4, r5); + r14 = _mm_unpacklo_epi32(r6, r7); + r15 = _mm_unpackhi_epi32(r6, r7); + + r0 = _mm_unpacklo_epi64(r8, r9); + r1 = _mm_unpackhi_epi64(r8, r9); + r2 = _mm_unpacklo_epi64(r10, r11); + r3 = _mm_unpackhi_epi64(r10, r11); + + r4 = _mm_unpacklo_epi64(r12, r13); + r5 = _mm_unpackhi_epi64(r12, r13); + r6 = _mm_unpacklo_epi64(r14, r15); + r7 = _mm_unpackhi_epi64(r14, r15); + + d[0] = _mm_unpacklo_epi64(r0, r2); + d[1] = _mm_unpacklo_epi64(r4, r6); + d[2] = _mm_unpacklo_epi64(r1, r3); + d[3] = _mm_unpacklo_epi64(r5, r7); + + d[4] = _mm_unpackhi_epi64(r0, r2); + d[5] = _mm_unpackhi_epi64(r4, r6); + d[6] = _mm_unpackhi_epi64(r1, r3); + d[7] = _mm_unpackhi_epi64(r5, r7); +} + +static INLINE void highbd_transpose4x16_avx2(__m256i *x, __m256i *d) { + __m256i w0, w1, w2, w3, ww0, ww1; + + w0 = _mm256_unpacklo_epi16(x[0], x[1]); // 00 10 01 11 02 12 03 13 + w1 = _mm256_unpacklo_epi16(x[2], x[3]); // 20 30 21 31 22 32 23 33 + w2 = _mm256_unpackhi_epi16(x[0], x[1]); // 40 50 41 51 42 52 43 53 + w3 = _mm256_unpackhi_epi16(x[2], x[3]); // 60 70 61 71 62 72 63 73 + + ww0 = _mm256_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31 + ww1 = _mm256_unpacklo_epi32(w2, w3); // 40 50 60 70 41 51 61 71 + + d[0] = _mm256_unpacklo_epi64(ww0, ww1); // 00 10 20 30 40 50 60 70 + 
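/* Illustrative reduction of the unpack-based transpose idiom used by the
 * highbd_transpose*() helpers above: interleaving two rows with
 * _mm_unpacklo_epi16 produces column pairs, and repeating the idiom at 32-
 * and 64-bit granularity builds up the full 8x8 / 16x16 transposes. The
 * helper below only shows the first interleave stage on two 4-pixel rows.
 */
#include <emmintrin.h>
#include <stdint.h>
static void interleave_rows_sketch(const uint16_t r0[4], const uint16_t r1[4],
                                   uint16_t out[8]) {
  const __m128i a = _mm_loadl_epi64((const __m128i *)r0); /* 00 01 02 03 */
  const __m128i b = _mm_loadl_epi64((const __m128i *)r1); /* 10 11 12 13 */
  /* out holds 00 10 01 11 02 12 03 13: the transposed columns, pairwise. */
  _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi16(a, b));
}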
d[1] = _mm256_unpackhi_epi64(ww0, ww1); // 01 11 21 31 41 51 61 71 + + ww0 = _mm256_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33 + ww1 = _mm256_unpackhi_epi32(w2, w3); // 42 52 62 72 43 53 63 73 + + d[2] = _mm256_unpacklo_epi64(ww0, ww1); // 02 12 22 32 42 52 62 72 + d[3] = _mm256_unpackhi_epi64(ww0, ww1); // 03 13 23 33 43 53 63 73 +} + +static INLINE void highbd_transpose8x16_16x8_avx2(__m256i *x, __m256i *d) { + __m256i w0, w1, w2, w3, ww0, ww1; + + w0 = _mm256_unpacklo_epi16(x[0], x[1]); // 00 10 01 11 02 12 03 13 + w1 = _mm256_unpacklo_epi16(x[2], x[3]); // 20 30 21 31 22 32 23 33 + w2 = _mm256_unpacklo_epi16(x[4], x[5]); // 40 50 41 51 42 52 43 53 + w3 = _mm256_unpacklo_epi16(x[6], x[7]); // 60 70 61 71 62 72 63 73 + + ww0 = _mm256_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31 + ww1 = _mm256_unpacklo_epi32(w2, w3); // 40 50 60 70 41 51 61 71 + + d[0] = _mm256_unpacklo_epi64(ww0, ww1); // 00 10 20 30 40 50 60 70 + d[1] = _mm256_unpackhi_epi64(ww0, ww1); // 01 11 21 31 41 51 61 71 + + ww0 = _mm256_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33 + ww1 = _mm256_unpackhi_epi32(w2, w3); // 42 52 62 72 43 53 63 73 + + d[2] = _mm256_unpacklo_epi64(ww0, ww1); // 02 12 22 32 42 52 62 72 + d[3] = _mm256_unpackhi_epi64(ww0, ww1); // 03 13 23 33 43 53 63 73 + + w0 = _mm256_unpackhi_epi16(x[0], x[1]); // 04 14 05 15 06 16 07 17 + w1 = _mm256_unpackhi_epi16(x[2], x[3]); // 24 34 25 35 26 36 27 37 + w2 = _mm256_unpackhi_epi16(x[4], x[5]); // 44 54 45 55 46 56 47 57 + w3 = _mm256_unpackhi_epi16(x[6], x[7]); // 64 74 65 75 66 76 67 77 + + ww0 = _mm256_unpacklo_epi32(w0, w1); // 04 14 24 34 05 15 25 35 + ww1 = _mm256_unpacklo_epi32(w2, w3); // 44 54 64 74 45 55 65 75 + + d[4] = _mm256_unpacklo_epi64(ww0, ww1); // 04 14 24 34 44 54 64 74 + d[5] = _mm256_unpackhi_epi64(ww0, ww1); // 05 15 25 35 45 55 65 75 + + ww0 = _mm256_unpackhi_epi32(w0, w1); // 06 16 26 36 07 17 27 37 + ww1 = _mm256_unpackhi_epi32(w2, w3); // 46 56 66 76 47 57 67 77 + + d[6] = _mm256_unpacklo_epi64(ww0, ww1); // 06 16 26 36 46 56 66 76 + d[7] = _mm256_unpackhi_epi64(ww0, ww1); // 07 17 27 37 47 57 67 77 +} + +static INLINE void highbd_transpose16x16_avx2(__m256i *x, __m256i *d) { + __m256i w0, w1, w2, w3, ww0, ww1; + __m256i dd[16]; + w0 = _mm256_unpacklo_epi16(x[0], x[1]); + w1 = _mm256_unpacklo_epi16(x[2], x[3]); + w2 = _mm256_unpacklo_epi16(x[4], x[5]); + w3 = _mm256_unpacklo_epi16(x[6], x[7]); + + ww0 = _mm256_unpacklo_epi32(w0, w1); // + ww1 = _mm256_unpacklo_epi32(w2, w3); // + + dd[0] = _mm256_unpacklo_epi64(ww0, ww1); + dd[1] = _mm256_unpackhi_epi64(ww0, ww1); + + ww0 = _mm256_unpackhi_epi32(w0, w1); // + ww1 = _mm256_unpackhi_epi32(w2, w3); // + + dd[2] = _mm256_unpacklo_epi64(ww0, ww1); + dd[3] = _mm256_unpackhi_epi64(ww0, ww1); + + w0 = _mm256_unpackhi_epi16(x[0], x[1]); + w1 = _mm256_unpackhi_epi16(x[2], x[3]); + w2 = _mm256_unpackhi_epi16(x[4], x[5]); + w3 = _mm256_unpackhi_epi16(x[6], x[7]); + + ww0 = _mm256_unpacklo_epi32(w0, w1); // + ww1 = _mm256_unpacklo_epi32(w2, w3); // + + dd[4] = _mm256_unpacklo_epi64(ww0, ww1); + dd[5] = _mm256_unpackhi_epi64(ww0, ww1); + + ww0 = _mm256_unpackhi_epi32(w0, w1); // + ww1 = _mm256_unpackhi_epi32(w2, w3); // + + dd[6] = _mm256_unpacklo_epi64(ww0, ww1); + dd[7] = _mm256_unpackhi_epi64(ww0, ww1); + + w0 = _mm256_unpacklo_epi16(x[8], x[9]); + w1 = _mm256_unpacklo_epi16(x[10], x[11]); + w2 = _mm256_unpacklo_epi16(x[12], x[13]); + w3 = _mm256_unpacklo_epi16(x[14], x[15]); + + ww0 = _mm256_unpacklo_epi32(w0, w1); + ww1 = _mm256_unpacklo_epi32(w2, w3); + + dd[8] = 
_mm256_unpacklo_epi64(ww0, ww1); + dd[9] = _mm256_unpackhi_epi64(ww0, ww1); + + ww0 = _mm256_unpackhi_epi32(w0, w1); + ww1 = _mm256_unpackhi_epi32(w2, w3); + + dd[10] = _mm256_unpacklo_epi64(ww0, ww1); + dd[11] = _mm256_unpackhi_epi64(ww0, ww1); + + w0 = _mm256_unpackhi_epi16(x[8], x[9]); + w1 = _mm256_unpackhi_epi16(x[10], x[11]); + w2 = _mm256_unpackhi_epi16(x[12], x[13]); + w3 = _mm256_unpackhi_epi16(x[14], x[15]); + + ww0 = _mm256_unpacklo_epi32(w0, w1); + ww1 = _mm256_unpacklo_epi32(w2, w3); + + dd[12] = _mm256_unpacklo_epi64(ww0, ww1); + dd[13] = _mm256_unpackhi_epi64(ww0, ww1); + + ww0 = _mm256_unpackhi_epi32(w0, w1); + ww1 = _mm256_unpackhi_epi32(w2, w3); + + dd[14] = _mm256_unpacklo_epi64(ww0, ww1); + dd[15] = _mm256_unpackhi_epi64(ww0, ww1); + + for (int i = 0; i < 8; i++) { + d[i] = _mm256_insertf128_si256(dd[i], _mm256_castsi256_si128(dd[i + 8]), 1); + d[i + 8] = _mm256_insertf128_si256(dd[i + 8], + _mm256_extracti128_si256(dd[i], 1), 0); + } +} + void aom_dc_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m256i sum_above = dc_sum_32(above); @@ -169,34 +420,12 @@ void aom_h_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, // ----------------------------------------------------------------------------- // Rectangle - -// TODO(luoyi) The following two functions are shared with intrapred_sse2.c. -// Use a header file, intrapred_common_x86.h -static INLINE __m128i dc_sum_16_sse2(const uint8_t *ref) { - __m128i x = _mm_load_si128((__m128i const *)ref); - const __m128i zero = _mm_setzero_si128(); - x = _mm_sad_epu8(x, zero); - const __m128i high = _mm_unpackhi_epi64(x, x); - return _mm_add_epi16(x, high); -} - -static INLINE __m128i dc_sum_32_sse2(const uint8_t *ref) { - __m128i x0 = _mm_load_si128((__m128i const *)ref); - __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16)); - const __m128i zero = _mm_setzero_si128(); - x0 = _mm_sad_epu8(x0, zero); - x1 = _mm_sad_epu8(x1, zero); - x0 = _mm_add_epi16(x0, x1); - const __m128i high = _mm_unpackhi_epi64(x0, x0); - return _mm_add_epi16(x0, high); -} - void aom_dc_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m128i top_sum = dc_sum_32_sse2(above); __m128i left_sum = dc_sum_16_sse2(left); left_sum = _mm_add_epi16(top_sum, left_sum); - uint32_t sum = _mm_cvtsi128_si32(left_sum); + uint16_t sum = _mm_cvtsi128_si32(left_sum); sum += 24; sum /= 48; const __m256i row = _mm256_set1_epi8((uint8_t)sum); @@ -208,7 +437,7 @@ void aom_dc_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, const __m256i sum_above = dc_sum_32(above); __m256i sum_left = dc_sum_64(left); sum_left = _mm256_add_epi16(sum_left, sum_above); - uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left)); + uint16_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left)); sum += 48; sum /= 96; const __m256i row = _mm256_set1_epi8((uint8_t)sum); @@ -220,7 +449,7 @@ void aom_dc_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, const __m256i sum_above = dc_sum_64(above); __m256i sum_left = dc_sum_64(left); sum_left = _mm256_add_epi16(sum_left, sum_above); - uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left)); + uint16_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left)); sum += 64; sum /= 128; const __m256i row = _mm256_set1_epi8((uint8_t)sum); @@ -232,7 +461,7 @@ void aom_dc_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, const __m256i sum_above = dc_sum_64(above); __m256i sum_left = dc_sum_32(left); sum_left = 
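/* Hedged scalar sketch of the rectangular DC value the predictors above
 * compute: the above-row and left-column sums are combined and divided by
 * (width + height) with round-to-nearest, e.g. the 32x16 kernel adds 24 and
 * divides by 48. The largest 8-bit case (64x64) sums to at most
 * 128 * 255 = 32640, which is why a 16-bit scalar is wide enough here.
 */
#include <stdint.h>
static uint8_t dc_value_sketch(unsigned sum_above, unsigned sum_left, int w,
                               int h) {
  unsigned sum = sum_above + sum_left;
  sum += (unsigned)(w + h) / 2; /* 24 for 32x16, 48 for 32x64, ... */
  return (uint8_t)(sum / (unsigned)(w + h));
}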
_mm256_add_epi16(sum_left, sum_above); - uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left)); + uint16_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left)); sum += 48; sum /= 96; const __m256i row = _mm256_set1_epi8((uint8_t)sum); @@ -244,7 +473,7 @@ void aom_dc_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, const __m256i sum_above = dc_sum_64(above); __m256i sum_left = _mm256_castsi128_si256(dc_sum_16_sse2(left)); sum_left = _mm256_add_epi16(sum_left, sum_above); - uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left)); + uint16_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left)); sum += 40; sum /= 80; const __m256i row = _mm256_set1_epi8((uint8_t)sum); @@ -525,7 +754,7 @@ void aom_paeth_predictor_16x8_avx2(uint8_t *dst, ptrdiff_t stride, __m128i x = _mm_loadl_epi64((const __m128i *)left); const __m256i l = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1); const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]); - __m256i rep = _mm256_set1_epi16(0x8000); + __m256i rep = _mm256_set1_epi16((short)0x8000); const __m256i one = _mm256_set1_epi16(1); const __m256i top = get_top_vector(above); @@ -549,7 +778,7 @@ void aom_paeth_predictor_16x16_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m256i l = get_left_vector(left); const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]); - __m256i rep = _mm256_set1_epi16(0x8000); + __m256i rep = _mm256_set1_epi16((short)0x8000); const __m256i one = _mm256_set1_epi16(1); const __m256i top = get_top_vector(above); @@ -568,7 +797,7 @@ void aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { __m256i l = get_left_vector(left); const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]); - __m256i rep = _mm256_set1_epi16(0x8000); + __m256i rep = _mm256_set1_epi16((short)0x8000); const __m256i one = _mm256_set1_epi16(1); const __m256i top = get_top_vector(above); @@ -583,7 +812,7 @@ void aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t stride, } l = get_left_vector(left + 16); - rep = _mm256_set1_epi16(0x8000); + rep = _mm256_set1_epi16((short)0x8000); for (i = 0; i < 16; ++i) { const __m256i l16 = _mm256_shuffle_epi8(l, rep); const __m128i row = paeth_16x1_pred(&l16, &top, &tl16); @@ -602,7 +831,7 @@ void aom_paeth_predictor_16x64_avx2(uint8_t *dst, ptrdiff_t stride, for (int j = 0; j < 4; ++j) { const __m256i l = get_left_vector(left + j * 16); - __m256i rep = _mm256_set1_epi16(0x8000); + __m256i rep = _mm256_set1_epi16((short)0x8000); for (int i = 0; i < 16; ++i) { const __m256i l16 = _mm256_shuffle_epi8(l, rep); const __m128i row = paeth_16x1_pred(&l16, &top, &tl16); @@ -635,7 +864,7 @@ void aom_paeth_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, const __m256i t0 = get_top_vector(above); const __m256i t1 = get_top_vector(above + 16); const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]); - __m256i rep = _mm256_set1_epi16(0x8000); + __m256i rep = _mm256_set1_epi16((short)0x8000); const __m256i one = _mm256_set1_epi16(1); int i; @@ -657,7 +886,7 @@ void aom_paeth_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, const __m256i t0 = get_top_vector(above); const __m256i t1 = get_top_vector(above + 16); const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]); - __m256i rep = _mm256_set1_epi16(0x8000); + __m256i rep = _mm256_set1_epi16((short)0x8000); const __m256i one = _mm256_set1_epi16(1); int i; @@ -675,7 +904,7 @@ void aom_paeth_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t 
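/* Sketch of the narrowing issue the (short) casts above are presumably
 * addressing: 0x8000 and 0xFFFF do not fit in int16_t, so passing the bare
 * int literal to _mm_set1_epi16() / _mm256_set1_epi16() relies on an
 * implicit narrowing conversion that newer compilers flag; the explicit cast
 * keeps the same bit pattern in every lane while silencing the warning.
 */
#include <immintrin.h>
static __m256i broadcast_0x8000_sketch(void) {
  return _mm256_set1_epi16((short)0x8000); /* every 16-bit lane = 0x8000 */
}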
stride, } l = get_left_vector(left + 16); - rep = _mm256_set1_epi16(0x8000); + rep = _mm256_set1_epi16((short)0x8000); for (i = 0; i < 16; ++i) { const __m256i l16 = _mm256_shuffle_epi8(l, rep); @@ -700,7 +929,7 @@ void aom_paeth_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, int i, j; for (j = 0; j < 4; ++j) { const __m256i l = get_left_vector(left + j * 16); - __m256i rep = _mm256_set1_epi16(0x8000); + __m256i rep = _mm256_set1_epi16((short)0x8000); for (i = 0; i < 16; ++i) { const __m256i l16 = _mm256_shuffle_epi8(l, rep); @@ -728,7 +957,7 @@ void aom_paeth_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, int i, j; for (j = 0; j < 2; ++j) { const __m256i l = get_left_vector(left + j * 16); - __m256i rep = _mm256_set1_epi16(0x8000); + __m256i rep = _mm256_set1_epi16((short)0x8000); for (i = 0; i < 16; ++i) { const __m256i l16 = _mm256_shuffle_epi8(l, rep); @@ -760,7 +989,7 @@ void aom_paeth_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, int i, j; for (j = 0; j < 4; ++j) { const __m256i l = get_left_vector(left + j * 16); - __m256i rep = _mm256_set1_epi16(0x8000); + __m256i rep = _mm256_set1_epi16((short)0x8000); for (i = 0; i < 16; ++i) { const __m256i l16 = _mm256_shuffle_epi8(l, rep); @@ -791,7 +1020,7 @@ void aom_paeth_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, int i; const __m256i l = get_left_vector(left); - __m256i rep = _mm256_set1_epi16(0x8000); + __m256i rep = _mm256_set1_epi16((short)0x8000); for (i = 0; i < 16; ++i) { const __m256i l16 = _mm256_shuffle_epi8(l, rep); @@ -809,3 +1038,3858 @@ void aom_paeth_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, rep = _mm256_add_epi16(rep, one); } } + +#define PERM4x64(c0, c1, c2, c3) c0 + (c1 << 2) + (c2 << 4) + (c3 << 6) +#define PERM2x128(c0, c1) c0 + (c1 << 4) + +static AOM_FORCE_INLINE void highbd_dr_prediction_z1_4xN_internal_avx2( + int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) { + const int frac_bits = 6 - upsample_above; + const int max_base_x = ((N + 4) - 1) << upsample_above; + + assert(dx > 0); + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a1, a32, a16; + __m256i diff, c3f; + __m128i a_mbase_x, max_base_x128, base_inc128, mask128; + __m128i a0_128, a1_128; + a16 = _mm256_set1_epi16(16); + a_mbase_x = _mm_set1_epi16(above[max_base_x]); + max_base_x128 = _mm_set1_epi16(max_base_x); + c3f = _mm256_set1_epi16(0x3f); + + int x = dx; + for (int r = 0; r < N; r++) { + __m256i b, res, shift; + __m128i res1; + + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + dst[i] = a_mbase_x; // save 4 values + } + return; + } + + a0_128 = _mm_loadu_si128((__m128i *)(above + base)); + a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1)); + + if (upsample_above) { + a0_128 = _mm_shuffle_epi8(a0_128, *(__m128i *)HighbdEvenOddMaskx4[0]); + a1_128 = _mm_srli_si128(a0_128, 8); + + base_inc128 = _mm_setr_epi16(base, base + 2, base + 4, base + 6, base + 8, + base + 10, base + 12, base + 14); + shift = _mm256_srli_epi16( + _mm256_and_si256( + _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), + _mm256_set1_epi16(0x3f)), + 1); + } else { + base_inc128 = _mm_setr_epi16(base, base + 1, base + 2, base + 3, base + 4, + base + 5, base + 6, base + 7); + shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); + } + a0 = _mm256_castsi128_si256(a0_128); + a1 = 
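/* Hedged scalar restatement of the per-pixel arithmetic described in the
 * comments of the z1 helpers here (non-upsampled case): the 1/64-pel
 * position x selects a base sample and a 5-bit blend weight, and columns at
 * or beyond max_base_x simply repeat above[max_base_x].
 */
#include <stdint.h>
static uint16_t dr_z1_pixel_sketch(const uint16_t *above, int x, int frac_bits,
                                   int max_base_x) {
  const int base = x >> frac_bits;
  if (base >= max_base_x) return above[max_base_x];
  const int shift = (x & 0x3f) >> 1; /* 0..31 */
  return (uint16_t)(
      (above[base] * 32 + 16 + (above[base + 1] - above[base]) * shift) >> 5);
}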
_mm256_castsi128_si256(a1_128); + diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shift); + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + res1 = _mm256_castsi256_si128(res); + + mask128 = _mm_cmpgt_epi16(max_base_x128, base_inc128); + dst[r] = _mm_blendv_epi8(a_mbase_x, res1, mask128); + x += dx; + } +} + +static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_4xN_internal_avx2( + int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) { + const int frac_bits = 6 - upsample_above; + const int max_base_x = ((N + 4) - 1) << upsample_above; + + assert(dx > 0); + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a1, a32, a16; + __m256i diff; + __m128i a_mbase_x, max_base_x128, base_inc128, mask128; + + a16 = _mm256_set1_epi32(16); + a_mbase_x = _mm_set1_epi16(above[max_base_x]); + max_base_x128 = _mm_set1_epi32(max_base_x); + + int x = dx; + for (int r = 0; r < N; r++) { + __m256i b, res, shift; + __m128i res1; + + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + dst[i] = a_mbase_x; // save 4 values + } + return; + } + + a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base))); + a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1))); + + if (upsample_above) { + a0 = _mm256_permutevar8x32_epi32( + a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0)); + a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1)); + base_inc128 = _mm_setr_epi32(base, base + 2, base + 4, base + 6); + shift = _mm256_srli_epi32( + _mm256_and_si256( + _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above), + _mm256_set1_epi32(0x3f)), + 1); + } else { + base_inc128 = _mm_setr_epi32(base, base + 1, base + 2, base + 3); + shift = _mm256_srli_epi32( + _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1); + } + + diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi32(diff, shift); + res = _mm256_add_epi32(a32, b); + res = _mm256_srli_epi32(res, 5); + + res1 = _mm256_castsi256_si128(res); + res1 = _mm_packus_epi32(res1, res1); + + mask128 = _mm_cmpgt_epi32(max_base_x128, base_inc128); + mask128 = _mm_packs_epi32(mask128, mask128); // goto 16 bit + dst[r] = _mm_blendv_epi8(a_mbase_x, res1, mask128); + x += dx; + } +} + +static void highbd_dr_prediction_z1_4xN_avx2(int N, uint16_t *dst, + ptrdiff_t stride, + const uint16_t *above, + int upsample_above, int dx, + int bd) { + __m128i dstvec[16]; + if (bd < 12) { + highbd_dr_prediction_z1_4xN_internal_avx2(N, dstvec, above, upsample_above, + dx); + } else { + highbd_dr_prediction_32bit_z1_4xN_internal_avx2(N, dstvec, above, + upsample_above, dx); + } + for (int i = 0; i < N; i++) { + _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]); + } +} + +static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_8xN_internal_avx2( + int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) { + const int frac_bits = 6 - upsample_above; + const int max_base_x = ((8 + N) - 1) << upsample_above; + + assert(dx > 0); + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - 
above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a1, a0_1, a1_1, a32, a16; + __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; + + a16 = _mm256_set1_epi32(16); + a_mbase_x = _mm256_set1_epi16(above[max_base_x]); + max_base_x256 = _mm256_set1_epi32(max_base_x); + + int x = dx; + for (int r = 0; r < N; r++) { + __m256i b, res, res1, shift; + + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + dst[i] = _mm256_castsi256_si128(a_mbase_x); // save 8 values + } + return; + } + + a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base))); + a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1))); + + if (upsample_above) { + a0 = _mm256_permutevar8x32_epi32( + a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0)); + a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1)); + + a0_1 = + _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 8))); + a0_1 = _mm256_permutevar8x32_epi32( + a0_1, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0)); + a1_1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0_1, 1)); + + a0 = _mm256_inserti128_si256(a0, _mm256_castsi256_si128(a0_1), 1); + a1 = _mm256_inserti128_si256(a1, _mm256_castsi256_si128(a1_1), 1); + base_inc256 = + _mm256_setr_epi32(base, base + 2, base + 4, base + 6, base + 8, + base + 10, base + 12, base + 14); + shift = _mm256_srli_epi32( + _mm256_and_si256( + _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above), + _mm256_set1_epi32(0x3f)), + 1); + } else { + base_inc256 = _mm256_setr_epi32(base, base + 1, base + 2, base + 3, + base + 4, base + 5, base + 6, base + 7); + shift = _mm256_srli_epi32( + _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1); + } + + diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi32(diff, shift); + res = _mm256_add_epi32(a32, b); + res = _mm256_srli_epi32(res, 5); + + res1 = _mm256_packus_epi32( + res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))); + + mask256 = _mm256_cmpgt_epi32(max_base_x256, base_inc256); + mask256 = _mm256_packs_epi32( + mask256, _mm256_castsi128_si256( + _mm256_extracti128_si256(mask256, 1))); // goto 16 bit + res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256); + dst[r] = _mm256_castsi256_si128(res1); + x += dx; + } +} + +static AOM_FORCE_INLINE void highbd_dr_prediction_z1_8xN_internal_avx2( + int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) { + const int frac_bits = 6 - upsample_above; + const int max_base_x = ((8 + N) - 1) << upsample_above; + + assert(dx > 0); + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a1, a32, a16, c3f; + __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; + __m128i a0_x128, a1_x128; + + a16 = _mm256_set1_epi16(16); + a_mbase_x = _mm256_set1_epi16(above[max_base_x]); + max_base_x256 = _mm256_set1_epi16(max_base_x); + c3f = _mm256_set1_epi16(0x3f); + + int x = dx; + for (int r = 0; r < N; r++) { + __m256i b, res, res1, shift; + + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + dst[i] = _mm256_castsi256_si128(a_mbase_x); // save 8 values + } + return; + } + + a0_x128 = _mm_loadu_si128((__m128i *)(above + base)); + 
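+ // The even/odd shuffle in the branch below de-interleaves the upsampled
+ // edge so that a0 holds a[x] and a1 holds a[x+1] in each lane. An
+ // illustrative scalar form of the interpolation that follows (same for
+ // both branches; upsample_above is 0 in the else path):
+ //   shift = ((x << upsample_above) & 0x3f) >> 1;
+ //   pixel = (a[x] * 32 + 16 + (a[x+1] - a[x]) * shift) >> 5;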
if (upsample_above) { + __m128i mask, atmp0, atmp1, atmp2, atmp3; + a1_x128 = _mm_loadu_si128((__m128i *)(above + base + 8)); + atmp0 = _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdEvenOddMaskx[0]); + atmp1 = _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdEvenOddMaskx[0]); + atmp2 = + _mm_shuffle_epi8(a0_x128, *(__m128i *)(HighbdEvenOddMaskx[0] + 16)); + atmp3 = + _mm_shuffle_epi8(a1_x128, *(__m128i *)(HighbdEvenOddMaskx[0] + 16)); + mask = + _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[0], _mm_set1_epi8(15)); + a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask); + mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[0] + 16), + _mm_set1_epi8(15)); + a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask); + + base_inc256 = _mm256_setr_epi16(base, base + 2, base + 4, base + 6, + base + 8, base + 10, base + 12, base + 14, + 0, 0, 0, 0, 0, 0, 0, 0); + shift = _mm256_srli_epi16( + _mm256_and_si256( + _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), c3f), + 1); + } else { + a1_x128 = _mm_loadu_si128((__m128i *)(above + base + 1)); + base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3, + base + 4, base + 5, base + 6, base + 7, 0, + 0, 0, 0, 0, 0, 0, 0); + shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); + } + a0 = _mm256_castsi128_si256(a0_x128); + a1 = _mm256_castsi128_si256(a1_x128); + + diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shift); + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + + mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256); + res1 = _mm256_blendv_epi8(a_mbase_x, res, mask256); + dst[r] = _mm256_castsi256_si128(res1); + x += dx; + } +} + +static void highbd_dr_prediction_z1_8xN_avx2(int N, uint16_t *dst, + ptrdiff_t stride, + const uint16_t *above, + int upsample_above, int dx, + int bd) { + __m128i dstvec[32]; + if (bd < 12) { + highbd_dr_prediction_z1_8xN_internal_avx2(N, dstvec, above, upsample_above, + dx); + } else { + highbd_dr_prediction_32bit_z1_8xN_internal_avx2(N, dstvec, above, + upsample_above, dx); + } + for (int i = 0; i < N; i++) { + _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]); + } +} + +static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_16xN_internal_avx2( + int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) { + // here upsample_above is 0 by design of av1_use_intra_edge_upsample + (void)upsample_above; + const int frac_bits = 6; + const int max_base_x = ((16 + N) - 1); + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a0_1, a1, a1_1, a32, a16; + __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; + + a16 = _mm256_set1_epi32(16); + a_mbase_x = _mm256_set1_epi16(above[max_base_x]); + max_base_x256 = _mm256_set1_epi16(max_base_x); + + int x = dx; + for (int r = 0; r < N; r++) { + __m256i b, res[2], res1; + + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + dstvec[i] = a_mbase_x; // save 16 values + } + return; + } + __m256i shift = _mm256_srli_epi32( + _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1); + + a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base))); + a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1))); + + diff = 
_mm256_sub_epi32(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi32(diff, shift); + + res[0] = _mm256_add_epi32(a32, b); + res[0] = _mm256_srli_epi32(res[0], 5); + res[0] = _mm256_packus_epi32( + res[0], _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1))); + + int mdif = max_base_x - base; + if (mdif > 8) { + a0_1 = + _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 8))); + a1_1 = + _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 9))); + + diff = _mm256_sub_epi32(a1_1, a0_1); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_1, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi32(diff, shift); + + res[1] = _mm256_add_epi32(a32, b); + res[1] = _mm256_srli_epi32(res[1], 5); + res[1] = _mm256_packus_epi32( + res[1], _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1))); + } else { + res[1] = a_mbase_x; + } + res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]), + 1); // 16 16bit values + + base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3, + base + 4, base + 5, base + 6, base + 7, + base + 8, base + 9, base + 10, base + 11, + base + 12, base + 13, base + 14, base + 15); + mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256); + dstvec[r] = _mm256_blendv_epi8(a_mbase_x, res1, mask256); + x += dx; + } +} + +static AOM_FORCE_INLINE void highbd_dr_prediction_z1_16xN_internal_avx2( + int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) { + // here upsample_above is 0 by design of av1_use_intra_edge_upsample + (void)upsample_above; + const int frac_bits = 6; + const int max_base_x = ((16 + N) - 1); + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a1, a32, a16, c3f; + __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; + + a16 = _mm256_set1_epi16(16); + a_mbase_x = _mm256_set1_epi16(above[max_base_x]); + max_base_x256 = _mm256_set1_epi16(max_base_x); + c3f = _mm256_set1_epi16(0x3f); + + int x = dx; + for (int r = 0; r < N; r++) { + __m256i b, res; + + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + dstvec[i] = a_mbase_x; // save 16 values + } + return; + } + __m256i shift = + _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); + + a0 = _mm256_loadu_si256((__m256i *)(above + base)); + a1 = _mm256_loadu_si256((__m256i *)(above + base + 1)); + + diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi16(diff, shift); + + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); // 16 16bit values + + base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3, + base + 4, base + 5, base + 6, base + 7, + base + 8, base + 9, base + 10, base + 11, + base + 12, base + 13, base + 14, base + 15); + mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256); + dstvec[r] = _mm256_blendv_epi8(a_mbase_x, res, mask256); + x += dx; + } +} + +static void highbd_dr_prediction_z1_16xN_avx2(int N, uint16_t *dst, + ptrdiff_t stride, + const uint16_t *above, + int upsample_above, int dx, + int bd) { + __m256i dstvec[64]; + if (bd < 12) { + highbd_dr_prediction_z1_16xN_internal_avx2(N, dstvec, above, 
upsample_above, + dx); + } else { + highbd_dr_prediction_32bit_z1_16xN_internal_avx2(N, dstvec, above, + upsample_above, dx); + } + for (int i = 0; i < N; i++) { + _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]); + } +} + +static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_32xN_internal_avx2( + int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) { + // here upsample_above is 0 by design of av1_use_intra_edge_upsample + (void)upsample_above; + const int frac_bits = 6; + const int max_base_x = ((32 + N) - 1); + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a0_1, a1, a1_1, a32, a16, c3f; + __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; + + a16 = _mm256_set1_epi32(16); + a_mbase_x = _mm256_set1_epi16(above[max_base_x]); + max_base_x256 = _mm256_set1_epi16(max_base_x); + c3f = _mm256_set1_epi16(0x3f); + + int x = dx; + for (int r = 0; r < N; r++) { + __m256i b, res[2], res1; + + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + dstvec[i] = a_mbase_x; // save 32 values + dstvec[i + N] = a_mbase_x; + } + return; + } + + __m256i shift = + _mm256_srli_epi32(_mm256_and_si256(_mm256_set1_epi32(x), c3f), 1); + + for (int j = 0; j < 32; j += 16) { + int mdif = max_base_x - (base + j); + if (mdif <= 0) { + res1 = a_mbase_x; + } else { + a0 = _mm256_cvtepu16_epi32( + _mm_loadu_si128((__m128i *)(above + base + j))); + a1 = _mm256_cvtepu16_epi32( + _mm_loadu_si128((__m128i *)(above + base + 1 + j))); + + diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi32(diff, shift); + + res[0] = _mm256_add_epi32(a32, b); + res[0] = _mm256_srli_epi32(res[0], 5); + res[0] = _mm256_packus_epi32( + res[0], + _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1))); + if (mdif > 8) { + a0_1 = _mm256_cvtepu16_epi32( + _mm_loadu_si128((__m128i *)(above + base + 8 + j))); + a1_1 = _mm256_cvtepu16_epi32( + _mm_loadu_si128((__m128i *)(above + base + 9 + j))); + + diff = _mm256_sub_epi32(a1_1, a0_1); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_1, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi32(diff, shift); + + res[1] = _mm256_add_epi32(a32, b); + res[1] = _mm256_srli_epi32(res[1], 5); + res[1] = _mm256_packus_epi32( + res[1], + _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1))); + } else { + res[1] = a_mbase_x; + } + res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]), + 1); // 16 16bit values + base_inc256 = _mm256_setr_epi16( + base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4, + base + j + 5, base + j + 6, base + j + 7, base + j + 8, + base + j + 9, base + j + 10, base + j + 11, base + j + 12, + base + j + 13, base + j + 14, base + j + 15); + + mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256); + res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256); + } + if (!j) { + dstvec[r] = res1; + } else { + dstvec[r + N] = res1; + } + } + x += dx; + } +} + +static AOM_FORCE_INLINE void highbd_dr_prediction_z1_32xN_internal_avx2( + int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) { + // here upsample_above is 0 by design of av1_use_intra_edge_upsample + (void)upsample_above; + const int frac_bits = 6; + const int 
max_base_x = ((32 + N) - 1); + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a1, a32, a16, c3f; + __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; + + a16 = _mm256_set1_epi16(16); + a_mbase_x = _mm256_set1_epi16(above[max_base_x]); + max_base_x256 = _mm256_set1_epi16(max_base_x); + c3f = _mm256_set1_epi16(0x3f); + + int x = dx; + for (int r = 0; r < N; r++) { + __m256i b, res; + + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + dstvec[i] = a_mbase_x; // save 32 values + dstvec[i + N] = a_mbase_x; + } + return; + } + + __m256i shift = + _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); + + for (int j = 0; j < 32; j += 16) { + int mdif = max_base_x - (base + j); + if (mdif <= 0) { + res = a_mbase_x; + } else { + a0 = _mm256_loadu_si256((__m256i *)(above + base + j)); + a1 = _mm256_loadu_si256((__m256i *)(above + base + 1 + j)); + + diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi16(diff, shift); + + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + + base_inc256 = _mm256_setr_epi16( + base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4, + base + j + 5, base + j + 6, base + j + 7, base + j + 8, + base + j + 9, base + j + 10, base + j + 11, base + j + 12, + base + j + 13, base + j + 14, base + j + 15); + + mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256); + res = _mm256_blendv_epi8(a_mbase_x, res, mask256); + } + if (!j) { + dstvec[r] = res; + } else { + dstvec[r + N] = res; + } + } + x += dx; + } +} + +static void highbd_dr_prediction_z1_32xN_avx2(int N, uint16_t *dst, + ptrdiff_t stride, + const uint16_t *above, + int upsample_above, int dx, + int bd) { + __m256i dstvec[128]; + if (bd < 12) { + highbd_dr_prediction_z1_32xN_internal_avx2(N, dstvec, above, upsample_above, + dx); + } else { + highbd_dr_prediction_32bit_z1_32xN_internal_avx2(N, dstvec, above, + upsample_above, dx); + } + for (int i = 0; i < N; i++) { + _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]); + _mm256_storeu_si256((__m256i *)(dst + stride * i + 16), dstvec[i + N]); + } +} + +static void highbd_dr_prediction_32bit_z1_64xN_avx2(int N, uint16_t *dst, + ptrdiff_t stride, + const uint16_t *above, + int upsample_above, + int dx) { + // here upsample_above is 0 by design of av1_use_intra_edge_upsample + (void)upsample_above; + const int frac_bits = 6; + const int max_base_x = ((64 + N) - 1); + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a0_1, a1, a1_1, a32, a16; + __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; + + a16 = _mm256_set1_epi32(16); + a_mbase_x = _mm256_set1_epi16(above[max_base_x]); + max_base_x256 = _mm256_set1_epi16(max_base_x); + + int x = dx; + for (int r = 0; r < N; r++, dst += stride) { + __m256i b, res[2], res1; + + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + _mm256_storeu_si256((__m256i *)dst, a_mbase_x); // save 32 values + _mm256_storeu_si256((__m256i *)(dst + 16), a_mbase_x); + _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x); + 
_mm256_storeu_si256((__m256i *)(dst + 48), a_mbase_x); + dst += stride; + } + return; + } + + __m256i shift = _mm256_srli_epi32( + _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1); + + __m128i a0_128, a0_1_128, a1_128, a1_1_128; + for (int j = 0; j < 64; j += 16) { + int mdif = max_base_x - (base + j); + if (mdif <= 0) { + _mm256_storeu_si256((__m256i *)(dst + j), a_mbase_x); + } else { + a0_128 = _mm_loadu_si128((__m128i *)(above + base + j)); + a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1 + j)); + a0 = _mm256_cvtepu16_epi32(a0_128); + a1 = _mm256_cvtepu16_epi32(a1_128); + + diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi32(diff, shift); + + res[0] = _mm256_add_epi32(a32, b); + res[0] = _mm256_srli_epi32(res[0], 5); + res[0] = _mm256_packus_epi32( + res[0], + _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1))); + if (mdif > 8) { + a0_1_128 = _mm_loadu_si128((__m128i *)(above + base + 8 + j)); + a1_1_128 = _mm_loadu_si128((__m128i *)(above + base + 9 + j)); + a0_1 = _mm256_cvtepu16_epi32(a0_1_128); + a1_1 = _mm256_cvtepu16_epi32(a1_1_128); + + diff = _mm256_sub_epi32(a1_1, a0_1); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_1, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi32(diff, shift); + + res[1] = _mm256_add_epi32(a32, b); + res[1] = _mm256_srli_epi32(res[1], 5); + res[1] = _mm256_packus_epi32( + res[1], + _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1))); + } else { + res[1] = a_mbase_x; + } + res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]), + 1); // 16 16bit values + base_inc256 = _mm256_setr_epi16( + base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4, + base + j + 5, base + j + 6, base + j + 7, base + j + 8, + base + j + 9, base + j + 10, base + j + 11, base + j + 12, + base + j + 13, base + j + 14, base + j + 15); + + mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256); + res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256); + _mm256_storeu_si256((__m256i *)(dst + j), res1); + } + } + x += dx; + } +} + +static void highbd_dr_prediction_z1_64xN_avx2(int N, uint16_t *dst, + ptrdiff_t stride, + const uint16_t *above, + int upsample_above, int dx) { + // here upsample_above is 0 by design of av1_use_intra_edge_upsample + (void)upsample_above; + const int frac_bits = 6; + const int max_base_x = ((64 + N) - 1); + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a1, a32, a16, c3f; + __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; + + a16 = _mm256_set1_epi16(16); + a_mbase_x = _mm256_set1_epi16(above[max_base_x]); + max_base_x256 = _mm256_set1_epi16(max_base_x); + c3f = _mm256_set1_epi16(0x3f); + + int x = dx; + for (int r = 0; r < N; r++, dst += stride) { + __m256i b, res; + + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + _mm256_storeu_si256((__m256i *)dst, a_mbase_x); // save 32 values + _mm256_storeu_si256((__m256i *)(dst + 16), a_mbase_x); + _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x); + _mm256_storeu_si256((__m256i *)(dst + 48), a_mbase_x); + dst += stride; + } + return; + } + + __m256i shift = + _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); + + for (int j = 0; j < 
64; j += 16) { + int mdif = max_base_x - (base + j); + if (mdif <= 0) { + _mm256_storeu_si256((__m256i *)(dst + j), a_mbase_x); + } else { + a0 = _mm256_loadu_si256((__m256i *)(above + base + j)); + a1 = _mm256_loadu_si256((__m256i *)(above + base + 1 + j)); + + diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi16(diff, shift); + + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + + base_inc256 = _mm256_setr_epi16( + base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4, + base + j + 5, base + j + 6, base + j + 7, base + j + 8, + base + j + 9, base + j + 10, base + j + 11, base + j + 12, + base + j + 13, base + j + 14, base + j + 15); + + mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256); + res = _mm256_blendv_epi8(a_mbase_x, res, mask256); + _mm256_storeu_si256((__m256i *)(dst + j), res); // 16 16bit values + } + } + x += dx; + } +} + +// Directional prediction, zone 1: 0 < angle < 90 +void av1_highbd_dr_prediction_z1_avx2(uint16_t *dst, ptrdiff_t stride, int bw, + int bh, const uint16_t *above, + const uint16_t *left, int upsample_above, + int dx, int dy, int bd) { + (void)left; + (void)dy; + + switch (bw) { + case 4: + highbd_dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above, + dx, bd); + break; + case 8: + highbd_dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above, + dx, bd); + break; + case 16: + highbd_dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above, + dx, bd); + break; + case 32: + highbd_dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above, + dx, bd); + break; + case 64: + if (bd < 12) { + highbd_dr_prediction_z1_64xN_avx2(bh, dst, stride, above, + upsample_above, dx); + } else { + highbd_dr_prediction_32bit_z1_64xN_avx2(bh, dst, stride, above, + upsample_above, dx); + } + break; + default: break; + } + return; +} + +static void highbd_transpose_TX_16X16(const uint16_t *src, ptrdiff_t pitchSrc, + uint16_t *dst, ptrdiff_t pitchDst) { + __m256i r[16]; + __m256i d[16]; + for (int j = 0; j < 16; j++) { + r[j] = _mm256_loadu_si256((__m256i *)(src + j * pitchSrc)); + } + highbd_transpose16x16_avx2(r, d); + for (int j = 0; j < 16; j++) { + _mm256_storeu_si256((__m256i *)(dst + j * pitchDst), d[j]); + } +} + +static void highbd_transpose(const uint16_t *src, ptrdiff_t pitchSrc, + uint16_t *dst, ptrdiff_t pitchDst, int width, + int height) { + for (int j = 0; j < height; j += 16) + for (int i = 0; i < width; i += 16) + highbd_transpose_TX_16X16(src + i * pitchSrc + j, pitchSrc, + dst + j * pitchDst + i, pitchDst); +} + +static void highbd_dr_prediction_32bit_z2_Nx4_avx2( + int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above, + const uint16_t *left, int upsample_above, int upsample_left, int dx, + int dy) { + const int min_base_x = -(1 << upsample_above); + const int min_base_y = -(1 << upsample_left); + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + + assert(dx > 0); + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0_x, a1_x, a32, a16; + __m256i diff; + __m128i c3f, min_base_y128; + + a16 = _mm256_set1_epi32(16); + c3f = _mm_set1_epi32(0x3f); + min_base_y128 = _mm_set1_epi32(min_base_y); + + for (int r = 0; r < N; r++) { + __m256i b, res, shift; + 
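+ // resx below is interpolated from the above row and resy from the left
+ // column; HighbdBaseMask[base_min_diff] later picks, per lane, whichever
+ // reference is valid for this row.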
__m128i resx, resy, resxy; + __m128i a0_x128, a1_x128; + int y = r + 1; + int base_x = (-y * dx) >> frac_bits_x; + int base_shift = 0; + if (base_x < (min_base_x - 1)) { + base_shift = (min_base_x - base_x) >> upsample_above; + } + int base_min_diff = + (min_base_x - base_x + upsample_above) >> upsample_above; + if (base_min_diff > 4) { + base_min_diff = 4; + } else { + if (base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift > 3) { + a0_x = _mm256_setzero_si256(); + a1_x = _mm256_setzero_si256(); + shift = _mm256_setzero_si256(); + } else { + a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); + if (upsample_above) { + a0_x128 = _mm_shuffle_epi8(a0_x128, + *(__m128i *)HighbdEvenOddMaskx4[base_shift]); + a1_x128 = _mm_srli_si128(a0_x128, 8); + + shift = _mm256_castsi128_si256(_mm_srli_epi32( + _mm_and_si128( + _mm_slli_epi32( + _mm_setr_epi32(-y * dx, (1 << 6) - y * dx, + (2 << 6) - y * dx, (3 << 6) - y * dx), + upsample_above), + c3f), + 1)); + } else { + a0_x128 = + _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + a1_x128 = _mm_srli_si128(a0_x128, 2); + + shift = _mm256_castsi128_si256(_mm_srli_epi32( + _mm_and_si128(_mm_setr_epi32(-y * dx, (1 << 6) - y * dx, + (2 << 6) - y * dx, (3 << 6) - y * dx), + c3f), + 1)); + } + a0_x = _mm256_cvtepu16_epi32(a0_x128); + a1_x = _mm256_cvtepu16_epi32(a1_x128); + } + // y calc + __m128i a0_y, a1_y, shifty; + if (base_x < min_base_x) { + __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128; + DECLARE_ALIGNED(32, int, base_y_c[4]); + r6 = _mm_set1_epi32(r << 6); + dy128 = _mm_set1_epi32(dy); + c1234 = _mm_setr_epi32(1, 2, 3, 4); + y_c128 = _mm_sub_epi32(r6, _mm_mullo_epi32(c1234, dy128)); + base_y_c128 = _mm_srai_epi32(y_c128, frac_bits_y); + mask128 = _mm_cmpgt_epi32(min_base_y128, base_y_c128); + base_y_c128 = _mm_andnot_si128(mask128, base_y_c128); + _mm_store_si128((__m128i *)base_y_c, base_y_c128); + + a0_y = _mm_setr_epi32(left[base_y_c[0]], left[base_y_c[1]], + left[base_y_c[2]], left[base_y_c[3]]); + a1_y = _mm_setr_epi32(left[base_y_c[0] + 1], left[base_y_c[1] + 1], + left[base_y_c[2] + 1], left[base_y_c[3] + 1]); + + if (upsample_left) { + shifty = _mm_srli_epi32( + _mm_and_si128(_mm_slli_epi32(y_c128, upsample_left), c3f), 1); + } else { + shifty = _mm_srli_epi32(_mm_and_si128(y_c128, c3f), 1); + } + a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1); + a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1); + shift = _mm256_inserti128_si256(shift, shifty, 1); + } + + diff = _mm256_sub_epi32(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_x, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi32(diff, shift); + res = _mm256_add_epi32(a32, b); + res = _mm256_srli_epi32(res, 5); + + resx = _mm256_castsi256_si128(res); + resx = _mm_packus_epi32(resx, resx); + + resy = _mm256_extracti128_si256(res, 1); + resy = _mm_packus_epi32(resy, resy); + + resxy = + _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]); + _mm_storel_epi64((__m128i *)(dst), resxy); + dst += stride; + } +} + +static void highbd_dr_prediction_z2_Nx4_avx2( + int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above, + const uint16_t *left, int upsample_above, int upsample_left, int dx, + int dy) { + const int min_base_x = -(1 << upsample_above); + const int min_base_y = -(1 << upsample_left); + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + + assert(dx > 0); + // pre-filter above pixels + // store in temp buffers: + // above[x] 
* 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0_x, a1_x, a32, a16; + __m256i diff; + __m128i c3f, min_base_y128; + + a16 = _mm256_set1_epi16(16); + c3f = _mm_set1_epi16(0x3f); + min_base_y128 = _mm_set1_epi16(min_base_y); + + for (int r = 0; r < N; r++) { + __m256i b, res, shift; + __m128i resx, resy, resxy; + __m128i a0_x128, a1_x128; + int y = r + 1; + int base_x = (-y * dx) >> frac_bits_x; + int base_shift = 0; + if (base_x < (min_base_x - 1)) { + base_shift = (min_base_x - base_x) >> upsample_above; + } + int base_min_diff = + (min_base_x - base_x + upsample_above) >> upsample_above; + if (base_min_diff > 4) { + base_min_diff = 4; + } else { + if (base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift > 3) { + a0_x = _mm256_setzero_si256(); + a1_x = _mm256_setzero_si256(); + shift = _mm256_setzero_si256(); + } else { + a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); + if (upsample_above) { + a0_x128 = _mm_shuffle_epi8(a0_x128, + *(__m128i *)HighbdEvenOddMaskx4[base_shift]); + a1_x128 = _mm_srli_si128(a0_x128, 8); + + shift = _mm256_castsi128_si256(_mm_srli_epi16( + _mm_and_si128( + _mm_slli_epi16(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx, + (2 << 6) - y * dx, + (3 << 6) - y * dx, 0, 0, 0, 0), + upsample_above), + c3f), + 1)); + } else { + a0_x128 = + _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + a1_x128 = _mm_srli_si128(a0_x128, 2); + + shift = _mm256_castsi128_si256(_mm_srli_epi16( + _mm_and_si128( + _mm_setr_epi16(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx, + (3 << 6) - y * dx, 0, 0, 0, 0), + c3f), + 1)); + } + a0_x = _mm256_castsi128_si256(a0_x128); + a1_x = _mm256_castsi128_si256(a1_x128); + } + // y calc + __m128i a0_y, a1_y, shifty; + if (base_x < min_base_x) { + __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128; + DECLARE_ALIGNED(32, int16_t, base_y_c[8]); + r6 = _mm_set1_epi16(r << 6); + dy128 = _mm_set1_epi16(dy); + c1234 = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0); + y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128)); + base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y); + mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128); + base_y_c128 = _mm_andnot_si128(mask128, base_y_c128); + _mm_store_si128((__m128i *)base_y_c, base_y_c128); + + a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], + left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0); + a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1], + left[base_y_c[2] + 1], left[base_y_c[3] + 1], 0, 0, + 0, 0); + + if (upsample_left) { + shifty = _mm_srli_epi16( + _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1); + } else { + shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1); + } + a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1); + a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1); + shift = _mm256_inserti128_si256(shift, shifty, 1); + } + + diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shift); + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + + resx = _mm256_castsi256_si128(res); + resy = _mm256_extracti128_si256(res, 1); + resxy = + _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]); + _mm_storel_epi64((__m128i *)(dst), resxy); + dst += stride; + } +} + +static void highbd_dr_prediction_32bit_z2_Nx8_avx2( + int N, uint16_t *dst, ptrdiff_t stride, 
const uint16_t *above, + const uint16_t *left, int upsample_above, int upsample_left, int dx, + int dy) { + const int min_base_x = -(1 << upsample_above); + const int min_base_y = -(1 << upsample_left); + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c3f, min_base_y256; + __m256i diff; + __m128i a0_x128, a1_x128; + + a16 = _mm256_set1_epi32(16); + c3f = _mm256_set1_epi32(0x3f); + min_base_y256 = _mm256_set1_epi32(min_base_y); + + for (int r = 0; r < N; r++) { + __m256i b, res, shift; + __m128i resx, resy, resxy; + int y = r + 1; + int base_x = (-y * dx) >> frac_bits_x; + int base_shift = 0; + if (base_x < (min_base_x - 1)) { + base_shift = (min_base_x - base_x) >> upsample_above; + } + int base_min_diff = + (min_base_x - base_x + upsample_above) >> upsample_above; + if (base_min_diff > 8) { + base_min_diff = 8; + } else { + if (base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift > 7) { + resx = _mm_setzero_si128(); + } else { + a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); + if (upsample_above) { + __m128i mask, atmp0, atmp1, atmp2, atmp3; + a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 8 + base_shift)); + atmp0 = _mm_shuffle_epi8(a0_x128, + *(__m128i *)HighbdEvenOddMaskx[base_shift]); + atmp1 = _mm_shuffle_epi8(a1_x128, + *(__m128i *)HighbdEvenOddMaskx[base_shift]); + atmp2 = _mm_shuffle_epi8( + a0_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16)); + atmp3 = _mm_shuffle_epi8( + a1_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16)); + mask = _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[base_shift], + _mm_set1_epi8(15)); + a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask); + mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16), + _mm_set1_epi8(15)); + a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask); + shift = _mm256_srli_epi32( + _mm256_and_si256( + _mm256_slli_epi32( + _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx, + (2 << 6) - y * dx, (3 << 6) - y * dx, + (4 << 6) - y * dx, (5 << 6) - y * dx, + (6 << 6) - y * dx, (7 << 6) - y * dx), + upsample_above), + c3f), + 1); + } else { + a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift)); + a0_x128 = + _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + a1_x128 = + _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + + shift = _mm256_srli_epi32( + _mm256_and_si256( + _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx, + (3 << 6) - y * dx, (4 << 6) - y * dx, + (5 << 6) - y * dx, (6 << 6) - y * dx, + (7 << 6) - y * dx), + c3f), + 1); + } + a0_x = _mm256_cvtepu16_epi32(a0_x128); + a1_x = _mm256_cvtepu16_epi32(a1_x128); + + diff = _mm256_sub_epi32(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_x, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi32(diff, shift); + res = _mm256_add_epi32(a32, b); + res = _mm256_srli_epi32(res, 5); + + resx = _mm256_castsi256_si128(_mm256_packus_epi32( + res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)))); + } + // y calc + if (base_x < min_base_x) { + DECLARE_ALIGNED(32, int, base_y_c[8]); + __m256i r6, c256, dy256, y_c256, base_y_c256, mask256; + r6 = _mm256_set1_epi32(r << 6); + dy256 = _mm256_set1_epi32(dy); + c256 = 
_mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256)); + base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y); + mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256); + base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256); + _mm256_store_si256((__m256i *)base_y_c, base_y_c256); + + a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16( + left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]], + left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]])); + a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16( + left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1], + left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1], + left[base_y_c[6] + 1], left[base_y_c[7] + 1])); + + if (upsample_left) { + shift = _mm256_srli_epi32( + _mm256_and_si256(_mm256_slli_epi32((y_c256), upsample_left), c3f), + 1); + } else { + shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1); + } + diff = _mm256_sub_epi32(a1_y, a0_y); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_y, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi32(diff, shift); + res = _mm256_add_epi32(a32, b); + res = _mm256_srli_epi32(res, 5); + + resy = _mm256_castsi256_si128(_mm256_packus_epi32( + res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)))); + } else { + resy = resx; + } + resxy = + _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]); + _mm_storeu_si128((__m128i *)(dst), resxy); + dst += stride; + } +} + +static void highbd_dr_prediction_z2_Nx8_avx2( + int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above, + const uint16_t *left, int upsample_above, int upsample_left, int dx, + int dy) { + const int min_base_x = -(1 << upsample_above); + const int min_base_y = -(1 << upsample_left); + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m128i c3f, min_base_y128; + __m256i a0_x, a1_x, diff, a32, a16; + __m128i a0_x128, a1_x128; + + a16 = _mm256_set1_epi16(16); + c3f = _mm_set1_epi16(0x3f); + min_base_y128 = _mm_set1_epi16(min_base_y); + + for (int r = 0; r < N; r++) { + __m256i b, res, shift; + __m128i resx, resy, resxy; + int y = r + 1; + int base_x = (-y * dx) >> frac_bits_x; + int base_shift = 0; + if (base_x < (min_base_x - 1)) { + base_shift = (min_base_x - base_x) >> upsample_above; + } + int base_min_diff = + (min_base_x - base_x + upsample_above) >> upsample_above; + if (base_min_diff > 8) { + base_min_diff = 8; + } else { + if (base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift > 7) { + a0_x = _mm256_setzero_si256(); + a1_x = _mm256_setzero_si256(); + shift = _mm256_setzero_si256(); + } else { + a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); + if (upsample_above) { + __m128i mask, atmp0, atmp1, atmp2, atmp3; + a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 8 + base_shift)); + atmp0 = _mm_shuffle_epi8(a0_x128, + *(__m128i *)HighbdEvenOddMaskx[base_shift]); + atmp1 = _mm_shuffle_epi8(a1_x128, + *(__m128i *)HighbdEvenOddMaskx[base_shift]); + atmp2 = _mm_shuffle_epi8( + a0_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16)); + atmp3 = _mm_shuffle_epi8( + a1_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16)); + mask = 
_mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[base_shift], + _mm_set1_epi8(15)); + a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask); + mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16), + _mm_set1_epi8(15)); + a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask); + + shift = _mm256_castsi128_si256(_mm_srli_epi16( + _mm_and_si128( + _mm_slli_epi16( + _mm_setr_epi16(-y * dx, (1 << 6) - y * dx, + (2 << 6) - y * dx, (3 << 6) - y * dx, + (4 << 6) - y * dx, (5 << 6) - y * dx, + (6 << 6) - y * dx, (7 << 6) - y * dx), + upsample_above), + c3f), + 1)); + } else { + a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift)); + a0_x128 = + _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + a1_x128 = + _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + + shift = _mm256_castsi128_si256(_mm_srli_epi16( + _mm_and_si128(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx, + (2 << 6) - y * dx, (3 << 6) - y * dx, + (4 << 6) - y * dx, (5 << 6) - y * dx, + (6 << 6) - y * dx, (7 << 6) - y * dx), + c3f), + 1)); + } + a0_x = _mm256_castsi128_si256(a0_x128); + a1_x = _mm256_castsi128_si256(a1_x128); + } + + // y calc + __m128i a0_y, a1_y, shifty; + if (base_x < min_base_x) { + DECLARE_ALIGNED(32, int16_t, base_y_c[8]); + __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128; + r6 = _mm_set1_epi16(r << 6); + dy128 = _mm_set1_epi16(dy); + c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); + y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128)); + base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y); + mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128); + base_y_c128 = _mm_andnot_si128(mask128, base_y_c128); + _mm_store_si128((__m128i *)base_y_c, base_y_c128); + + a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], + left[base_y_c[2]], left[base_y_c[3]], + left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]]); + a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1], + left[base_y_c[2] + 1], left[base_y_c[3] + 1], + left[base_y_c[4] + 1], left[base_y_c[5] + 1], + left[base_y_c[6] + 1], left[base_y_c[7] + 1]); + + if (upsample_left) { + shifty = _mm_srli_epi16( + _mm_and_si128(_mm_slli_epi16((y_c128), upsample_left), c3f), 1); + } else { + shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1); + } + a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1); + a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1); + shift = _mm256_inserti128_si256(shift, shifty, 1); + } + + diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shift); + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + + resx = _mm256_castsi256_si128(res); + resy = _mm256_extracti128_si256(res, 1); + + resxy = + _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]); + _mm_storeu_si128((__m128i *)(dst), resxy); + dst += stride; + } +} + +static void highbd_dr_prediction_32bit_z2_HxW_avx2( + int H, int W, uint16_t *dst, ptrdiff_t stride, const uint16_t *above, + const uint16_t *left, int upsample_above, int upsample_left, int dx, + int dy) { + // here upsample_above and upsample_left are 0 by design of + // av1_use_intra_edge_upsample + const int min_base_x = -1; + const int min_base_y = -1; + (void)upsample_above; + (void)upsample_left; + const int frac_bits_x = 6; + const int frac_bits_y = 6; + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + 
// final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0_x, a1_x, a0_y, a1_y, a32, a0_1_x, a1_1_x, a16, c1; + __m256i diff, min_base_y256, c3f, dy256, c1234, c0123, c8; + __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128; + DECLARE_ALIGNED(32, int, base_y_c[16]); + + a16 = _mm256_set1_epi32(16); + c1 = _mm256_srli_epi32(a16, 4); + c8 = _mm256_srli_epi32(a16, 1); + min_base_y256 = _mm256_set1_epi16(min_base_y); + c3f = _mm256_set1_epi32(0x3f); + dy256 = _mm256_set1_epi32(dy); + c0123 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + c1234 = _mm256_add_epi32(c0123, c1); + + for (int r = 0; r < H; r++) { + __m256i b, res, shift, ydx; + __m256i resx[2], resy[2]; + __m256i resxy, j256, r6; + for (int j = 0; j < W; j += 16) { + j256 = _mm256_set1_epi32(j); + int y = r + 1; + ydx = _mm256_set1_epi32(y * dx); + + int base_x = ((j << 6) - y * dx) >> frac_bits_x; + int base_shift = 0; + if ((base_x) < (min_base_x - 1)) { + base_shift = (min_base_x - base_x - 1); + } + int base_min_diff = (min_base_x - base_x); + if (base_min_diff > 16) { + base_min_diff = 16; + } else { + if (base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift > 7) { + resx[0] = _mm256_setzero_si256(); + } else { + a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); + a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1)); + a0_x128 = + _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + a1_x128 = + _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + + a0_x = _mm256_cvtepu16_epi32(a0_x128); + a1_x = _mm256_cvtepu16_epi32(a1_x128); + + r6 = _mm256_slli_epi32(_mm256_add_epi32(c0123, j256), 6); + shift = _mm256_srli_epi32( + _mm256_and_si256(_mm256_sub_epi32(r6, ydx), c3f), 1); + + diff = _mm256_sub_epi32(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_x, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi32(diff, shift); + res = _mm256_add_epi32(a32, b); + res = _mm256_srli_epi32(res, 5); + + resx[0] = _mm256_packus_epi32( + res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))); + } + int base_shift8 = 0; + if ((base_x + 8) < (min_base_x - 1)) { + base_shift8 = (min_base_x - (base_x + 8) - 1); + } + if (base_shift8 > 7) { + resx[1] = _mm256_setzero_si256(); + } else { + a0_1_x128 = + _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 8)); + a1_1_x128 = + _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 9)); + a0_1_x128 = _mm_shuffle_epi8(a0_1_x128, + *(__m128i *)HighbdLoadMaskx[base_shift8]); + a1_1_x128 = _mm_shuffle_epi8(a1_1_x128, + *(__m128i *)HighbdLoadMaskx[base_shift8]); + + a0_1_x = _mm256_cvtepu16_epi32(a0_1_x128); + a1_1_x = _mm256_cvtepu16_epi32(a1_1_x128); + + r6 = _mm256_slli_epi32( + _mm256_add_epi32(c0123, _mm256_add_epi32(j256, c8)), 6); + shift = _mm256_srli_epi32( + _mm256_and_si256(_mm256_sub_epi32(r6, ydx), c3f), 1); + + diff = _mm256_sub_epi32(a1_1_x, a0_1_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_1_x, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi32(diff, shift); + + resx[1] = _mm256_add_epi32(a32, b); + resx[1] = _mm256_srli_epi32(resx[1], 5); + resx[1] = _mm256_packus_epi32( + resx[1], + _mm256_castsi128_si256(_mm256_extracti128_si256(resx[1], 1))); + } + resx[0] = + _mm256_inserti128_si256(resx[0], _mm256_castsi256_si128(resx[1]), + 1); // 16 16bit values + + // y calc + resy[0] = _mm256_setzero_si256(); + if ((base_x < min_base_x)) { + __m256i 
c256, y_c256, y_c_1_256, base_y_c256, mask256; + r6 = _mm256_set1_epi32(r << 6); + c256 = _mm256_add_epi32(j256, c1234); + y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256)); + base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y); + mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256); + base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256); + _mm256_store_si256((__m256i *)base_y_c, base_y_c256); + c256 = _mm256_add_epi32(c256, c8); + y_c_1_256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256)); + base_y_c256 = _mm256_srai_epi32(y_c_1_256, frac_bits_y); + mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256); + base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256); + _mm256_store_si256((__m256i *)(base_y_c + 8), base_y_c256); + + a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16( + left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]], + left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]])); + a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16( + left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1], + left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1], + left[base_y_c[6] + 1], left[base_y_c[7] + 1])); + + shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1); + + diff = _mm256_sub_epi32(a1_y, a0_y); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_y, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi32(diff, shift); + res = _mm256_add_epi32(a32, b); + res = _mm256_srli_epi32(res, 5); + + resy[0] = _mm256_packus_epi32( + res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))); + + a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16( + left[base_y_c[8]], left[base_y_c[9]], left[base_y_c[10]], + left[base_y_c[11]], left[base_y_c[12]], left[base_y_c[13]], + left[base_y_c[14]], left[base_y_c[15]])); + a1_y = _mm256_cvtepu16_epi32( + _mm_setr_epi16(left[base_y_c[8] + 1], left[base_y_c[9] + 1], + left[base_y_c[10] + 1], left[base_y_c[11] + 1], + left[base_y_c[12] + 1], left[base_y_c[13] + 1], + left[base_y_c[14] + 1], left[base_y_c[15] + 1])); + shift = _mm256_srli_epi32(_mm256_and_si256(y_c_1_256, c3f), 1); + + diff = _mm256_sub_epi32(a1_y, a0_y); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_y, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi32(diff, shift); + res = _mm256_add_epi32(a32, b); + res = _mm256_srli_epi32(res, 5); + + resy[1] = _mm256_packus_epi32( + res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))); + + resy[0] = + _mm256_inserti128_si256(resy[0], _mm256_castsi256_si128(resy[1]), + 1); // 16 16bit values + } + + resxy = _mm256_blendv_epi8(resx[0], resy[0], + *(__m256i *)HighbdBaseMask[base_min_diff]); + _mm256_storeu_si256((__m256i *)(dst + j), resxy); + } // for j + dst += stride; + } +} + +static void highbd_dr_prediction_z2_HxW_avx2( + int H, int W, uint16_t *dst, ptrdiff_t stride, const uint16_t *above, + const uint16_t *left, int upsample_above, int upsample_left, int dx, + int dy) { + // here upsample_above and upsample_left are 0 by design of + // av1_use_intra_edge_upsample + const int min_base_x = -1; + const int min_base_y = -1; + (void)upsample_above; + (void)upsample_left; + const int frac_bits_x = 6; + const int frac_bits_y = 6; + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0_x, a1_x, a32, a16, 
c3f, c1; + __m256i diff, min_base_y256, dy256, c1234, c0123; + DECLARE_ALIGNED(32, int16_t, base_y_c[16]); + + a16 = _mm256_set1_epi16(16); + c1 = _mm256_srli_epi16(a16, 4); + min_base_y256 = _mm256_set1_epi16(min_base_y); + c3f = _mm256_set1_epi16(0x3f); + dy256 = _mm256_set1_epi16(dy); + c0123 = + _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + c1234 = _mm256_add_epi16(c0123, c1); + + for (int r = 0; r < H; r++) { + __m256i b, res, shift; + __m256i resx, resy, ydx; + __m256i resxy, j256, r6; + __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128; + int y = r + 1; + ydx = _mm256_set1_epi16((short)(y * dx)); + + for (int j = 0; j < W; j += 16) { + j256 = _mm256_set1_epi16(j); + int base_x = ((j << 6) - y * dx) >> frac_bits_x; + int base_shift = 0; + if ((base_x) < (min_base_x - 1)) { + base_shift = (min_base_x - (base_x)-1); + } + int base_min_diff = (min_base_x - base_x); + if (base_min_diff > 16) { + base_min_diff = 16; + } else { + if (base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift < 8) { + a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); + a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1)); + a0_x128 = + _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + a1_x128 = + _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + + a0_x = _mm256_castsi128_si256(a0_x128); + a1_x = _mm256_castsi128_si256(a1_x128); + } else { + a0_x = _mm256_setzero_si256(); + a1_x = _mm256_setzero_si256(); + } + + int base_shift1 = 0; + if (base_shift > 8) { + base_shift1 = base_shift - 8; + } + if (base_shift1 < 8) { + a0_1_x128 = + _mm_loadu_si128((__m128i *)(above + base_x + base_shift1 + 8)); + a1_1_x128 = + _mm_loadu_si128((__m128i *)(above + base_x + base_shift1 + 9)); + a0_1_x128 = _mm_shuffle_epi8(a0_1_x128, + *(__m128i *)HighbdLoadMaskx[base_shift1]); + a1_1_x128 = _mm_shuffle_epi8(a1_1_x128, + *(__m128i *)HighbdLoadMaskx[base_shift1]); + + a0_x = _mm256_inserti128_si256(a0_x, a0_1_x128, 1); + a1_x = _mm256_inserti128_si256(a1_x, a1_1_x128, 1); + } + r6 = _mm256_slli_epi16(_mm256_add_epi16(c0123, j256), 6); + shift = _mm256_srli_epi16( + _mm256_and_si256(_mm256_sub_epi16(r6, ydx), c3f), 1); + + diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shift); + res = _mm256_add_epi16(a32, b); + resx = _mm256_srli_epi16(res, 5); // 16 16-bit values + + // y calc + resy = _mm256_setzero_si256(); + __m256i a0_y, a1_y, shifty; + if ((base_x < min_base_x)) { + __m256i c256, y_c256, base_y_c256, mask256, mul16; + r6 = _mm256_set1_epi16(r << 6); + c256 = _mm256_add_epi16(j256, c1234); + mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256), + _mm256_srli_epi16(min_base_y256, 1)); + y_c256 = _mm256_sub_epi16(r6, mul16); + base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y); + mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256); + base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256); + _mm256_store_si256((__m256i *)base_y_c, base_y_c256); + + a0_y = _mm256_setr_epi16( + left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]], + left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]], + left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]], + left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]], + left[base_y_c[15]]); + base_y_c256 = _mm256_add_epi16(base_y_c256, c1); + _mm256_store_si256((__m256i 
*)base_y_c, base_y_c256); + + a1_y = _mm256_setr_epi16( + left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]], + left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]], + left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]], + left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]], + left[base_y_c[15]]); + + shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1); + + diff = _mm256_sub_epi16(a1_y, a0_y); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0_y, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shifty); + res = _mm256_add_epi16(a32, b); + resy = _mm256_srli_epi16(res, 5); + } + + resxy = _mm256_blendv_epi8(resx, resy, + *(__m256i *)HighbdBaseMask[base_min_diff]); + _mm256_storeu_si256((__m256i *)(dst + j), resxy); + } // for j + dst += stride; + } +} + +// Directional prediction, zone 2: 90 < angle < 180 +void av1_highbd_dr_prediction_z2_avx2(uint16_t *dst, ptrdiff_t stride, int bw, + int bh, const uint16_t *above, + const uint16_t *left, int upsample_above, + int upsample_left, int dx, int dy, + int bd) { + (void)bd; + assert(dx > 0); + assert(dy > 0); + switch (bw) { + case 4: + if (bd < 12) { + highbd_dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left, + upsample_above, upsample_left, dx, dy); + } else { + highbd_dr_prediction_32bit_z2_Nx4_avx2(bh, dst, stride, above, left, + upsample_above, upsample_left, + dx, dy); + } + break; + case 8: + if (bd < 12) { + highbd_dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left, + upsample_above, upsample_left, dx, dy); + } else { + highbd_dr_prediction_32bit_z2_Nx8_avx2(bh, dst, stride, above, left, + upsample_above, upsample_left, + dx, dy); + } + break; + default: + if (bd < 12) { + highbd_dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left, + upsample_above, upsample_left, dx, dy); + } else { + highbd_dr_prediction_32bit_z2_HxW_avx2(bh, bw, dst, stride, above, left, + upsample_above, upsample_left, + dx, dy); + } + break; + } +} + +// Directional prediction, zone 3 functions +static void highbd_dr_prediction_z3_4x4_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m128i dstvec[4], d[4]; + if (bd < 12) { + highbd_dr_prediction_z1_4xN_internal_avx2(4, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_4xN_internal_avx2(4, dstvec, left, + upsample_left, dy); + } + highbd_transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], + &dstvec[3], &d[0], &d[1], &d[2], &d[3]); + _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]); + _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]); + _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]); + _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]); + return; +} + +static void highbd_dr_prediction_z3_8x8_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m128i dstvec[8], d[8]; + if (bd < 12) { + highbd_dr_prediction_z1_8xN_internal_avx2(8, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_8xN_internal_avx2(8, dstvec, left, + upsample_left, dy); + } + highbd_transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], + &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], + &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], + &d[7]); + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); + } +} + +static void highbd_dr_prediction_z3_4x8_avx2(uint16_t 
*dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m128i dstvec[4], d[8]; + if (bd < 12) { + highbd_dr_prediction_z1_8xN_internal_avx2(4, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_8xN_internal_avx2(4, dstvec, left, + upsample_left, dy); + } + + highbd_transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], + &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], + &d[7]); + for (int i = 0; i < 8; i++) { + _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]); + } +} + +static void highbd_dr_prediction_z3_8x4_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m128i dstvec[8], d[4]; + if (bd < 12) { + highbd_dr_prediction_z1_4xN_internal_avx2(8, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_4xN_internal_avx2(8, dstvec, left, + upsample_left, dy); + } + + highbd_transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], + &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], + &d[0], &d[1], &d[2], &d[3]); + _mm_storeu_si128((__m128i *)(dst + 0 * stride), d[0]); + _mm_storeu_si128((__m128i *)(dst + 1 * stride), d[1]); + _mm_storeu_si128((__m128i *)(dst + 2 * stride), d[2]); + _mm_storeu_si128((__m128i *)(dst + 3 * stride), d[3]); +} + +static void highbd_dr_prediction_z3_8x16_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m256i dstvec[8], d[8]; + if (bd < 12) { + highbd_dr_prediction_z1_16xN_internal_avx2(8, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_16xN_internal_avx2(8, dstvec, left, + upsample_left, dy); + } + highbd_transpose8x16_16x8_avx2(dstvec, d); + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), + _mm256_castsi256_si128(d[i])); + } + for (int i = 8; i < 16; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), + _mm256_extracti128_si256(d[i - 8], 1)); + } +} + +static void highbd_dr_prediction_z3_16x8_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m128i dstvec[16], d[16]; + if (bd < 12) { + highbd_dr_prediction_z1_8xN_internal_avx2(16, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_8xN_internal_avx2(16, dstvec, left, + upsample_left, dy); + } + for (int i = 0; i < 16; i += 8) { + highbd_transpose8x8_sse2(&dstvec[0 + i], &dstvec[1 + i], &dstvec[2 + i], + &dstvec[3 + i], &dstvec[4 + i], &dstvec[5 + i], + &dstvec[6 + i], &dstvec[7 + i], &d[0 + i], + &d[1 + i], &d[2 + i], &d[3 + i], &d[4 + i], + &d[5 + i], &d[6 + i], &d[7 + i]); + } + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); + _mm_storeu_si128((__m128i *)(dst + i * stride + 8), d[i + 8]); + } +} + +static void highbd_dr_prediction_z3_4x16_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m256i dstvec[4], d[4], d1; + if (bd < 12) { + highbd_dr_prediction_z1_16xN_internal_avx2(4, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_16xN_internal_avx2(4, dstvec, left, + upsample_left, dy); + } + highbd_transpose4x16_avx2(dstvec, d); + for (int i = 0; i < 4; i++) { + _mm_storel_epi64((__m128i *)(dst + i * stride), + _mm256_castsi256_si128(d[i])); + d1 = _mm256_bsrli_epi128(d[i], 8); + _mm_storel_epi64((__m128i *)(dst + (i + 4) * stride), + _mm256_castsi256_si128(d1)); + _mm_storel_epi64((__m128i *)(dst + (i + 8) * 
stride), + _mm256_extracti128_si256(d[i], 1)); + _mm_storel_epi64((__m128i *)(dst + (i + 12) * stride), + _mm256_extracti128_si256(d1, 1)); + } +} + +static void highbd_dr_prediction_z3_16x4_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m128i dstvec[16], d[8]; + if (bd < 12) { + highbd_dr_prediction_z1_4xN_internal_avx2(16, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_4xN_internal_avx2(16, dstvec, left, + upsample_left, dy); + } + highbd_transpose16x4_8x8_sse2(dstvec, d); + + _mm_storeu_si128((__m128i *)(dst + 0 * stride), d[0]); + _mm_storeu_si128((__m128i *)(dst + 0 * stride + 8), d[1]); + _mm_storeu_si128((__m128i *)(dst + 1 * stride), d[2]); + _mm_storeu_si128((__m128i *)(dst + 1 * stride + 8), d[3]); + _mm_storeu_si128((__m128i *)(dst + 2 * stride), d[4]); + _mm_storeu_si128((__m128i *)(dst + 2 * stride + 8), d[5]); + _mm_storeu_si128((__m128i *)(dst + 3 * stride), d[6]); + _mm_storeu_si128((__m128i *)(dst + 3 * stride + 8), d[7]); +} + +static void highbd_dr_prediction_z3_8x32_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m256i dstvec[16], d[16]; + if (bd < 12) { + highbd_dr_prediction_z1_32xN_internal_avx2(8, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_32xN_internal_avx2(8, dstvec, left, + upsample_left, dy); + } + + for (int i = 0; i < 16; i += 8) { + highbd_transpose8x16_16x8_avx2(dstvec + i, d + i); + } + + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), + _mm256_castsi256_si128(d[i])); + } + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + (i + 8) * stride), + _mm256_extracti128_si256(d[i], 1)); + } + for (int i = 8; i < 16; i++) { + _mm_storeu_si128((__m128i *)(dst + (i + 8) * stride), + _mm256_castsi256_si128(d[i])); + } + for (int i = 8; i < 16; i++) { + _mm_storeu_si128((__m128i *)(dst + (i + 16) * stride), + _mm256_extracti128_si256(d[i], 1)); + } +} + +static void highbd_dr_prediction_z3_32x8_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m128i dstvec[32], d[32]; + if (bd < 12) { + highbd_dr_prediction_z1_8xN_internal_avx2(32, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_8xN_internal_avx2(32, dstvec, left, + upsample_left, dy); + } + + for (int i = 0; i < 32; i += 8) { + highbd_transpose8x8_sse2(&dstvec[0 + i], &dstvec[1 + i], &dstvec[2 + i], + &dstvec[3 + i], &dstvec[4 + i], &dstvec[5 + i], + &dstvec[6 + i], &dstvec[7 + i], &d[0 + i], + &d[1 + i], &d[2 + i], &d[3 + i], &d[4 + i], + &d[5 + i], &d[6 + i], &d[7 + i]); + } + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); + _mm_storeu_si128((__m128i *)(dst + i * stride + 8), d[i + 8]); + _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 16]); + _mm_storeu_si128((__m128i *)(dst + i * stride + 24), d[i + 24]); + } +} + +static void highbd_dr_prediction_z3_16x16_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m256i dstvec[16], d[16]; + if (bd < 12) { + highbd_dr_prediction_z1_16xN_internal_avx2(16, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_16xN_internal_avx2(16, dstvec, left, + upsample_left, dy); + } + + highbd_transpose16x16_avx2(dstvec, d); + + for (int i = 0; i < 16; i++) { + _mm256_storeu_si256((__m256i *)(dst + i * stride), d[i]); + } +} + +static void 
highbd_dr_prediction_z3_32x32_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m256i dstvec[64], d[16]; + if (bd < 12) { + highbd_dr_prediction_z1_32xN_internal_avx2(32, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_32xN_internal_avx2(32, dstvec, left, + upsample_left, dy); + } + highbd_transpose16x16_avx2(dstvec, d); + for (int j = 0; j < 16; j++) { + _mm256_storeu_si256((__m256i *)(dst + j * stride), d[j]); + } + highbd_transpose16x16_avx2(dstvec + 16, d); + for (int j = 0; j < 16; j++) { + _mm256_storeu_si256((__m256i *)(dst + j * stride + 16), d[j]); + } + highbd_transpose16x16_avx2(dstvec + 32, d); + for (int j = 0; j < 16; j++) { + _mm256_storeu_si256((__m256i *)(dst + (j + 16) * stride), d[j]); + } + highbd_transpose16x16_avx2(dstvec + 48, d); + for (int j = 0; j < 16; j++) { + _mm256_storeu_si256((__m256i *)(dst + (j + 16) * stride + 16), d[j]); + } +} + +static void highbd_dr_prediction_z3_64x64_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + DECLARE_ALIGNED(16, uint16_t, dstT[64 * 64]); + if (bd < 12) { + highbd_dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy); + } else { + highbd_dr_prediction_32bit_z1_64xN_avx2(64, dstT, 64, left, upsample_left, + dy); + } + highbd_transpose(dstT, 64, dst, stride, 64, 64); +} + +static void highbd_dr_prediction_z3_16x32_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m256i dstvec[32], d[32]; + if (bd < 12) { + highbd_dr_prediction_z1_32xN_internal_avx2(16, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_32xN_internal_avx2(16, dstvec, left, + upsample_left, dy); + } + for (int i = 0; i < 32; i += 8) { + highbd_transpose8x16_16x8_avx2(dstvec + i, d + i); + } + // store + for (int j = 0; j < 32; j += 16) { + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + (i + j) * stride), + _mm256_castsi256_si128(d[(i + j)])); + } + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + (i + j) * stride + 8), + _mm256_castsi256_si128(d[(i + j) + 8])); + } + for (int i = 8; i < 16; i++) { + _mm256_storeu_si256( + (__m256i *)(dst + (i + j) * stride), + _mm256_inserti128_si256( + d[(i + j)], _mm256_extracti128_si256(d[(i + j) - 8], 1), 0)); + } + } +} + +static void highbd_dr_prediction_z3_32x16_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m256i dstvec[32], d[16]; + if (bd < 12) { + highbd_dr_prediction_z1_16xN_internal_avx2(32, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_16xN_internal_avx2(32, dstvec, left, + upsample_left, dy); + } + for (int i = 0; i < 32; i += 16) { + highbd_transpose16x16_avx2((dstvec + i), d); + for (int j = 0; j < 16; j++) { + _mm256_storeu_si256((__m256i *)(dst + j * stride + i), d[j]); + } + } +} + +static void highbd_dr_prediction_z3_32x64_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + uint16_t dstT[64 * 32]; + if (bd < 12) { + highbd_dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy); + } else { + highbd_dr_prediction_32bit_z1_64xN_avx2(32, dstT, 64, left, upsample_left, + dy); + } + highbd_transpose(dstT, 64, dst, stride, 32, 64); +} + +static void highbd_dr_prediction_z3_64x32_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + 
DECLARE_ALIGNED(16, uint16_t, dstT[32 * 64]); + highbd_dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy, bd); + highbd_transpose(dstT, 32, dst, stride, 64, 32); + return; +} + +static void highbd_dr_prediction_z3_16x64_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + DECLARE_ALIGNED(16, uint16_t, dstT[64 * 16]); + if (bd < 12) { + highbd_dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy); + } else { + highbd_dr_prediction_32bit_z1_64xN_avx2(16, dstT, 64, left, upsample_left, + dy); + } + highbd_transpose(dstT, 64, dst, stride, 16, 64); +} + +static void highbd_dr_prediction_z3_64x16_avx2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *left, + int upsample_left, int dy, + int bd) { + __m256i dstvec[64], d[16]; + if (bd < 12) { + highbd_dr_prediction_z1_16xN_internal_avx2(64, dstvec, left, upsample_left, + dy); + } else { + highbd_dr_prediction_32bit_z1_16xN_internal_avx2(64, dstvec, left, + upsample_left, dy); + } + for (int i = 0; i < 64; i += 16) { + highbd_transpose16x16_avx2((dstvec + i), d); + for (int j = 0; j < 16; j++) { + _mm256_storeu_si256((__m256i *)(dst + j * stride + i), d[j]); + } + } +} + +void av1_highbd_dr_prediction_z3_avx2(uint16_t *dst, ptrdiff_t stride, int bw, + int bh, const uint16_t *above, + const uint16_t *left, int upsample_left, + int dx, int dy, int bd) { + (void)above; + (void)dx; + + assert(dx == 1); + assert(dy > 0); + if (bw == bh) { + switch (bw) { + case 4: + highbd_dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy, + bd); + break; + case 8: + highbd_dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy, + bd); + break; + case 16: + highbd_dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left, dy, + bd); + break; + case 32: + highbd_dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left, dy, + bd); + break; + case 64: + highbd_dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left, dy, + bd); + break; + } + } else { + if (bw < bh) { + if (bw + bw == bh) { + switch (bw) { + case 4: + highbd_dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + case 8: + highbd_dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + case 16: + highbd_dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + case 32: + highbd_dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + } + } else { + switch (bw) { + case 4: + highbd_dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + case 8: + highbd_dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + case 16: + highbd_dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + } + } + } else { + if (bh + bh == bw) { + switch (bh) { + case 4: + highbd_dr_prediction_z3_8x4_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + case 8: + highbd_dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + case 16: + highbd_dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + case 32: + highbd_dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + } + } else { + switch (bh) { + case 4: + highbd_dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + case 8: + highbd_dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left, + dy, bd); + break; + case 16: + highbd_dr_prediction_z3_64x16_avx2(dst, 
stride, left, upsample_left, + dy, bd); + break; + } + } + } + } + return; +} + +// Low bit depth functions +static DECLARE_ALIGNED(32, uint8_t, BaseMask[33][32]) = { + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, +}; + +static DECLARE_ALIGNED(16, uint8_t, LoadMaskx[16][16]) = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }, + { 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 }, + { 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 }, + { 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 }, + { 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 }, + { 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, +}; + +static DECLARE_ALIGNED(16, uint8_t, EvenOddMaskx[8][16]) = { + { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }, + { 0, 1, 3, 5, 7, 9, 11, 13, 0, 2, 4, 6, 8, 10, 12, 14 }, + { 0, 0, 2, 4, 6, 8, 10, 12, 0, 0, 3, 5, 7, 9, 11, 13 }, + { 0, 0, 0, 3, 5, 7, 9, 11, 0, 0, 0, 4, 6, 8, 10, 12 }, + { 0, 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 0, 5, 7, 9, 11 }, + { 0, 0, 0, 0, 0, 5, 7, 9, 0, 0, 0, 0, 0, 6, 8, 10 }, + { 0, 0, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 0, 7, 9 }, + { 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 8 } +}; +/* clang-format off */ +static DECLARE_ALIGNED(32, int, LoadMaskz2[8][8]) = { + { -1, 0, 0, 0, 0, 0, 0, 0}, + { -1, -1, 0, 0, 0, 0, 0, 0}, + { -1, -1, -1, 0, 0, 0, 0, 0}, + { -1, -1, -1, -1, 0, 0, 0, 0}, + { -1, -1, -1, -1, -1, 0, 0, 0}, + { -1, -1, -1, -1, -1, -1, 0, 0}, + { -1, -1, -1, -1, -1, -1, -1, 0}, + { -1, -1, -1, -1, -1, -1, -1, -1}, +}; +/* clang-format on */ +static AOM_FORCE_INLINE void 
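The lookup tables above feed byte-granular blends and shuffles: BaseMask[k] holds exactly k leading 0xff bytes, so a blend such as _mm_blendv_epi8(a_mbase_x, res, *(__m128i *)BaseMask[k]) keeps the first k computed pixels of a row and pads the remainder with the replicated last edge sample. A scalar sketch of that blend (illustrative helper name, not from the patch):

#include <stdint.h>

/* Scalar equivalent of blending a computed row against BaseMask[k]: bytes with
 * a 0xff mask byte keep the computed value, bytes with 0x00 take `fill`. */
static void blend_base_mask_ref(uint8_t *row, int n, int k, uint8_t fill) {
  for (int i = k; i < n; ++i) row[i] = fill;
}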
dr_prediction_z1_HxW_internal_avx2( + int H, int W, __m128i *dst, const uint8_t *above, int upsample_above, + int dx) { + const int frac_bits = 6 - upsample_above; + const int max_base_x = ((W + H) - 1) << upsample_above; + + assert(dx > 0); + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a1, a32, a16; + __m256i diff, c3f; + __m128i a_mbase_x; + + a16 = _mm256_set1_epi16(16); + a_mbase_x = _mm_set1_epi8(above[max_base_x]); + c3f = _mm256_set1_epi16(0x3f); + + int x = dx; + for (int r = 0; r < W; r++) { + __m256i b, res, shift; + __m128i res1, a0_128, a1_128; + + int base = x >> frac_bits; + int base_max_diff = (max_base_x - base) >> upsample_above; + if (base_max_diff <= 0) { + for (int i = r; i < W; ++i) { + dst[i] = a_mbase_x; // save 4 values + } + return; + } + if (base_max_diff > H) base_max_diff = H; + a0_128 = _mm_loadu_si128((__m128i *)(above + base)); + a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1)); + + if (upsample_above) { + a0_128 = _mm_shuffle_epi8(a0_128, *(__m128i *)EvenOddMaskx[0]); + a1_128 = _mm_srli_si128(a0_128, 8); + + shift = _mm256_srli_epi16( + _mm256_and_si256( + _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), c3f), + 1); + } else { + shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); + } + a0 = _mm256_cvtepu8_epi16(a0_128); + a1 = _mm256_cvtepu8_epi16(a1_128); + + diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shift); + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + + res = _mm256_packus_epi16( + res, _mm256_castsi128_si256( + _mm256_extracti128_si256(res, 1))); // goto 8 bit + res1 = _mm256_castsi256_si128(res); // 16 8bit values + + dst[r] = + _mm_blendv_epi8(a_mbase_x, res1, *(__m128i *)BaseMask[base_max_diff]); + x += dx; + } +} + +static void dr_prediction_z1_4xN_avx2(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, int upsample_above, + int dx) { + __m128i dstvec[16]; + + dr_prediction_z1_HxW_internal_avx2(4, N, dstvec, above, upsample_above, dx); + for (int i = 0; i < N; i++) { + *(uint32_t *)(dst + stride * i) = _mm_cvtsi128_si32(dstvec[i]); + } +} + +static void dr_prediction_z1_8xN_avx2(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, int upsample_above, + int dx) { + __m128i dstvec[32]; + + dr_prediction_z1_HxW_internal_avx2(8, N, dstvec, above, upsample_above, dx); + for (int i = 0; i < N; i++) { + _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]); + } +} + +static void dr_prediction_z1_16xN_avx2(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, int upsample_above, + int dx) { + __m128i dstvec[64]; + + dr_prediction_z1_HxW_internal_avx2(16, N, dstvec, above, upsample_above, dx); + for (int i = 0; i < N; i++) { + _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]); + } +} + +static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_avx2( + int N, __m256i *dstvec, const uint8_t *above, int upsample_above, int dx) { + // here upsample_above is 0 by design of av1_use_intra_edge_upsample + (void)upsample_above; + const int frac_bits = 6; + const int max_base_x = ((32 + N) - 1); + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated 
as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a1, a32, a16; + __m256i a_mbase_x, diff, c3f; + + a16 = _mm256_set1_epi16(16); + a_mbase_x = _mm256_set1_epi8(above[max_base_x]); + c3f = _mm256_set1_epi16(0x3f); + + int x = dx; + for (int r = 0; r < N; r++) { + __m256i b, res, res16[2]; + __m128i a0_128, a1_128; + + int base = x >> frac_bits; + int base_max_diff = (max_base_x - base); + if (base_max_diff <= 0) { + for (int i = r; i < N; ++i) { + dstvec[i] = a_mbase_x; // save 32 values + } + return; + } + if (base_max_diff > 32) base_max_diff = 32; + __m256i shift = + _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); + + for (int j = 0, jj = 0; j < 32; j += 16, jj++) { + int mdiff = base_max_diff - j; + if (mdiff <= 0) { + res16[jj] = a_mbase_x; + } else { + a0_128 = _mm_loadu_si128((__m128i *)(above + base + j)); + a1_128 = _mm_loadu_si128((__m128i *)(above + base + j + 1)); + a0 = _mm256_cvtepu8_epi16(a0_128); + a1 = _mm256_cvtepu8_epi16(a1_128); + + diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi16(diff, shift); + + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + res16[jj] = _mm256_packus_epi16( + res, _mm256_castsi128_si256( + _mm256_extracti128_si256(res, 1))); // 16 8bit values + } + } + res16[1] = + _mm256_inserti128_si256(res16[0], _mm256_castsi256_si128(res16[1]), + 1); // 32 8bit values + + dstvec[r] = _mm256_blendv_epi8( + a_mbase_x, res16[1], + *(__m256i *)BaseMask[base_max_diff]); // 32 8bit values + x += dx; + } +} + +static void dr_prediction_z1_32xN_avx2(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, int upsample_above, + int dx) { + __m256i dstvec[64]; + dr_prediction_z1_32xN_internal_avx2(N, dstvec, above, upsample_above, dx); + for (int i = 0; i < N; i++) { + _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]); + } +} + +static void dr_prediction_z1_64xN_avx2(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, int upsample_above, + int dx) { + // here upsample_above is 0 by design of av1_use_intra_edge_upsample + (void)upsample_above; + const int frac_bits = 6; + const int max_base_x = ((64 + N) - 1); + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a1, a32, a16; + __m256i a_mbase_x, diff, c3f; + __m128i max_base_x128, base_inc128, mask128; + + a16 = _mm256_set1_epi16(16); + a_mbase_x = _mm256_set1_epi8(above[max_base_x]); + max_base_x128 = _mm_set1_epi8(max_base_x); + c3f = _mm256_set1_epi16(0x3f); + + int x = dx; + for (int r = 0; r < N; r++, dst += stride) { + __m256i b, res; + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + _mm256_storeu_si256((__m256i *)dst, a_mbase_x); // save 32 values + _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x); + dst += stride; + } + return; + } + + __m256i shift = + _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); + + __m128i a0_128, a1_128, res128; + for (int j = 0; j < 64; j += 16) { + int mdif = max_base_x - (base + j); + if (mdif <= 0) { + _mm_storeu_si128((__m128i *)(dst + j), + _mm256_castsi256_si128(a_mbase_x)); + } else { + a0_128 = _mm_loadu_si128((__m128i *)(above + base + j)); + a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1 + j)); + a0 = 
_mm256_cvtepu8_epi16(a0_128); + a1 = _mm256_cvtepu8_epi16(a1_128); + + diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi16(diff, shift); + + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + res = _mm256_packus_epi16( + res, _mm256_castsi128_si256( + _mm256_extracti128_si256(res, 1))); // 16 8bit values + + base_inc128 = + _mm_setr_epi8((uint8_t)(base + j), (uint8_t)(base + j + 1), + (uint8_t)(base + j + 2), (uint8_t)(base + j + 3), + (uint8_t)(base + j + 4), (uint8_t)(base + j + 5), + (uint8_t)(base + j + 6), (uint8_t)(base + j + 7), + (uint8_t)(base + j + 8), (uint8_t)(base + j + 9), + (uint8_t)(base + j + 10), (uint8_t)(base + j + 11), + (uint8_t)(base + j + 12), (uint8_t)(base + j + 13), + (uint8_t)(base + j + 14), (uint8_t)(base + j + 15)); + + mask128 = _mm_cmpgt_epi8(_mm_subs_epu8(max_base_x128, base_inc128), + _mm_setzero_si128()); + res128 = _mm_blendv_epi8(_mm256_castsi256_si128(a_mbase_x), + _mm256_castsi256_si128(res), mask128); + _mm_storeu_si128((__m128i *)(dst + j), res128); + } + } + x += dx; + } +} + +// Directional prediction, zone 1: 0 < angle < 90 +void av1_dr_prediction_z1_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left, + int upsample_above, int dx, int dy) { + (void)left; + (void)dy; + switch (bw) { + case 4: + dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above, dx); + break; + case 8: + dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above, dx); + break; + case 16: + dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above, dx); + break; + case 32: + dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above, dx); + break; + case 64: + dr_prediction_z1_64xN_avx2(bh, dst, stride, above, upsample_above, dx); + break; + default: break; + } + return; +} + +static void dr_prediction_z2_Nx4_avx2(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left, + int upsample_above, int upsample_left, + int dx, int dy) { + const int min_base_x = -(1 << upsample_above); + const int min_base_y = -(1 << upsample_left); + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + + assert(dx > 0); + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m128i a0_x, a1_x, a32, a16, diff; + __m128i c3f, min_base_y128, c1234, dy128; + + a16 = _mm_set1_epi16(16); + c3f = _mm_set1_epi16(0x3f); + min_base_y128 = _mm_set1_epi16(min_base_y); + c1234 = _mm_setr_epi16(0, 1, 2, 3, 4, 0, 0, 0); + dy128 = _mm_set1_epi16(dy); + + for (int r = 0; r < N; r++) { + __m128i b, res, shift, r6, ydx; + __m128i resx, resy, resxy; + __m128i a0_x128, a1_x128; + int y = r + 1; + int base_x = (-y * dx) >> frac_bits_x; + int base_shift = 0; + if (base_x < (min_base_x - 1)) { + base_shift = (min_base_x - base_x - 1) >> upsample_above; + } + int base_min_diff = + (min_base_x - base_x + upsample_above) >> upsample_above; + if (base_min_diff > 4) { + base_min_diff = 4; + } else { + if (base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift > 3) { + a0_x = _mm_setzero_si128(); + a1_x = _mm_setzero_si128(); + shift = _mm_setzero_si128(); + } else { + a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); + ydx = _mm_set1_epi16(y * dx); + r6 = 
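The block comments in the zone-1 kernels above give the per-pixel formula; the scalar reference below (a hedged sketch with upsample_above taken as 0, not code from the patch) spells out the loop structure the SIMD versions unroll:

#include <stddef.h>
#include <stdint.h>

/* Zone-1 reference: row r samples the above edge at x = (r + 1) * dx,
 * split into a 6-bit integer base and a 1/32-pel shift. */
static void dr_z1_ref(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                      const uint8_t *above, int dx) {
  const int max_base_x = bw + bh - 1;
  for (int r = 0, x = dx; r < bh; ++r, x += dx, dst += stride) {
    const int base = x >> 6;            /* frac_bits = 6 when not upsampled */
    const int shift = (x & 0x3f) >> 1;  /* 0..31 */
    for (int c = 0; c < bw; ++c) {
      if (base + c >= max_base_x) {
        dst[c] = above[max_base_x];     /* ran off the edge: replicate */
      } else {
        const int a0 = above[base + c], a1 = above[base + c + 1];
        dst[c] = (uint8_t)((a0 * 32 + 16 + (a1 - a0) * shift) >> 5);
      }
    }
  }
}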
_mm_slli_epi16(c1234, 6); + + if (upsample_above) { + a0_x128 = + _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]); + a1_x128 = _mm_srli_si128(a0_x128, 8); + + shift = _mm_srli_epi16( + _mm_and_si128( + _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f), + 1); + } else { + a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]); + a1_x128 = _mm_srli_si128(a0_x128, 1); + + shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1); + } + a0_x = _mm_cvtepu8_epi16(a0_x128); + a1_x = _mm_cvtepu8_epi16(a1_x128); + } + // y calc + __m128i a0_y, a1_y, shifty; + if (base_x < min_base_x) { + DECLARE_ALIGNED(32, int16_t, base_y_c[8]); + __m128i y_c128, base_y_c128, mask128, c1234_; + c1234_ = _mm_srli_si128(c1234, 2); + r6 = _mm_set1_epi16(r << 6); + y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234_, dy128)); + base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y); + mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128); + base_y_c128 = _mm_andnot_si128(mask128, base_y_c128); + _mm_store_si128((__m128i *)base_y_c, base_y_c128); + + a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], + left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0); + base_y_c128 = _mm_add_epi16(base_y_c128, _mm_srli_epi16(a16, 4)); + _mm_store_si128((__m128i *)base_y_c, base_y_c128); + a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], + left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0); + + if (upsample_left) { + shifty = _mm_srli_epi16( + _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1); + } else { + shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1); + } + a0_x = _mm_unpacklo_epi64(a0_x, a0_y); + a1_x = _mm_unpacklo_epi64(a1_x, a1_y); + shift = _mm_unpacklo_epi64(shift, shifty); + } + + diff = _mm_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm_slli_epi16(a0_x, 5); // a[x] * 32 + a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm_mullo_epi16(diff, shift); + res = _mm_add_epi16(a32, b); + res = _mm_srli_epi16(res, 5); + + resx = _mm_packus_epi16(res, res); + resy = _mm_srli_si128(resx, 4); + + resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]); + *(uint32_t *)(dst) = _mm_cvtsi128_si32(resxy); + dst += stride; + } +} + +static void dr_prediction_z2_Nx8_avx2(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left, + int upsample_above, int upsample_left, + int dx, int dy) { + const int min_base_x = -(1 << upsample_above); + const int min_base_y = -(1 << upsample_left); + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be calculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i diff, a32, a16; + __m256i a0_x, a1_x; + __m128i a0_x128, a1_x128, min_base_y128, c3f; + __m128i c1234, dy128; + + a16 = _mm256_set1_epi16(16); + c3f = _mm_set1_epi16(0x3f); + min_base_y128 = _mm_set1_epi16(min_base_y); + dy128 = _mm_set1_epi16(dy); + c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); + + for (int r = 0; r < N; r++) { + __m256i b, res, shift; + __m128i resx, resy, resxy, r6, ydx; + + int y = r + 1; + int base_x = (-y * dx) >> frac_bits_x; + int base_shift = 0; + if (base_x < (min_base_x - 1)) { + base_shift = (min_base_x - base_x - 1) >> upsample_above; + } + int base_min_diff = + (min_base_x - base_x + upsample_above) >> upsample_above; + if (base_min_diff > 8) { + base_min_diff = 8; + } else { + if 
(base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift > 7) { + a0_x = _mm256_setzero_si256(); + a1_x = _mm256_setzero_si256(); + shift = _mm256_setzero_si256(); + } else { + a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); + ydx = _mm_set1_epi16(y * dx); + r6 = _mm_slli_epi16(_mm_srli_si128(c1234, 2), 6); + if (upsample_above) { + a0_x128 = + _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]); + a1_x128 = _mm_srli_si128(a0_x128, 8); + + shift = _mm256_castsi128_si256(_mm_srli_epi16( + _mm_and_si128( + _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f), + 1)); + } else { + a1_x128 = _mm_srli_si128(a0_x128, 1); + a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]); + a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]); + + shift = _mm256_castsi128_si256( + _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1)); + } + a0_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a0_x128)); + a1_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a1_x128)); + } + + // y calc + __m128i a0_y, a1_y, shifty; + if (base_x < min_base_x) { + DECLARE_ALIGNED(32, int16_t, base_y_c[16]); + __m128i y_c128, base_y_c128, mask128; + r6 = _mm_set1_epi16(r << 6); + y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128)); + base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y); + mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128); + base_y_c128 = _mm_andnot_si128(mask128, base_y_c128); + _mm_store_si128((__m128i *)base_y_c, base_y_c128); + + a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], + left[base_y_c[2]], left[base_y_c[3]], + left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]]); + base_y_c128 = _mm_add_epi16( + base_y_c128, _mm_srli_epi16(_mm256_castsi256_si128(a16), 4)); + _mm_store_si128((__m128i *)base_y_c, base_y_c128); + + a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], + left[base_y_c[2]], left[base_y_c[3]], + left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]]); + + if (upsample_left) { + shifty = _mm_srli_epi16( + _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1); + } else { + shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1); + } + + a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1); + a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1); + shift = _mm256_inserti128_si256(shift, shifty, 1); + } + + diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shift); + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + + resx = _mm_packus_epi16(_mm256_castsi256_si128(res), + _mm256_castsi256_si128(res)); + resy = _mm256_extracti128_si256(res, 1); + resy = _mm_packus_epi16(resy, resy); + + resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]); + _mm_storel_epi64((__m128i *)(dst), resxy); + dst += stride; + } +} + +static void dr_prediction_z2_HxW_avx2(int H, int W, uint8_t *dst, + ptrdiff_t stride, const uint8_t *above, + const uint8_t *left, int upsample_above, + int upsample_left, int dx, int dy) { + // here upsample_above and upsample_left are 0 by design of + // av1_use_intra_edge_upsample + const int min_base_x = -1; + const int min_base_y = -1; + (void)upsample_above; + (void)upsample_left; + const int frac_bits_x = 6; + const int frac_bits_y = 6; + + __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c1234, c0123; + __m256i diff, min_base_y256, c3f, shifty, dy256, c1; + __m128i 
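In the zone-2 kernels, each row is split where the projected above-edge index walks off the left end of the array: base_min_diff counts the leading pixels that must instead come from the left edge, and the final _mm_blendv_epi8 with BaseMask[base_min_diff] stitches the two partial results together. A small scalar restatement of that bookkeeping (illustrative, no upsampling):

/* Number of leading pixels in row r (0-based) of a zone-2 block that are
 * predicted from the left edge rather than the above edge. */
static int z2_left_count_ref(int r, int dx, int bw) {
  const int y = r + 1;
  const int base_x = (-y * dx) >> 6;  /* frac_bits_x = 6; arithmetic shift,
                                         as the kernels themselves assume */
  int n = -1 - base_x;                /* min_base_x - base_x, min_base_x = -1 */
  if (n < 0) n = 0;
  if (n > bw) n = bw;
  return n;
}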
a0_x128, a1_x128; + + DECLARE_ALIGNED(32, int16_t, base_y_c[16]); + a16 = _mm256_set1_epi16(16); + c1 = _mm256_srli_epi16(a16, 4); + min_base_y256 = _mm256_set1_epi16(min_base_y); + c3f = _mm256_set1_epi16(0x3f); + dy256 = _mm256_set1_epi16(dy); + c0123 = + _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + c1234 = _mm256_add_epi16(c0123, c1); + + for (int r = 0; r < H; r++) { + __m256i b, res, shift, j256, r6, ydx; + __m128i resx, resy; + __m128i resxy; + int y = r + 1; + ydx = _mm256_set1_epi16((uint16_t)(y * dx)); + + int base_x = (-y * dx) >> frac_bits_x; + for (int j = 0; j < W; j += 16) { + j256 = _mm256_set1_epi16(j); + int base_shift = 0; + if ((base_x + j) < (min_base_x - 1)) { + base_shift = (min_base_x - (base_x + j) - 1); + } + int base_min_diff = (min_base_x - base_x - j); + if (base_min_diff > 16) { + base_min_diff = 16; + } else { + if (base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift < 16) { + a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j)); + a1_x128 = + _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j)); + a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]); + a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]); + + a0_x = _mm256_cvtepu8_epi16(a0_x128); + a1_x = _mm256_cvtepu8_epi16(a1_x128); + + r6 = _mm256_slli_epi16(_mm256_add_epi16(c0123, j256), 6); + shift = _mm256_srli_epi16( + _mm256_and_si256(_mm256_sub_epi16(r6, ydx), c3f), 1); + + diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shift); + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); // 16 16-bit values + resx = _mm256_castsi256_si128(_mm256_packus_epi16( + res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)))); + } else { + resx = _mm_setzero_si128(); + } + + // y calc + if (base_x < min_base_x) { + __m256i c256, y_c256, base_y_c256, mask256, mul16; + r6 = _mm256_set1_epi16(r << 6); + c256 = _mm256_add_epi16(j256, c1234); + mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256), + _mm256_srli_epi16(min_base_y256, 1)); + y_c256 = _mm256_sub_epi16(r6, mul16); + + base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y); + mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256); + + base_y_c256 = _mm256_blendv_epi8(base_y_c256, min_base_y256, mask256); + int16_t min_y = (int16_t)_mm_extract_epi16( + _mm256_extracti128_si256(base_y_c256, 1), 7); + int16_t max_y = + (int16_t)_mm_extract_epi16(_mm256_castsi256_si128(base_y_c256), 0); + int16_t offset_diff = max_y - min_y; + + if (offset_diff < 16) { + __m256i min_y256 = _mm256_set1_epi16(min_y); + + __m256i base_y_offset = _mm256_sub_epi16(base_y_c256, min_y256); + __m128i base_y_offset128 = + _mm_packs_epi16(_mm256_extracti128_si256(base_y_offset, 0), + _mm256_extracti128_si256(base_y_offset, 1)); + + __m128i a0_y128 = _mm_maskload_epi32( + (int *)(left + min_y), *(__m128i *)LoadMaskz2[offset_diff / 4]); + __m128i a1_y128 = + _mm_maskload_epi32((int *)(left + min_y + 1), + *(__m128i *)LoadMaskz2[offset_diff / 4]); + a0_y128 = _mm_shuffle_epi8(a0_y128, base_y_offset128); + a1_y128 = _mm_shuffle_epi8(a1_y128, base_y_offset128); + a0_y = _mm256_cvtepu8_epi16(a0_y128); + a1_y = _mm256_cvtepu8_epi16(a1_y128); + } else { + base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256); + _mm256_store_si256((__m256i *)base_y_c, base_y_c256); + + a0_y = _mm256_setr_epi16( + left[base_y_c[0]], 
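The offset_diff < 16 branch above is a gather optimization: when the 16 left-edge indices of a chunk span fewer than 16 bytes, one masked 16-byte load at the smallest index plus a byte shuffle replaces 16 scalar lookups. A scalar model of why the two paths agree (illustrative helper, not from the patch):

#include <stdint.h>

/* Both paths produce the same 16 samples; the first mirrors the
 * maskload + pshufb fast path, the second the per-lane lookups. */
static void gather_left_ref(const uint8_t *left, const int16_t base_y[16],
                            uint8_t out[16]) {
  int16_t min_y = base_y[0], max_y = base_y[0];
  for (int i = 1; i < 16; ++i) {
    if (base_y[i] < min_y) min_y = base_y[i];
    if (base_y[i] > max_y) max_y = base_y[i];
  }
  if (max_y - min_y < 16) {
    uint8_t window[16];  /* the SIMD path does one masked 16-byte load here */
    for (int i = 0; i <= max_y - min_y; ++i) window[i] = left[min_y + i];
    for (int i = 0; i < 16; ++i) out[i] = window[base_y[i] - min_y];
  } else {
    for (int i = 0; i < 16; ++i) out[i] = left[base_y[i]];
  }
}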
left[base_y_c[1]], left[base_y_c[2]], + left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]], + left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]], + left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]], + left[base_y_c[15]]); + base_y_c256 = _mm256_add_epi16(base_y_c256, c1); + _mm256_store_si256((__m256i *)base_y_c, base_y_c256); + + a1_y = _mm256_setr_epi16( + left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]], + left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]], + left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]], + left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]], + left[base_y_c[15]]); + } + shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1); + + diff = _mm256_sub_epi16(a1_y, a0_y); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0_y, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shifty); + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); // 16 16-bit values + resy = _mm256_castsi256_si128(_mm256_packus_epi16( + res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)))); + } else { + resy = _mm_setzero_si128(); + } + resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]); + _mm_storeu_si128((__m128i *)(dst + j), resxy); + } // for j + dst += stride; + } +} + +// Directional prediction, zone 2: 90 < angle < 180 +void av1_dr_prediction_z2_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left, + int upsample_above, int upsample_left, int dx, + int dy) { + assert(dx > 0); + assert(dy > 0); + switch (bw) { + case 4: + dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left, upsample_above, + upsample_left, dx, dy); + break; + case 8: + dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left, upsample_above, + upsample_left, dx, dy); + break; + default: + dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left, + upsample_above, upsample_left, dx, dy); + break; + } + return; +} + +// z3 functions +static INLINE void transpose4x16_sse2(__m128i *x, __m128i *d) { + __m128i w0, w1, w2, w3, ww0, ww1, ww2, ww3; + w0 = _mm_unpacklo_epi8(x[0], x[1]); + w1 = _mm_unpacklo_epi8(x[2], x[3]); + w2 = _mm_unpackhi_epi8(x[0], x[1]); + w3 = _mm_unpackhi_epi8(x[2], x[3]); + + ww0 = _mm_unpacklo_epi16(w0, w1); + ww1 = _mm_unpacklo_epi16(w2, w3); + ww2 = _mm_unpackhi_epi16(w0, w1); + ww3 = _mm_unpackhi_epi16(w2, w3); + + w0 = _mm_unpacklo_epi32(ww0, ww1); + w2 = _mm_unpacklo_epi32(ww2, ww3); + w1 = _mm_unpackhi_epi32(ww0, ww1); + w3 = _mm_unpackhi_epi32(ww2, ww3); + + d[0] = _mm_unpacklo_epi64(w0, w2); + d[1] = _mm_unpackhi_epi64(w0, w2); + d[2] = _mm_unpacklo_epi64(w1, w3); + d[3] = _mm_unpackhi_epi64(w1, w3); + + d[4] = _mm_srli_si128(d[0], 8); + d[5] = _mm_srli_si128(d[1], 8); + d[6] = _mm_srli_si128(d[2], 8); + d[7] = _mm_srli_si128(d[3], 8); + + d[8] = _mm_srli_si128(d[0], 4); + d[9] = _mm_srli_si128(d[1], 4); + d[10] = _mm_srli_si128(d[2], 4); + d[11] = _mm_srli_si128(d[3], 4); + + d[12] = _mm_srli_si128(d[0], 12); + d[13] = _mm_srli_si128(d[1], 12); + d[14] = _mm_srli_si128(d[2], 12); + d[15] = _mm_srli_si128(d[3], 12); +} + +static INLINE void transpose16x32_avx2(__m256i *x, __m256i *d) { + __m256i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9; + __m256i w10, w11, w12, w13, w14, w15; + + w0 = _mm256_unpacklo_epi8(x[0], x[1]); + w1 = _mm256_unpacklo_epi8(x[2], x[3]); + w2 = _mm256_unpacklo_epi8(x[4], 
x[5]); + w3 = _mm256_unpacklo_epi8(x[6], x[7]); + + w8 = _mm256_unpacklo_epi8(x[8], x[9]); + w9 = _mm256_unpacklo_epi8(x[10], x[11]); + w10 = _mm256_unpacklo_epi8(x[12], x[13]); + w11 = _mm256_unpacklo_epi8(x[14], x[15]); + + w4 = _mm256_unpacklo_epi16(w0, w1); + w5 = _mm256_unpacklo_epi16(w2, w3); + w12 = _mm256_unpacklo_epi16(w8, w9); + w13 = _mm256_unpacklo_epi16(w10, w11); + + w6 = _mm256_unpacklo_epi32(w4, w5); + w7 = _mm256_unpackhi_epi32(w4, w5); + w14 = _mm256_unpacklo_epi32(w12, w13); + w15 = _mm256_unpackhi_epi32(w12, w13); + + // Store first 4-line result + d[0] = _mm256_unpacklo_epi64(w6, w14); + d[1] = _mm256_unpackhi_epi64(w6, w14); + d[2] = _mm256_unpacklo_epi64(w7, w15); + d[3] = _mm256_unpackhi_epi64(w7, w15); + + w4 = _mm256_unpackhi_epi16(w0, w1); + w5 = _mm256_unpackhi_epi16(w2, w3); + w12 = _mm256_unpackhi_epi16(w8, w9); + w13 = _mm256_unpackhi_epi16(w10, w11); + + w6 = _mm256_unpacklo_epi32(w4, w5); + w7 = _mm256_unpackhi_epi32(w4, w5); + w14 = _mm256_unpacklo_epi32(w12, w13); + w15 = _mm256_unpackhi_epi32(w12, w13); + + // Store second 4-line result + d[4] = _mm256_unpacklo_epi64(w6, w14); + d[5] = _mm256_unpackhi_epi64(w6, w14); + d[6] = _mm256_unpacklo_epi64(w7, w15); + d[7] = _mm256_unpackhi_epi64(w7, w15); + + // upper half + w0 = _mm256_unpackhi_epi8(x[0], x[1]); + w1 = _mm256_unpackhi_epi8(x[2], x[3]); + w2 = _mm256_unpackhi_epi8(x[4], x[5]); + w3 = _mm256_unpackhi_epi8(x[6], x[7]); + + w8 = _mm256_unpackhi_epi8(x[8], x[9]); + w9 = _mm256_unpackhi_epi8(x[10], x[11]); + w10 = _mm256_unpackhi_epi8(x[12], x[13]); + w11 = _mm256_unpackhi_epi8(x[14], x[15]); + + w4 = _mm256_unpacklo_epi16(w0, w1); + w5 = _mm256_unpacklo_epi16(w2, w3); + w12 = _mm256_unpacklo_epi16(w8, w9); + w13 = _mm256_unpacklo_epi16(w10, w11); + + w6 = _mm256_unpacklo_epi32(w4, w5); + w7 = _mm256_unpackhi_epi32(w4, w5); + w14 = _mm256_unpacklo_epi32(w12, w13); + w15 = _mm256_unpackhi_epi32(w12, w13); + + // Store first 4-line result + d[8] = _mm256_unpacklo_epi64(w6, w14); + d[9] = _mm256_unpackhi_epi64(w6, w14); + d[10] = _mm256_unpacklo_epi64(w7, w15); + d[11] = _mm256_unpackhi_epi64(w7, w15); + + w4 = _mm256_unpackhi_epi16(w0, w1); + w5 = _mm256_unpackhi_epi16(w2, w3); + w12 = _mm256_unpackhi_epi16(w8, w9); + w13 = _mm256_unpackhi_epi16(w10, w11); + + w6 = _mm256_unpacklo_epi32(w4, w5); + w7 = _mm256_unpackhi_epi32(w4, w5); + w14 = _mm256_unpacklo_epi32(w12, w13); + w15 = _mm256_unpackhi_epi32(w12, w13); + + // Store second 4-line result + d[12] = _mm256_unpacklo_epi64(w6, w14); + d[13] = _mm256_unpackhi_epi64(w6, w14); + d[14] = _mm256_unpacklo_epi64(w7, w15); + d[15] = _mm256_unpackhi_epi64(w7, w15); +} + +static INLINE void transpose16x16_sse2(__m128i *x, __m128i *d) { + __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9; + __m128i w10, w11, w12, w13, w14, w15; + + w0 = _mm_unpacklo_epi8(x[0], x[1]); + w1 = _mm_unpacklo_epi8(x[2], x[3]); + w2 = _mm_unpacklo_epi8(x[4], x[5]); + w3 = _mm_unpacklo_epi8(x[6], x[7]); + + w8 = _mm_unpacklo_epi8(x[8], x[9]); + w9 = _mm_unpacklo_epi8(x[10], x[11]); + w10 = _mm_unpacklo_epi8(x[12], x[13]); + w11 = _mm_unpacklo_epi8(x[14], x[15]); + + w4 = _mm_unpacklo_epi16(w0, w1); + w5 = _mm_unpacklo_epi16(w2, w3); + w12 = _mm_unpacklo_epi16(w8, w9); + w13 = _mm_unpacklo_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store first 4-line result + d[0] = _mm_unpacklo_epi64(w6, w14); + d[1] = _mm_unpackhi_epi64(w6, w14); + d[2] = 
_mm_unpacklo_epi64(w7, w15); + d[3] = _mm_unpackhi_epi64(w7, w15); + + w4 = _mm_unpackhi_epi16(w0, w1); + w5 = _mm_unpackhi_epi16(w2, w3); + w12 = _mm_unpackhi_epi16(w8, w9); + w13 = _mm_unpackhi_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store second 4-line result + d[4] = _mm_unpacklo_epi64(w6, w14); + d[5] = _mm_unpackhi_epi64(w6, w14); + d[6] = _mm_unpacklo_epi64(w7, w15); + d[7] = _mm_unpackhi_epi64(w7, w15); + + // upper half + w0 = _mm_unpackhi_epi8(x[0], x[1]); + w1 = _mm_unpackhi_epi8(x[2], x[3]); + w2 = _mm_unpackhi_epi8(x[4], x[5]); + w3 = _mm_unpackhi_epi8(x[6], x[7]); + + w8 = _mm_unpackhi_epi8(x[8], x[9]); + w9 = _mm_unpackhi_epi8(x[10], x[11]); + w10 = _mm_unpackhi_epi8(x[12], x[13]); + w11 = _mm_unpackhi_epi8(x[14], x[15]); + + w4 = _mm_unpacklo_epi16(w0, w1); + w5 = _mm_unpacklo_epi16(w2, w3); + w12 = _mm_unpacklo_epi16(w8, w9); + w13 = _mm_unpacklo_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store first 4-line result + d[8] = _mm_unpacklo_epi64(w6, w14); + d[9] = _mm_unpackhi_epi64(w6, w14); + d[10] = _mm_unpacklo_epi64(w7, w15); + d[11] = _mm_unpackhi_epi64(w7, w15); + + w4 = _mm_unpackhi_epi16(w0, w1); + w5 = _mm_unpackhi_epi16(w2, w3); + w12 = _mm_unpackhi_epi16(w8, w9); + w13 = _mm_unpackhi_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store second 4-line result + d[12] = _mm_unpacklo_epi64(w6, w14); + d[13] = _mm_unpackhi_epi64(w6, w14); + d[14] = _mm_unpacklo_epi64(w7, w15); + d[15] = _mm_unpackhi_epi64(w7, w15); +} + +static void transpose_TX_16X16(const uint8_t *src, ptrdiff_t pitchSrc, + uint8_t *dst, ptrdiff_t pitchDst) { + __m128i r[16]; + __m128i d[16]; + for (int j = 0; j < 16; j++) { + r[j] = _mm_loadu_si128((__m128i *)(src + j * pitchSrc)); + } + transpose16x16_sse2(r, d); + for (int j = 0; j < 16; j++) { + _mm_storeu_si128((__m128i *)(dst + j * pitchDst), d[j]); + } +} + +static void transpose(const uint8_t *src, ptrdiff_t pitchSrc, uint8_t *dst, + ptrdiff_t pitchDst, int width, int height) { + for (int j = 0; j < height; j += 16) + for (int i = 0; i < width; i += 16) + transpose_TX_16X16(src + i * pitchSrc + j, pitchSrc, + dst + j * pitchDst + i, pitchDst); +} + +static void dr_prediction_z3_4x4_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[4], d[4]; + + dr_prediction_z1_HxW_internal_avx2(4, 4, dstvec, left, upsample_left, dy); + transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], + &d[0], &d[1], &d[2], &d[3]); + + *(uint32_t *)(dst + stride * 0) = _mm_cvtsi128_si32(d[0]); + *(uint32_t *)(dst + stride * 1) = _mm_cvtsi128_si32(d[1]); + *(uint32_t *)(dst + stride * 2) = _mm_cvtsi128_si32(d[2]); + *(uint32_t *)(dst + stride * 3) = _mm_cvtsi128_si32(d[3]); + return; +} + +static void dr_prediction_z3_8x8_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[8], d[8]; + + dr_prediction_z1_HxW_internal_avx2(8, 8, dstvec, left, upsample_left, dy); + transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], + &dstvec[5], &dstvec[6], &dstvec[7], &d[0], &d[1], &d[2], + &d[3]); + + _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]); 
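The dr_prediction_z3_* helpers that follow all use the same plan: run a zone-1 kernel along `left` to build the prediction column-wise, then transpose it into the destination, either register-to-register for small blocks or through a temporary buffer via transpose() above for the large ones. What that transpose step computes, element-wise (a plain scalar restatement):

#include <stddef.h>
#include <stdint.h>

/* Element-wise view of the tiled transpose above: dst[r][c] = src[c][r].
 * width and height are the dimensions of the destination block. */
static void transpose_ref(const uint8_t *src, ptrdiff_t src_pitch, uint8_t *dst,
                          ptrdiff_t dst_pitch, int width, int height) {
  for (int r = 0; r < height; ++r)
    for (int c = 0; c < width; ++c)
      dst[r * dst_pitch + c] = src[c * src_pitch + r];
}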
+ _mm_storel_epi64((__m128i *)(dst + 1 * stride), _mm_srli_si128(d[0], 8)); + _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[1]); + _mm_storel_epi64((__m128i *)(dst + 3 * stride), _mm_srli_si128(d[1], 8)); + _mm_storel_epi64((__m128i *)(dst + 4 * stride), d[2]); + _mm_storel_epi64((__m128i *)(dst + 5 * stride), _mm_srli_si128(d[2], 8)); + _mm_storel_epi64((__m128i *)(dst + 6 * stride), d[3]); + _mm_storel_epi64((__m128i *)(dst + 7 * stride), _mm_srli_si128(d[3], 8)); +} + +static void dr_prediction_z3_4x8_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[4], d[8]; + + dr_prediction_z1_HxW_internal_avx2(8, 4, dstvec, left, upsample_left, dy); + transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &d[0], + &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]); + for (int i = 0; i < 8; i++) { + *(uint32_t *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]); + } +} + +static void dr_prediction_z3_8x4_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[8], d[4]; + + dr_prediction_z1_HxW_internal_avx2(4, 8, dstvec, left, upsample_left, dy); + transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], + &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], &d[0], + &d[1], &d[2], &d[3]); + _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]); + _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]); + _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]); + _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]); +} + +static void dr_prediction_z3_8x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[8], d[8]; + + dr_prediction_z1_HxW_internal_avx2(16, 8, dstvec, left, upsample_left, dy); + transpose8x16_16x8_sse2(dstvec, dstvec + 1, dstvec + 2, dstvec + 3, + dstvec + 4, dstvec + 5, dstvec + 6, dstvec + 7, d, + d + 1, d + 2, d + 3, d + 4, d + 5, d + 6, d + 7); + for (int i = 0; i < 8; i++) { + _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]); + _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride), + _mm_srli_si128(d[i], 8)); + } +} + +static void dr_prediction_z3_16x8_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[16], d[16]; + + dr_prediction_z1_HxW_internal_avx2(8, 16, dstvec, left, upsample_left, dy); + transpose16x8_8x16_sse2( + &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5], + &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11], + &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2], + &d[3], &d[4], &d[5], &d[6], &d[7]); + + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); + } +} + +static void dr_prediction_z3_4x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[4], d[16]; + + dr_prediction_z1_HxW_internal_avx2(16, 4, dstvec, left, upsample_left, dy); + transpose4x16_sse2(dstvec, d); + for (int i = 0; i < 16; i++) { + *(uint32_t *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]); + } +} + +static void dr_prediction_z3_16x4_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[16], d[8]; + + dr_prediction_z1_HxW_internal_avx2(4, 16, dstvec, left, upsample_left, dy); + for (int i = 4; i < 8; i++) { + d[i] = _mm_setzero_si128(); + } + transpose16x8_8x16_sse2( + &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5], + 
&dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11], + &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2], + &d[3], &d[4], &d[5], &d[6], &d[7]); + + for (int i = 0; i < 4; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); + } +} + +static void dr_prediction_z3_8x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m256i dstvec[16], d[16]; + + dr_prediction_z1_32xN_internal_avx2(8, dstvec, left, upsample_left, dy); + for (int i = 8; i < 16; i++) { + dstvec[i] = _mm256_setzero_si256(); + } + transpose16x32_avx2(dstvec, d); + + for (int i = 0; i < 16; i++) { + _mm_storel_epi64((__m128i *)(dst + i * stride), + _mm256_castsi256_si128(d[i])); + } + for (int i = 0; i < 16; i++) { + _mm_storel_epi64((__m128i *)(dst + (i + 16) * stride), + _mm256_extracti128_si256(d[i], 1)); + } +} + +static void dr_prediction_z3_32x8_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[32], d[16]; + + dr_prediction_z1_HxW_internal_avx2(8, 32, dstvec, left, upsample_left, dy); + + transpose16x8_8x16_sse2( + &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5], + &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11], + &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2], + &d[3], &d[4], &d[5], &d[6], &d[7]); + transpose16x8_8x16_sse2( + &dstvec[0 + 16], &dstvec[1 + 16], &dstvec[2 + 16], &dstvec[3 + 16], + &dstvec[4 + 16], &dstvec[5 + 16], &dstvec[6 + 16], &dstvec[7 + 16], + &dstvec[8 + 16], &dstvec[9 + 16], &dstvec[10 + 16], &dstvec[11 + 16], + &dstvec[12 + 16], &dstvec[13 + 16], &dstvec[14 + 16], &dstvec[15 + 16], + &d[0 + 8], &d[1 + 8], &d[2 + 8], &d[3 + 8], &d[4 + 8], &d[5 + 8], + &d[6 + 8], &d[7 + 8]); + + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); + _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 8]); + } +} + +static void dr_prediction_z3_16x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[16], d[16]; + + dr_prediction_z1_HxW_internal_avx2(16, 16, dstvec, left, upsample_left, dy); + transpose16x16_sse2(dstvec, d); + + for (int i = 0; i < 16; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); + } +} + +static void dr_prediction_z3_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m256i dstvec[32], d[32]; + + dr_prediction_z1_32xN_internal_avx2(32, dstvec, left, upsample_left, dy); + transpose16x32_avx2(dstvec, d); + transpose16x32_avx2(dstvec + 16, d + 16); + for (int j = 0; j < 16; j++) { + _mm_storeu_si128((__m128i *)(dst + j * stride), + _mm256_castsi256_si128(d[j])); + _mm_storeu_si128((__m128i *)(dst + j * stride + 16), + _mm256_castsi256_si128(d[j + 16])); + } + for (int j = 0; j < 16; j++) { + _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride), + _mm256_extracti128_si256(d[j], 1)); + _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride + 16), + _mm256_extracti128_si256(d[j + 16], 1)); + } +} + +static void dr_prediction_z3_64x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + DECLARE_ALIGNED(16, uint8_t, dstT[64 * 64]); + dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy); + transpose(dstT, 64, dst, stride, 64, 64); +} + +static void dr_prediction_z3_16x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m256i 
dstvec[16], d[16]; + + dr_prediction_z1_32xN_internal_avx2(16, dstvec, left, upsample_left, dy); + transpose16x32_avx2(dstvec, d); + // store + for (int j = 0; j < 16; j++) { + _mm_storeu_si128((__m128i *)(dst + j * stride), + _mm256_castsi256_si128(d[j])); + _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride), + _mm256_extracti128_si256(d[j], 1)); + } +} + +static void dr_prediction_z3_32x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[32], d[16]; + + dr_prediction_z1_HxW_internal_avx2(16, 32, dstvec, left, upsample_left, dy); + for (int i = 0; i < 32; i += 16) { + transpose16x16_sse2((dstvec + i), d); + for (int j = 0; j < 16; j++) { + _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]); + } + } +} + +static void dr_prediction_z3_32x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + uint8_t dstT[64 * 32]; + dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy); + transpose(dstT, 64, dst, stride, 32, 64); +} + +static void dr_prediction_z3_64x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + uint8_t dstT[32 * 64]; + dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy); + transpose(dstT, 32, dst, stride, 64, 32); + return; +} + +static void dr_prediction_z3_16x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + uint8_t dstT[64 * 16]; + dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy); + transpose(dstT, 64, dst, stride, 16, 64); +} + +static void dr_prediction_z3_64x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[64], d[16]; + + dr_prediction_z1_HxW_internal_avx2(16, 64, dstvec, left, upsample_left, dy); + for (int i = 0; i < 64; i += 16) { + transpose16x16_sse2((dstvec + i), d); + for (int j = 0; j < 16; j++) { + _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]); + } + } +} + +void av1_dr_prediction_z3_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left, + int upsample_left, int dx, int dy) { + (void)above; + (void)dx; + assert(dx == 1); + assert(dy > 0); + + if (bw == bh) { + switch (bw) { + case 4: + dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy); + break; + case 8: + dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy); + break; + case 16: + dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left, dy); + break; + case 32: + dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left, dy); + break; + case 64: + dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left, dy); + break; + } + } else { + if (bw < bh) { + if (bw + bw == bh) { + switch (bw) { + case 4: + dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left, dy); + break; + case 8: + dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left, dy); + break; + case 16: + dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left, dy); + break; + case 32: + dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left, dy); + break; + } + } else { + switch (bw) { + case 4: + dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left, dy); + break; + case 8: + dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left, dy); + break; + case 16: + dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left, dy); + break; + } + } + } else { + if (bh + bh == bw) { + switch (bh) { + case 4: + dr_prediction_z3_8x4_avx2(dst, stride, 
left, upsample_left, dy); + break; + case 8: + dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left, dy); + break; + case 16: + dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left, dy); + break; + case 32: + dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left, dy); + break; + } + } else { + switch (bh) { + case 4: + dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left, dy); + break; + case 8: + dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left, dy); + break; + case 16: + dr_prediction_z3_64x16_avx2(dst, stride, left, upsample_left, dy); + break; + } + } + } + } +} diff --git a/media/libaom/src/aom_dsp/x86/intrapred_sse2.c b/media/libaom/src/aom_dsp/x86/intrapred_sse2.c index 5b2452c8eb..5afef68c39 100644 --- a/media/libaom/src/aom_dsp/x86/intrapred_sse2.c +++ b/media/libaom/src/aom_dsp/x86/intrapred_sse2.c @@ -10,7 +10,7 @@ */ #include <emmintrin.h> - +#include "aom_dsp/x86/intrapred_x86.h" #include "config/aom_dsp_rtcd.h" static INLINE void dc_store_4xh(uint32_t dc, int height, uint8_t *dst, @@ -75,25 +75,6 @@ static INLINE __m128i dc_sum_8(const uint8_t *ref) { return _mm_sad_epu8(x, zero); } -static INLINE __m128i dc_sum_16(const uint8_t *ref) { - __m128i x = _mm_load_si128((__m128i const *)ref); - const __m128i zero = _mm_setzero_si128(); - x = _mm_sad_epu8(x, zero); - const __m128i high = _mm_unpackhi_epi64(x, x); - return _mm_add_epi16(x, high); -} - -static INLINE __m128i dc_sum_32(const uint8_t *ref) { - __m128i x0 = _mm_load_si128((__m128i const *)ref); - __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16)); - const __m128i zero = _mm_setzero_si128(); - x0 = _mm_sad_epu8(x0, zero); - x1 = _mm_sad_epu8(x1, zero); - x0 = _mm_add_epi16(x0, x1); - const __m128i high = _mm_unpackhi_epi64(x0, x0); - return _mm_add_epi16(x0, high); -} - static INLINE __m128i dc_sum_64(const uint8_t *ref) { __m128i x0 = _mm_load_si128((__m128i const *)ref); __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16)); @@ -142,7 +123,7 @@ void aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride, void aom_dc_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const __m128i sum_left = dc_sum_16(left); + const __m128i sum_left = dc_sum_16_sse2(left); __m128i sum_above = dc_sum_4(above); sum_above = _mm_add_epi16(sum_left, sum_above); @@ -171,7 +152,7 @@ void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride, void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const __m128i sum_left = dc_sum_16(left); + const __m128i sum_left = dc_sum_16_sse2(left); __m128i sum_above = dc_sum_8(above); sum_above = _mm_add_epi16(sum_above, sum_left); @@ -184,7 +165,7 @@ void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, void aom_dc_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const __m128i sum_left = dc_sum_32(left); + const __m128i sum_left = dc_sum_32_sse2(left); __m128i sum_above = dc_sum_8(above); sum_above = _mm_add_epi16(sum_above, sum_left); @@ -198,7 +179,7 @@ void aom_dc_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, void aom_dc_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m128i sum_left = dc_sum_4(left); - __m128i sum_above = dc_sum_16(above); + __m128i sum_above = dc_sum_16_sse2(above); sum_above = _mm_add_epi16(sum_above, sum_left); uint32_t sum = _mm_cvtsi128_si32(sum_above); @@ -211,7 +192,7 @@ void 
aom_dc_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m128i sum_left = dc_sum_8(left); - __m128i sum_above = dc_sum_16(above); + __m128i sum_above = dc_sum_16_sse2(above); sum_above = _mm_add_epi16(sum_above, sum_left); uint32_t sum = _mm_cvtsi128_si32(sum_above); @@ -223,8 +204,8 @@ void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const __m128i sum_left = dc_sum_32(left); - __m128i sum_above = dc_sum_16(above); + const __m128i sum_left = dc_sum_32_sse2(left); + __m128i sum_above = dc_sum_16_sse2(above); sum_above = _mm_add_epi16(sum_left, sum_above); uint32_t sum = _mm_cvtsi128_si32(sum_above); @@ -237,7 +218,7 @@ void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, void aom_dc_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const __m128i sum_left = dc_sum_64(left); - __m128i sum_above = dc_sum_16(above); + __m128i sum_above = dc_sum_16_sse2(above); sum_above = _mm_add_epi16(sum_left, sum_above); uint32_t sum = _mm_cvtsi128_si32(sum_above); @@ -249,7 +230,7 @@ void aom_dc_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, void aom_dc_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - __m128i sum_above = dc_sum_32(above); + __m128i sum_above = dc_sum_32_sse2(above); const __m128i sum_left = dc_sum_8(left); sum_above = _mm_add_epi16(sum_above, sum_left); @@ -262,8 +243,8 @@ void aom_dc_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - __m128i sum_above = dc_sum_32(above); - const __m128i sum_left = dc_sum_16(left); + __m128i sum_above = dc_sum_32_sse2(above); + const __m128i sum_left = dc_sum_16_sse2(left); sum_above = _mm_add_epi16(sum_above, sum_left); uint32_t sum = _mm_cvtsi128_si32(sum_above); @@ -275,7 +256,7 @@ void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, void aom_dc_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - __m128i sum_above = dc_sum_32(above); + __m128i sum_above = dc_sum_32_sse2(above); const __m128i sum_left = dc_sum_64(left); sum_above = _mm_add_epi16(sum_above, sum_left); @@ -302,7 +283,7 @@ void aom_dc_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, void aom_dc_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { __m128i sum_above = dc_sum_64(above); - const __m128i sum_left = dc_sum_32(left); + const __m128i sum_left = dc_sum_32_sse2(left); sum_above = _mm_add_epi16(sum_above, sum_left); uint32_t sum = _mm_cvtsi128_si32(sum_above); @@ -315,7 +296,7 @@ void aom_dc_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, void aom_dc_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { __m128i sum_above = dc_sum_64(above); - const __m128i sum_left = dc_sum_16(left); + const __m128i sum_left = dc_sum_16_sse2(left); sum_above = _mm_add_epi16(sum_above, sum_left); uint32_t sum = _mm_cvtsi128_si32(sum_above); @@ -395,7 +376,7 @@ void aom_dc_top_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, void aom_dc_top_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)left; - __m128i sum_above = 
dc_sum_16(above); + __m128i sum_above = dc_sum_16_sse2(above); const __m128i eight = _mm_set1_epi16((uint16_t)8); sum_above = _mm_add_epi16(sum_above, eight); sum_above = _mm_srai_epi16(sum_above, 4); @@ -408,7 +389,7 @@ void aom_dc_top_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride, void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)left; - __m128i sum_above = dc_sum_16(above); + __m128i sum_above = dc_sum_16_sse2(above); const __m128i eight = _mm_set1_epi16((uint16_t)8); sum_above = _mm_add_epi16(sum_above, eight); sum_above = _mm_srai_epi16(sum_above, 4); @@ -422,7 +403,7 @@ void aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)left; - __m128i sum_above = dc_sum_16(above); + __m128i sum_above = dc_sum_16_sse2(above); const __m128i eight = _mm_set1_epi16((uint16_t)8); sum_above = _mm_add_epi16(sum_above, eight); sum_above = _mm_srai_epi16(sum_above, 4); @@ -436,7 +417,7 @@ void aom_dc_top_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)left; - __m128i sum_above = dc_sum_16(above); + __m128i sum_above = dc_sum_16_sse2(above); const __m128i eight = _mm_set1_epi16((uint16_t)8); sum_above = _mm_add_epi16(sum_above, eight); sum_above = _mm_srai_epi16(sum_above, 4); @@ -449,7 +430,7 @@ void aom_dc_top_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride, void aom_dc_top_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)left; - __m128i sum_above = dc_sum_32(above); + __m128i sum_above = dc_sum_32_sse2(above); const __m128i sixteen = _mm_set1_epi16((uint16_t)16); sum_above = _mm_add_epi16(sum_above, sixteen); sum_above = _mm_srai_epi16(sum_above, 5); @@ -463,7 +444,7 @@ void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)left; - __m128i sum_above = dc_sum_32(above); + __m128i sum_above = dc_sum_32_sse2(above); const __m128i sixteen = _mm_set1_epi16((uint16_t)16); sum_above = _mm_add_epi16(sum_above, sixteen); sum_above = _mm_srai_epi16(sum_above, 5); @@ -477,7 +458,7 @@ void aom_dc_top_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)left; - __m128i sum_above = dc_sum_32(above); + __m128i sum_above = dc_sum_32_sse2(above); const __m128i sixteen = _mm_set1_epi16((uint16_t)16); sum_above = _mm_add_epi16(sum_above, sixteen); sum_above = _mm_srai_epi16(sum_above, 5); @@ -550,7 +531,7 @@ void aom_dc_left_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; - __m128i sum_left = dc_sum_16(left); + __m128i sum_left = dc_sum_16_sse2(left); const __m128i eight = _mm_set1_epi16((uint16_t)8); sum_left = _mm_add_epi16(sum_left, eight); sum_left = _mm_srai_epi16(sum_left, 4); @@ -577,7 +558,7 @@ void aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; - __m128i sum_left = dc_sum_16(left); + __m128i sum_left = dc_sum_16_sse2(left); const __m128i eight = _mm_set1_epi16((uint16_t)8); sum_left = _mm_add_epi16(sum_left, eight); sum_left = _mm_srai_epi16(sum_left, 4); @@ -590,7 +571,7 @@ void aom_dc_left_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; - __m128i sum_left = dc_sum_32(left); + __m128i sum_left = dc_sum_32_sse2(left); const __m128i sixteen = 
_mm_set1_epi16((uint16_t)16); sum_left = _mm_add_epi16(sum_left, sixteen); sum_left = _mm_srai_epi16(sum_left, 5); @@ -631,7 +612,7 @@ void aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; - __m128i sum_left = dc_sum_32(left); + __m128i sum_left = dc_sum_32_sse2(left); const __m128i sixteen = _mm_set1_epi16((uint16_t)16); sum_left = _mm_add_epi16(sum_left, sixteen); sum_left = _mm_srai_epi16(sum_left, 5); @@ -673,7 +654,7 @@ void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; - __m128i sum_left = dc_sum_16(left); + __m128i sum_left = dc_sum_16_sse2(left); const __m128i eight = _mm_set1_epi16((uint16_t)8); sum_left = _mm_add_epi16(sum_left, eight); sum_left = _mm_srai_epi16(sum_left, 4); @@ -715,7 +696,7 @@ void aom_dc_left_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; - __m128i sum_left = dc_sum_32(left); + __m128i sum_left = dc_sum_32_sse2(left); const __m128i sixteen = _mm_set1_epi16((uint16_t)16); sum_left = _mm_add_epi16(sum_left, sixteen); sum_left = _mm_srai_epi16(sum_left, 5); @@ -729,7 +710,7 @@ void aom_dc_left_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { (void)above; - __m128i sum_left = dc_sum_16(left); + __m128i sum_left = dc_sum_16_sse2(left); const __m128i eight = _mm_set1_epi16((uint16_t)8); sum_left = _mm_add_epi16(sum_left, eight); sum_left = _mm_srai_epi16(sum_left, 4); diff --git a/media/libaom/src/aom_dsp/x86/intrapred_ssse3.c b/media/libaom/src/aom_dsp/x86/intrapred_ssse3.c index 807ed1770f..5a34ea0c8e 100644 --- a/media/libaom/src/aom_dsp/x86/intrapred_ssse3.c +++ b/media/libaom/src/aom_dsp/x86/intrapred_ssse3.c @@ -48,7 +48,7 @@ void aom_paeth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, const __m128i zero = _mm_setzero_si128(); const __m128i t16 = _mm_unpacklo_epi8(t, zero); const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); - __m128i rep = _mm_set1_epi16(0x8000); + __m128i rep = _mm_set1_epi16((short)0x8000); const __m128i one = _mm_set1_epi16(1); int i; @@ -69,7 +69,7 @@ void aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride, const __m128i zero = _mm_setzero_si128(); const __m128i t16 = _mm_unpacklo_epi8(t, zero); const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); - __m128i rep = _mm_set1_epi16(0x8000); + __m128i rep = _mm_set1_epi16((short)0x8000); const __m128i one = _mm_set1_epi16(1); int i; @@ -90,7 +90,7 @@ void aom_paeth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride, const __m128i zero = _mm_setzero_si128(); const __m128i t16 = _mm_unpacklo_epi8(t, zero); const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); - __m128i rep = _mm_set1_epi16(0x8000); + __m128i rep = _mm_set1_epi16((short)0x8000); const __m128i one = _mm_set1_epi16(1); for (int i = 0; i < 16; ++i) { @@ -110,7 +110,7 @@ void aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride, const __m128i zero = _mm_setzero_si128(); const __m128i t16 = _mm_unpacklo_epi8(t, zero); const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); - __m128i rep = _mm_set1_epi16(0x8000); + __m128i rep = _mm_set1_epi16((short)0x8000); const __m128i one = _mm_set1_epi16(1); int i; @@ -131,7 +131,7 @@ void aom_paeth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const __m128i zero = _mm_setzero_si128(); const __m128i t16 = _mm_unpacklo_epi8(t, zero); const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); 
- __m128i rep = _mm_set1_epi16(0x8000); + __m128i rep = _mm_set1_epi16((short)0x8000); const __m128i one = _mm_set1_epi16(1); int i; @@ -152,7 +152,7 @@ void aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride, const __m128i zero = _mm_setzero_si128(); const __m128i t16 = _mm_unpacklo_epi8(t, zero); const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); - __m128i rep = _mm_set1_epi16(0x8000); + __m128i rep = _mm_set1_epi16((short)0x8000); const __m128i one = _mm_set1_epi16(1); int i; @@ -176,7 +176,7 @@ void aom_paeth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride, for (int j = 0; j < 2; ++j) { const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16)); - __m128i rep = _mm_set1_epi16(0x8000); + __m128i rep = _mm_set1_epi16((short)0x8000); for (int i = 0; i < 16; ++i) { const __m128i l16 = _mm_shuffle_epi8(l, rep); const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16); @@ -205,7 +205,7 @@ void aom_paeth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride, const __m128i top0 = _mm_unpacklo_epi8(t, zero); const __m128i top1 = _mm_unpackhi_epi8(t, zero); const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); - __m128i rep = _mm_set1_epi16(0x8000); + __m128i rep = _mm_set1_epi16((short)0x8000); const __m128i one = _mm_set1_epi16(1); for (int i = 0; i < 4; ++i) { @@ -226,7 +226,7 @@ void aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride, const __m128i top0 = _mm_unpacklo_epi8(t, zero); const __m128i top1 = _mm_unpackhi_epi8(t, zero); const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); - __m128i rep = _mm_set1_epi16(0x8000); + __m128i rep = _mm_set1_epi16((short)0x8000); const __m128i one = _mm_set1_epi16(1); int i; @@ -249,7 +249,7 @@ void aom_paeth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const __m128i top0 = _mm_unpacklo_epi8(t, zero); const __m128i top1 = _mm_unpackhi_epi8(t, zero); const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); - __m128i rep = _mm_set1_epi16(0x8000); + __m128i rep = _mm_set1_epi16((short)0x8000); const __m128i one = _mm_set1_epi16(1); int i; @@ -272,7 +272,7 @@ void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride, const __m128i top0 = _mm_unpacklo_epi8(t, zero); const __m128i top1 = _mm_unpackhi_epi8(t, zero); const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); - __m128i rep = _mm_set1_epi16(0x8000); + __m128i rep = _mm_set1_epi16((short)0x8000); const __m128i one = _mm_set1_epi16(1); __m128i l16; @@ -287,7 +287,7 @@ void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride, } l = _mm_load_si128((const __m128i *)(left + 16)); - rep = _mm_set1_epi16(0x8000); + rep = _mm_set1_epi16((short)0x8000); for (i = 0; i < 16; ++i) { l16 = _mm_shuffle_epi8(l, rep); const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); @@ -310,7 +310,7 @@ void aom_paeth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride, for (int j = 0; j < 4; ++j) { const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16)); - __m128i rep = _mm_set1_epi16(0x8000); + __m128i rep = _mm_set1_epi16((short)0x8000); for (int i = 0; i < 16; ++i) { const __m128i l16 = _mm_shuffle_epi8(l, rep); const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16); @@ -332,7 +332,7 @@ void aom_paeth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride, const __m128i bh = _mm_unpackhi_epi8(b, zero); const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); - __m128i rep = _mm_set1_epi16(0x8000); + __m128i rep = _mm_set1_epi16((short)0x8000); const __m128i one = _mm_set1_epi16(1); const __m128i l = 
_mm_loadl_epi64((const __m128i *)left); __m128i l16; @@ -361,7 +361,7 @@ void aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride, const __m128i bh = _mm_unpackhi_epi8(b, zero); const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); - __m128i rep = _mm_set1_epi16(0x8000); + __m128i rep = _mm_set1_epi16((short)0x8000); const __m128i one = _mm_set1_epi16(1); __m128i l = _mm_load_si128((const __m128i *)left); __m128i l16; @@ -391,7 +391,7 @@ void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const __m128i bh = _mm_unpackhi_epi8(b, zero); const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]); - __m128i rep = _mm_set1_epi16(0x8000); + __m128i rep = _mm_set1_epi16((short)0x8000); const __m128i one = _mm_set1_epi16(1); __m128i l = _mm_load_si128((const __m128i *)left); __m128i l16; @@ -408,7 +408,7 @@ void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, rep = _mm_add_epi16(rep, one); } - rep = _mm_set1_epi16(0x8000); + rep = _mm_set1_epi16((short)0x8000); l = _mm_load_si128((const __m128i *)(left + 16)); for (i = 0; i < 16; ++i) { l16 = _mm_shuffle_epi8(l, rep); @@ -440,7 +440,7 @@ void aom_paeth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride, int i, j; for (j = 0; j < 4; ++j) { const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16)); - __m128i rep = _mm_set1_epi16(0x8000); + __m128i rep = _mm_set1_epi16((short)0x8000); for (i = 0; i < 16; ++i) { l16 = _mm_shuffle_epi8(l, rep); const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16); @@ -478,7 +478,7 @@ void aom_paeth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride, int i, j; for (j = 0; j < 2; ++j) { const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16)); - __m128i rep = _mm_set1_epi16(0x8000); + __m128i rep = _mm_set1_epi16((short)0x8000); for (i = 0; i < 16; ++i) { l16 = _mm_shuffle_epi8(l, rep); const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16); @@ -520,7 +520,7 @@ void aom_paeth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride, int i, j; for (j = 0; j < 4; ++j) { const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16)); - __m128i rep = _mm_set1_epi16(0x8000); + __m128i rep = _mm_set1_epi16((short)0x8000); for (i = 0; i < 16; ++i) { l16 = _mm_shuffle_epi8(l, rep); const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16); @@ -561,7 +561,7 @@ void aom_paeth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride, int i; const __m128i l = _mm_load_si128((const __m128i *)left); - __m128i rep = _mm_set1_epi16(0x8000); + __m128i rep = _mm_set1_epi16((short)0x8000); for (i = 0; i < 16; ++i) { l16 = _mm_shuffle_epi8(l, rep); const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16); @@ -636,7 +636,8 @@ static INLINE void smooth_pred_4xh(const __m128i *pixel, const __m128i *wh, const __m128i one = _mm_set1_epi16(1); const __m128i inc = _mm_set1_epi16(0x202); const __m128i gat = _mm_set1_epi32(0xc080400); - __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000); + __m128i rep = second_half ? _mm_set1_epi16((short)0x8008) + : _mm_set1_epi16((short)0x8000); __m128i d = _mm_set1_epi16(0x100); for (int i = 0; i < h; ++i) { @@ -792,7 +793,8 @@ static INLINE void smooth_pred_8xh(const __m128i *pixels, const __m128i *wh, const __m128i inc = _mm_set1_epi16(0x202); const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200); - __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000); + __m128i rep = second_half ? 
_mm_set1_epi16((short)0x8008) + : _mm_set1_epi16((short)0x8000); __m128i d = _mm_set1_epi16(0x100); int i; @@ -1400,7 +1402,7 @@ static INLINE void smooth_h_pred_4xh(const __m128i *pixel, const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1))); const __m128i one = _mm_set1_epi16(1); const __m128i gat = _mm_set1_epi32(0xc080400); - __m128i rep = _mm_set1_epi16(0x8000); + __m128i rep = _mm_set1_epi16((short)0x8000); for (int i = 0; i < h; ++i) { __m128i b = _mm_shuffle_epi8(pixel[0], rep); @@ -1499,7 +1501,8 @@ static INLINE void smooth_h_pred_8xh(const __m128i *pixels, const __m128i *ww, const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1))); const __m128i one = _mm_set1_epi16(1); const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200); - __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000); + __m128i rep = second_half ? _mm_set1_epi16((short)0x8008) + : _mm_set1_epi16((short)0x8000); for (int i = 0; i < h; ++i) { __m128i b = _mm_shuffle_epi8(pixels[0], rep); diff --git a/media/libaom/src/aom_dsp/x86/intrapred_x86.h b/media/libaom/src/aom_dsp/x86/intrapred_x86.h new file mode 100644 index 0000000000..b13f575a76 --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/intrapred_x86.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AOM_DSP_X86_INTRAPRED_X86_H_ +#define AOM_AOM_DSP_X86_INTRAPRED_X86_H_ + +#include <emmintrin.h> // SSE2 +#include "aom/aom_integer.h" +#include "config/aom_config.h" + +static INLINE __m128i dc_sum_16_sse2(const uint8_t *ref) { + __m128i x = _mm_load_si128((__m128i const *)ref); + const __m128i zero = _mm_setzero_si128(); + x = _mm_sad_epu8(x, zero); + const __m128i high = _mm_unpackhi_epi64(x, x); + return _mm_add_epi16(x, high); +} + +static INLINE __m128i dc_sum_32_sse2(const uint8_t *ref) { + __m128i x0 = _mm_load_si128((__m128i const *)ref); + __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16)); + const __m128i zero = _mm_setzero_si128(); + x0 = _mm_sad_epu8(x0, zero); + x1 = _mm_sad_epu8(x1, zero); + x0 = _mm_add_epi16(x0, x1); + const __m128i high = _mm_unpackhi_epi64(x0, x0); + return _mm_add_epi16(x0, high); +} + +#endif // AOM_AOM_DSP_X86_INTRAPRED_X86_H_ diff --git a/media/libaom/src/aom_dsp/x86/jnt_sad_ssse3.c b/media/libaom/src/aom_dsp/x86/jnt_sad_ssse3.c index c3c88245a4..2e3e2be105 100644 --- a/media/libaom/src/aom_dsp/x86/jnt_sad_ssse3.c +++ b/media/libaom/src/aom_dsp/x86/jnt_sad_ssse3.c @@ -192,47 +192,47 @@ unsigned int aom_sad128xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b, return res; } -#define jnt_sadMxN_sse2(m, n) \ - unsigned int aom_jnt_sad##m##x##n##_avg_ssse3( \ +#define dist_wtd_sadMxN_sse2(m, n) \ + unsigned int aom_dist_wtd_sad##m##x##n##_avg_ssse3( \ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ - const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ + const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \ uint8_t comp_pred[m * n]; \ - aom_jnt_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride, \ - jcp_param); \ + aom_dist_wtd_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride, \ + jcp_param); \ return aom_sad##m##xh_sse2(src, src_stride, comp_pred, m, m, n); \ } -#define jnt_sadMxN_avx2(m, n) \ - unsigned int aom_jnt_sad##m##x##n##_avg_avx2( \ +#define dist_wtd_sadMxN_avx2(m, n) \ + unsigned int aom_dist_wtd_sad##m##x##n##_avg_avx2( \ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ - const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ + const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \ uint8_t comp_pred[m * n]; \ - aom_jnt_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride, \ - jcp_param); \ + aom_dist_wtd_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride, \ + jcp_param); \ return aom_sad##m##xh_avx2(src, src_stride, comp_pred, m, m, n); \ } /* clang-format off */ -jnt_sadMxN_sse2(128, 128) -jnt_sadMxN_sse2(128, 64) -jnt_sadMxN_sse2(64, 128) -jnt_sadMxN_sse2(64, 64) -jnt_sadMxN_sse2(64, 32) -jnt_sadMxN_sse2(32, 64) -jnt_sadMxN_sse2(32, 32) -jnt_sadMxN_sse2(32, 16) -jnt_sadMxN_sse2(16, 32) -jnt_sadMxN_sse2(16, 16) -jnt_sadMxN_sse2(16, 8) -jnt_sadMxN_sse2(8, 16) -jnt_sadMxN_sse2(8, 8) -jnt_sadMxN_sse2(8, 4) -jnt_sadMxN_sse2(4, 8) -jnt_sadMxN_sse2(4, 4) -jnt_sadMxN_sse2(4, 16) -jnt_sadMxN_sse2(16, 4) -jnt_sadMxN_sse2(8, 32) -jnt_sadMxN_sse2(32, 8) -jnt_sadMxN_sse2(16, 64) -jnt_sadMxN_sse2(64, 16) +dist_wtd_sadMxN_sse2(128, 128) +dist_wtd_sadMxN_sse2(128, 64) +dist_wtd_sadMxN_sse2(64, 128) +dist_wtd_sadMxN_sse2(64, 64) +dist_wtd_sadMxN_sse2(64, 32) +dist_wtd_sadMxN_sse2(32, 64) +dist_wtd_sadMxN_sse2(32, 32) +dist_wtd_sadMxN_sse2(32, 16) +dist_wtd_sadMxN_sse2(16, 32) +dist_wtd_sadMxN_sse2(16, 16) +dist_wtd_sadMxN_sse2(16, 8) +dist_wtd_sadMxN_sse2(8, 16) 
+dist_wtd_sadMxN_sse2(8, 8) +dist_wtd_sadMxN_sse2(8, 4) +dist_wtd_sadMxN_sse2(4, 8) +dist_wtd_sadMxN_sse2(4, 4) +dist_wtd_sadMxN_sse2(4, 16) +dist_wtd_sadMxN_sse2(16, 4) +dist_wtd_sadMxN_sse2(8, 32) +dist_wtd_sadMxN_sse2(32, 8) +dist_wtd_sadMxN_sse2(16, 64) +dist_wtd_sadMxN_sse2(64, 16) /* clang-format on */ diff --git a/media/libaom/src/aom_dsp/x86/jnt_variance_ssse3.c b/media/libaom/src/aom_dsp/x86/jnt_variance_ssse3.c index f9a41a210b..c8b02f5560 100644 --- a/media/libaom/src/aom_dsp/x86/jnt_variance_ssse3.c +++ b/media/libaom/src/aom_dsp/x86/jnt_variance_ssse3.c @@ -29,7 +29,7 @@ void aom_var_filter_block2d_bil_second_pass_ssse3( unsigned int pixel_step, unsigned int output_height, unsigned int output_width, const uint8_t *filter); -static INLINE void compute_jnt_comp_avg(__m128i *p0, __m128i *p1, +static INLINE void compute_dist_wtd_avg(__m128i *p0, __m128i *p1, const __m128i *w, const __m128i *r, void *const result) { __m128i p_lo = _mm_unpacklo_epi8(*p0, *p1); @@ -45,10 +45,10 @@ static INLINE void compute_jnt_comp_avg(__m128i *p0, __m128i *p1, xx_storeu_128(result, _mm_packus_epi16(shift_lo, shift_hi)); } -void aom_jnt_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred, - int width, int height, const uint8_t *ref, - int ref_stride, - const JNT_COMP_PARAMS *jcp_param) { +void aom_dist_wtd_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred, + int width, int height, const uint8_t *ref, + int ref_stride, + const DIST_WTD_COMP_PARAMS *jcp_param) { int i; const uint8_t w0 = (uint8_t)jcp_param->fwd_offset; const uint8_t w1 = (uint8_t)jcp_param->bck_offset; @@ -67,7 +67,7 @@ void aom_jnt_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred, __m128i p0 = xx_loadu_128(ref); __m128i p1 = xx_loadu_128(pred); - compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred); + compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred); comp_pred += 16; pred += 16; @@ -85,7 +85,7 @@ void aom_jnt_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred, __m128i p0 = _mm_unpacklo_epi64(p0_0, p0_1); __m128i p1 = xx_loadu_128(pred); - compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred); + compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred); comp_pred += 16; pred += 16; @@ -107,7 +107,7 @@ void aom_jnt_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred, row3[0], row3[1], row3[2], row3[3]); __m128i p1 = xx_loadu_128(pred); - compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred); + compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred); comp_pred += 16; pred += 16; @@ -116,11 +116,11 @@ void aom_jnt_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred, } } -void aom_jnt_comp_avg_upsampled_pred_ssse3( +void aom_dist_wtd_comp_avg_upsampled_pred_ssse3( MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, - int ref_stride, const JNT_COMP_PARAMS *jcp_param, int subpel_search) { + int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search) { int n; int i; aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, @@ -141,52 +141,52 @@ void aom_jnt_comp_avg_upsampled_pred_ssse3( __m128i p0 = xx_loadu_128(comp_pred); __m128i p1 = xx_loadu_128(pred); - compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred); + compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred); comp_pred += 16; pred += 16; } } -#define JNT_SUBPIX_AVG_VAR(W, H) \ - uint32_t aom_jnt_sub_pixel_avg_variance##W##x##H##_ssse3( \ - const uint8_t *a, int a_stride, 
int xoffset, int yoffset, \ - const uint8_t *b, int b_stride, uint32_t *sse, \ - const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint8_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ - \ - aom_var_filter_block2d_bil_first_pass_ssse3( \ - a, fdata3, a_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ - aom_var_filter_block2d_bil_second_pass_ssse3( \ - fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ - \ - aom_jnt_comp_avg_pred_ssse3(temp3, second_pred, W, H, temp2, W, \ - jcp_param); \ - \ - return aom_variance##W##x##H(temp3, W, b, b_stride, sse); \ +#define DIST_WTD_SUBPIX_AVG_VAR(W, H) \ + uint32_t aom_dist_wtd_sub_pixel_avg_variance##W##x##H##_ssse3( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int b_stride, uint32_t *sse, \ + const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ + \ + aom_var_filter_block2d_bil_first_pass_ssse3( \ + a, fdata3, a_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_var_filter_block2d_bil_second_pass_ssse3( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_dist_wtd_comp_avg_pred_ssse3(temp3, second_pred, W, H, temp2, W, \ + jcp_param); \ + \ + return aom_variance##W##x##H(temp3, W, b, b_stride, sse); \ } -JNT_SUBPIX_AVG_VAR(128, 128) -JNT_SUBPIX_AVG_VAR(128, 64) -JNT_SUBPIX_AVG_VAR(64, 128) -JNT_SUBPIX_AVG_VAR(64, 64) -JNT_SUBPIX_AVG_VAR(64, 32) -JNT_SUBPIX_AVG_VAR(32, 64) -JNT_SUBPIX_AVG_VAR(32, 32) -JNT_SUBPIX_AVG_VAR(32, 16) -JNT_SUBPIX_AVG_VAR(16, 32) -JNT_SUBPIX_AVG_VAR(16, 16) -JNT_SUBPIX_AVG_VAR(16, 8) -JNT_SUBPIX_AVG_VAR(8, 16) -JNT_SUBPIX_AVG_VAR(8, 8) -JNT_SUBPIX_AVG_VAR(8, 4) -JNT_SUBPIX_AVG_VAR(4, 8) -JNT_SUBPIX_AVG_VAR(4, 4) -JNT_SUBPIX_AVG_VAR(4, 16) -JNT_SUBPIX_AVG_VAR(16, 4) -JNT_SUBPIX_AVG_VAR(8, 32) -JNT_SUBPIX_AVG_VAR(32, 8) -JNT_SUBPIX_AVG_VAR(16, 64) -JNT_SUBPIX_AVG_VAR(64, 16) +DIST_WTD_SUBPIX_AVG_VAR(128, 128) +DIST_WTD_SUBPIX_AVG_VAR(128, 64) +DIST_WTD_SUBPIX_AVG_VAR(64, 128) +DIST_WTD_SUBPIX_AVG_VAR(64, 64) +DIST_WTD_SUBPIX_AVG_VAR(64, 32) +DIST_WTD_SUBPIX_AVG_VAR(32, 64) +DIST_WTD_SUBPIX_AVG_VAR(32, 32) +DIST_WTD_SUBPIX_AVG_VAR(32, 16) +DIST_WTD_SUBPIX_AVG_VAR(16, 32) +DIST_WTD_SUBPIX_AVG_VAR(16, 16) +DIST_WTD_SUBPIX_AVG_VAR(16, 8) +DIST_WTD_SUBPIX_AVG_VAR(8, 16) +DIST_WTD_SUBPIX_AVG_VAR(8, 8) +DIST_WTD_SUBPIX_AVG_VAR(8, 4) +DIST_WTD_SUBPIX_AVG_VAR(4, 8) +DIST_WTD_SUBPIX_AVG_VAR(4, 4) +DIST_WTD_SUBPIX_AVG_VAR(4, 16) +DIST_WTD_SUBPIX_AVG_VAR(16, 4) +DIST_WTD_SUBPIX_AVG_VAR(8, 32) +DIST_WTD_SUBPIX_AVG_VAR(32, 8) +DIST_WTD_SUBPIX_AVG_VAR(16, 64) +DIST_WTD_SUBPIX_AVG_VAR(64, 16) diff --git a/media/libaom/src/aom_dsp/x86/loopfilter_sse2.c b/media/libaom/src/aom_dsp/x86/loopfilter_sse2.c index 9d88b5e493..d534683fce 100644 --- a/media/libaom/src/aom_dsp/x86/loopfilter_sse2.c +++ b/media/libaom/src/aom_dsp/x86/loopfilter_sse2.c @@ -16,237 +16,69 @@ #include "aom_dsp/x86/synonyms.h" #include "aom_ports/mem.h" #include "aom_ports/emmintrin_compat.h" +#include "aom_dsp/x86/lpf_common_sse2.h" static INLINE __m128i abs_diff(__m128i a, __m128i b) { return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a)); } -static INLINE void transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1, - __m128i *x2, __m128i *x3, - __m128i *d0, __m128i *d1, - __m128i *d2, __m128i *d3) { - // input - // x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx - // x1 10 11 12 13 14 15 16 17 
xx xx xx xx xx xx xx xx - // x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx - // x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx - // output - // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx - // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx - // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx - // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx - - __m128i w0, w1; - - w0 = _mm_unpacklo_epi8( - *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 - w1 = _mm_unpacklo_epi8( - *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 - - *d0 = _mm_unpacklo_epi16( - w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - - *d1 = _mm_srli_si128(*d0, - 4); // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx - *d2 = _mm_srli_si128(*d0, - 8); // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx - *d3 = _mm_srli_si128(*d0, - 12); // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx -} - -static INLINE void transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1, __m128i *x2, - __m128i *x3, __m128i *d0, __m128i *d1, - __m128i *d2, __m128i *d3, __m128i *d4, - __m128i *d5, __m128i *d6, - __m128i *d7) { - // input - // x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx - // x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx - // x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx - // x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx - // output - // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx - // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx - // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx - // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx - // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx - // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx - // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx - // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx - - __m128i w0, w1, ww0, ww1; - +// this function treats its input as 2 parallel 8x4 matrices, transposes each of +// them to 4x8 independently while flipping the second matrix horizontally. 
+// Used for 14 taps pq pairs creation +static INLINE void transpose_pq_14_sse2(__m128i *x0, __m128i *x1, __m128i *x2, + __m128i *x3, __m128i *q0p0, + __m128i *q1p1, __m128i *q2p2, + __m128i *q3p3, __m128i *q4p4, + __m128i *q5p5, __m128i *q6p6, + __m128i *q7p7) { + __m128i w0, w1, ww0, ww1, w2, w3, ww2, ww3; w0 = _mm_unpacklo_epi8( *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 w1 = _mm_unpacklo_epi8( *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + w2 = _mm_unpackhi_epi8( + *x0, *x1); // 08 18 09 19 010 110 011 111 012 112 013 113 014 114 015 115 + w3 = _mm_unpackhi_epi8( + *x2, *x3); // 28 38 29 39 210 310 211 311 212 312 213 313 214 314 215 315 ww0 = _mm_unpacklo_epi16( - w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 ww1 = _mm_unpackhi_epi16( - w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 - - *d0 = ww0; // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx - *d1 = _mm_srli_si128(ww0, - 4); // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx - *d2 = _mm_srli_si128(ww0, - 8); // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx - *d3 = _mm_srli_si128(ww0, - 12); // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx - - *d4 = ww1; // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx - *d5 = _mm_srli_si128(ww1, - 4); // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx - *d6 = _mm_srli_si128(ww1, - 8); // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx - *d7 = _mm_srli_si128(ww1, - 12); // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx -} - -static INLINE void transpose8x8_low_sse2(__m128i *x0, __m128i *x1, __m128i *x2, - __m128i *x3, __m128i *x4, __m128i *x5, - __m128i *x6, __m128i *x7, __m128i *d0, - __m128i *d1, __m128i *d2, - __m128i *d3) { - // input - // x0 00 01 02 03 04 05 06 07 - // x1 10 11 12 13 14 15 16 17 - // x2 20 21 22 23 24 25 26 27 - // x3 30 31 32 33 34 35 36 37 - // x4 40 41 42 43 44 45 46 47 - // x5 50 51 52 53 54 55 56 57 - // x6 60 61 62 63 64 65 66 67 - // x7 70 71 72 73 74 75 76 77 - // output - // d0 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx - // d1 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx - // d2 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx - // d3 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx - - __m128i w0, w1, w2, w3, w4, w5; - - w0 = _mm_unpacklo_epi8( - *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 - - w1 = _mm_unpacklo_epi8( - *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 - - w2 = _mm_unpacklo_epi8( - *x4, *x5); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 - - w3 = _mm_unpacklo_epi8( - *x6, *x7); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 - - w4 = _mm_unpacklo_epi16( - w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - w5 = _mm_unpacklo_epi16( - w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 - - *d0 = _mm_unpacklo_epi32( - w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 - *d1 = _mm_srli_si128(*d0, 8); - *d2 = _mm_unpackhi_epi32( - w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 - *d3 = _mm_srli_si128(*d2, 8); -} - -static INLINE void transpose8x8_sse2(__m128i *x0, __m128i *x1, __m128i *x2, - __m128i *x3, __m128i *x4, __m128i *x5, - __m128i *x6, __m128i *x7, __m128i *d0d1, - __m128i *d2d3, __m128i *d4d5, - __m128i *d6d7) { - __m128i w0, w1, w2, w3, w4, w5, w6, w7; - // x0 00 01 02 03 04 05 06 07 - // x1 10 11 12 13 14 15 16 17 - w0 = _mm_unpacklo_epi8( - *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 - - // x2 20 21 22 23 24 25 26 
27 - // x3 30 31 32 33 34 35 36 37 - w1 = _mm_unpacklo_epi8( - *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 - - // x4 40 41 42 43 44 45 46 47 - // x5 50 51 52 53 54 55 56 57 - w2 = _mm_unpacklo_epi8( - *x4, *x5); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 - - // x6 60 61 62 63 64 65 66 67 - // x7 70 71 72 73 74 75 76 77 - w3 = _mm_unpacklo_epi8( - *x6, *x7); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 - - w4 = _mm_unpacklo_epi16( - w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - w5 = _mm_unpacklo_epi16( - w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 - - *d0d1 = _mm_unpacklo_epi32( - w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 - *d2d3 = _mm_unpackhi_epi32( - w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 - - w6 = _mm_unpackhi_epi16( - w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 - w7 = _mm_unpackhi_epi16( - w2, w3); // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 - - *d4d5 = _mm_unpacklo_epi32( - w6, w7); // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 - *d6d7 = _mm_unpackhi_epi32( - w6, w7); // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 -} + w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + ww2 = _mm_unpacklo_epi16( + w2, w3); // 08 18 28 38 09 19 29 39 010 110 210 310 011 111 211 311 + ww3 = _mm_unpackhi_epi16( + w2, + w3); // 012 112 212 312 013 113 213 313 014 114 214 314 015 115 215 315 -static INLINE void transpose16x8_8x16_sse2( - __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, - __m128i *x5, __m128i *x6, __m128i *x7, __m128i *x8, __m128i *x9, - __m128i *x10, __m128i *x11, __m128i *x12, __m128i *x13, __m128i *x14, - __m128i *x15, __m128i *d0, __m128i *d1, __m128i *d2, __m128i *d3, - __m128i *d4, __m128i *d5, __m128i *d6, __m128i *d7) { - __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9; - __m128i w10, w11, w12, w13, w14, w15; - - w0 = _mm_unpacklo_epi8(*x0, *x1); - w1 = _mm_unpacklo_epi8(*x2, *x3); - w2 = _mm_unpacklo_epi8(*x4, *x5); - w3 = _mm_unpacklo_epi8(*x6, *x7); - - w8 = _mm_unpacklo_epi8(*x8, *x9); - w9 = _mm_unpacklo_epi8(*x10, *x11); - w10 = _mm_unpacklo_epi8(*x12, *x13); - w11 = _mm_unpacklo_epi8(*x14, *x15); - - w4 = _mm_unpacklo_epi16(w0, w1); - w5 = _mm_unpacklo_epi16(w2, w3); - w12 = _mm_unpacklo_epi16(w8, w9); - w13 = _mm_unpacklo_epi16(w10, w11); - - w6 = _mm_unpacklo_epi32(w4, w5); - w7 = _mm_unpackhi_epi32(w4, w5); - w14 = _mm_unpacklo_epi32(w12, w13); - w15 = _mm_unpackhi_epi32(w12, w13); - - // Store first 4-line result - *d0 = _mm_unpacklo_epi64(w6, w14); - *d1 = _mm_unpackhi_epi64(w6, w14); - *d2 = _mm_unpacklo_epi64(w7, w15); - *d3 = _mm_unpackhi_epi64(w7, w15); - - w4 = _mm_unpackhi_epi16(w0, w1); - w5 = _mm_unpackhi_epi16(w2, w3); - w12 = _mm_unpackhi_epi16(w8, w9); - w13 = _mm_unpackhi_epi16(w10, w11); - - w6 = _mm_unpacklo_epi32(w4, w5); - w7 = _mm_unpackhi_epi32(w4, w5); - w14 = _mm_unpacklo_epi32(w12, w13); - w15 = _mm_unpackhi_epi32(w12, w13); - - // Store second 4-line result - *d4 = _mm_unpacklo_epi64(w6, w14); - *d5 = _mm_unpackhi_epi64(w6, w14); - *d6 = _mm_unpacklo_epi64(w7, w15); - *d7 = _mm_unpackhi_epi64(w7, w15); + *q7p7 = _mm_unpacklo_epi32( + ww0, + _mm_srli_si128( + ww3, 12)); // 00 10 20 30 015 115 215 315 xx xx xx xx xx xx xx xx + *q6p6 = _mm_unpackhi_epi32( + _mm_slli_si128(ww0, 4), + ww3); // 01 11 21 31 014 114 214 314 xx xx xx xxxx xx xx xx + *q5p5 = _mm_unpackhi_epi32( + ww0, + _mm_slli_si128( + ww3, 4)); // 02 12 22 32 013 113 213 313 xx xx xx x xx xx xx xxx + *q4p4 = _mm_unpacklo_epi32( + 
_mm_srli_si128(ww0, 12), + ww3); // 03 13 23 33 012 112 212 312 xx xx xx xx xx xx xx xx + *q3p3 = _mm_unpacklo_epi32( + ww1, + _mm_srli_si128( + ww2, 12)); // 04 14 24 34 011 111 211 311 xx xx xx xx xx xx xx xx + *q2p2 = _mm_unpackhi_epi32( + _mm_slli_si128(ww1, 4), + ww2); // 05 15 25 35 010 110 210 310 xx xx xx xx xx xx xx xx + *q1p1 = _mm_unpackhi_epi32( + ww1, + _mm_slli_si128( + ww2, 4)); // 06 16 26 36 09 19 29 39 xx xx xx xx xx xx xx xx + *q0p0 = _mm_unpacklo_epi32( + _mm_srli_si128(ww1, 12), + ww2); // 07 17 27 37 08 18 28 38 xx xx xx xx xx xx xx xx } // this function treats its input as 2 parallel 8x4 matrices, transposes each of @@ -306,116 +138,6 @@ static INLINE void transpose_pq_14_inv_sse2(__m128i *x0, __m128i *x1, *pq3 = _mm_unpackhi_epi64(d2, d3); // pq } -static INLINE void transpose8x16_16x8_sse2( - __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, - __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0d1, __m128i *d2d3, - __m128i *d4d5, __m128i *d6d7, __m128i *d8d9, __m128i *d10d11, - __m128i *d12d13, __m128i *d14d15) { - __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9; - __m128i w10, w11, w12, w13, w14, w15; - - w0 = _mm_unpacklo_epi8(*x0, *x1); - w1 = _mm_unpacklo_epi8(*x2, *x3); - w2 = _mm_unpacklo_epi8(*x4, *x5); - w3 = _mm_unpacklo_epi8(*x6, *x7); - - w8 = _mm_unpackhi_epi8(*x0, *x1); - w9 = _mm_unpackhi_epi8(*x2, *x3); - w10 = _mm_unpackhi_epi8(*x4, *x5); - w11 = _mm_unpackhi_epi8(*x6, *x7); - - w4 = _mm_unpacklo_epi16(w0, w1); - w5 = _mm_unpacklo_epi16(w2, w3); - w12 = _mm_unpacklo_epi16(w8, w9); - w13 = _mm_unpacklo_epi16(w10, w11); - - w6 = _mm_unpacklo_epi32(w4, w5); - w7 = _mm_unpackhi_epi32(w4, w5); - w14 = _mm_unpacklo_epi32(w12, w13); - w15 = _mm_unpackhi_epi32(w12, w13); - - // Store first 4-line result - *d0d1 = _mm_unpacklo_epi64(w6, w14); - *d2d3 = _mm_unpackhi_epi64(w6, w14); - *d4d5 = _mm_unpacklo_epi64(w7, w15); - *d6d7 = _mm_unpackhi_epi64(w7, w15); - - w4 = _mm_unpackhi_epi16(w0, w1); - w5 = _mm_unpackhi_epi16(w2, w3); - w12 = _mm_unpackhi_epi16(w8, w9); - w13 = _mm_unpackhi_epi16(w10, w11); - - w6 = _mm_unpacklo_epi32(w4, w5); - w7 = _mm_unpackhi_epi32(w4, w5); - w14 = _mm_unpacklo_epi32(w12, w13); - w15 = _mm_unpackhi_epi32(w12, w13); - - // Store second 4-line result - *d8d9 = _mm_unpacklo_epi64(w6, w14); - *d10d11 = _mm_unpackhi_epi64(w6, w14); - *d12d13 = _mm_unpacklo_epi64(w7, w15); - *d14d15 = _mm_unpackhi_epi64(w7, w15); -} - -// this function treats its input as 2 parallel 8x4 matrices, transposes each of -// them to 4x8 independently while flipping the second matrix horizontaly. 
Used -// for 14 taps pq pairs creation -static INLINE void transpose_pq_14_sse2(__m128i *x0, __m128i *x1, __m128i *x2, - __m128i *x3, __m128i *q0p0, - __m128i *q1p1, __m128i *q2p2, - __m128i *q3p3, __m128i *q4p4, - __m128i *q5p5, __m128i *q6p6, - __m128i *q7p7) { - __m128i w0, w1, ww0, ww1, w2, w3, ww2, ww3; - w0 = _mm_unpacklo_epi8( - *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 - w1 = _mm_unpacklo_epi8( - *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 - w2 = _mm_unpackhi_epi8( - *x0, *x1); // 08 18 09 19 010 110 011 111 012 112 013 113 014 114 015 115 - w3 = _mm_unpackhi_epi8( - *x2, *x3); // 28 38 29 39 210 310 211 311 212 312 213 313 214 314 215 315 - - ww0 = _mm_unpacklo_epi16( - w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - ww1 = _mm_unpackhi_epi16( - w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 - ww2 = _mm_unpacklo_epi16( - w2, w3); // 08 18 28 38 09 19 29 39 010 110 210 310 011 111 211 311 - ww3 = _mm_unpackhi_epi16( - w2, - w3); // 012 112 212 312 013 113 213 313 014 114 214 314 015 115 215 315 - - *q7p7 = _mm_unpacklo_epi32( - ww0, - _mm_srli_si128( - ww3, 12)); // 00 10 20 30 015 115 215 315 xx xx xx xx xx xx xx xx - *q6p6 = _mm_unpackhi_epi32( - _mm_slli_si128(ww0, 4), - ww3); // 01 11 21 31 014 114 214 314 xx xx xx xxxx xx xx xx - *q5p5 = _mm_unpackhi_epi32( - ww0, - _mm_slli_si128( - ww3, 4)); // 02 12 22 32 013 113 213 313 xx xx xx x xx xx xx xxx - *q4p4 = _mm_unpacklo_epi32( - _mm_srli_si128(ww0, 12), - ww3); // 03 13 23 33 012 112 212 312 xx xx xx xx xx xx xx xx - *q3p3 = _mm_unpacklo_epi32( - ww1, - _mm_srli_si128( - ww2, 12)); // 04 14 24 34 011 111 211 311 xx xx xx xx xx xx xx xx - *q2p2 = _mm_unpackhi_epi32( - _mm_slli_si128(ww1, 4), - ww2); // 05 15 25 35 010 110 210 310 xx xx xx xx xx xx xx xx - *q1p1 = _mm_unpackhi_epi32( - ww1, - _mm_slli_si128( - ww2, 4)); // 06 16 26 36 09 19 29 39 xx xx xx xx xx xx xx xx - *q0p0 = _mm_unpacklo_epi32( - _mm_srli_si128(ww1, 12), - ww2); // 07 17 27 37 08 18 28 38 xx xx xx xx xx xx xx xx -} - static AOM_FORCE_INLINE void filter4_sse2(__m128i *p1p0, __m128i *q1q0, __m128i *hev, __m128i *mask, __m128i *qs1qs0, __m128i *ps1ps0) { @@ -424,7 +146,7 @@ static AOM_FORCE_INLINE void filter4_sse2(__m128i *p1p0, __m128i *q1q0, __m128i hev1; const __m128i t3t4 = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 4, 4, 4, 4); - const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i t80 = _mm_set1_epi8((char)0x80); const __m128i ff = _mm_cmpeq_epi8(t80, t80); ps1ps0_work = _mm_xor_si128(*p1p0, t80); /* ^ 0x80 */ @@ -473,7 +195,7 @@ static AOM_FORCE_INLINE void filter4_dual_sse2(__m128i *p1p0, __m128i *q1q0, __m128i *ps1ps0) { const __m128i t3t4 = _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4); - const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i t80 = _mm_set1_epi8((char)0x80); __m128i filter, filter2filter1, work; __m128i ps1ps0_work, qs1qs0_work; __m128i hev1; @@ -616,10 +338,10 @@ void aom_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */, __m128i qs1qs0, ps1ps0; __m128i p1, p0, q0, q1; - p1 = _mm_cvtsi32_si128(*(int *)(s - 2 * p)); - p0 = _mm_cvtsi32_si128(*(int *)(s - 1 * p)); - q0 = _mm_cvtsi32_si128(*(int *)(s + 0 * p)); - q1 = _mm_cvtsi32_si128(*(int *)(s + 1 * p)); + p1 = xx_loadl_32(s - 2 * p); + p0 = xx_loadl_32(s - 1 * p); + q0 = xx_loadl_32(s - 0 * p); + q1 = xx_loadl_32(s + 1 * p); lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &qs1qs0, &ps1ps0); @@ -688,7 +410,7 @@ static AOM_FORCE_INLINE void lpf_internal_14_dual_sse2( __m128i fe, ff, 
work; abs_p1p0 = abs_diff(*q1p1, *q0p0); abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); - fe = _mm_set1_epi8(0xfe); + fe = _mm_set1_epi8((char)0xfe); ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); abs_p0q0 = abs_diff(p1p0, q1q0); abs_p1q1 = _mm_srli_si128(abs_p0q0, 8); @@ -992,7 +714,7 @@ static AOM_FORCE_INLINE void lpf_internal_14_sse2( __m128i abs_p1q1, abs_p0q0, abs_q1q0; abs_p1p0 = abs_diff(*q1p1, *q0p0); abs_q1q0 = _mm_srli_si128(abs_p1p0, 4); - fe = _mm_set1_epi8(0xfe); + fe = _mm_set1_epi8((char)0xfe); ff = _mm_cmpeq_epi8(fe, fe); abs_p0q0 = abs_diff(p1p0, q1q0); abs_p1q1 = _mm_srli_si128(abs_p0q0, 4); @@ -1241,23 +963,16 @@ void aom_lpf_horizontal_14_sse2(unsigned char *s, int p, __m128i limit = _mm_load_si128((const __m128i *)_limit); __m128i thresh = _mm_load_si128((const __m128i *)_thresh); - q4p4 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 5 * p)), - _mm_cvtsi32_si128(*(int *)(s + 4 * p))); - q3p3 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 4 * p)), - _mm_cvtsi32_si128(*(int *)(s + 3 * p))); - q2p2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 3 * p)), - _mm_cvtsi32_si128(*(int *)(s + 2 * p))); - q1p1 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 2 * p)), - _mm_cvtsi32_si128(*(int *)(s + 1 * p))); + q4p4 = _mm_unpacklo_epi32(xx_loadl_32(s - 5 * p), xx_loadl_32(s + 4 * p)); + q3p3 = _mm_unpacklo_epi32(xx_loadl_32(s - 4 * p), xx_loadl_32(s + 3 * p)); + q2p2 = _mm_unpacklo_epi32(xx_loadl_32(s - 3 * p), xx_loadl_32(s + 2 * p)); + q1p1 = _mm_unpacklo_epi32(xx_loadl_32(s - 2 * p), xx_loadl_32(s + 1 * p)); - q0p0 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 1 * p)), - _mm_cvtsi32_si128(*(int *)(s - 0 * p))); + q0p0 = _mm_unpacklo_epi32(xx_loadl_32(s - 1 * p), xx_loadl_32(s - 0 * p)); - q5p5 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 6 * p)), - _mm_cvtsi32_si128(*(int *)(s + 5 * p))); + q5p5 = _mm_unpacklo_epi32(xx_loadl_32(s - 6 * p), xx_loadl_32(s + 5 * p)); - q6p6 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 7 * p)), - _mm_cvtsi32_si128(*(int *)(s + 6 * p))); + q6p6 = _mm_unpacklo_epi32(xx_loadl_32(s - 7 * p), xx_loadl_32(s + 6 * p)); lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit, &limit, &thresh); @@ -1288,7 +1003,7 @@ static AOM_FORCE_INLINE void lpf_internal_6_dual_sse2( *q1q0 = _mm_unpackhi_epi64(q0p0, q1p1); const __m128i one = _mm_set1_epi8(1); - const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i fe = _mm_set1_epi8((char)0xfe); const __m128i ff = _mm_cmpeq_epi8(fe, fe); { @@ -1417,7 +1132,7 @@ static AOM_FORCE_INLINE void lpf_internal_6_sse2( *q1q0 = _mm_unpacklo_epi32(*q0, *q1); const __m128i one = _mm_set1_epi8(1); - const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i fe = _mm_set1_epi8((char)0xfe); const __m128i ff = _mm_cmpeq_epi8(fe, fe); { // filter_mask and hev_mask @@ -1543,12 +1258,12 @@ void aom_lpf_horizontal_6_sse2(unsigned char *s, int p, __m128i limit = _mm_load_si128((__m128i *)_limit); __m128i thresh = _mm_load_si128((__m128i *)_thresh); - p2 = _mm_cvtsi32_si128(*(int *)(s - 3 * p)); - p1 = _mm_cvtsi32_si128(*(int *)(s - 2 * p)); - p0 = _mm_cvtsi32_si128(*(int *)(s - 1 * p)); - q0 = _mm_cvtsi32_si128(*(int *)(s - 0 * p)); - q1 = _mm_cvtsi32_si128(*(int *)(s + 1 * p)); - q2 = _mm_cvtsi32_si128(*(int *)(s + 2 * p)); + p2 = xx_loadl_32(s - 3 * p); + p1 = xx_loadl_32(s - 2 * p); + p0 = xx_loadl_32(s - 1 * p); + q0 = xx_loadl_32(s - 0 * p); + q1 = xx_loadl_32(s + 1 * p); + q2 = xx_loadl_32(s + 2 * p); lpf_internal_6_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit, &limit, 
&thresh); @@ -1622,7 +1337,7 @@ static AOM_FORCE_INLINE void lpf_internal_8_sse2( // otherwise - not const __m128i one = _mm_set1_epi8(1); - const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i fe = _mm_set1_epi8((char)0xfe); const __m128i ff = _mm_cmpeq_epi8(fe, fe); __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; @@ -1777,7 +1492,7 @@ static AOM_FORCE_INLINE void lpf_internal_8_dual_sse2( // otherwise - not const __m128i one = _mm_set1_epi8(1); - const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i fe = _mm_set1_epi8((char)0xfe); const __m128i ff = _mm_cmpeq_epi8(fe, fe); __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; @@ -1895,20 +1610,20 @@ void aom_lpf_horizontal_8_sse2(unsigned char *s, int p, const unsigned char *_blimit, const unsigned char *_limit, const unsigned char *_thresh) { - __m128i p2, p1, p0, q0, q1, q2, p3, q3; + __m128i p3, p2, p1, p0, q0, q1, q2, q3; __m128i q1q0, p1p0; __m128i blimit = _mm_load_si128((const __m128i *)_blimit); __m128i limit = _mm_load_si128((const __m128i *)_limit); __m128i thresh = _mm_load_si128((const __m128i *)_thresh); - p3 = _mm_cvtsi32_si128(*(int *)(s - 4 * p)); - p2 = _mm_cvtsi32_si128(*(int *)(s - 3 * p)); - p1 = _mm_cvtsi32_si128(*(int *)(s - 2 * p)); - p0 = _mm_cvtsi32_si128(*(int *)(s - 1 * p)); - q0 = _mm_cvtsi32_si128(*(int *)(s - 0 * p)); - q1 = _mm_cvtsi32_si128(*(int *)(s + 1 * p)); - q2 = _mm_cvtsi32_si128(*(int *)(s + 2 * p)); - q3 = _mm_cvtsi32_si128(*(int *)(s + 3 * p)); + p3 = xx_loadl_32(s - 4 * p); + p2 = xx_loadl_32(s - 3 * p); + p1 = xx_loadl_32(s - 2 * p); + p0 = xx_loadl_32(s - 1 * p); + q0 = xx_loadl_32(s - 0 * p); + q1 = xx_loadl_32(s + 1 * p); + q2 = xx_loadl_32(s + 2 * p); + q3 = xx_loadl_32(s + 3 * p); lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit, &limit, &thresh); diff --git a/media/libaom/src/aom_dsp/x86/lpf_common_sse2.h b/media/libaom/src/aom_dsp/x86/lpf_common_sse2.h index 8970fe7dd6..6ed2cbfdf4 100644 --- a/media/libaom/src/aom_dsp/x86/lpf_common_sse2.h +++ b/media/libaom/src/aom_dsp/x86/lpf_common_sse2.h @@ -212,4 +212,284 @@ static INLINE void highbd_transpose8x16_sse2( d4 + 1, d5 + 1, d6 + 1, d7 + 1); } +// Low bit depth functions +static INLINE void transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1, + __m128i *x2, __m128i *x3, + __m128i *d0, __m128i *d1, + __m128i *d2, __m128i *d3) { + // input + // x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx + // x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx + // x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx + // x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx + // output + // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx + // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx + // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx + // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx + + __m128i w0, w1; + + w0 = _mm_unpacklo_epi8( + *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + w1 = _mm_unpacklo_epi8( + *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + + *d0 = _mm_unpacklo_epi16( + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + + *d1 = _mm_srli_si128(*d0, + 4); // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx + *d2 = _mm_srli_si128(*d0, + 8); // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx + *d3 = _mm_srli_si128(*d0, + 12); // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx +} + +static INLINE void transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1, __m128i *x2, + __m128i *x3, __m128i *d0, __m128i *d1, + __m128i *d2, __m128i *d3, __m128i *d4, + __m128i 
*d5, __m128i *d6, + __m128i *d7) { + // input + // x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx + // x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx + // x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx + // x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx + // output + // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx + // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx + // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx + // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx + // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx + // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx + // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx + // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx + + __m128i w0, w1, ww0, ww1; + + w0 = _mm_unpacklo_epi8( + *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + w1 = _mm_unpacklo_epi8( + *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + + ww0 = _mm_unpacklo_epi16( + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ww1 = _mm_unpackhi_epi16( + w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + + *d0 = ww0; // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx + *d1 = _mm_srli_si128(ww0, + 4); // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx + *d2 = _mm_srli_si128(ww0, + 8); // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx + *d3 = _mm_srli_si128(ww0, + 12); // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx + + *d4 = ww1; // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx + *d5 = _mm_srli_si128(ww1, + 4); // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx + *d6 = _mm_srli_si128(ww1, + 8); // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx + *d7 = _mm_srli_si128(ww1, + 12); // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx +} + +static INLINE void transpose8x8_low_sse2(__m128i *x0, __m128i *x1, __m128i *x2, + __m128i *x3, __m128i *x4, __m128i *x5, + __m128i *x6, __m128i *x7, __m128i *d0, + __m128i *d1, __m128i *d2, + __m128i *d3) { + // input + // x0 00 01 02 03 04 05 06 07 + // x1 10 11 12 13 14 15 16 17 + // x2 20 21 22 23 24 25 26 27 + // x3 30 31 32 33 34 35 36 37 + // x4 40 41 42 43 44 45 46 47 + // x5 50 51 52 53 54 55 56 57 + // x6 60 61 62 63 64 65 66 67 + // x7 70 71 72 73 74 75 76 77 + // output + // d0 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx + // d1 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx + // d2 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx + // d3 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx + + __m128i w0, w1, w2, w3, w4, w5; + + w0 = _mm_unpacklo_epi8( + *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + + w1 = _mm_unpacklo_epi8( + *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + + w2 = _mm_unpacklo_epi8( + *x4, *x5); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + + w3 = _mm_unpacklo_epi8( + *x6, *x7); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + + w4 = _mm_unpacklo_epi16( + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + w5 = _mm_unpacklo_epi16( + w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + + *d0 = _mm_unpacklo_epi32( + w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + *d1 = _mm_srli_si128(*d0, 8); + *d2 = _mm_unpackhi_epi32( + w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + *d3 = _mm_srli_si128(*d2, 8); +} + +static INLINE void transpose8x8_sse2(__m128i *x0, __m128i *x1, __m128i *x2, + __m128i *x3, __m128i *x4, __m128i *x5, + __m128i *x6, __m128i *x7, __m128i *d0d1, + __m128i *d2d3, __m128i *d4d5, + __m128i *d6d7) { + __m128i w0, w1, w2, w3, w4, w5, w6, w7; + // x0 
00 01 02 03 04 05 06 07 + // x1 10 11 12 13 14 15 16 17 + w0 = _mm_unpacklo_epi8( + *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + + // x2 20 21 22 23 24 25 26 27 + // x3 30 31 32 33 34 35 36 37 + w1 = _mm_unpacklo_epi8( + *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + + // x4 40 41 42 43 44 45 46 47 + // x5 50 51 52 53 54 55 56 57 + w2 = _mm_unpacklo_epi8( + *x4, *x5); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + + // x6 60 61 62 63 64 65 66 67 + // x7 70 71 72 73 74 75 76 77 + w3 = _mm_unpacklo_epi8( + *x6, *x7); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + + w4 = _mm_unpacklo_epi16( + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + w5 = _mm_unpacklo_epi16( + w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + + *d0d1 = _mm_unpacklo_epi32( + w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + *d2d3 = _mm_unpackhi_epi32( + w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + + w6 = _mm_unpackhi_epi16( + w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + w7 = _mm_unpackhi_epi16( + w2, w3); // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 + + *d4d5 = _mm_unpacklo_epi32( + w6, w7); // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 + *d6d7 = _mm_unpackhi_epi32( + w6, w7); // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 +} + +static INLINE void transpose16x8_8x16_sse2( + __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, + __m128i *x5, __m128i *x6, __m128i *x7, __m128i *x8, __m128i *x9, + __m128i *x10, __m128i *x11, __m128i *x12, __m128i *x13, __m128i *x14, + __m128i *x15, __m128i *d0, __m128i *d1, __m128i *d2, __m128i *d3, + __m128i *d4, __m128i *d5, __m128i *d6, __m128i *d7) { + __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9; + __m128i w10, w11, w12, w13, w14, w15; + + w0 = _mm_unpacklo_epi8(*x0, *x1); + w1 = _mm_unpacklo_epi8(*x2, *x3); + w2 = _mm_unpacklo_epi8(*x4, *x5); + w3 = _mm_unpacklo_epi8(*x6, *x7); + + w8 = _mm_unpacklo_epi8(*x8, *x9); + w9 = _mm_unpacklo_epi8(*x10, *x11); + w10 = _mm_unpacklo_epi8(*x12, *x13); + w11 = _mm_unpacklo_epi8(*x14, *x15); + + w4 = _mm_unpacklo_epi16(w0, w1); + w5 = _mm_unpacklo_epi16(w2, w3); + w12 = _mm_unpacklo_epi16(w8, w9); + w13 = _mm_unpacklo_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store first 4-line result + *d0 = _mm_unpacklo_epi64(w6, w14); + *d1 = _mm_unpackhi_epi64(w6, w14); + *d2 = _mm_unpacklo_epi64(w7, w15); + *d3 = _mm_unpackhi_epi64(w7, w15); + + w4 = _mm_unpackhi_epi16(w0, w1); + w5 = _mm_unpackhi_epi16(w2, w3); + w12 = _mm_unpackhi_epi16(w8, w9); + w13 = _mm_unpackhi_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store second 4-line result + *d4 = _mm_unpacklo_epi64(w6, w14); + *d5 = _mm_unpackhi_epi64(w6, w14); + *d6 = _mm_unpacklo_epi64(w7, w15); + *d7 = _mm_unpackhi_epi64(w7, w15); +} + +static INLINE void transpose8x16_16x8_sse2( + __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, + __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0d1, __m128i *d2d3, + __m128i *d4d5, __m128i *d6d7, __m128i *d8d9, __m128i *d10d11, + __m128i *d12d13, __m128i *d14d15) { + __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9; + __m128i w10, w11, w12, w13, w14, w15; + + w0 = _mm_unpacklo_epi8(*x0, *x1); + w1 = _mm_unpacklo_epi8(*x2, *x3); + w2 = _mm_unpacklo_epi8(*x4, 
*x5); + w3 = _mm_unpacklo_epi8(*x6, *x7); + + w8 = _mm_unpackhi_epi8(*x0, *x1); + w9 = _mm_unpackhi_epi8(*x2, *x3); + w10 = _mm_unpackhi_epi8(*x4, *x5); + w11 = _mm_unpackhi_epi8(*x6, *x7); + + w4 = _mm_unpacklo_epi16(w0, w1); + w5 = _mm_unpacklo_epi16(w2, w3); + w12 = _mm_unpacklo_epi16(w8, w9); + w13 = _mm_unpacklo_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store first 4-line result + *d0d1 = _mm_unpacklo_epi64(w6, w14); + *d2d3 = _mm_unpackhi_epi64(w6, w14); + *d4d5 = _mm_unpacklo_epi64(w7, w15); + *d6d7 = _mm_unpackhi_epi64(w7, w15); + + w4 = _mm_unpackhi_epi16(w0, w1); + w5 = _mm_unpackhi_epi16(w2, w3); + w12 = _mm_unpackhi_epi16(w8, w9); + w13 = _mm_unpackhi_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store second 4-line result + *d8d9 = _mm_unpacklo_epi64(w6, w14); + *d10d11 = _mm_unpackhi_epi64(w6, w14); + *d12d13 = _mm_unpacklo_epi64(w7, w15); + *d14d15 = _mm_unpackhi_epi64(w7, w15); +} + #endif // AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_ diff --git a/media/libaom/src/aom_dsp/x86/masked_sad4d_ssse3.c b/media/libaom/src/aom_dsp/x86/masked_sad4d_ssse3.c new file mode 100644 index 0000000000..8ef7ee0d7b --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/masked_sad4d_ssse3.c @@ -0,0 +1,266 @@ +/* + * Copyright (c) 2020, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <stdio.h> +#include <tmmintrin.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/blend.h" +#include "aom/aom_integer.h" +#include "aom_dsp/x86/synonyms.h" + +#include "aom_dsp/x86/masked_sad_intrin_ssse3.h" + +#define MASK_SAD16XH_ONE_REF(idx) \ + a = _mm_loadu_si128((const __m128i *)&ref##idx[x]); \ + data_l = _mm_unpacklo_epi8(a, b); \ + mask_l = _mm_unpacklo_epi8(m, m_inv); \ + pred_l = _mm_maddubs_epi16(data_l, mask_l); \ + pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS); \ + \ + data_r = _mm_unpackhi_epi8(a, b); \ + mask_r = _mm_unpackhi_epi8(m, m_inv); \ + pred_r = _mm_maddubs_epi16(data_r, mask_r); \ + pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS); \ + \ + pred = _mm_packus_epi16(pred_l, pred_r); \ + res##idx = _mm_add_epi32(res##idx, _mm_sad_epu8(pred, src)); + +static INLINE void masked_sadx4d_ssse3(const uint8_t *src_ptr, int src_stride, + const uint8_t *a_ptr[], int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int width, int height, int inv_mask, + unsigned sad_array[]) { + int x, y; + __m128i a; + __m128i data_l, data_r, mask_l, mask_r, pred_l, pred_r, pred; + const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); + __m128i res0 = _mm_setzero_si128(); + __m128i res1 = _mm_setzero_si128(); + __m128i res2 = _mm_setzero_si128(); + __m128i res3 = _mm_setzero_si128(); + const uint8_t *ref0 = a_ptr[0]; + const uint8_t *ref1 = a_ptr[1]; + const uint8_t *ref2 = a_ptr[2]; + const uint8_t *ref3 = a_ptr[3]; + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 16) { + const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]); + const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]); + const __m128i m_copy = _mm_loadu_si128((const __m128i *)&m_ptr[x]); + __m128i m_inv = _mm_sub_epi8(mask_max, m_copy); + __m128i m = inv_mask ? m_inv : m_copy; + m_inv = inv_mask ? 
m_copy : m_inv; + + MASK_SAD16XH_ONE_REF(0) + MASK_SAD16XH_ONE_REF(1) + MASK_SAD16XH_ONE_REF(2) + MASK_SAD16XH_ONE_REF(3) + } + + src_ptr += src_stride; + ref0 += a_stride; + ref1 += a_stride; + ref2 += a_stride; + ref3 += a_stride; + b_ptr += b_stride; + m_ptr += m_stride; + } + res0 = _mm_add_epi32(_mm_unpacklo_epi32(res0, res1), + _mm_unpackhi_epi32(res0, res1)); + res2 = _mm_add_epi32(_mm_unpacklo_epi32(res2, res3), + _mm_unpackhi_epi32(res2, res3)); + + res0 = _mm_unpacklo_epi64(res0, res2); + _mm_storeu_si128((__m128i *)sad_array, res0); +} + +#define MASK_SAD8XH_ONE_REF(idx) \ + const __m128i a##idx##0 = _mm_loadl_epi64((__m128i *)ref##idx); \ + const __m128i a##idx##1 = _mm_loadl_epi64((__m128i *)(ref##idx + a_stride)); \ + data_l = _mm_unpacklo_epi8(a##idx##0, b0); \ + mask_l = _mm_unpacklo_epi8(m, m_inv); \ + pred_l = _mm_maddubs_epi16(data_l, mask_l); \ + pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS); \ + \ + data_r = _mm_unpacklo_epi8(a##idx##1, b1); \ + mask_r = _mm_unpackhi_epi8(m, m_inv); \ + pred_r = _mm_maddubs_epi16(data_r, mask_r); \ + pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS); \ + \ + pred = _mm_packus_epi16(pred_l, pred_r); \ + res##idx = _mm_add_epi32(res##idx, _mm_sad_epu8(pred, src)); + +void aom_masked_sad8xhx4d_ssse3(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_array[], int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, int height, + int inv_mask, unsigned sad_array[]) { + const uint8_t *ref0 = ref_array[0]; + const uint8_t *ref1 = ref_array[1]; + const uint8_t *ref2 = ref_array[2]; + const uint8_t *ref3 = ref_array[3]; + __m128i data_l, data_r, pred_l, pred_r, mask_l, mask_r, pred; + __m128i res0 = _mm_setzero_si128(); + __m128i res1 = _mm_setzero_si128(); + __m128i res2 = _mm_setzero_si128(); + __m128i res3 = _mm_setzero_si128(); + const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); + + for (int y = 0; y < height; y += 2) { + const __m128i src = _mm_unpacklo_epi64( + _mm_loadl_epi64((const __m128i *)src_ptr), + _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride))); + const __m128i b0 = _mm_loadl_epi64((__m128i *)b_ptr); + const __m128i b1 = _mm_loadl_epi64((__m128i *)(b_ptr + b_stride)); + const __m128i m0 = _mm_loadl_epi64((__m128i *)m_ptr); + const __m128i m1 = _mm_loadl_epi64((__m128i *)(m_ptr + m_stride)); + __m128i m_copy = _mm_unpacklo_epi64(m0, m1); + __m128i m_inv = _mm_sub_epi8(mask_max, m_copy); + __m128i m = inv_mask ? m_inv : m_copy; + m_inv = inv_mask ? 
m_copy : m_inv; + + MASK_SAD8XH_ONE_REF(0) + MASK_SAD8XH_ONE_REF(1) + MASK_SAD8XH_ONE_REF(2) + MASK_SAD8XH_ONE_REF(3) + + ref0 += 2 * a_stride; + ref1 += 2 * a_stride; + ref2 += 2 * a_stride; + ref3 += 2 * a_stride; + src_ptr += 2 * src_stride; + b_ptr += 2 * b_stride; + m_ptr += 2 * m_stride; + } + res0 = _mm_add_epi32(_mm_unpacklo_epi32(res0, res1), + _mm_unpackhi_epi32(res0, res1)); + res2 = _mm_add_epi32(_mm_unpacklo_epi32(res2, res3), + _mm_unpackhi_epi32(res2, res3)); + res0 = _mm_unpacklo_epi64(res0, res2); + _mm_storeu_si128((__m128i *)sad_array, res0); +} + +#define MASK_SAD4XH_ONE_REF(idx) \ + a = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)ref##idx), \ + _mm_cvtsi32_si128(*(uint32_t *)&ref##idx[a_stride])); \ + data = _mm_unpacklo_epi8(a, b); \ + mask = _mm_unpacklo_epi8(m, m_inv); \ + pred = _mm_maddubs_epi16(data, mask); \ + pred = xx_roundn_epu16(pred, AOM_BLEND_A64_ROUND_BITS); \ + \ + pred = _mm_packus_epi16(pred, _mm_setzero_si128()); \ + res##idx = _mm_add_epi32(res##idx, _mm_sad_epu8(pred, src)); + +void aom_masked_sad4xhx4d_ssse3(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_array[], int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, int height, + int inv_mask, unsigned sad_array[]) { + const uint8_t *ref0 = ref_array[0]; + const uint8_t *ref1 = ref_array[1]; + const uint8_t *ref2 = ref_array[2]; + const uint8_t *ref3 = ref_array[3]; + __m128i data, pred, mask; + __m128i res0 = _mm_setzero_si128(); + __m128i res1 = _mm_setzero_si128(); + __m128i res2 = _mm_setzero_si128(); + __m128i res3 = _mm_setzero_si128(); + __m128i a; + const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); + + for (int y = 0; y < height; y += 2) { + const __m128i src = _mm_unpacklo_epi32( + _mm_cvtsi32_si128(*(uint32_t *)src_ptr), + _mm_cvtsi32_si128(*(uint32_t *)&src_ptr[src_stride])); + const __m128i b = + _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)b_ptr), + _mm_cvtsi32_si128(*(uint32_t *)&b_ptr[b_stride])); + const __m128i m_copy = + _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)m_ptr), + _mm_cvtsi32_si128(*(uint32_t *)&m_ptr[m_stride])); + + __m128i m_inv = _mm_sub_epi8(mask_max, m_copy); + __m128i m = inv_mask ? m_inv : m_copy; + m_inv = inv_mask ? 
m_copy : m_inv; + + MASK_SAD4XH_ONE_REF(0) + MASK_SAD4XH_ONE_REF(1) + MASK_SAD4XH_ONE_REF(2) + MASK_SAD4XH_ONE_REF(3) + + ref0 += 2 * a_stride; + ref1 += 2 * a_stride; + ref2 += 2 * a_stride; + ref3 += 2 * a_stride; + src_ptr += 2 * src_stride; + b_ptr += 2 * b_stride; + m_ptr += 2 * m_stride; + } + res0 = _mm_unpacklo_epi32(res0, res1); + res2 = _mm_unpacklo_epi32(res2, res3); + res0 = _mm_unpacklo_epi64(res0, res2); + _mm_storeu_si128((__m128i *)sad_array, res0); +} + +#define MASKSADMXN_SSSE3(m, n) \ + void aom_masked_sad##m##x##n##x4d_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref[], \ + int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \ + int msk_stride, int inv_mask, unsigned sad_array[]) { \ + masked_sadx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, m, msk, \ + msk_stride, m, n, inv_mask, sad_array); \ + } + +#define MASKSAD8XN_SSSE3(n) \ + void aom_masked_sad8x##n##x4d_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref[], \ + int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \ + int msk_stride, int inv_mask, unsigned sad_array[]) { \ + aom_masked_sad8xhx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, \ + 8, msk, msk_stride, n, inv_mask, sad_array); \ + } + +#define MASKSAD4XN_SSSE3(n) \ + void aom_masked_sad4x##n##x4d_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref[], \ + int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \ + int msk_stride, int inv_mask, unsigned sad_array[]) { \ + aom_masked_sad4xhx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, \ + 4, msk, msk_stride, n, inv_mask, sad_array); \ + } + +MASKSADMXN_SSSE3(128, 128) +MASKSADMXN_SSSE3(128, 64) +MASKSADMXN_SSSE3(64, 128) +MASKSADMXN_SSSE3(64, 64) +MASKSADMXN_SSSE3(64, 32) +MASKSADMXN_SSSE3(32, 64) +MASKSADMXN_SSSE3(32, 32) +MASKSADMXN_SSSE3(32, 16) +MASKSADMXN_SSSE3(16, 32) +MASKSADMXN_SSSE3(16, 16) +MASKSADMXN_SSSE3(16, 8) +MASKSAD8XN_SSSE3(16) +MASKSAD8XN_SSSE3(8) +MASKSAD8XN_SSSE3(4) +MASKSAD4XN_SSSE3(8) +MASKSAD4XN_SSSE3(4) +MASKSAD4XN_SSSE3(16) +MASKSADMXN_SSSE3(16, 4) +MASKSAD8XN_SSSE3(32) +MASKSADMXN_SSSE3(32, 8) +MASKSADMXN_SSSE3(16, 64) +MASKSADMXN_SSSE3(64, 16) diff --git a/media/libaom/src/aom_dsp/x86/masked_sad_intrin_avx2.c b/media/libaom/src/aom_dsp/x86/masked_sad_intrin_avx2.c index 584b5e7e37..60f0ab3390 100644 --- a/media/libaom/src/aom_dsp/x86/masked_sad_intrin_avx2.c +++ b/media/libaom/src/aom_dsp/x86/masked_sad_intrin_avx2.c @@ -17,7 +17,7 @@ #include "aom_dsp/blend.h" #include "aom/aom_integer.h" #include "aom_dsp/x86/synonyms.h" -#include "aom_dsp/x86//masked_sad_intrin_ssse3.h" +#include "aom_dsp/x86/masked_sad_intrin_ssse3.h" static INLINE unsigned int masked_sad32xh_avx2( const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride, @@ -64,7 +64,7 @@ static INLINE unsigned int masked_sad32xh_avx2( res = _mm256_hadd_epi32(res, res); res = _mm256_hadd_epi32(res, res); int32_t sad = _mm256_extract_epi32(res, 0); - return (sad + 31) >> 6; + return sad; } static INLINE __m256i xx_loadu2_m128i(const void *hi, const void *lo) { @@ -117,7 +117,7 @@ static INLINE unsigned int masked_sad16xh_avx2( res = _mm256_hadd_epi32(res, res); res = _mm256_hadd_epi32(res, res); int32_t sad = _mm256_extract_epi32(res, 0); - return (sad + 31) >> 6; + return sad; } static INLINE unsigned int aom_masked_sad_avx2( @@ -253,7 +253,7 @@ static INLINE unsigned int highbd_masked_sad8xh_avx2( res = _mm256_hadd_epi32(res, res); res = _mm256_hadd_epi32(res, res); int sad = 
_mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4); - return (sad + 31) >> 6; + return sad; } static INLINE unsigned int highbd_masked_sad16xh_avx2( @@ -311,7 +311,7 @@ static INLINE unsigned int highbd_masked_sad16xh_avx2( res = _mm256_hadd_epi32(res, res); res = _mm256_hadd_epi32(res, res); int sad = _mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4); - return (sad + 31) >> 6; + return sad; } static INLINE unsigned int aom_highbd_masked_sad_avx2( diff --git a/media/libaom/src/aom_dsp/x86/masked_sad_intrin_ssse3.c b/media/libaom/src/aom_dsp/x86/masked_sad_intrin_ssse3.c index 493f9bd8f2..7168277963 100644 --- a/media/libaom/src/aom_dsp/x86/masked_sad_intrin_ssse3.c +++ b/media/libaom/src/aom_dsp/x86/masked_sad_intrin_ssse3.c @@ -19,7 +19,7 @@ #include "aom/aom_integer.h" #include "aom_dsp/x86/synonyms.h" -#include "aom_dsp/x86//masked_sad_intrin_ssse3.h" +#include "aom_dsp/x86/masked_sad_intrin_ssse3.h" // For width a multiple of 16 static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr, @@ -134,7 +134,7 @@ static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr, // At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'. int32_t sad = _mm_cvtsi128_si32(res) + _mm_cvtsi128_si32(_mm_srli_si128(res, 8)); - return (sad + 31) >> 6; + return sad; } unsigned int aom_masked_sad8xh_ssse3(const uint8_t *src_ptr, int src_stride, @@ -179,7 +179,7 @@ unsigned int aom_masked_sad8xh_ssse3(const uint8_t *src_ptr, int src_stride, } int32_t sad = _mm_cvtsi128_si32(res) + _mm_cvtsi128_si32(_mm_srli_si128(res, 8)); - return (sad + 31) >> 6; + return sad; } unsigned int aom_masked_sad4xh_ssse3(const uint8_t *src_ptr, int src_stride, @@ -223,7 +223,7 @@ unsigned int aom_masked_sad4xh_ssse3(const uint8_t *src_ptr, int src_stride, } // At this point, the SAD is stored in lane 0 of 'res' int32_t sad = _mm_cvtsi128_si32(res); - return (sad + 31) >> 6; + return sad; } // For width a multiple of 8 @@ -338,7 +338,7 @@ static INLINE unsigned int highbd_masked_sad_ssse3( res = _mm_hadd_epi32(res, res); res = _mm_hadd_epi32(res, res); int sad = _mm_cvtsi128_si32(res); - return (sad + 31) >> 6; + return sad; } unsigned int aom_highbd_masked_sad4xh_ssse3(const uint8_t *src8, int src_stride, @@ -398,5 +398,5 @@ unsigned int aom_highbd_masked_sad4xh_ssse3(const uint8_t *src8, int src_stride, res = _mm_hadd_epi32(res, res); res = _mm_hadd_epi32(res, res); int sad = _mm_cvtsi128_si32(res); - return (sad + 31) >> 6; + return sad; } diff --git a/media/libaom/src/aom_dsp/x86/masked_variance_intrin_ssse3.c b/media/libaom/src/aom_dsp/x86/masked_variance_intrin_ssse3.c index d7dbefd7d9..fa93f0df4f 100644 --- a/media/libaom/src/aom_dsp/x86/masked_variance_intrin_ssse3.c +++ b/media/libaom/src/aom_dsp/x86/masked_variance_intrin_ssse3.c @@ -218,15 +218,15 @@ static void bilinear_filter(const uint8_t *src, int src_stride, int xoffset, } } -static INLINE __m128i filter_block_2rows(const __m128i a0, const __m128i b0, - const __m128i a1, const __m128i b1, - const __m128i filter) { - __m128i v0 = _mm_unpacklo_epi8(a0, b0); - v0 = _mm_maddubs_epi16(v0, filter); +static INLINE __m128i filter_block_2rows(const __m128i *a0, const __m128i *b0, + const __m128i *a1, const __m128i *b1, + const __m128i *filter) { + __m128i v0 = _mm_unpacklo_epi8(*a0, *b0); + v0 = _mm_maddubs_epi16(v0, *filter); v0 = xx_roundn_epu16(v0, FILTER_BITS); - __m128i v1 = _mm_unpacklo_epi8(a1, b1); - v1 = _mm_maddubs_epi16(v1, filter); + __m128i v1 = _mm_unpacklo_epi8(*a1, *b1); + v1 = _mm_maddubs_epi16(v1, 
*filter); v1 = xx_roundn_epu16(v1, FILTER_BITS); return _mm_packus_epi16(v0, v1); @@ -262,7 +262,7 @@ static void bilinear_filter8xh(const uint8_t *src, int src_stride, int xoffset, const __m128i z0 = _mm_srli_si128(x0, 1); const __m128i x1 = _mm_loadu_si128((__m128i *)&src[src_stride]); const __m128i z1 = _mm_srli_si128(x1, 1); - const __m128i res = filter_block_2rows(x0, z0, x1, z1, hfilter_vec); + const __m128i res = filter_block_2rows(&x0, &z0, &x1, &z1, &hfilter_vec); _mm_storeu_si128((__m128i *)b, res); src += src_stride * 2; @@ -296,7 +296,7 @@ static void bilinear_filter8xh(const uint8_t *src, int src_stride, int xoffset, const __m128i x = _mm_loadl_epi64((__m128i *)dst); const __m128i y = _mm_loadl_epi64((__m128i *)&dst[8]); const __m128i z = _mm_loadl_epi64((__m128i *)&dst[16]); - const __m128i res = filter_block_2rows(x, y, y, z, vfilter_vec); + const __m128i res = filter_block_2rows(&x, &y, &y, &z, &vfilter_vec); _mm_storeu_si128((__m128i *)dst, res); dst += 16; @@ -343,7 +343,7 @@ static void bilinear_filter4xh(const uint8_t *src, int src_stride, int xoffset, const __m128i b0 = _mm_unpacklo_epi32(z0, z1); const __m128i a1 = _mm_unpacklo_epi32(x2, x3); const __m128i b1 = _mm_unpacklo_epi32(z2, z3); - const __m128i res = filter_block_2rows(a0, b0, a1, b1, hfilter_vec); + const __m128i res = filter_block_2rows(&a0, &b0, &a1, &b1, &hfilter_vec); _mm_storeu_si128((__m128i *)b, res); src += src_stride * 4; @@ -384,7 +384,7 @@ static void bilinear_filter4xh(const uint8_t *src, int src_stride, int xoffset, const __m128i b0 = _mm_unpacklo_epi32(b, c); const __m128i a1 = _mm_unpacklo_epi32(c, d); const __m128i b1 = _mm_unpacklo_epi32(d, e); - const __m128i res = filter_block_2rows(a0, b0, a1, b1, vfilter_vec); + const __m128i res = filter_block_2rows(&a0, &b0, &a1, &b1, &vfilter_vec); _mm_storeu_si128((__m128i *)dst, res); dst += 16; @@ -392,29 +392,29 @@ static void bilinear_filter4xh(const uint8_t *src, int src_stride, int xoffset, } } -static INLINE void accumulate_block(const __m128i src, const __m128i a, - const __m128i b, const __m128i m, +static INLINE void accumulate_block(const __m128i *src, const __m128i *a, + const __m128i *b, const __m128i *m, __m128i *sum, __m128i *sum_sq) { const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi16(1); const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS)); - const __m128i m_inv = _mm_sub_epi8(mask_max, m); + const __m128i m_inv = _mm_sub_epi8(mask_max, *m); // Calculate 16 predicted pixels. // Note that the maximum value of any entry of 'pred_l' or 'pred_r' // is 64 * 255, so we have plenty of space to add rounding constants. 
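   // Each predicted pixel computed below is the AOM_BLEND_A64 weighted
   // average, pred = (m * a + (64 - m) * b + 32) >> 6 with m in [0, 64],
   // formed with maddubs on interleaved pixel/mask pairs.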
- const __m128i data_l = _mm_unpacklo_epi8(a, b); - const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv); + const __m128i data_l = _mm_unpacklo_epi8(*a, *b); + const __m128i mask_l = _mm_unpacklo_epi8(*m, m_inv); __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l); pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS); - const __m128i data_r = _mm_unpackhi_epi8(a, b); - const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv); + const __m128i data_r = _mm_unpackhi_epi8(*a, *b); + const __m128i mask_r = _mm_unpackhi_epi8(*m, m_inv); __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r); pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS); - const __m128i src_l = _mm_unpacklo_epi8(src, zero); - const __m128i src_r = _mm_unpackhi_epi8(src, zero); + const __m128i src_l = _mm_unpacklo_epi8(*src, zero); + const __m128i src_r = _mm_unpackhi_epi8(*src, zero); const __m128i diff_l = _mm_sub_epi16(pred_l, src_l); const __m128i diff_r = _mm_sub_epi16(pred_r, src_r); @@ -440,7 +440,7 @@ static void masked_variance(const uint8_t *src_ptr, int src_stride, const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]); const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]); const __m128i m = _mm_loadu_si128((const __m128i *)&m_ptr[x]); - accumulate_block(src, a, b, m, &sum, &sum_sq); + accumulate_block(&src, &a, &b, &m, &sum, &sum_sq); } src_ptr += src_stride; @@ -471,7 +471,7 @@ static void masked_variance8xh(const uint8_t *src_ptr, int src_stride, const __m128i m = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)m_ptr), _mm_loadl_epi64((const __m128i *)&m_ptr[m_stride])); - accumulate_block(src, a, b, m, &sum, &sum_sq); + accumulate_block(&src, &a, &b, &m, &sum, &sum_sq); src_ptr += src_stride * 2; a_ptr += 16; @@ -503,7 +503,7 @@ static void masked_variance4xh(const uint8_t *src_ptr, int src_stride, const __m128i m = _mm_setr_epi32( *(uint32_t *)m_ptr, *(uint32_t *)&m_ptr[m_stride], *(uint32_t *)&m_ptr[m_stride * 2], *(uint32_t *)&m_ptr[m_stride * 3]); - accumulate_block(src, a, b, m, &sum, &sum_sq); + accumulate_block(&src, &a, &b, &m, &sum, &sum_sq); src_ptr += src_stride * 4; a_ptr += 16; @@ -517,6 +517,7 @@ static void masked_variance4xh(const uint8_t *src_ptr, int src_stride, *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4)); } +#if CONFIG_AV1_HIGHBITDEPTH // For width a multiple of 8 static void highbd_bilinear_filter(const uint16_t *src, int src_stride, int xoffset, int yoffset, uint16_t *dst, @@ -797,17 +798,17 @@ static void highbd_bilinear_filter(const uint16_t *src, int src_stride, } } -static INLINE __m128i highbd_filter_block_2rows(const __m128i a0, - const __m128i b0, - const __m128i a1, - const __m128i b1, - const __m128i filter) { - __m128i v0 = _mm_unpacklo_epi16(a0, b0); - v0 = _mm_madd_epi16(v0, filter); +static INLINE __m128i highbd_filter_block_2rows(const __m128i *a0, + const __m128i *b0, + const __m128i *a1, + const __m128i *b1, + const __m128i *filter) { + __m128i v0 = _mm_unpacklo_epi16(*a0, *b0); + v0 = _mm_madd_epi16(v0, *filter); v0 = xx_roundn_epu32(v0, FILTER_BITS); - __m128i v1 = _mm_unpacklo_epi16(a1, b1); - v1 = _mm_madd_epi16(v1, filter); + __m128i v1 = _mm_unpacklo_epi16(*a1, *b1); + v1 = _mm_madd_epi16(v1, *filter); v1 = xx_roundn_epu32(v1, FILTER_BITS); return _mm_packs_epi32(v0, v1); @@ -845,7 +846,7 @@ static void highbd_bilinear_filter4xh(const uint16_t *src, int src_stride, const __m128i x1 = _mm_loadu_si128((__m128i *)&src[src_stride]); const __m128i z1 = _mm_srli_si128(x1, 2); const __m128i res = - highbd_filter_block_2rows(x0, z0, x1, z1, 
hfilter_vec); + highbd_filter_block_2rows(&x0, &z0, &x1, &z1, &hfilter_vec); _mm_storeu_si128((__m128i *)b, res); src += src_stride * 2; @@ -879,7 +880,8 @@ static void highbd_bilinear_filter4xh(const uint16_t *src, int src_stride, const __m128i x = _mm_loadl_epi64((__m128i *)dst); const __m128i y = _mm_loadl_epi64((__m128i *)&dst[4]); const __m128i z = _mm_loadl_epi64((__m128i *)&dst[8]); - const __m128i res = highbd_filter_block_2rows(x, y, y, z, vfilter_vec); + const __m128i res = + highbd_filter_block_2rows(&x, &y, &y, &z, &vfilter_vec); _mm_storeu_si128((__m128i *)dst, res); dst += 8; @@ -1024,6 +1026,7 @@ static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride, *sum_ = _mm_cvtsi128_si32(sum); *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4)); } +#endif // CONFIG_AV1_HIGHBITDEPTH void aom_comp_mask_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, diff --git a/media/libaom/src/aom_dsp/x86/obmc_variance_sse4.c b/media/libaom/src/aom_dsp/x86/obmc_variance_sse4.c index 72eda0e578..aa73c392dd 100644 --- a/media/libaom/src/aom_dsp/x86/obmc_variance_sse4.c +++ b/media/libaom/src/aom_dsp/x86/obmc_variance_sse4.c @@ -166,7 +166,7 @@ OBMC_SUBPIX_VAR(64, 16) //////////////////////////////////////////////////////////////////////////////// // High bit-depth //////////////////////////////////////////////////////////////////////////////// - +#if CONFIG_AV1_HIGHBITDEPTH static INLINE void hbd_obmc_variance_w4( const uint8_t *pre8, const int pre_stride, const int32_t *wsrc, const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int h) { @@ -378,3 +378,4 @@ HBD_OBMCVARWXH(8, 32) HBD_OBMCVARWXH(32, 8) HBD_OBMCVARWXH(16, 64) HBD_OBMCVARWXH(64, 16) +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/media/libaom/src/aom_dsp/x86/quantize_avx_x86_64.asm b/media/libaom/src/aom_dsp/x86/quantize_avx_x86_64.asm index 216a0bd8f9..d6e15c4be5 100644 --- a/media/libaom/src/aom_dsp/x86/quantize_avx_x86_64.asm +++ b/media/libaom/src/aom_dsp/x86/quantize_avx_x86_64.asm @@ -126,7 +126,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \ punpckhqdq m3, m3 pmullw m13, m3 ; dqc[i] = qc[i] * q - ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff + ; Store 16bit numbers as 32bit numbers in array pointed to by dqcoeff pcmpgtw m6, m5, m8 punpckhwd m6, m8, m6 pmovsxwd m11, m8 @@ -198,10 +198,7 @@ DEFINE_ARGS coeff, ncoeff, zbin, round, quant, shift, \ mova m4, [r2] ; m4 = shift mov r4, dqcoeffmp mov r5, iscanmp -%ifidn %1, b_32x32 - psllw m4, 1 -%endif - pxor m5, m5 ; m5 = dedicated zero + pxor m5, m5 ; m5 = dedicated zero DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, eob @@ -255,9 +252,26 @@ DEFINE_ARGS coeff, ncoeff, zbin, round, quant, shift, \ pmulhw m13, m11, m2 ; m13 = m11*q>>16 paddw m8, m6 ; m8 += m6 paddw m13, m11 ; m13 += m11 + %ifidn %1, b_32x32 + pmullw m5, m8, m4 ; store the lower 16 bits of m8*qsh + %endif pmulhw m8, m4 ; m8 = m8*qsh>>16 + %ifidn %1, b_32x32 + psllw m8, 1 + psrlw m5, 15 + por m8, m5 + %endif punpckhqdq m4, m4 + %ifidn %1, b_32x32 + pmullw m5, m13, m4 ; store the lower 16 bits of m13*qsh + %endif pmulhw m13, m4 ; m13 = m13*qsh>>16 + %ifidn %1, b_32x32 + psllw m13, 1 + psrlw m5, 15 + por m13, m5 + pxor m5, m5 ; reset m5 to zero register + %endif psignw m8, m9 ; m8 = reinsert sign psignw m13, m10 ; m13 = reinsert sign pand m8, m7 @@ -289,7 +303,7 @@ DEFINE_ARGS coeff, ncoeff, zbin, round, quant, shift, \ psignw m13, m10 %endif - ; store 16bit numbers as 32bit 
numbers in array pointed to by qcoeff + ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff pcmpgtw m6, m5, m8 punpckhwd m6, m8, m6 pmovsxwd m11, m8 @@ -359,8 +373,23 @@ DEFINE_ARGS coeff, ncoeff, zbin, round, quant, shift, \ pmulhw m13, m11, m2 ; m13 = m11*q>>16 paddw m14, m6 ; m14 += m6 paddw m13, m11 ; m13 += m11 + %ifidn %1, b_32x32 + pmullw m5, m14, m4 ; store the lower 16 bits of m14*qsh + %endif pmulhw m14, m4 ; m14 = m14*qsh>>16 + %ifidn %1, b_32x32 + psllw m14, 1 + psrlw m5, 15 + por m14, m5 + pmullw m5, m13, m4 ; store the lower 16 bits of m13*qsh + %endif pmulhw m13, m4 ; m13 = m13*qsh>>16 + %ifidn %1, b_32x32 + psllw m13, 1 + psrlw m5, 15 + por m13, m5 + pxor m5, m5 ; reset m5 to zero register + %endif psignw m14, m9 ; m14 = reinsert sign psignw m13, m10 ; m13 = reinsert sign pand m14, m7 @@ -391,7 +420,7 @@ DEFINE_ARGS coeff, ncoeff, zbin, round, quant, shift, \ psignw m13, m10 %endif - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff + ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff pcmpgtw m6, m5, m14 punpckhwd m6, m14, m6 pmovsxwd m11, m14 diff --git a/media/libaom/src/aom_dsp/x86/quantize_sse2.c b/media/libaom/src/aom_dsp/x86/quantize_sse2.c index d3de6e24db..ebef1fbac2 100644 --- a/media/libaom/src/aom_dsp/x86/quantize_sse2.c +++ b/media/libaom/src/aom_dsp/x86/quantize_sse2.c @@ -18,28 +18,6 @@ #include "aom/aom_integer.h" #include "aom_dsp/x86/quantize_x86.h" -static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) { - assert(sizeof(tran_low_t) == 4); - - return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1], - (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3], - (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5], - (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]); -} - -static INLINE void store_coefficients(__m128i coeff_vals, - tran_low_t *coeff_ptr) { - assert(sizeof(tran_low_t) == 4); - - __m128i one = _mm_set1_epi16(1); - __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one); - __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one); - __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi); - __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi); - _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1); - _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2); -} - void aom_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, diff --git a/media/libaom/src/aom_dsp/x86/quantize_ssse3.c b/media/libaom/src/aom_dsp/x86/quantize_ssse3.c new file mode 100644 index 0000000000..25980a055a --- /dev/null +++ b/media/libaom/src/aom_dsp/x86/quantize_ssse3.c @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <assert.h> +#include <tmmintrin.h> +#include <emmintrin.h> +#include <xmmintrin.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/x86/quantize_x86.h" + +static INLINE void calculate_qcoeff_64x64(__m128i *coeff, const __m128i round, + const __m128i quant, + const __m128i *shift) { + __m128i tmp, qcoeff, tmp1; + qcoeff = _mm_adds_epi16(*coeff, round); + tmp = _mm_mulhi_epi16(qcoeff, quant); + qcoeff = _mm_add_epi16(tmp, qcoeff); + tmp = _mm_mullo_epi16(qcoeff, *shift); + tmp = _mm_srli_epi16(tmp, 14); + tmp1 = _mm_mulhi_epi16(qcoeff, *shift); + tmp1 = _mm_slli_epi16(tmp1, 2); + *coeff = _mm_or_si128(tmp, tmp1); +} + +static INLINE void calculate_dqcoeff_and_store_64x64(const __m128i qcoeff, + const __m128i dequant, + const __m128i zero, + tran_low_t *dqcoeff) { + // Un-sign to bias rounding like C. + const __m128i coeff = _mm_abs_epi16(qcoeff); + + const __m128i sign_0 = _mm_unpacklo_epi16(zero, qcoeff); + const __m128i sign_1 = _mm_unpackhi_epi16(zero, qcoeff); + + const __m128i low = _mm_mullo_epi16(coeff, dequant); + const __m128i high = _mm_mulhi_epi16(coeff, dequant); + __m128i dqcoeff32_0 = _mm_unpacklo_epi16(low, high); + __m128i dqcoeff32_1 = _mm_unpackhi_epi16(low, high); + + // "Divide" by 4. + dqcoeff32_0 = _mm_srli_epi32(dqcoeff32_0, 2); + dqcoeff32_1 = _mm_srli_epi32(dqcoeff32_1, 2); + + dqcoeff32_0 = _mm_sign_epi32(dqcoeff32_0, sign_0); + dqcoeff32_1 = _mm_sign_epi32(dqcoeff32_1, sign_1); + + _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0); + _mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1); +} + +void aom_quantize_b_64x64_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m128i two = _mm_set1_epi16(2); + int index; + + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1, qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1, all_zero; + __m128i eob = zero, eob0; + + (void)scan; + (void)n_coeffs; + + // Setup global values. + zbin = _mm_load_si128((const __m128i *)zbin_ptr); + round = _mm_load_si128((const __m128i *)round_ptr); + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + shift = _mm_load_si128((const __m128i *)quant_shift_ptr); + + // Shift with rounding. + zbin = _mm_add_epi16(zbin, two); + round = _mm_add_epi16(round, two); + zbin = _mm_srli_epi16(zbin, 2); + round = _mm_srli_epi16(round, 2); + zbin = _mm_sub_epi16(zbin, one); + // Do DC and first 15 AC. 
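+  // 64x64 blocks quantize with log_scale = 2: zbin/round were divided by 4
+  // (with rounding) above, calculate_qcoeff_64x64() keeps (q * shift) >> 14,
+  // and calculate_dqcoeff_and_store_64x64() divides the dequantized value by
+  // 4 before restoring its sign.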
+ coeff0 = load_coefficients(coeff_ptr); + coeff1 = load_coefficients(coeff_ptr + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + calculate_qcoeff_64x64(&qcoeff0, round, quant, &shift); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + calculate_qcoeff_64x64(&qcoeff1, round, quant, &shift); + + // Reinsert signs. + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + // Mask out zbin threshold coeffs. + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr); + store_coefficients(qcoeff1, qcoeff_ptr + 8); + + calculate_dqcoeff_and_store_64x64(qcoeff0, dequant, zero, dqcoeff_ptr); + dequant = _mm_unpackhi_epi64(dequant, dequant); + calculate_dqcoeff_and_store_64x64(qcoeff1, dequant, zero, dqcoeff_ptr + 8); + + eob = + scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + } + + // AC only loop. 
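+  // A 64x64 transform keeps only its top-left 32x32 block of coefficients,
+  // so the loop below covers 1024 coefficients and n_coeffs is ignored.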
+ for (index = 16; index < 1024; index += 16) { + coeff0 = load_coefficients(coeff_ptr + index); + coeff1 = load_coefficients(coeff_ptr + index + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero); + continue; + } + calculate_qcoeff_64x64(&qcoeff0, round, quant, &shift); + calculate_qcoeff_64x64(&qcoeff1, round, quant, &shift); + + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr + index); + store_coefficients(qcoeff1, qcoeff_ptr + index + 8); + + calculate_dqcoeff_and_store_64x64(qcoeff0, dequant, zero, + dqcoeff_ptr + index); + calculate_dqcoeff_and_store_64x64(qcoeff1, dequant, zero, + dqcoeff_ptr + 8 + index); + + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, + zero); + eob = _mm_max_epi16(eob, eob0); + } + + *eob_ptr = accumulate_eob(eob); +} diff --git a/media/libaom/src/aom_dsp/x86/quantize_ssse3_x86_64.asm b/media/libaom/src/aom_dsp/x86/quantize_ssse3_x86_64.asm index 39d4ca674c..fa616a6f1a 100644 --- a/media/libaom/src/aom_dsp/x86/quantize_ssse3_x86_64.asm +++ b/media/libaom/src/aom_dsp/x86/quantize_ssse3_x86_64.asm @@ -48,9 +48,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \ mov r3, qcoeffmp mov r4, dqcoeffmp mov r5, iscanmp -%ifidn %1, b_32x32 - psllw m4, 1 -%endif pxor m5, m5 ; m5 = dedicated zero DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, eob lea coeffq, [ coeffq+ncoeffq*4] @@ -78,9 +75,26 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \ pmulhw m13, m11, m2 ; m13 = m11*q>>16 paddw m8, m6 ; m8 += m6 paddw m13, m11 ; m13 += m11 + %ifidn %1, b_32x32 + pmullw m5, m8, m4 ; store the lower 16 bits of m8*qsh + %endif pmulhw m8, m4 ; m8 = m8*qsh>>16 + %ifidn %1, b_32x32 + psllw m8, 1 + psrlw m5, 15 + por m8, m5 + %endif punpckhqdq m4, m4 + %ifidn %1, b_32x32 + pmullw m5, m13, m4 ; store the lower 16 bits of m13*qsh + %endif pmulhw m13, m4 ; m13 = m13*qsh>>16 + %ifidn %1, b_32x32 + psllw m13, 1 + psrlw m5, 15 + por m13, m5 + pxor m5, m5 ; reset m5 to zero register + %endif psignw m8, m9 ; m8 = reinsert sign psignw m13, m10 ; m13 = reinsert sign pand m8, m7 @@ -117,7 +131,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \ psignw m8, m9 psignw m13, m10 %endif - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff + ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff mova m11, m8 mova m6, m8 pcmpgtw m5, m8 @@ -169,12 +183,28 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \ pmulhw m13, m11, m2 ; m13 = m11*q>>16 paddw m14, m6 ; m14 += m6 paddw m13, m11 ; m13 += m11 + %ifidn %1, b_32x32 + pmullw m5, m14, m4 ; store the lower 16 bits of m14*qsh + 
%endif pmulhw m14, m4 ; m14 = m14*qsh>>16 + %ifidn %1, b_32x32 + psllw m14, 1 + psrlw m5, 15 + por m14, m5 + pmullw m5, m13, m4 ; store the lower 16 bits of m13*qsh + %endif pmulhw m13, m4 ; m13 = m13*qsh>>16 + %ifidn %1, b_32x32 + psllw m13, 1 + psrlw m5, 15 + por m13, m5 + pxor m5, m5 ; reset m5 to zero register + %endif psignw m14, m9 ; m14 = reinsert sign psignw m13, m10 ; m13 = reinsert sign pand m14, m7 pand m13, m12 + ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff pxor m11, m11 mova m11, m14 @@ -207,7 +237,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \ psignw m13, m10 %endif - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff + ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff mova m11, m14 mova m6, m14 pcmpgtw m5, m14 diff --git a/media/libaom/src/aom_dsp/x86/quantize_x86.h b/media/libaom/src/aom_dsp/x86/quantize_x86.h index 4eed7dd29a..5b040a278a 100644 --- a/media/libaom/src/aom_dsp/x86/quantize_x86.h +++ b/media/libaom/src/aom_dsp/x86/quantize_x86.h @@ -32,6 +32,11 @@ static INLINE __m128i invert_sign_sse2(__m128i a, __m128i sign) { return _mm_sub_epi16(a, sign); } +static INLINE __m128i invert_sign_32_sse2(__m128i a, __m128i sign) { + a = _mm_xor_si128(a, sign); + return _mm_sub_epi32(a, sign); +} + static INLINE void calculate_qcoeff(__m128i *coeff, const __m128i round, const __m128i quant, const __m128i shift) { __m128i tmp, qcoeff; @@ -41,10 +46,53 @@ static INLINE void calculate_qcoeff(__m128i *coeff, const __m128i round, *coeff = _mm_mulhi_epi16(qcoeff, shift); } +static INLINE void calculate_qcoeff_log_scale(__m128i *coeff, + const __m128i round, + const __m128i quant, + const __m128i *shift, + const int *log_scale) { + __m128i tmp, tmp1, qcoeff; + qcoeff = _mm_adds_epi16(*coeff, round); + tmp = _mm_mulhi_epi16(qcoeff, quant); + qcoeff = _mm_add_epi16(tmp, qcoeff); + tmp = _mm_mullo_epi16(qcoeff, *shift); + tmp = _mm_srli_epi16(tmp, (16 - *log_scale)); + tmp1 = _mm_mulhi_epi16(qcoeff, *shift); + tmp1 = _mm_slli_epi16(tmp1, *log_scale); + *coeff = _mm_or_si128(tmp, tmp1); +} + static INLINE __m128i calculate_dqcoeff(__m128i qcoeff, __m128i dequant) { return _mm_mullo_epi16(qcoeff, dequant); } +static INLINE void calculate_dqcoeff_and_store_log_scale(__m128i qcoeff, + __m128i dequant, + const __m128i zero, + tran_low_t *dqcoeff, + const int *log_scale) { + // calculate abs + __m128i coeff_sign = _mm_srai_epi16(qcoeff, 15); + __m128i coeff = invert_sign_sse2(qcoeff, coeff_sign); + + const __m128i sign_0 = _mm_unpacklo_epi16(coeff_sign, zero); + const __m128i sign_1 = _mm_unpackhi_epi16(coeff_sign, zero); + + const __m128i low = _mm_mullo_epi16(coeff, dequant); + const __m128i high = _mm_mulhi_epi16(coeff, dequant); + __m128i dqcoeff32_0 = _mm_unpacklo_epi16(low, high); + __m128i dqcoeff32_1 = _mm_unpackhi_epi16(low, high); + + dqcoeff32_0 = _mm_srli_epi32(dqcoeff32_0, *log_scale); + dqcoeff32_1 = _mm_srli_epi32(dqcoeff32_1, *log_scale); + + dqcoeff32_0 = invert_sign_32_sse2(dqcoeff32_0, sign_0); + dqcoeff32_1 = invert_sign_32_sse2(dqcoeff32_1, sign_1); + + _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0); + _mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1); +} + // Scan 16 values for eob reference in scan_ptr. Use masks (-1) from comparing // to zbin to add 1 to the index in 'scan'. 
static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1, @@ -75,3 +123,80 @@ static INLINE int16_t accumulate_eob(__m128i eob) { eob = _mm_max_epi16(eob, eob_shuffled); return _mm_extract_epi16(eob, 1); } + +static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) { + assert(sizeof(tran_low_t) == 4); + const __m128i coeff1 = _mm_load_si128((__m128i *)(coeff_ptr)); + const __m128i coeff2 = _mm_load_si128((__m128i *)(coeff_ptr + 4)); + return _mm_packs_epi32(coeff1, coeff2); +} + +static INLINE void store_coefficients(__m128i coeff_vals, + tran_low_t *coeff_ptr) { + assert(sizeof(tran_low_t) == 4); + + __m128i one = _mm_set1_epi16(1); + __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one); + __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one); + __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi); + __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi); + _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1); + _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2); +} + +static INLINE void update_mask1(__m128i *cmp_mask0, __m128i *cmp_mask1, + const int16_t *iscan_ptr, int *is_found, + __m128i *mask) { + __m128i all_zero; + __m128i temp_mask = _mm_setzero_si128(); + all_zero = _mm_or_si128(*cmp_mask0, *cmp_mask1); + if (_mm_movemask_epi8(all_zero)) { + __m128i iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr)); + __m128i mask0 = _mm_and_si128(*cmp_mask0, iscan0); + __m128i iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + 8)); + __m128i mask1 = _mm_and_si128(*cmp_mask1, iscan1); + temp_mask = _mm_max_epi16(mask0, mask1); + *is_found = 1; + } + *mask = _mm_max_epi16(temp_mask, *mask); +} + +static INLINE void update_mask0(__m128i *qcoeff0, __m128i *qcoeff1, + __m128i *threshold, const int16_t *iscan_ptr, + int *is_found, __m128i *mask) { + __m128i zero = _mm_setzero_si128(); + __m128i coeff[4], cmp_mask0, cmp_mask1, cmp_mask2, cmp_mask3; + + coeff[0] = _mm_unpacklo_epi16(*qcoeff0, zero); + coeff[1] = _mm_unpackhi_epi16(*qcoeff0, zero); + coeff[2] = _mm_unpacklo_epi16(*qcoeff1, zero); + coeff[3] = _mm_unpackhi_epi16(*qcoeff1, zero); + + coeff[0] = _mm_slli_epi32(coeff[0], AOM_QM_BITS); + cmp_mask0 = _mm_cmpgt_epi32(coeff[0], threshold[0]); + coeff[1] = _mm_slli_epi32(coeff[1], AOM_QM_BITS); + cmp_mask1 = _mm_cmpgt_epi32(coeff[1], threshold[1]); + coeff[2] = _mm_slli_epi32(coeff[2], AOM_QM_BITS); + cmp_mask2 = _mm_cmpgt_epi32(coeff[2], threshold[1]); + coeff[3] = _mm_slli_epi32(coeff[3], AOM_QM_BITS); + cmp_mask3 = _mm_cmpgt_epi32(coeff[3], threshold[1]); + + cmp_mask0 = _mm_packs_epi32(cmp_mask0, cmp_mask1); + cmp_mask1 = _mm_packs_epi32(cmp_mask2, cmp_mask3); + + update_mask1(&cmp_mask0, &cmp_mask1, iscan_ptr, is_found, mask); +} + +static INLINE int calculate_non_zero_count(__m128i mask) { + __m128i mask0, mask1; + int non_zero_count = 0; + mask0 = _mm_unpackhi_epi64(mask, mask); + mask1 = _mm_max_epi16(mask0, mask); + mask0 = _mm_shuffle_epi32(mask1, 1); + mask0 = _mm_max_epi16(mask0, mask1); + mask1 = _mm_srli_epi32(mask0, 16); + mask0 = _mm_max_epi16(mask0, mask1); + non_zero_count = _mm_extract_epi16(mask0, 0) + 1; + + return non_zero_count; +} diff --git a/media/libaom/src/aom_dsp/x86/sad4d_avx2.c b/media/libaom/src/aom_dsp/x86/sad4d_avx2.c index f662b62b16..0771252584 100644 --- a/media/libaom/src/aom_dsp/x86/sad4d_avx2.c +++ b/media/libaom/src/aom_dsp/x86/sad4d_avx2.c @@ -14,41 +14,43 @@ #include "aom/aom_integer.h" -void aom_sad32x32x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const 
ref[4], int ref_stride, - uint32_t res[4]) { +void aom_sadMxNx4d_avx2(int M, int N, const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4]) { __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg; __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3; - __m256i sum_mlow, sum_mhigh; - int i; + int i, j; const uint8_t *ref0, *ref1, *ref2, *ref3; ref0 = ref[0]; ref1 = ref[1]; ref2 = ref[2]; ref3 = ref[3]; - sum_ref0 = _mm256_set1_epi16(0); - sum_ref1 = _mm256_set1_epi16(0); - sum_ref2 = _mm256_set1_epi16(0); - sum_ref3 = _mm256_set1_epi16(0); - for (i = 0; i < 32; i++) { - // load src and all refs - src_reg = _mm256_loadu_si256((const __m256i *)src); - ref0_reg = _mm256_loadu_si256((const __m256i *)ref0); - ref1_reg = _mm256_loadu_si256((const __m256i *)ref1); - ref2_reg = _mm256_loadu_si256((const __m256i *)ref2); - ref3_reg = _mm256_loadu_si256((const __m256i *)ref3); - // sum of the absolute differences between every ref-i to src - ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg); - ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg); - ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg); - ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg); - // sum every ref-i - sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg); - sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg); - sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg); - sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg); - + sum_ref0 = _mm256_setzero_si256(); + sum_ref2 = _mm256_setzero_si256(); + sum_ref1 = _mm256_setzero_si256(); + sum_ref3 = _mm256_setzero_si256(); + + for (i = 0; i < N; i++) { + for (j = 0; j < M; j += 32) { + // load src and all refs + src_reg = _mm256_loadu_si256((const __m256i *)(src + j)); + ref0_reg = _mm256_loadu_si256((const __m256i *)(ref0 + j)); + ref1_reg = _mm256_loadu_si256((const __m256i *)(ref1 + j)); + ref2_reg = _mm256_loadu_si256((const __m256i *)(ref2 + j)); + ref3_reg = _mm256_loadu_si256((const __m256i *)(ref3 + j)); + + // sum of the absolute differences between every ref-i to src + ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg); + ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg); + ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg); + ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg); + // sum every ref-i + sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg); + sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg); + sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg); + sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg); + } src += src_stride; ref0 += ref_stride; ref1 += ref_stride; @@ -57,6 +59,7 @@ void aom_sad32x32x4d_avx2(const uint8_t *src, int src_stride, } { __m128i sum; + __m256i sum_mlow, sum_mhigh; // in sum_ref-i the result is saved in the first 4 bytes // the other 4 bytes are zeroed. 
// sum_ref1 and sum_ref3 are shifted left by 4 bytes @@ -80,139 +83,24 @@ void aom_sad32x32x4d_avx2(const uint8_t *src, int src_stride, _mm_storeu_si128((__m128i *)(res), sum); } - _mm256_zeroupper(); } -void aom_sad64x64x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, - uint32_t res[4]) { - __m256i src_reg, srcnext_reg, ref0_reg, ref0next_reg; - __m256i ref1_reg, ref1next_reg, ref2_reg, ref2next_reg; - __m256i ref3_reg, ref3next_reg; - __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3; - __m256i sum_mlow, sum_mhigh; - int i; - const uint8_t *ref0, *ref1, *ref2, *ref3; - - ref0 = ref[0]; - ref1 = ref[1]; - ref2 = ref[2]; - ref3 = ref[3]; - sum_ref0 = _mm256_set1_epi16(0); - sum_ref1 = _mm256_set1_epi16(0); - sum_ref2 = _mm256_set1_epi16(0); - sum_ref3 = _mm256_set1_epi16(0); - for (i = 0; i < 64; i++) { - // load 64 bytes from src and all refs - src_reg = _mm256_loadu_si256((const __m256i *)src); - srcnext_reg = _mm256_loadu_si256((const __m256i *)(src + 32)); - ref0_reg = _mm256_loadu_si256((const __m256i *)ref0); - ref0next_reg = _mm256_loadu_si256((const __m256i *)(ref0 + 32)); - ref1_reg = _mm256_loadu_si256((const __m256i *)ref1); - ref1next_reg = _mm256_loadu_si256((const __m256i *)(ref1 + 32)); - ref2_reg = _mm256_loadu_si256((const __m256i *)ref2); - ref2next_reg = _mm256_loadu_si256((const __m256i *)(ref2 + 32)); - ref3_reg = _mm256_loadu_si256((const __m256i *)ref3); - ref3next_reg = _mm256_loadu_si256((const __m256i *)(ref3 + 32)); - // sum of the absolute differences between every ref-i to src - ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg); - ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg); - ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg); - ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg); - ref0next_reg = _mm256_sad_epu8(ref0next_reg, srcnext_reg); - ref1next_reg = _mm256_sad_epu8(ref1next_reg, srcnext_reg); - ref2next_reg = _mm256_sad_epu8(ref2next_reg, srcnext_reg); - ref3next_reg = _mm256_sad_epu8(ref3next_reg, srcnext_reg); - - // sum every ref-i - sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg); - sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg); - sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg); - sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg); - sum_ref0 = _mm256_add_epi32(sum_ref0, ref0next_reg); - sum_ref1 = _mm256_add_epi32(sum_ref1, ref1next_reg); - sum_ref2 = _mm256_add_epi32(sum_ref2, ref2next_reg); - sum_ref3 = _mm256_add_epi32(sum_ref3, ref3next_reg); - src += src_stride; - ref0 += ref_stride; - ref1 += ref_stride; - ref2 += ref_stride; - ref3 += ref_stride; - } - { - __m128i sum; - - // in sum_ref-i the result is saved in the first 4 bytes - // the other 4 bytes are zeroed. 
- // sum_ref1 and sum_ref3 are shifted left by 4 bytes - sum_ref1 = _mm256_slli_si256(sum_ref1, 4); - sum_ref3 = _mm256_slli_si256(sum_ref3, 4); - - // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3 - sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1); - sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3); - - // merge every 64 bit from each sum_ref-i - sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2); - sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2); - - // add the low 64 bit to the high 64 bit - sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh); - - // add the low 128 bit to the high 128 bit - sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow), - _mm256_extractf128_si256(sum_mlow, 1)); - - _mm_storeu_si128((__m128i *)(res), sum); +#define sadMxN_avx2(m, n) \ + void aom_sad##m##x##n##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref[4], int ref_stride, \ + uint32_t res[4]) { \ + aom_sadMxNx4d_avx2(m, n, src, src_stride, ref, ref_stride, res); \ } - _mm256_zeroupper(); -} -void aom_sad32x64x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, - uint32_t res[4]) { - const uint8_t *rf[4]; - uint32_t sum0[4]; - uint32_t sum1[4]; +sadMxN_avx2(32, 8); +sadMxN_avx2(32, 16); +sadMxN_avx2(32, 32); +sadMxN_avx2(32, 64); - rf[0] = ref[0]; - rf[1] = ref[1]; - rf[2] = ref[2]; - rf[3] = ref[3]; - aom_sad32x32x4d_avx2(src, src_stride, rf, ref_stride, sum0); - src += src_stride << 5; - rf[0] += ref_stride << 5; - rf[1] += ref_stride << 5; - rf[2] += ref_stride << 5; - rf[3] += ref_stride << 5; - aom_sad32x32x4d_avx2(src, src_stride, rf, ref_stride, sum1); - res[0] = sum0[0] + sum1[0]; - res[1] = sum0[1] + sum1[1]; - res[2] = sum0[2] + sum1[2]; - res[3] = sum0[3] + sum1[3]; -} +sadMxN_avx2(64, 16); +sadMxN_avx2(64, 32); +sadMxN_avx2(64, 64); +sadMxN_avx2(64, 128); -void aom_sad64x32x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, - uint32_t res[4]) { - const uint8_t *rf[4]; - uint32_t sum0[4]; - uint32_t sum1[4]; - unsigned int half_width = 32; - - rf[0] = ref[0]; - rf[1] = ref[1]; - rf[2] = ref[2]; - rf[3] = ref[3]; - aom_sad32x32x4d_avx2(src, src_stride, rf, ref_stride, sum0); - src += half_width; - rf[0] += half_width; - rf[1] += half_width; - rf[2] += half_width; - rf[3] += half_width; - aom_sad32x32x4d_avx2(src, src_stride, rf, ref_stride, sum1); - res[0] = sum0[0] + sum1[0]; - res[1] = sum0[1] + sum1[1]; - res[2] = sum0[2] + sum1[2]; - res[3] = sum0[3] + sum1[3]; -} +sadMxN_avx2(128, 64); +sadMxN_avx2(128, 128); diff --git a/media/libaom/src/aom_dsp/x86/sad4d_sse2.asm b/media/libaom/src/aom_dsp/x86/sad4d_sse2.asm index 55a856985a..a9043742d4 100644 --- a/media/libaom/src/aom_dsp/x86/sad4d_sse2.asm +++ b/media/libaom/src/aom_dsp/x86/sad4d_sse2.asm @@ -15,15 +15,85 @@ SECTION .text -; PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end -%macro PROCESS_4x2x4 5-6 0 - movd m0, [srcq +%2] +%macro AVG_4x2x4 2 + movh m2, [second_predq] + movlhps m2, m2 + pavgb %1, m2 + pavgb %2, m2 + lea second_predq, [second_predq+8] +%endmacro +; 'mflag' affect a lot how the code works. +; +; When 'mflag' is false, the 'src_strideq' resides in register, +; [srcq + src_strideq + offset] is allowed, so we can simply +; use such form to access src memory and don't bother to update +; 'srcq' at each line. We only update 'srcq' each two-lines using +; a compact LEA instruction like [srcq+src_strideq*2]. +; +; When 'mflag' is true, the 'src_strideq' resides in memory. 
+; we cannot use above form to access memory, we have to update +; 'srcq' at each line break. As we process two parts (first,second) +; together in each macro function, the second part may also sit +; in the next line, which means we also need to possibly add +; one 'src_strideq' to 'srcq' before processing second part. + +%macro HANDLE_FIRST_OFFSET 2 + %define first_offset %2 + %if mflag == 0 && %1 == 1 + %define first_offset (src_strideq + %2) + %endif +%endmacro + +; first_extraline, second_extraline, in_line_offset +%macro HANDLE_SECOND_OFFSET 3 + %define second_offset %3 + %if mflag && %1 == 0 && %2 == 1 + add srcq, src_strideq + %endif + %if mflag == 0 && %2 == 1 + %define second_offset (src_strideq + %3) + %endif +%endmacro + +; Notes for line_ending: +; 0 -- not a line ending +; 1 -- line ending of a odd line [line numbers starts from one] +; 2 -- line ending of a even line +; This is specically designed to handle when src_strideq is a +; memory position, under such case, we can not accomplish +; complex address calculation using LEA, and fall back to +; using simple ADD instruction at each line ending. +%macro ADVANCE_END_OF_LINE 1 + %if mflag + add srcq, src_strideq + %endif + %if mflag == 0 && %1 == 2 + lea srcq, [srcq +src_strideq*2] + %endif + + %if %1 == 2 + lea ref1q, [ref1q+ref_strideq*2] + lea ref2q, [ref2q+ref_strideq*2] + lea ref3q, [ref3q+ref_strideq*2] + lea ref4q, [ref4q+ref_strideq*2] + %endif +%endmacro + +; Please note that the second_offset of src is for in_line_offset, +; so it is less than src_stride. +; PROCESS_4x2x4 first, off_{first,second}_{src,ref}, do_avg, +; {first, second}_extraline, line_ending +%macro PROCESS_4x2x4 9 + HANDLE_FIRST_OFFSET %7, %2 + movd m0, [srcq + first_offset] + HANDLE_SECOND_OFFSET %7, %8, %4 %if %1 == 1 movd m6, [ref1q+%3] movd m4, [ref2q+%3] movd m7, [ref3q+%3] movd m5, [ref4q+%3] - movd m1, [srcq +%4] + + movd m1, [srcq + second_offset] movd m2, [ref1q+%5] punpckldq m0, m1 punpckldq m6, m2 @@ -36,6 +106,9 @@ SECTION .text movlhps m0, m0 movlhps m6, m4 movlhps m7, m5 +%if %6 == 1 + AVG_4x2x4 m6, m7 +%endif psadbw m6, m0 psadbw m7, m0 %else @@ -51,38 +124,48 @@ SECTION .text movd m4, [ref4q+%3] movd m5, [ref4q+%5] punpckldq m4, m5 - movd m5, [srcq +%4] + movd m5, [srcq + second_offset] punpckldq m0, m5 movlhps m0, m0 movlhps m1, m2 movlhps m3, m4 +%if %6 == 1 + AVG_4x2x4 m1, m3 +%endif psadbw m1, m0 psadbw m3, m0 paddd m6, m1 paddd m7, m3 %endif -%if %6 == 1 - lea srcq, [srcq +src_strideq*2] - lea ref1q, [ref1q+ref_strideq*2] - lea ref2q, [ref2q+ref_strideq*2] - lea ref3q, [ref3q+ref_strideq*2] - lea ref4q, [ref4q+ref_strideq*2] +%if %9 > 0 + ADVANCE_END_OF_LINE %9 %endif %endmacro -; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end -%macro PROCESS_8x2x4 5-6 0 - movh m0, [srcq +%2] +; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, do_avg, +; {first,second}_extraline, line_ending +%macro PROCESS_8x2x4 9 + HANDLE_FIRST_OFFSET %7, %2 + movh m0, [srcq + first_offset] + HANDLE_SECOND_OFFSET %7, %8, %4 %if %1 == 1 movh m4, [ref1q+%3] movh m5, [ref2q+%3] movh m6, [ref3q+%3] movh m7, [ref4q+%3] - movhps m0, [srcq +%4] + movhps m0, [srcq + second_offset] movhps m4, [ref1q+%5] movhps m5, [ref2q+%5] movhps m6, [ref3q+%5] movhps m7, [ref4q+%5] +%if %6 == 1 + movu m3, [second_predq] + pavgb m4, m3 + pavgb m5, m3 + pavgb m6, m3 + pavgb m7, m3 + lea second_predq, [second_predq+mmsize] +%endif psadbw m4, m0 psadbw m5, m0 psadbw m6, m0 @@ -90,105 +173,148 @@ SECTION .text %else movh m1, [ref1q+%3] movh m2, [ref2q+%3] - movh m3, 
[ref3q+%3] - movhps m0, [srcq +%4] + movhps m0, [srcq + second_offset] movhps m1, [ref1q+%5] movhps m2, [ref2q+%5] - movhps m3, [ref3q+%5] +%if %6 == 1 + movu m3, [second_predq] + pavgb m1, m3 + pavgb m2, m3 +%endif psadbw m1, m0 psadbw m2, m0 - psadbw m3, m0 paddd m4, m1 - movh m1, [ref4q+%3] - movhps m1, [ref4q+%5] paddd m5, m2 - paddd m6, m3 + + movh m1, [ref3q+%3] + movhps m1, [ref3q+%5] + movh m2, [ref4q+%3] + movhps m2, [ref4q+%5] +%if %6 == 1 + pavgb m1, m3 + pavgb m2, m3 + lea second_predq, [second_predq+mmsize] +%endif psadbw m1, m0 - paddd m7, m1 + psadbw m2, m0 + paddd m6, m1 + paddd m7, m2 %endif -%if %6 == 1 - lea srcq, [srcq +src_strideq*2] - lea ref1q, [ref1q+ref_strideq*2] - lea ref2q, [ref2q+ref_strideq*2] - lea ref3q, [ref3q+ref_strideq*2] - lea ref4q, [ref4q+ref_strideq*2] +%if %9 > 0 + ADVANCE_END_OF_LINE %9 %endif %endmacro -; PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end -%macro PROCESS_16x2x4 5-6 0 +; PROCESS_16x2x4 first, off_{first,second}_{src,ref}, do_avg, +; {first,second}_extraline, line_ending +%macro PROCESS_16x2x4 9 ; 1st 16 px - mova m0, [srcq +%2] + HANDLE_FIRST_OFFSET %7, %2 + mova m0, [srcq + first_offset] + HANDLE_SECOND_OFFSET %7, %8, %4 %if %1 == 1 movu m4, [ref1q+%3] movu m5, [ref2q+%3] movu m6, [ref3q+%3] movu m7, [ref4q+%3] +%if %6 == 1 + movu m3, [second_predq] + pavgb m4, m3 + pavgb m5, m3 + pavgb m6, m3 + pavgb m7, m3 + lea second_predq, [second_predq+mmsize] +%endif psadbw m4, m0 psadbw m5, m0 psadbw m6, m0 psadbw m7, m0 -%else +%else ; %1 == 1 movu m1, [ref1q+%3] movu m2, [ref2q+%3] - movu m3, [ref3q+%3] +%if %6 == 1 + movu m3, [second_predq] + pavgb m1, m3 + pavgb m2, m3 +%endif psadbw m1, m0 psadbw m2, m0 - psadbw m3, m0 paddd m4, m1 - movu m1, [ref4q+%3] paddd m5, m2 - paddd m6, m3 - psadbw m1, m0 - paddd m7, m1 + + movu m1, [ref3q+%3] + movu m2, [ref4q+%3] +%if %6 == 1 + pavgb m1, m3 + pavgb m2, m3 + lea second_predq, [second_predq+mmsize] %endif + psadbw m1, m0 + psadbw m2, m0 + paddd m6, m1 + paddd m7, m2 +%endif ; %1 == 1 ; 2nd 16 px - mova m0, [srcq +%4] + mova m0, [srcq + second_offset] movu m1, [ref1q+%5] movu m2, [ref2q+%5] - movu m3, [ref3q+%5] + +%if %6 == 1 + movu m3, [second_predq] + pavgb m1, m3 + pavgb m2, m3 +%endif psadbw m1, m0 psadbw m2, m0 - psadbw m3, m0 paddd m4, m1 - movu m1, [ref4q+%5] paddd m5, m2 - paddd m6, m3 + + movu m1, [ref3q+%5] + movu m2, [ref4q+%5] + +%if %9 > 0 + ADVANCE_END_OF_LINE %9 +%endif + %if %6 == 1 - lea srcq, [srcq +src_strideq*2] - lea ref1q, [ref1q+ref_strideq*2] - lea ref2q, [ref2q+ref_strideq*2] - lea ref3q, [ref3q+ref_strideq*2] - lea ref4q, [ref4q+ref_strideq*2] + pavgb m1, m3 + pavgb m2, m3 + lea second_predq, [second_predq+mmsize] %endif psadbw m1, m0 - paddd m7, m1 + psadbw m2, m0 + paddd m6, m1 + paddd m7, m2 %endmacro -; PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end -%macro PROCESS_32x2x4 5-6 0 - PROCESS_16x2x4 %1, %2, %3, %2 + 16, %3 + 16 - PROCESS_16x2x4 0, %4, %5, %4 + 16, %5 + 16, %6 +; PROCESS_32x2x4 first, off_{first,second}_{src,ref}, do_avg, +; {first,second}_extraline, line_ending +%macro PROCESS_32x2x4 9 + PROCESS_16x2x4 %1, %2, %3, %2 + 16, %3 + 16, %6, %7, %7, %8 - %7 + PROCESS_16x2x4 0, %4, %5, %4 + 16, %5 + 16, %6, %8, %8, %9 %endmacro -; PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end -%macro PROCESS_64x2x4 5-6 0 - PROCESS_32x2x4 %1, %2, %3, %2 + 32, %3 + 32 - PROCESS_32x2x4 0, %4, %5, %4 + 32, %5 + 32, %6 +; PROCESS_64x2x4 first, off_{first,second}_{src,ref}, do_avg, +; {first,second}_extraline, line_ending 
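For readers following the macro changes above: what each PROCESS_*x2x4 variant accumulates is an ordinary sum of absolute differences of one source block against four reference blocks, with the new do_avg path first averaging every reference pixel with a second predictor (rounded, as pavgb does). The scalar sketch below states that definition in C; the function name sad_4d_ref and its flat argument layout are illustrative only and are not part of libaom.

#include <stdint.h>
#include <stdlib.h>

/* Scalar reference for the x4d SAD the assembly macros accumulate:
 * one w x h source block is compared against four reference blocks.
 * When second_pred is non-NULL, each reference pixel is first averaged
 * (rounded, as pavgb does) with the corresponding second_pred pixel. */
static void sad_4d_ref(const uint8_t *src, int src_stride,
                       const uint8_t *const ref[4], int ref_stride,
                       const uint8_t *second_pred, /* may be NULL */
                       int w, int h, uint32_t res[4]) {
  for (int i = 0; i < 4; ++i) {
    uint32_t sad = 0;
    for (int y = 0; y < h; ++y) {
      for (int x = 0; x < w; ++x) {
        int r = ref[i][y * ref_stride + x];
        if (second_pred) r = (r + second_pred[y * w + x] + 1) >> 1;
        sad += abs(src[y * src_stride + x] - r);
      }
    }
    res[i] = sad;
  }
}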
+%macro PROCESS_64x2x4 9 + PROCESS_32x2x4 %1, %2, %3, %2 + 32, %3 + 32, %6, %7, %7, %8 - %7 + PROCESS_32x2x4 0, %4, %5, %4 + 32, %5 + 32, %6, %8, %8, %9 %endmacro -; PROCESS_128x2x4 first, off_{first,second}_{src,ref}, advance_at_end -%macro PROCESS_128x2x4 5-6 0 - PROCESS_64x2x4 %1, %2, %3, %2 + 64, %3 + 64 - PROCESS_64x2x4 0, %4, %5, %4 + 64, %5 + 64, %6 +; PROCESS_128x2x4 first, off_{first,second}_{src,ref}, do_avg, +; {first,second}_extraline, line_ending +%macro PROCESS_128x2x4 9 + PROCESS_64x2x4 %1, %2, %3, %2 + 64, %3 + 64, %6, %7, %7, %8 - %7 + PROCESS_64x2x4 0, %4, %5, %4 + 64, %5 + 64, %6, %8, %8, %9 %endmacro ; void aom_sadNxNx4d_sse2(uint8_t *src, int src_stride, ; uint8_t *ref[4], int ref_stride, ; uint32_t res[4]); ; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16, 8x8, 8x4, 4x8 and 4x4 -%macro SADNXN4D 2 +%macro SADNXN4D 2-3 0 +%if %3 == 0 %if UNIX64 cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ res, ref2, ref3, ref4 @@ -196,18 +322,41 @@ cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ ref2, ref3, ref4 %endif +%else ; avg + +%if UNIX64 +cglobal sad%1x%2x4d_avg, 6, 10, 8, src, src_stride, ref1, ref_stride, \ + second_pred, res, ref2, ref3, ref4 +%else +cglobal sad%1x%2x4d_avg, 5, 7, 8, src, ref4, ref1, ref_stride, \ + second_pred, ref2, ref3 + %define src_strideq r1mp + %define src_strided r1mp +%endif +%endif + + %define mflag ((1 - UNIX64) & %3) movsxdifnidn src_strideq, src_strided movsxdifnidn ref_strideq, ref_strided + mov ref2q, [ref1q+gprsize*1] mov ref3q, [ref1q+gprsize*2] mov ref4q, [ref1q+gprsize*3] mov ref1q, [ref1q+gprsize*0] - PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1 + PROCESS_%1x2x4 1, 0, 0, 0, ref_strideq, %3, 0, 1, 2 %rep (%2-4)/2 - PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1 + PROCESS_%1x2x4 0, 0, 0, 0, ref_strideq, %3, 0, 1, 2 %endrep - PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0 + PROCESS_%1x2x4 0, 0, 0, 0, ref_strideq, %3, 0, 1, 2 + +%if %3 == 0 + %define resultq r4 + %define resultmp r4mp +%else + %define resultq r5 + %define resultmp r5mp +%endif %if %1 > 4 pslldq m5, 4 @@ -218,16 +367,16 @@ cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ mova m7, m6 punpcklqdq m4, m6 punpckhqdq m5, m7 - movifnidn r4, r4mp paddd m4, m5 - movu [r4], m4 + movifnidn resultq, resultmp + movu [resultq], m4 RET %else - movifnidn r4, r4mp pshufd m6, m6, 0x08 pshufd m7, m7, 0x08 - movq [r4+0], m6 - movq [r4+8], m7 + movifnidn resultq, resultmp + movq [resultq+0], m6 + movq [resultq+8], m7 RET %endif %endmacro @@ -255,3 +404,25 @@ SADNXN4D 8, 32 SADNXN4D 32, 8 SADNXN4D 16, 64 SADNXN4D 64, 16 +SADNXN4D 128, 128, 1 +SADNXN4D 128, 64, 1 +SADNXN4D 64, 128, 1 +SADNXN4D 64, 64, 1 +SADNXN4D 64, 32, 1 +SADNXN4D 32, 64, 1 +SADNXN4D 32, 32, 1 +SADNXN4D 32, 16, 1 +SADNXN4D 16, 32, 1 +SADNXN4D 16, 16, 1 +SADNXN4D 16, 8, 1 +SADNXN4D 8, 16, 1 +SADNXN4D 8, 8, 1 +SADNXN4D 8, 4, 1 +SADNXN4D 4, 8, 1 +SADNXN4D 4, 4, 1 +SADNXN4D 4, 16, 1 +SADNXN4D 16, 4, 1 +SADNXN4D 8, 32, 1 +SADNXN4D 32, 8, 1 +SADNXN4D 16, 64, 1 +SADNXN4D 64, 16, 1 diff --git a/media/libaom/src/aom_dsp/x86/sad_highbd_avx2.c b/media/libaom/src/aom_dsp/x86/sad_highbd_avx2.c index b506d46639..2cff2e6a9f 100644 --- a/media/libaom/src/aom_dsp/x86/sad_highbd_avx2.c +++ b/media/libaom/src/aom_dsp/x86/sad_highbd_avx2.c @@ -37,487 +37,257 @@ static INLINE unsigned int get_sad_from_mm256_epi32(const __m256i *v) { return (unsigned int)_mm_cvtsi128_si32(lo128); } -unsigned int 
aom_highbd_sad16x8_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { - const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); - const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref); +static INLINE void highbd_sad16x4_core_avx2(__m256i *s, __m256i *r, + __m256i *sad_acc) { + const __m256i zero = _mm256_setzero_si256(); + int i; + for (i = 0; i < 4; i++) { + s[i] = _mm256_sub_epi16(s[i], r[i]); + s[i] = _mm256_abs_epi16(s[i]); + } - // first 4 rows - __m256i s0 = _mm256_loadu_si256((const __m256i *)src_ptr); - __m256i s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride)); - __m256i s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride)); - __m256i s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride)); - - __m256i r0 = _mm256_loadu_si256((const __m256i *)ref_ptr); - __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride)); - __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride)); - __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride)); - - __m256i u0 = _mm256_sub_epi16(s0, r0); - __m256i u1 = _mm256_sub_epi16(s1, r1); - __m256i u2 = _mm256_sub_epi16(s2, r2); - __m256i u3 = _mm256_sub_epi16(s3, r3); - __m256i zero = _mm256_setzero_si256(); - __m256i sum0, sum1; - - u0 = _mm256_abs_epi16(u0); - u1 = _mm256_abs_epi16(u1); - u2 = _mm256_abs_epi16(u2); - u3 = _mm256_abs_epi16(u3); - - sum0 = _mm256_add_epi16(u0, u1); - sum0 = _mm256_add_epi16(sum0, u2); - sum0 = _mm256_add_epi16(sum0, u3); - - // second 4 rows - src_ptr += src_stride << 2; - ref_ptr += ref_stride << 2; - s0 = _mm256_loadu_si256((const __m256i *)src_ptr); - s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride)); - s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride)); - s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride)); - - r0 = _mm256_loadu_si256((const __m256i *)ref_ptr); - r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride)); - r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride)); - r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride)); - - u0 = _mm256_sub_epi16(s0, r0); - u1 = _mm256_sub_epi16(s1, r1); - u2 = _mm256_sub_epi16(s2, r2); - u3 = _mm256_sub_epi16(s3, r3); - - u0 = _mm256_abs_epi16(u0); - u1 = _mm256_abs_epi16(u1); - u2 = _mm256_abs_epi16(u2); - u3 = _mm256_abs_epi16(u3); - - sum1 = _mm256_add_epi16(u0, u1); - sum1 = _mm256_add_epi16(sum1, u2); - sum1 = _mm256_add_epi16(sum1, u3); - - // find out the SAD - s0 = _mm256_unpacklo_epi16(sum0, zero); - s1 = _mm256_unpackhi_epi16(sum0, zero); - r0 = _mm256_unpacklo_epi16(sum1, zero); - r1 = _mm256_unpackhi_epi16(sum1, zero); - s0 = _mm256_add_epi32(s0, s1); - r0 = _mm256_add_epi32(r0, r1); - sum0 = _mm256_add_epi32(s0, r0); - // 8 32-bit summation + s[0] = _mm256_add_epi16(s[0], s[1]); + s[0] = _mm256_add_epi16(s[0], s[2]); + s[0] = _mm256_add_epi16(s[0], s[3]); - return (unsigned int)get_sad_from_mm256_epi32(&sum0); + r[0] = _mm256_unpacklo_epi16(s[0], zero); + r[1] = _mm256_unpackhi_epi16(s[0], zero); + + r[0] = _mm256_add_epi32(r[0], r[1]); + *sad_acc = _mm256_add_epi32(*sad_acc, r[0]); } -unsigned int aom_highbd_sad16x16_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { +// If sec_ptr = 0, calculate regular SAD. Otherwise, calculate average SAD. 
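The 16-bit accumulation inside highbd_sad16x4_core_avx2 is safe because at most four absolute differences of up-to-12-bit samples land in each lane (4 * 4095 = 16380, below the int16 limit of 32767) before the sums are widened to 32 bits. A scalar sketch of the same 16-wide, 4-row core follows; highbd_sad16x4_ref is an illustrative name, not a libaom function, and the optional sec pointer mirrors the rounded-average path used by the *_avg kernels.

#include <stdint.h>
#include <stdlib.h>

/* Scalar equivalent of the 16x4 high-bit-depth SAD core: accumulate
 * |src - ref| over a 16-wide, 4-row tile into a 32-bit total.
 * If sec is non-NULL, ref is first averaged (rounded) with sec, which
 * matches the _mm256_avg_epu16 path taken when sec_ptr != 0. */
static uint32_t highbd_sad16x4_ref(const uint16_t *src, int src_stride,
                                   const uint16_t *ref, int ref_stride,
                                   const uint16_t *sec /* may be NULL */) {
  uint32_t sad = 0;
  for (int y = 0; y < 4; ++y) {
    for (int x = 0; x < 16; ++x) {
      int r = ref[y * ref_stride + x];
      if (sec) r = (r + sec[y * 16 + x] + 1) >> 1;
      sad += abs(src[y * src_stride + x] - r);
    }
  }
  return sad;
}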
+static INLINE void sad16x4(const uint16_t *src_ptr, int src_stride, + const uint16_t *ref_ptr, int ref_stride, + const uint16_t *sec_ptr, __m256i *sad_acc) { + __m256i s[4], r[4]; + s[0] = _mm256_loadu_si256((const __m256i *)src_ptr); + s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride)); + s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride)); + s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride)); + + r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr); + r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride)); + r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride)); + r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride)); + + if (sec_ptr) { + r[0] = _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr)); + r[1] = _mm256_avg_epu16( + r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16))); + r[2] = _mm256_avg_epu16( + r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32))); + r[3] = _mm256_avg_epu16( + r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48))); + } + highbd_sad16x4_core_avx2(s, r, sad_acc); +} + +static AOM_FORCE_INLINE unsigned int aom_highbd_sad16xN_avx2(int N, + const uint8_t *src, + int src_stride, + const uint8_t *ref, + int ref_stride) { const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref); - __m256i s0, s1, s2, s3, r0, r1, r2, r3, u0, u1, u2, u3; - __m256i sum0; - __m256i sum = _mm256_setzero_si256(); - const __m256i zero = _mm256_setzero_si256(); - int row = 0; - - // Loop for every 4 rows - while (row < 16) { - s0 = _mm256_loadu_si256((const __m256i *)src_ptr); - s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride)); - s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride)); - s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride)); - - r0 = _mm256_loadu_si256((const __m256i *)ref_ptr); - r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride)); - r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride)); - r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride)); - - u0 = _mm256_sub_epi16(s0, r0); - u1 = _mm256_sub_epi16(s1, r1); - u2 = _mm256_sub_epi16(s2, r2); - u3 = _mm256_sub_epi16(s3, r3); - - u0 = _mm256_abs_epi16(u0); - u1 = _mm256_abs_epi16(u1); - u2 = _mm256_abs_epi16(u2); - u3 = _mm256_abs_epi16(u3); - - sum0 = _mm256_add_epi16(u0, u1); - sum0 = _mm256_add_epi16(sum0, u2); - sum0 = _mm256_add_epi16(sum0, u3); - - s0 = _mm256_unpacklo_epi16(sum0, zero); - s1 = _mm256_unpackhi_epi16(sum0, zero); - sum = _mm256_add_epi32(sum, s0); - sum = _mm256_add_epi32(sum, s1); - // 8 32-bit summation - - row += 4; + int i; + __m256i sad = _mm256_setzero_si256(); + for (i = 0; i < N; i += 4) { + sad16x4(src_ptr, src_stride, ref_ptr, ref_stride, NULL, &sad); src_ptr += src_stride << 2; ref_ptr += ref_stride << 2; } - return get_sad_from_mm256_epi32(&sum); + return (unsigned int)get_sad_from_mm256_epi32(&sad); } static void sad32x4(const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr, int ref_stride, const uint16_t *sec_ptr, __m256i *sad_acc) { - __m256i s0, s1, s2, s3, r0, r1, r2, r3; - const __m256i zero = _mm256_setzero_si256(); + __m256i s[4], r[4]; int row_sections = 0; while (row_sections < 2) { - s0 = _mm256_loadu_si256((const __m256i *)src_ptr); - s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + 16)); - s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride)); - s3 = _mm256_loadu_si256((const __m256i 
*)(src_ptr + src_stride + 16)); + s[0] = _mm256_loadu_si256((const __m256i *)src_ptr); + s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16)); + s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride)); + s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 16)); - r0 = _mm256_loadu_si256((const __m256i *)ref_ptr); - r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16)); - r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride)); - r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 16)); + r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr); + r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16)); + r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride)); + r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 16)); if (sec_ptr) { - r0 = _mm256_avg_epu16(r0, _mm256_loadu_si256((const __m256i *)sec_ptr)); - r1 = _mm256_avg_epu16( - r1, _mm256_loadu_si256((const __m256i *)(sec_ptr + 16))); - r2 = _mm256_avg_epu16( - r2, _mm256_loadu_si256((const __m256i *)(sec_ptr + 32))); - r3 = _mm256_avg_epu16( - r3, _mm256_loadu_si256((const __m256i *)(sec_ptr + 48))); + r[0] = + _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr)); + r[1] = _mm256_avg_epu16( + r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16))); + r[2] = _mm256_avg_epu16( + r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32))); + r[3] = _mm256_avg_epu16( + r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48))); + sec_ptr += 32 << 1; } - s0 = _mm256_sub_epi16(s0, r0); - s1 = _mm256_sub_epi16(s1, r1); - s2 = _mm256_sub_epi16(s2, r2); - s3 = _mm256_sub_epi16(s3, r3); - - s0 = _mm256_abs_epi16(s0); - s1 = _mm256_abs_epi16(s1); - s2 = _mm256_abs_epi16(s2); - s3 = _mm256_abs_epi16(s3); - - s0 = _mm256_add_epi16(s0, s1); - s0 = _mm256_add_epi16(s0, s2); - s0 = _mm256_add_epi16(s0, s3); - - r0 = _mm256_unpacklo_epi16(s0, zero); - r1 = _mm256_unpackhi_epi16(s0, zero); - - r0 = _mm256_add_epi32(r0, r1); - *sad_acc = _mm256_add_epi32(*sad_acc, r0); + highbd_sad16x4_core_avx2(s, r, sad_acc); row_sections += 1; src_ptr += src_stride << 1; ref_ptr += ref_stride << 1; - if (sec_ptr) sec_ptr += 32 << 1; } } -unsigned int aom_highbd_sad32x16_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { +static AOM_FORCE_INLINE unsigned int aom_highbd_sad32xN_avx2(int N, + const uint8_t *src, + int src_stride, + const uint8_t *ref, + int ref_stride) { __m256i sad = _mm256_setzero_si256(); uint16_t *srcp = CONVERT_TO_SHORTPTR(src); uint16_t *refp = CONVERT_TO_SHORTPTR(ref); const int left_shift = 2; - int row_section = 0; + int i; - while (row_section < 4) { + for (i = 0; i < N; i += 4) { sad32x4(srcp, src_stride, refp, ref_stride, NULL, &sad); srcp += src_stride << left_shift; refp += ref_stride << left_shift; - row_section += 1; } return get_sad_from_mm256_epi32(&sad); } -unsigned int aom_highbd_sad16x32_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { - uint32_t sum = aom_highbd_sad16x16_avx2(src, src_stride, ref, ref_stride); - src += src_stride << 4; - ref += ref_stride << 4; - sum += aom_highbd_sad16x16_avx2(src, src_stride, ref, ref_stride); - return sum; -} - -unsigned int aom_highbd_sad32x32_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { - uint32_t sum = aom_highbd_sad32x16_avx2(src, src_stride, ref, ref_stride); - src += src_stride << 4; - ref += ref_stride << 4; - sum += aom_highbd_sad32x16_avx2(src, src_stride, ref, ref_stride); - return sum; -} 
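The fixed-size forwarding functions removed here (16x32 built from two 16x16 calls, 32x32 from two 32x16 calls, and so on) are superseded by height-parameterized loops plus a small macro that stamps out each entry point, as seen later in this patch. A plain-C sketch of that pattern, with invented names (sad_w32_4rows, sad_32xN, SAD_32xN_FN) standing in for the AVX2 helpers:

#include <stdint.h>

/* Illustrative scalar stand-in for the width-32, 4-row kernel that the
 * real code implements with AVX2 (sad32x4). */
static uint32_t sad_w32_4rows(const uint16_t *src, int src_stride,
                              const uint16_t *ref, int ref_stride) {
  uint32_t sad = 0;
  for (int y = 0; y < 4; ++y) {
    for (int x = 0; x < 32; ++x) {
      int d = src[y * src_stride + x] - ref[y * ref_stride + x];
      sad += d < 0 ? -d : d;
    }
  }
  return sad;
}

/* Height-parameterized driver: walk the block in 4-row steps. */
static uint32_t sad_32xN(int N, const uint16_t *src, int src_stride,
                         const uint16_t *ref, int ref_stride) {
  uint32_t sad = 0;
  for (int i = 0; i < N; i += 4) {
    sad += sad_w32_4rows(src, src_stride, ref, ref_stride);
    src += src_stride << 2; /* advance 4 rows */
    ref += ref_stride << 2;
  }
  return sad;
}

/* One macro then stamps out each fixed-size entry point. */
#define SAD_32xN_FN(n)                                       \
  uint32_t sad_32x##n(const uint16_t *src, int src_stride,   \
                      const uint16_t *ref, int ref_stride) { \
    return sad_32xN(n, src, src_stride, ref, ref_stride);    \
  }

SAD_32xN_FN(16)
SAD_32xN_FN(32)
SAD_32xN_FN(64)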
- -unsigned int aom_highbd_sad32x64_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { - uint32_t sum = aom_highbd_sad32x32_avx2(src, src_stride, ref, ref_stride); - src += src_stride << 5; - ref += ref_stride << 5; - sum += aom_highbd_sad32x32_avx2(src, src_stride, ref, ref_stride); - return sum; -} - static void sad64x2(const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr, int ref_stride, const uint16_t *sec_ptr, __m256i *sad_acc) { - __m256i s[8], r[8]; - const __m256i zero = _mm256_setzero_si256(); - - s[0] = _mm256_loadu_si256((const __m256i *)src_ptr); - s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16)); - s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 32)); - s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 48)); - s[4] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride)); - s[5] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 16)); - s[6] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 32)); - s[7] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 48)); - - r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr); - r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16)); - r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32)); - r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 48)); - r[4] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride)); - r[5] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 16)); - r[6] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 32)); - r[7] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 48)); - - if (sec_ptr) { - r[0] = _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr)); - r[1] = _mm256_avg_epu16( - r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16))); - r[2] = _mm256_avg_epu16( - r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32))); - r[3] = _mm256_avg_epu16( - r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48))); - r[4] = _mm256_avg_epu16( - r[4], _mm256_loadu_si256((const __m256i *)(sec_ptr + 64))); - r[5] = _mm256_avg_epu16( - r[5], _mm256_loadu_si256((const __m256i *)(sec_ptr + 80))); - r[6] = _mm256_avg_epu16( - r[6], _mm256_loadu_si256((const __m256i *)(sec_ptr + 96))); - r[7] = _mm256_avg_epu16( - r[7], _mm256_loadu_si256((const __m256i *)(sec_ptr + 112))); + __m256i s[4], r[4]; + int i; + for (i = 0; i < 2; i++) { + s[0] = _mm256_loadu_si256((const __m256i *)src_ptr); + s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16)); + s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 32)); + s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 48)); + + r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr); + r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16)); + r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32)); + r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 48)); + if (sec_ptr) { + r[0] = + _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr)); + r[1] = _mm256_avg_epu16( + r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16))); + r[2] = _mm256_avg_epu16( + r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32))); + r[3] = _mm256_avg_epu16( + r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48))); + sec_ptr += 64; + } + highbd_sad16x4_core_avx2(s, r, sad_acc); + src_ptr += src_stride; + ref_ptr += ref_stride; } - - s[0] = _mm256_sub_epi16(s[0], r[0]); - s[1] = _mm256_sub_epi16(s[1], r[1]); - s[2] = _mm256_sub_epi16(s[2], r[2]); - s[3] = _mm256_sub_epi16(s[3], r[3]); - s[4] = 
_mm256_sub_epi16(s[4], r[4]); - s[5] = _mm256_sub_epi16(s[5], r[5]); - s[6] = _mm256_sub_epi16(s[6], r[6]); - s[7] = _mm256_sub_epi16(s[7], r[7]); - - s[0] = _mm256_abs_epi16(s[0]); - s[1] = _mm256_abs_epi16(s[1]); - s[2] = _mm256_abs_epi16(s[2]); - s[3] = _mm256_abs_epi16(s[3]); - s[4] = _mm256_abs_epi16(s[4]); - s[5] = _mm256_abs_epi16(s[5]); - s[6] = _mm256_abs_epi16(s[6]); - s[7] = _mm256_abs_epi16(s[7]); - - s[0] = _mm256_add_epi16(s[0], s[1]); - s[0] = _mm256_add_epi16(s[0], s[2]); - s[0] = _mm256_add_epi16(s[0], s[3]); - - s[4] = _mm256_add_epi16(s[4], s[5]); - s[4] = _mm256_add_epi16(s[4], s[6]); - s[4] = _mm256_add_epi16(s[4], s[7]); - - r[0] = _mm256_unpacklo_epi16(s[0], zero); - r[1] = _mm256_unpackhi_epi16(s[0], zero); - r[2] = _mm256_unpacklo_epi16(s[4], zero); - r[3] = _mm256_unpackhi_epi16(s[4], zero); - - r[0] = _mm256_add_epi32(r[0], r[1]); - r[0] = _mm256_add_epi32(r[0], r[2]); - r[0] = _mm256_add_epi32(r[0], r[3]); - *sad_acc = _mm256_add_epi32(*sad_acc, r[0]); } -unsigned int aom_highbd_sad64x32_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { +static AOM_FORCE_INLINE unsigned int aom_highbd_sad64xN_avx2(int N, + const uint8_t *src, + int src_stride, + const uint8_t *ref, + int ref_stride) { __m256i sad = _mm256_setzero_si256(); uint16_t *srcp = CONVERT_TO_SHORTPTR(src); uint16_t *refp = CONVERT_TO_SHORTPTR(ref); const int left_shift = 1; - int row_section = 0; - - while (row_section < 16) { + int i; + for (i = 0; i < N; i += 2) { sad64x2(srcp, src_stride, refp, ref_stride, NULL, &sad); srcp += src_stride << left_shift; refp += ref_stride << left_shift; - row_section += 1; } return get_sad_from_mm256_epi32(&sad); } -unsigned int aom_highbd_sad64x64_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { - uint32_t sum = aom_highbd_sad64x32_avx2(src, src_stride, ref, ref_stride); - src += src_stride << 5; - ref += ref_stride << 5; - sum += aom_highbd_sad64x32_avx2(src, src_stride, ref, ref_stride); - return sum; -} - static void sad128x1(const uint16_t *src_ptr, const uint16_t *ref_ptr, const uint16_t *sec_ptr, __m256i *sad_acc) { - __m256i s[8], r[8]; - const __m256i zero = _mm256_setzero_si256(); - - s[0] = _mm256_loadu_si256((const __m256i *)src_ptr); - s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16)); - s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 32)); - s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 48)); - s[4] = _mm256_loadu_si256((const __m256i *)(src_ptr + 64)); - s[5] = _mm256_loadu_si256((const __m256i *)(src_ptr + 80)); - s[6] = _mm256_loadu_si256((const __m256i *)(src_ptr + 96)); - s[7] = _mm256_loadu_si256((const __m256i *)(src_ptr + 112)); - - r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr); - r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16)); - r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32)); - r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 48)); - r[4] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 64)); - r[5] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 80)); - r[6] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 96)); - r[7] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 112)); - - if (sec_ptr) { - r[0] = _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr)); - r[1] = _mm256_avg_epu16( - r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16))); - r[2] = _mm256_avg_epu16( - r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32))); - r[3] = _mm256_avg_epu16( - r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 
48))); - r[4] = _mm256_avg_epu16( - r[4], _mm256_loadu_si256((const __m256i *)(sec_ptr + 64))); - r[5] = _mm256_avg_epu16( - r[5], _mm256_loadu_si256((const __m256i *)(sec_ptr + 80))); - r[6] = _mm256_avg_epu16( - r[6], _mm256_loadu_si256((const __m256i *)(sec_ptr + 96))); - r[7] = _mm256_avg_epu16( - r[7], _mm256_loadu_si256((const __m256i *)(sec_ptr + 112))); + __m256i s[4], r[4]; + int i; + for (i = 0; i < 2; i++) { + s[0] = _mm256_loadu_si256((const __m256i *)src_ptr); + s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16)); + s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 32)); + s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 48)); + r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr); + r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16)); + r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32)); + r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 48)); + if (sec_ptr) { + r[0] = + _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr)); + r[1] = _mm256_avg_epu16( + r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16))); + r[2] = _mm256_avg_epu16( + r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32))); + r[3] = _mm256_avg_epu16( + r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48))); + sec_ptr += 64; + } + highbd_sad16x4_core_avx2(s, r, sad_acc); + src_ptr += 64; + ref_ptr += 64; } - - s[0] = _mm256_sub_epi16(s[0], r[0]); - s[1] = _mm256_sub_epi16(s[1], r[1]); - s[2] = _mm256_sub_epi16(s[2], r[2]); - s[3] = _mm256_sub_epi16(s[3], r[3]); - s[4] = _mm256_sub_epi16(s[4], r[4]); - s[5] = _mm256_sub_epi16(s[5], r[5]); - s[6] = _mm256_sub_epi16(s[6], r[6]); - s[7] = _mm256_sub_epi16(s[7], r[7]); - - s[0] = _mm256_abs_epi16(s[0]); - s[1] = _mm256_abs_epi16(s[1]); - s[2] = _mm256_abs_epi16(s[2]); - s[3] = _mm256_abs_epi16(s[3]); - s[4] = _mm256_abs_epi16(s[4]); - s[5] = _mm256_abs_epi16(s[5]); - s[6] = _mm256_abs_epi16(s[6]); - s[7] = _mm256_abs_epi16(s[7]); - - s[0] = _mm256_add_epi16(s[0], s[1]); - s[0] = _mm256_add_epi16(s[0], s[2]); - s[0] = _mm256_add_epi16(s[0], s[3]); - - s[4] = _mm256_add_epi16(s[4], s[5]); - s[4] = _mm256_add_epi16(s[4], s[6]); - s[4] = _mm256_add_epi16(s[4], s[7]); - - r[0] = _mm256_unpacklo_epi16(s[0], zero); - r[1] = _mm256_unpackhi_epi16(s[0], zero); - r[2] = _mm256_unpacklo_epi16(s[4], zero); - r[3] = _mm256_unpackhi_epi16(s[4], zero); - - r[0] = _mm256_add_epi32(r[0], r[1]); - r[0] = _mm256_add_epi32(r[0], r[2]); - r[0] = _mm256_add_epi32(r[0], r[3]); - *sad_acc = _mm256_add_epi32(*sad_acc, r[0]); } -unsigned int aom_highbd_sad128x64_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { +static AOM_FORCE_INLINE unsigned int aom_highbd_sad128xN_avx2( + int N, const uint8_t *src, int src_stride, const uint8_t *ref, + int ref_stride) { __m256i sad = _mm256_setzero_si256(); uint16_t *srcp = CONVERT_TO_SHORTPTR(src); uint16_t *refp = CONVERT_TO_SHORTPTR(ref); int row = 0; - while (row < 64) { + while (row < N) { sad128x1(srcp, refp, NULL, &sad); srcp += src_stride; refp += ref_stride; - row += 1; + row++; } return get_sad_from_mm256_epi32(&sad); } -unsigned int aom_highbd_sad64x128_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { - uint32_t sum = aom_highbd_sad64x64_avx2(src, src_stride, ref, ref_stride); - src += src_stride << 6; - ref += ref_stride << 6; - sum += aom_highbd_sad64x64_avx2(src, src_stride, ref, ref_stride); - return sum; -} - -unsigned int aom_highbd_sad128x128_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int 
ref_stride) { - uint32_t sum = aom_highbd_sad128x64_avx2(src, src_stride, ref, ref_stride); - src += src_stride << 6; - ref += ref_stride << 6; - sum += aom_highbd_sad128x64_avx2(src, src_stride, ref, ref_stride); - return sum; -} - -// If sec_ptr = 0, calculate regular SAD. Otherwise, calculate average SAD. -static INLINE void sad16x4(const uint16_t *src_ptr, int src_stride, - const uint16_t *ref_ptr, int ref_stride, - const uint16_t *sec_ptr, __m256i *sad_acc) { - __m256i s0, s1, s2, s3, r0, r1, r2, r3; - const __m256i zero = _mm256_setzero_si256(); - - s0 = _mm256_loadu_si256((const __m256i *)src_ptr); - s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride)); - s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride)); - s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride)); - - r0 = _mm256_loadu_si256((const __m256i *)ref_ptr); - r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride)); - r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride)); - r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride)); - - if (sec_ptr) { - r0 = _mm256_avg_epu16(r0, _mm256_loadu_si256((const __m256i *)sec_ptr)); - r1 = _mm256_avg_epu16(r1, - _mm256_loadu_si256((const __m256i *)(sec_ptr + 16))); - r2 = _mm256_avg_epu16(r2, - _mm256_loadu_si256((const __m256i *)(sec_ptr + 32))); - r3 = _mm256_avg_epu16(r3, - _mm256_loadu_si256((const __m256i *)(sec_ptr + 48))); +#define highbd_sadMxN_avx2(m, n) \ + unsigned int aom_highbd_sad##m##x##n##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return aom_highbd_sad##m##xN_avx2(n, src, src_stride, ref, ref_stride); \ } - s0 = _mm256_sub_epi16(s0, r0); - s1 = _mm256_sub_epi16(s1, r1); - s2 = _mm256_sub_epi16(s2, r2); - s3 = _mm256_sub_epi16(s3, r3); +highbd_sadMxN_avx2(16, 4); +highbd_sadMxN_avx2(16, 8); +highbd_sadMxN_avx2(16, 16); +highbd_sadMxN_avx2(16, 32); +highbd_sadMxN_avx2(16, 64); + +highbd_sadMxN_avx2(32, 8); +highbd_sadMxN_avx2(32, 16); +highbd_sadMxN_avx2(32, 32); +highbd_sadMxN_avx2(32, 64); - s0 = _mm256_abs_epi16(s0); - s1 = _mm256_abs_epi16(s1); - s2 = _mm256_abs_epi16(s2); - s3 = _mm256_abs_epi16(s3); +highbd_sadMxN_avx2(64, 16); +highbd_sadMxN_avx2(64, 32); +highbd_sadMxN_avx2(64, 64); +highbd_sadMxN_avx2(64, 128); - s0 = _mm256_add_epi16(s0, s1); - s0 = _mm256_add_epi16(s0, s2); - s0 = _mm256_add_epi16(s0, s3); +highbd_sadMxN_avx2(128, 64); +highbd_sadMxN_avx2(128, 128); - r0 = _mm256_unpacklo_epi16(s0, zero); - r1 = _mm256_unpackhi_epi16(s0, zero); +unsigned int aom_highbd_sad16x4_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); + sad16x4(srcp, src_stride, refp, ref_stride, secp, &sad); - r0 = _mm256_add_epi32(r0, r1); - *sad_acc = _mm256_add_epi32(*sad_acc, r0); + return get_sad_from_mm256_epi32(&sad); } unsigned int aom_highbd_sad16x8_avg_avx2(const uint8_t *src, int src_stride, @@ -566,6 +336,40 @@ unsigned int aom_highbd_sad16x32_avg_avx2(const uint8_t *src, int src_stride, return sum; } +unsigned int aom_highbd_sad16x64_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + const int left_shift = 5; + uint32_t sum = aom_highbd_sad16x32_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + src += src_stride << 
left_shift; + ref += ref_stride << left_shift; + second_pred += 16 << left_shift; + sum += aom_highbd_sad16x32_avg_avx2(src, src_stride, ref, ref_stride, + second_pred); + return sum; +} + +unsigned int aom_highbd_sad32x8_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); + const int left_shift = 2; + int row_section = 0; + + while (row_section < 2) { + sad32x4(srcp, src_stride, refp, ref_stride, secp, &sad); + srcp += src_stride << left_shift; + refp += ref_stride << left_shift; + secp += 32 << left_shift; + row_section += 1; + } + return get_sad_from_mm256_epi32(&sad); +} + unsigned int aom_highbd_sad32x16_avg_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred) { @@ -614,6 +418,26 @@ unsigned int aom_highbd_sad32x64_avg_avx2(const uint8_t *src, int src_stride, return sum; } +unsigned int aom_highbd_sad64x16_avg_avx2(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred) { + __m256i sad = _mm256_setzero_si256(); + uint16_t *srcp = CONVERT_TO_SHORTPTR(src); + uint16_t *refp = CONVERT_TO_SHORTPTR(ref); + uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred); + const int left_shift = 1; + int row_section = 0; + + while (row_section < 8) { + sad64x2(srcp, src_stride, refp, ref_stride, secp, &sad); + srcp += src_stride << left_shift; + refp += ref_stride << left_shift; + secp += 64 << left_shift; + row_section += 1; + } + return get_sad_from_mm256_epi32(&sad); +} + unsigned int aom_highbd_sad64x32_avg_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred) { @@ -697,7 +521,7 @@ unsigned int aom_highbd_sad128x128_avg_avx2(const uint8_t *src, int src_stride, } // SAD 4D -// Combine 4 __m256i vectors to uint32_t result[4] +// Combine 4 __m256i input vectors v to uint32_t result[4] static INLINE void get_4d_sad_from_mm256_epi32(const __m256i *v, uint32_t *res) { __m256i u0, u1, u2, u3; @@ -752,287 +576,124 @@ static void init_sad(__m256i *s) { s[3] = _mm256_setzero_si256(); } -void aom_highbd_sad16x8x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref_array[], - int ref_stride, uint32_t *sad_array) { +static AOM_FORCE_INLINE void aom_highbd_sad16xNx4d_avx2( + int N, const uint8_t *src, int src_stride, const uint8_t *const ref_array[], + int ref_stride, uint32_t *sad_array) { __m256i sad_vec[4]; const uint16_t *refp[4]; const uint16_t *keep = CONVERT_TO_SHORTPTR(src); const uint16_t *srcp; const int shift_for_4_rows = 2; - int i; + int i, j; init_sad(sad_vec); convert_pointers(ref_array, refp); for (i = 0; i < 4; ++i) { srcp = keep; - sad16x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]); - srcp += src_stride << shift_for_4_rows; - refp[i] += ref_stride << shift_for_4_rows; - sad16x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]); + for (j = 0; j < N; j += 4) { + sad16x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]); + srcp += src_stride << shift_for_4_rows; + refp[i] += ref_stride << shift_for_4_rows; + } } get_4d_sad_from_mm256_epi32(sad_vec, sad_array); } -void aom_highbd_sad16x16x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref_array[], - int ref_stride, uint32_t *sad_array) { - uint32_t first8rows[4]; - uint32_t second8rows[4]; - const 
uint8_t *ref[4]; - const int shift_for_8_rows = 3; - - ref[0] = ref_array[0]; - ref[1] = ref_array[1]; - ref[2] = ref_array[2]; - ref[3] = ref_array[3]; - - aom_highbd_sad16x8x4d_avx2(src, src_stride, ref, ref_stride, first8rows); - src += src_stride << shift_for_8_rows; - ref[0] += ref_stride << shift_for_8_rows; - ref[1] += ref_stride << shift_for_8_rows; - ref[2] += ref_stride << shift_for_8_rows; - ref[3] += ref_stride << shift_for_8_rows; - aom_highbd_sad16x8x4d_avx2(src, src_stride, ref, ref_stride, second8rows); - sad_array[0] = first8rows[0] + second8rows[0]; - sad_array[1] = first8rows[1] + second8rows[1]; - sad_array[2] = first8rows[2] + second8rows[2]; - sad_array[3] = first8rows[3] + second8rows[3]; -} - -void aom_highbd_sad16x32x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref_array[], - int ref_stride, uint32_t *sad_array) { - uint32_t first_half[4]; - uint32_t second_half[4]; - const uint8_t *ref[4]; - const int shift_for_rows = 4; - - ref[0] = ref_array[0]; - ref[1] = ref_array[1]; - ref[2] = ref_array[2]; - ref[3] = ref_array[3]; - - aom_highbd_sad16x16x4d_avx2(src, src_stride, ref, ref_stride, first_half); - src += src_stride << shift_for_rows; - ref[0] += ref_stride << shift_for_rows; - ref[1] += ref_stride << shift_for_rows; - ref[2] += ref_stride << shift_for_rows; - ref[3] += ref_stride << shift_for_rows; - aom_highbd_sad16x16x4d_avx2(src, src_stride, ref, ref_stride, second_half); - sad_array[0] = first_half[0] + second_half[0]; - sad_array[1] = first_half[1] + second_half[1]; - sad_array[2] = first_half[2] + second_half[2]; - sad_array[3] = first_half[3] + second_half[3]; -} - -void aom_highbd_sad32x16x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref_array[], - int ref_stride, uint32_t *sad_array) { +static AOM_FORCE_INLINE void aom_highbd_sad32xNx4d_avx2( + int N, const uint8_t *src, int src_stride, const uint8_t *const ref_array[], + int ref_stride, uint32_t *sad_array) { __m256i sad_vec[4]; const uint16_t *refp[4]; const uint16_t *keep = CONVERT_TO_SHORTPTR(src); const uint16_t *srcp; const int shift_for_4_rows = 2; - int i; - int rows_section; + int i, r; init_sad(sad_vec); convert_pointers(ref_array, refp); for (i = 0; i < 4; ++i) { srcp = keep; - rows_section = 0; - while (rows_section < 4) { + for (r = 0; r < N; r += 4) { sad32x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]); srcp += src_stride << shift_for_4_rows; refp[i] += ref_stride << shift_for_4_rows; - rows_section++; } } get_4d_sad_from_mm256_epi32(sad_vec, sad_array); } -void aom_highbd_sad32x32x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref_array[], - int ref_stride, uint32_t *sad_array) { - uint32_t first_half[4]; - uint32_t second_half[4]; - const uint8_t *ref[4]; - const int shift_for_rows = 4; - - ref[0] = ref_array[0]; - ref[1] = ref_array[1]; - ref[2] = ref_array[2]; - ref[3] = ref_array[3]; - - aom_highbd_sad32x16x4d_avx2(src, src_stride, ref, ref_stride, first_half); - src += src_stride << shift_for_rows; - ref[0] += ref_stride << shift_for_rows; - ref[1] += ref_stride << shift_for_rows; - ref[2] += ref_stride << shift_for_rows; - ref[3] += ref_stride << shift_for_rows; - aom_highbd_sad32x16x4d_avx2(src, src_stride, ref, ref_stride, second_half); - sad_array[0] = first_half[0] + second_half[0]; - sad_array[1] = first_half[1] + second_half[1]; - sad_array[2] = first_half[2] + second_half[2]; - sad_array[3] = first_half[3] + second_half[3]; -} - -void aom_highbd_sad32x64x4d_avx2(const uint8_t *src, int src_stride, - 
const uint8_t *const ref_array[], - int ref_stride, uint32_t *sad_array) { - uint32_t first_half[4]; - uint32_t second_half[4]; - const uint8_t *ref[4]; - const int shift_for_rows = 5; - - ref[0] = ref_array[0]; - ref[1] = ref_array[1]; - ref[2] = ref_array[2]; - ref[3] = ref_array[3]; - - aom_highbd_sad32x32x4d_avx2(src, src_stride, ref, ref_stride, first_half); - src += src_stride << shift_for_rows; - ref[0] += ref_stride << shift_for_rows; - ref[1] += ref_stride << shift_for_rows; - ref[2] += ref_stride << shift_for_rows; - ref[3] += ref_stride << shift_for_rows; - aom_highbd_sad32x32x4d_avx2(src, src_stride, ref, ref_stride, second_half); - sad_array[0] = first_half[0] + second_half[0]; - sad_array[1] = first_half[1] + second_half[1]; - sad_array[2] = first_half[2] + second_half[2]; - sad_array[3] = first_half[3] + second_half[3]; -} - -void aom_highbd_sad64x32x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref_array[], - int ref_stride, uint32_t *sad_array) { +static AOM_FORCE_INLINE void aom_highbd_sad64xNx4d_avx2( + int N, const uint8_t *src, int src_stride, const uint8_t *const ref_array[], + int ref_stride, uint32_t *sad_array) { __m256i sad_vec[4]; const uint16_t *refp[4]; const uint16_t *keep = CONVERT_TO_SHORTPTR(src); const uint16_t *srcp; const int shift_for_rows = 1; - int i; - int rows_section; + int i, r; init_sad(sad_vec); convert_pointers(ref_array, refp); for (i = 0; i < 4; ++i) { srcp = keep; - rows_section = 0; - while (rows_section < 16) { + for (r = 0; r < N; r += 2) { sad64x2(srcp, src_stride, refp[i], ref_stride, NULL, &sad_vec[i]); srcp += src_stride << shift_for_rows; refp[i] += ref_stride << shift_for_rows; - rows_section++; } } get_4d_sad_from_mm256_epi32(sad_vec, sad_array); } -void aom_highbd_sad64x64x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref_array[], - int ref_stride, uint32_t *sad_array) { - uint32_t first_half[4]; - uint32_t second_half[4]; - const uint8_t *ref[4]; - const int shift_for_rows = 5; - - ref[0] = ref_array[0]; - ref[1] = ref_array[1]; - ref[2] = ref_array[2]; - ref[3] = ref_array[3]; - - aom_highbd_sad64x32x4d_avx2(src, src_stride, ref, ref_stride, first_half); - src += src_stride << shift_for_rows; - ref[0] += ref_stride << shift_for_rows; - ref[1] += ref_stride << shift_for_rows; - ref[2] += ref_stride << shift_for_rows; - ref[3] += ref_stride << shift_for_rows; - aom_highbd_sad64x32x4d_avx2(src, src_stride, ref, ref_stride, second_half); - sad_array[0] = first_half[0] + second_half[0]; - sad_array[1] = first_half[1] + second_half[1]; - sad_array[2] = first_half[2] + second_half[2]; - sad_array[3] = first_half[3] + second_half[3]; -} - -void aom_highbd_sad64x128x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref_array[], - int ref_stride, uint32_t *sad_array) { - uint32_t first_half[4]; - uint32_t second_half[4]; - const uint8_t *ref[4]; - const int shift_for_rows = 6; - - ref[0] = ref_array[0]; - ref[1] = ref_array[1]; - ref[2] = ref_array[2]; - ref[3] = ref_array[3]; - - aom_highbd_sad64x64x4d_avx2(src, src_stride, ref, ref_stride, first_half); - src += src_stride << shift_for_rows; - ref[0] += ref_stride << shift_for_rows; - ref[1] += ref_stride << shift_for_rows; - ref[2] += ref_stride << shift_for_rows; - ref[3] += ref_stride << shift_for_rows; - aom_highbd_sad64x64x4d_avx2(src, src_stride, ref, ref_stride, second_half); - sad_array[0] = first_half[0] + second_half[0]; - sad_array[1] = first_half[1] + second_half[1]; - sad_array[2] = first_half[2] + second_half[2]; - 
sad_array[3] = first_half[3] + second_half[3]; -} - -void aom_highbd_sad128x64x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref_array[], - int ref_stride, uint32_t *sad_array) { +static AOM_FORCE_INLINE void aom_highbd_sad128xNx4d_avx2( + int N, const uint8_t *src, int src_stride, const uint8_t *const ref_array[], + int ref_stride, uint32_t *sad_array) { __m256i sad_vec[4]; const uint16_t *refp[4]; const uint16_t *keep = CONVERT_TO_SHORTPTR(src); const uint16_t *srcp; - int i; - int rows_section; + int i, r; init_sad(sad_vec); convert_pointers(ref_array, refp); for (i = 0; i < 4; ++i) { srcp = keep; - rows_section = 0; - while (rows_section < 64) { + for (r = 0; r < N; r++) { sad128x1(srcp, refp[i], NULL, &sad_vec[i]); srcp += src_stride; refp[i] += ref_stride; - rows_section++; } } get_4d_sad_from_mm256_epi32(sad_vec, sad_array); } -void aom_highbd_sad128x128x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref_array[], - int ref_stride, uint32_t *sad_array) { - uint32_t first_half[4]; - uint32_t second_half[4]; - const uint8_t *ref[4]; - const int shift_for_rows = 6; - - ref[0] = ref_array[0]; - ref[1] = ref_array[1]; - ref[2] = ref_array[2]; - ref[3] = ref_array[3]; - - aom_highbd_sad128x64x4d_avx2(src, src_stride, ref, ref_stride, first_half); - src += src_stride << shift_for_rows; - ref[0] += ref_stride << shift_for_rows; - ref[1] += ref_stride << shift_for_rows; - ref[2] += ref_stride << shift_for_rows; - ref[3] += ref_stride << shift_for_rows; - aom_highbd_sad128x64x4d_avx2(src, src_stride, ref, ref_stride, second_half); - sad_array[0] = first_half[0] + second_half[0]; - sad_array[1] = first_half[1] + second_half[1]; - sad_array[2] = first_half[2] + second_half[2]; - sad_array[3] = first_half[3] + second_half[3]; -} +#define highbd_sadMxNx4d_avx2(m, n) \ + void aom_highbd_sad##m##x##n##x4d_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[], \ + int ref_stride, uint32_t *sad_array) { \ + aom_highbd_sad##m##xNx4d_avx2(n, src, src_stride, ref_array, ref_stride, \ + sad_array); \ + } + +highbd_sadMxNx4d_avx2(16, 4); +highbd_sadMxNx4d_avx2(16, 8); +highbd_sadMxNx4d_avx2(16, 16); +highbd_sadMxNx4d_avx2(16, 32); +highbd_sadMxNx4d_avx2(16, 64); + +highbd_sadMxNx4d_avx2(32, 8); +highbd_sadMxNx4d_avx2(32, 16); +highbd_sadMxNx4d_avx2(32, 32); +highbd_sadMxNx4d_avx2(32, 64); + +highbd_sadMxNx4d_avx2(64, 16); +highbd_sadMxNx4d_avx2(64, 32); +highbd_sadMxNx4d_avx2(64, 64); +highbd_sadMxNx4d_avx2(64, 128); + +highbd_sadMxNx4d_avx2(128, 64); +highbd_sadMxNx4d_avx2(128, 128); diff --git a/media/libaom/src/aom_dsp/x86/sad_impl_avx2.c b/media/libaom/src/aom_dsp/x86/sad_impl_avx2.c index c6fd62c9e2..f77a585b4c 100644 --- a/media/libaom/src/aom_dsp/x86/sad_impl_avx2.c +++ b/media/libaom/src/aom_dsp/x86/sad_impl_avx2.c @@ -84,81 +84,6 @@ unsigned int aom_sad128x128_avx2(const uint8_t *src_ptr, int src_stride, return sum; } -static void sad64x64x4d(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, - __m128i *res) { - uint32_t sum[4]; - aom_sad64x64x4d_avx2(src, src_stride, ref, ref_stride, sum); - *res = _mm_loadu_si128((const __m128i *)sum); -} - -void aom_sad64x128x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, - uint32_t res[4]) { - __m128i sum0, sum1; - const uint8_t *rf[4]; - - rf[0] = ref[0]; - rf[1] = ref[1]; - rf[2] = ref[2]; - rf[3] = ref[3]; - sad64x64x4d(src, src_stride, rf, ref_stride, &sum0); - src += src_stride << 6; - rf[0] += ref_stride << 6; - 
rf[1] += ref_stride << 6; - rf[2] += ref_stride << 6; - rf[3] += ref_stride << 6; - sad64x64x4d(src, src_stride, rf, ref_stride, &sum1); - sum0 = _mm_add_epi32(sum0, sum1); - _mm_storeu_si128((__m128i *)res, sum0); -} - -void aom_sad128x64x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, - uint32_t res[4]) { - __m128i sum0, sum1; - unsigned int half_width = 64; - const uint8_t *rf[4]; - - rf[0] = ref[0]; - rf[1] = ref[1]; - rf[2] = ref[2]; - rf[3] = ref[3]; - sad64x64x4d(src, src_stride, rf, ref_stride, &sum0); - src += half_width; - rf[0] += half_width; - rf[1] += half_width; - rf[2] += half_width; - rf[3] += half_width; - sad64x64x4d(src, src_stride, rf, ref_stride, &sum1); - sum0 = _mm_add_epi32(sum0, sum1); - _mm_storeu_si128((__m128i *)res, sum0); -} - -void aom_sad128x128x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, - uint32_t res[4]) { - const uint8_t *rf[4]; - uint32_t sum0[4]; - uint32_t sum1[4]; - - rf[0] = ref[0]; - rf[1] = ref[1]; - rf[2] = ref[2]; - rf[3] = ref[3]; - aom_sad128x64x4d_avx2(src, src_stride, rf, ref_stride, sum0); - src += src_stride << 6; - rf[0] += ref_stride << 6; - rf[1] += ref_stride << 6; - rf[2] += ref_stride << 6; - rf[3] += ref_stride << 6; - aom_sad128x64x4d_avx2(src, src_stride, rf, ref_stride, sum1); - res[0] = sum0[0] + sum1[0]; - res[1] = sum0[1] + sum1[1]; - res[2] = sum0[2] + sum1[2]; - res[3] = sum0[3] + sum1[3]; -} - static unsigned int sad_w64_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const int h, const uint8_t *second_pred, diff --git a/media/libaom/src/aom_dsp/x86/sse_avx2.c b/media/libaom/src/aom_dsp/x86/sse_avx2.c index 305dde5c08..e6ee2fcab9 100644 --- a/media/libaom/src/aom_dsp/x86/sse_avx2.c +++ b/media/libaom/src/aom_dsp/x86/sse_avx2.c @@ -21,12 +21,11 @@ static INLINE void sse_w32_avx2(__m256i *sum, const uint8_t *a, const uint8_t *b) { const __m256i v_a0 = yy_loadu_256(a); const __m256i v_b0 = yy_loadu_256(b); - const __m256i v_a00_w = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_a0)); - const __m256i v_a01_w = - _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_a0, 1)); - const __m256i v_b00_w = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_b0)); - const __m256i v_b01_w = - _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_b0, 1)); + const __m256i zero = _mm256_setzero_si256(); + const __m256i v_a00_w = _mm256_unpacklo_epi8(v_a0, zero); + const __m256i v_a01_w = _mm256_unpackhi_epi8(v_a0, zero); + const __m256i v_b00_w = _mm256_unpacklo_epi8(v_b0, zero); + const __m256i v_b01_w = _mm256_unpackhi_epi8(v_b0, zero); const __m256i v_d00_w = _mm256_sub_epi16(v_a00_w, v_b00_w); const __m256i v_d01_w = _mm256_sub_epi16(v_a01_w, v_b01_w); *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d00_w, v_d00_w)); @@ -35,11 +34,29 @@ static INLINE void sse_w32_avx2(__m256i *sum, const uint8_t *a, static INLINE int64_t summary_all_avx2(const __m256i *sum_all) { int64_t sum; + __m256i zero = _mm256_setzero_si256(); + const __m256i sum0_4x64 = _mm256_unpacklo_epi32(*sum_all, zero); + const __m256i sum1_4x64 = _mm256_unpackhi_epi32(*sum_all, zero); + const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64); + const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64), + _mm256_extracti128_si256(sum_4x64, 1)); + const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8)); + xx_storel_64(&sum, sum_1x64); + return sum; +} + +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE void 
summary_32_avx2(const __m256i *sum32, __m256i *sum) { const __m256i sum0_4x64 = - _mm256_cvtepu32_epi64(_mm256_castsi256_si128(*sum_all)); + _mm256_cvtepu32_epi64(_mm256_castsi256_si128(*sum32)); const __m256i sum1_4x64 = - _mm256_cvtepu32_epi64(_mm256_extracti128_si256(*sum_all, 1)); + _mm256_cvtepu32_epi64(_mm256_extracti128_si256(*sum32, 1)); const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64); + *sum = _mm256_add_epi64(*sum, sum_4x64); +} + +static INLINE int64_t summary_4x64_avx2(const __m256i sum_4x64) { + int64_t sum; const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64), _mm256_extracti128_si256(sum_4x64, 1)); const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8)); @@ -47,31 +64,48 @@ static INLINE int64_t summary_all_avx2(const __m256i *sum_all) { xx_storel_64(&sum, sum_1x64); return sum; } +#endif +static INLINE void sse_w4x4_avx2(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, __m256i *sum) { + const __m128i v_a0 = xx_loadl_32(a); + const __m128i v_a1 = xx_loadl_32(a + a_stride); + const __m128i v_a2 = xx_loadl_32(a + a_stride * 2); + const __m128i v_a3 = xx_loadl_32(a + a_stride * 3); + const __m128i v_b0 = xx_loadl_32(b); + const __m128i v_b1 = xx_loadl_32(b + b_stride); + const __m128i v_b2 = xx_loadl_32(b + b_stride * 2); + const __m128i v_b3 = xx_loadl_32(b + b_stride * 3); + const __m128i v_a0123 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_a0, v_a1), + _mm_unpacklo_epi32(v_a2, v_a3)); + const __m128i v_b0123 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_b0, v_b1), + _mm_unpacklo_epi32(v_b2, v_b3)); + const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0123); + const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0123); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} +static INLINE void sse_w8x2_avx2(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, __m256i *sum) { + const __m128i v_a0 = xx_loadl_64(a); + const __m128i v_a1 = xx_loadl_64(a + a_stride); + const __m128i v_b0 = xx_loadl_64(b); + const __m128i v_b1 = xx_loadl_64(b + b_stride); + const __m256i v_a_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_a0, v_a1)); + const __m256i v_b_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_b0, v_b1)); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} int64_t aom_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height) { int32_t y = 0; int64_t sse = 0; __m256i sum = _mm256_setzero_si256(); + __m256i zero = _mm256_setzero_si256(); switch (width) { case 4: do { - const __m128i v_a0 = xx_loadl_32(a); - const __m128i v_a1 = xx_loadl_32(a + a_stride); - const __m128i v_a2 = xx_loadl_32(a + a_stride * 2); - const __m128i v_a3 = xx_loadl_32(a + a_stride * 3); - const __m128i v_b0 = xx_loadl_32(b); - const __m128i v_b1 = xx_loadl_32(b + b_stride); - const __m128i v_b2 = xx_loadl_32(b + b_stride * 2); - const __m128i v_b3 = xx_loadl_32(b + b_stride * 3); - const __m128i v_a0123 = _mm_unpacklo_epi64( - _mm_unpacklo_epi32(v_a0, v_a1), _mm_unpacklo_epi32(v_a2, v_a3)); - const __m128i v_b0123 = _mm_unpacklo_epi64( - _mm_unpacklo_epi32(v_b0, v_b1), _mm_unpacklo_epi32(v_b2, v_b3)); - const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0123); - const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0123); - const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); - sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w)); + sse_w4x4_avx2(a, a_stride, b, 
b_stride, &sum); a += a_stride << 2; b += b_stride << 2; y += 4; @@ -80,16 +114,7 @@ int64_t aom_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b, break; case 8: do { - const __m128i v_a0 = xx_loadl_64(a); - const __m128i v_a1 = xx_loadl_64(a + a_stride); - const __m128i v_b0 = xx_loadl_64(b); - const __m128i v_b1 = xx_loadl_64(b + b_stride); - const __m256i v_a_w = - _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_a0, v_a1)); - const __m256i v_b_w = - _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_b0, v_b1)); - const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); - sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w)); + sse_w8x2_avx2(a, a_stride, b, b_stride, &sum); a += a_stride << 1; b += b_stride << 1; y += 2; @@ -99,14 +124,26 @@ int64_t aom_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b, case 16: do { const __m128i v_a0 = xx_loadu_128(a); + const __m128i v_a1 = xx_loadu_128(a + a_stride); const __m128i v_b0 = xx_loadu_128(b); - const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0); - const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0); - const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); - sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w)); - a += a_stride; - b += b_stride; - y += 1; + const __m128i v_b1 = xx_loadu_128(b + b_stride); + const __m256i v_a = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_a0), v_a1, 0x01); + const __m256i v_b = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_b0), v_b1, 0x01); + const __m256i v_al = _mm256_unpacklo_epi8(v_a, zero); + const __m256i v_au = _mm256_unpackhi_epi8(v_a, zero); + const __m256i v_bl = _mm256_unpacklo_epi8(v_b, zero); + const __m256i v_bu = _mm256_unpackhi_epi8(v_b, zero); + const __m256i v_asub = _mm256_sub_epi16(v_al, v_bl); + const __m256i v_bsub = _mm256_sub_epi16(v_au, v_bu); + const __m256i temp = + _mm256_add_epi32(_mm256_madd_epi16(v_asub, v_asub), + _mm256_madd_epi16(v_bsub, v_bsub)); + sum = _mm256_add_epi32(sum, temp); + a += a_stride << 1; + b += b_stride << 1; + y += 2; } while (y < height); sse = summary_all_avx2(&sum); break; @@ -141,12 +178,42 @@ int64_t aom_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b, } while (y < height); sse = summary_all_avx2(&sum); break; - default: break; + default: + if ((width & 0x07) == 0) { + do { + int i = 0; + do { + sse_w8x2_avx2(a + i, a_stride, b + i, b_stride, &sum); + i += 8; + } while (i < width); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + } else { + do { + int i = 0; + do { + sse_w8x2_avx2(a + i, a_stride, b + i, b_stride, &sum); + const uint8_t *a2 = a + i + (a_stride << 1); + const uint8_t *b2 = b + i + (b_stride << 1); + sse_w8x2_avx2(a2, a_stride, b2, b_stride, &sum); + i += 8; + } while (i + 4 < width); + sse_w4x4_avx2(a + i, a_stride, b + i, b_stride, &sum); + a += a_stride << 2; + b += b_stride << 2; + y += 4; + } while (y < height); + } + sse = summary_all_avx2(&sum); + break; } return sse; } +#if CONFIG_AV1_HIGHBITDEPTH static INLINE void highbd_sse_w16_avx2(__m256i *sum, const uint16_t *a, const uint16_t *b) { const __m256i v_a_w = yy_loadu_256(a); @@ -155,6 +222,33 @@ static INLINE void highbd_sse_w16_avx2(__m256i *sum, const uint16_t *a, *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); } +static INLINE void highbd_sse_w4x4_avx2(__m256i *sum, const uint16_t *a, + int a_stride, const uint16_t *b, + int b_stride) { + const __m128i v_a0 = xx_loadl_64(a); + const __m128i v_a1 = xx_loadl_64(a + a_stride); + const __m128i v_a2 = xx_loadl_64(a + a_stride * 2); + const __m128i v_a3 = 
xx_loadl_64(a + a_stride * 3); + const __m128i v_b0 = xx_loadl_64(b); + const __m128i v_b1 = xx_loadl_64(b + b_stride); + const __m128i v_b2 = xx_loadl_64(b + b_stride * 2); + const __m128i v_b3 = xx_loadl_64(b + b_stride * 3); + const __m256i v_a_w = yy_set_m128i(_mm_unpacklo_epi64(v_a0, v_a1), + _mm_unpacklo_epi64(v_a2, v_a3)); + const __m256i v_b_w = yy_set_m128i(_mm_unpacklo_epi64(v_b0, v_b1), + _mm_unpacklo_epi64(v_b2, v_b3)); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} + +static INLINE void highbd_sse_w8x2_avx2(__m256i *sum, const uint16_t *a, + int a_stride, const uint16_t *b, + int b_stride) { + const __m256i v_a_w = yy_loadu2_128(a + a_stride, a); + const __m256i v_b_w = yy_loadu2_128(b + b_stride, b); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} int64_t aom_highbd_sse_avx2(const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, int width, int height) { int32_t y = 0; @@ -165,20 +259,7 @@ int64_t aom_highbd_sse_avx2(const uint8_t *a8, int a_stride, const uint8_t *b8, switch (width) { case 4: do { - const __m128i v_a0 = xx_loadl_64(a); - const __m128i v_a1 = xx_loadl_64(a + a_stride); - const __m128i v_a2 = xx_loadl_64(a + a_stride * 2); - const __m128i v_a3 = xx_loadl_64(a + a_stride * 3); - const __m128i v_b0 = xx_loadl_64(b); - const __m128i v_b1 = xx_loadl_64(b + b_stride); - const __m128i v_b2 = xx_loadl_64(b + b_stride * 2); - const __m128i v_b3 = xx_loadl_64(b + b_stride * 3); - const __m256i v_a_w = yy_set_m128i(_mm_unpacklo_epi64(v_a0, v_a1), - _mm_unpacklo_epi64(v_a2, v_a3)); - const __m256i v_b_w = yy_set_m128i(_mm_unpacklo_epi64(v_b0, v_b1), - _mm_unpacklo_epi64(v_b2, v_b3)); - const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); - sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w)); + highbd_sse_w4x4_avx2(&sum, a, a_stride, b, b_stride); a += a_stride << 2; b += b_stride << 2; y += 4; @@ -187,10 +268,7 @@ int64_t aom_highbd_sse_avx2(const uint8_t *a8, int a_stride, const uint8_t *b8, break; case 8: do { - const __m256i v_a_w = yy_loadu2_128(a + a_stride, a); - const __m256i v_b_w = yy_loadu2_128(b + b_stride, b); - const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); - sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w)); + highbd_sse_w8x2_avx2(&sum, a, a_stride, b, b_stride); a += a_stride << 1; b += b_stride << 1; y += 2; @@ -208,43 +286,99 @@ int64_t aom_highbd_sse_avx2(const uint8_t *a8, int a_stride, const uint8_t *b8, break; case 32: do { - highbd_sse_w16_avx2(&sum, a, b); - highbd_sse_w16_avx2(&sum, a + 16, b + 16); - a += a_stride; - b += b_stride; - y += 1; + int l = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + highbd_sse_w16_avx2(&sum32, a, b); + highbd_sse_w16_avx2(&sum32, a + 16, b + 16); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 64 && l < (height - y)); + summary_32_avx2(&sum32, &sum); + y += 64; } while (y < height); - sse = summary_all_avx2(&sum); + sse = summary_4x64_avx2(sum); break; case 64: do { - highbd_sse_w16_avx2(&sum, a, b); - highbd_sse_w16_avx2(&sum, a + 16 * 1, b + 16 * 1); - highbd_sse_w16_avx2(&sum, a + 16 * 2, b + 16 * 2); - highbd_sse_w16_avx2(&sum, a + 16 * 3, b + 16 * 3); - a += a_stride; - b += b_stride; - y += 1; + int l = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + highbd_sse_w16_avx2(&sum32, a, b); + highbd_sse_w16_avx2(&sum32, a + 16 * 1, b + 16 * 1); + highbd_sse_w16_avx2(&sum32, a + 16 * 2, b + 16 * 2); + 
highbd_sse_w16_avx2(&sum32, a + 16 * 3, b + 16 * 3); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 32 && l < (height - y)); + summary_32_avx2(&sum32, &sum); + y += 32; } while (y < height); - sse = summary_all_avx2(&sum); + sse = summary_4x64_avx2(sum); break; case 128: do { - highbd_sse_w16_avx2(&sum, a, b); - highbd_sse_w16_avx2(&sum, a + 16 * 1, b + 16 * 1); - highbd_sse_w16_avx2(&sum, a + 16 * 2, b + 16 * 2); - highbd_sse_w16_avx2(&sum, a + 16 * 3, b + 16 * 3); - highbd_sse_w16_avx2(&sum, a + 16 * 4, b + 16 * 4); - highbd_sse_w16_avx2(&sum, a + 16 * 5, b + 16 * 5); - highbd_sse_w16_avx2(&sum, a + 16 * 6, b + 16 * 6); - highbd_sse_w16_avx2(&sum, a + 16 * 7, b + 16 * 7); - a += a_stride; - b += b_stride; - y += 1; + int l = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + highbd_sse_w16_avx2(&sum32, a, b); + highbd_sse_w16_avx2(&sum32, a + 16 * 1, b + 16 * 1); + highbd_sse_w16_avx2(&sum32, a + 16 * 2, b + 16 * 2); + highbd_sse_w16_avx2(&sum32, a + 16 * 3, b + 16 * 3); + highbd_sse_w16_avx2(&sum32, a + 16 * 4, b + 16 * 4); + highbd_sse_w16_avx2(&sum32, a + 16 * 5, b + 16 * 5); + highbd_sse_w16_avx2(&sum32, a + 16 * 6, b + 16 * 6); + highbd_sse_w16_avx2(&sum32, a + 16 * 7, b + 16 * 7); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 16 && l < (height - y)); + summary_32_avx2(&sum32, &sum); + y += 16; } while (y < height); - sse = summary_all_avx2(&sum); + sse = summary_4x64_avx2(sum); + break; + default: + if (width & 0x7) { + do { + int i = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + highbd_sse_w8x2_avx2(&sum32, a + i, a_stride, b + i, b_stride); + const uint16_t *a2 = a + i + (a_stride << 1); + const uint16_t *b2 = b + i + (b_stride << 1); + highbd_sse_w8x2_avx2(&sum32, a2, a_stride, b2, b_stride); + i += 8; + } while (i + 4 < width); + highbd_sse_w4x4_avx2(&sum32, a + i, a_stride, b + i, b_stride); + summary_32_avx2(&sum32, &sum); + a += a_stride << 2; + b += b_stride << 2; + y += 4; + } while (y < height); + } else { + do { + int l = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + int i = 0; + do { + highbd_sse_w8x2_avx2(&sum32, a + i, a_stride, b + i, b_stride); + i += 8; + } while (i < width); + a += a_stride << 1; + b += b_stride << 1; + l += 2; + } while (l < 8 && l < (height - y)); + summary_32_avx2(&sum32, &sum); + y += 8; + } while (y < height); + } + sse = summary_4x64_avx2(sum); break; - default: break; } return sse; } +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/media/libaom/src/aom_dsp/x86/sse_sse4.c b/media/libaom/src/aom_dsp/x86/sse_sse4.c index 8b5af84691..5f95eb9aeb 100644 --- a/media/libaom/src/aom_dsp/x86/sse_sse4.c +++ b/media/libaom/src/aom_dsp/x86/sse_sse4.c @@ -28,6 +28,15 @@ static INLINE int64_t summary_all_sse4(const __m128i *sum_all) { return sum; } +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE void summary_32_sse4(const __m128i *sum32, __m128i *sum64) { + const __m128i sum0 = _mm_cvtepu32_epi64(*sum32); + const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum32, 8)); + *sum64 = _mm_add_epi64(sum0, *sum64); + *sum64 = _mm_add_epi64(sum1, *sum64); +} +#endif + static INLINE void sse_w16_sse4_1(__m128i *sum, const uint8_t *a, const uint8_t *b) { const __m128i v_a0 = xx_loadu_128(a); @@ -42,6 +51,27 @@ static INLINE void sse_w16_sse4_1(__m128i *sum, const uint8_t *a, *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d01_w, v_d01_w)); } +static INLINE void sse4x2_sse4_1(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, __m128i *sum) { + const __m128i v_a0 = xx_loadl_32(a); + const __m128i v_a1 = 
xx_loadl_32(a + a_stride); + const __m128i v_b0 = xx_loadl_32(b); + const __m128i v_b1 = xx_loadl_32(b + b_stride); + const __m128i v_a_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_a0, v_a1)); + const __m128i v_b_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_b0, v_b1)); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); +} +static INLINE void sse8_sse4_1(const uint8_t *a, const uint8_t *b, + __m128i *sum) { + const __m128i v_a0 = xx_loadl_64(a); + const __m128i v_b0 = xx_loadl_64(b); + const __m128i v_a_w = _mm_cvtepu8_epi16(v_a0); + const __m128i v_b_w = _mm_cvtepu8_epi16(v_b0); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); +} + int64_t aom_sse_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height) { int y = 0; @@ -50,14 +80,7 @@ int64_t aom_sse_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b, switch (width) { case 4: do { - const __m128i v_a0 = xx_loadl_32(a); - const __m128i v_a1 = xx_loadl_32(a + a_stride); - const __m128i v_b0 = xx_loadl_32(b); - const __m128i v_b1 = xx_loadl_32(b + b_stride); - const __m128i v_a_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_a0, v_a1)); - const __m128i v_b_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_b0, v_b1)); - const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); - sum = _mm_add_epi32(sum, _mm_madd_epi16(v_d_w, v_d_w)); + sse4x2_sse4_1(a, a_stride, b, b_stride, &sum); a += a_stride << 1; b += b_stride << 1; y += 2; @@ -66,12 +89,7 @@ int64_t aom_sse_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b, break; case 8: do { - const __m128i v_a0 = xx_loadl_64(a); - const __m128i v_b0 = xx_loadl_64(b); - const __m128i v_a_w = _mm_cvtepu8_epi16(v_a0); - const __m128i v_b_w = _mm_cvtepu8_epi16(v_b0); - const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); - sum = _mm_add_epi32(sum, _mm_madd_epi16(v_d_w, v_d_w)); + sse8_sse4_1(a, b, &sum); a += a_stride; b += b_stride; y += 1; @@ -125,12 +143,53 @@ int64_t aom_sse_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b, } while (y < height); sse = summary_all_sse4(&sum); break; - default: break; + default: + if (width & 0x07) { + do { + int i = 0; + do { + sse8_sse4_1(a + i, b + i, &sum); + sse8_sse4_1(a + i + a_stride, b + i + b_stride, &sum); + i += 8; + } while (i + 4 < width); + sse4x2_sse4_1(a + i, a_stride, b + i, b_stride, &sum); + a += (a_stride << 1); + b += (b_stride << 1); + y += 2; + } while (y < height); + } else { + do { + int i = 0; + do { + sse8_sse4_1(a + i, b + i, &sum); + i += 8; + } while (i < width); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + } + sse = summary_all_sse4(&sum); + break; } return sse; } +#if CONFIG_AV1_HIGHBITDEPTH +static INLINE void highbd_sse_w4x2_sse4_1(__m128i *sum, const uint16_t *a, + int a_stride, const uint16_t *b, + int b_stride) { + const __m128i v_a0 = xx_loadl_64(a); + const __m128i v_a1 = xx_loadl_64(a + a_stride); + const __m128i v_b0 = xx_loadl_64(b); + const __m128i v_b1 = xx_loadl_64(b + b_stride); + const __m128i v_a_w = _mm_unpacklo_epi64(v_a0, v_a1); + const __m128i v_b_w = _mm_unpacklo_epi64(v_b0, v_b1); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); +} + static INLINE void highbd_sse_w8_sse4_1(__m128i *sum, const uint16_t *a, const uint16_t *b) { const __m128i v_a_w = xx_loadu_128(a); @@ -150,14 +209,7 @@ int64_t aom_highbd_sse_sse4_1(const uint8_t *a8, int a_stride, switch (width) { case 4: do { 
- const __m128i v_a0 = xx_loadl_64(a); - const __m128i v_a1 = xx_loadl_64(a + a_stride); - const __m128i v_b0 = xx_loadl_64(b); - const __m128i v_b1 = xx_loadl_64(b + b_stride); - const __m128i v_a_w = _mm_unpacklo_epi64(v_a0, v_a1); - const __m128i v_b_w = _mm_unpacklo_epi64(v_b0, v_b1); - const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); - sum = _mm_add_epi32(sum, _mm_madd_epi16(v_d_w, v_d_w)); + highbd_sse_w4x2_sse4_1(&sum, a, a_stride, b, b_stride); a += a_stride << 1; b += b_stride << 1; y += 2; @@ -175,67 +227,127 @@ int64_t aom_highbd_sse_sse4_1(const uint8_t *a8, int a_stride, break; case 16: do { - highbd_sse_w8_sse4_1(&sum, a, b); - highbd_sse_w8_sse4_1(&sum, a + 8, b + 8); - a += a_stride; - b += b_stride; - y += 1; + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + highbd_sse_w8_sse4_1(&sum32, a, b); + highbd_sse_w8_sse4_1(&sum32, a + 8, b + 8); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 64 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 64; } while (y < height); - sse = summary_all_sse4(&sum); + xx_storel_64(&sse, _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); break; case 32: do { - highbd_sse_w8_sse4_1(&sum, a, b); - highbd_sse_w8_sse4_1(&sum, a + 8 * 1, b + 8 * 1); - highbd_sse_w8_sse4_1(&sum, a + 8 * 2, b + 8 * 2); - highbd_sse_w8_sse4_1(&sum, a + 8 * 3, b + 8 * 3); - a += a_stride; - b += b_stride; - y += 1; + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + highbd_sse_w8_sse4_1(&sum32, a, b); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 32 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 32; } while (y < height); - sse = summary_all_sse4(&sum); + xx_storel_64(&sse, _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); break; case 64: do { - highbd_sse_w8_sse4_1(&sum, a, b); - highbd_sse_w8_sse4_1(&sum, a + 8 * 1, b + 8 * 1); - highbd_sse_w8_sse4_1(&sum, a + 8 * 2, b + 8 * 2); - highbd_sse_w8_sse4_1(&sum, a + 8 * 3, b + 8 * 3); - highbd_sse_w8_sse4_1(&sum, a + 8 * 4, b + 8 * 4); - highbd_sse_w8_sse4_1(&sum, a + 8 * 5, b + 8 * 5); - highbd_sse_w8_sse4_1(&sum, a + 8 * 6, b + 8 * 6); - highbd_sse_w8_sse4_1(&sum, a + 8 * 7, b + 8 * 7); - a += a_stride; - b += b_stride; - y += 1; + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + highbd_sse_w8_sse4_1(&sum32, a, b); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 4, b + 8 * 4); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 5, b + 8 * 5); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 6, b + 8 * 6); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 7, b + 8 * 7); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 16 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 16; } while (y < height); - sse = summary_all_sse4(&sum); + xx_storel_64(&sse, _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); break; case 128: do { - highbd_sse_w8_sse4_1(&sum, a, b); - highbd_sse_w8_sse4_1(&sum, a + 8 * 1, b + 8 * 1); - highbd_sse_w8_sse4_1(&sum, a + 8 * 2, b + 8 * 2); - highbd_sse_w8_sse4_1(&sum, a + 8 * 3, b + 8 * 3); - highbd_sse_w8_sse4_1(&sum, a + 8 * 4, b + 8 * 4); - highbd_sse_w8_sse4_1(&sum, a + 8 * 5, b + 8 * 5); - highbd_sse_w8_sse4_1(&sum, a + 8 * 6, b + 8 * 6); - highbd_sse_w8_sse4_1(&sum, a + 8 * 7, b + 8 * 7); - highbd_sse_w8_sse4_1(&sum, a + 8 * 8, b + 8 * 8); 
- highbd_sse_w8_sse4_1(&sum, a + 8 * 9, b + 8 * 9); - highbd_sse_w8_sse4_1(&sum, a + 8 * 10, b + 8 * 10); - highbd_sse_w8_sse4_1(&sum, a + 8 * 11, b + 8 * 11); - highbd_sse_w8_sse4_1(&sum, a + 8 * 12, b + 8 * 12); - highbd_sse_w8_sse4_1(&sum, a + 8 * 13, b + 8 * 13); - highbd_sse_w8_sse4_1(&sum, a + 8 * 14, b + 8 * 14); - highbd_sse_w8_sse4_1(&sum, a + 8 * 15, b + 8 * 15); - a += a_stride; - b += b_stride; - y += 1; + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + highbd_sse_w8_sse4_1(&sum32, a, b); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 4, b + 8 * 4); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 5, b + 8 * 5); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 6, b + 8 * 6); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 7, b + 8 * 7); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 8, b + 8 * 8); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 9, b + 8 * 9); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 10, b + 8 * 10); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 11, b + 8 * 11); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 12, b + 8 * 12); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 13, b + 8 * 13); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 14, b + 8 * 14); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 15, b + 8 * 15); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 8 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 8; } while (y < height); - sse = summary_all_sse4(&sum); + xx_storel_64(&sse, _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); + break; + default: + if (width & 0x7) { + do { + __m128i sum32 = _mm_setzero_si128(); + int i = 0; + do { + highbd_sse_w8_sse4_1(&sum32, a + i, b + i); + highbd_sse_w8_sse4_1(&sum32, a + i + a_stride, b + i + b_stride); + i += 8; + } while (i + 4 < width); + highbd_sse_w4x2_sse4_1(&sum32, a + i, a_stride, b + i, b_stride); + a += (a_stride << 1); + b += (b_stride << 1); + y += 2; + summary_32_sse4(&sum32, &sum); + } while (y < height); + } else { + do { + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + int i = 0; + do { + highbd_sse_w8_sse4_1(&sum32, a + i, b + i); + i += 8; + } while (i < width); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 8 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 8; + } while (y < height); + } + xx_storel_64(&sse, _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); break; - default: break; } return sse; } +#endif // CONFIG_AV1_HIGHBITDEPTH diff --git a/media/libaom/src/aom_dsp/x86/ssim_opt_x86_64.asm b/media/libaom/src/aom_dsp/x86/ssim_sse2_x86_64.asm index 6d9b5a12f1..6d9b5a12f1 100644 --- a/media/libaom/src/aom_dsp/x86/ssim_opt_x86_64.asm +++ b/media/libaom/src/aom_dsp/x86/ssim_sse2_x86_64.asm diff --git a/media/libaom/src/aom_dsp/x86/subpel_variance_sse2.asm b/media/libaom/src/aom_dsp/x86/subpel_variance_sse2.asm index 45bf6ec3c5..cbf28901be 100644 --- a/media/libaom/src/aom_dsp/x86/subpel_variance_sse2.asm +++ b/media/libaom/src/aom_dsp/x86/subpel_variance_sse2.asm @@ -135,44 +135,33 @@ SECTION .text %if %2 == 1 ; avg cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ x_offset, y_offset, dst, dst_stride, \ - sec, sec_stride, height, sse, \ - g_bilin_filter, g_pw_8 + sec, sec_stride, height, sse %define block_height dword heightm %define sec_str sec_stridemp - - ;Store bilin_filter and pw_8 location in stack - %if GET_GOT_DEFINED == 1 - GET_GOT eax - add esp, 4 ; restore esp - %endif - - lea ecx, [GLOBAL(bilin_filter_m)] - mov g_bilin_filterm, ecx - - lea 
ecx, [GLOBAL(pw_8)] - mov g_pw_8m, ecx - - LOAD_IF_USED 0, 1 ; load eax, ecx back %else cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ x_offset, y_offset, dst, dst_stride, \ - height, sse, g_bilin_filter, g_pw_8 + height, sse %define block_height heightd + %endif + + ; reuse argument stack space + %define g_bilin_filterm x_offsetm + %define g_pw_8m y_offsetm - ;Store bilin_filter and pw_8 location in stack - %if GET_GOT_DEFINED == 1 - GET_GOT eax - add esp, 4 ; restore esp - %endif + ;Store bilin_filter and pw_8 location in stack + %if GET_GOT_DEFINED == 1 + GET_GOT eax + add esp, 4 ; restore esp + %endif - lea ecx, [GLOBAL(bilin_filter_m)] - mov g_bilin_filterm, ecx + lea ecx, [GLOBAL(bilin_filter_m)] + mov g_bilin_filterm, ecx - lea ecx, [GLOBAL(pw_8)] - mov g_pw_8m, ecx + lea ecx, [GLOBAL(pw_8)] + mov g_pw_8m, ecx - LOAD_IF_USED 0, 1 ; load eax, ecx back - %endif + LOAD_IF_USED 0, 1 ; load eax, ecx back %else %if %2 == 1 ; avg cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ diff --git a/media/libaom/src/aom_dsp/x86/subtract_avx2.c b/media/libaom/src/aom_dsp/x86/subtract_avx2.c index 4389d123db..40831600a6 100644 --- a/media/libaom/src/aom_dsp/x86/subtract_avx2.c +++ b/media/libaom/src/aom_dsp/x86/subtract_avx2.c @@ -26,7 +26,7 @@ static INLINE void subtract32_avx2(int16_t *diff_ptr, const uint8_t *src_ptr, _mm256_store_si256((__m256i *)(diff_ptr + 16), d_1); } -static INLINE void aom_subtract_block_16xn_avx2( +static INLINE void subtract_block_16xn_avx2( int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { for (int32_t j = 0; j < rows; ++j) { @@ -42,7 +42,7 @@ static INLINE void aom_subtract_block_16xn_avx2( } } -static INLINE void aom_subtract_block_32xn_avx2( +static INLINE void subtract_block_32xn_avx2( int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { for (int32_t j = 0; j < rows; ++j) { @@ -53,7 +53,7 @@ static INLINE void aom_subtract_block_32xn_avx2( } } -static INLINE void aom_subtract_block_64xn_avx2( +static INLINE void subtract_block_64xn_avx2( int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { for (int32_t j = 0; j < rows; ++j) { @@ -65,7 +65,7 @@ static INLINE void aom_subtract_block_64xn_avx2( } } -static INLINE void aom_subtract_block_128xn_avx2( +static INLINE void subtract_block_128xn_avx2( int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { for (int32_t j = 0; j < rows; ++j) { @@ -85,20 +85,20 @@ void aom_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t pred_stride) { switch (cols) { case 16: - aom_subtract_block_16xn_avx2(rows, diff_ptr, diff_stride, src_ptr, - src_stride, pred_ptr, pred_stride); + subtract_block_16xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride, + pred_ptr, pred_stride); break; case 32: - aom_subtract_block_32xn_avx2(rows, diff_ptr, diff_stride, src_ptr, - src_stride, pred_ptr, pred_stride); + subtract_block_32xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride, + pred_ptr, pred_stride); break; case 64: - aom_subtract_block_64xn_avx2(rows, diff_ptr, diff_stride, src_ptr, - src_stride, pred_ptr, pred_stride); + subtract_block_64xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride, + pred_ptr, 
pred_stride); break; case 128: - aom_subtract_block_128xn_avx2(rows, diff_ptr, diff_stride, src_ptr, - src_stride, pred_ptr, pred_stride); + subtract_block_128xn_avx2(rows, diff_ptr, diff_stride, src_ptr, + src_stride, pred_ptr, pred_stride); break; default: aom_subtract_block_sse2(rows, cols, diff_ptr, diff_stride, src_ptr, diff --git a/media/libaom/src/aom_dsp/x86/sum_squares_avx2.c b/media/libaom/src/aom_dsp/x86/sum_squares_avx2.c index 0af44e3a4c..97d78b6842 100644 --- a/media/libaom/src/aom_dsp/x86/sum_squares_avx2.c +++ b/media/libaom/src/aom_dsp/x86/sum_squares_avx2.c @@ -77,3 +77,172 @@ uint64_t aom_sum_squares_2d_i16_avx2(const int16_t *src, int stride, int width, return aom_sum_squares_2d_i16_c(src, stride, width, height); } } + +// Accumulate sum of 16-bit elements in the vector +static AOM_INLINE int32_t mm256_accumulate_epi16(__m256i vec_a) { + __m128i vtmp1 = _mm256_extracti128_si256(vec_a, 1); + __m128i vtmp2 = _mm256_castsi256_si128(vec_a); + vtmp1 = _mm_add_epi16(vtmp1, vtmp2); + vtmp2 = _mm_srli_si128(vtmp1, 8); + vtmp1 = _mm_add_epi16(vtmp1, vtmp2); + vtmp2 = _mm_srli_si128(vtmp1, 4); + vtmp1 = _mm_add_epi16(vtmp1, vtmp2); + vtmp2 = _mm_srli_si128(vtmp1, 2); + vtmp1 = _mm_add_epi16(vtmp1, vtmp2); + return _mm_extract_epi16(vtmp1, 0); +} + +// Accumulate sum of 32-bit elements in the vector +static AOM_INLINE int32_t mm256_accumulate_epi32(__m256i vec_a) { + __m128i vtmp1 = _mm256_extracti128_si256(vec_a, 1); + __m128i vtmp2 = _mm256_castsi256_si128(vec_a); + vtmp1 = _mm_add_epi32(vtmp1, vtmp2); + vtmp2 = _mm_srli_si128(vtmp1, 8); + vtmp1 = _mm_add_epi32(vtmp1, vtmp2); + vtmp2 = _mm_srli_si128(vtmp1, 4); + vtmp1 = _mm_add_epi32(vtmp1, vtmp2); + return _mm_cvtsi128_si32(vtmp1); +} + +uint64_t aom_var_2d_u8_avx2(uint8_t *src, int src_stride, int width, + int height) { + uint8_t *srcp; + uint64_t s = 0, ss = 0; + __m256i vzero = _mm256_setzero_si256(); + __m256i v_acc_sum = vzero; + __m256i v_acc_sqs = vzero; + int i, j; + + // Process 32 elements in a row + for (i = 0; i < width - 31; i += 32) { + srcp = src + i; + // Process 8 columns at a time + for (j = 0; j < height - 7; j += 8) { + __m256i vsrc[8]; + for (int k = 0; k < 8; k++) { + vsrc[k] = _mm256_loadu_si256((__m256i *)srcp); + srcp += src_stride; + } + for (int k = 0; k < 8; k++) { + __m256i vsrc0 = _mm256_unpacklo_epi8(vsrc[k], vzero); + __m256i vsrc1 = _mm256_unpackhi_epi8(vsrc[k], vzero); + v_acc_sum = _mm256_add_epi16(v_acc_sum, vsrc0); + v_acc_sum = _mm256_add_epi16(v_acc_sum, vsrc1); + + __m256i vsqs0 = _mm256_madd_epi16(vsrc0, vsrc0); + __m256i vsqs1 = _mm256_madd_epi16(vsrc1, vsrc1); + v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs0); + v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs1); + } + + // Update total sum and clear the vectors + s += mm256_accumulate_epi16(v_acc_sum); + ss += mm256_accumulate_epi32(v_acc_sqs); + v_acc_sum = vzero; + v_acc_sqs = vzero; + } + + // Process remaining rows (height not a multiple of 8) + for (; j < height; j++) { + __m256i vsrc = _mm256_loadu_si256((__m256i *)srcp); + __m256i vsrc0 = _mm256_unpacklo_epi8(vsrc, vzero); + __m256i vsrc1 = _mm256_unpackhi_epi8(vsrc, vzero); + v_acc_sum = _mm256_add_epi16(v_acc_sum, vsrc0); + v_acc_sum = _mm256_add_epi16(v_acc_sum, vsrc1); + + __m256i vsqs0 = _mm256_madd_epi16(vsrc0, vsrc0); + __m256i vsqs1 = _mm256_madd_epi16(vsrc1, vsrc1); + v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs0); + v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs1); + + srcp += src_stride; + } + + // Update total sum and clear the vectors + s += 
mm256_accumulate_epi16(v_acc_sum); + ss += mm256_accumulate_epi32(v_acc_sqs); + v_acc_sum = vzero; + v_acc_sqs = vzero; + } + + // Process the remaining area using C + srcp = src; + for (int k = 0; k < height; k++) { + for (int m = i; m < width; m++) { + uint8_t val = srcp[m]; + s += val; + ss += val * val; + } + srcp += src_stride; + } + return (ss - s * s / (width * height)); +} + +uint64_t aom_var_2d_u16_avx2(uint8_t *src, int src_stride, int width, + int height) { + uint16_t *srcp1 = CONVERT_TO_SHORTPTR(src), *srcp; + uint64_t s = 0, ss = 0; + __m256i vzero = _mm256_setzero_si256(); + __m256i v_acc_sum = vzero; + __m256i v_acc_sqs = vzero; + int i, j; + + // Process 16 elements in a row + for (i = 0; i < width - 15; i += 16) { + srcp = srcp1 + i; + // Process 8 columns at a time + for (j = 0; j < height - 8; j += 8) { + __m256i vsrc[8]; + for (int k = 0; k < 8; k++) { + vsrc[k] = _mm256_loadu_si256((__m256i *)srcp); + srcp += src_stride; + } + for (int k = 0; k < 8; k++) { + __m256i vsrc0 = _mm256_unpacklo_epi16(vsrc[k], vzero); + __m256i vsrc1 = _mm256_unpackhi_epi16(vsrc[k], vzero); + v_acc_sum = _mm256_add_epi32(vsrc0, v_acc_sum); + v_acc_sum = _mm256_add_epi32(vsrc1, v_acc_sum); + + __m256i vsqs0 = _mm256_madd_epi16(vsrc[k], vsrc[k]); + v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs0); + } + + // Update total sum and clear the vectors + s += mm256_accumulate_epi32(v_acc_sum); + ss += mm256_accumulate_epi32(v_acc_sqs); + v_acc_sum = vzero; + v_acc_sqs = vzero; + } + + // Process remaining rows (height not a multiple of 8) + for (; j < height; j++) { + __m256i vsrc = _mm256_loadu_si256((__m256i *)srcp); + __m256i vsrc0 = _mm256_unpacklo_epi16(vsrc, vzero); + __m256i vsrc1 = _mm256_unpackhi_epi16(vsrc, vzero); + v_acc_sum = _mm256_add_epi32(vsrc0, v_acc_sum); + v_acc_sum = _mm256_add_epi32(vsrc1, v_acc_sum); + + __m256i vsqs0 = _mm256_madd_epi16(vsrc, vsrc); + v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs0); + srcp += src_stride; + } + + // Update total sum and clear the vectors + s += mm256_accumulate_epi32(v_acc_sum); + ss += mm256_accumulate_epi32(v_acc_sqs); + v_acc_sum = vzero; + v_acc_sqs = vzero; + } + + // Process the remaining area using C + srcp = srcp1; + for (int k = 0; k < height; k++) { + for (int m = i; m < width; m++) { + uint16_t val = srcp[m]; + s += val; + ss += val * val; + } + srcp += src_stride; + } + return (ss - s * s / (width * height)); +} diff --git a/media/libaom/src/aom_dsp/x86/sum_squares_sse2.c b/media/libaom/src/aom_dsp/x86/sum_squares_sse2.c index 22d7739ec4..85b301a88e 100644 --- a/media/libaom/src/aom_dsp/x86/sum_squares_sse2.c +++ b/media/libaom/src/aom_dsp/x86/sum_squares_sse2.c @@ -201,3 +201,166 @@ uint64_t aom_sum_squares_i16_sse2(const int16_t *src, uint32_t n) { return aom_sum_squares_i16_c(src, n); } } + +// Accumulate sum of 16-bit elements in the vector +static AOM_INLINE int32_t mm_accumulate_epi16(__m128i vec_a) { + __m128i vtmp = _mm_srli_si128(vec_a, 8); + vec_a = _mm_add_epi16(vec_a, vtmp); + vtmp = _mm_srli_si128(vec_a, 4); + vec_a = _mm_add_epi16(vec_a, vtmp); + vtmp = _mm_srli_si128(vec_a, 2); + vec_a = _mm_add_epi16(vec_a, vtmp); + return _mm_extract_epi16(vec_a, 0); +} + +// Accumulate sum of 32-bit elements in the vector +static AOM_INLINE int32_t mm_accumulate_epi32(__m128i vec_a) { + __m128i vtmp = _mm_srli_si128(vec_a, 8); + vec_a = _mm_add_epi32(vec_a, vtmp); + vtmp = _mm_srli_si128(vec_a, 4); + vec_a = _mm_add_epi32(vec_a, vtmp); + return _mm_cvtsi128_si32(vec_a); +} + +uint64_t aom_var_2d_u8_sse2(uint8_t *src, int src_stride, 
int width, + int height) { + uint8_t *srcp; + uint64_t s = 0, ss = 0; + __m128i vzero = _mm_setzero_si128(); + __m128i v_acc_sum = vzero; + __m128i v_acc_sqs = vzero; + int i, j; + + // Process 16 elements in a row + for (i = 0; i < width - 15; i += 16) { + srcp = src + i; + // Process 8 columns at a time + for (j = 0; j < height - 7; j += 8) { + __m128i vsrc[8]; + for (int k = 0; k < 8; k++) { + vsrc[k] = _mm_loadu_si128((__m128i *)srcp); + srcp += src_stride; + } + for (int k = 0; k < 8; k++) { + __m128i vsrc0 = _mm_unpacklo_epi8(vsrc[k], vzero); + __m128i vsrc1 = _mm_unpackhi_epi8(vsrc[k], vzero); + v_acc_sum = _mm_add_epi16(v_acc_sum, vsrc0); + v_acc_sum = _mm_add_epi16(v_acc_sum, vsrc1); + + __m128i vsqs0 = _mm_madd_epi16(vsrc0, vsrc0); + __m128i vsqs1 = _mm_madd_epi16(vsrc1, vsrc1); + v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs0); + v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs1); + } + + // Update total sum and clear the vectors + s += mm_accumulate_epi16(v_acc_sum); + ss += mm_accumulate_epi32(v_acc_sqs); + v_acc_sum = vzero; + v_acc_sqs = vzero; + } + + // Process remaining rows (height not a multiple of 8) + for (; j < height; j++) { + __m128i vsrc = _mm_loadu_si128((__m128i *)srcp); + __m128i vsrc0 = _mm_unpacklo_epi8(vsrc, vzero); + __m128i vsrc1 = _mm_unpackhi_epi8(vsrc, vzero); + v_acc_sum = _mm_add_epi16(v_acc_sum, vsrc0); + v_acc_sum = _mm_add_epi16(v_acc_sum, vsrc1); + + __m128i vsqs0 = _mm_madd_epi16(vsrc0, vsrc0); + __m128i vsqs1 = _mm_madd_epi16(vsrc1, vsrc1); + v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs0); + v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs1); + + srcp += src_stride; + } + + // Update total sum and clear the vectors + s += mm_accumulate_epi16(v_acc_sum); + ss += mm_accumulate_epi32(v_acc_sqs); + v_acc_sum = vzero; + v_acc_sqs = vzero; + } + + // Process the remaining area using C + srcp = src; + for (int k = 0; k < height; k++) { + for (int m = i; m < width; m++) { + uint8_t val = srcp[m]; + s += val; + ss += val * val; + } + srcp += src_stride; + } + return (ss - s * s / (width * height)); +} + +uint64_t aom_var_2d_u16_sse2(uint8_t *src, int src_stride, int width, + int height) { + uint16_t *srcp1 = CONVERT_TO_SHORTPTR(src), *srcp; + uint64_t s = 0, ss = 0; + __m128i vzero = _mm_setzero_si128(); + __m128i v_acc_sum = vzero; + __m128i v_acc_sqs = vzero; + int i, j; + + // Process 8 elements in a row + for (i = 0; i < width - 8; i += 8) { + srcp = srcp1 + i; + // Process 8 columns at a time + for (j = 0; j < height - 8; j += 8) { + __m128i vsrc[8]; + for (int k = 0; k < 8; k++) { + vsrc[k] = _mm_loadu_si128((__m128i *)srcp); + srcp += src_stride; + } + for (int k = 0; k < 8; k++) { + __m128i vsrc0 = _mm_unpacklo_epi16(vsrc[k], vzero); + __m128i vsrc1 = _mm_unpackhi_epi16(vsrc[k], vzero); + v_acc_sum = _mm_add_epi32(vsrc0, v_acc_sum); + v_acc_sum = _mm_add_epi32(vsrc1, v_acc_sum); + + __m128i vsqs0 = _mm_madd_epi16(vsrc[k], vsrc[k]); + v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs0); + } + + // Update total sum and clear the vectors + s += mm_accumulate_epi32(v_acc_sum); + ss += mm_accumulate_epi32(v_acc_sqs); + v_acc_sum = vzero; + v_acc_sqs = vzero; + } + + // Process remaining rows (height not a multiple of 8) + for (; j < height; j++) { + __m128i vsrc = _mm_loadu_si128((__m128i *)srcp); + __m128i vsrc0 = _mm_unpacklo_epi16(vsrc, vzero); + __m128i vsrc1 = _mm_unpackhi_epi16(vsrc, vzero); + v_acc_sum = _mm_add_epi32(vsrc0, v_acc_sum); + v_acc_sum = _mm_add_epi32(vsrc1, v_acc_sum); + + __m128i vsqs0 = _mm_madd_epi16(vsrc, vsrc); + v_acc_sqs = _mm_add_epi32(v_acc_sqs, 
vsqs0); + srcp += src_stride; + } + + // Update total sum and clear the vectors + s += mm_accumulate_epi32(v_acc_sum); + ss += mm_accumulate_epi32(v_acc_sqs); + v_acc_sum = vzero; + v_acc_sqs = vzero; + } + + // Process the remaining area using C + srcp = srcp1; + for (int k = 0; k < height; k++) { + for (int m = i; m < width; m++) { + uint16_t val = srcp[m]; + s += val; + ss += val * val; + } + srcp += src_stride; + } + return (ss - s * s / (width * height)); +} diff --git a/media/libaom/src/aom_dsp/x86/synonyms.h b/media/libaom/src/aom_dsp/x86/synonyms.h index 1e9f1e27b8..2e99bee3e9 100644 --- a/media/libaom/src/aom_dsp/x86/synonyms.h +++ b/media/libaom/src/aom_dsp/x86/synonyms.h @@ -13,6 +13,7 @@ #define AOM_AOM_DSP_X86_SYNONYMS_H_ #include <immintrin.h> +#include <string.h> #include "config/aom_config.h" @@ -28,7 +29,9 @@ // Loads and stores to do away with the tedium of casting the address // to the right type. static INLINE __m128i xx_loadl_32(const void *a) { - return _mm_cvtsi32_si128(*(const uint32_t *)a); + int val; + memcpy(&val, a, sizeof(val)); + return _mm_cvtsi32_si128(val); } static INLINE __m128i xx_loadl_64(const void *a) { @@ -44,7 +47,8 @@ static INLINE __m128i xx_loadu_128(const void *a) { } static INLINE void xx_storel_32(void *const a, const __m128i v) { - *(uint32_t *)a = _mm_cvtsi128_si32(v); + const int val = _mm_cvtsi128_si32(v); + memcpy(a, &val, sizeof(val)); } static INLINE void xx_storel_64(void *const a, const __m128i v) { diff --git a/media/libaom/src/aom_dsp/x86/synonyms_avx2.h b/media/libaom/src/aom_dsp/x86/synonyms_avx2.h index 3f69b120ea..4d6ee6ad64 100644 --- a/media/libaom/src/aom_dsp/x86/synonyms_avx2.h +++ b/media/libaom/src/aom_dsp/x86/synonyms_avx2.h @@ -67,6 +67,11 @@ static INLINE __m256i yy_loadu2_128(const void *hi, const void *lo) { return yy_set_m128i(mhi, mlo); } +static INLINE void yy_storeu2_128(void *hi, void *lo, const __m256i a) { + _mm_storeu_si128((__m128i *)hi, _mm256_extracti128_si256(a, 1)); + _mm_storeu_si128((__m128i *)lo, _mm256_castsi256_si128(a)); +} + static INLINE __m256i yy_roundn_epu16(__m256i v_val_w, int bits) { const __m256i v_s_w = _mm256_srli_epi16(v_val_w, bits - 1); return _mm256_avg_epu16(v_s_w, _mm256_setzero_si256()); diff --git a/media/libaom/src/aom_dsp/x86/transpose_sse2.h b/media/libaom/src/aom_dsp/x86/transpose_sse2.h index d0d1ee6845..7ac692c78b 100644 --- a/media/libaom/src/aom_dsp/x86/transpose_sse2.h +++ b/media/libaom/src/aom_dsp/x86/transpose_sse2.h @@ -17,7 +17,7 @@ #include "config/aom_config.h" static INLINE __m128i transpose_8bit_4x4(const __m128i *const in) { - // Unpack 16 bit elements. Goes from: + // Unpack 8 bit elements. 
Goes from: // in[0]: 00 01 02 03 // in[1]: 10 11 12 13 // in[2]: 20 21 22 23 @@ -28,7 +28,7 @@ static INLINE __m128i transpose_8bit_4x4(const __m128i *const in) { const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]); const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]); - // Unpack 32 bit elements resulting in: + // Unpack 16 bit elements resulting in: // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 return _mm_unpacklo_epi16(a0, a1); } diff --git a/media/libaom/src/aom_dsp/x86/txfm_common_avx2.h b/media/libaom/src/aom_dsp/x86/txfm_common_avx2.h index b1611ba870..ea57c9f35e 100644 --- a/media/libaom/src/aom_dsp/x86/txfm_common_avx2.h +++ b/media/libaom/src/aom_dsp/x86/txfm_common_avx2.h @@ -20,9 +20,6 @@ extern "C" { #endif -typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output, - int8_t cos_bit); - static INLINE __m256i pair_set_w16_epi16(int16_t a, int16_t b) { return _mm256_set1_epi32( (int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16))); @@ -117,58 +114,115 @@ static INLINE void load_buffer_32bit_to_16bit_w16_avx2(const int32_t *in, } } -static INLINE void transpose_16bit_16x16_avx2(const __m256i *const in, - __m256i *const out) { - // Unpack 16 bit elements. Goes from: - // in[0]: 00 01 02 03 08 09 0a 0b 04 05 06 07 0c 0d 0e 0f - // in[1]: 10 11 12 13 18 19 1a 1b 14 15 16 17 1c 1d 1e 1f - // in[2]: 20 21 22 23 28 29 2a 2b 24 25 26 27 2c 2d 2e 2f - // in[3]: 30 31 32 33 38 39 3a 3b 34 35 36 37 3c 3d 3e 3f - // in[4]: 40 41 42 43 48 49 4a 4b 44 45 46 47 4c 4d 4e 4f - // in[5]: 50 51 52 53 58 59 5a 5b 54 55 56 57 5c 5d 5e 5f - // in[6]: 60 61 62 63 68 69 6a 6b 64 65 66 67 6c 6d 6e 6f - // in[7]: 70 71 72 73 78 79 7a 7b 74 75 76 77 7c 7d 7e 7f - // in[8]: 80 81 82 83 88 89 8a 8b 84 85 86 87 8c 8d 8e 8f - // to: - // a0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 - // a1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 - // a2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 - // a3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 - // ... 
- __m256i a[16]; - for (int i = 0; i < 16; i += 2) { - a[i / 2 + 0] = _mm256_unpacklo_epi16(in[i], in[i + 1]); - a[i / 2 + 8] = _mm256_unpackhi_epi16(in[i], in[i + 1]); +static INLINE void transpose2_8x8_avx2(const __m256i *const in, + __m256i *const out) { + __m256i t[16], u[16]; + // (1st, 2nd) ==> (lo, hi) + // (0, 1) ==> (0, 1) + // (2, 3) ==> (2, 3) + // (4, 5) ==> (4, 5) + // (6, 7) ==> (6, 7) + for (int i = 0; i < 4; i++) { + t[2 * i] = _mm256_unpacklo_epi16(in[2 * i], in[2 * i + 1]); + t[2 * i + 1] = _mm256_unpackhi_epi16(in[2 * i], in[2 * i + 1]); } - __m256i b[16]; - for (int i = 0; i < 16; i += 2) { - b[i / 2 + 0] = _mm256_unpacklo_epi32(a[i], a[i + 1]); - b[i / 2 + 8] = _mm256_unpackhi_epi32(a[i], a[i + 1]); + + // (1st, 2nd) ==> (lo, hi) + // (0, 2) ==> (0, 2) + // (1, 3) ==> (1, 3) + // (4, 6) ==> (4, 6) + // (5, 7) ==> (5, 7) + for (int i = 0; i < 2; i++) { + u[i] = _mm256_unpacklo_epi32(t[i], t[i + 2]); + u[i + 2] = _mm256_unpackhi_epi32(t[i], t[i + 2]); + + u[i + 4] = _mm256_unpacklo_epi32(t[i + 4], t[i + 6]); + u[i + 6] = _mm256_unpackhi_epi32(t[i + 4], t[i + 6]); } - __m256i c[16]; - for (int i = 0; i < 16; i += 2) { - c[i / 2 + 0] = _mm256_unpacklo_epi64(b[i], b[i + 1]); - c[i / 2 + 8] = _mm256_unpackhi_epi64(b[i], b[i + 1]); + + // (1st, 2nd) ==> (lo, hi) + // (0, 4) ==> (0, 1) + // (1, 5) ==> (4, 5) + // (2, 6) ==> (2, 3) + // (3, 7) ==> (6, 7) + for (int i = 0; i < 2; i++) { + out[2 * i] = _mm256_unpacklo_epi64(u[2 * i], u[2 * i + 4]); + out[2 * i + 1] = _mm256_unpackhi_epi64(u[2 * i], u[2 * i + 4]); + + out[2 * i + 4] = _mm256_unpacklo_epi64(u[2 * i + 1], u[2 * i + 5]); + out[2 * i + 5] = _mm256_unpackhi_epi64(u[2 * i + 1], u[2 * i + 5]); } - out[0 + 0] = _mm256_permute2x128_si256(c[0], c[1], 0x20); - out[1 + 0] = _mm256_permute2x128_si256(c[8], c[9], 0x20); - out[2 + 0] = _mm256_permute2x128_si256(c[4], c[5], 0x20); - out[3 + 0] = _mm256_permute2x128_si256(c[12], c[13], 0x20); +} + +static INLINE void transpose_16bit_16x16_avx2(const __m256i *const in, + __m256i *const out) { + __m256i t[16]; - out[0 + 8] = _mm256_permute2x128_si256(c[0], c[1], 0x31); - out[1 + 8] = _mm256_permute2x128_si256(c[8], c[9], 0x31); - out[2 + 8] = _mm256_permute2x128_si256(c[4], c[5], 0x31); - out[3 + 8] = _mm256_permute2x128_si256(c[12], c[13], 0x31); +#define LOADL(idx) \ + t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \ + t[idx] = _mm256_inserti128_si256( \ + t[idx], _mm_load_si128((__m128i const *)&in[idx + 8]), 1); - out[4 + 0] = _mm256_permute2x128_si256(c[0 + 2], c[1 + 2], 0x20); - out[5 + 0] = _mm256_permute2x128_si256(c[8 + 2], c[9 + 2], 0x20); - out[6 + 0] = _mm256_permute2x128_si256(c[4 + 2], c[5 + 2], 0x20); - out[7 + 0] = _mm256_permute2x128_si256(c[12 + 2], c[13 + 2], 0x20); +#define LOADR(idx) \ + t[8 + idx] = \ + _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \ + t[8 + idx] = _mm256_inserti128_si256( \ + t[8 + idx], _mm_load_si128((__m128i const *)&in[idx + 8] + 1), 1); - out[4 + 8] = _mm256_permute2x128_si256(c[0 + 2], c[1 + 2], 0x31); - out[5 + 8] = _mm256_permute2x128_si256(c[8 + 2], c[9 + 2], 0x31); - out[6 + 8] = _mm256_permute2x128_si256(c[4 + 2], c[5 + 2], 0x31); - out[7 + 8] = _mm256_permute2x128_si256(c[12 + 2], c[13 + 2], 0x31); + // load left 8x16 + LOADL(0) + LOADL(1) + LOADL(2) + LOADL(3) + LOADL(4) + LOADL(5) + LOADL(6) + LOADL(7) + + // load right 8x16 + LOADR(0) + LOADR(1) + LOADR(2) + LOADR(3) + LOADR(4) + LOADR(5) + LOADR(6) + LOADR(7) + + // get the top 16x8 result + transpose2_8x8_avx2(t, out); + // 
get the bottom 16x8 result + transpose2_8x8_avx2(&t[8], &out[8]); +} + +static INLINE void transpose_16bit_16x8_avx2(const __m256i *const in, + __m256i *const out) { + const __m256i a0 = _mm256_unpacklo_epi16(in[0], in[1]); + const __m256i a1 = _mm256_unpacklo_epi16(in[2], in[3]); + const __m256i a2 = _mm256_unpacklo_epi16(in[4], in[5]); + const __m256i a3 = _mm256_unpacklo_epi16(in[6], in[7]); + const __m256i a4 = _mm256_unpackhi_epi16(in[0], in[1]); + const __m256i a5 = _mm256_unpackhi_epi16(in[2], in[3]); + const __m256i a6 = _mm256_unpackhi_epi16(in[4], in[5]); + const __m256i a7 = _mm256_unpackhi_epi16(in[6], in[7]); + + const __m256i b0 = _mm256_unpacklo_epi32(a0, a1); + const __m256i b1 = _mm256_unpacklo_epi32(a2, a3); + const __m256i b2 = _mm256_unpacklo_epi32(a4, a5); + const __m256i b3 = _mm256_unpacklo_epi32(a6, a7); + const __m256i b4 = _mm256_unpackhi_epi32(a0, a1); + const __m256i b5 = _mm256_unpackhi_epi32(a2, a3); + const __m256i b6 = _mm256_unpackhi_epi32(a4, a5); + const __m256i b7 = _mm256_unpackhi_epi32(a6, a7); + + out[0] = _mm256_unpacklo_epi64(b0, b1); + out[1] = _mm256_unpackhi_epi64(b0, b1); + out[2] = _mm256_unpacklo_epi64(b4, b5); + out[3] = _mm256_unpackhi_epi64(b4, b5); + out[4] = _mm256_unpacklo_epi64(b2, b3); + out[5] = _mm256_unpackhi_epi64(b2, b3); + out[6] = _mm256_unpacklo_epi64(b6, b7); + out[7] = _mm256_unpackhi_epi64(b6, b7); } static INLINE void flip_buf_avx2(__m256i *in, __m256i *out, int size) { @@ -192,6 +246,113 @@ static INLINE void round_shift_16bit_w16_avx2(__m256i *in, int size, int bit) { } } +static INLINE __m256i av1_round_shift_32_avx2(__m256i vec, int bit) { + __m256i tmp, round; + round = _mm256_set1_epi32(1 << (bit - 1)); + tmp = _mm256_add_epi32(vec, round); + return _mm256_srai_epi32(tmp, bit); +} + +static INLINE void av1_round_shift_array_32_avx2(__m256i *input, + __m256i *output, + const int size, + const int bit) { + if (bit > 0) { + int i; + for (i = 0; i < size; i++) { + output[i] = av1_round_shift_32_avx2(input[i], bit); + } + } else { + int i; + for (i = 0; i < size; i++) { + output[i] = _mm256_slli_epi32(input[i], -bit); + } + } +} + +static INLINE void av1_round_shift_rect_array_32_avx2(__m256i *input, + __m256i *output, + const int size, + const int bit, + const int val) { + const __m256i sqrt2 = _mm256_set1_epi32(val); + if (bit > 0) { + int i; + for (i = 0; i < size; i++) { + const __m256i r0 = av1_round_shift_32_avx2(input[i], bit); + const __m256i r1 = _mm256_mullo_epi32(sqrt2, r0); + output[i] = av1_round_shift_32_avx2(r1, NewSqrt2Bits); + } + } else { + int i; + for (i = 0; i < size; i++) { + const __m256i r0 = _mm256_slli_epi32(input[i], -bit); + const __m256i r1 = _mm256_mullo_epi32(sqrt2, r0); + output[i] = av1_round_shift_32_avx2(r1, NewSqrt2Bits); + } + } +} + +static INLINE __m256i scale_round_avx2(const __m256i a, const int scale) { + const __m256i scale_rounding = + pair_set_w16_epi16(scale, 1 << (NewSqrt2Bits - 1)); + const __m256i b = _mm256_madd_epi16(a, scale_rounding); + return _mm256_srai_epi32(b, NewSqrt2Bits); +} + +static INLINE void store_rect_16bit_to_32bit_w8_avx2(const __m256i a, + int32_t *const b) { + const __m256i one = _mm256_set1_epi16(1); + const __m256i a_lo = _mm256_unpacklo_epi16(a, one); + const __m256i a_hi = _mm256_unpackhi_epi16(a, one); + const __m256i b_lo = scale_round_avx2(a_lo, NewSqrt2); + const __m256i b_hi = scale_round_avx2(a_hi, NewSqrt2); + const __m256i temp = _mm256_permute2f128_si256(b_lo, b_hi, 0x31); + _mm_store_si128((__m128i *)b, _mm256_castsi256_si128(b_lo)); + 
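/* scale_round_avx2() multiplies each 16-bit coefficient by NewSqrt2 and
 * rounds/shifts by NewSqrt2Bits -- a fixed-point sqrt(2) scaling for
 * rectangular transforms (as far as I know those constants are 5793 and 12
 * elsewhere in libaom, so x * 5793 >> 12 is roughly x * 1.4142). The two
 * 128-bit stores here write coefficients 0..7 at b, and the 256-bit store
 * of temp places coefficients 8..15 at b + 64, i.e. the next row of the
 * 64-element-stride output buffer this helper assumes. */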
_mm_store_si128((__m128i *)(b + 4), _mm256_castsi256_si128(b_hi)); + _mm256_store_si256((__m256i *)(b + 64), temp); +} + +static INLINE void store_rect_buffer_16bit_to_32bit_w8_avx2( + const __m256i *const in, int32_t *const out, const int stride, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + store_rect_16bit_to_32bit_w8_avx2(in[i], out + i * stride); + } +} + +static INLINE void pack_reg(const __m128i *in1, const __m128i *in2, + __m256i *out) { + out[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[0]), in2[0], 0x1); + out[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[1]), in2[1], 0x1); + out[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[2]), in2[2], 0x1); + out[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[3]), in2[3], 0x1); + out[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[4]), in2[4], 0x1); + out[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[5]), in2[5], 0x1); + out[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[6]), in2[6], 0x1); + out[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[7]), in2[7], 0x1); +} + +static INLINE void extract_reg(const __m256i *in, __m128i *out1) { + out1[0] = _mm256_castsi256_si128(in[0]); + out1[1] = _mm256_castsi256_si128(in[1]); + out1[2] = _mm256_castsi256_si128(in[2]); + out1[3] = _mm256_castsi256_si128(in[3]); + out1[4] = _mm256_castsi256_si128(in[4]); + out1[5] = _mm256_castsi256_si128(in[5]); + out1[6] = _mm256_castsi256_si128(in[6]); + out1[7] = _mm256_castsi256_si128(in[7]); + + out1[8] = _mm256_extracti128_si256(in[0], 0x01); + out1[9] = _mm256_extracti128_si256(in[1], 0x01); + out1[10] = _mm256_extracti128_si256(in[2], 0x01); + out1[11] = _mm256_extracti128_si256(in[3], 0x01); + out1[12] = _mm256_extracti128_si256(in[4], 0x01); + out1[13] = _mm256_extracti128_si256(in[5], 0x01); + out1[14] = _mm256_extracti128_si256(in[6], 0x01); + out1[15] = _mm256_extracti128_si256(in[7], 0x01); +} + #ifdef __cplusplus } #endif diff --git a/media/libaom/src/aom_dsp/x86/txfm_common_sse2.h b/media/libaom/src/aom_dsp/x86/txfm_common_sse2.h index ed82eee962..9c99eb93bd 100644 --- a/media/libaom/src/aom_dsp/x86/txfm_common_sse2.h +++ b/media/libaom/src/aom_dsp/x86/txfm_common_sse2.h @@ -26,4 +26,8 @@ static INLINE __m128i mm_reverse_epi16(const __m128i x) { return _mm_shuffle_epi32(b, 0x4e); } +#define octa_set_epi16(a, b, c, d, e, f, g, h) \ + _mm_setr_epi16((int16_t)(a), (int16_t)(b), (int16_t)(c), (int16_t)(d), \ + (int16_t)(e), (int16_t)(f), (int16_t)(g), (int16_t)(h)) + #endif // AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_ diff --git a/media/libaom/src/aom_dsp/x86/variance_avx2.c b/media/libaom/src/aom_dsp/x86/variance_avx2.c index 800aef1266..c4919ba9b4 100644 --- a/media/libaom/src/aom_dsp/x86/variance_avx2.c +++ b/media/libaom/src/aom_dsp/x86/variance_avx2.c @@ -28,7 +28,7 @@ static INLINE __m128i mm256_add_hi_lo_epi32(const __m256i val) { static INLINE void variance_kernel_avx2(const __m256i src, const __m256i ref, __m256i *const sse, __m256i *const sum) { - const __m256i adj_sub = _mm256_set1_epi16(0xff01); // (1,-1) + const __m256i adj_sub = _mm256_set1_epi16((short)0xff01); // (1,-1) // unpack into pairs of source and reference values const __m256i src_ref0 = _mm256_unpacklo_epi8(src, ref); @@ -234,6 +234,10 @@ unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, int x_offset, int y_offset, const uint8_t *dst, int dst_stride, int height, unsigned int *sse); +unsigned int aom_sub_pixel_variance16xh_avx2(const uint8_t *src, int src_stride, + 
int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, + int height, unsigned int *sse); unsigned int aom_sub_pixel_avg_variance32xh_avx2( const uint8_t *src, int src_stride, int x_offset, int y_offset, @@ -276,6 +280,11 @@ AOM_SUB_PIXEL_VAR_AVX2(64, 32, 32, 6, 5); AOM_SUB_PIXEL_VAR_AVX2(32, 64, 32, 5, 6); AOM_SUB_PIXEL_VAR_AVX2(32, 32, 32, 5, 5); AOM_SUB_PIXEL_VAR_AVX2(32, 16, 32, 5, 4); +AOM_SUB_PIXEL_VAR_AVX2(16, 64, 16, 4, 6); +AOM_SUB_PIXEL_VAR_AVX2(16, 32, 16, 4, 5); +AOM_SUB_PIXEL_VAR_AVX2(16, 16, 16, 4, 4); +AOM_SUB_PIXEL_VAR_AVX2(16, 8, 16, 4, 3); +AOM_SUB_PIXEL_VAR_AVX2(16, 4, 16, 4, 2); #define AOM_SUB_PIXEL_AVG_VAR_AVX2(w, h, wf, wlog2, hlog2) \ unsigned int aom_sub_pixel_avg_variance##w##x##h##_avx2( \ diff --git a/media/libaom/src/aom_dsp/x86/variance_impl_avx2.c b/media/libaom/src/aom_dsp/x86/variance_impl_avx2.c index 88e27aef3a..f779270ae3 100644 --- a/media/libaom/src/aom_dsp/x86/variance_impl_avx2.c +++ b/media/libaom/src/aom_dsp/x86/variance_impl_avx2.c @@ -104,6 +104,65 @@ DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = { sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \ _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1)); +// Functions related to sub pixel variance width 16 +#define LOAD_SRC_DST_INSERT(src_stride, dst_stride) \ + /* load source and destination of 2 rows and insert*/ \ + src_reg = _mm256_inserti128_si256( \ + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src))), \ + _mm_loadu_si128((__m128i *)(src + src_stride)), 1); \ + dst_reg = _mm256_inserti128_si256( \ + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(dst))), \ + _mm_loadu_si128((__m128i *)(dst + dst_stride)), 1); + +#define AVG_NEXT_SRC_INSERT(src_reg, size_stride) \ + src_next_reg = _mm256_inserti128_si256( \ + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src + size_stride))), \ + _mm_loadu_si128((__m128i *)(src + (size_stride << 1))), 1); \ + /* average between current and next stride source */ \ + src_reg = _mm256_avg_epu8(src_reg, src_next_reg); + +#define MERGE_NEXT_SRC_INSERT(src_reg, size_stride) \ + src_next_reg = _mm256_inserti128_si256( \ + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src + size_stride))), \ + _mm_loadu_si128((__m128i *)(src + (src_stride + size_stride))), 1); \ + MERGE_WITH_SRC(src_reg, src_next_reg) + +#define LOAD_SRC_NEXT_BYTE_INSERT \ + /* load source and another source from next row */ \ + src_reg = _mm256_inserti128_si256( \ + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src))), \ + _mm_loadu_si128((__m128i *)(src + src_stride)), 1); \ + /* load source and next row source from 1 byte onwards */ \ + src_next_reg = _mm256_inserti128_si256( \ + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src + 1))), \ + _mm_loadu_si128((__m128i *)(src + src_stride + 1)), 1); + +#define LOAD_DST_INSERT \ + dst_reg = _mm256_inserti128_si256( \ + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(dst))), \ + _mm_loadu_si128((__m128i *)(dst + dst_stride)), 1); + +#define LOAD_SRC_MERGE_128BIT(filter) \ + __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src)); \ + __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1)); \ + __m128i src_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1); \ + __m128i src_hi = _mm_unpackhi_epi8(src_reg_0, src_reg_1); \ + __m128i filter_128bit = _mm256_castsi256_si128(filter); \ + __m128i pw8_128bit = _mm256_castsi256_si128(pw8); + +#define FILTER_SRC_128BIT(filter) \ + /* filter the source */ \ + src_lo = _mm_maddubs_epi16(src_lo, filter); \ + src_hi = _mm_maddubs_epi16(src_hi, 
filter); \ + \ + /* add 8 to source */ \ + src_lo = _mm_add_epi16(src_lo, pw8_128bit); \ + src_hi = _mm_add_epi16(src_hi, pw8_128bit); \ + \ + /* divide source by 16 */ \ + src_lo = _mm_srai_epi16(src_lo, 4); \ + src_hi = _mm_srai_epi16(src_hi, 4); + unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, int x_offset, int y_offset, const uint8_t *dst, int dst_stride, @@ -127,8 +186,8 @@ unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, src += src_stride; dst += dst_stride; } - // x_offset = 0 and y_offset = 8 - } else if (y_offset == 8) { + // x_offset = 0 and y_offset = 4 + } else if (y_offset == 4) { __m256i src_next_reg; for (i = 0; i < height; i++) { LOAD_SRC_DST @@ -156,8 +215,8 @@ unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, dst += dst_stride; } } - // x_offset = 8 and y_offset = 0 - } else if (x_offset == 8) { + // x_offset = 4 and y_offset = 0 + } else if (x_offset == 4) { if (y_offset == 0) { __m256i src_next_reg; for (i = 0; i < height; i++) { @@ -169,8 +228,8 @@ unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, src += src_stride; dst += dst_stride; } - // x_offset = 8 and y_offset = 8 - } else if (y_offset == 8) { + // x_offset = 4 and y_offset = 4 + } else if (y_offset == 4) { __m256i src_next_reg, src_avg; // load source and another source starting from the next // following byte @@ -189,7 +248,7 @@ unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, CALC_SUM_SSE_INSIDE_LOOP dst += dst_stride; } - // x_offset = 8 and y_offset = bilin interpolation + // x_offset = 4 and y_offset = bilin interpolation } else { __m256i filter, pw8, src_next_reg, src_avg; y_offset <<= 5; @@ -228,8 +287,8 @@ unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, src += src_stride; dst += dst_stride; } - // x_offset = bilin interpolation and y_offset = 8 - } else if (y_offset == 8) { + // x_offset = bilin interpolation and y_offset = 4 + } else if (y_offset == 4) { __m256i filter, pw8, src_next_reg, src_pack; x_offset <<= 5; filter = _mm256_load_si256( @@ -292,6 +351,244 @@ unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, return sum; } +unsigned int aom_sub_pixel_variance16xh_avx2(const uint8_t *src, int src_stride, + int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, + int height, unsigned int *sse) { + __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; + __m256i zero_reg; + int i, sum; + sum_reg = _mm256_set1_epi16(0); + sse_reg = _mm256_set1_epi16(0); + zero_reg = _mm256_set1_epi16(0); + + // x_offset = 0 and y_offset = 0 + if (x_offset == 0) { + if (y_offset == 0) { + for (i = 0; i < height; i += 2) { + LOAD_SRC_DST_INSERT(src_stride, dst_stride) + // expend each byte to 2 bytes + MERGE_WITH_SRC(src_reg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src += (src_stride << 1); + dst += (dst_stride << 1); + } + // x_offset = 0 and y_offset = 4 + } else if (y_offset == 4) { + __m256i src_next_reg; + for (i = 0; i < height; i += 2) { + LOAD_SRC_DST_INSERT(src_stride, dst_stride) + AVG_NEXT_SRC_INSERT(src_reg, src_stride) + // expend each byte to 2 bytes + MERGE_WITH_SRC(src_reg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src += (src_stride << 1); + dst += (dst_stride << 1); + } + // x_offset = 0 and y_offset = bilin interpolation + } else { + __m256i filter, pw8, src_next_reg; + y_offset <<= 5; 
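/* y_offset is scaled by 32 (<<= 5) because each bilinear_filters_avx2[]
 * entry appears to be one 32-byte __m256i row of interleaved 8-bit taps,
 * so offset * 32 selects the filter for that eighth-pel position; the
 * half-pel case (offset == 4) never reaches this branch and is handled
 * above with plain _mm256_avg_epu8 averaging. */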
+ filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); + pw8 = _mm256_set1_epi16(8); + for (i = 0; i < height; i += 2) { + LOAD_SRC_DST_INSERT(src_stride, dst_stride) + MERGE_NEXT_SRC_INSERT(src_reg, src_stride) + FILTER_SRC(filter) + CALC_SUM_SSE_INSIDE_LOOP + src += (src_stride << 1); + dst += (dst_stride << 1); + } + } + // x_offset = 4 and y_offset = 0 + } else if (x_offset == 4) { + if (y_offset == 0) { + __m256i src_next_reg; + for (i = 0; i < height; i += 2) { + LOAD_SRC_NEXT_BYTE_INSERT + LOAD_DST_INSERT + /* average between current and next stride source */ + src_reg = _mm256_avg_epu8(src_reg, src_next_reg); + // expand each byte to 2 bytes + MERGE_WITH_SRC(src_reg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src += (src_stride << 1); + dst += (dst_stride << 1); + } + // x_offset = 4 and y_offset = 4 + } else if (y_offset == 4) { + __m256i src_next_reg, src_avg, src_temp; + // load and insert source and next row source + LOAD_SRC_NEXT_BYTE_INSERT + src_avg = _mm256_avg_epu8(src_reg, src_next_reg); + src += src_stride << 1; + for (i = 0; i < height - 2; i += 2) { + LOAD_SRC_NEXT_BYTE_INSERT + src_next_reg = _mm256_avg_epu8(src_reg, src_next_reg); + src_temp = _mm256_permute2x128_si256(src_avg, src_next_reg, 0x21); + src_temp = _mm256_avg_epu8(src_avg, src_temp); + LOAD_DST_INSERT + // expand each byte to 2 bytes + MERGE_WITH_SRC(src_temp, zero_reg) + // save current source average + src_avg = src_next_reg; + CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride << 1; + src += src_stride << 1; + } + // last 2 rows processing happens here + __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src)); + __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1)); + src_reg_0 = _mm_avg_epu8(src_reg_0, src_reg_1); + src_next_reg = _mm256_permute2x128_si256( + src_avg, _mm256_castsi128_si256(src_reg_0), 0x21); + LOAD_DST_INSERT + src_avg = _mm256_avg_epu8(src_avg, src_next_reg); + MERGE_WITH_SRC(src_avg, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + } else { + // x_offset = 4 and y_offset = bilin interpolation + __m256i filter, pw8, src_next_reg, src_avg, src_temp; + y_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); + pw8 = _mm256_set1_epi16(8); + // load and insert source and next row source + LOAD_SRC_NEXT_BYTE_INSERT + src_avg = _mm256_avg_epu8(src_reg, src_next_reg); + src += src_stride << 1; + for (i = 0; i < height - 2; i += 2) { + LOAD_SRC_NEXT_BYTE_INSERT + src_next_reg = _mm256_avg_epu8(src_reg, src_next_reg); + src_temp = _mm256_permute2x128_si256(src_avg, src_next_reg, 0x21); + LOAD_DST_INSERT + MERGE_WITH_SRC(src_avg, src_temp) + // save current source average + src_avg = src_next_reg; + FILTER_SRC(filter) + CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride << 1; + src += src_stride << 1; + } + // last 2 rows processing happens here + __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src)); + __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1)); + src_reg_0 = _mm_avg_epu8(src_reg_0, src_reg_1); + src_next_reg = _mm256_permute2x128_si256( + src_avg, _mm256_castsi128_si256(src_reg_0), 0x21); + LOAD_DST_INSERT + MERGE_WITH_SRC(src_avg, src_next_reg) + FILTER_SRC(filter) + CALC_SUM_SSE_INSIDE_LOOP + } + // x_offset = bilin interpolation and y_offset = 0 + } else { + if (y_offset == 0) { + __m256i filter, pw8, src_next_reg; + x_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); + pw8 = _mm256_set1_epi16(8); + for (i = 0; i < height; i += 2) { + LOAD_SRC_DST_INSERT(src_stride, 
dst_stride) + MERGE_NEXT_SRC_INSERT(src_reg, 1) + FILTER_SRC(filter) + CALC_SUM_SSE_INSIDE_LOOP + src += (src_stride << 1); + dst += (dst_stride << 1); + } + // x_offset = bilin interpolation and y_offset = 4 + } else if (y_offset == 4) { + __m256i filter, pw8, src_next_reg, src_pack; + x_offset <<= 5; + filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); + pw8 = _mm256_set1_epi16(8); + // load and insert source and next row source + LOAD_SRC_NEXT_BYTE_INSERT + MERGE_WITH_SRC(src_reg, src_next_reg) + FILTER_SRC(filter) + // convert each 16 bit to 8 bit to each low and high lane source + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + src += src_stride << 1; + for (i = 0; i < height - 2; i += 2) { + LOAD_SRC_NEXT_BYTE_INSERT + LOAD_DST_INSERT + MERGE_WITH_SRC(src_reg, src_next_reg) + FILTER_SRC(filter) + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + src_next_reg = _mm256_permute2x128_si256(src_pack, src_reg, 0x21); + // average between previous pack to the current + src_pack = _mm256_avg_epu8(src_pack, src_next_reg); + MERGE_WITH_SRC(src_pack, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + src_pack = src_reg; + src += src_stride << 1; + dst += dst_stride << 1; + } + // last 2 rows processing happens here + LOAD_SRC_MERGE_128BIT(filter) + LOAD_DST_INSERT + FILTER_SRC_128BIT(filter_128bit) + src_reg_0 = _mm_packus_epi16(src_lo, src_hi); + src_next_reg = _mm256_permute2x128_si256( + src_pack, _mm256_castsi128_si256(src_reg_0), 0x21); + // average between previous pack to the current + src_pack = _mm256_avg_epu8(src_pack, src_next_reg); + MERGE_WITH_SRC(src_pack, zero_reg) + CALC_SUM_SSE_INSIDE_LOOP + } else { + // x_offset = bilin interpolation and y_offset = bilin interpolation + __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; + x_offset <<= 5; + xfilter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + x_offset)); + y_offset <<= 5; + yfilter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + y_offset)); + pw8 = _mm256_set1_epi16(8); + // load and insert source and next row source + LOAD_SRC_NEXT_BYTE_INSERT + MERGE_WITH_SRC(src_reg, src_next_reg) + FILTER_SRC(xfilter) + // convert each 16 bit to 8 bit to each low and high lane source + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + src += src_stride << 1; + for (i = 0; i < height - 2; i += 2) { + LOAD_SRC_NEXT_BYTE_INSERT + LOAD_DST_INSERT + MERGE_WITH_SRC(src_reg, src_next_reg) + FILTER_SRC(xfilter) + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + src_next_reg = _mm256_permute2x128_si256(src_pack, src_reg, 0x21); + // average between previous pack to the current + MERGE_WITH_SRC(src_pack, src_next_reg) + // filter the source + FILTER_SRC(yfilter) + src_pack = src_reg; + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride << 1; + dst += dst_stride << 1; + } + // last 2 rows processing happens here + LOAD_SRC_MERGE_128BIT(xfilter) + LOAD_DST_INSERT + FILTER_SRC_128BIT(filter_128bit) + src_reg_0 = _mm_packus_epi16(src_lo, src_hi); + src_next_reg = _mm256_permute2x128_si256( + src_pack, _mm256_castsi128_si256(src_reg_0), 0x21); + MERGE_WITH_SRC(src_pack, src_next_reg) + FILTER_SRC(yfilter) + CALC_SUM_SSE_INSIDE_LOOP + } + } + CALC_SUM_AND_SSE + _mm256_zeroupper(); + return sum; +} + unsigned int aom_sub_pixel_avg_variance32xh_avx2( const uint8_t *src, int src_stride, int x_offset, int y_offset, const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride, diff --git a/media/libaom/src/aom_dsp/x86/variance_sse2.c 
b/media/libaom/src/aom_dsp/x86/variance_sse2.c index 3c37e77c06..4e2b5a1aa0 100644 --- a/media/libaom/src/aom_dsp/x86/variance_sse2.c +++ b/media/libaom/src/aom_dsp/x86/variance_sse2.c @@ -21,9 +21,10 @@ #include "aom_ports/mem.h" +#include "av1/common/av1_common_int.h" #include "av1/common/filter.h" -#include "av1/common/onyxc_int.h" #include "av1/common/reconinter.h" +#include "av1/encoder/reconinter_enc.h" unsigned int aom_get_mb_ss_sse2(const int16_t *src) { __m128i vsum = _mm_setzero_si128(); @@ -144,6 +145,7 @@ static INLINE void variance8_sse2(const uint8_t *src, const int src_stride, __m128i *const sum) { assert(h <= 128); // May overflow for larger height. *sum = _mm_setzero_si128(); + *sse = _mm_setzero_si128(); for (int i = 0; i < h; i++) { const __m128i s = load8_8to16_sse2(src); const __m128i r = load8_8to16_sse2(ref); @@ -236,6 +238,14 @@ static INLINE void variance128_sse2(const uint8_t *src, const int src_stride, } } +void aom_get8x8var_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + __m128i vsse, vsum; + variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum); + variance_final_128_pel_sse2(vsse, vsum, sse, sum); +} + #define AOM_VAR_NO_LOOP_SSE2(bw, bh, bits, max_pixels) \ unsigned int aom_variance##bw##x##bh##_sse2( \ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ @@ -494,88 +504,36 @@ void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm, const int ref_num = 0; const int is_intrabc = is_intrabc_block(mi); const struct scale_factors *const sf = - is_intrabc ? &cm->sf_identity : &xd->block_refs[ref_num]->sf; + is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num]; const int is_scaled = av1_is_scaled(sf); if (is_scaled) { - // Note: This is mostly a copy from the >=8X8 case in - // build_inter_predictors() function, with some small tweaks. - - // Some assumptions. - const int plane = 0; - - // Get pre-requisites. + int plane = 0; + const int mi_x = mi_col * MI_SIZE; + const int mi_y = mi_row * MI_SIZE; const struct macroblockd_plane *const pd = &xd->plane[plane]; - const int ssx = pd->subsampling_x; - const int ssy = pd->subsampling_y; - assert(ssx == 0 && ssy == 0); const struct buf_2d *const dst_buf = &pd->dst; const struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref_num]; - const int mi_x = mi_col * MI_SIZE; - const int mi_y = mi_row * MI_SIZE; - // Calculate subpel_x/y and x/y_step. - const int row_start = 0; // Because ss_y is 0. - const int col_start = 0; // Because ss_x is 0. 
- const int pre_x = (mi_x + MI_SIZE * col_start) >> ssx; - const int pre_y = (mi_y + MI_SIZE * row_start) >> ssy; - int orig_pos_y = pre_y << SUBPEL_BITS; - orig_pos_y += mv->row * (1 << (1 - ssy)); - int orig_pos_x = pre_x << SUBPEL_BITS; - orig_pos_x += mv->col * (1 << (1 - ssx)); - int pos_y = sf->scale_value_y(orig_pos_y, sf); - int pos_x = sf->scale_value_x(orig_pos_x, sf); - pos_x += SCALE_EXTRA_OFF; - pos_y += SCALE_EXTRA_OFF; - - const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy); - const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx); - const int bottom = (pre_buf->height + AOM_INTERP_EXTEND) - << SCALE_SUBPEL_BITS; - const int right = (pre_buf->width + AOM_INTERP_EXTEND) - << SCALE_SUBPEL_BITS; - pos_y = clamp(pos_y, top, bottom); - pos_x = clamp(pos_x, left, right); - - const uint8_t *const pre = - pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride + - (pos_x >> SCALE_SUBPEL_BITS); - - const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4, - pos_x & SCALE_SUBPEL_MASK, - pos_y & SCALE_SUBPEL_MASK }; - - // Get warp types. - const WarpedMotionParams *const wm = - &xd->global_motion[mi->ref_frame[ref_num]]; - const int is_global = is_global_mv_block(mi, wm->wmtype); - WarpTypesAllowed warp_types; - warp_types.global_warp_allowed = is_global; - warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL; - - // Get convolve parameters. - ConvolveParams conv_params = get_conv_params(0, plane, xd->bd); - const InterpFilters filters = + InterPredParams inter_pred_params; + inter_pred_params.conv_params = get_conv_params(0, plane, xd->bd); + const int_interpfilters filters = av1_broadcast_interp_filter(EIGHTTAP_REGULAR); - - // Get the inter predictor. - const int build_for_obmc = 0; - av1_make_inter_predictor(pre, pre_buf->stride, comp_pred, width, - &subpel_params, sf, width, height, &conv_params, - filters, &warp_types, mi_x >> pd->subsampling_x, - mi_y >> pd->subsampling_y, plane, ref_num, mi, - build_for_obmc, xd, cm->allow_warped_motion); - + av1_init_inter_params( + &inter_pred_params, width, height, mi_y >> pd->subsampling_y, + mi_x >> pd->subsampling_x, pd->subsampling_x, pd->subsampling_y, + xd->bd, is_cur_buf_hbd(xd), is_intrabc, sf, pre_buf, filters); + av1_enc_build_one_inter_predictor(comp_pred, width, mv, + &inter_pred_params); return; } } - const InterpFilterParams *filter = - (subpel_search == 1) - ? av1_get_4tap_interp_filter_params(EIGHTTAP_REGULAR) - : av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8); - int filter_taps = (subpel_search == 1) ? 4 : SUBPEL_TAPS; + const InterpFilterParams *filter = av1_get_filter(subpel_search); + // (TODO:yunqing) 2-tap case uses 4-tap functions since there is no SIMD for + // 2-tap yet. + int filter_taps = (subpel_search <= USE_4_TAPS) ? 4 : SUBPEL_TAPS; if (!subpel_x_q3 && !subpel_y_q3) { if (width >= 16) { @@ -638,20 +596,13 @@ void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm, const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1); const uint8_t *ref_start = ref - ref_stride * ((filter_taps >> 1) - 1); - uint8_t *temp_start_horiz = - (subpel_search == 1) ? temp + (filter_taps >> 1) * MAX_SB_SIZE : temp; + uint8_t *temp_start_horiz = (subpel_search <= USE_4_TAPS) + ? 
temp + (filter_taps >> 1) * MAX_SB_SIZE
+                                  : temp;
   uint8_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1);
   int intermediate_height =
       (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps;
   assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
-  // TODO(Deepa): Remove the memset below when we have
-  // 4 tap simd for sse2 and ssse3.
-  if (subpel_search == 1) {
-    memset(temp_start_vert - 3 * MAX_SB_SIZE, 0, width);
-    memset(temp_start_vert - 2 * MAX_SB_SIZE, 0, width);
-    memset(temp_start_vert + (height + 2) * MAX_SB_SIZE, 0, width);
-    memset(temp_start_vert + (height + 3) * MAX_SB_SIZE, 0, width);
-  }
   aom_convolve8_horiz(ref_start, ref_stride, temp_start_horiz, MAX_SB_SIZE,
                       kernel_x, 16, NULL, -1, width, intermediate_height);
   aom_convolve8_vert(temp_start_vert, MAX_SB_SIZE, comp_pred, width, NULL, -1,
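To make the AVX2 control flow above easier to follow, here is a minimal scalar sketch of what the 16-wide sub-pixel variance path computes. It is not libaom code: the helper names are hypothetical, and the (16 - 2*offset, 2*offset) taps with round-by-8 / shift-by-4 mirror the 4-bit bilinear convention implied by pw8 and FILTER_SRC. x_offset/y_offset are the same 1/8-pel offsets; offset 4 reduces to the (a + b + 1) >> 1 average that the _mm256_avg_epu8 branches use.

#include <stdint.h>
#include <stdlib.h>

/* Hypothetical 2-tap bilinear filter: (f0*a + f1*b + 8) >> 4 with f0+f1 = 16. */
static uint8_t bilinear_tap(uint8_t a, uint8_t b, int offset) {
  const int f0 = 16 - 2 * offset, f1 = 2 * offset;
  return (uint8_t)((f0 * a + f1 * b + 8) >> 4);
}

/* Scalar reference for a 16-wide block: horizontal pass, vertical pass,
 * then sum/SSE accumulation against dst (what CALC_SUM_SSE_INSIDE_LOOP and
 * CALC_SUM_AND_SSE fold into the AVX2 loops).  Like the SIMD code, it reads
 * one extra source row and column when an offset is non-zero. */
static unsigned int subpel_variance16_ref(const uint8_t *src, int src_stride,
                                          int x_offset, int y_offset,
                                          const uint8_t *dst, int dst_stride,
                                          int height, unsigned int *sse) {
  enum { W = 16 };
  uint8_t *temp = (uint8_t *)malloc((size_t)(height + 1) * W);
  for (int r = 0; r < height + 1; ++r) {
    for (int c = 0; c < W; ++c) {
      temp[r * W + c] =
          x_offset ? bilinear_tap(src[r * src_stride + c],
                                  src[r * src_stride + c + 1], x_offset)
                   : src[r * src_stride + c];
    }
  }
  int64_t sum = 0;
  uint64_t sse64 = 0;
  for (int r = 0; r < height; ++r) {
    for (int c = 0; c < W; ++c) {
      const uint8_t p = y_offset ? bilinear_tap(temp[r * W + c],
                                                temp[(r + 1) * W + c], y_offset)
                                 : temp[r * W + c];
      const int diff = p - dst[r * dst_stride + c];
      sum += diff;
      sse64 += (uint64_t)(diff * diff);
    }
  }
  free(temp);
  *sse = (unsigned int)sse64;
  /* aom-style return value: SSE - sum^2 / N for an N-pixel block. */
  return (unsigned int)(sse64 - (uint64_t)((sum * sum) / (W * height)));
}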
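The variance_sse2.c hunk above also zero-initializes *sse inside variance8_sse2 (previously only *sum was cleared) and adds the aom_get8x8var_sse2 entry point on top of it. A hypothetical caller that turns the returned (SSE, sum) pair into the usual aom-style variance value for one 8x8 block might look like this; the wrapper name is illustrative, not part of the diff.

#include <stdint.h>

/* Prototype matching the aom_get8x8var_sse2 helper added in the hunk above. */
void aom_get8x8var_sse2(const uint8_t *src_ptr, int src_stride,
                        const uint8_t *ref_ptr, int ref_stride,
                        unsigned int *sse, int *sum);

/* Hypothetical wrapper: variance = SSE - sum^2 / 64; the >> 6 is the
 * divide-by-64 used for an 8x8 block, as in the SSE2 variance macros. */
static unsigned int var8x8_from_get8x8var(const uint8_t *src, int src_stride,
                                          const uint8_t *ref, int ref_stride) {
  unsigned int sse;
  int sum;
  aom_get8x8var_sse2(src, src_stride, ref, ref_stride, &sse, &sum);
  return sse - (unsigned int)(((int64_t)sum * sum) >> 6);
}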