Author: Michael R. Crusoe <michael.crusoe@gmail.com>
Description: Add support for more architectures
 This is done by using the SIMD Everywhere (SIMDe) library.
--- minimap2.orig/ksw2_extd2_sse.c
+++ minimap2/ksw2_extd2_sse.c
@@ -3,81 +3,70 @@
 #include <assert.h>
 #include "ksw2.h"
 
-#ifdef __SSE2__
-#include <emmintrin.h>
+#include "debian/include/simde/x86/sse4.1.h"
 
-#ifdef KSW_SSE2_ONLY
-#undef __SSE4_1__
-#endif
-
-#ifdef __SSE4_1__
-#include <smmintrin.h>
-#endif
-
-#ifdef KSW_CPU_DISPATCH
-#ifdef __SSE4_1__
+#if defined(SIMDE_SSE4_1_NATIVE)
 void ksw_extd2_sse41(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
 				   int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez)
-#else
+#elif defined(SIMDE_SSE2_NATIVE)
 void ksw_extd2_sse2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
-				   int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez)
-#endif
+                                  int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez)
 #else
 void ksw_extd2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
-				   int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez)
-#endif // ~KSW_CPU_DISPATCH
+                                  int8_t q, int8_t e, int8_t q2, int8_t e2, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez)
+#endif
 {
 #define __dp_code_block1 \
-	z = _mm_load_si128(&s[t]); \
-	xt1 = _mm_load_si128(&x[t]);                     /* xt1 <- x[r-1][t..t+15] */ \
-	tmp = _mm_srli_si128(xt1, 15);                   /* tmp <- x[r-1][t+15] */ \
-	xt1 = _mm_or_si128(_mm_slli_si128(xt1, 1), x1_); /* xt1 <- x[r-1][t-1..t+14] */ \
+	z = simde_mm_load_si128(&s[t]); \
+	xt1 = simde_mm_load_si128(&x[t]);                     /* xt1 <- x[r-1][t..t+15] */ \
+	tmp = simde_mm_srli_si128(xt1, 15);                   /* tmp <- x[r-1][t+15] */ \
+	xt1 = simde_mm_or_si128(simde_mm_slli_si128(xt1, 1), x1_); /* xt1 <- x[r-1][t-1..t+14] */ \
 	x1_ = tmp; \
-	vt1 = _mm_load_si128(&v[t]);                     /* vt1 <- v[r-1][t..t+15] */ \
-	tmp = _mm_srli_si128(vt1, 15);                   /* tmp <- v[r-1][t+15] */ \
-	vt1 = _mm_or_si128(_mm_slli_si128(vt1, 1), v1_); /* vt1 <- v[r-1][t-1..t+14] */ \
+	vt1 = simde_mm_load_si128(&v[t]);                     /* vt1 <- v[r-1][t..t+15] */ \
+	tmp = simde_mm_srli_si128(vt1, 15);                   /* tmp <- v[r-1][t+15] */ \
+	vt1 = simde_mm_or_si128(simde_mm_slli_si128(vt1, 1), v1_); /* vt1 <- v[r-1][t-1..t+14] */ \
 	v1_ = tmp; \
-	a = _mm_add_epi8(xt1, vt1);                      /* a <- x[r-1][t-1..t+14] + v[r-1][t-1..t+14] */ \
-	ut = _mm_load_si128(&u[t]);                      /* ut <- u[t..t+15] */ \
-	b = _mm_add_epi8(_mm_load_si128(&y[t]), ut);     /* b <- y[r-1][t..t+15] + u[r-1][t..t+15] */ \
-	x2t1= _mm_load_si128(&x2[t]); \
-	tmp = _mm_srli_si128(x2t1, 15); \
-	x2t1= _mm_or_si128(_mm_slli_si128(x2t1, 1), x21_); \
+	a = simde_mm_add_epi8(xt1, vt1);                      /* a <- x[r-1][t-1..t+14] + v[r-1][t-1..t+14] */ \
+	ut = simde_mm_load_si128(&u[t]);                      /* ut <- u[t..t+15] */ \
+	b = simde_mm_add_epi8(simde_mm_load_si128(&y[t]), ut);     /* b <- y[r-1][t..t+15] + u[r-1][t..t+15] */ \
+	x2t1= simde_mm_load_si128(&x2[t]); \
+	tmp = simde_mm_srli_si128(x2t1, 15); \
+	x2t1= simde_mm_or_si128(simde_mm_slli_si128(x2t1, 1), x21_); \
 	x21_= tmp; \
-	a2= _mm_add_epi8(x2t1, vt1); \
-	b2= _mm_add_epi8(_mm_load_si128(&y2[t]), ut);
+	a2= simde_mm_add_epi8(x2t1, vt1); \
+	b2= simde_mm_add_epi8(simde_mm_load_si128(&y2[t]), ut);
 
 #define __dp_code_block2 \
-	_mm_store_si128(&u[t], _mm_sub_epi8(z, vt1));    /* u[r][t..t+15] <- z - v[r-1][t-1..t+14] */ \
-	_mm_store_si128(&v[t], _mm_sub_epi8(z, ut));     /* v[r][t..t+15] <- z - u[r-1][t..t+15] */ \
-	tmp = _mm_sub_epi8(z, q_); \
-	a = _mm_sub_epi8(a, tmp); \
-	b = _mm_sub_epi8(b, tmp); \
-	tmp = _mm_sub_epi8(z, q2_); \
-	a2= _mm_sub_epi8(a2, tmp); \
-	b2= _mm_sub_epi8(b2, tmp);
+	simde_mm_store_si128(&u[t], simde_mm_sub_epi8(z, vt1));    /* u[r][t..t+15] <- z - v[r-1][t-1..t+14] */ \
+	simde_mm_store_si128(&v[t], simde_mm_sub_epi8(z, ut));     /* v[r][t..t+15] <- z - u[r-1][t..t+15] */ \
+	tmp = simde_mm_sub_epi8(z, q_); \
+	a = simde_mm_sub_epi8(a, tmp); \
+	b = simde_mm_sub_epi8(b, tmp); \
+	tmp = simde_mm_sub_epi8(z, q2_); \
+	a2= simde_mm_sub_epi8(a2, tmp); \
+	b2= simde_mm_sub_epi8(b2, tmp);
 
 	int r, t, qe = q + e, n_col_, *off = 0, *off_end = 0, tlen_, qlen_, last_st, last_en, wl, wr, max_sc, min_sc, long_thres, long_diff;
 	int with_cigar = !(flag&KSW_EZ_SCORE_ONLY), approx_max = !!(flag&KSW_EZ_APPROX_MAX);
 	int32_t *H = 0, H0 = 0, last_H0_t = 0;
 	uint8_t *qr, *sf, *mem, *mem2 = 0;
-	__m128i q_, q2_, qe_, qe2_, zero_, sc_mch_, sc_mis_, m1_, sc_N_;
-	__m128i *u, *v, *x, *y, *x2, *y2, *s, *p = 0;
+	simde__m128i q_, q2_, qe_, qe2_, zero_, sc_mch_, sc_mis_, m1_, sc_N_;
+	simde__m128i *u, *v, *x, *y, *x2, *y2, *s, *p = 0;
 
 	ksw_reset_extz(ez);
 	if (m <= 1 || qlen <= 0 || tlen <= 0) return;
 
 	if (q2 + e2 < q + e) t = q, q = q2, q2 = t, t = e, e = e2, e2 = t; // make sure q+e no larger than q2+e2
 
-	zero_   = _mm_set1_epi8(0);
-	q_      = _mm_set1_epi8(q);
-	q2_     = _mm_set1_epi8(q2);
-	qe_     = _mm_set1_epi8(q + e);
-	qe2_    = _mm_set1_epi8(q2 + e2);
-	sc_mch_ = _mm_set1_epi8(mat[0]);
-	sc_mis_ = _mm_set1_epi8(mat[1]);
-	sc_N_   = mat[m*m-1] == 0? _mm_set1_epi8(-e2) : _mm_set1_epi8(mat[m*m-1]);
-	m1_     = _mm_set1_epi8(m - 1); // wildcard
+	zero_   = simde_mm_set1_epi8(0);
+	q_      = simde_mm_set1_epi8(q);
+	q2_     = simde_mm_set1_epi8(q2);
+	qe_     = simde_mm_set1_epi8(q + e);
+	qe2_    = simde_mm_set1_epi8(q2 + e2);
+	sc_mch_ = simde_mm_set1_epi8(mat[0]);
+	sc_mis_ = simde_mm_set1_epi8(mat[1]);
+	sc_N_   = mat[m*m-1] == 0? simde_mm_set1_epi8(-e2) : simde_mm_set1_epi8(mat[m*m-1]);
+	m1_     = simde_mm_set1_epi8(m - 1); // wildcard
 
 	if (w < 0) w = tlen > qlen? tlen : qlen;
 	wl = wr = w;
@@ -97,7 +86,7 @@
 	long_diff = long_thres * (e - e2) - (q2 - q) - e2;
 
 	mem = (uint8_t*)kcalloc(km, tlen_ * 8 + qlen_ + 1, 16);
-	u = (__m128i*)(((size_t)mem + 15) >> 4 << 4); // 16-byte aligned
+	u = (simde__m128i*)(((size_t)mem + 15) >> 4 << 4); // 16-byte aligned
 	v = u + tlen_, x = v + tlen_, y = x + tlen_, x2 = y + tlen_, y2 = x2 + tlen_;
 	s = y2 + tlen_, sf = (uint8_t*)(s + tlen_), qr = sf + tlen_ * 16;
 	memset(u,  -q  - e,  tlen_ * 16);
@@ -112,7 +101,7 @@
 	}
 	if (with_cigar) {
 		mem2 = (uint8_t*)kmalloc(km, ((size_t)(qlen + tlen - 1) * n_col_ + 1) * 16);
-		p = (__m128i*)(((size_t)mem2 + 15) >> 4 << 4);
+		p = (simde__m128i*)(((size_t)mem2 + 15) >> 4 << 4);
 		off = (int*)kmalloc(km, (qlen + tlen - 1) * sizeof(int) * 2);
 		off_end = off + qlen + tlen - 1;
 	}
@@ -125,7 +114,7 @@
 		int8_t x1, x21, v1;
 		uint8_t *qrr = qr + (qlen - 1 - r);
 		int8_t *u8 = (int8_t*)u, *v8 = (int8_t*)v, *x8 = (int8_t*)x, *x28 = (int8_t*)x2;
-		__m128i x1_, x21_, v1_;
+		simde__m128i x1_, x21_, v1_;
 		// find the boundaries
 		if (st < r - qlen + 1) st = r - qlen + 1;
 		if (en > r) en = r;
@@ -156,160 +145,99 @@
 		// loop fission: set scores first
 		if (!(flag & KSW_EZ_GENERIC_SC)) {
 			for (t = st0; t <= en0; t += 16) {
-				__m128i sq, st, tmp, mask;
-				sq = _mm_loadu_si128((__m128i*)&sf[t]);
-				st = _mm_loadu_si128((__m128i*)&qrr[t]);
-				mask = _mm_or_si128(_mm_cmpeq_epi8(sq, m1_), _mm_cmpeq_epi8(st, m1_));
-				tmp = _mm_cmpeq_epi8(sq, st);
-#ifdef __SSE4_1__
-				tmp = _mm_blendv_epi8(sc_mis_, sc_mch_, tmp);
-				tmp = _mm_blendv_epi8(tmp,     sc_N_,   mask);
-#else
-				tmp = _mm_or_si128(_mm_andnot_si128(tmp,  sc_mis_), _mm_and_si128(tmp,  sc_mch_));
-				tmp = _mm_or_si128(_mm_andnot_si128(mask, tmp),     _mm_and_si128(mask, sc_N_));
-#endif
-				_mm_storeu_si128((__m128i*)((int8_t*)s + t), tmp);
+				simde__m128i sq, st, tmp, mask;
+				sq = simde_mm_loadu_si128((simde__m128i*)&sf[t]);
+				st = simde_mm_loadu_si128((simde__m128i*)&qrr[t]);
+				mask = simde_mm_or_si128(simde_mm_cmpeq_epi8(sq, m1_), simde_mm_cmpeq_epi8(st, m1_));
+				tmp = simde_mm_cmpeq_epi8(sq, st);
+				tmp = simde_mm_blendv_epi8(sc_mis_, sc_mch_, tmp);
+				tmp = simde_mm_blendv_epi8(tmp,     sc_N_,   mask);
+				simde_mm_storeu_si128((simde__m128i*)((int8_t*)s + t), tmp);
 			}
 		} else {
 			for (t = st0; t <= en0; ++t)
 				((uint8_t*)s)[t] = mat[sf[t] * m + qrr[t]];
 		}
 		// core loop
-		x1_  = _mm_cvtsi32_si128((uint8_t)x1);
-		x21_ = _mm_cvtsi32_si128((uint8_t)x21);
-		v1_  = _mm_cvtsi32_si128((uint8_t)v1);
+		x1_  = simde_mm_cvtsi32_si128((uint8_t)x1);
+		x21_ = simde_mm_cvtsi32_si128((uint8_t)x21);
+		v1_  = simde_mm_cvtsi32_si128((uint8_t)v1);
 		st_ = st / 16, en_ = en / 16;
 		assert(en_ - st_ + 1 <= n_col_);
 		if (!with_cigar) { // score only
 			for (t = st_; t <= en_; ++t) {
-				__m128i z, a, b, a2, b2, xt1, x2t1, vt1, ut, tmp;
+				simde__m128i z, a, b, a2, b2, xt1, x2t1, vt1, ut, tmp;
 				__dp_code_block1;
-#ifdef __SSE4_1__
-				z = _mm_max_epi8(z, a);
-				z = _mm_max_epi8(z, b);
-				z = _mm_max_epi8(z, a2);
-				z = _mm_max_epi8(z, b2);
-				z = _mm_min_epi8(z, sc_mch_);
+				z = simde_mm_max_epi8(z, a);
+				z = simde_mm_max_epi8(z, b);
+				z = simde_mm_max_epi8(z, a2);
+				z = simde_mm_max_epi8(z, b2);
+				z = simde_mm_min_epi8(z, sc_mch_);
 				__dp_code_block2; // save u[] and v[]; update a, b, a2 and b2
-				_mm_store_si128(&x[t],  _mm_sub_epi8(_mm_max_epi8(a,  zero_), qe_));
-				_mm_store_si128(&y[t],  _mm_sub_epi8(_mm_max_epi8(b,  zero_), qe_));
-				_mm_store_si128(&x2[t], _mm_sub_epi8(_mm_max_epi8(a2, zero_), qe2_));
-				_mm_store_si128(&y2[t], _mm_sub_epi8(_mm_max_epi8(b2, zero_), qe2_));
-#else
-				tmp = _mm_cmpgt_epi8(a,  z);
-				z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a));
-				tmp = _mm_cmpgt_epi8(b,  z);
-				z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b));
-				tmp = _mm_cmpgt_epi8(a2, z);
-				z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a2));
-				tmp = _mm_cmpgt_epi8(b2, z);
-				z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b2));
-				tmp = _mm_cmplt_epi8(sc_mch_, z);
-				z = _mm_or_si128(_mm_and_si128(tmp, sc_mch_), _mm_andnot_si128(tmp, z));
-				__dp_code_block2;
-				tmp = _mm_cmpgt_epi8(a, zero_);
-				_mm_store_si128(&x[t],  _mm_sub_epi8(_mm_and_si128(tmp, a),  qe_));
-				tmp = _mm_cmpgt_epi8(b, zero_);
-				_mm_store_si128(&y[t],  _mm_sub_epi8(_mm_and_si128(tmp, b),  qe_));
-				tmp = _mm_cmpgt_epi8(a2, zero_);
-				_mm_store_si128(&x2[t], _mm_sub_epi8(_mm_and_si128(tmp, a2), qe2_));
-				tmp = _mm_cmpgt_epi8(b2, zero_);
-				_mm_store_si128(&y2[t], _mm_sub_epi8(_mm_and_si128(tmp, b2), qe2_));
-#endif
+				simde_mm_store_si128(&x[t],  simde_mm_sub_epi8(simde_mm_max_epi8(a,  zero_), qe_));
+				simde_mm_store_si128(&y[t],  simde_mm_sub_epi8(simde_mm_max_epi8(b,  zero_), qe_));
+				simde_mm_store_si128(&x2[t], simde_mm_sub_epi8(simde_mm_max_epi8(a2, zero_), qe2_));
+				simde_mm_store_si128(&y2[t], simde_mm_sub_epi8(simde_mm_max_epi8(b2, zero_), qe2_));
 			}
 		} else if (!(flag&KSW_EZ_RIGHT)) { // gap left-alignment
-			__m128i *pr = p + (size_t)r * n_col_ - st_;
+			simde__m128i *pr = p + (size_t)r * n_col_ - st_;
 			off[r] = st, off_end[r] = en;
 			for (t = st_; t <= en_; ++t) {
-				__m128i d, z, a, b, a2, b2, xt1, x2t1, vt1, ut, tmp;
+				simde__m128i d, z, a, b, a2, b2, xt1, x2t1, vt1, ut, tmp;
 				__dp_code_block1;
-#ifdef __SSE4_1__
-				d = _mm_and_si128(_mm_cmpgt_epi8(a, z), _mm_set1_epi8(1));       // d = a  > z? 1 : 0
-				z = _mm_max_epi8(z, a);
-				d = _mm_blendv_epi8(d, _mm_set1_epi8(2), _mm_cmpgt_epi8(b,  z)); // d = b  > z? 2 : d
-				z = _mm_max_epi8(z, b);
-				d = _mm_blendv_epi8(d, _mm_set1_epi8(3), _mm_cmpgt_epi8(a2, z)); // d = a2 > z? 3 : d
-				z = _mm_max_epi8(z, a2);
-				d = _mm_blendv_epi8(d, _mm_set1_epi8(4), _mm_cmpgt_epi8(b2, z)); // d = a2 > z? 3 : d
-				z = _mm_max_epi8(z, b2);
-				z = _mm_min_epi8(z, sc_mch_);
-#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8()
-				tmp = _mm_cmpgt_epi8(a,  z);
-				d = _mm_and_si128(tmp, _mm_set1_epi8(1));
-				z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a));
-				tmp = _mm_cmpgt_epi8(b,  z);
-				d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, _mm_set1_epi8(2)));
-				z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b));
-				tmp = _mm_cmpgt_epi8(a2, z);
-				d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, _mm_set1_epi8(3)));
-				z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a2));
-				tmp = _mm_cmpgt_epi8(b2, z);
-				d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, _mm_set1_epi8(4)));
-				z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b2));
-				tmp = _mm_cmplt_epi8(sc_mch_, z);
-				z = _mm_or_si128(_mm_and_si128(tmp, sc_mch_), _mm_andnot_si128(tmp, z));
-#endif
+				d = simde_mm_and_si128(simde_mm_cmpgt_epi8(a, z), simde_mm_set1_epi8(1));       // d = a  > z? 1 : 0
+				z = simde_mm_max_epi8(z, a);
+				d = simde_mm_blendv_epi8(d, simde_mm_set1_epi8(2), simde_mm_cmpgt_epi8(b,  z)); // d = b  > z? 2 : d
+				z = simde_mm_max_epi8(z, b);
+				d = simde_mm_blendv_epi8(d, simde_mm_set1_epi8(3), simde_mm_cmpgt_epi8(a2, z)); // d = a2 > z? 3 : d
+				z = simde_mm_max_epi8(z, a2);
+				d = simde_mm_blendv_epi8(d, simde_mm_set1_epi8(4), simde_mm_cmpgt_epi8(b2, z)); // d = a2 > z? 3 : d
+				z = simde_mm_max_epi8(z, b2);
+				z = simde_mm_min_epi8(z, sc_mch_);
 				__dp_code_block2;
-				tmp = _mm_cmpgt_epi8(a, zero_);
-				_mm_store_si128(&x[t],  _mm_sub_epi8(_mm_and_si128(tmp, a),  qe_));
-				d = _mm_or_si128(d, _mm_and_si128(tmp, _mm_set1_epi8(0x08))); // d = a > 0? 1<<3 : 0
-				tmp = _mm_cmpgt_epi8(b, zero_);
-				_mm_store_si128(&y[t],  _mm_sub_epi8(_mm_and_si128(tmp, b),  qe_));
-				d = _mm_or_si128(d, _mm_and_si128(tmp, _mm_set1_epi8(0x10))); // d = b > 0? 1<<4 : 0
-				tmp = _mm_cmpgt_epi8(a2, zero_);
-				_mm_store_si128(&x2[t], _mm_sub_epi8(_mm_and_si128(tmp, a2), qe2_));
-				d = _mm_or_si128(d, _mm_and_si128(tmp, _mm_set1_epi8(0x20))); // d = a > 0? 1<<5 : 0
-				tmp = _mm_cmpgt_epi8(b2, zero_);
-				_mm_store_si128(&y2[t], _mm_sub_epi8(_mm_and_si128(tmp, b2), qe2_));
-				d = _mm_or_si128(d, _mm_and_si128(tmp, _mm_set1_epi8(0x40))); // d = b > 0? 1<<6 : 0
-				_mm_store_si128(&pr[t], d);
+				tmp = simde_mm_cmpgt_epi8(a, zero_);
+				simde_mm_store_si128(&x[t],  simde_mm_sub_epi8(simde_mm_and_si128(tmp, a),  qe_));
+				d = simde_mm_or_si128(d, simde_mm_and_si128(tmp, simde_mm_set1_epi8(0x08))); // d = a > 0? 1<<3 : 0
+				tmp = simde_mm_cmpgt_epi8(b, zero_);
+				simde_mm_store_si128(&y[t],  simde_mm_sub_epi8(simde_mm_and_si128(tmp, b),  qe_));
+				d = simde_mm_or_si128(d, simde_mm_and_si128(tmp, simde_mm_set1_epi8(0x10))); // d = b > 0? 1<<4 : 0
+				tmp = simde_mm_cmpgt_epi8(a2, zero_);
+				simde_mm_store_si128(&x2[t], simde_mm_sub_epi8(simde_mm_and_si128(tmp, a2), qe2_));
+				d = simde_mm_or_si128(d, simde_mm_and_si128(tmp, simde_mm_set1_epi8(0x20))); // d = a > 0? 1<<5 : 0
+				tmp = simde_mm_cmpgt_epi8(b2, zero_);
+				simde_mm_store_si128(&y2[t], simde_mm_sub_epi8(simde_mm_and_si128(tmp, b2), qe2_));
+				d = simde_mm_or_si128(d, simde_mm_and_si128(tmp, simde_mm_set1_epi8(0x40))); // d = b > 0? 1<<6 : 0
+				simde_mm_store_si128(&pr[t], d);
 			}
 		} else { // gap right-alignment
-			__m128i *pr = p + (size_t)r * n_col_ - st_;
+			simde__m128i *pr = p + (size_t)r * n_col_ - st_;
 			off[r] = st, off_end[r] = en;
 			for (t = st_; t <= en_; ++t) {
-				__m128i d, z, a, b, a2, b2, xt1, x2t1, vt1, ut, tmp;
+				simde__m128i d, z, a, b, a2, b2, xt1, x2t1, vt1, ut, tmp;
 				__dp_code_block1;
-#ifdef __SSE4_1__
-				d = _mm_andnot_si128(_mm_cmpgt_epi8(z, a), _mm_set1_epi8(1));    // d = z > a?  0 : 1
-				z = _mm_max_epi8(z, a);
-				d = _mm_blendv_epi8(_mm_set1_epi8(2), d, _mm_cmpgt_epi8(z, b));  // d = z > b?  d : 2
-				z = _mm_max_epi8(z, b);
-				d = _mm_blendv_epi8(_mm_set1_epi8(3), d, _mm_cmpgt_epi8(z, a2)); // d = z > a2? d : 3
-				z = _mm_max_epi8(z, a2);
-				d = _mm_blendv_epi8(_mm_set1_epi8(4), d, _mm_cmpgt_epi8(z, b2)); // d = z > b2? d : 4
-				z = _mm_max_epi8(z, b2);
-				z = _mm_min_epi8(z, sc_mch_);
-#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8()
-				tmp = _mm_cmpgt_epi8(z, a);
-				d = _mm_andnot_si128(tmp, _mm_set1_epi8(1));
-				z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, a));
-				tmp = _mm_cmpgt_epi8(z, b);
-				d = _mm_or_si128(_mm_and_si128(tmp, d), _mm_andnot_si128(tmp, _mm_set1_epi8(2)));
-				z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, b));
-				tmp = _mm_cmpgt_epi8(z, a2);
-				d = _mm_or_si128(_mm_and_si128(tmp, d), _mm_andnot_si128(tmp, _mm_set1_epi8(3)));
-				z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, a2));
-				tmp = _mm_cmpgt_epi8(z, b2);
-				d = _mm_or_si128(_mm_and_si128(tmp, d), _mm_andnot_si128(tmp, _mm_set1_epi8(4)));
-				z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, b2));
-				tmp = _mm_cmplt_epi8(sc_mch_, z);
-				z = _mm_or_si128(_mm_and_si128(tmp, sc_mch_), _mm_andnot_si128(tmp, z));
-#endif
+				d = simde_mm_andnot_si128(simde_mm_cmpgt_epi8(z, a), simde_mm_set1_epi8(1));    // d = z > a?  0 : 1
+				z = simde_mm_max_epi8(z, a);
+				d = simde_mm_blendv_epi8(simde_mm_set1_epi8(2), d, simde_mm_cmpgt_epi8(z, b));  // d = z > b?  d : 2
+				z = simde_mm_max_epi8(z, b);
+				d = simde_mm_blendv_epi8(simde_mm_set1_epi8(3), d, simde_mm_cmpgt_epi8(z, a2)); // d = z > a2? d : 3
+				z = simde_mm_max_epi8(z, a2);
+				d = simde_mm_blendv_epi8(simde_mm_set1_epi8(4), d, simde_mm_cmpgt_epi8(z, b2)); // d = z > b2? d : 4
+				z = simde_mm_max_epi8(z, b2);
+				z = simde_mm_min_epi8(z, sc_mch_);
 				__dp_code_block2;
-				tmp = _mm_cmpgt_epi8(zero_, a);
-				_mm_store_si128(&x[t],  _mm_sub_epi8(_mm_andnot_si128(tmp, a),  qe_));
-				d = _mm_or_si128(d, _mm_andnot_si128(tmp, _mm_set1_epi8(0x08))); // d = a > 0? 1<<3 : 0
-				tmp = _mm_cmpgt_epi8(zero_, b);
-				_mm_store_si128(&y[t],  _mm_sub_epi8(_mm_andnot_si128(tmp, b),  qe_));
-				d = _mm_or_si128(d, _mm_andnot_si128(tmp, _mm_set1_epi8(0x10))); // d = b > 0? 1<<4 : 0
-				tmp = _mm_cmpgt_epi8(zero_, a2);
-				_mm_store_si128(&x2[t], _mm_sub_epi8(_mm_andnot_si128(tmp, a2), qe2_));
-				d = _mm_or_si128(d, _mm_andnot_si128(tmp, _mm_set1_epi8(0x20))); // d = a > 0? 1<<5 : 0
-				tmp = _mm_cmpgt_epi8(zero_, b2);
-				_mm_store_si128(&y2[t], _mm_sub_epi8(_mm_andnot_si128(tmp, b2), qe2_));
-				d = _mm_or_si128(d, _mm_andnot_si128(tmp, _mm_set1_epi8(0x40))); // d = b > 0? 1<<6 : 0
-				_mm_store_si128(&pr[t], d);
+				tmp = simde_mm_cmpgt_epi8(zero_, a);
+				simde_mm_store_si128(&x[t],  simde_mm_sub_epi8(simde_mm_andnot_si128(tmp, a),  qe_));
+				d = simde_mm_or_si128(d, simde_mm_andnot_si128(tmp, simde_mm_set1_epi8(0x08))); // d = a > 0? 1<<3 : 0
+				tmp = simde_mm_cmpgt_epi8(zero_, b);
+				simde_mm_store_si128(&y[t],  simde_mm_sub_epi8(simde_mm_andnot_si128(tmp, b),  qe_));
+				d = simde_mm_or_si128(d, simde_mm_andnot_si128(tmp, simde_mm_set1_epi8(0x10))); // d = b > 0? 1<<4 : 0
+				tmp = simde_mm_cmpgt_epi8(zero_, a2);
+				simde_mm_store_si128(&x2[t], simde_mm_sub_epi8(simde_mm_andnot_si128(tmp, a2), qe2_));
+				d = simde_mm_or_si128(d, simde_mm_andnot_si128(tmp, simde_mm_set1_epi8(0x20))); // d = a > 0? 1<<5 : 0
+				tmp = simde_mm_cmpgt_epi8(zero_, b2);
+				simde_mm_store_si128(&y2[t], simde_mm_sub_epi8(simde_mm_andnot_si128(tmp, b2), qe2_));
+				d = simde_mm_or_si128(d, simde_mm_andnot_si128(tmp, simde_mm_set1_epi8(0x40))); // d = b > 0? 1<<6 : 0
+				simde_mm_store_si128(&pr[t], d);
 			}
 		}
 		if (!approx_max) { // find the exact max with a 32-bit score array
@@ -317,29 +245,24 @@
 			// compute H[], max_H and max_t
 			if (r > 0) {
 				int32_t HH[4], tt[4], en1 = st0 + (en0 - st0) / 4 * 4, i;
-				__m128i max_H_, max_t_;
+				simde__m128i max_H_, max_t_;
 				max_H = H[en0] = en0 > 0? H[en0-1] + u8[en0] : H[en0] + v8[en0]; // special casing the last element
 				max_t = en0;
-				max_H_ = _mm_set1_epi32(max_H);
-				max_t_ = _mm_set1_epi32(max_t);
+				max_H_ = simde_mm_set1_epi32(max_H);
+				max_t_ = simde_mm_set1_epi32(max_t);
 				for (t = st0; t < en1; t += 4) { // this implements: H[t]+=v8[t]-qe; if(H[t]>max_H) max_H=H[t],max_t=t;
-					__m128i H1, tmp, t_;
-					H1 = _mm_loadu_si128((__m128i*)&H[t]);
-					t_ = _mm_setr_epi32(v8[t], v8[t+1], v8[t+2], v8[t+3]);
-					H1 = _mm_add_epi32(H1, t_);
-					_mm_storeu_si128((__m128i*)&H[t], H1);
-					t_ = _mm_set1_epi32(t);
-					tmp = _mm_cmpgt_epi32(H1, max_H_);
-#ifdef __SSE4_1__
-					max_H_ = _mm_blendv_epi8(max_H_, H1, tmp);
-					max_t_ = _mm_blendv_epi8(max_t_, t_, tmp);
-#else
-					max_H_ = _mm_or_si128(_mm_and_si128(tmp, H1), _mm_andnot_si128(tmp, max_H_));
-					max_t_ = _mm_or_si128(_mm_and_si128(tmp, t_), _mm_andnot_si128(tmp, max_t_));
-#endif
+					simde__m128i H1, tmp, t_;
+					H1 = simde_mm_loadu_si128((simde__m128i*)&H[t]);
+					t_ = simde_mm_setr_epi32(v8[t], v8[t+1], v8[t+2], v8[t+3]);
+					H1 = simde_mm_add_epi32(H1, t_);
+					simde_mm_storeu_si128((simde__m128i*)&H[t], H1);
+					t_ = simde_mm_set1_epi32(t);
+					tmp = simde_mm_cmpgt_epi32(H1, max_H_);
+					max_H_ = simde_mm_blendv_epi8(max_H_, H1, tmp);
+					max_t_ = simde_mm_blendv_epi8(max_t_, t_, tmp);
 				}
-				_mm_storeu_si128((__m128i*)HH, max_H_);
-				_mm_storeu_si128((__m128i*)tt, max_t_);
+				simde_mm_storeu_si128((simde__m128i*)HH, max_H_);
+				simde_mm_storeu_si128((simde__m128i*)tt, max_t_);
 				for (i = 0; i < 4; ++i)
 					if (max_H < HH[i]) max_H = HH[i], max_t = tt[i] + i;
 				for (; t < en0; ++t) { // for the rest of values that haven't been computed with SSE
@@ -391,4 +314,3 @@
 		kfree(km, mem2); kfree(km, off);
 	}
 }
-#endif // __SSE2__
--- minimap2.orig/ksw2_exts2_sse.c
+++ minimap2/ksw2_exts2_sse.c
@@ -3,76 +3,65 @@
 #include <assert.h>
 #include "ksw2.h"
 
-#ifdef __SSE2__
-#include <emmintrin.h>
+#include "debian/include/simde/x86/sse4.1.h"
 
-#ifdef KSW_SSE2_ONLY
-#undef __SSE4_1__
-#endif
-
-#ifdef __SSE4_1__
-#include <smmintrin.h>
-#endif
-
-#ifdef KSW_CPU_DISPATCH
-#ifdef __SSE4_1__
+#if defined(SIMDE_SSE4_1_NATIVE)
 void ksw_exts2_sse41(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
 				   int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int8_t junc_bonus, int flag, const uint8_t *junc, ksw_extz_t *ez)
-#else
+#elif defined(SIMDE_SSE2_NATIVE)
 void ksw_exts2_sse2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
 				   int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int8_t junc_bonus, int flag, const uint8_t *junc, ksw_extz_t *ez)
-#endif
 #else
 void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat,
 				   int8_t q, int8_t e, int8_t q2, int8_t noncan, int zdrop, int8_t junc_bonus, int flag, const uint8_t *junc, ksw_extz_t *ez)
-#endif // ~KSW_CPU_DISPATCH
+#endif
 {
 #define __dp_code_block1 \
-	z = _mm_load_si128(&s[t]); \
-	xt1 = _mm_load_si128(&x[t]);                     /* xt1 <- x[r-1][t..t+15] */ \
-	tmp = _mm_srli_si128(xt1, 15);                   /* tmp <- x[r-1][t+15] */ \
-	xt1 = _mm_or_si128(_mm_slli_si128(xt1, 1), x1_); /* xt1 <- x[r-1][t-1..t+14] */ \
+	z = simde_mm_load_si128(&s[t]); \
+	xt1 = simde_mm_load_si128(&x[t]);                     /* xt1 <- x[r-1][t..t+15] */ \
+	tmp = simde_mm_srli_si128(xt1, 15);                   /* tmp <- x[r-1][t+15] */ \
+	xt1 = simde_mm_or_si128(simde_mm_slli_si128(xt1, 1), x1_); /* xt1 <- x[r-1][t-1..t+14] */ \
 	x1_ = tmp; \
-	vt1 = _mm_load_si128(&v[t]);                     /* vt1 <- v[r-1][t..t+15] */ \
-	tmp = _mm_srli_si128(vt1, 15);                   /* tmp <- v[r-1][t+15] */ \
-	vt1 = _mm_or_si128(_mm_slli_si128(vt1, 1), v1_); /* vt1 <- v[r-1][t-1..t+14] */ \
+	vt1 = simde_mm_load_si128(&v[t]);                     /* vt1 <- v[r-1][t..t+15] */ \
+	tmp = simde_mm_srli_si128(vt1, 15);                   /* tmp <- v[r-1][t+15] */ \
+	vt1 = simde_mm_or_si128(simde_mm_slli_si128(vt1, 1), v1_); /* vt1 <- v[r-1][t-1..t+14] */ \
 	v1_ = tmp; \
-	a = _mm_add_epi8(xt1, vt1);                      /* a <- x[r-1][t-1..t+14] + v[r-1][t-1..t+14] */ \
-	ut = _mm_load_si128(&u[t]);                      /* ut <- u[t..t+15] */ \
-	b = _mm_add_epi8(_mm_load_si128(&y[t]), ut);     /* b <- y[r-1][t..t+15] + u[r-1][t..t+15] */ \
-	x2t1= _mm_load_si128(&x2[t]); \
-	tmp = _mm_srli_si128(x2t1, 15); \
-	x2t1= _mm_or_si128(_mm_slli_si128(x2t1, 1), x21_); \
+	a = simde_mm_add_epi8(xt1, vt1);                      /* a <- x[r-1][t-1..t+14] + v[r-1][t-1..t+14] */ \
+	ut = simde_mm_load_si128(&u[t]);                      /* ut <- u[t..t+15] */ \
+	b = simde_mm_add_epi8(simde_mm_load_si128(&y[t]), ut);     /* b <- y[r-1][t..t+15] + u[r-1][t..t+15] */ \
+	x2t1= simde_mm_load_si128(&x2[t]); \
+	tmp = simde_mm_srli_si128(x2t1, 15); \
+	x2t1= simde_mm_or_si128(simde_mm_slli_si128(x2t1, 1), x21_); \
 	x21_= tmp; \
-	a2  = _mm_add_epi8(x2t1, vt1); \
-	a2a = _mm_add_epi8(a2, _mm_load_si128(&acceptor[t]));
+	a2  = simde_mm_add_epi8(x2t1, vt1); \
+	a2a = simde_mm_add_epi8(a2, simde_mm_load_si128(&acceptor[t]));
 
 #define __dp_code_block2 \
-	_mm_store_si128(&u[t], _mm_sub_epi8(z, vt1));    /* u[r][t..t+15] <- z - v[r-1][t-1..t+14] */ \
-	_mm_store_si128(&v[t], _mm_sub_epi8(z, ut));     /* v[r][t..t+15] <- z - u[r-1][t..t+15] */ \
-	tmp = _mm_sub_epi8(z, q_); \
-	a = _mm_sub_epi8(a, tmp); \
-	b = _mm_sub_epi8(b, tmp); \
-	a2= _mm_sub_epi8(a2, _mm_sub_epi8(z, q2_));
+	simde_mm_store_si128(&u[t], simde_mm_sub_epi8(z, vt1));    /* u[r][t..t+15] <- z - v[r-1][t-1..t+14] */ \
+	simde_mm_store_si128(&v[t], simde_mm_sub_epi8(z, ut));     /* v[r][t..t+15] <- z - u[r-1][t..t+15] */ \
+	tmp = simde_mm_sub_epi8(z, q_); \
+	a = simde_mm_sub_epi8(a, tmp); \
+	b = simde_mm_sub_epi8(b, tmp); \
+	a2= simde_mm_sub_epi8(a2, simde_mm_sub_epi8(z, q2_));
 
 	int r, t, qe = q + e, n_col_, *off = 0, *off_end = 0, tlen_, qlen_, last_st, last_en, max_sc, min_sc, long_thres, long_diff;
 	int with_cigar = !(flag&KSW_EZ_SCORE_ONLY), approx_max = !!(flag&KSW_EZ_APPROX_MAX);
 	int32_t *H = 0, H0 = 0, last_H0_t = 0;
 	uint8_t *qr, *sf, *mem, *mem2 = 0;
-	__m128i q_, q2_, qe_, zero_, sc_mch_, sc_mis_, sc_N_, m1_;
-	__m128i *u, *v, *x, *y, *x2, *s, *p = 0, *donor, *acceptor;
+	simde__m128i q_, q2_, qe_, zero_, sc_mch_, sc_mis_, sc_N_, m1_;
+	simde__m128i *u, *v, *x, *y, *x2, *s, *p = 0, *donor, *acceptor;
 
 	ksw_reset_extz(ez);
 	if (m <= 1 || qlen <= 0 || tlen <= 0 || q2 <= q + e) return;
 
-	zero_   = _mm_set1_epi8(0);
-	q_      = _mm_set1_epi8(q);
-	q2_     = _mm_set1_epi8(q2);
-	qe_     = _mm_set1_epi8(q + e);
-	sc_mch_ = _mm_set1_epi8(mat[0]);
-	sc_mis_ = _mm_set1_epi8(mat[1]);
-	sc_N_   = mat[m*m-1] == 0? _mm_set1_epi8(-e) : _mm_set1_epi8(mat[m*m-1]);
-	m1_     = _mm_set1_epi8(m - 1); // wildcard
+	zero_   = simde_mm_set1_epi8(0);
+	q_      = simde_mm_set1_epi8(q);
+	q2_     = simde_mm_set1_epi8(q2);
+	qe_     = simde_mm_set1_epi8(q + e);
+	sc_mch_ = simde_mm_set1_epi8(mat[0]);
+	sc_mis_ = simde_mm_set1_epi8(mat[1]);
+	sc_N_   = mat[m*m-1] == 0? simde_mm_set1_epi8(-e) : simde_mm_set1_epi8(mat[m*m-1]);
+	m1_     = simde_mm_set1_epi8(m - 1); // wildcard
 
 	tlen_ = (tlen + 15) / 16;
 	n_col_ = ((qlen < tlen? qlen : tlen) + 15) / 16 + 1;
@@ -89,7 +78,7 @@
 	long_diff = long_thres * e - (q2 - q);
 
 	mem = (uint8_t*)kcalloc(km, tlen_ * 9 + qlen_ + 1, 16);
-	u = (__m128i*)(((size_t)mem + 15) >> 4 << 4); // 16-byte aligned
+	u = (simde__m128i*)(((size_t)mem + 15) >> 4 << 4); // 16-byte aligned
 	v = u + tlen_, x = v + tlen_, y = x + tlen_, x2 = y + tlen_;
 	donor = x2 + tlen_, acceptor = donor + tlen_;
 	s = acceptor + tlen_, sf = (uint8_t*)(s + tlen_), qr = sf + tlen_ * 16;
@@ -101,7 +90,7 @@
 	}
 	if (with_cigar) {
 		mem2 = (uint8_t*)kmalloc(km, ((size_t)(qlen + tlen - 1) * n_col_ + 1) * 16);
-		p = (__m128i*)(((size_t)mem2 + 15) >> 4 << 4);
+		p = (simde__m128i*)(((size_t)mem2 + 15) >> 4 << 4);
 		off = (int*)kmalloc(km, (qlen + tlen - 1) * sizeof(int) * 2);
 		off_end = off + qlen + tlen - 1;
 	}
@@ -167,7 +156,7 @@
 		int st = 0, en = tlen - 1, st0, en0, st_, en_;
 		int8_t x1, x21, v1, *u8 = (int8_t*)u, *v8 = (int8_t*)v;
 		uint8_t *qrr = qr + (qlen - 1 - r);
-		__m128i x1_, x21_, v1_;
+		simde__m128i x1_, x21_, v1_;
 		// find the boundaries
 		if (st < r - qlen + 1) st = r - qlen + 1;
 		if (en > r) en = r;
@@ -189,146 +178,91 @@
 		// loop fission: set scores first
 		if (!(flag & KSW_EZ_GENERIC_SC)) {
 			for (t = st0; t <= en0; t += 16) {
-				__m128i sq, st, tmp, mask;
-				sq = _mm_loadu_si128((__m128i*)&sf[t]);
-				st = _mm_loadu_si128((__m128i*)&qrr[t]);
-				mask = _mm_or_si128(_mm_cmpeq_epi8(sq, m1_), _mm_cmpeq_epi8(st, m1_));
-				tmp = _mm_cmpeq_epi8(sq, st);
-#ifdef __SSE4_1__
-				tmp = _mm_blendv_epi8(sc_mis_, sc_mch_, tmp);
-				tmp = _mm_blendv_epi8(tmp,     sc_N_,   mask);
-#else
-				tmp = _mm_or_si128(_mm_andnot_si128(tmp,  sc_mis_), _mm_and_si128(tmp,  sc_mch_));
-				tmp = _mm_or_si128(_mm_andnot_si128(mask, tmp),     _mm_and_si128(mask, sc_N_));
-#endif
-				_mm_storeu_si128((__m128i*)((int8_t*)s + t), tmp);
+				simde__m128i sq, st, tmp, mask;
+				sq = simde_mm_loadu_si128((simde__m128i*)&sf[t]);
+				st = simde_mm_loadu_si128((simde__m128i*)&qrr[t]);
+				mask = simde_mm_or_si128(simde_mm_cmpeq_epi8(sq, m1_), simde_mm_cmpeq_epi8(st, m1_));
+				tmp = simde_mm_cmpeq_epi8(sq, st);
+				tmp = simde_mm_blendv_epi8(sc_mis_, sc_mch_, tmp);
+				tmp = simde_mm_blendv_epi8(tmp,     sc_N_,   mask);
+				simde_mm_storeu_si128((simde__m128i*)((int8_t*)s + t), tmp);
 			}
 		} else {
 			for (t = st0; t <= en0; ++t)
 				((uint8_t*)s)[t] = mat[sf[t] * m + qrr[t]];
 		}
 		// core loop
-		x1_  = _mm_cvtsi32_si128((uint8_t)x1);
-		x21_ = _mm_cvtsi32_si128((uint8_t)x21);
-		v1_  = _mm_cvtsi32_si128((uint8_t)v1);
+		x1_  = simde_mm_cvtsi32_si128((uint8_t)x1);
+		x21_ = simde_mm_cvtsi32_si128((uint8_t)x21);
+		v1_  = simde_mm_cvtsi32_si128((uint8_t)v1);
 		st_ = st / 16, en_ = en / 16;
 		assert(en_ - st_ + 1 <= n_col_);
 		if (!with_cigar) { // score only
 			for (t = st_; t <= en_; ++t) {
-				__m128i z, a, b, a2, a2a, xt1, x2t1, vt1, ut, tmp;
+				simde__m128i z, a, b, a2, a2a, xt1, x2t1, vt1, ut, tmp;
 				__dp_code_block1;
-#ifdef __SSE4_1__
-				z = _mm_max_epi8(z, a);
-				z = _mm_max_epi8(z, b);
-				z = _mm_max_epi8(z, a2a);
+				z = simde_mm_max_epi8(z, a);
+				z = simde_mm_max_epi8(z, b);
+				z = simde_mm_max_epi8(z, a2a);
 				__dp_code_block2; // save u[] and v[]; update a, b and a2
-				_mm_store_si128(&x[t],  _mm_sub_epi8(_mm_max_epi8(a,  zero_), qe_));
-				_mm_store_si128(&y[t],  _mm_sub_epi8(_mm_max_epi8(b,  zero_), qe_));
-				tmp = _mm_load_si128(&donor[t]);
-				_mm_store_si128(&x2[t], _mm_sub_epi8(_mm_max_epi8(a2, tmp), q2_));
-#else
-				tmp = _mm_cmpgt_epi8(a,  z);
-				z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a));
-				tmp = _mm_cmpgt_epi8(b,  z);
-				z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b));
-				tmp = _mm_cmpgt_epi8(a2a, z);
-				z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a2a));
-				__dp_code_block2;
-				tmp = _mm_cmpgt_epi8(a, zero_);
-				_mm_store_si128(&x[t],  _mm_sub_epi8(_mm_and_si128(tmp, a),  qe_));
-				tmp = _mm_cmpgt_epi8(b, zero_);
-				_mm_store_si128(&y[t],  _mm_sub_epi8(_mm_and_si128(tmp, b),  qe_));
-				tmp = _mm_load_si128(&donor[t]); // TODO: check if this is correct
-				tmp = _mm_cmpgt_epi8(a2, tmp);
-				tmp = _mm_or_si128(_mm_andnot_si128(tmp, tmp), _mm_and_si128(tmp, a2));
-				_mm_store_si128(&x2[t], _mm_sub_epi8(tmp, q2_));
-#endif
+				simde_mm_store_si128(&x[t],  simde_mm_sub_epi8(simde_mm_max_epi8(a,  zero_), qe_));
+				simde_mm_store_si128(&y[t],  simde_mm_sub_epi8(simde_mm_max_epi8(b,  zero_), qe_));
+				tmp = simde_mm_load_si128(&donor[t]);
+				simde_mm_store_si128(&x2[t], simde_mm_sub_epi8(simde_mm_max_epi8(a2, tmp), q2_));
 			}
 		} else if (!(flag&KSW_EZ_RIGHT)) { // gap left-alignment
-			__m128i *pr = p + r * n_col_ - st_;
+			simde__m128i *pr = p + r * n_col_ - st_;
 			off[r] = st, off_end[r] = en;
 			for (t = st_; t <= en_; ++t) {
-				__m128i d, z, a, b, a2, a2a, xt1, x2t1, vt1, ut, tmp, tmp2;
+				simde__m128i d, z, a, b, a2, a2a, xt1, x2t1, vt1, ut, tmp, tmp2;
 				__dp_code_block1;
-#ifdef __SSE4_1__
-				d = _mm_and_si128(_mm_cmpgt_epi8(a, z), _mm_set1_epi8(1));       // d = a  > z? 1 : 0
-				z = _mm_max_epi8(z, a);
-				d = _mm_blendv_epi8(d, _mm_set1_epi8(2), _mm_cmpgt_epi8(b,  z)); // d = b  > z? 2 : d
-				z = _mm_max_epi8(z, b);
-				d = _mm_blendv_epi8(d, _mm_set1_epi8(3), _mm_cmpgt_epi8(a2a, z)); // d = a2 > z? 3 : d
-				z = _mm_max_epi8(z, a2a);
-#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8()
-				tmp = _mm_cmpgt_epi8(a,  z);
-				d = _mm_and_si128(tmp, _mm_set1_epi8(1));
-				z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a));
-				tmp = _mm_cmpgt_epi8(b,  z);
-				d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, _mm_set1_epi8(2)));
-				z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, b));
-				tmp = _mm_cmpgt_epi8(a2a, z);
-				d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, _mm_set1_epi8(3)));
-				z = _mm_or_si128(_mm_andnot_si128(tmp, z), _mm_and_si128(tmp, a2a));
-#endif
+				d = simde_mm_and_si128(simde_mm_cmpgt_epi8(a, z), simde_mm_set1_epi8(1));       // d = a  > z? 1 : 0
+				z = simde_mm_max_epi8(z, a);
+				d = simde_mm_blendv_epi8(d, simde_mm_set1_epi8(2), simde_mm_cmpgt_epi8(b,  z)); // d = b  > z? 2 : d
+				z = simde_mm_max_epi8(z, b);
+				d = simde_mm_blendv_epi8(d, simde_mm_set1_epi8(3), simde_mm_cmpgt_epi8(a2a, z)); // d = a2a > z? 3 : d
+				z = simde_mm_max_epi8(z, a2a);
 				__dp_code_block2;
-				tmp = _mm_cmpgt_epi8(a, zero_);
-				_mm_store_si128(&x[t],  _mm_sub_epi8(_mm_and_si128(tmp, a),  qe_));
-				d = _mm_or_si128(d, _mm_and_si128(tmp, _mm_set1_epi8(0x08))); // d = a > 0? 1<<3 : 0
-				tmp = _mm_cmpgt_epi8(b, zero_);
-				_mm_store_si128(&y[t],  _mm_sub_epi8(_mm_and_si128(tmp, b),  qe_));
-				d = _mm_or_si128(d, _mm_and_si128(tmp, _mm_set1_epi8(0x10))); // d = b > 0? 1<<4 : 0
-
-				tmp2 = _mm_load_si128(&donor[t]);
-				tmp = _mm_cmpgt_epi8(a2, tmp2);
-#ifdef __SSE4_1__
-				tmp2 = _mm_max_epi8(a2, tmp2);
-#else
-				tmp2 = _mm_or_si128(_mm_andnot_si128(tmp, tmp2), _mm_and_si128(tmp, a2));
-#endif
-				_mm_store_si128(&x2[t], _mm_sub_epi8(tmp2, q2_));
-				d = _mm_or_si128(d, _mm_and_si128(tmp, _mm_set1_epi8(0x20)));
-				_mm_store_si128(&pr[t], d);
+				tmp = simde_mm_cmpgt_epi8(a, zero_);
+				simde_mm_store_si128(&x[t],  simde_mm_sub_epi8(simde_mm_and_si128(tmp, a),  qe_));
+				d = simde_mm_or_si128(d, simde_mm_and_si128(tmp, simde_mm_set1_epi8(0x08))); // d = a > 0? 1<<3 : 0
+				tmp = simde_mm_cmpgt_epi8(b, zero_);
+				simde_mm_store_si128(&y[t],  simde_mm_sub_epi8(simde_mm_and_si128(tmp, b),  qe_));
+				d = simde_mm_or_si128(d, simde_mm_and_si128(tmp, simde_mm_set1_epi8(0x10))); // d = b > 0? 1<<4 : 0
+
+				tmp2 = simde_mm_load_si128(&donor[t]);
+				tmp = simde_mm_cmpgt_epi8(a2, tmp2);
+				tmp2 = simde_mm_max_epi8(a2, tmp2);
+				simde_mm_store_si128(&x2[t], simde_mm_sub_epi8(tmp2, q2_));
+				d = simde_mm_or_si128(d, simde_mm_and_si128(tmp, simde_mm_set1_epi8(0x20)));
+				simde_mm_store_si128(&pr[t], d);
 			}
 		} else { // gap right-alignment
-			__m128i *pr = p + r * n_col_ - st_;
+			simde__m128i *pr = p + r * n_col_ - st_;
 			off[r] = st, off_end[r] = en;
 			for (t = st_; t <= en_; ++t) {
-				__m128i d, z, a, b, a2, a2a, xt1, x2t1, vt1, ut, tmp, tmp2;
+				simde__m128i d, z, a, b, a2, a2a, xt1, x2t1, vt1, ut, tmp, tmp2;
 				__dp_code_block1;
-#ifdef __SSE4_1__
-				d = _mm_andnot_si128(_mm_cmpgt_epi8(z, a), _mm_set1_epi8(1));    // d = z > a?  0 : 1
-				z = _mm_max_epi8(z, a);
-				d = _mm_blendv_epi8(_mm_set1_epi8(2), d, _mm_cmpgt_epi8(z, b));  // d = z > b?  d : 2
-				z = _mm_max_epi8(z, b);
-				d = _mm_blendv_epi8(_mm_set1_epi8(3), d, _mm_cmpgt_epi8(z, a2a)); // d = z > a2? d : 3
-				z = _mm_max_epi8(z, a2a);
-#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8()
-				tmp = _mm_cmpgt_epi8(z, a);
-				d = _mm_andnot_si128(tmp, _mm_set1_epi8(1));
-				z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, a));
-				tmp = _mm_cmpgt_epi8(z, b);
-				d = _mm_or_si128(_mm_and_si128(tmp, d), _mm_andnot_si128(tmp, _mm_set1_epi8(2)));
-				z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, b));
-				tmp = _mm_cmpgt_epi8(z, a2a);
-				d = _mm_or_si128(_mm_and_si128(tmp, d), _mm_andnot_si128(tmp, _mm_set1_epi8(3)));
-				z = _mm_or_si128(_mm_and_si128(tmp, z), _mm_andnot_si128(tmp, a2a));
-#endif
+				d = simde_mm_andnot_si128(simde_mm_cmpgt_epi8(z, a), simde_mm_set1_epi8(1));    // d = z > a?  0 : 1
+				z = simde_mm_max_epi8(z, a);
+				d = simde_mm_blendv_epi8(simde_mm_set1_epi8(2), d, simde_mm_cmpgt_epi8(z, b));  // d = z > b?  d : 2
+				z = simde_mm_max_epi8(z, b);
+				d = simde_mm_blendv_epi8(simde_mm_set1_epi8(3), d, simde_mm_cmpgt_epi8(z, a2a)); // d = z > a2a? d : 3
+				z = simde_mm_max_epi8(z, a2a);
 				__dp_code_block2;
-				tmp = _mm_cmpgt_epi8(zero_, a);
-				_mm_store_si128(&x[t],  _mm_sub_epi8(_mm_andnot_si128(tmp, a),  qe_));
-				d = _mm_or_si128(d, _mm_andnot_si128(tmp, _mm_set1_epi8(0x08))); // d = a > 0? 1<<3 : 0
-				tmp = _mm_cmpgt_epi8(zero_, b);
-				_mm_store_si128(&y[t],  _mm_sub_epi8(_mm_andnot_si128(tmp, b),  qe_));
-				d = _mm_or_si128(d, _mm_andnot_si128(tmp, _mm_set1_epi8(0x10))); // d = b > 0? 1<<4 : 0
-
-				tmp2 = _mm_load_si128(&donor[t]);
-				tmp = _mm_cmpgt_epi8(tmp2, a2);
-#ifdef __SSE4_1__
-				tmp2 = _mm_max_epi8(tmp2, a2);
-#else
-				tmp2 = _mm_or_si128(_mm_andnot_si128(tmp, a2), _mm_and_si128(tmp, tmp2));
-#endif
-				_mm_store_si128(&x2[t], _mm_sub_epi8(tmp2, q2_));
-				d = _mm_or_si128(d, _mm_andnot_si128(tmp, _mm_set1_epi8(0x20))); // d = a > 0? 1<<5 : 0
-				_mm_store_si128(&pr[t], d);
+				tmp = simde_mm_cmpgt_epi8(zero_, a);
+				simde_mm_store_si128(&x[t],  simde_mm_sub_epi8(simde_mm_andnot_si128(tmp, a),  qe_));
+				d = simde_mm_or_si128(d, simde_mm_andnot_si128(tmp, simde_mm_set1_epi8(0x08))); // d = 0 > a? 0 : 1<<3
+				tmp = simde_mm_cmpgt_epi8(zero_, b);
+				simde_mm_store_si128(&y[t],  simde_mm_sub_epi8(simde_mm_andnot_si128(tmp, b),  qe_));
+				d = simde_mm_or_si128(d, simde_mm_andnot_si128(tmp, simde_mm_set1_epi8(0x10))); // d = 0 > b? 0 : 1<<4
+
+				tmp2 = simde_mm_load_si128(&donor[t]);
+				tmp = simde_mm_cmpgt_epi8(tmp2, a2);
+				tmp2 = simde_mm_max_epi8(tmp2, a2);
+				simde_mm_store_si128(&x2[t], simde_mm_sub_epi8(tmp2, q2_));
+				d = simde_mm_or_si128(d, simde_mm_andnot_si128(tmp, simde_mm_set1_epi8(0x20))); // d = donor > a2? 0 : 1<<5
+				simde_mm_store_si128(&pr[t], d);
 			}
 		}
 		if (!approx_max) { // find the exact max with a 32-bit score array
@@ -336,29 +270,24 @@
 			// compute H[], max_H and max_t
 			if (r > 0) {
 				int32_t HH[4], tt[4], en1 = st0 + (en0 - st0) / 4 * 4, i;
-				__m128i max_H_, max_t_;
+				simde__m128i max_H_, max_t_;
 				max_H = H[en0] = en0 > 0? H[en0-1] + u8[en0] : H[en0] + v8[en0]; // special casing the last element
 				max_t = en0;
-				max_H_ = _mm_set1_epi32(max_H);
-				max_t_ = _mm_set1_epi32(max_t);
+				max_H_ = simde_mm_set1_epi32(max_H);
+				max_t_ = simde_mm_set1_epi32(max_t);
 				for (t = st0; t < en1; t += 4) { // this implements: H[t]+=v8[t]-qe; if(H[t]>max_H) max_H=H[t],max_t=t;
-					__m128i H1, tmp, t_;
-					H1 = _mm_loadu_si128((__m128i*)&H[t]);
-					t_ = _mm_setr_epi32(v8[t], v8[t+1], v8[t+2], v8[t+3]);
-					H1 = _mm_add_epi32(H1, t_);
-					_mm_storeu_si128((__m128i*)&H[t], H1);
-					t_ = _mm_set1_epi32(t);
-					tmp = _mm_cmpgt_epi32(H1, max_H_);
-#ifdef __SSE4_1__
-					max_H_ = _mm_blendv_epi8(max_H_, H1, tmp);
-					max_t_ = _mm_blendv_epi8(max_t_, t_, tmp);
-#else
-					max_H_ = _mm_or_si128(_mm_and_si128(tmp, H1), _mm_andnot_si128(tmp, max_H_));
-					max_t_ = _mm_or_si128(_mm_and_si128(tmp, t_), _mm_andnot_si128(tmp, max_t_));
-#endif
+					simde__m128i H1, tmp, t_;
+					H1 = simde_mm_loadu_si128((simde__m128i*)&H[t]);
+					t_ = simde_mm_setr_epi32(v8[t], v8[t+1], v8[t+2], v8[t+3]);
+					H1 = simde_mm_add_epi32(H1, t_);
+					simde_mm_storeu_si128((simde__m128i*)&H[t], H1);
+					t_ = simde_mm_set1_epi32(t);
+					tmp = simde_mm_cmpgt_epi32(H1, max_H_);
+					max_H_ = simde_mm_blendv_epi8(max_H_, H1, tmp);
+					max_t_ = simde_mm_blendv_epi8(max_t_, t_, tmp);
 				}
-				_mm_storeu_si128((__m128i*)HH, max_H_);
-				_mm_storeu_si128((__m128i*)tt, max_t_);
+				simde_mm_storeu_si128((simde__m128i*)HH, max_H_);
+				simde_mm_storeu_si128((simde__m128i*)tt, max_t_);
 				for (i = 0; i < 4; ++i)
 					if (max_H < HH[i]) max_H = HH[i], max_t = tt[i] + i;
 				for (; t < en0; ++t) { // for the rest of values that haven't been computed with SSE
@@ -406,4 +335,3 @@
 		kfree(km, mem2); kfree(km, off);
 	}
 }
-#endif // __SSE2__
--- minimap2.orig/ksw2_extz2_sse.c
+++ minimap2/ksw2_extz2_sse.c
@@ -2,72 +2,61 @@
 #include <assert.h>
 #include "ksw2.h"
 
-#ifdef __SSE2__
-#include <emmintrin.h>
 
-#ifdef KSW_SSE2_ONLY
-#undef __SSE4_1__
-#endif
-
-#ifdef __SSE4_1__
-#include <smmintrin.h>
-#endif
-
-#ifdef KSW_CPU_DISPATCH
-#ifdef __SSE4_1__
+#include "debian/include/simde/x86/sse4.1.h"
+#if defined(SIMDE_SSE4_1_NATIVE)
 void ksw_extz2_sse41(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t q, int8_t e, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez)
-#else
+#elif defined(SIMDE_SSE2_NATIVE)
 void ksw_extz2_sse2(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t q, int8_t e, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez)
-#endif
 #else
 void ksw_extz2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uint8_t *target, int8_t m, const int8_t *mat, int8_t q, int8_t e, int w, int zdrop, int end_bonus, int flag, ksw_extz_t *ez)
-#endif // ~KSW_CPU_DISPATCH
+#endif
 {
 #define __dp_code_block1 \
-	z = _mm_add_epi8(_mm_load_si128(&s[t]), qe2_); \
-	xt1 = _mm_load_si128(&x[t]);                     /* xt1 <- x[r-1][t..t+15] */ \
-	tmp = _mm_srli_si128(xt1, 15);                   /* tmp <- x[r-1][t+15] */ \
-	xt1 = _mm_or_si128(_mm_slli_si128(xt1, 1), x1_); /* xt1 <- x[r-1][t-1..t+14] */ \
+	z = simde_mm_add_epi8(simde_mm_load_si128(&s[t]), qe2_); \
+	xt1 = simde_mm_load_si128(&x[t]);                     /* xt1 <- x[r-1][t..t+15] */ \
+	tmp = simde_mm_srli_si128(xt1, 15);                   /* tmp <- x[r-1][t+15] */ \
+	xt1 = simde_mm_or_si128(simde_mm_slli_si128(xt1, 1), x1_); /* xt1 <- x[r-1][t-1..t+14] */ \
 	x1_ = tmp; \
-	vt1 = _mm_load_si128(&v[t]);                     /* vt1 <- v[r-1][t..t+15] */ \
-	tmp = _mm_srli_si128(vt1, 15);                   /* tmp <- v[r-1][t+15] */ \
-	vt1 = _mm_or_si128(_mm_slli_si128(vt1, 1), v1_); /* vt1 <- v[r-1][t-1..t+14] */ \
+	vt1 = simde_mm_load_si128(&v[t]);                     /* vt1 <- v[r-1][t..t+15] */ \
+	tmp = simde_mm_srli_si128(vt1, 15);                   /* tmp <- v[r-1][t+15] */ \
+	vt1 = simde_mm_or_si128(simde_mm_slli_si128(vt1, 1), v1_); /* vt1 <- v[r-1][t-1..t+14] */ \
 	v1_ = tmp; \
-	a = _mm_add_epi8(xt1, vt1);                      /* a <- x[r-1][t-1..t+14] + v[r-1][t-1..t+14] */ \
-	ut = _mm_load_si128(&u[t]);                      /* ut <- u[t..t+15] */ \
-	b = _mm_add_epi8(_mm_load_si128(&y[t]), ut);     /* b <- y[r-1][t..t+15] + u[r-1][t..t+15] */
+	a = simde_mm_add_epi8(xt1, vt1);                      /* a <- x[r-1][t-1..t+14] + v[r-1][t-1..t+14] */ \
+	ut = simde_mm_load_si128(&u[t]);                      /* ut <- u[t..t+15] */ \
+	b = simde_mm_add_epi8(simde_mm_load_si128(&y[t]), ut);     /* b <- y[r-1][t..t+15] + u[r-1][t..t+15] */
 
 #define __dp_code_block2 \
-	z = _mm_max_epu8(z, b);                          /* z = max(z, b); this works because both are non-negative */ \
-	z = _mm_min_epu8(z, max_sc_); \
-	_mm_store_si128(&u[t], _mm_sub_epi8(z, vt1));    /* u[r][t..t+15] <- z - v[r-1][t-1..t+14] */ \
-	_mm_store_si128(&v[t], _mm_sub_epi8(z, ut));     /* v[r][t..t+15] <- z - u[r-1][t..t+15] */ \
-	z = _mm_sub_epi8(z, q_); \
-	a = _mm_sub_epi8(a, z); \
-	b = _mm_sub_epi8(b, z);
+	z = simde_mm_max_epu8(z, b);                          /* z = max(z, b); this works because both are non-negative */ \
+	z = simde_mm_min_epu8(z, max_sc_); \
+	simde_mm_store_si128(&u[t], simde_mm_sub_epi8(z, vt1));    /* u[r][t..t+15] <- z - v[r-1][t-1..t+14] */ \
+	simde_mm_store_si128(&v[t], simde_mm_sub_epi8(z, ut));     /* v[r][t..t+15] <- z - u[r-1][t..t+15] */ \
+	z = simde_mm_sub_epi8(z, q_); \
+	a = simde_mm_sub_epi8(a, z); \
+	b = simde_mm_sub_epi8(b, z);
 
 	int r, t, qe = q + e, n_col_, *off = 0, *off_end = 0, tlen_, qlen_, last_st, last_en, wl, wr, max_sc, min_sc;
 	int with_cigar = !(flag&KSW_EZ_SCORE_ONLY), approx_max = !!(flag&KSW_EZ_APPROX_MAX);
 	int32_t *H = 0, H0 = 0, last_H0_t = 0;
 	uint8_t *qr, *sf, *mem, *mem2 = 0;
-	__m128i q_, qe2_, zero_, flag1_, flag2_, flag8_, flag16_, sc_mch_, sc_mis_, sc_N_, m1_, max_sc_;
-	__m128i *u, *v, *x, *y, *s, *p = 0;
+	simde__m128i q_, qe2_, zero_, flag1_, flag2_, flag8_, flag16_, sc_mch_, sc_mis_, sc_N_, m1_, max_sc_;
+	simde__m128i *u, *v, *x, *y, *s, *p = 0;
 
 	ksw_reset_extz(ez);
 	if (m <= 0 || qlen <= 0 || tlen <= 0) return;
 
-	zero_   = _mm_set1_epi8(0);
-	q_      = _mm_set1_epi8(q);
-	qe2_    = _mm_set1_epi8((q + e) * 2);
-	flag1_  = _mm_set1_epi8(1);
-	flag2_  = _mm_set1_epi8(2);
-	flag8_  = _mm_set1_epi8(0x08);
-	flag16_ = _mm_set1_epi8(0x10);
-	sc_mch_ = _mm_set1_epi8(mat[0]);
-	sc_mis_ = _mm_set1_epi8(mat[1]);
-	sc_N_   = mat[m*m-1] == 0? _mm_set1_epi8(-e) : _mm_set1_epi8(mat[m*m-1]);
-	m1_     = _mm_set1_epi8(m - 1); // wildcard
-	max_sc_ = _mm_set1_epi8(mat[0] + (q + e) * 2);
+	zero_   = simde_mm_set1_epi8(0);
+	q_      = simde_mm_set1_epi8(q);
+	qe2_    = simde_mm_set1_epi8((q + e) * 2);
+	flag1_  = simde_mm_set1_epi8(1);
+	flag2_  = simde_mm_set1_epi8(2);
+	flag8_  = simde_mm_set1_epi8(0x08);
+	flag16_ = simde_mm_set1_epi8(0x10);
+	sc_mch_ = simde_mm_set1_epi8(mat[0]);
+	sc_mis_ = simde_mm_set1_epi8(mat[1]);
+	sc_N_   = mat[m*m-1] == 0? simde_mm_set1_epi8(-e) : simde_mm_set1_epi8(mat[m*m-1]);
+	m1_     = simde_mm_set1_epi8(m - 1); // wildcard
+	max_sc_ = simde_mm_set1_epi8(mat[0] + (q + e) * 2);
 
 	if (w < 0) w = tlen > qlen? tlen : qlen;
 	wl = wr = w;
@@ -82,7 +71,7 @@
 	if (-min_sc > 2 * (q + e)) return; // otherwise, we won't see any mismatches
 
 	mem = (uint8_t*)kcalloc(km, tlen_ * 6 + qlen_ + 1, 16);
-	u = (__m128i*)(((size_t)mem + 15) >> 4 << 4); // 16-byte aligned
+	u = (simde__m128i*)(((size_t)mem + 15) >> 4 << 4); // 16-byte aligned
 	v = u + tlen_, x = v + tlen_, y = x + tlen_, s = y + tlen_, sf = (uint8_t*)(s + tlen_), qr = sf + tlen_ * 16;
 	if (!approx_max) {
 		H = (int32_t*)kmalloc(km, tlen_ * 16 * 4);
@@ -90,7 +79,7 @@
 	}
 	if (with_cigar) {
 		mem2 = (uint8_t*)kmalloc(km, ((size_t)(qlen + tlen - 1) * n_col_ + 1) * 16);
-		p = (__m128i*)(((size_t)mem2 + 15) >> 4 << 4);
+		p = (simde__m128i*)(((size_t)mem2 + 15) >> 4 << 4);
 		off = (int*)kmalloc(km, (qlen + tlen - 1) * sizeof(int) * 2);
 		off_end = off + qlen + tlen - 1;
 	}
@@ -102,7 +91,7 @@
 		int st = 0, en = tlen - 1, st0, en0, st_, en_;
 		int8_t x1, v1;
 		uint8_t *qrr = qr + (qlen - 1 - r), *u8 = (uint8_t*)u, *v8 = (uint8_t*)v;
-		__m128i x1_, v1_;
+		simde__m128i x1_, v1_;
 		// find the boundaries
 		if (st < r - qlen + 1) st = r - qlen + 1;
 		if (en > r) en = r;
@@ -124,101 +113,70 @@
 		// loop fission: set scores first
 		if (!(flag & KSW_EZ_GENERIC_SC)) {
 			for (t = st0; t <= en0; t += 16) {
-				__m128i sq, st, tmp, mask;
-				sq = _mm_loadu_si128((__m128i*)&sf[t]);
-				st = _mm_loadu_si128((__m128i*)&qrr[t]);
-				mask = _mm_or_si128(_mm_cmpeq_epi8(sq, m1_), _mm_cmpeq_epi8(st, m1_));
-				tmp = _mm_cmpeq_epi8(sq, st);
-#ifdef __SSE4_1__
-				tmp = _mm_blendv_epi8(sc_mis_, sc_mch_, tmp);
-				tmp = _mm_blendv_epi8(tmp,     sc_N_,   mask);
-#else
-				tmp = _mm_or_si128(_mm_andnot_si128(tmp,  sc_mis_), _mm_and_si128(tmp,  sc_mch_));
-				tmp = _mm_or_si128(_mm_andnot_si128(mask, tmp),     _mm_and_si128(mask, sc_N_));
-#endif
-				_mm_storeu_si128((__m128i*)((uint8_t*)s + t), tmp);
+				simde__m128i sq, st, tmp, mask;
+				sq = simde_mm_loadu_si128((simde__m128i*)&sf[t]);
+				st = simde_mm_loadu_si128((simde__m128i*)&qrr[t]);
+				mask = simde_mm_or_si128(simde_mm_cmpeq_epi8(sq, m1_), simde_mm_cmpeq_epi8(st, m1_));
+				tmp = simde_mm_cmpeq_epi8(sq, st);
+				tmp = simde_mm_blendv_epi8(sc_mis_, sc_mch_, tmp);
+				tmp = simde_mm_blendv_epi8(tmp,     sc_N_,   mask);
+				simde_mm_storeu_si128((simde__m128i*)((uint8_t*)s + t), tmp);
 			}
 		} else {
 			for (t = st0; t <= en0; ++t)
 				((uint8_t*)s)[t] = mat[sf[t] * m + qrr[t]];
 		}
 		// core loop
-		x1_ = _mm_cvtsi32_si128(x1);
-		v1_ = _mm_cvtsi32_si128(v1);
+		x1_ = simde_mm_cvtsi32_si128(x1);
+		v1_ = simde_mm_cvtsi32_si128(v1);
 		st_ = st / 16, en_ = en / 16;
 		assert(en_ - st_ + 1 <= n_col_);
 		if (!with_cigar) { // score only
 			for (t = st_; t <= en_; ++t) {
-				__m128i z, a, b, xt1, vt1, ut, tmp;
+				simde__m128i z, a, b, xt1, vt1, ut, tmp;
 				__dp_code_block1;
-#ifdef __SSE4_1__
-				z = _mm_max_epi8(z, a);                          // z = z > a? z : a (signed)
-#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8()
-				z = _mm_and_si128(z, _mm_cmpgt_epi8(z, zero_));  // z = z > 0? z : 0;
-				z = _mm_max_epu8(z, a);                          // z = max(z, a); this works because both are non-negative
-#endif
+				z = simde_mm_max_epi8(z, a);                          // z = z > a? z : a (signed)
 				__dp_code_block2;
-#ifdef __SSE4_1__
-				_mm_store_si128(&x[t], _mm_max_epi8(a, zero_));
-				_mm_store_si128(&y[t], _mm_max_epi8(b, zero_));
-#else
-				tmp = _mm_cmpgt_epi8(a, zero_);
-				_mm_store_si128(&x[t], _mm_and_si128(a, tmp));
-				tmp = _mm_cmpgt_epi8(b, zero_);
-				_mm_store_si128(&y[t], _mm_and_si128(b, tmp));
-#endif
+				simde_mm_store_si128(&x[t], simde_mm_max_epi8(a, zero_));
+				simde_mm_store_si128(&y[t], simde_mm_max_epi8(b, zero_));
 			}
 		} else if (!(flag&KSW_EZ_RIGHT)) { // gap left-alignment
-			__m128i *pr = p + (size_t)r * n_col_ - st_;
+			simde__m128i *pr = p + (size_t)r * n_col_ - st_;
 			off[r] = st, off_end[r] = en;
 			for (t = st_; t <= en_; ++t) {
-				__m128i d, z, a, b, xt1, vt1, ut, tmp;
+				simde__m128i d, z, a, b, xt1, vt1, ut, tmp;
 				__dp_code_block1;
-				d = _mm_and_si128(_mm_cmpgt_epi8(a, z), flag1_); // d = a > z? 1 : 0
-#ifdef __SSE4_1__
-				z = _mm_max_epi8(z, a);                          // z = z > a? z : a (signed)
-				tmp = _mm_cmpgt_epi8(b, z);
-				d = _mm_blendv_epi8(d, flag2_, tmp);             // d = b > z? 2 : d
-#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8()
-				z = _mm_and_si128(z, _mm_cmpgt_epi8(z, zero_));  // z = z > 0? z : 0;
-				z = _mm_max_epu8(z, a);                          // z = max(z, a); this works because both are non-negative
-				tmp = _mm_cmpgt_epi8(b, z);
-				d = _mm_or_si128(_mm_andnot_si128(tmp, d), _mm_and_si128(tmp, flag2_)); // d = b > z? 2 : d; emulating blendv
-#endif
+				d = simde_mm_and_si128(simde_mm_cmpgt_epi8(a, z), flag1_); // d = a > z? 1 : 0
+				z = simde_mm_max_epi8(z, a);                          // z = z > a? z : a (signed)
+				tmp = simde_mm_cmpgt_epi8(b, z);
+				d = simde_mm_blendv_epi8(d, flag2_, tmp);             // d = b > z? 2 : d
 				__dp_code_block2;
-				tmp = _mm_cmpgt_epi8(a, zero_);
-				_mm_store_si128(&x[t], _mm_and_si128(tmp, a));
-				d = _mm_or_si128(d, _mm_and_si128(tmp, flag8_));  // d = a > 0? 0x08 : 0
-				tmp = _mm_cmpgt_epi8(b, zero_);
-				_mm_store_si128(&y[t], _mm_and_si128(tmp, b));
-				d = _mm_or_si128(d, _mm_and_si128(tmp, flag16_)); // d = b > 0? 0x10 : 0
-				_mm_store_si128(&pr[t], d);
+				tmp = simde_mm_cmpgt_epi8(a, zero_);
+				simde_mm_store_si128(&x[t], simde_mm_and_si128(tmp, a));
+				d = simde_mm_or_si128(d, simde_mm_and_si128(tmp, flag8_));  // d = a > 0? 0x08 : 0
+				tmp = simde_mm_cmpgt_epi8(b, zero_);
+				simde_mm_store_si128(&y[t], simde_mm_and_si128(tmp, b));
+				d = simde_mm_or_si128(d, simde_mm_and_si128(tmp, flag16_)); // d = b > 0? 0x10 : 0
+				simde_mm_store_si128(&pr[t], d);
 			}
 		} else { // gap right-alignment
-			__m128i *pr = p + (size_t)r * n_col_ - st_;
+			simde__m128i *pr = p + (size_t)r * n_col_ - st_;
 			off[r] = st, off_end[r] = en;
 			for (t = st_; t <= en_; ++t) {
-				__m128i d, z, a, b, xt1, vt1, ut, tmp;
+				simde__m128i d, z, a, b, xt1, vt1, ut, tmp;
 				__dp_code_block1;
-				d = _mm_andnot_si128(_mm_cmpgt_epi8(z, a), flag1_); // d = z > a? 0 : 1
-#ifdef __SSE4_1__
-				z = _mm_max_epi8(z, a);                          // z = z > a? z : a (signed)
-				tmp = _mm_cmpgt_epi8(z, b);
-				d = _mm_blendv_epi8(flag2_, d, tmp);             // d = z > b? d : 2
-#else // we need to emulate SSE4.1 intrinsics _mm_max_epi8() and _mm_blendv_epi8()
-				z = _mm_and_si128(z, _mm_cmpgt_epi8(z, zero_));  // z = z > 0? z : 0;
-				z = _mm_max_epu8(z, a);                          // z = max(z, a); this works because both are non-negative
-				tmp = _mm_cmpgt_epi8(z, b);
-				d = _mm_or_si128(_mm_andnot_si128(tmp, flag2_), _mm_and_si128(tmp, d)); // d = z > b? d : 2; emulating blendv
-#endif
+				d = simde_mm_andnot_si128(simde_mm_cmpgt_epi8(z, a), flag1_); // d = z > a? 0 : 1
+				z = simde_mm_max_epi8(z, a);                          // z = z > a? z : a (signed)
+				tmp = simde_mm_cmpgt_epi8(z, b);
+				d = simde_mm_blendv_epi8(flag2_, d, tmp);             // d = z > b? d : 2
 				__dp_code_block2;
-				tmp = _mm_cmpgt_epi8(zero_, a);
-				_mm_store_si128(&x[t], _mm_andnot_si128(tmp, a));
-				d = _mm_or_si128(d, _mm_andnot_si128(tmp, flag8_));  // d = 0 > a? 0 : 0x08
-				tmp = _mm_cmpgt_epi8(zero_, b);
-				_mm_store_si128(&y[t], _mm_andnot_si128(tmp, b));
-				d = _mm_or_si128(d, _mm_andnot_si128(tmp, flag16_)); // d = 0 > b? 0 : 0x10
-				_mm_store_si128(&pr[t], d);
+				tmp = simde_mm_cmpgt_epi8(zero_, a);
+				simde_mm_store_si128(&x[t], simde_mm_andnot_si128(tmp, a));
+				d = simde_mm_or_si128(d, simde_mm_andnot_si128(tmp, flag8_));  // d = 0 > a? 0 : 0x08
+				tmp = simde_mm_cmpgt_epi8(zero_, b);
+				simde_mm_store_si128(&y[t], simde_mm_andnot_si128(tmp, b));
+				d = simde_mm_or_si128(d, simde_mm_andnot_si128(tmp, flag16_)); // d = 0 > b? 0 : 0x10
+				simde_mm_store_si128(&pr[t], d);
 			}
 		}
 		if (!approx_max) { // find the exact max with a 32-bit score array
@@ -226,31 +184,26 @@
 			// compute H[], max_H and max_t
 			if (r > 0) {
 				int32_t HH[4], tt[4], en1 = st0 + (en0 - st0) / 4 * 4, i;
-				__m128i max_H_, max_t_, qe_;
+				simde__m128i max_H_, max_t_, qe_;
 				max_H = H[en0] = en0 > 0? H[en0-1] + u8[en0] - qe : H[en0] + v8[en0] - qe; // special casing the last element
 				max_t = en0;
-				max_H_ = _mm_set1_epi32(max_H);
-				max_t_ = _mm_set1_epi32(max_t);
-				qe_    = _mm_set1_epi32(q + e);
+				max_H_ = simde_mm_set1_epi32(max_H);
+				max_t_ = simde_mm_set1_epi32(max_t);
+				qe_    = simde_mm_set1_epi32(q + e);
 				for (t = st0; t < en1; t += 4) { // this implements: H[t]+=v8[t]-qe; if(H[t]>max_H) max_H=H[t],max_t=t;
-					__m128i H1, tmp, t_;
-					H1 = _mm_loadu_si128((__m128i*)&H[t]);
-					t_ = _mm_setr_epi32(v8[t], v8[t+1], v8[t+2], v8[t+3]);
-					H1 = _mm_add_epi32(H1, t_);
-					H1 = _mm_sub_epi32(H1, qe_);
-					_mm_storeu_si128((__m128i*)&H[t], H1);
-					t_ = _mm_set1_epi32(t);
-					tmp = _mm_cmpgt_epi32(H1, max_H_);
-#ifdef __SSE4_1__
-					max_H_ = _mm_blendv_epi8(max_H_, H1, tmp);
-					max_t_ = _mm_blendv_epi8(max_t_, t_, tmp);
-#else
-					max_H_ = _mm_or_si128(_mm_and_si128(tmp, H1), _mm_andnot_si128(tmp, max_H_));
-					max_t_ = _mm_or_si128(_mm_and_si128(tmp, t_), _mm_andnot_si128(tmp, max_t_));
-#endif
+					simde__m128i H1, tmp, t_;
+					H1 = simde_mm_loadu_si128((simde__m128i*)&H[t]);
+					t_ = simde_mm_setr_epi32(v8[t], v8[t+1], v8[t+2], v8[t+3]);
+					H1 = simde_mm_add_epi32(H1, t_);
+					H1 = simde_mm_sub_epi32(H1, qe_);
+					simde_mm_storeu_si128((simde__m128i*)&H[t], H1);
+					t_ = simde_mm_set1_epi32(t);
+					tmp = simde_mm_cmpgt_epi32(H1, max_H_);
+					max_H_ = simde_mm_blendv_epi8(max_H_, H1, tmp);
+					max_t_ = simde_mm_blendv_epi8(max_t_, t_, tmp);
 				}
-				_mm_storeu_si128((__m128i*)HH, max_H_);
-				_mm_storeu_si128((__m128i*)tt, max_t_);
+				simde_mm_storeu_si128((simde__m128i*)HH, max_H_);
+				simde_mm_storeu_si128((simde__m128i*)tt, max_t_);
 				for (i = 0; i < 4; ++i)
 					if (max_H < HH[i]) max_H = HH[i], max_t = tt[i] + i;
 				for (; t < en0; ++t) { // for the rest of values that haven't been computed with SSE
@@ -302,4 +255,3 @@
 		kfree(km, mem2); kfree(km, off);
 	}
 }
-#endif // __SSE2__
--- minimap2.orig/ksw2_ll_sse.c
+++ minimap2/ksw2_ll_sse.c
@@ -1,7 +1,7 @@
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
-#include <emmintrin.h>
+#include "debian/include/simde/x86/sse2.h"
 #include "ksw2.h"
 
 #ifdef __GNUC__
@@ -15,7 +15,7 @@
 typedef struct {
 	int qlen, slen;
 	uint8_t shift, mdiff, max, size;
-	__m128i *qp, *H0, *H1, *E, *Hmax;
+	simde__m128i *qp, *H0, *H1, *E, *Hmax;
 } kswq_t;
 
 /**
@@ -35,10 +35,10 @@
 	int slen, a, tmp, p;
 
 	size = size > 1? 2 : 1;
-	p = 8 * (3 - size); // # values per __m128i
+	p = 8 * (3 - size); // # values per simde__m128i
 	slen = (qlen + p - 1) / p; // segmented length
 	q = (kswq_t*)kmalloc(km, sizeof(kswq_t) + 256 + 16 * slen * (m + 4)); // a single block of memory
-	q->qp = (__m128i*)(((size_t)q + sizeof(kswq_t) + 15) >> 4 << 4); // align memory
+	q->qp = (simde__m128i*)(((size_t)q + sizeof(kswq_t) + 15) >> 4 << 4); // align memory
 	q->H0 = q->qp + slen * m;
 	q->H1 = q->H0 + slen;
 	q->E  = q->H1 + slen;
@@ -81,63 +81,63 @@
 {
 	kswq_t *q = (kswq_t*)q_;
 	int slen, i, gmax = 0, qlen8;
-	__m128i zero, gapoe, gape, *H0, *H1, *E, *Hmax;
+	simde__m128i zero, gapoe, gape, *H0, *H1, *E, *Hmax;
 	uint16_t *H8;
 
 #define __max_8(ret, xx) do { \
-		(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 8)); \
-		(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 4)); \
-		(xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 2)); \
-		(ret) = _mm_extract_epi16((xx), 0); \
+		(xx) = simde_mm_max_epi16((xx), simde_mm_srli_si128((xx), 8)); \
+		(xx) = simde_mm_max_epi16((xx), simde_mm_srli_si128((xx), 4)); \
+		(xx) = simde_mm_max_epi16((xx), simde_mm_srli_si128((xx), 2)); \
+		(ret) = simde_mm_extract_epi16((xx), 0); \
 	} while (0)
 
 	// initialization
 	*qe = *te = -1;
-	zero = _mm_set1_epi32(0);
-	gapoe = _mm_set1_epi16(_gapo + _gape);
-	gape = _mm_set1_epi16(_gape);
+	zero = simde_mm_set1_epi32(0);
+	gapoe = simde_mm_set1_epi16(_gapo + _gape);
+	gape = simde_mm_set1_epi16(_gape);
 	H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax;
 	slen = q->slen, qlen8 = slen * 8;
-	memset(E,    0, slen * sizeof(__m128i));
-	memset(H0,   0, slen * sizeof(__m128i));
-	memset(Hmax, 0, slen * sizeof(__m128i));
+	memset(E,    0, slen * sizeof(simde__m128i));
+	memset(H0,   0, slen * sizeof(simde__m128i));
+	memset(Hmax, 0, slen * sizeof(simde__m128i));
 	// the core loop
 	for (i = 0; i < tlen; ++i) {
 		int j, k, imax;
-		__m128i e, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector
-		h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
-		h = _mm_slli_si128(h, 2);
+		simde__m128i e, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector
+		h = simde_mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example
+		h = simde_mm_slli_si128(h, 2);
 		for (j = 0; LIKELY(j < slen); ++j) {
-			h = _mm_adds_epi16(h, *S++);
-			e = _mm_load_si128(E + j);
-			h = _mm_max_epi16(h, e);
-			h = _mm_max_epi16(h, f);
-			max = _mm_max_epi16(max, h);
-			_mm_store_si128(H1 + j, h);
-			h = _mm_subs_epu16(h, gapoe);
-			e = _mm_subs_epu16(e, gape);
-			e = _mm_max_epi16(e, h);
-			_mm_store_si128(E + j, e);
-			f = _mm_subs_epu16(f, gape);
-			f = _mm_max_epi16(f, h);
-			h = _mm_load_si128(H0 + j);
+			h = simde_mm_adds_epi16(h, *S++);
+			e = simde_mm_load_si128(E + j);
+			h = simde_mm_max_epi16(h, e);
+			h = simde_mm_max_epi16(h, f);
+			max = simde_mm_max_epi16(max, h);
+			simde_mm_store_si128(H1 + j, h);
+			h = simde_mm_subs_epu16(h, gapoe);
+			e = simde_mm_subs_epu16(e, gape);
+			e = simde_mm_max_epi16(e, h);
+			simde_mm_store_si128(E + j, e);
+			f = simde_mm_subs_epu16(f, gape);
+			f = simde_mm_max_epi16(f, h);
+			h = simde_mm_load_si128(H0 + j);
 		}
 		for (k = 0; LIKELY(k < 8); ++k) {
-			f = _mm_slli_si128(f, 2);
+			f = simde_mm_slli_si128(f, 2);
 			for (j = 0; LIKELY(j < slen); ++j) {
-				h = _mm_load_si128(H1 + j);
-				h = _mm_max_epi16(h, f);
-				_mm_store_si128(H1 + j, h);
-				h = _mm_subs_epu16(h, gapoe);
-				f = _mm_subs_epu16(f, gape);
-				if(UNLIKELY(!_mm_movemask_epi8(_mm_cmpgt_epi16(f, h)))) goto end_loop_i16;
+				h = simde_mm_load_si128(H1 + j);
+				h = simde_mm_max_epi16(h, f);
+				simde_mm_store_si128(H1 + j, h);
+				h = simde_mm_subs_epu16(h, gapoe);
+				f = simde_mm_subs_epu16(f, gape);
+				if(UNLIKELY(!simde_mm_movemask_epi8(simde_mm_cmpgt_epi16(f, h)))) goto end_loop_i16;
 			}
 		}
 end_loop_i16:
 		__max_8(imax, max);
 		if (imax >= gmax) {
 			gmax = imax; *te = i;
-			memcpy(Hmax, H1, slen * sizeof(__m128i));
+			memcpy(Hmax, H1, slen * sizeof(simde__m128i));
 		}
 		S = H1; H1 = H0; H0 = S;
 	}
--- minimap2.orig/Makefile
+++ minimap2/Makefile
@@ -6,21 +6,17 @@
 PROG_EXTRA=	sdust minimap2-lite
 LIBS=		-lm -lz -lpthread
 
-ifeq ($(arm_neon),) # if arm_neon is not defined
-ifeq ($(sse2only),) # if sse2only is not defined
+OBJS+=ksw2_extz2_sse.o ksw2_extd2_sse.o ksw2_exts2_sse.o
+
+ifneq ($(amd64),)
 	OBJS+=ksw2_extz2_sse41.o ksw2_extd2_sse41.o ksw2_exts2_sse41.o ksw2_extz2_sse2.o ksw2_extd2_sse2.o ksw2_exts2_sse2.o ksw2_dispatch.o
-else                # if sse2only is defined
-	OBJS+=ksw2_extz2_sse.o ksw2_extd2_sse.o ksw2_exts2_sse.o
+else ifneq ($(i386),)
+	OBJS+=ksw2_extz2_sse2.o ksw2_extd2_sse2.o ksw2_exts2_sse2.o ksw2_dispatch.o
 endif
-else				# if arm_neon is defined
-	OBJS+=ksw2_extz2_neon.o ksw2_extd2_neon.o ksw2_exts2_neon.o
-    INCLUDES+=-Isse2neon
-ifeq ($(aarch64),)	#if aarch64 is not defined
-	CFLAGS+=-D_FILE_OFFSET_BITS=64 -mfpu=neon -fsigned-char
-else				#if aarch64 is defined
+
+ifneq ($(aarch64),)	#if aarch64 is defined
 	CFLAGS+=-D_FILE_OFFSET_BITS=64 -fsigned-char
 endif
-endif
 
 .PHONY:all extra clean depend
 .SUFFIXES:.c .o
@@ -46,7 +42,7 @@
 
 # SSE-specific targets on x86/x86_64
 
-ifeq ($(arm_neon),)   # if arm_neon is defined, compile this target with the default setting (i.e. no -msse2)
+ifneq ($(amd64),)   # if amd64 is defined, compile this target with -msse2; otherwise it is compiled with the default setting (i.e. no -msse2)
 ksw2_ll_sse.o:ksw2_ll_sse.c ksw2.h kalloc.h
 		$(CC) -c $(CFLAGS) -msse2 $(CPPFLAGS) $(INCLUDES) $< -o $@
 endif
@@ -72,17 +68,6 @@
 ksw2_dispatch.o:ksw2_dispatch.c ksw2.h
 		$(CC) -c $(CFLAGS) -msse4.1 $(CPPFLAGS) -DKSW_CPU_DISPATCH $(INCLUDES) $< -o $@
 
-# NEON-specific targets on ARM
-
-ksw2_extz2_neon.o:ksw2_extz2_sse.c ksw2.h kalloc.h
-		$(CC) -c $(CFLAGS) $(CPPFLAGS) -DKSW_SSE2_ONLY -D__SSE2__ $(INCLUDES) $< -o $@
-
-ksw2_extd2_neon.o:ksw2_extd2_sse.c ksw2.h kalloc.h
-		$(CC) -c $(CFLAGS) $(CPPFLAGS) -DKSW_SSE2_ONLY -D__SSE2__ $(INCLUDES) $< -o $@
-
-ksw2_exts2_neon.o:ksw2_exts2_sse.c ksw2.h kalloc.h
-		$(CC) -c $(CFLAGS) $(CPPFLAGS) -DKSW_SSE2_ONLY -D__SSE2__ $(INCLUDES) $< -o $@
-
 # other non-file targets
 
 clean:
