Author: Michael R. Crusoe <michael.crusoe@gmail.com>
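Description: Increase portability by using the SIMD Everywhere (SIMDe) library
 Replace the x86-only intrinsics headers (<x86intrin.h> in
 deps/dozeu/dozeu.h, <emmintrin.h> in deps/gssw/src/gssw.c) with the SIMDe
 headers referenced under debian/include/simde, rename the __m128i type and
 the _mm_* intrinsics to their simde__m128i / simde_mm_* equivalents, and
 drop the -msse4 flag from the gssw Makefile CFLAGS.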
--- vg.orig/deps/dozeu/dozeu.h
+++ vg/deps/dozeu/dozeu.h
@@ -42,7 +42,7 @@
 #include <stdint.h>
 #include <stddef.h>
 #include <string.h>
-#include <x86intrin.h>
+#include "../../debian/include/simde/x86/sse4.1.h"
 
 #ifndef DZ_CIGAR_OP
 #  define DZ_CIGAR_OP				0x04030201
@@ -143,9 +143,9 @@
 #define dz_storeu_u64(p, e)			{ uint8_t *_p = (uint8_t *)(p); *((uint64_t *)(_p)) = (e); }
 
 #ifdef __SSE4_1__
-#  define dz_is_all_zero(x)			( _mm_test_all_zeros((x), (x)) == 1 )
+#  define dz_is_all_zero(x)			( simde_mm_test_all_zeros((x), (x)) == 1 )
 #else
-#  define dz_is_all_zero(x)			( _mm_movemask_epi8((x)) == 0 )
+#  define dz_is_all_zero(x)			( simde_mm_movemask_epi8((x)) == 0 )
 #endif
 
 #define DZ_MEM_MARGIN_SIZE			( 256 )
@@ -159,15 +159,15 @@
 #define DZ_CELL_MARGINED_MAX		( DZ_CELL_MAX - DZ_CELL_MARGIN )
 
 /* query; preconverted query sequence; blen = roundup(qlen, L) / L; array must have 16-byte-length margin at the tail */
-struct dz_query_s { uint64_t blen; char const *q; int16_t bonus[2 * sizeof(__m128i) / sizeof(int16_t)]; uint8_t arr[]; };
-dz_static_assert(sizeof(struct dz_query_s) % sizeof(__m128i) == 0);
+struct dz_query_s { uint64_t blen; char const *q; int16_t bonus[2 * sizeof(simde__m128i) / sizeof(int16_t)]; uint8_t arr[]; };
+dz_static_assert(sizeof(struct dz_query_s) % sizeof(simde__m128i) == 0);
 
 /* node (reference) */
 struct dz_node_s { int32_t id, len; uint8_t const *ptr; };
-dz_static_assert(sizeof(struct dz_node_s) % sizeof(__m128i) == 0);
+dz_static_assert(sizeof(struct dz_node_s) % sizeof(simde__m128i) == 0);
 
 /* DP matrix structures */
-struct dz_swgv_s { __m128i e, f, s; };						/* followed by dz_cap_s */
+struct dz_swgv_s { simde__m128i e, f, s; };						/* followed by dz_cap_s */
 struct dz_range_s { uint32_t spos, epos; };					/* placed just after every score vector to indicate the length */
 
 struct dz_head_s {
@@ -186,9 +186,9 @@
 	struct dz_query_s const *query;
 	struct dz_cap_s const *mcap;
 };
-dz_static_assert(sizeof(struct dz_swgv_s) % sizeof(__m128i) == 0);
-dz_static_assert(sizeof(struct dz_cap_s) % sizeof(__m128i) == 0);
-dz_static_assert(sizeof(struct dz_forefront_s) % sizeof(__m128i) == 0);
+dz_static_assert(sizeof(struct dz_swgv_s) % sizeof(simde__m128i) == 0);
+dz_static_assert(sizeof(struct dz_cap_s) % sizeof(simde__m128i) == 0);
+dz_static_assert(sizeof(struct dz_forefront_s) % sizeof(simde__m128i) == 0);
 #define dz_swgv(_p)					( (struct dz_swgv_s *)(_p) )
 #define dz_cswgv(_p)				( (struct dz_swgv_s const *)(_p) )
 #define dz_range(_p)				( (struct dz_range_s *)(_p) )
@@ -220,7 +220,7 @@
 #define dz_mem_stack_rem(_mem)		( (size_t)((_mem)->stack.end - (_mem)->stack.top) )
 
 struct dz_s { int8_t matrix[32]; uint16_t giv[8], gev[8], xt, bonus, max_gap_len, _pad[9]; struct dz_forefront_s const *root; int8_t protein_matrix[]; };
-dz_static_assert(sizeof(struct dz_s) % sizeof(__m128i) == 0);
+dz_static_assert(sizeof(struct dz_s) % sizeof(simde__m128i) == 0);
 #define dz_mem(_self)				( (struct dz_mem_s *)(_self) - 1 )
 
 #define dz_root(_self)				( (struct dz_forefront_s const **)(&_self->root) )
@@ -231,14 +231,14 @@
 #ifdef DZ_PRINT_VECTOR
 #define print_vector(v) { \
 	debug("%s (%d, %d, %d, %d, %d, %d, %d, %d)", #v, \
-	(int16_t)_mm_extract_epi16(v, 7), \
-	(int16_t)_mm_extract_epi16(v, 6), \
-	(int16_t)_mm_extract_epi16(v, 5), \
-	(int16_t)_mm_extract_epi16(v, 4), \
-	(int16_t)_mm_extract_epi16(v, 3), \
-	(int16_t)_mm_extract_epi16(v, 2), \
-	(int16_t)_mm_extract_epi16(v, 1), \
-	(int16_t)_mm_extract_epi16(v, 0)); \
+	(int16_t)simde_mm_extract_epi16(v, 7), \
+	(int16_t)simde_mm_extract_epi16(v, 6), \
+	(int16_t)simde_mm_extract_epi16(v, 5), \
+	(int16_t)simde_mm_extract_epi16(v, 4), \
+	(int16_t)simde_mm_extract_epi16(v, 3), \
+	(int16_t)simde_mm_extract_epi16(v, 2), \
+	(int16_t)simde_mm_extract_epi16(v, 1), \
+	(int16_t)simde_mm_extract_epi16(v, 0)); \
 }
 #else
 #define print_vector(v) ;
@@ -363,7 +363,7 @@
 {
 	if(dz_mem_stack_rem(mem) < 4096) { dz_mem_add_stack(mem, 0); }
 	void *ptr = (void *)mem->stack.top;
-	mem->stack.top += dz_roundup(size, sizeof(__m128i));
+	mem->stack.top += dz_roundup(size, sizeof(simde__m128i));
 	return(ptr);
 }
 
@@ -382,7 +382,7 @@
  * vector update macros
  */
 #define _calc_next_size(_sp, _ep, _nt) ({ \
-	size_t forefront_arr_size = dz_roundup(sizeof(struct dz_forefront_s *) * (_nt), sizeof(__m128i)); \
+	size_t forefront_arr_size = dz_roundup(sizeof(struct dz_forefront_s *) * (_nt), sizeof(simde__m128i)); \
 	size_t est_column_size = 2 * ((_ep) - (_sp)) * sizeof(struct dz_swgv_s); \
 	size_t next_req = forefront_arr_size + est_column_size + sizeof(struct dz_cap_s); \
 	/* debug("est_column_size(%lu), next_req(%lu)", est_column_size, next_req); */ \
@@ -390,7 +390,7 @@
 })
 #define _init_cap(_adj, _rch, _forefronts, _n_forefronts) ({ \
 	/* push forefront pointers */ \
-	size_t forefront_arr_size = dz_roundup(sizeof(struct dz_forefront_s *) * (_n_forefronts), sizeof(__m128i)); \
+	size_t forefront_arr_size = dz_roundup(sizeof(struct dz_forefront_s *) * (_n_forefronts), sizeof(simde__m128i)); \
 	struct dz_forefront_s const **dst = (struct dz_forefront_s const **)(dz_mem(self)->stack.top + forefront_arr_size); \
 	struct dz_forefront_s const **src = (struct dz_forefront_s const **)(_forefronts); \
 	for(size_t i = 0; i < (_n_forefronts); i++) { dst[-((int64_t)(_n_forefronts)) + i] = src[i]; } \
@@ -464,7 +464,7 @@
 	uint32_t blim = (_query)->blen - 1; \
 	uint8_t const *pbonus = (_query)->arr;
 
-#define _add_bonus(_i, _v)			( _mm_add_epi16((_v), _mm_load_si128(&((__m128i const *)pbonus)[-2 + ((_i) == blim)])) )
+#define _add_bonus(_i, _v)			( simde_mm_add_epi16((_v), simde_mm_load_si128(&((simde__m128i const *)pbonus)[-2 + ((_i) == blim)])) )
 #else
 #define _init_bonus(_query)			;
 #define _add_bonus(_i, _v)			( (_v) )
@@ -476,12 +476,12 @@
 	uint32_t rch = conv[_rt[-_rrem] & 0x0f]; \
 	/* debug("rch(%c, %u, %x)", _rt[-_rrem], rch, rch); */ \
 	uint8_t const *parr = (_query)->arr; \
-	__m128i const rv = _mm_set1_epi8(rch);
+	simde__m128i const rv = simde_mm_set1_epi8(rch);
 
 #define _calc_score_profile(_i) ({ \
-	__m128i qv = _mm_loadl_epi64((__m128i const *)&parr[(_i) * L]); \
-	__m128i sc = _mm_cvtepi8_epi16(_mm_shuffle_epi8(_mm_load_si128((__m128i const *)self->matrix), _mm_or_si128(rv, qv))); \
-	/* print_vector(_mm_cvtepi8_epi16(rv)); print_vector(_mm_cvtepi8_epi16(qv)); */ \
+	simde__m128i qv = simde_mm_loadl_epi64((simde__m128i const *)&parr[(_i) * L]); \
+	simde__m128i sc = simde_mm_cvtepi8_epi16(simde_mm_shuffle_epi8(simde_mm_load_si128((simde__m128i const *)self->matrix), simde_mm_or_si128(rv, qv))); \
+	/* print_vector(simde_mm_cvtepi8_epi16(rv)); print_vector(simde_mm_cvtepi8_epi16(qv)); */ \
 	sc; \
 })
 #elif defined(DZ_NUCL_2BIT)
@@ -490,11 +490,11 @@
 	uint32_t rch = dir < 0 ? _rt[-_rrem] : (_rt[-_rrem] ^ 0x03); \
 	/* debug("rch(%c, %u, %x)", _rt[-_rrem], rch, rch); */ \
 	uint8_t const *parr = (_query)->arr; \
-	__m128i const rv = _mm_set1_epi8(rch);
+	simde__m128i const rv = simde_mm_set1_epi8(rch);
 
 #define _calc_score_profile(_i) ({ \
-	__m128i qv = _mm_loadl_epi64((__m128i const *)&parr[(_i) * L]); \
-	__m128i sc = _mm_cvtepi8_epi16(_mm_shuffle_epi8(_mm_load_si128((__m128i const *)self->matrix), _mm_or_si128(rv, qv))); \
+	simde__m128i qv = simde_mm_loadl_epi64((simde__m128i const *)&parr[(_i) * L]); \
+	simde__m128i sc = simde_mm_cvtepi8_epi16(simde_mm_shuffle_epi8(simde_mm_load_si128((simde__m128i const *)self->matrix), simde_mm_or_si128(rv, qv))); \
 	sc; \
 })
 #else /* DZ_PROTEIN */
@@ -505,41 +505,41 @@
 	int8_t const *parr = (int8_t const *)&(_query)->arr[rch * (_query)->blen * L];
 
 #define _calc_score_profile(_i) ({ \
-	__m128i sc = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i const *)&parr[(_i) * L])); \
+	simde__m128i sc = simde_mm_cvtepi8_epi16(simde_mm_loadl_epi64((simde__m128i const *)&parr[(_i) * L])); \
 	sc; \
 })
 #endif
 
 #define _load_vector(_p) \
-	__m128i e = _mm_load_si128((__m128i const *)(&dz_cswgv(_p)->e)); /* print_vector(e); */ \
-	__m128i s = _mm_load_si128((__m128i const *)(&dz_cswgv(_p)->s)); /* print_vector(s); */
+	simde__m128i e = simde_mm_load_si128((simde__m128i const *)(&dz_cswgv(_p)->e)); /* print_vector(e); */ \
+	simde__m128i s = simde_mm_load_si128((simde__m128i const *)(&dz_cswgv(_p)->s)); /* print_vector(s); */
 #define _update_vector(_p) { \
-	__m128i sc = _calc_score_profile(_p); \
-	__m128i te = _mm_subs_epi16(_mm_max_epi16(e, _mm_subs_epi16(s, giv)), gev1); \
-	/* print_vector(_mm_alignr_epi8(s, ps, 14)); print_vector(sc); */ \
-	__m128i ts = _mm_max_epi16(te, _mm_adds_epi16(sc, _mm_alignr_epi8(s, ps, 14))); ps = s; \
-	__m128i tf = _mm_max_epi16(_mm_subs_epi16(ts, giv), _mm_subs_epi16(_mm_alignr_epi8(minv, f, 14), gev1)); \
-	tf = _mm_max_epi16(tf, _mm_subs_epi16(_mm_alignr_epi8(tf, minv, 14), gev1)); \
-	tf = _mm_max_epi16(tf, _mm_subs_epi16(_mm_alignr_epi8(tf, minv, 12), gev2)); \
-	tf = _mm_max_epi16(tf, _mm_subs_epi16(_mm_alignr_epi8(tf, minv, 8), gev4)); \
-	ts = _mm_max_epi16(ts, tf); print_vector(ts); \
-	maxv = _mm_max_epi16(maxv, _add_bonus(_p, ts)); \
+	simde__m128i sc = _calc_score_profile(_p); \
+	simde__m128i te = simde_mm_subs_epi16(simde_mm_max_epi16(e, simde_mm_subs_epi16(s, giv)), gev1); \
+	/* print_vector(simde_mm_alignr_epi8(s, ps, 14)); print_vector(sc); */ \
+	simde__m128i ts = simde_mm_max_epi16(te, simde_mm_adds_epi16(sc, simde_mm_alignr_epi8(s, ps, 14))); ps = s; \
+	simde__m128i tf = simde_mm_max_epi16(simde_mm_subs_epi16(ts, giv), simde_mm_subs_epi16(simde_mm_alignr_epi8(minv, f, 14), gev1)); \
+	tf = simde_mm_max_epi16(tf, simde_mm_subs_epi16(simde_mm_alignr_epi8(tf, minv, 14), gev1)); \
+	tf = simde_mm_max_epi16(tf, simde_mm_subs_epi16(simde_mm_alignr_epi8(tf, minv, 12), gev2)); \
+	tf = simde_mm_max_epi16(tf, simde_mm_subs_epi16(simde_mm_alignr_epi8(tf, minv, 8), gev4)); \
+	ts = simde_mm_max_epi16(ts, tf); print_vector(ts); \
+	maxv = simde_mm_max_epi16(maxv, _add_bonus(_p, ts)); \
 	/* print_vector(te); print_vector(_add_bonus(_p, ts)); print_vector(tf); print_vector(maxv);*/ \
 	e = te; f = tf; s = ts; \
 }
 #define _store_vector(_p) { \
-	_mm_store_si128((__m128i *)(&dz_swgv(_p)->e), e); \
-	_mm_store_si128((__m128i *)(&dz_swgv(_p)->f), f); \
-	_mm_store_si128((__m128i *)(&dz_swgv(_p)->s), s); \
+	simde_mm_store_si128((simde__m128i *)(&dz_swgv(_p)->e), e); \
+	simde_mm_store_si128((simde__m128i *)(&dz_swgv(_p)->f), f); \
+	simde_mm_store_si128((simde__m128i *)(&dz_swgv(_p)->s), s); \
 }
 #define _hmax_vector(_v) ({ \
-	__m128i _t = _mm_max_epi16(_v, _mm_srli_si128(_v, 8)); \
-	_t = _mm_max_epi16(_t, _mm_srli_si128(_t, 4)); \
-	_t = _mm_max_epi16(_t, _mm_srli_si128(_t, 2)); \
-	((int16_t)(_mm_extract_epi16(_t, 0))); \
+	simde__m128i _t = simde_mm_max_epi16(_v, simde_mm_srli_si128(_v, 8)); \
+	_t = simde_mm_max_epi16(_t, simde_mm_srli_si128(_t, 4)); \
+	_t = simde_mm_max_epi16(_t, simde_mm_srli_si128(_t, 2)); \
+	((int16_t)(simde_mm_extract_epi16(_t, 0))); \
 })
 #define _test_xdrop(_s, _xtv) ({ \
-	__m128i xtest = _mm_cmpgt_epi16(_s, _xtv); \
+	simde__m128i xtest = simde_mm_cmpgt_epi16(_s, _xtv); \
 	/* print_vector(_s); print_vector(_xtv); */ \
 	dz_is_all_zero(xtest); \
 })
@@ -555,7 +555,7 @@
 	uint64_t max_gap_len,			/* as X-drop threshold */
 	uint16_t full_length_bonus)		/* end-to-end mapping bonus; only activated when compiled with -DDZ_FULL_LENGTH_BONUS */
 {
-	size_t const L = sizeof(__m128i) / sizeof(uint16_t), gi = gap_open, ge = gap_extend;
+	size_t const L = sizeof(simde__m128i) / sizeof(uint16_t), gi = gap_open, ge = gap_extend;
 	/*
 	static uint8_t const transpose[16] __attribute__(( aligned(16) )) = {
 		0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
@@ -571,15 +571,15 @@
 		struct dz_s *self = (struct dz_s *)dz_mem_malloc(mem, sizeof(struct dz_s));
 
 		/* constants */
-		__m128i const tmat = _mm_loadu_si128((__m128i const *)score_matrix);
-		_mm_store_si128((__m128i *)&self->matrix[0], tmat);
-		_mm_store_si128((__m128i *)&self->matrix[16], _mm_setzero_si128());
+		simde__m128i const tmat = simde_mm_loadu_si128((simde__m128i const *)score_matrix);
+		simde_mm_store_si128((simde__m128i *)&self->matrix[0], tmat);
+		simde_mm_store_si128((simde__m128i *)&self->matrix[16], simde_mm_setzero_si128());
 	#else
-		struct dz_s *self = (struct dz_s *)dz_mem_malloc(mem, sizeof(struct dz_s) + DZ_MAT_SIZE * DZ_MAT_SIZE + 2 * sizeof(__m128i));
+		struct dz_s *self = (struct dz_s *)dz_mem_malloc(mem, sizeof(struct dz_s) + DZ_MAT_SIZE * DZ_MAT_SIZE + 2 * sizeof(simde__m128i));
 
 		/* clear the first matrix field for protein */
-		_mm_store_si128((__m128i *)&self->matrix[0], _mm_setzero_si128());
-		_mm_store_si128((__m128i *)&self->matrix[16], _mm_setzero_si128());
+		simde_mm_store_si128((simde__m128i *)&self->matrix[0], simde_mm_setzero_si128());
+		simde_mm_store_si128((simde__m128i *)&self->matrix[16], simde_mm_setzero_si128());
 
 		/* transpose */
 		for(uint64_t i = 0; i < DZ_MAT_SIZE; i++) {
@@ -589,10 +589,10 @@
 		}
 	#endif
 
-	__m128i const giv = _mm_set1_epi16(gi);
-	__m128i const gev = _mm_set1_epi16(ge);
-	_mm_store_si128((__m128i *)self->giv, giv);
-	_mm_store_si128((__m128i *)self->gev, gev);
+	simde__m128i const giv = simde_mm_set1_epi16(gi);
+	simde__m128i const gev = simde_mm_set1_epi16(ge);
+	simde_mm_store_si128((simde__m128i *)self->giv, giv);
+	simde_mm_store_si128((simde__m128i *)self->gev, gev);
 	self->xt = gi + ge * max_gap_len;			/* X-drop threshold */
 	self->bonus = full_length_bonus;
 	self->max_gap_len = max_gap_len;			/* save raw value */
@@ -600,7 +600,7 @@
 
 	/* create root head */
 	struct dz_cap_s *cap = (struct dz_cap_s *)dz_mem_malloc(mem, sizeof(struct dz_cap_s));
-	_mm_store_si128((__m128i *)cap, _mm_setzero_si128());
+	simde_mm_store_si128((simde__m128i *)cap, simde_mm_setzero_si128());
 
 	/* calc vector length; query = NULL for the first (root) column */
 	max_gap_len = dz_roundup(max_gap_len, L);
@@ -611,16 +611,16 @@
 	struct dz_swgv_s *dp = _begin_column_head(0, max_gap_len / L, 0, &a, 0);
 
 	/* fill the root (the leftmost) column; first init vectors */
-	__m128i s = _mm_setr_epi16(0, -(gi+ge), -(gi+2*ge), -(gi+3*ge), -(gi+4*ge), -(gi+5*ge), -(gi+6*ge), -(gi+7*ge));
-	__m128i const e = _mm_set1_epi16(DZ_CELL_MIN), xtv = _mm_set1_epi16(-self->xt);
+	simde__m128i s = simde_mm_setr_epi16(0, -(gi+ge), -(gi+2*ge), -(gi+3*ge), -(gi+4*ge), -(gi+5*ge), -(gi+6*ge), -(gi+7*ge));
+	simde__m128i const e = simde_mm_set1_epi16(DZ_CELL_MIN), xtv = simde_mm_set1_epi16(-self->xt);
 
 	/* until the X-drop test fails on all the cells in a vector */
 	for(size_t p = 0; p < max_gap_len / L; p++) {
-		__m128i const f = s;
+		simde__m128i const f = s;
 		if(dz_unlikely(_test_xdrop(s, xtv))) { debug("p(%lu)", p); w.r.epos = p; break; }
 		_store_vector(&dp[p]);
-		if(p == 0) { s = _mm_setr_epi16(-gi, -(gi+ge), -(gi+2*ge), -(gi+3*ge), -(gi+4*ge), -(gi+5*ge), -(gi+6*ge), -(gi+7*ge)); }
-		s = _mm_subs_epi16(s, _mm_slli_epi16(gev, 3));
+		if(p == 0) { s = simde_mm_setr_epi16(-gi, -(gi+ge), -(gi+2*ge), -(gi+3*ge), -(gi+4*ge), -(gi+5*ge), -(gi+6*ge), -(gi+7*ge)); }
+		s = simde_mm_subs_epi16(s, simde_mm_slli_epi16(gev, 3));
 	}
 
 	/* done; create and return a forefront object */
@@ -787,8 +787,8 @@
 	struct dz_s *self,
 	char const *query, size_t qlen)
 {
-	size_t const L = sizeof(__m128i) / sizeof(uint16_t);
-	struct dz_query_s *q = (struct dz_query_s *)dz_mem_malloc(dz_mem(self), sizeof(struct dz_query_s) + dz_roundup(qlen + 1, L) + sizeof(__m128i));
+	size_t const L = sizeof(simde__m128i) / sizeof(uint16_t);
+	struct dz_query_s *q = (struct dz_query_s *)dz_mem_malloc(dz_mem(self), sizeof(struct dz_query_s) + dz_roundup(qlen + 1, L) + sizeof(simde__m128i));
 	*q = (struct dz_query_s){
 		.blen = qlen == 0 ? 0 : (dz_roundup(qlen + 1, L) / L),
 		.q = query,
@@ -812,24 +812,24 @@
 			qA, qC, qG, qT, qN, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, qN
 		#endif
 	};
-	__m128i pv = _mm_set1_epi8((int8_t)qN);
-	__m128i const fv = _mm_set1_epi8(0x0f);														/* conversion mask */
-	__m128i const cv = _mm_load_si128((__m128i const *)conv);									/* conversion table */
+	simde__m128i pv = simde_mm_set1_epi8((int8_t)qN);
+	simde__m128i const fv = simde_mm_set1_epi8(0x0f);														/* conversion mask */
+	simde__m128i const cv = simde_mm_load_si128((simde__m128i const *)conv);									/* conversion table */
 
 	/* until the end of the query sequence */
-	for(size_t i = 0; i < dz_rounddown(qlen, sizeof(__m128i)); i += sizeof(__m128i)) {
-		__m128i const qv = _mm_loadu_si128((__m128i const *)&query[i]);
-		__m128i tv = _mm_shuffle_epi8(cv, _mm_and_si128(qv, fv));
-		_mm_store_si128((__m128i *)&q->arr[i], _mm_alignr_epi8(tv, pv, 15)); pv = tv;			/* shift by one to make room for a base */
+	for(size_t i = 0; i < dz_rounddown(qlen, sizeof(simde__m128i)); i += sizeof(simde__m128i)) {
+		simde__m128i const qv = simde_mm_loadu_si128((simde__m128i const *)&query[i]);
+		simde__m128i tv = simde_mm_shuffle_epi8(cv, simde_mm_and_si128(qv, fv));
+		simde_mm_store_si128((simde__m128i *)&q->arr[i], simde_mm_alignr_epi8(tv, pv, 15)); pv = tv;			/* shift by one to make room for a base */
 	}
 
 	/* continue the same conversion on the remainings */
-	// _mm_store_si128((__m128i *)&q->arr[dz_rounddown(qlen, sizeof(__m128i))], _mm_srli_si128(pv, 15));
-	q->arr[dz_rounddown(qlen, sizeof(__m128i))] = _mm_extract_epi8(pv, 15);
-	for(size_t i = dz_rounddown(qlen, sizeof(__m128i)); i < qlen; i++) {
+	// simde_mm_store_si128((simde__m128i *)&q->arr[dz_rounddown(qlen, sizeof(simde__m128i))], simde_mm_srli_si128(pv, 15));
+	q->arr[dz_rounddown(qlen, sizeof(simde__m128i))] = simde_mm_extract_epi8(pv, 15);
+	for(size_t i = dz_rounddown(qlen, sizeof(simde__m128i)); i < qlen; i++) {
 		q->arr[i + 1] = conv[(uint8_t)query[i] & 0x0f];
 	}
-	for(size_t i = qlen; i < dz_roundup(qlen + 1, sizeof(__m128i)); i++) {
+	for(size_t i = qlen; i < dz_roundup(qlen + 1, sizeof(simde__m128i)); i++) {
 		q->arr[i + 1] = qS;
 	}
 
@@ -841,8 +841,8 @@
 	struct dz_s *self,
 	char const *query, size_t qlen)
 {
-	size_t const L = sizeof(__m128i) / sizeof(uint16_t);
-	struct dz_query_s *q = (struct dz_query_s *)dz_mem_malloc(dz_mem(self), sizeof(struct dz_query_s) + dz_roundup(qlen + 1, L) + sizeof(__m128i));
+	size_t const L = sizeof(simde__m128i) / sizeof(uint16_t);
+	struct dz_query_s *q = (struct dz_query_s *)dz_mem_malloc(dz_mem(self), sizeof(struct dz_query_s) + dz_roundup(qlen + 1, L) + sizeof(simde__m128i));
 	*q = (struct dz_query_s){
 		.blen = qlen == 0 ? 0 : (dz_roundup(qlen + 1, L) / L),
 		.q = query,
@@ -860,23 +860,23 @@
 	static uint8_t const rev[16] __attribute__(( aligned(16) )) = {
 		15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
 	};
-	__m128i pv = _mm_set1_epi8((int8_t)qN);
-	__m128i const fv = _mm_set1_epi8(0x0f);
-	__m128i const cv = _mm_load_si128((__m128i const *)conv), rv = _mm_load_si128((__m128i const *)rev);
+	simde__m128i pv = simde_mm_set1_epi8((int8_t)qN);
+	simde__m128i const fv = simde_mm_set1_epi8(0x0f);
+	simde__m128i const cv = simde_mm_load_si128((simde__m128i const *)conv), rv = simde_mm_load_si128((simde__m128i const *)rev);
 
 	/* until the end of the query sequence */
-	for(size_t i = 0; i < dz_rounddown(qlen, sizeof(__m128i)); i += sizeof(__m128i)) {
-		__m128i const qv = _mm_loadu_si128((__m128i const *)&query[qlen - 16 - i]);
-		__m128i tv = _mm_shuffle_epi8(_mm_shuffle_epi8(cv, _mm_and_si128(qv, fv)), rv);
-		_mm_store_si128((__m128i *)&q->arr[i], _mm_alignr_epi8(tv, pv, 15)); pv = tv;			/* shift by one to make room for a base */
+	for(size_t i = 0; i < dz_rounddown(qlen, sizeof(simde__m128i)); i += sizeof(simde__m128i)) {
+		simde__m128i const qv = simde_mm_loadu_si128((simde__m128i const *)&query[qlen - 16 - i]);
+		simde__m128i tv = simde_mm_shuffle_epi8(simde_mm_shuffle_epi8(cv, simde_mm_and_si128(qv, fv)), rv);
+		simde_mm_store_si128((simde__m128i *)&q->arr[i], simde_mm_alignr_epi8(tv, pv, 15)); pv = tv;			/* shift by one to make room for a base */
 	}
 
 	/* continue the same conversion on the remainings */
-	q->arr[dz_rounddown(qlen, sizeof(__m128i))] = _mm_extract_epi8(pv, 15);
-	for(size_t i = dz_rounddown(qlen, sizeof(__m128i)); i < qlen; i++) {
+	q->arr[dz_rounddown(qlen, sizeof(simde__m128i))] = simde_mm_extract_epi8(pv, 15);
+	for(size_t i = dz_rounddown(qlen, sizeof(simde__m128i)); i < qlen; i++) {
 		q->arr[i + 1] = conv[(uint8_t)query[qlen - 1 - i] & 0x0f];
 	}
-	for(size_t i = qlen; i < dz_roundup(qlen + 1, sizeof(__m128i)); i++) {
+	for(size_t i = qlen; i < dz_roundup(qlen + 1, sizeof(simde__m128i)); i++) {
 		q->arr[i + 1] = qS;
 	}
 
@@ -890,8 +890,8 @@
 	struct dz_s *self,
 	char const *query, size_t qlen)
 {
-	size_t const L = sizeof(__m128i) / sizeof(uint16_t);
-	struct dz_query_s *q = (struct dz_query_s *)dz_mem_malloc(dz_mem(self), sizeof(struct dz_query_s) + DZ_MAT_SIZE * dz_roundup(qlen + 1, L) + sizeof(__m128i));
+	size_t const L = sizeof(simde__m128i) / sizeof(uint16_t);
+	struct dz_query_s *q = (struct dz_query_s *)dz_mem_malloc(dz_mem(self), sizeof(struct dz_query_s) + DZ_MAT_SIZE * dz_roundup(qlen + 1, L) + sizeof(simde__m128i));
 	*q = (struct dz_query_s){
 		.blen = qlen == 0 ? 0 : (dz_roundup(qlen + 1, L) / L),
 		.q = query,
@@ -905,23 +905,23 @@
 		a[0] = 0;
 
 		int8_t const *conv = &self->protein_matrix[j * DZ_MAT_SIZE];
-		__m128i pv = _mm_setzero_si128();
-		__m128i const fv = _mm_set1_epi8(0x0f);													/* conversion mask */
-		__m128i const lcv = _mm_loadu_si128((__m128i const *)&conv[0]);							/* conversion table */
-		__m128i const hcv = _mm_loadu_si128((__m128i const *)&conv[16]);						/* conversion table */
+		simde__m128i pv = simde_mm_setzero_si128();
+		simde__m128i const fv = simde_mm_set1_epi8(0x0f);													/* conversion mask */
+		simde__m128i const lcv = simde_mm_loadu_si128((simde__m128i const *)&conv[0]);							/* conversion table */
+		simde__m128i const hcv = simde_mm_loadu_si128((simde__m128i const *)&conv[16]);						/* conversion table */
 
 		/* until the end of the query sequence */
-		for(size_t i = 0; i < dz_rounddown(qlen, sizeof(__m128i)); i += sizeof(__m128i)) {
-			__m128i const qv = _mm_loadu_si128((__m128i const *)&query[i]);
-			__m128i const _qv = _mm_and_si128(qv, fv);
-			__m128i lv = _mm_shuffle_epi8(lcv, _qv), hv = _mm_shuffle_epi8(hcv, _qv);
-			__m128i tv = _mm_blendv_epi8(lv, hv, _mm_slli_epi32(qv, 3));
-			_mm_storeu_si128((__m128i *)&a[i], _mm_alignr_epi8(tv, pv, 15)); pv = tv;			/* shift by one to make room for a base */
+		for(size_t i = 0; i < dz_rounddown(qlen, sizeof(simde__m128i)); i += sizeof(simde__m128i)) {
+			simde__m128i const qv = simde_mm_loadu_si128((simde__m128i const *)&query[i]);
+			simde__m128i const _qv = simde_mm_and_si128(qv, fv);
+			simde__m128i lv = simde_mm_shuffle_epi8(lcv, _qv), hv = simde_mm_shuffle_epi8(hcv, _qv);
+			simde__m128i tv = simde_mm_blendv_epi8(lv, hv, simde_mm_slli_epi32(qv, 3));
+			simde_mm_storeu_si128((simde__m128i *)&a[i], simde_mm_alignr_epi8(tv, pv, 15)); pv = tv;			/* shift by one to make room for a base */
 		}
 
 		/* continue the same conversion on the remainings */
-		a[dz_rounddown(qlen, sizeof(__m128i))] = _mm_extract_epi8(pv, 15);
-		for(size_t i = dz_rounddown(qlen, sizeof(__m128i)); i < qlen; i++) {
+		a[dz_rounddown(qlen, sizeof(simde__m128i))] = simde_mm_extract_epi8(pv, 15);
+		for(size_t i = dz_rounddown(qlen, sizeof(simde__m128i)); i < qlen; i++) {
 			a[i + 1] = conv[(uint8_t)query[i] & 0x1f];
 		}
 		for(size_t i = qlen; i < dz_roundup(qlen + 1, L); i++) {
@@ -936,8 +936,8 @@
 	struct dz_s *self,
 	char const *query, size_t qlen)
 {
-	size_t const L = sizeof(__m128i) / sizeof(uint16_t);
-	struct dz_query_s *q = (struct dz_query_s *)dz_mem_malloc(dz_mem(self), sizeof(struct dz_query_s) + DZ_MAT_SIZE * dz_roundup(qlen + 1, L) + sizeof(__m128i));
+	size_t const L = sizeof(simde__m128i) / sizeof(uint16_t);
+	struct dz_query_s *q = (struct dz_query_s *)dz_mem_malloc(dz_mem(self), sizeof(struct dz_query_s) + DZ_MAT_SIZE * dz_roundup(qlen + 1, L) + sizeof(simde__m128i));
 	*q = (struct dz_query_s){
 		.blen = qlen == 0 ? 0 : (dz_roundup(qlen + 1, L) / L),
 		.q = query,
@@ -954,24 +954,24 @@
 			15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
 		};
 		int8_t const *conv = &self->protein_matrix[j * DZ_MAT_SIZE];
-		__m128i pv = _mm_setzero_si128();
-		__m128i const fv = _mm_set1_epi8(0x0f);
-		__m128i const lcv = _mm_loadu_si128((__m128i const *)&conv[0]);							/* conversion table */
-		__m128i const hcv = _mm_loadu_si128((__m128i const *)&conv[16]);						/* conversion table */
-		__m128i const rv = _mm_load_si128((__m128i const *)rev);
+		simde__m128i pv = simde_mm_setzero_si128();
+		simde__m128i const fv = simde_mm_set1_epi8(0x0f);
+		simde__m128i const lcv = simde_mm_loadu_si128((simde__m128i const *)&conv[0]);							/* conversion table */
+		simde__m128i const hcv = simde_mm_loadu_si128((simde__m128i const *)&conv[16]);						/* conversion table */
+		simde__m128i const rv = simde_mm_load_si128((simde__m128i const *)rev);
 
 		/* toward the head of the query sequence */
-		for(size_t i = 0; i < dz_rounddown(qlen, sizeof(__m128i)); i += sizeof(__m128i)) {
-			__m128i const qv = _mm_shuffle_epi8(_mm_loadu_si128((__m128i const *)&query[qlen - 16 - i]), rv);
-			__m128i const _qv = _mm_and_si128(qv, fv);
-			__m128i lv = _mm_shuffle_epi8(lcv, _qv), hv = _mm_shuffle_epi8(hcv, _qv);
-			__m128i tv = _mm_blendv_epi8(lv, hv, _mm_slli_epi32(qv, 3));
-			_mm_storeu_si128((__m128i *)&a[i], _mm_alignr_epi8(tv, pv, 15)); pv = tv;			/* shift by one to make room for a base */
+		for(size_t i = 0; i < dz_rounddown(qlen, sizeof(simde__m128i)); i += sizeof(simde__m128i)) {
+			simde__m128i const qv = simde_mm_shuffle_epi8(simde_mm_loadu_si128((simde__m128i const *)&query[qlen - 16 - i]), rv);
+			simde__m128i const _qv = simde_mm_and_si128(qv, fv);
+			simde__m128i lv = simde_mm_shuffle_epi8(lcv, _qv), hv = simde_mm_shuffle_epi8(hcv, _qv);
+			simde__m128i tv = simde_mm_blendv_epi8(lv, hv, simde_mm_slli_epi32(qv, 3));
+			simde_mm_storeu_si128((simde__m128i *)&a[i], simde_mm_alignr_epi8(tv, pv, 15)); pv = tv;			/* shift by one to make room for a base */
 		}
 
 		/* continue the same conversion on the remainings */
-		a[dz_rounddown(qlen, sizeof(__m128i))] = _mm_extract_epi8(pv, 15);
-		for(size_t i = dz_rounddown(qlen, sizeof(__m128i)); i < qlen; i++) {
+		a[dz_rounddown(qlen, sizeof(simde__m128i))] = simde_mm_extract_epi8(pv, 15);
+		for(size_t i = dz_rounddown(qlen, sizeof(simde__m128i)); i < qlen; i++) {
 			a[i + 1] = conv[(uint8_t)query[qlen - 1 - i] & 0x0f];
 		}
 		for(size_t i = qlen; i < dz_roundup(qlen + 1, L); i++) {
@@ -1028,21 +1028,21 @@
 	struct dz_swgv_s *cdp = _begin_column_head(w.r.spos, w.r.epos, w.max, forefronts, n_forefronts);	/* allocate memory for the first column */ \
 	for(uint64_t p = w.r.spos; p < w.r.epos; p++) { \
 		/* memset(cdp, 0xff, sizeof(struct dz_swgv_s) * (w.r.epos - w.r.spos)); */ \
-		__m128i const e = _mm_set1_epi16(INT16_MIN), f = e, s = e; _store_vector(&cdp[p]); \
+		simde__m128i const e = simde_mm_set1_epi16(INT16_MIN), f = e, s = e; _store_vector(&cdp[p]); \
 	} \
 	/* paste the last vectors */ \
 	for(size_t i = 0; i < n_forefronts; i++) { \
 		struct dz_swgv_s const *tdp = (struct dz_swgv_s const *)forefronts[i] - forefronts[i]->r.epos; \
-		__m128i const adjv = _mm_set1_epi16(init_s == 0 ? 0 : adj[i]); \
+		simde__m128i const adjv = simde_mm_set1_epi16(init_s == 0 ? 0 : adj[i]); \
 		for(uint64_t p = forefronts[i]->r.spos; p < forefronts[i]->r.epos; p++) { \
 			/* adjust offset */ \
-			__m128i e = _mm_subs_epi16(_mm_load_si128(&tdp[p].e), adjv); \
-			__m128i f = _mm_subs_epi16(_mm_load_si128(&tdp[p].f), adjv); \
-			__m128i s = _mm_subs_epi16(_mm_load_si128(&tdp[p].s), adjv); \
+			simde__m128i e = simde_mm_subs_epi16(simde_mm_load_si128(&tdp[p].e), adjv); \
+			simde__m128i f = simde_mm_subs_epi16(simde_mm_load_si128(&tdp[p].f), adjv); \
+			simde__m128i s = simde_mm_subs_epi16(simde_mm_load_si128(&tdp[p].s), adjv); \
 			/* read-max-write */ \
-			_mm_store_si128(&cdp[p].e, _mm_max_epi16(e, _mm_load_si128(&cdp[p].e))); \
-			_mm_store_si128(&cdp[p].f, _mm_max_epi16(f, _mm_load_si128(&cdp[p].f))); \
-			_mm_store_si128(&cdp[p].s, _mm_max_epi16(s, _mm_load_si128(&cdp[p].s))); \
+			simde_mm_store_si128(&cdp[p].e, simde_mm_max_epi16(e, simde_mm_load_si128(&cdp[p].e))); \
+			simde_mm_store_si128(&cdp[p].f, simde_mm_max_epi16(f, simde_mm_load_si128(&cdp[p].f))); \
+			simde_mm_store_si128(&cdp[p].s, simde_mm_max_epi16(s, simde_mm_load_si128(&cdp[p].s))); \
 			print_vector(s); \
 		} \
 	} \
@@ -1054,8 +1054,8 @@
 	_init_rch(query, rt, rrem); \
 	struct dz_swgv_s *cdp = _begin_column(w, rch, rrem); \
 	/* init vectors */ \
-	__m128i f = minv, ps = _mm_set1_epi16(init_s), maxv = _mm_set1_epi16(INT16_MIN); \
-	__m128i const xtv = _mm_set1_epi16(w.inc - self->xt);	/* next offset == current max thus X-drop threshold is always -xt */ \
+	simde__m128i f = minv, ps = simde_mm_set1_epi16(init_s), maxv = simde_mm_set1_epi16(INT16_MIN); \
+	simde__m128i const xtv = simde_mm_set1_epi16(w.inc - self->xt);	/* next offset == current max thus X-drop threshold is always -xt */ \
 	/* until the bottommost vertically placed band... */ \
 	uint32_t sspos = w.r.spos;					/* save spos on the stack */ \
 	for(uint64_t p = w.r.spos; p < w.r.epos; p++) { \
@@ -1069,11 +1069,11 @@
 	/* if reached the forefront of the query sequence, finish the extension */ \
 	if(w.r.epos < query->blen) { \
 		/* forefront extension; clip the column length if too long */ \
-		__m128i e = minv, s = minv; _update_vector(w.r.epos); \
+		simde__m128i e = minv, s = minv; _update_vector(w.r.epos); \
 		do { \
 			if(_test_xdrop(s, xtv)) { break; } \
 			_store_vector(&cdp[w.r.epos]); w.r.epos++; \
-			f = _mm_subs_epi16(f, gev8); s = _mm_subs_epi16(s, gev8); \
+			f = simde_mm_subs_epi16(f, gev8); s = simde_mm_subs_epi16(s, gev8); \
 		} while(w.r.epos < query->blen); \
 	} \
 dz_pp_cat(_forefront_, __LINE__):; \
@@ -1097,7 +1097,7 @@
 	char const *ref, int32_t rlen, uint32_t rid,
 	uint16_t init_s)
 {
-	size_t const L = sizeof(__m128i) / sizeof(uint16_t);
+	size_t const L = sizeof(simde__m128i) / sizeof(uint16_t);
 	if(n_forefronts == 0) { return(NULL); }										/* invalid */
 	if(rlen == 0 && n_forefronts == 1) { return(forefronts[0]); }				/* no need to merge incoming vectors */
 
@@ -1115,15 +1115,15 @@
 
 		/* create conversion table (accessed by indirect %rbp) */
 		uint8_t conv[16] __attribute__(( aligned(16) ));
-		_mm_store_si128((__m128i *)conv, _mm_load_si128((__m128i const *)&conv_fr[rlen > 0 ? 0 : 16]));
+		simde_mm_store_si128((simde__m128i *)conv, simde_mm_load_si128((simde__m128i const *)&conv_fr[rlen > 0 ? 0 : 16]));
 	#endif
 
-	__m128i const minv = _mm_set1_epi16(DZ_CELL_MIN);
-	__m128i const giv = _mm_load_si128((__m128i const *)self->giv);
-	__m128i const gev1 = _mm_load_si128((__m128i const *)self->gev);
-	__m128i const gev2 = _mm_add_epi16(gev1, gev1);
-	__m128i const gev4 = _mm_slli_epi16(gev1, 2);
-	__m128i const gev8 = _mm_slli_epi16(gev1, 3);
+	simde__m128i const minv = simde_mm_set1_epi16(DZ_CELL_MIN);
+	simde__m128i const giv = simde_mm_load_si128((simde__m128i const *)self->giv);
+	simde__m128i const gev1 = simde_mm_load_si128((simde__m128i const *)self->gev);
+	simde__m128i const gev2 = simde_mm_add_epi16(gev1, gev1);
+	simde__m128i const gev4 = simde_mm_slli_epi16(gev1, 2);
+	simde__m128i const gev8 = simde_mm_slli_epi16(gev1, 3);
 
 	struct dz_forefront_s w = { { UINT32_MAX, 0 }, 0, 0, 0, 0, 0, 0, NULL, NULL };	/* uint32_t spos, epos, max, inc; struct dz_query_s const *query; struct dz_cap_s const *cap; */ \
 	w.rlen = rlen;
@@ -1358,15 +1358,15 @@
 	struct dz_forefront_s const *forefront)
 {
 	(void)self;
-	size_t const L = sizeof(__m128i) / sizeof(uint16_t);
+	size_t const L = sizeof(simde__m128i) / sizeof(uint16_t);
 	#define _dp(_cap)					( (struct dz_swgv_s const *)(_cap) - (_cap)->r.epos )
 
 	_init_bonus(forefront->query);
 	struct dz_cap_s const *pcap = forefront->mcap;
-	__m128i const maxv = _mm_set1_epi16(forefront->inc);
+	simde__m128i const maxv = simde_mm_set1_epi16(forefront->inc);
 	for(uint64_t p = pcap->r.spos; p < pcap->r.epos; p++) {
-		__m128i const s = _add_bonus(p, _mm_load_si128(&_dp(pcap)[p].s)); print_vector(s);
-		uint64_t eq = _mm_movemask_epi8(_mm_cmpeq_epi16(s, maxv));
+		simde__m128i const s = _add_bonus(p, simde_mm_load_si128(&_dp(pcap)[p].s)); print_vector(s);
+		uint64_t eq = simde_mm_movemask_epi8(simde_mm_cmpeq_epi16(s, maxv));
 		if(eq != 0) {
 			/* tzcntq is faster but avoid using it b/c it requires relatively newer archs */
 			uint64_t zcnt = (eq - 1) & (~eq & 0x5555);		/* subq, andnq, andq; chain length == 2 */
@@ -1447,7 +1447,7 @@
 	struct dz_s *self,
 	struct dz_forefront_s const *forefront)
 {
-	size_t const L = sizeof(__m128i) / sizeof(uint16_t);
+	size_t const L = sizeof(simde__m128i) / sizeof(uint16_t);
 	if(forefront->mcap == NULL) { return(NULL); }
 
 	/* detect pos */
--- vg.orig/deps/gssw/Makefile
+++ vg/deps/gssw/Makefile
@@ -1,5 +1,5 @@
 CC:=gcc
-CFLAGS+=-Wall -O3 -msse4 -g
+CFLAGS+=-Wall -O3 -g
 OBJ_DIR:=obj
 BIN_DIR:=bin
 SRC_DIR:=src
--- vg.orig/deps/gssw/src/gssw.c
+++ vg/deps/gssw/src/gssw.c
@@ -29,7 +29,7 @@
  *  Created by Mengyao Zhao on 6/22/10.
  *  Generalized to operate on graphs by Erik Garrison and renamed gssw.c
  */
-#include <emmintrin.h>
+#include "../../../debian/include/simde/x86/sse2.h"
 #include <stdint.h>
 #include <stdlib.h>
 #include <stdio.h>
@@ -85,7 +85,7 @@
 
 
 /* Generate query profile rearrange query sequence & calculate the weight of match/mismatch. */
-__m128i* gssw_qP_byte (const int8_t* read_num,
+simde__m128i* gssw_qP_byte (const int8_t* read_num,
                        const int8_t* mat,
                        const int32_t readLen,
                        const int32_t n,    /* the edge length of the squre matrix mat */
@@ -98,7 +98,7 @@
                                      Calculate 16 segments in parallel.
                                      This holds the number of segments needed to fit the read.
                                    */
-    __m128i* vProfile = (__m128i*)malloc(n * segLen * sizeof(__m128i));
+    simde__m128i* vProfile = (simde__m128i*)malloc(n * segLen * sizeof(simde__m128i));
     int8_t* t = (int8_t*)vProfile; // This points to each byte in the profile vector, one at a time
     // nt tracks the nucleotide we're computing the profile for. We do each possible character.
     // i tracks which swizzled register we're working on
@@ -140,7 +140,7 @@
     return vProfile;
 }
 
-__m128i* gssw_adj_qP_byte (const int8_t* read_num,
+simde__m128i* gssw_adj_qP_byte (const int8_t* read_num,
                            const int8_t* qual,
                            const int8_t* adj_mat,
                            const int32_t readLen,
@@ -153,7 +153,7 @@
                                            Each piece is 8 bit. Split the read into 16 segments.
                                            Calculat 16 segments in parallel.
                                            */
-    __m128i* vProfile = (__m128i*)malloc(n * segLen * sizeof(__m128i));
+    simde__m128i* vProfile = (simde__m128i*)malloc(n * segLen * sizeof(simde__m128i));
     int8_t* t = (int8_t*)vProfile;
     int32_t nt, i, j, segNum;
     
@@ -199,7 +199,7 @@
  * Look up the value in a profile matrix for the given base code observed at the given read index.
  * Useful for non-swizzled access to the the swizzled profile.
  */
-uint8_t profile_get_byte(__m128i* vProfile, int32_t readLen, int32_t read_position, int32_t observed_base) {
+uint8_t profile_get_byte(simde__m128i* vProfile, int32_t readLen, int32_t read_position, int32_t observed_base) {
     // Profile is stored by observed base (most significant), then by position in the segment, then by segment in the read (lwast significant).
     
     // How long is a segment? We have 16.
@@ -332,7 +332,7 @@
                                            int32_t readLen,
                                            const uint8_t weight_gapO, /* will be used as - */
                                            const uint8_t weight_gapE, /* will be used as - */
-                                           __m128i* vProfile,
+                                           simde__m128i* vProfile,
                                            uint8_t terminate,    /* the best alignment score: used to terminate
                                                                    the matrix calculation when locating the
                                                                    alignment beginning point. If this score
@@ -367,16 +367,16 @@
     uint8_t* pv;
     
     /* Note use of aligned memory.  Return value of 0 means success for posix_memalign. */
-    if (!(!posix_memalign((void**)&pvHStore, sizeof(__m128i), padded_read_length) &&
-          !posix_memalign((void**)&pvEStore, sizeof(__m128i), padded_read_length) &&
-          !posix_memalign((void**)&pvFStore, sizeof(__m128i), padded_read_length) &&
-          !posix_memalign((void**)&pvHLoad,  sizeof(__m128i), padded_read_length) &&
-          !posix_memalign((void**)&pvELoad,  sizeof(__m128i), padded_read_length) &&
-          !posix_memalign((void**)&alignment->seed.pvE,      sizeof(__m128i), padded_read_length) &&
-          !posix_memalign((void**)&alignment->seed.pvHStore, sizeof(__m128i), padded_read_length) &&
-          !posix_memalign((void**)&mH,           sizeof(__m128i), refLen*padded_read_length) &&
-          !posix_memalign((void**)&mE,           sizeof(__m128i), refLen*padded_read_length) &&
-          !posix_memalign((void**)&mF,           sizeof(__m128i), refLen*padded_read_length))) {
+    if (!(!posix_memalign((void**)&pvHStore, sizeof(simde__m128i), padded_read_length) &&
+          !posix_memalign((void**)&pvEStore, sizeof(simde__m128i), padded_read_length) &&
+          !posix_memalign((void**)&pvFStore, sizeof(simde__m128i), padded_read_length) &&
+          !posix_memalign((void**)&pvHLoad,  sizeof(simde__m128i), padded_read_length) &&
+          !posix_memalign((void**)&pvELoad,  sizeof(simde__m128i), padded_read_length) &&
+          !posix_memalign((void**)&alignment->seed.pvE,      sizeof(simde__m128i), padded_read_length) &&
+          !posix_memalign((void**)&alignment->seed.pvHStore, sizeof(simde__m128i), padded_read_length) &&
+          !posix_memalign((void**)&mH,           sizeof(simde__m128i), refLen*padded_read_length) &&
+          !posix_memalign((void**)&mE,           sizeof(simde__m128i), refLen*padded_read_length) &&
+          !posix_memalign((void**)&mF,           sizeof(simde__m128i), refLen*padded_read_length))) {
         fprintf(stderr, "error:[gssw] Could not allocate memory required for alignment buffers.\n");
         exit(1);
     }
@@ -522,23 +522,23 @@
 /* To determine the maximum values within each vector, rather than between vectors. */
 
 #define m128i_max16(m, vm) \
-    (vm) = _mm_max_epu8((vm), _mm_srli_si128((vm), 8)); \
-    (vm) = _mm_max_epu8((vm), _mm_srli_si128((vm), 4)); \
-    (vm) = _mm_max_epu8((vm), _mm_srli_si128((vm), 2)); \
-    (vm) = _mm_max_epu8((vm), _mm_srli_si128((vm), 1)); \
-    (m) = _mm_extract_epi16((vm), 0)
+    (vm) = simde_mm_max_epu8((vm), simde_mm_srli_si128((vm), 8)); \
+    (vm) = simde_mm_max_epu8((vm), simde_mm_srli_si128((vm), 4)); \
+    (vm) = simde_mm_max_epu8((vm), simde_mm_srli_si128((vm), 2)); \
+    (vm) = simde_mm_max_epu8((vm), simde_mm_srli_si128((vm), 1)); \
+    (m) = simde_mm_extract_epi16((vm), 0)
 
 #define m128i_max8(m, vm) \
-    (vm) = _mm_max_epi16((vm), _mm_srli_si128((vm), 8)); \
-    (vm) = _mm_max_epi16((vm), _mm_srli_si128((vm), 4)); \
-    (vm) = _mm_max_epi16((vm), _mm_srli_si128((vm), 2)); \
-    (m) = _mm_extract_epi16((vm), 0)
+    (vm) = simde_mm_max_epi16((vm), simde_mm_srli_si128((vm), 8)); \
+    (vm) = simde_mm_max_epi16((vm), simde_mm_srli_si128((vm), 4)); \
+    (vm) = simde_mm_max_epi16((vm), simde_mm_srli_si128((vm), 2)); \
+    (m) = simde_mm_extract_epi16((vm), 0)
     
 // See https://stackoverflow.com/q/33824300 for this unsigned comparison macro
 // for the missing unsigned comparison instruction _mm_cmpgt_epu8
 #define m128i_cmpgt(v0, v1) \
-         _mm_cmpgt_epi8(_mm_xor_si128(v0, _mm_set1_epi8(-128)), \
-                        _mm_xor_si128(v1, _mm_set1_epi8(-128)))
+         simde_mm_cmpgt_epi8(simde_mm_xor_si128(v0, simde_mm_set1_epi8(-128)), \
+                        simde_mm_xor_si128(v1, simde_mm_set1_epi8(-128)))
 
 /* Striped Smith-Waterman
    Record the highest score of each reference position.
@@ -553,7 +553,7 @@
                                        int32_t readLen,
                                        const uint8_t weight_gapO, /* will be used as - */
                                        const uint8_t weight_gapE, /* will be used as - */
-                                       __m128i* vProfile,
+                                       simde__m128i* vProfile,
                                        uint8_t terminate,    /* the best alignment score: used to terminate
                                                                the matrix calculation when locating the
                                                                alignment beginning point. If this score
@@ -570,55 +570,55 @@
     int32_t segLen = (readLen + 15) / 16; /* number of segment */
 
     /* Initialize buffers used in alignment */
-    __m128i* pvHStore;
-    __m128i* pvHLoad;
-    __m128i* pvHmax;
-    __m128i* pvE; // TODO: appears redundant with pvEStore
+    simde__m128i* pvHStore;
+    simde__m128i* pvHLoad;
+    simde__m128i* pvHmax;
+    simde__m128i* pvE; // TODO: appears redundant with pvEStore
     // We have a couple extra arrays for logging columns
-    __m128i* pvEStore;
-    __m128i* pvFStore;
+    simde__m128i* pvEStore;
+    simde__m128i* pvFStore;
     uint8_t* mH = NULL; // used to save matrices for external traceback: overall best score
     uint8_t* mE = NULL; // Gap in read best score
     uint8_t* mF = NULL; // Gap in ref best score
     /* Note use of aligned memory.  Return value of 0 means success for posix_memalign. */
-    if (!(!posix_memalign((void**)&pvHStore,     sizeof(__m128i), segLen*sizeof(__m128i)) &&
-          !posix_memalign((void**)&pvHLoad,      sizeof(__m128i), segLen*sizeof(__m128i)) &&
-          !posix_memalign((void**)&pvHmax,       sizeof(__m128i), segLen*sizeof(__m128i)) &&
-          !posix_memalign((void**)&pvE,          sizeof(__m128i), segLen*sizeof(__m128i)) &&
-          !posix_memalign((void**)&pvEStore,     sizeof(__m128i), segLen*sizeof(__m128i)) &&
-          !posix_memalign((void**)&pvFStore,     sizeof(__m128i), segLen*sizeof(__m128i)) &&
-          !posix_memalign((void**)&alignment->seed.pvE,      sizeof(__m128i), segLen*sizeof(__m128i)) &&
-          !posix_memalign((void**)&alignment->seed.pvHStore, sizeof(__m128i), segLen*sizeof(__m128i)))) {
+    if (!(!posix_memalign((void**)&pvHStore,     sizeof(simde__m128i), segLen*sizeof(simde__m128i)) &&
+          !posix_memalign((void**)&pvHLoad,      sizeof(simde__m128i), segLen*sizeof(simde__m128i)) &&
+          !posix_memalign((void**)&pvHmax,       sizeof(simde__m128i), segLen*sizeof(simde__m128i)) &&
+          !posix_memalign((void**)&pvE,          sizeof(simde__m128i), segLen*sizeof(simde__m128i)) &&
+          !posix_memalign((void**)&pvEStore,     sizeof(simde__m128i), segLen*sizeof(simde__m128i)) &&
+          !posix_memalign((void**)&pvFStore,     sizeof(simde__m128i), segLen*sizeof(simde__m128i)) &&
+          !posix_memalign((void**)&alignment->seed.pvE,      sizeof(simde__m128i), segLen*sizeof(simde__m128i)) &&
+          !posix_memalign((void**)&alignment->seed.pvHStore, sizeof(simde__m128i), segLen*sizeof(simde__m128i)))) {
         fprintf(stderr, "error:[gssw] Could not allocate memory required for alignment buffers.\n");
         exit(1);
     }
 
-    if (save_matrixes && !(!posix_memalign((void**)&mH,           sizeof(__m128i), segLen*refLen*sizeof(__m128i)) &&
-                           !posix_memalign((void**)&mE,           sizeof(__m128i), segLen*refLen*sizeof(__m128i)) &&
-                           !posix_memalign((void**)&mF,           sizeof(__m128i), segLen*refLen*sizeof(__m128i)))) {
+    if (save_matrixes && !(!posix_memalign((void**)&mH,           sizeof(simde__m128i), segLen*refLen*sizeof(simde__m128i)) &&
+                           !posix_memalign((void**)&mE,           sizeof(simde__m128i), segLen*refLen*sizeof(simde__m128i)) &&
+                           !posix_memalign((void**)&mF,           sizeof(simde__m128i), segLen*refLen*sizeof(simde__m128i)))) {
         fprintf(stderr, "error:[gssw] Could not allocate memory required for alignment traceback matrixes.\n");
         exit(1);
     }
 
     /* Workaround: zero memory ourselves because we don't have an aligned calloc */
-    memset(pvHStore,                 0, segLen*sizeof(__m128i));
-    memset(pvHLoad,                  0, segLen*sizeof(__m128i));
-    memset(pvHmax,                   0, segLen*sizeof(__m128i));
-    memset(pvE,                      0, segLen*sizeof(__m128i));
-    memset(pvEStore,                 0, segLen*sizeof(__m128i));
-    memset(pvFStore,                 0, segLen*sizeof(__m128i));
-    memset(alignment->seed.pvE,      0, segLen*sizeof(__m128i));
-    memset(alignment->seed.pvHStore, 0, segLen*sizeof(__m128i));
+    memset(pvHStore,                 0, segLen*sizeof(simde__m128i));
+    memset(pvHLoad,                  0, segLen*sizeof(simde__m128i));
+    memset(pvHmax,                   0, segLen*sizeof(simde__m128i));
+    memset(pvE,                      0, segLen*sizeof(simde__m128i));
+    memset(pvEStore,                 0, segLen*sizeof(simde__m128i));
+    memset(pvFStore,                 0, segLen*sizeof(simde__m128i));
+    memset(alignment->seed.pvE,      0, segLen*sizeof(simde__m128i));
+    memset(alignment->seed.pvHStore, 0, segLen*sizeof(simde__m128i));
     if (save_matrixes) {
-        memset(mH,                       0, segLen*refLen*sizeof(__m128i));
-        memset(mE,                       0, segLen*refLen*sizeof(__m128i));
-        memset(mF,                       0, segLen*refLen*sizeof(__m128i));
+        memset(mH,                       0, segLen*refLen*sizeof(simde__m128i));
+        memset(mE,                       0, segLen*refLen*sizeof(simde__m128i));
+        memset(mF,                       0, segLen*refLen*sizeof(simde__m128i));
     }
 
     /* if we are running a seeded alignment, copy over the seeds */
     if (seed) {
-        memcpy(pvE, seed->pvE, segLen*sizeof(__m128i));
-        memcpy(pvHStore, seed->pvHStore, segLen*sizeof(__m128i));
+        memcpy(pvE, seed->pvE, segLen*sizeof(simde__m128i));
+        memcpy(pvHStore, seed->pvHStore, segLen*sizeof(simde__m128i));
     }
 
     /* Set external matrix pointers */
@@ -632,23 +632,23 @@
     alignment->is_byte = 1;
 
     /* Define 16 byte 0 vector. */
-    __m128i vZero = _mm_set1_epi32(0);
+    simde__m128i vZero = simde_mm_set1_epi32(0);
 
     /* Used for iteration */
     int32_t i, j;
 
     /* 16 byte insertion begin vector */
-    __m128i vGapO = _mm_set1_epi8(weight_gapO);
+    simde__m128i vGapO = simde_mm_set1_epi8(weight_gapO);
 
     /* 16 byte insertion extension vector */
-    __m128i vGapE = _mm_set1_epi8(weight_gapE);
+    simde__m128i vGapE = simde_mm_set1_epi8(weight_gapE);
 
     /* 16 byte bias vector */
-    __m128i vBias = _mm_set1_epi8(bias);
+    simde__m128i vBias = simde_mm_set1_epi8(bias);
 
-    __m128i vMaxScore = vZero; /* Trace the highest score of the whole SW matrix. */
-    __m128i vMaxMark = vZero; /* Trace the highest score till the previous column. */
-    __m128i vTemp;
+    simde__m128i vMaxScore = vZero; /* Trace the highest score of the whole SW matrix. */
+    simde__m128i vMaxMark = vZero; /* Trace the highest score till the previous column. */
+    simde__m128i vTemp;
     int32_t begin = 0, end = refLen, step = 1;
 
     /* outer loop to process the reference sequence */
@@ -661,22 +661,22 @@
         // For each column
     
         int32_t cmp;
-        __m128i e = vZero, vF = vZero, vMaxColumn = vZero; /* Initialize F value to 0.
+        simde__m128i e = vZero, vF = vZero, vMaxColumn = vZero; /* Initialize F value to 0.
                                Any errors to vH values will be corrected in the Lazy_F loop.
                              */
         //max16(maxColumn[i], vMaxColumn);
         //fprintf(stderr, "middle[%d]: %d\n", i, maxColumn[i]);
 
         // Load the last column's last H value in each segment
-        //__m128i vH = pvHStore[segLen - 1];
-        __m128i vH = _mm_load_si128 (pvHStore + (segLen - 1));
+        //simde__m128i vH = pvHStore[segLen - 1];
+        simde__m128i vH = simde_mm_load_si128 (pvHStore + (segLen - 1));
         // Shift it over (TODO: why??? We only shift this initial read and not later reads.)
-        vH = _mm_slli_si128 (vH, 1); /* Shift the 128-bit value in vH left by 1 byte. */
+        vH = simde_mm_slli_si128 (vH, 1); /* Shift the 128-bit value in vH left by 1 byte. */
         // Find the profile entries for matching this column's ref base against each read base.
-        __m128i* vP = vProfile + ref[i] * segLen; /* Right part of the vProfile */
+        simde__m128i* vP = vProfile + ref[i] * segLen; /* Right part of the vProfile */
 
         /* Swap the 2 H buffers. */
-        __m128i* pv = pvHLoad;
+        simde__m128i* pv = pvHLoad;
         pvHLoad = pvHStore;
         pvHStore = pv;
 
@@ -686,9 +686,9 @@
             // at position j in each segment
 
             // Add the profile scores for matching against this ref base
-            vH = _mm_adds_epu8(vH, _mm_load_si128(vP + j));
+            vH = simde_mm_adds_epu8(vH, simde_mm_load_si128(vP + j));
             // And subtract out the profile's bias (so profile scores can be <0)
-            vH = _mm_subs_epu8(vH, vBias); /* vH will be always > 0 because of saturation arithmetic */
+            vH = simde_mm_subs_epu8(vH, vBias); /* vH will be always > 0 because of saturation arithmetic */
             //    max16(maxColumn[i], vH);
             //    fprintf(stderr, "H[%d]: %d\n", i, maxColumn[i]);
             /*
@@ -703,8 +703,8 @@
             // Next we are going to replace entries if we have a better score from a gap matrix.
 
             /* Get max from vH, vE and vF. */
-            e = _mm_load_si128(pvE + j);
-            //_mm_store_si128(vE + j, e);
+            e = simde_mm_load_si128(pvE + j);
+            //simde_mm_store_si128(vE + j, e);
             
             // So e holds the *current* column's read gap open/extend scores,
             // which we computed on the *previous* column's pass.
@@ -712,9 +712,9 @@
             // gap open/extend scores, which we computed on the *previous*
             // cursor position.
 
-            vH = _mm_max_epu8(vH, e);
-            vH = _mm_max_epu8(vH, vF);
-            vMaxColumn = _mm_max_epu8(vMaxColumn, vH);
+            vH = simde_mm_max_epu8(vH, e);
+            vH = simde_mm_max_epu8(vH, vF);
+            vMaxColumn = simde_mm_max_epu8(vMaxColumn, vH);
             
             // So now vH has the correct (modulo wrong F values) H matrix entries.
 
@@ -725,42 +725,42 @@
             //fprintf(stdout, "\n");
 
             /* Save vH values. */
-            _mm_store_si128(pvHStore + j, vH);
+            simde_mm_store_si128(pvHStore + j, vH);
             
             /* Save the vE and vF values they derived from */
-            _mm_store_si128(pvEStore + j, e);
-            _mm_store_si128(pvFStore + j, vF);
+            simde_mm_store_si128(pvEStore + j, e);
+            simde_mm_store_si128(pvFStore + j, vF);
 
             // Now we need to compute the E values for the *next* column, based
             // on our non-F-loop-processed H values
 
             /* Update vE value. */
-            vH = _mm_subs_epu8(vH, vGapO); /* saturation arithmetic, result >= 0 */
-            e = _mm_subs_epu8(e, vGapE);
-            e = _mm_max_epu8(e, vH);
+            vH = simde_mm_subs_epu8(vH, vGapO); /* saturation arithmetic, result >= 0 */
+            e = simde_mm_subs_epu8(e, vGapE);
+            e = simde_mm_max_epu8(e, vH);
 
             // And we compute the F values for the next cursor position.
 
             /* Compute new vF value, giving F matrix values at next cursor position */
-            vF = _mm_subs_epu8(vF, vGapE);
-            vF = _mm_max_epu8(vF, vH); // We already charged a gap open against vH
+            vF = simde_mm_subs_epu8(vF, vGapE);
+            vF = simde_mm_max_epu8(vF, vH); // We already charged a gap open against vH
 
             /* Save the E values we computed for the next column */
-            _mm_store_si128(pvE + j, e);
+            simde_mm_store_si128(pvE + j, e);
 
             /* Load the next vH. */
-            vH = _mm_load_si128(pvHLoad + j);
+            vH = simde_mm_load_si128(pvHLoad + j);
         }
 
 
         /* reset pointers to the start of the saved data */
         j = 0;
-        vH = _mm_load_si128 (pvHStore + j);
+        vH = simde_mm_load_si128 (pvHStore + j);
 
         /*  
          * Wrap vF around from the end of each segment to the start of the next.
          */
-        vF = _mm_slli_si128 (vF, 1);
+        vF = simde_mm_slli_si128 (vF, 1);
         
         // So now we're looking at the F value for every first position, after a
         // full pass. So the first F is guaranteed to be right, and other Fs
@@ -775,29 +775,29 @@
         
         // If we beat the stored H
         vTemp = m128i_cmpgt (vF, vH);
-        cmp = _mm_movemask_epi8 (vTemp);
+        cmp = simde_mm_movemask_epi8 (vTemp);
         // Or we beat the stored F
-        vTemp = _mm_load_si128 (pvFStore + j);
+        vTemp = simde_mm_load_si128 (pvFStore + j);
         vTemp = m128i_cmpgt (vF, vTemp);
-        cmp |= _mm_movemask_epi8 (vTemp);
+        cmp |= simde_mm_movemask_epi8 (vTemp);
         while (cmp != 0x0000)
         {
             // Then we do the update
             
             // Update this stripe of the H matrix
-            vH = _mm_max_epu8 (vH, vF);
-            vMaxColumn = _mm_max_epu8(vMaxColumn, vH);
-            _mm_store_si128 (pvHStore + j, vH);
+            vH = simde_mm_max_epu8 (vH, vF);
+            vMaxColumn = simde_mm_max_epu8(vMaxColumn, vH);
+            simde_mm_store_si128 (pvHStore + j, vH);
             
             // Update the E matrix for the next column
             // Since we may have changed the H matrix
             // This is to allow a gap-to-gap transition in the alignment
-            e = _mm_load_si128(pvE + j);
+            e = simde_mm_load_si128(pvE + j);
             // The H matrix can only get better, so the gap open scores can only
             // get better, so the E matrix can only get better too.
-            vTemp = _mm_subs_epu8(vH, vGapO);
-            e = _mm_max_epu8(e, vTemp);
-            _mm_store_si128(pvE + j, e);
+            vTemp = simde_mm_subs_epu8(vH, vGapO);
+            e = simde_mm_max_epu8(e, vTemp);
+            simde_mm_store_si128(pvE + j, e);
             // TODO: Instead of doing this, would it be smarter to just compute
             // the E matrix for each column when we're doing its H matrix? Or
             // would the extra buffer slow us down more than the extra compute?
@@ -806,12 +806,12 @@
             // Save the stripe of the F matrix
             // Only add in better F scores. Sometimes during this loop we'll
             // recompute worse ones.
-            vTemp = _mm_load_si128 (pvFStore + j);
-            vTemp = _mm_max_epu8 (vTemp, vF);
-            _mm_store_si128(pvFStore + j, vTemp);
+            vTemp = simde_mm_load_si128 (pvFStore + j);
+            vTemp = simde_mm_max_epu8 (vTemp, vF);
+            simde_mm_store_si128(pvFStore + j, vTemp);
 
             // Then think about extending
-            vF = _mm_subs_epu8 (vF, vGapE);
+            vF = simde_mm_subs_epu8 (vF, vGapE);
             // We never need to think about gap opens because nothing that came
             // from a gap open can ever change, because you won't close and then
             // immediately open a gap.
@@ -821,25 +821,25 @@
             {
                 // Wrap around to the next segment again
                 j = 0;
-                vF = _mm_slli_si128 (vF, 1);
+                vF = simde_mm_slli_si128 (vF, 1);
             }
 
             // Again compute if H or F needs updating based on this new set of F
             // values.
-            vH = _mm_load_si128 (pvHStore + j);
+            vH = simde_mm_load_si128 (pvHStore + j);
             
             // See if we beat the stored H
             vTemp = m128i_cmpgt (vF, vH);
-            cmp = _mm_movemask_epi8 (vTemp);
+            cmp = simde_mm_movemask_epi8 (vTemp);
             // Or if we beat the stored F
-            vTemp = _mm_load_si128 (pvFStore + j);
+            vTemp = simde_mm_load_si128 (pvFStore + j);
             vTemp = m128i_cmpgt (vF, vTemp);
-            cmp |= _mm_movemask_epi8 (vTemp);
+            cmp |= simde_mm_movemask_epi8 (vTemp);
         }
 
-        vMaxScore = _mm_max_epu8(vMaxScore, vMaxColumn);
-        vTemp = _mm_cmpeq_epi8(vMaxMark, vMaxScore);
-        cmp = _mm_movemask_epi8(vTemp);
+        vMaxScore = simde_mm_max_epu8(vMaxScore, vMaxColumn);
+        vTemp = simde_mm_cmpeq_epi8(vMaxMark, vMaxScore);
+        cmp = simde_mm_movemask_epi8(vTemp);
         if (cmp != 0xffff) {
             uint8_t temp;
             vMaxMark = vMaxScore;
@@ -910,8 +910,8 @@
         
     //fprintf(stderr, "%p %p %p %p %p %p\n", *pmH, mH, pvHmax, pvE, pvHLoad, pvHStore);
     // save the last vH
-    memcpy(alignment->seed.pvE,      pvE,      segLen*sizeof(__m128i));
-    memcpy(alignment->seed.pvHStore, pvHStore, segLen*sizeof(__m128i));
+    memcpy(alignment->seed.pvE,      pvE,      segLen*sizeof(simde__m128i));
+    memcpy(alignment->seed.pvHStore, pvHStore, segLen*sizeof(simde__m128i));
 
     /* Trace the alignment ending position on read. */
     uint8_t *t = (uint8_t*)pvHmax;
@@ -943,7 +943,7 @@
     return bests;
 }
 
-__m128i* gssw_qP_word (const int8_t* read_num,
+simde__m128i* gssw_qP_word (const int8_t* read_num,
                        const int8_t* mat,
                        const int32_t readLen,
                        const int32_t n,
@@ -951,7 +951,7 @@
                        int8_t end_full_length_bonus) {
 
     int32_t segLen = (readLen + 7) / 8;
-    __m128i* vProfile = (__m128i*)malloc(n * segLen * sizeof(__m128i));
+    simde__m128i* vProfile = (simde__m128i*)malloc(n * segLen * sizeof(simde__m128i));
     int16_t* t = (int16_t*)vProfile;
     int32_t nt, i, j;
     int32_t segNum;
@@ -987,7 +987,7 @@
     return vProfile;
 }
 
-__m128i* gssw_adj_qP_word (const int8_t* read_num,
+simde__m128i* gssw_adj_qP_word (const int8_t* read_num,
                            const int8_t* qual,
                            const int8_t* adj_mat,
                            const int32_t readLen,
@@ -996,7 +996,7 @@
                            int8_t end_full_length_bonus) {
 
     int32_t segLen = (readLen + 7) / 8;
-    __m128i* vProfile = (__m128i*) malloc(n * segLen * sizeof(__m128i));
+    simde__m128i* vProfile = (simde__m128i*) malloc(n * segLen * sizeof(simde__m128i));
     int16_t* t = (int16_t*) vProfile;
     int32_t nt, i, j, segNum;
 
@@ -1040,7 +1040,7 @@
  * Look up the value in a profile matrix for the given base code observed at the given read index.
  * Useful for non-swizzled access to the the swizzled profile.
  */
-uint16_t profile_get_word(__m128i* vProfile, int32_t readLen, int32_t read_position, int32_t observed_base) {
+uint16_t profile_get_word(simde__m128i* vProfile, int32_t readLen, int32_t read_position, int32_t observed_base) {
     // Profile is stored by observed base (most significant), then by position in the segment, then by segment in the read (lwast significant).
     
     // How long is a segment? We have 8.
@@ -1180,7 +1180,7 @@
                                            int32_t readLen,
                                            const uint8_t weight_gapO, /* will be used as - */
                                            const uint8_t weight_gapE, /* will be used as - */
-                                           __m128i* vProfile,
+                                           simde__m128i* vProfile,
                                            uint16_t terminate,
                                            int32_t maskLen,
                                            gssw_align* alignment, /* to save seed and matrix */
@@ -1211,16 +1211,16 @@
     int16_t* pv;
     
     /* Note use of aligned memory.  Return value of 0 means success for posix_memalign. */
-    if (!(!posix_memalign((void**)&pvHStore, sizeof(__m128i), padded_read_length * sizeof(int16_t)) &&
-          !posix_memalign((void**)&pvEStore, sizeof(__m128i), padded_read_length * sizeof(int16_t)) &&
-          !posix_memalign((void**)&pvFStore, sizeof(__m128i), padded_read_length * sizeof(int16_t)) &&
-          !posix_memalign((void**)&pvHLoad,  sizeof(__m128i), padded_read_length * sizeof(int16_t)) &&
-          !posix_memalign((void**)&pvELoad,  sizeof(__m128i), padded_read_length * sizeof(int16_t)) &&
-          !posix_memalign((void**)&alignment->seed.pvE,      sizeof(__m128i), padded_read_length * sizeof(int16_t)) &&
-          !posix_memalign((void**)&alignment->seed.pvHStore, sizeof(__m128i), padded_read_length * sizeof(int16_t)) &&
-          !posix_memalign((void**)&mH,           sizeof(__m128i), refLen * padded_read_length * sizeof(int16_t)) &&
-          !posix_memalign((void**)&mE,           sizeof(__m128i), refLen * padded_read_length * sizeof(int16_t)) &&
-          !posix_memalign((void**)&mF,           sizeof(__m128i), refLen * padded_read_length * sizeof(int16_t)))) {
+    if (!(!posix_memalign((void**)&pvHStore, sizeof(simde__m128i), padded_read_length * sizeof(int16_t)) &&
+          !posix_memalign((void**)&pvEStore, sizeof(simde__m128i), padded_read_length * sizeof(int16_t)) &&
+          !posix_memalign((void**)&pvFStore, sizeof(simde__m128i), padded_read_length * sizeof(int16_t)) &&
+          !posix_memalign((void**)&pvHLoad,  sizeof(simde__m128i), padded_read_length * sizeof(int16_t)) &&
+          !posix_memalign((void**)&pvELoad,  sizeof(simde__m128i), padded_read_length * sizeof(int16_t)) &&
+          !posix_memalign((void**)&alignment->seed.pvE,      sizeof(simde__m128i), padded_read_length * sizeof(int16_t)) &&
+          !posix_memalign((void**)&alignment->seed.pvHStore, sizeof(simde__m128i), padded_read_length * sizeof(int16_t)) &&
+          !posix_memalign((void**)&mH,           sizeof(simde__m128i), refLen * padded_read_length * sizeof(int16_t)) &&
+          !posix_memalign((void**)&mE,           sizeof(simde__m128i), refLen * padded_read_length * sizeof(int16_t)) &&
+          !posix_memalign((void**)&mF,           sizeof(simde__m128i), refLen * padded_read_length * sizeof(int16_t)))) {
         fprintf(stderr, "error:[gssw] Could not allocate memory required for alignment buffers.\n");
         exit(1);
     }
@@ -1373,7 +1373,7 @@
                                        int32_t readLen,
                                        const uint8_t weight_gapO, /* will be used as - */
                                        const uint8_t weight_gapE, /* will be used as - */
-                                       __m128i* vProfile,
+                                       simde__m128i* vProfile,
                                        uint16_t terminate,
                                        int32_t maskLen,
                                        gssw_align* alignment, /* to save seed and matrix */
@@ -1387,56 +1387,56 @@
     int32_t segLen = (readLen + 7) / 8; /* number of segment */
 
     /* Initialize buffers used in alignment */
-    __m128i* pvHStore;
-    __m128i* pvHLoad;
-    __m128i* pvHmax;
-    __m128i* pvE;
+    simde__m128i* pvHStore;
+    simde__m128i* pvHLoad;
+    simde__m128i* pvHmax;
+    simde__m128i* pvE;
     // We have a couple extra arrays for logging columns
-    __m128i* pvEStore;
-    __m128i* pvFStore;
+    simde__m128i* pvEStore;
+    simde__m128i* pvFStore;
     uint16_t* mH = NULL; // used to save matrices for external traceback: overall best
     uint16_t* mE = NULL; // Read gap
     uint16_t* mF = NULL; // Ref gap
     /* Note use of aligned memory */
 
-    if (!(!posix_memalign((void**)&pvHStore,     sizeof(__m128i), segLen*sizeof(__m128i)) &&
-          !posix_memalign((void**)&pvHLoad,      sizeof(__m128i), segLen*sizeof(__m128i)) &&
-          !posix_memalign((void**)&pvHmax,       sizeof(__m128i), segLen*sizeof(__m128i)) &&
-          !posix_memalign((void**)&pvE,          sizeof(__m128i), segLen*sizeof(__m128i)) &&
-          !posix_memalign((void**)&pvEStore,     sizeof(__m128i), segLen*sizeof(__m128i)) &&
-          !posix_memalign((void**)&pvFStore,     sizeof(__m128i), segLen*sizeof(__m128i)) &&
-          !posix_memalign((void**)&alignment->seed.pvE,      sizeof(__m128i), segLen*sizeof(__m128i)) &&
-          !posix_memalign((void**)&alignment->seed.pvHStore, sizeof(__m128i), segLen*sizeof(__m128i)))) {
+    if (!(!posix_memalign((void**)&pvHStore,     sizeof(simde__m128i), segLen*sizeof(simde__m128i)) &&
+          !posix_memalign((void**)&pvHLoad,      sizeof(simde__m128i), segLen*sizeof(simde__m128i)) &&
+          !posix_memalign((void**)&pvHmax,       sizeof(simde__m128i), segLen*sizeof(simde__m128i)) &&
+          !posix_memalign((void**)&pvE,          sizeof(simde__m128i), segLen*sizeof(simde__m128i)) &&
+          !posix_memalign((void**)&pvEStore,     sizeof(simde__m128i), segLen*sizeof(simde__m128i)) &&
+          !posix_memalign((void**)&pvFStore,     sizeof(simde__m128i), segLen*sizeof(simde__m128i)) &&
+          !posix_memalign((void**)&alignment->seed.pvE,      sizeof(simde__m128i), segLen*sizeof(simde__m128i)) &&
+          !posix_memalign((void**)&alignment->seed.pvHStore, sizeof(simde__m128i), segLen*sizeof(simde__m128i)))) {
         fprintf(stderr, "error:[gssw] Could not allocate memory required for alignment buffers.\n");
         exit(1);
     }
 
-    if (save_matrixes && !(!posix_memalign((void**)&mH,           sizeof(__m128i), segLen*refLen*sizeof(__m128i)) &&
-                           !posix_memalign((void**)&mE,           sizeof(__m128i), segLen*refLen*sizeof(__m128i)) &&
-                           !posix_memalign((void**)&mF,           sizeof(__m128i), segLen*refLen*sizeof(__m128i)))) {
+    if (save_matrixes && !(!posix_memalign((void**)&mH,           sizeof(simde__m128i), segLen*refLen*sizeof(simde__m128i)) &&
+                           !posix_memalign((void**)&mE,           sizeof(simde__m128i), segLen*refLen*sizeof(simde__m128i)) &&
+                           !posix_memalign((void**)&mF,           sizeof(simde__m128i), segLen*refLen*sizeof(simde__m128i)))) {
         fprintf(stderr, "error:[gssw] Could not allocate memory required for alignment traceback matrixes.\n");
         exit(1);
     }
 
     /* Workaround: zero ourselves because we don't have an aligned calloc */
-    memset(pvHStore,                 0, segLen*sizeof(__m128i));
-    memset(pvHLoad,                  0, segLen*sizeof(__m128i));
-    memset(pvHmax,                   0, segLen*sizeof(__m128i));
-    memset(pvE,                      0, segLen*sizeof(__m128i));
-    memset(pvEStore,                 0, segLen*sizeof(__m128i));
-    memset(pvFStore,                 0, segLen*sizeof(__m128i));
-    memset(alignment->seed.pvE,      0, segLen*sizeof(__m128i));
-    memset(alignment->seed.pvHStore, 0, segLen*sizeof(__m128i));
+    memset(pvHStore,                 0, segLen*sizeof(simde__m128i));
+    memset(pvHLoad,                  0, segLen*sizeof(simde__m128i));
+    memset(pvHmax,                   0, segLen*sizeof(simde__m128i));
+    memset(pvE,                      0, segLen*sizeof(simde__m128i));
+    memset(pvEStore,                 0, segLen*sizeof(simde__m128i));
+    memset(pvFStore,                 0, segLen*sizeof(simde__m128i));
+    memset(alignment->seed.pvE,      0, segLen*sizeof(simde__m128i));
+    memset(alignment->seed.pvHStore, 0, segLen*sizeof(simde__m128i));
     if (save_matrixes) {
-        memset(mH,                       0, segLen*refLen*sizeof(__m128i));
-        memset(mE,                       0, segLen*refLen*sizeof(__m128i));
-        memset(mF,                       0, segLen*refLen*sizeof(__m128i));
+        memset(mH,                       0, segLen*refLen*sizeof(simde__m128i));
+        memset(mE,                       0, segLen*refLen*sizeof(simde__m128i));
+        memset(mF,                       0, segLen*refLen*sizeof(simde__m128i));
     }
 
     /* if we are running a seeded alignment, copy over the seeds */
     if (seed) {
-        memcpy(pvE, seed->pvE, segLen*sizeof(__m128i));
-        memcpy(pvHStore, seed->pvHStore, segLen*sizeof(__m128i));
+        memcpy(pvE, seed->pvE, segLen*sizeof(simde__m128i));
+        memcpy(pvHStore, seed->pvHStore, segLen*sizeof(simde__m128i));
     }
 
     /* Set external matrix pointers */
@@ -1450,20 +1450,20 @@
     alignment->is_byte = 0;
 
     /* Define 16 byte 0 vector. */
-    __m128i vZero = _mm_set1_epi32(0);
+    simde__m128i vZero = simde_mm_set1_epi32(0);
 
     /* Used for iteration */
     int32_t i, j;
 
     /* 16 byte insertion begin vector */
-    __m128i vGapO = _mm_set1_epi16(weight_gapO);
+    simde__m128i vGapO = simde_mm_set1_epi16(weight_gapO);
 
     /* 16 byte insertion extension vector */
-    __m128i vGapE = _mm_set1_epi16(weight_gapE);
+    simde__m128i vGapE = simde_mm_set1_epi16(weight_gapE);
 
-    __m128i vMaxScore = vZero; /* Trace the highest score of the whole SW matrix. */
-    __m128i vMaxMark = vZero; /* Trace the highest score till the previous column. */
-    __m128i vTemp;
+    simde__m128i vMaxScore = vZero; /* Trace the highest score of the whole SW matrix. */
+    simde__m128i vMaxMark = vZero; /* Trace the highest score till the previous column. */
+    simde__m128i vTemp;
     int32_t begin = 0, end = refLen, step = 1;
 
     /* outer loop to process the reference sequence */
@@ -1474,50 +1474,50 @@
     }
     for (i = begin; LIKELY(i != end); i += step) {
         int32_t cmp;
-        __m128i e = vZero, vF = vZero; /* Initialize F value to 0.
+        simde__m128i e = vZero, vF = vZero; /* Initialize F value to 0.
                                Any errors to vH values will be corrected in the Lazy_F loop.
                              */
-        __m128i vH = pvHStore[segLen - 1];
-        vH = _mm_slli_si128 (vH, 2); /* Shift the 128-bit value in vH left by 2 byte. */
+        simde__m128i vH = pvHStore[segLen - 1];
+        vH = simde_mm_slli_si128 (vH, 2); /* Shift the 128-bit value in vH left by 2 byte. */
 
-        __m128i vMaxColumn = vZero; /* vMaxColumn is used to record the max values of column i. */
+        simde__m128i vMaxColumn = vZero; /* vMaxColumn is used to record the max values of column i. */
 
-        __m128i* vP = vProfile + ref[i] * segLen; /* Right part of the vProfile */
+        simde__m128i* vP = vProfile + ref[i] * segLen; /* Right part of the vProfile */
         
         /* Swap the 2 H buffers. */
-        __m128i* pv = pvHLoad;
+        simde__m128i* pv = pvHLoad;
         pvHLoad = pvHStore;
         pvHStore = pv;
 
         /* inner loop to process the query sequence */
         for (j = 0; LIKELY(j < segLen); j ++) {
-            vH = _mm_adds_epi16(vH, _mm_load_si128(vP + j));
+            vH = simde_mm_adds_epi16(vH, simde_mm_load_si128(vP + j));
 
             /* Get max from vH, vE and vF. */
-            e = _mm_load_si128(pvE + j);
-            vH = _mm_max_epi16(vH, e);
-            vH = _mm_max_epi16(vH, vF);
-            vMaxColumn = _mm_max_epi16(vMaxColumn, vH);
+            e = simde_mm_load_si128(pvE + j);
+            vH = simde_mm_max_epi16(vH, e);
+            vH = simde_mm_max_epi16(vH, vF);
+            vMaxColumn = simde_mm_max_epi16(vMaxColumn, vH);
 
             /* Save vH values. */
-            _mm_store_si128(pvHStore + j, vH);
+            simde_mm_store_si128(pvHStore + j, vH);
             
             /* Save the vE and vF values they derived from */
-            _mm_store_si128(pvEStore + j, e);
-            _mm_store_si128(pvFStore + j, vF);
+            simde_mm_store_si128(pvEStore + j, e);
+            simde_mm_store_si128(pvFStore + j, vF);
 
             /* Update vE value. */
-            vH = _mm_subs_epu16(vH, vGapO); /* saturation arithmetic, result >= 0 */
-            e = _mm_subs_epu16(e, vGapE);
-            e = _mm_max_epi16(e, vH);
-            _mm_store_si128(pvE + j, e);
+            vH = simde_mm_subs_epu16(vH, vGapO); /* saturation arithmetic, result >= 0 */
+            e = simde_mm_subs_epu16(e, vGapE);
+            e = simde_mm_max_epi16(e, vH);
+            simde_mm_store_si128(pvE + j, e);
 
             /* Update vF value. */
-            vF = _mm_subs_epu16(vF, vGapE);
-            vF = _mm_max_epi16(vF, vH);
+            vF = simde_mm_subs_epu16(vF, vGapE);
+            vF = simde_mm_max_epi16(vF, vH);
 
             /* Load the next vH. */
-            vH = _mm_load_si128(pvHLoad + j);
+            vH = simde_mm_load_si128(pvHLoad + j);
         }
 
         // Now we have the exact same lazy F loop as for bytes, but adapted.
@@ -1525,42 +1525,42 @@
 
         /* reset pointers to the start of the saved data */
         j = 0;
-        vH = _mm_load_si128 (pvHStore + j);
+        vH = simde_mm_load_si128 (pvHStore + j);
 
         /*  
          * Wrap vF around from the end of each segment to the start of the next.
          */
-        vF = _mm_slli_si128 (vF, 2);
+        vF = simde_mm_slli_si128 (vF, 2);
         
         // Now we need to work out if we actually want to update anything. We
         // need to do an F loop if we would modify H, or if we would improve
         // over the old F.
         
         // If we beat the stored H
-        vTemp = _mm_cmpgt_epi16 (vF, vH);
-        cmp = _mm_movemask_epi8 (vTemp);
+        vTemp = simde_mm_cmpgt_epi16 (vF, vH);
+        cmp = simde_mm_movemask_epi8 (vTemp);
         // Or we beat the stored F
-        vTemp = _mm_load_si128 (pvFStore + j);
-        vTemp = _mm_cmpgt_epi16 (vF, vTemp);
-        cmp |= _mm_movemask_epi8 (vTemp);
+        vTemp = simde_mm_load_si128 (pvFStore + j);
+        vTemp = simde_mm_cmpgt_epi16 (vF, vTemp);
+        cmp |= simde_mm_movemask_epi8 (vTemp);
         while (cmp != 0x0000)
         {
             // Then we do the update
             
             // Update this stripe of the H matrix
-            vH = _mm_max_epi16 (vH, vF);
-            vMaxColumn = _mm_max_epi16(vMaxColumn, vH);
-            _mm_store_si128 (pvHStore + j, vH);
+            vH = simde_mm_max_epi16 (vH, vF);
+            vMaxColumn = simde_mm_max_epi16(vMaxColumn, vH);
+            simde_mm_store_si128 (pvHStore + j, vH);
             
             // Update the E matrix for the next column
             // Since we may have changed the H matrix
             // This is to allow a gap-to-gap transition in the alignment
-            e = _mm_load_si128(pvE + j);
+            e = simde_mm_load_si128(pvE + j);
             // The H matrix can only get better, so the gap open scores can only
             // get better, so the E matrix can only get better too.
-            vTemp = _mm_subs_epu16(vH, vGapO);
-            e = _mm_max_epi16(e, vTemp);
-            _mm_store_si128(pvE + j, e);
+            vTemp = simde_mm_subs_epu16(vH, vGapO);
+            e = simde_mm_max_epi16(e, vTemp);
+            simde_mm_store_si128(pvE + j, e);
             // TODO: Instead of doing this, would it be smarter to just compute
             // the E matrix for each column when we're doing its H matrix? Or
             // would the extra buffer slow us down more than the extra compute?
@@ -1569,12 +1569,12 @@
             // Save the stripe of the F matrix
             // Only add in better F scores. Sometimes during this loop we'll
             // recompute worse ones.
-            vTemp = _mm_load_si128 (pvFStore + j);
-            vTemp = _mm_max_epi16 (vTemp, vF);
-            _mm_store_si128(pvFStore + j, vTemp);
+            vTemp = simde_mm_load_si128 (pvFStore + j);
+            vTemp = simde_mm_max_epi16 (vTemp, vF);
+            simde_mm_store_si128(pvFStore + j, vTemp);
 
             // Then think about extending
-            vF = _mm_subs_epu16 (vF, vGapE);
+            vF = simde_mm_subs_epu16 (vF, vGapE);
             // We never need to think about gap opens because nothing that came
             // from a gap open can ever change, because you won't close and then
             // immediately open a gap.
@@ -1584,27 +1584,27 @@
             {
                 // Wrap around to the next segment again
                 j = 0;
-                vF = _mm_slli_si128 (vF, 2);
+                vF = simde_mm_slli_si128 (vF, 2);
             }
 
             // Again compute if H or F needs updating based on this new set of F
             // values.
-            vH = _mm_load_si128 (pvHStore + j);
+            vH = simde_mm_load_si128 (pvHStore + j);
             
             // See if we beat the stored H
-            vTemp = _mm_cmpgt_epi16 (vF, vH);
-            cmp = _mm_movemask_epi8 (vTemp);
+            vTemp = simde_mm_cmpgt_epi16 (vF, vH);
+            cmp = simde_mm_movemask_epi8 (vTemp);
             // Or if we beat the stored F
-            vTemp = _mm_load_si128 (pvFStore + j);
-            vTemp = _mm_cmpgt_epi16 (vF, vTemp);
-            cmp |= _mm_movemask_epi8 (vTemp);
+            vTemp = simde_mm_load_si128 (pvFStore + j);
+            vTemp = simde_mm_cmpgt_epi16 (vF, vTemp);
+            cmp |= simde_mm_movemask_epi8 (vTemp);
         }
 
         // Now H, E, and F are all up to date with downwards gap propagations.
         
-        vMaxScore = _mm_max_epi16(vMaxScore, vMaxColumn);
-        vTemp = _mm_cmpeq_epi16(vMaxMark, vMaxScore);
-        cmp = _mm_movemask_epi8(vTemp);
+        vMaxScore = simde_mm_max_epi16(vMaxScore, vMaxColumn);
+        vTemp = simde_mm_cmpeq_epi16(vMaxMark, vMaxScore);
+        cmp = simde_mm_movemask_epi8(vTemp);
         if (cmp != 0xffff) {
             uint16_t temp;
             vMaxMark = vMaxScore;
@@ -1665,8 +1665,8 @@
 
     }
 
-    memcpy(alignment->seed.pvE,      pvE,      segLen*sizeof(__m128i));
-    memcpy(alignment->seed.pvHStore, pvHStore, segLen*sizeof(__m128i));
+    memcpy(alignment->seed.pvE,      pvE,      segLen*sizeof(simde__m128i));
+    memcpy(alignment->seed.pvHStore, pvHStore, segLen*sizeof(simde__m128i));
 
 
     /* Trace the alignment ending position on read. */
@@ -5078,28 +5078,28 @@
             exit(1);
         }
     }
-    __m128i vZero = _mm_set1_epi32(0);
+    simde__m128i vZero = simde_mm_set1_epi32(0);
     int32_t segLen = (readLen + 15) / 16;
     gssw_seed* seed = (gssw_seed*)calloc(1, sizeof(gssw_seed));
-    if (!(!posix_memalign((void**)&seed->pvE,      sizeof(__m128i), segLen*sizeof(__m128i)) &&
-          !posix_memalign((void**)&seed->pvHStore, sizeof(__m128i), segLen*sizeof(__m128i)))) {
+    if (!(!posix_memalign((void**)&seed->pvE,      sizeof(simde__m128i), segLen*sizeof(simde__m128i)) &&
+          !posix_memalign((void**)&seed->pvHStore, sizeof(simde__m128i), segLen*sizeof(simde__m128i)))) {
         fprintf(stderr, "error:[gssw] Could not allocate memory for alignment seed\n"); exit(1);
         exit(1);
     }
-    memset(seed->pvE,      0, segLen*sizeof(__m128i));
-    memset(seed->pvHStore, 0, segLen*sizeof(__m128i));
+    memset(seed->pvE,      0, segLen*sizeof(simde__m128i));
+    memset(seed->pvHStore, 0, segLen*sizeof(simde__m128i));
     // take the max of all inputs
-    __m128i pvE = vZero, pvH = vZero, ovE = vZero, ovH = vZero;
+    simde__m128i pvE = vZero, pvH = vZero, ovE = vZero, ovH = vZero;
     for (j = 0; j < segLen; ++j) {
         pvE = vZero; pvH = vZero;
         for (k = 0; k < count; ++k) {
-            ovE = _mm_load_si128(prev[k]->alignment->seed.pvE + j);
-            ovH = _mm_load_si128(prev[k]->alignment->seed.pvHStore + j);
-            pvE = _mm_max_epu8(pvE, ovE);
-            pvH = _mm_max_epu8(pvH, ovH);
+            ovE = simde_mm_load_si128(prev[k]->alignment->seed.pvE + j);
+            ovH = simde_mm_load_si128(prev[k]->alignment->seed.pvHStore + j);
+            pvE = simde_mm_max_epu8(pvE, ovE);
+            pvH = simde_mm_max_epu8(pvH, ovH);
         }
-        _mm_store_si128(seed->pvHStore + j, pvH);
-        _mm_store_si128(seed->pvE + j, pvE);
+        simde_mm_store_si128(seed->pvHStore + j, pvH);
+        simde_mm_store_si128(seed->pvE + j, pvE);
     }
     return seed;
 }
@@ -5113,28 +5113,28 @@
             exit(1);
         }
     }
-    __m128i vZero = _mm_set1_epi32(0);
+    simde__m128i vZero = simde_mm_set1_epi32(0);
     int32_t segLen = (readLen + 7) / 8;
     gssw_seed* seed = (gssw_seed*)calloc(1, sizeof(gssw_seed));
-    if (!(!posix_memalign((void**)&seed->pvE,      sizeof(__m128i), segLen*sizeof(__m128i)) &&
-          !posix_memalign((void**)&seed->pvHStore, sizeof(__m128i), segLen*sizeof(__m128i)))) {
+    if (!(!posix_memalign((void**)&seed->pvE,      sizeof(simde__m128i), segLen*sizeof(simde__m128i)) &&
+          !posix_memalign((void**)&seed->pvHStore, sizeof(simde__m128i), segLen*sizeof(simde__m128i)))) {
         fprintf(stderr, "error:[gssw] Could not allocate memory for alignment seed\n"); exit(1);
         exit(1);
     }
-    memset(seed->pvE,      0, segLen*sizeof(__m128i));
-    memset(seed->pvHStore, 0, segLen*sizeof(__m128i));
+    memset(seed->pvE,      0, segLen*sizeof(simde__m128i));
+    memset(seed->pvHStore, 0, segLen*sizeof(simde__m128i));
     // take the max of all inputs
-    __m128i pvE = vZero, pvH = vZero, ovE = vZero, ovH = vZero;
+    simde__m128i pvE = vZero, pvH = vZero, ovE = vZero, ovH = vZero;
     for (j = 0; j < segLen; ++j) {
         pvE = vZero; pvH = vZero;
         for (k = 0; k < count; ++k) {
-            ovE = _mm_load_si128(prev[k]->alignment->seed.pvE + j);
-            ovH = _mm_load_si128(prev[k]->alignment->seed.pvHStore + j);
-            pvE = _mm_max_epu16(pvE, ovE);
-            pvH = _mm_max_epu16(pvH, ovH);
+            ovE = simde_mm_load_si128(prev[k]->alignment->seed.pvE + j);
+            ovH = simde_mm_load_si128(prev[k]->alignment->seed.pvHStore + j);
+            pvE = simde_mm_max_epu16(pvE, ovE);
+            pvH = simde_mm_max_epu16(pvH, ovH);
         }
-        _mm_store_si128(seed->pvHStore + j, pvH);
-        _mm_store_si128(seed->pvE + j, pvE);
+        simde_mm_store_si128(seed->pvHStore + j, pvH);
+        simde_mm_store_si128(seed->pvE + j, pvE);
     }
     return seed;
 }
--- vg.orig/deps/gssw/src/gssw.h
+++ vg/deps/gssw/src/gssw.h
@@ -13,7 +13,7 @@
 #include <stdint.h>
 #include <string.h>
 #include <stdbool.h>
-#include <smmintrin.h>
+#include "../debian/include/simde/x86/sse4.1.h"
 
 /*!	@typedef	structure of the query profile	*/
 struct gssw_profile;
@@ -21,7 +21,7 @@
 
 // This file makes extensive use of SSE intrinsics, which operate on 128-bit
 // values. These values are stored in the 128-bit special registers XMM0 through
-// XMM7, as assigned by the compiler. In C, these values have type "__m128i",
+// XMM7, as assigned by the compiler. In C, these values have type "simde__m128i" (SIMDe's portable stand-in for "__m128i"),
 // with the "i" meaning integer (because we do integer math on them). We have
 // code to use these in alignments, either to store 16 single-byte values or 8
 // double-byte values, depending on how large we're allowing the scores to get.
@@ -63,9 +63,9 @@
 typedef struct {
     // Stores the E values (best gap in read scores) for the *next* column to be
     // generated, in the matrix to be filled. They are known in advance.
-    __m128i* pvE;
+    simde__m128i* pvE;
     // Stores the H values (overall best scores) from the previous column, before the matrix to be filled.
-    __m128i* pvHStore;
+    simde__m128i* pvHStore;
 } gssw_seed;
 
 
@@ -135,9 +135,9 @@
 // no data dependence between any entries, so we can just fill this in.
 struct gssw_profile{
     // We keep one version, stored striped, for byte-sized alignment
-	__m128i* profile_byte;	// 0: none
+	simde__m128i* profile_byte;	// 0: none
 	// And another for word-sized alignment.
-	__m128i* profile_word;	// 0: none
+	simde__m128i* profile_word;	// 0: none
 	const int8_t* read;
 	const int8_t* mat;
 	int32_t readLen;
--- vg.orig/deps/gssw/src/main.c
+++ vg/deps/gssw/src/main.c
@@ -6,7 +6,7 @@
 
 #include <stdlib.h>
 #include <stdint.h>
-#include <emmintrin.h>
+#include "../debian/include/simde/x86/sse2.h"
 #include <zlib.h>
 #include <stdio.h>
 #include <time.h>
--- vg.orig/deps/libbdsg/Makefile
+++ vg/deps/libbdsg/Makefile
@@ -12,7 +12,7 @@
 
 OBJS:=$(OBJ_DIR)/eades_algorithm.o $(OBJ_DIR)/hash_graph.o $(OBJ_DIR)/is_single_stranded.o $(OBJ_DIR)/node.o $(OBJ_DIR)/odgi.o $(OBJ_DIR)/packed_graph.o $(OBJ_DIR)/packed_structs.o $(OBJ_DIR)/path_position_overlays.o $(OBJ_DIR)/packed_path_position_overlays.o $(OBJ_DIR)/vectorizable_overlays.o $(OBJ_DIR)/split_strand_graph.o $(OBJ_DIR)/utility.o
 
-CXXFLAGS :=-O3 -Werror=return-type -std=c++14 -ggdb -g -msse4.2 -I$(INC_DIR) $(CXXFLAGS)
+CXXFLAGS :=-O3 -Werror=return-type -std=c++14 -ggdb -g -I$(INC_DIR) $(CXXFLAGS)
 
 ifeq ($(shell uname -s),Darwin)
 	CXXFLAGS := $(CXXFLAGS) -Xpreprocessor -fopenmp
--- vg.orig/deps/sdsl-lite/CMakeLists.txt
+++ vg/deps/sdsl-lite/CMakeLists.txt
@@ -82,19 +82,6 @@
   endif()
 endif()
 
-include(CheckSSE4_2)
-if( BUILTIN_POPCNT )
-  if( CMAKE_COMPILER_IS_GNUCXX )
-    append_cxx_compiler_flags("-msse4.2 -march=native" "GCC" CMAKE_CXX_OPT_FLAGS)
-  endif()
-  if( CMAKE_COMPILER_IS_GNUCXX )
-    append_cxx_compiler_flags("-msse4.2 -march=native" "CLANG" CMAKE_CXX_OPT_FLAGS)
-  endif()
-  if( CMAKE_COMPILER_IS_INTEL )
-    append_cxx_compiler_flags("-msse4.2 -march=native" "INTEL" CMAKE_CXX_FLAGS)
-  endif()
-endif()
-
 # check for demangle support to get pretty C++ class names
 include(FindCxaDemangle)
 if( HAVE_CXA_DEMANGLE )
--- vg.orig/deps/vowpal_wabbit/Makefile
+++ vg/deps/vowpal_wabbit/Makefile
@@ -61,7 +61,7 @@
 ifeq ($(ARCH_UNAME), ppc64le)
   OPTIM_FLAGS ?= -DNDEBUG -O3 -fomit-frame-pointer -fno-strict-aliasing #-msse2 is not supported on power
 else
-  OPTIM_FLAGS ?= -DNDEBUG -O3 -fomit-frame-pointer -fno-strict-aliasing -msse2 -mfpmath=sse #-ffast-math #uncomment for speed, comment for testability
+  OPTIM_FLAGS ?= -DNDEBUG -O3 -fomit-frame-pointer -fno-strict-aliasing #-ffast-math #uncomment for speed, comment for testability
 endif
 
 ifeq ($(UNAME), FreeBSD)
--- vg.orig/Makefile
+++ vg/Makefile
@@ -16,7 +16,8 @@
 CWD:=$(shell pwd)
 CXX ?= g++
 
-EXE:=vg
+SFX :=
+EXE:=vg$(SFX)
 
 all: $(BIN_DIR)/$(EXE)
 
@@ -595,7 +596,6 @@
 	$(RM) -f $(INC_DIR)/vg_system_version.hpp
 
 clean:
-	$(RM) -r $(BIN_DIR)
 	$(RM) -r $(LIB_DIR)
 	$(RM) -r $(UNITTEST_OBJ_DIR)
 	$(RM) -r $(SUBCOMMAND_OBJ_DIR)
--- vg.orig/deps/structures/Makefile
+++ vg/deps/structures/Makefile
@@ -9,7 +9,7 @@
 TESTOBJ =$(OBJDIR)/tests.o
 HEADERS = $(INCDIR)/suffix_tree.hpp $(INCDIR)/union_find.hpp $(INCDIR)/min_max_heap.hpp $(INCDIR)/immutable_list.hpp $(INCDIR)/stable_double.hpp $(INCDIR)/rank_pairing_heap.hpp
 CXX = g++
-CPPFLAGS += -std=c++11 -m64 -g -O3 -I$(INCSEARCHDIR)
+CPPFLAGS += -std=c++11 -g -O3 -I$(INCSEARCHDIR)
 
 
 all: 
