SeqAn3  3.0.3
The Modern C++ library for sequence analysis.
simd_algorithm_avx2.hpp
// -----------------------------------------------------------------------------------------------------
// Copyright (c) 2006-2020, Knut Reinert & Freie Universität Berlin
// Copyright (c) 2016-2020, Knut Reinert & MPI für molekulare Genetik
// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
// -----------------------------------------------------------------------------------------------------

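//!\file
//!\brief Provides algorithm implementations that use AVX2 intrinsics.
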
#pragma once

#include <array>

#include <seqan3/core/simd/concept.hpp>
#include <seqan3/core/simd/detail/builtin_simd.hpp>
#include <seqan3/core/simd/detail/builtin_simd_intrinsics.hpp>
#include <seqan3/core/simd/simd_traits.hpp>

//-----------------------------------------------------------------------------
// forward declare avx2 simd algorithms that use avx2 intrinsics
//-----------------------------------------------------------------------------

namespace seqan3::detail
{
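
//!\brief Load 256 bits of integral data from memory (the memory does not need to be aligned).
//!\tparam simd_t   The simd type; must model seqan3::simd::simd_concept.
//!\param  mem_addr The memory address to load from.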
template <simd::simd_concept simd_t>
constexpr simd_t load_avx2(void const * mem_addr);

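//!\brief Transpose the given simd vector matrix (for AVX2: a 32x32 byte matrix) in place.
//!\tparam simd_t The simd type; must model seqan3::simd::simd_concept.
//!\param  matrix The matrix to transpose, stored as one simd vector per row.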
template <simd::simd_concept simd_t>
inline void transpose_matrix_avx2(std::array<simd_t, simd_traits<simd_t>::length> & matrix);

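//!\brief Upcast the lower elements of src to the wider scalar type of target_simd_t using sign extension.
//!\tparam target_simd_t The target simd type; must model seqan3::simd::simd_concept.
//!\tparam source_simd_t The source simd type; must model seqan3::simd::simd_concept.
//!\param  src The vector whose lower 128 bits are converted.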
template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
constexpr target_simd_t upcast_signed_avx2(source_simd_t const & src);

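//!\brief Upcast the lower elements of src to the wider scalar type of target_simd_t using zero extension.
//!\tparam target_simd_t The target simd type; must model seqan3::simd::simd_concept.
//!\tparam source_simd_t The source simd type; must model seqan3::simd::simd_concept.
//!\param  src The vector whose lower 128 bits are converted.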
template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
constexpr target_simd_t upcast_unsigned_avx2(source_simd_t const & src);

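//!\brief Extract the index-th 128-bit half of src and return it in the lower half of the result.
//!\tparam index  Which half to extract (0 = lower, 1 = upper).
//!\tparam simd_t The simd type; must model seqan3::simd::simd_concept.
//!\param  src The vector to extract from.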
template <uint8_t index, simd::simd_concept simd_t>
constexpr simd_t extract_half_avx2(simd_t const & src);

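//!\brief Extract the index-th 64-bit quarter of src and return it in the lowest 64 bits of the result.
//!\tparam index  Which quarter to extract (0..3).
//!\tparam simd_t The simd type; must model seqan3::simd::simd_concept.
//!\param  src The vector to extract from.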
template <uint8_t index, simd::simd_concept simd_t>
constexpr simd_t extract_quarter_avx2(simd_t const & src);

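//!\brief Extract the index-th 32-bit eighth of src and return it in the lowest 32 bits of the result.
//!\tparam index  Which eighth to extract (0..7).
//!\tparam simd_t The simd type; must model seqan3::simd::simd_concept.
//!\param  src The vector to extract from.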
template <uint8_t index, simd::simd_concept simd_t>
constexpr simd_t extract_eighth_avx2(simd_t const & src);

} // namespace seqan3::detail

//-----------------------------------------------------------------------------
// implementation
//-----------------------------------------------------------------------------

#ifdef __AVX2__

namespace seqan3::detail
{

template <simd::simd_concept simd_t>
constexpr simd_t load_avx2(void const * mem_addr)
{
    return reinterpret_cast<simd_t>(_mm256_loadu_si256(reinterpret_cast<__m256i const *>(mem_addr)));
}
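
// Usage sketch (illustrative only; the alias int8x32_t is an assumption, e.g. a 32-lane builtin
// simd type obtained via seqan3::simd::simd_type_t):
//
//     using int8x32_t = seqan3::simd::simd_type_t<int8_t, 32>;
//     std::array<int8_t, 32> buffer{};                              // unaligned source memory is fine
//     int8x32_t vec = load_avx2<int8x32_t>(buffer.data());          // wraps _mm256_loadu_si256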

template <simd::simd_concept simd_t>
inline void transpose_matrix_avx2(std::array<simd_t, simd_traits<simd_t>::length> & matrix)
{
    // emulate missing _mm256_unpacklo_epi128/_mm256_unpackhi_epi128 instructions
    auto _mm256_unpacklo_epi128 = [] (__m256i const & a, __m256i const & b)
    {
        return _mm256_permute2x128_si256(a, b, 0x20);
    };

    auto _mm256_unpackhi_epi128 = [] (__m256i const & a, __m256i const & b)
    {
        return _mm256_permute2x128_si256(a, b, 0x31);
    };

    // A look-up table to reverse the lowest 4 bits in order to permute the transposed rows.
    static const uint8_t bit_rev[] = { 0,  8,  4, 12,  2, 10,  6, 14,  1,  9,  5, 13,  3, 11,  7, 15,
                                      16, 24, 20, 28, 18, 26, 22, 30, 17, 25, 21, 29, 19, 27, 23, 31};

    // transpose a 32x32 byte matrix by interleaving pairs of rows at increasing element widths
    __m256i tmp1[32];
    for (int i = 0; i < 16; ++i)  // round 1: interleave 8-bit elements of adjacent rows
    {
        tmp1[i]      = _mm256_unpacklo_epi8(reinterpret_cast<__m256i const &>(matrix[2 * i]),
                                            reinterpret_cast<__m256i const &>(matrix[2 * i + 1]));
        tmp1[i + 16] = _mm256_unpackhi_epi8(reinterpret_cast<__m256i const &>(matrix[2 * i]),
                                            reinterpret_cast<__m256i const &>(matrix[2 * i + 1]));
    }
    __m256i tmp2[32];
    for (int i = 0; i < 16; ++i)  // round 2: interleave 16-bit elements
    {
        tmp2[i]      = _mm256_unpacklo_epi16(tmp1[2 * i], tmp1[2 * i + 1]);
        tmp2[i + 16] = _mm256_unpackhi_epi16(tmp1[2 * i], tmp1[2 * i + 1]);
    }
    for (int i = 0; i < 16; ++i)  // round 3: interleave 32-bit elements
    {
        tmp1[i]      = _mm256_unpacklo_epi32(tmp2[2 * i], tmp2[2 * i + 1]);
        tmp1[i + 16] = _mm256_unpackhi_epi32(tmp2[2 * i], tmp2[2 * i + 1]);
    }
    for (int i = 0; i < 16; ++i)  // round 4: interleave 64-bit elements
    {
        tmp2[i]      = _mm256_unpacklo_epi64(tmp1[2 * i], tmp1[2 * i + 1]);
        tmp2[i + 16] = _mm256_unpackhi_epi64(tmp1[2 * i], tmp1[2 * i + 1]);
    }
    for (int i = 0; i < 16; ++i)  // round 5: interleave 128-bit lanes and undo the bit-reversed row order
    {
        matrix[bit_rev[i]]      = reinterpret_cast<simd_t>(_mm256_unpacklo_epi128(tmp2[2 * i], tmp2[2 * i + 1]));
        matrix[bit_rev[i + 16]] = reinterpret_cast<simd_t>(_mm256_unpackhi_epi128(tmp2[2 * i], tmp2[2 * i + 1]));
    }
}
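
// Explanatory note: the transpose is a classic unpack cascade. Each of the five rounds
// interleaves pairs of rows at twice the previous element width (8 -> 16 -> 32 -> 64 -> 128 bit),
// so after log2(32) = 5 rounds every byte sits in its transposed position. Because the repeated
// lo/hi interleaving emits the output rows in bit-reversed order, the final round writes each
// result back through the bit_rev table to restore the correct row index.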

template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
constexpr target_simd_t upcast_signed_avx2(source_simd_t const & src)
{
    __m128i const & tmp = _mm256_castsi256_si128(reinterpret_cast<__m256i const &>(src));
    if constexpr (simd_traits<source_simd_t>::length == 32) // cast from epi8 ...
    {
        if constexpr (simd_traits<target_simd_t>::length == 16) // to epi16
            return reinterpret_cast<target_simd_t>(_mm256_cvtepi8_epi16(tmp));
        if constexpr (simd_traits<target_simd_t>::length == 8) // to epi32
            return reinterpret_cast<target_simd_t>(_mm256_cvtepi8_epi32(tmp));
        if constexpr (simd_traits<target_simd_t>::length == 4) // to epi64
            return reinterpret_cast<target_simd_t>(_mm256_cvtepi8_epi64(tmp));
    }
    else if constexpr (simd_traits<source_simd_t>::length == 16) // cast from epi16 ...
    {
        if constexpr (simd_traits<target_simd_t>::length == 8) // to epi32
            return reinterpret_cast<target_simd_t>(_mm256_cvtepi16_epi32(tmp));
        if constexpr (simd_traits<target_simd_t>::length == 4) // to epi64
            return reinterpret_cast<target_simd_t>(_mm256_cvtepi16_epi64(tmp));
    }
    else // cast from epi32 to epi64
    {
        static_assert(simd_traits<source_simd_t>::length == 8, "Expected 32 bit scalar type.");
        return reinterpret_cast<target_simd_t>(_mm256_cvtepi32_epi64(tmp));
    }
}
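
// Usage sketch (illustrative only; the aliases are assumptions, e.g. via seqan3::simd::simd_type_t):
// sign-extend the lower 16 int8_t lanes of a 32-lane vector into 16 int16_t lanes.
//
//     using int8x32_t  = seqan3::simd::simd_type_t<int8_t, 32>;
//     using int16x16_t = seqan3::simd::simd_type_t<int16_t, 16>;
//     int8x32_t  src{};
//     int16x16_t dst = upcast_signed_avx2<int16x16_t>(src);        // dispatches to _mm256_cvtepi8_epi16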

template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
constexpr target_simd_t upcast_unsigned_avx2(source_simd_t const & src)
{
    __m128i const & tmp = _mm256_castsi256_si128(reinterpret_cast<__m256i const &>(src));
    if constexpr (simd_traits<source_simd_t>::length == 32) // cast from epi8 ...
    {
        if constexpr (simd_traits<target_simd_t>::length == 16) // to epi16
            return reinterpret_cast<target_simd_t>(_mm256_cvtepu8_epi16(tmp));
        if constexpr (simd_traits<target_simd_t>::length == 8) // to epi32
            return reinterpret_cast<target_simd_t>(_mm256_cvtepu8_epi32(tmp));
        if constexpr (simd_traits<target_simd_t>::length == 4) // to epi64
            return reinterpret_cast<target_simd_t>(_mm256_cvtepu8_epi64(tmp));
    }
    else if constexpr (simd_traits<source_simd_t>::length == 16) // cast from epi16 ...
    {
        if constexpr (simd_traits<target_simd_t>::length == 8) // to epi32
            return reinterpret_cast<target_simd_t>(_mm256_cvtepu16_epi32(tmp));
        if constexpr (simd_traits<target_simd_t>::length == 4) // to epi64
            return reinterpret_cast<target_simd_t>(_mm256_cvtepu16_epi64(tmp));
    }
    else // cast from epi32 to epi64
    {
        static_assert(simd_traits<source_simd_t>::length == 8, "Expected 32 bit scalar type.");
        return reinterpret_cast<target_simd_t>(_mm256_cvtepu32_epi64(tmp));
    }
}

template <uint8_t index, simd::simd_concept simd_t>
constexpr simd_t extract_half_avx2(simd_t const & src)
{
    return reinterpret_cast<simd_t>(_mm256_castsi128_si256(
        _mm256_extracti128_si256(reinterpret_cast<__m256i const &>(src), index)));
}
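
// Note: _mm256_extracti128_si256 selects the 128-bit half given by index, and
// _mm256_castsi128_si256 places it in the lower 128 bits of the result; the upper 128 bits of
// the returned vector are left unspecified by the cast. extract_quarter_avx2 and
// extract_eighth_avx2 below follow the same pattern with 64-bit and 32-bit elements.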

template <uint8_t index, simd::simd_concept simd_t>
constexpr simd_t extract_quarter_avx2(simd_t const & src)
{
    return reinterpret_cast<simd_t>(_mm256_castsi128_si256(
        _mm_cvtsi64x_si128(_mm256_extract_epi64(reinterpret_cast<__m256i const &>(src), index))));
}

template <uint8_t index, simd::simd_concept simd_t>
constexpr simd_t extract_eighth_avx2(simd_t const & src)
{
    return reinterpret_cast<simd_t>(_mm256_castsi128_si256(
        _mm_cvtsi32_si128(_mm256_extract_epi32(reinterpret_cast<__m256i const &>(src), index))));
}
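
// Usage sketch (illustrative only; the alias is an assumption, e.g. via seqan3::simd::simd_type_t):
// move the third 64-bit quarter of a 4-lane vector into the lowest lane of the result.
//
//     using int64x4_t = seqan3::simd::simd_type_t<int64_t, 4>;
//     int64x4_t src{1, 2, 3, 4};
//     int64x4_t quarter = extract_quarter_avx2<2>(src);            // quarter[0] == 3; upper lanes are not meaningful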

} // namespace seqan3::detail

#endif // __AVX2__