SeqAn3  3.0.3
The Modern C++ library for sequence analysis.
format_bam.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2020, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2020, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <seqan3/std/bit>
16 #include <seqan3/std/concepts>
17 #include <iterator>
18 #include <seqan3/std/ranges>
19 #include <string>
20 #include <vector>
21 
48 
49 namespace seqan3
50 {
51 
63 {
64 public:
68  // string_buffer is of type std::string and has some problems with pre-C++11 ABI
69  format_bam() = default;
70  format_bam(format_bam const &) = default;
71  format_bam & operator=(format_bam const &) = default;
72  format_bam(format_bam &&) = default;
73  format_bam & operator=(format_bam &&) = default;
74  ~format_bam() = default;
76 
79  {
80  { "bam" }
81  };
82 
83 protected:
84  template <typename stream_type, // constraints checked by file
85  typename seq_legal_alph_type,
86  typename ref_seqs_type,
87  typename ref_ids_type,
88  typename seq_type,
89  typename id_type,
90  typename offset_type,
91  typename ref_seq_type,
92  typename ref_id_type,
93  typename ref_offset_type,
94  typename align_type,
95  typename cigar_type,
96  typename flag_type,
97  typename mapq_type,
98  typename qual_type,
99  typename mate_type,
100  typename tag_dict_type,
101  typename e_value_type,
102  typename bit_score_type>
103  void read_alignment_record(stream_type & stream,
104  alignment_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
105  ref_seqs_type & ref_seqs,
107  seq_type & seq,
108  qual_type & qual,
109  id_type & id,
110  offset_type & offset,
111  ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
112  ref_id_type & ref_id,
113  ref_offset_type & ref_offset,
114  align_type & align,
115  cigar_type & cigar_vector,
116  flag_type & flag,
117  mapq_type & mapq,
118  mate_type & mate,
119  tag_dict_type & tag_dict,
120  e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
121  bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score));
122 
123  template <typename stream_type,
124  typename header_type,
125  typename seq_type,
126  typename id_type,
127  typename ref_seq_type,
128  typename ref_id_type,
129  typename align_type,
130  typename cigar_type,
131  typename qual_type,
132  typename mate_type,
133  typename tag_dict_type>
134  void write_alignment_record([[maybe_unused]] stream_type & stream,
135  [[maybe_unused]] alignment_file_output_options const & options,
136  [[maybe_unused]] header_type && header,
137  [[maybe_unused]] seq_type && seq,
138  [[maybe_unused]] qual_type && qual,
139  [[maybe_unused]] id_type && id,
140  [[maybe_unused]] int32_t const offset,
141  [[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
142  [[maybe_unused]] ref_id_type && ref_id,
143  [[maybe_unused]] std::optional<int32_t> ref_offset,
144  [[maybe_unused]] align_type && align,
145  [[maybe_unused]] cigar_type && cigar_vector,
146  [[maybe_unused]] sam_flag const flag,
147  [[maybe_unused]] uint8_t const mapq,
148  [[maybe_unused]] mate_type && mate,
149  [[maybe_unused]] tag_dict_type && tag_dict,
150  [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(e_value),
151  [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(bit_score));
152 
153 private:
155  bool header_was_read{false};
156 
159 
162  { // naming corresponds to official SAM/BAM specifications
163  int32_t block_size;
164  int32_t refID;
165  int32_t pos;
166  uint32_t l_read_name:8;
167  uint32_t mapq:8;
168  uint32_t bin:16;
169  uint32_t n_cigar_op:16;
171  int32_t l_seq;
172  int32_t next_refID;
173  int32_t next_pos;
174  int32_t tlen;
175  };
176 
179  {
180  [] () constexpr
181  {
183 
184  using index_t = std::make_unsigned_t<char>;
185 
186  // ret['M'] = 0; set anyway by initialization
187  ret[static_cast<index_t>('I')] = 1;
188  ret[static_cast<index_t>('D')] = 2;
189  ret[static_cast<index_t>('N')] = 3;
190  ret[static_cast<index_t>('S')] = 4;
191  ret[static_cast<index_t>('H')] = 5;
192  ret[static_cast<index_t>('P')] = 6;
193  ret[static_cast<index_t>('=')] = 7;
194  ret[static_cast<index_t>('X')] = 8;
195 
196  return ret;
197  }()
198  };
199 
201  static uint16_t reg2bin(int32_t beg, int32_t end) noexcept
202  {
203  --end;
204  if (beg >> 14 == end >> 14) return ((1 << 15) - 1) / 7 + (beg >> 14);
205  if (beg >> 17 == end >> 17) return ((1 << 12) - 1) / 7 + (beg >> 17);
206  if (beg >> 20 == end >> 20) return ((1 << 9) - 1) / 7 + (beg >> 20);
207  if (beg >> 23 == end >> 23) return ((1 << 6) - 1) / 7 + (beg >> 23);
208  if (beg >> 26 == end >> 26) return ((1 << 3) - 1) / 7 + (beg >> 26);
209  return 0;
210  }
211 
212  using format_sam_base::read_field; // inherit read_field functions from format_base explicitly
213 
220  template <typename stream_view_type, std::integral number_type>
221  void read_field(stream_view_type && stream_view, number_type & target)
222  {
223  std::ranges::copy_n(std::ranges::begin(stream_view), sizeof(target), reinterpret_cast<char *>(&target));
224  }
225 
231  template <typename stream_view_type>
232  void read_field(stream_view_type && stream_view, float & target)
233  {
234  std::ranges::copy_n(std::ranges::begin(stream_view), sizeof(int32_t), reinterpret_cast<char *>(&target));
235  }
236 
247  template <typename stream_view_type, typename optional_value_type>
248  void read_field(stream_view_type && stream_view, std::optional<optional_value_type> & target)
249  {
250  optional_value_type tmp;
251  read_field(std::forward<stream_view_type>(stream_view), tmp);
252  target = tmp;
253  }
254 
255  template <typename stream_view_type, typename value_type>
257  stream_view_type && stream_view,
258  value_type const & SEQAN3_DOXYGEN_ONLY(value));
259 
260  template <typename stream_view_type>
261  void read_field(stream_view_type && stream_view, sam_tag_dictionary & target);
262 
263  template <typename cigar_input_type>
264  auto parse_binary_cigar(cigar_input_type && cigar_input, uint16_t n_cigar_op) const;
265 
266  static std::string get_tag_dict_str(sam_tag_dictionary const & tag_dict);
267 };
268 
270 template <typename stream_type, // constraints checked by file
271  typename seq_legal_alph_type,
272  typename ref_seqs_type,
273  typename ref_ids_type,
274  typename seq_type,
275  typename id_type,
276  typename offset_type,
277  typename ref_seq_type,
278  typename ref_id_type,
279  typename ref_offset_type,
280  typename align_type,
281  typename cigar_type,
282  typename flag_type,
283  typename mapq_type,
284  typename qual_type,
285  typename mate_type,
286  typename tag_dict_type,
287  typename e_value_type,
288  typename bit_score_type>
289 inline void format_bam::read_alignment_record(stream_type & stream,
290  alignment_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
291  ref_seqs_type & ref_seqs,
293  seq_type & seq,
294  qual_type & qual,
295  id_type & id,
296  offset_type & offset,
297  ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
298  ref_id_type & ref_id,
299  ref_offset_type & ref_offset,
300  align_type & align,
301  cigar_type & cigar_vector,
302  flag_type & flag,
303  mapq_type & mapq,
304  mate_type & mate,
305  tag_dict_type & tag_dict,
306  e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
307  bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score))
308 {
309  static_assert(detail::decays_to_ignore_v<ref_offset_type> ||
310  detail::is_type_specialisation_of_v<ref_offset_type, std::optional>,
311  "The ref_offset must be a specialisation of std::optional.");
312 
313  static_assert(detail::decays_to_ignore_v<mapq_type> || std::same_as<mapq_type, uint8_t>,
314  "The type of field::mapq must be uint8_t.");
315 
316  static_assert(detail::decays_to_ignore_v<flag_type> || std::same_as<flag_type, sam_flag>,
317  "The type of field::flag must be seqan3::sam_flag.");
318 
319  auto stream_view = seqan3::views::istreambuf(stream);
320 
321  // these variables need to be stored to compute the ALIGNMENT
322  [[maybe_unused]] int32_t offset_tmp{};
323  [[maybe_unused]] int32_t soft_clipping_end{};
324  [[maybe_unused]] std::vector<cigar> tmp_cigar_vector{};
325  [[maybe_unused]] int32_t ref_length{0}, seq_length{0}; // length of aligned part for ref and query
326 
327  // Header
328  // -------------------------------------------------------------------------------------------------------------
329  if (!header_was_read)
330  {
331  // magic BAM string
332  if (!std::ranges::equal(stream_view | views::take_exactly_or_throw(4), std::string_view{"BAM\1"}))
333  throw format_error{"File is not in BAM format."};
334 
335  int32_t tmp32{};
336  read_field(stream_view, tmp32);
337 
338  if (tmp32 > 0) // header text is present
339  read_header(stream_view | views::take_exactly_or_throw(tmp32), header, ref_seqs);
340 
341  int32_t n_ref;
342  read_field(stream_view, n_ref);
343 
344  for (int32_t ref_idx = 0; ref_idx < n_ref; ++ref_idx)
345  {
346  read_field(stream_view, tmp32); // l_name (length of reference name including \0 character)
347 
348  string_buffer.resize(tmp32 - 1);
349  std::ranges::copy_n(std::ranges::begin(stream_view), tmp32 - 1, string_buffer.data()); // copy without \0 character
350  std::ranges::next(std::ranges::begin(stream_view)); // skip \0 character
351 
352  read_field(stream_view, tmp32); // l_ref (length of reference sequence)
353 
354  auto id_it = header.ref_dict.find(string_buffer);
355 
356  // sanity checks of reference information to existing header object:
357  if (id_it == header.ref_dict.end()) // [unlikely]
358  {
359  throw format_error{detail::to_string("Unknown reference name '" + string_buffer +
360  "' found in BAM file header (header.ref_ids():",
361  header.ref_ids(), ").")};
362  }
363  else if (id_it->second != ref_idx) // [unlikely]
364  {
365  throw format_error{detail::to_string("Reference id '", string_buffer, "' at position ", ref_idx,
366  " does not correspond to the position ", id_it->second,
367  " in the header (header.ref_ids():", header.ref_ids(), ").")};
368  }
369  else if (std::get<0>(header.ref_id_info[id_it->second]) != tmp32) // [unlikely]
370  {
371  throw format_error{"Provided reference has unequal length as specified in the header."};
372  }
373  }
374 
375  header_was_read = true;
376 
377  if (std::ranges::begin(stream_view) == std::ranges::end(stream_view)) // no records follow
378  return;
379  }
380 
381  // read alignment record into buffer
382  // -------------------------------------------------------------------------------------------------------------
384  std::ranges::copy(stream_view | views::take_exactly_or_throw(sizeof(core)), reinterpret_cast<char *>(&core));
385 
386  if (core.refID >= static_cast<int32_t>(header.ref_ids().size()) || core.refID < -1) // [[unlikely]]
387  {
388  throw format_error{detail::to_string("Reference id index '", core.refID, "' is not in range of ",
389  "header.ref_ids(), which has size ", header.ref_ids().size(), ".")};
390  }
391  else if (core.refID > -1) // not unmapped
392  {
393  ref_id = core.refID; // field::ref_id
394  }
395 
396  flag = core.flag; // field::flag
397  mapq = core.mapq; // field::mapq
398 
399  if (core.pos > -1) // [[likely]]
400  ref_offset = core.pos; // field::ref_offset
401 
402  if constexpr (!detail::decays_to_ignore_v<mate_type>) // field::mate
403  {
404  if (core.next_refID > -1)
405  get<0>(mate) = core.next_refID;
406 
407  if (core.next_pos > -1) // [[likely]]
408  get<1>(mate) = core.next_pos;
409 
410  get<2>(mate) = core.tlen;
411  }
412 
413  // read id
414  // -------------------------------------------------------------------------------------------------------------
415  read_field(stream_view | views::take_exactly_or_throw(core.l_read_name - 1), id); // field::id
416  std::ranges::next(std::ranges::begin(stream_view)); // skip '\0'
417 
418  // read cigar string
419  // -------------------------------------------------------------------------------------------------------------
420  if constexpr (!detail::decays_to_ignore_v<align_type> || !detail::decays_to_ignore_v<cigar_type>)
421  {
422  std::tie(tmp_cigar_vector, ref_length, seq_length) = parse_binary_cigar(stream_view, core.n_cigar_op);
423  transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
424  // the actual cigar_vector is swapped with tmp_cigar_vector at the end to avoid copying
425  }
426  else
427  {
429  }
430 
431  offset = offset_tmp;
432 
433  // read sequence
434  // -------------------------------------------------------------------------------------------------------------
435  if (core.l_seq > 0) // sequence information is given
436  {
437  auto seq_stream = stream_view
438  | views::take_exactly_or_throw(core.l_seq / 2) // one too short if uneven
440  {
441  return {sam_dna16{}.assign_rank(std::min(15, static_cast<uint8_t>(c) >> 4)),
442  sam_dna16{}.assign_rank(std::min(15, static_cast<uint8_t>(c) & 0x0f))};
443  });
444 
445  if constexpr (detail::decays_to_ignore_v<seq_type>)
446  {
447  if constexpr (!detail::decays_to_ignore_v<align_type>)
448  {
449  static_assert(sequence_container<std::remove_reference_t<decltype(get<1>(align))>>,
450  "If you want to read ALIGNMENT but not SEQ, the alignment"
451  " object must store a sequence container at the second (query) position.");
452 
453  if (!tmp_cigar_vector.empty()) // only parse alignment if cigar information was given
454  {
455  assert(core.l_seq == (seq_length + offset_tmp + soft_clipping_end)); // sanity check
456  using alph_t = std::ranges::range_value_t<decltype(get<1>(align))>;
457  constexpr auto from_dna16 = detail::convert_through_char_representation<alph_t, sam_dna16>;
458 
459  get<1>(align).reserve(seq_length);
460 
461  auto tmp_iter = std::ranges::begin(seq_stream);
462  std::ranges::advance(tmp_iter, offset_tmp / 2); // skip soft clipped bases at the beginning
463 
464  if (offset_tmp & 1)
465  {
466  get<1>(align).push_back(from_dna16[to_rank(get<1>(*tmp_iter))]);
467  ++tmp_iter;
468  --seq_length;
469  }
470 
471  for (size_t i = (seq_length / 2); i > 0; --i)
472  {
473  get<1>(align).push_back(from_dna16[to_rank(get<0>(*tmp_iter))]);
474  get<1>(align).push_back(from_dna16[to_rank(get<1>(*tmp_iter))]);
475  ++tmp_iter;
476  }
477 
478  if (seq_length & 1)
479  {
480  get<1>(align).push_back(from_dna16[to_rank(get<0>(*tmp_iter))]);
481  ++tmp_iter;
482  --soft_clipping_end;
483  }
484 
485  std::ranges::advance(tmp_iter, (soft_clipping_end + !(seq_length & 1)) / 2);
486  }
487  else
488  {
489  get<1>(align) = std::remove_reference_t<decltype(get<1>(align))>{}; // assign empty container
490  }
491  }
492  else
493  {
494  detail::consume(seq_stream);
495  if (core.l_seq & 1)
496  std::ranges::next(std::ranges::begin(stream_view));
497  }
498  }
499  else
500  {
501  using alph_t = std::ranges::range_value_t<decltype(seq)>;
502  constexpr auto from_dna16 = detail::convert_through_char_representation<alph_t, sam_dna16>;
503 
504  for (auto [d1, d2] : seq_stream)
505  {
506  seq.push_back(from_dna16[to_rank(d1)]);
507  seq.push_back(from_dna16[to_rank(d2)]);
508  }
509 
510  if (core.l_seq & 1)
511  {
512  sam_dna16 d = sam_dna16{}.assign_rank(std::min(15, static_cast<uint8_t>(*std::ranges::begin(stream_view)) >> 4));
513  seq.push_back(from_dna16[to_rank(d)]);
514  std::ranges::next(std::ranges::begin(stream_view));
515  }
516 
517  if constexpr (!detail::decays_to_ignore_v<align_type>)
518  {
519  assign_unaligned(get<1>(align),
520  seq | views::slice(static_cast<std::ranges::range_difference_t<seq_type>>(offset_tmp),
521  std::ranges::distance(seq) - soft_clipping_end));
522  }
523  }
524  }
525 
526  // read qual string
527  // -------------------------------------------------------------------------------------------------------------
528  read_field(stream_view | views::take_exactly_or_throw(core.l_seq)
529  | std::views::transform([] (char chr) { return static_cast<char>(chr + 33); }), qual);
530 
531  // All remaining optional fields if any: SAM tags dictionary
532  // -------------------------------------------------------------------------------------------------------------
533  int32_t remaining_bytes = core.block_size - (sizeof(alignment_record_core) - 4/*block_size excluded*/) -
534  core.l_read_name - core.n_cigar_op * 4 - (core.l_seq + 1) / 2 - core.l_seq;
535  assert(remaining_bytes >= 0);
536  auto tags_view = stream_view | views::take_exactly_or_throw(remaining_bytes);
537 
538  while (tags_view.size() > 0)
539  read_field(tags_view, tag_dict);
540 
541  // DONE READING - wrap up
542  // -------------------------------------------------------------------------------------------------------------
543  if constexpr (!detail::decays_to_ignore_v<align_type> || !detail::decays_to_ignore_v<cigar_type>)
544  {
545  // Check cigar, if it matches ‘kSmN’, where ‘k’ equals lseq, ‘m’ is the reference sequence length in the
546  // alignment, and ‘S’ and ‘N’ are the soft-clipping and reference-clip, then the cigar string was larger
547  // than 65535 operations and is stored in the sam_tag_dictionary (tag GC).
548  if (core.l_seq != 0 && offset_tmp == core.l_seq)
549  {
550  if constexpr (detail::decays_to_ignore_v<tag_dict_type> | detail::decays_to_ignore_v<seq_type>)
551  { // maybe only throw in debug mode and otherwise return an empty alignment?
552  throw format_error{detail::to_string("The cigar string '", offset_tmp, "S", ref_length,
553  "N' suggests that the cigar string exceeded 65535 elements and was therefore ",
554  "stored in the optional field CG. You need to read in the field::tags and "
555  "field::seq in order to access this information.")};
556  }
557  else
558  {
559  auto it = tag_dict.find("CG"_tag);
560 
561  if (it == tag_dict.end())
562  throw format_error{detail::to_string("The cigar string '", offset_tmp, "S", ref_length,
563  "N' suggests that the cigar string exceeded 65535 elements and was therefore ",
564  "stored in the optional field CG but this tag is not present in the given ",
565  "record.")};
566 
567  auto cigar_view = std::views::all(std::get<std::string>(it->second));
568  std::tie(tmp_cigar_vector, ref_length, seq_length) = parse_cigar(cigar_view);
569  offset_tmp = soft_clipping_end = 0;
570  transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
571  tag_dict.erase(it); // remove redundant information
572 
573  if constexpr (!detail::decays_to_ignore_v<align_type>)
574  {
575  assign_unaligned(get<1>(align),
576  seq | views::slice(static_cast<std::ranges::range_difference_t<seq_type>>(offset_tmp),
577  std::ranges::distance(seq) - soft_clipping_end));
578  }
579  }
580  }
581  }
582 
583  // Alignment object construction
584  if constexpr (!detail::decays_to_ignore_v<align_type>)
585  construct_alignment(align, tmp_cigar_vector, core.refID, ref_seqs, core.pos, ref_length); // inherited from SAM
586 
587  if constexpr (!detail::decays_to_ignore_v<cigar_type>)
588  std::swap(cigar_vector, tmp_cigar_vector);
589 }
590 
592 template <typename stream_type,
593  typename header_type,
594  typename seq_type,
595  typename id_type,
596  typename ref_seq_type,
597  typename ref_id_type,
598  typename align_type,
599  typename cigar_type,
600  typename qual_type,
601  typename mate_type,
602  typename tag_dict_type>
603 inline void format_bam::write_alignment_record([[maybe_unused]] stream_type & stream,
604  [[maybe_unused]] alignment_file_output_options const & options,
605  [[maybe_unused]] header_type && header,
606  [[maybe_unused]] seq_type && seq,
607  [[maybe_unused]] qual_type && qual,
608  [[maybe_unused]] id_type && id,
609  [[maybe_unused]] int32_t const offset,
610  [[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
611  [[maybe_unused]] ref_id_type && ref_id,
612  [[maybe_unused]] std::optional<int32_t> ref_offset,
613  [[maybe_unused]] align_type && align,
614  [[maybe_unused]] cigar_type && cigar_vector,
615  [[maybe_unused]] sam_flag const flag,
616  [[maybe_unused]] uint8_t const mapq,
617  [[maybe_unused]] mate_type && mate,
618  [[maybe_unused]] tag_dict_type && tag_dict,
619  [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(e_value),
620  [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(bit_score))
621 {
622  // ---------------------------------------------------------------------
623  // Type Requirements (as static asserts for user friendliness)
624  // ---------------------------------------------------------------------
625  static_assert((std::ranges::forward_range<seq_type> &&
626  alphabet<std::ranges::range_reference_t<seq_type>>),
627  "The seq object must be a std::ranges::forward_range over "
628  "letters that model seqan3::alphabet.");
629 
630  static_assert((std::ranges::forward_range<id_type> &&
631  alphabet<std::ranges::range_reference_t<id_type>>),
632  "The id object must be a std::ranges::forward_range over "
633  "letters that model seqan3::alphabet.");
634 
635  static_assert((std::ranges::forward_range<ref_seq_type> &&
636  alphabet<std::ranges::range_reference_t<ref_seq_type>>),
637  "The ref_seq object must be a std::ranges::forward_range "
638  "over letters that model seqan3::alphabet.");
639 
640  if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
641  {
642  static_assert((std::ranges::forward_range<ref_id_type> ||
643  std::integral<std::remove_reference_t<ref_id_type>> ||
644  detail::is_type_specialisation_of_v<std::remove_cvref_t<ref_id_type>, std::optional>),
645  "The ref_id object must be a std::ranges::forward_range "
646  "over letters that model seqan3::alphabet or an integral or a std::optional<integral>.");
647  }
648 
650  "The align object must be a std::pair of two ranges whose "
651  "value_type is comparable to seqan3::gap");
652 
653  static_assert((std::tuple_size_v<std::remove_cvref_t<align_type>> == 2 &&
654  std::equality_comparable_with<gap, std::ranges::range_reference_t<decltype(std::get<0>(align))>> &&
655  std::equality_comparable_with<gap, std::ranges::range_reference_t<decltype(std::get<1>(align))>>),
656  "The align object must be a std::pair of two ranges whose "
657  "value_type is comparable to seqan3::gap");
658 
659  static_assert((std::ranges::forward_range<qual_type> &&
660  alphabet<std::ranges::range_reference_t<qual_type>>),
661  "The qual object must be a std::ranges::forward_range "
662  "over letters that model seqan3::alphabet.");
663 
665  "The mate object must be a std::tuple of size 3 with "
666  "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
667  "2) a std::integral or std::optional<std::integral>, and "
668  "3) a std::integral.");
669 
670  static_assert(((std::ranges::forward_range<decltype(std::get<0>(mate))> ||
671  std::integral<std::remove_cvref_t<decltype(std::get<0>(mate))>> ||
672  detail::is_type_specialisation_of_v<std::remove_cvref_t<decltype(std::get<0>(mate))>, std::optional>) &&
673  (std::integral<std::remove_cvref_t<decltype(std::get<1>(mate))>> ||
674  detail::is_type_specialisation_of_v<std::remove_cvref_t<decltype(std::get<1>(mate))>, std::optional>) &&
675  std::integral<std::remove_cvref_t<decltype(std::get<2>(mate))>>),
676  "The mate object must be a std::tuple of size 3 with "
677  "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
678  "2) a std::integral or std::optional<std::integral>, and "
679  "3) a std::integral.");
680 
681  static_assert(std::same_as<std::remove_cvref_t<tag_dict_type>, sam_tag_dictionary>,
682  "The tag_dict object must be of type seqan3::sam_tag_dictionary.");
683 
684  if constexpr (detail::decays_to_ignore_v<header_type>)
685  {
686  throw format_error{"BAM can only be written with a header but you did not provide enough information! "
687  "You can either construct the output file with ref_ids and ref_seqs information and "
688  "the header will be created for you, or you can access the `header` member directly."};
689  }
690  else
691  {
692  // ---------------------------------------------------------------------
693  // logical Requirements
694  // ---------------------------------------------------------------------
695 
696  if (ref_offset.has_value() && (ref_offset.value() + 1) < 0)
697  throw format_error{detail::to_string("The ref_offset object must be >= -1 but is: ", ref_offset)};
698 
699  detail::fast_ostreambuf_iterator stream_it{*stream.rdbuf()};
700 
701  // ---------------------------------------------------------------------
702  // Writing the BAM Header on first call
703  // ---------------------------------------------------------------------
704  if (!header_was_written)
705  {
706  stream << "BAM\1";
708  write_header(os, options, header); // write SAM header to temporary stream to query the size.
709  int32_t l_text{static_cast<int32_t>(os.str().size())};
710  std::ranges::copy_n(reinterpret_cast<char *>(&l_text), 4, stream_it); // write read id
711 
712  stream << os.str();
713 
714  int32_t n_ref{static_cast<int32_t>(header.ref_ids().size())};
715  std::ranges::copy_n(reinterpret_cast<char *>(&n_ref), 4, stream_it); // write read id
716 
717  for (int32_t ridx = 0; ridx < n_ref; ++ridx)
718  {
719  int32_t l_name{static_cast<int32_t>(header.ref_ids()[ridx].size()) + 1}; // plus null character
720  std::ranges::copy_n(reinterpret_cast<char *>(&l_name), 4, stream_it); // write l_name
721  // write reference name:
722  std::ranges::copy(header.ref_ids()[ridx].begin(), header.ref_ids()[ridx].end(), stream_it);
723  stream_it = '\0';
724  // write reference sequence length:
725  std::ranges::copy_n(reinterpret_cast<char *>(&get<0>(header.ref_id_info[ridx])), 4, stream_it);
726  }
727 
728  header_was_written = true;
729  }
730 
731  // ---------------------------------------------------------------------
732  // Writing the Record
733  // ---------------------------------------------------------------------
734  int32_t ref_length{};
735 
736  // if alignment is non-empty, replace cigar_vector.
737  // else, compute the ref_length from given cigar_vector which is needed to fill field `bin`.
738  if (!std::ranges::empty(cigar_vector))
739  {
740  int32_t dummy_seq_length{};
741  for (auto & [count, operation] : cigar_vector)
742  update_alignment_lengths(ref_length, dummy_seq_length, operation.to_char(), count);
743  }
744  else if (!std::ranges::empty(get<0>(align)) && !std::ranges::empty(get<1>(align)))
745  {
746  ref_length = std::ranges::distance(get<1>(align));
747 
748  // compute possible distance from alignment end to sequence end
749  // which indicates soft clipping at the end.
750  // This should be replaced by a free count_gaps function for
751  // aligned sequences which is more efficient if possible.
752  int32_t off_end{static_cast<int32_t>(std::ranges::distance(seq)) - offset};
753 
754  for (auto chr : get<1>(align))
755  if (chr == gap{})
756  ++off_end;
757 
758  off_end -= ref_length;
759  cigar_vector = detail::get_cigar_vector(align, offset, off_end);
760  }
761 
762  if (cigar_vector.size() >= (1 << 16)) // must be written into the sam tag CG
763  {
764  tag_dict["CG"_tag] = detail::get_cigar_string(cigar_vector);
765  cigar_vector.resize(2);
766  cigar_vector[0] = cigar{static_cast<uint32_t>(std::ranges::distance(seq)), 'S'_cigar_operation};
767  cigar_vector[1] = cigar{static_cast<uint32_t>(std::ranges::distance(get<1>(align))), 'N'_cigar_operation};
768  }
769 
770  std::string tag_dict_binary_str = get_tag_dict_str(tag_dict);
771 
772  // Compute the value for the l_read_name field for the bam record.
773  // This value is stored including a trailing `0`, so at most 254 characters of the id can be stored, since
774  // the data type to store the value is uint8_t and 255 is the maximal size.
775  // If the id is empty a '*' is written instead, i.e. the written id is never an empty string and stores at least
776  // 2 bytes.
777  uint8_t read_name_size = std::min<uint8_t>(std::ranges::distance(id), 254) + 1;
778  read_name_size += static_cast<uint8_t>(read_name_size == 1); // need size two since empty id is stored as '*'.
779 
781  {
782  /* block_size */ 0, // will be initialised right after
783  /* refID */ -1, // will be initialised right after
784  /* pos */ ref_offset.value_or(-1),
785  /* l_read_name */ read_name_size,
786  /* mapq */ mapq,
787  /* bin */ reg2bin(ref_offset.value_or(-1), ref_length),
788  /* n_cigar_op */ static_cast<uint16_t>(cigar_vector.size()),
789  /* flag */ flag,
790  /* l_seq */ static_cast<int32_t>(std::ranges::distance(seq)),
791  /* next_refId */ -1, // will be initialised right after
792  /* next_pos */ get<1>(mate).value_or(-1),
793  /* tlen */ get<2>(mate)
794  };
795 
796  auto check_and_assign_id_to = [&header] ([[maybe_unused]] auto & id_source,
797  [[maybe_unused]] auto & id_target)
798  {
799  using id_t = std::remove_reference_t<decltype(id_source)>;
800 
801  if constexpr (!detail::decays_to_ignore_v<id_t>)
802  {
803  if constexpr (std::integral<id_t>)
804  {
805  id_target = id_source;
806  }
807  else if constexpr (detail::is_type_specialisation_of_v<id_t, std::optional>)
808  {
809  id_target = id_source.value_or(-1);
810  }
811  else
812  {
813  if (!std::ranges::empty(id_source)) // otherwise default will remain (-1)
814  {
815  auto id_it = header.ref_dict.end();
816 
817  if constexpr (std::ranges::contiguous_range<decltype(id_source)> &&
818  std::ranges::sized_range<decltype(id_source)> &&
819  std::ranges::borrowed_range<decltype(id_source)>)
820  {
821  id_it = header.ref_dict.find(std::span{std::ranges::data(id_source),
822  std::ranges::size(id_source)});
823  }
824  else
825  {
826  using header_ref_id_type = std::remove_reference_t<decltype(header.ref_ids()[0])>;
827 
828  static_assert(implicitly_convertible_to<decltype(id_source), header_ref_id_type>,
829  "The ref_id type is not convertible to the reference id information stored in the "
830  "reference dictionary of the header object.");
831 
832  id_it = header.ref_dict.find(id_source);
833  }
834 
835  if (id_it == header.ref_dict.end())
836  {
837  throw format_error{detail::to_string("Unknown reference name '", id_source, "' could "
838  "not be found in BAM header ref_dict: ",
839  header.ref_dict, ".")};
840  }
841 
842  id_target = id_it->second;
843  }
844  }
845  }
846  };
847 
848  // initialise core.refID
849  check_and_assign_id_to(ref_id, core.refID);
850 
851  // initialise core.next_refID
852  check_and_assign_id_to(get<0>(mate), core.next_refID);
853 
854  // initialise core.block_size
855  core.block_size = sizeof(core) - 4/*block_size excluded*/ +
856  core.l_read_name +
857  core.n_cigar_op * 4 + // each int32_t has 4 bytes
858  (core.l_seq + 1) / 2 + // bitcompressed seq
859  core.l_seq + // quality string
860  tag_dict_binary_str.size();
861 
862  std::ranges::copy_n(reinterpret_cast<char *>(&core), sizeof(core), stream_it); // write core
863 
864  if (std::ranges::empty(id)) // empty id is represented as * for backward compatibility
865  stream_it = '*';
866  else
867  std::ranges::copy_n(std::ranges::begin(id), core.l_read_name - 1, stream_it); // write read id
868  stream_it = '\0';
869 
870  // write cigar
871  for (auto [cigar_count, op] : cigar_vector)
872  {
873  cigar_count = cigar_count << 4;
874  cigar_count |= static_cast<int32_t>(char_to_sam_rank[op.to_char()]);
875  std::ranges::copy_n(reinterpret_cast<char *>(&cigar_count), 4, stream_it);
876  }
877 
878  // write seq (bit-compressed: sam_dna16 characters go into one byte)
879  using alph_t = std::ranges::range_value_t<seq_type>;
880  constexpr auto to_dna16 = detail::convert_through_char_representation<sam_dna16, alph_t>;
881 
882  auto sit = std::ranges::begin(seq);
883  for (int32_t sidx = 0; sidx < ((core.l_seq & 1) ? core.l_seq - 1 : core.l_seq); ++sidx, ++sit)
884  {
885  uint8_t compressed_chr = to_rank(to_dna16[to_rank(*sit)]) << 4;
886  ++sidx, ++sit;
887  compressed_chr |= to_rank(to_dna16[to_rank(*sit)]);
888  stream_it = static_cast<char>(compressed_chr);
889  }
890 
891  if (core.l_seq & 1) // write one more
892  stream_it = static_cast<char>(to_rank(to_dna16[to_rank(*sit)]) << 4);
893 
894  // write qual
895  if (std::ranges::empty(qual))
896  {
897  auto v = views::repeat_n(static_cast<char>(255), core.l_seq);
898  std::ranges::copy_n(v.begin(), core.l_seq, stream_it);
899  }
900  else
901  {
902  if (std::ranges::distance(qual) != core.l_seq)
903  throw format_error{detail::to_string("Expected quality of same length as sequence with size ",
904  core.l_seq, ". Got quality with size ",
905  std::ranges::distance(qual), " instead.")};
906 
907  auto v = qual | std::views::transform([] (auto chr) { return static_cast<char>(to_rank(chr)); });
908  std::ranges::copy_n(v.begin(), core.l_seq, stream_it);
909  }
910 
911  // write optional fields
912  stream << tag_dict_binary_str;
913  } // if constexpr (!detail::decays_to_ignore_v<header_type>)
914 }
915 
917 template <typename stream_view_type, typename value_type>
919  stream_view_type && stream_view,
920  value_type const & SEQAN3_DOXYGEN_ONLY(value))
921 {
922  int32_t count;
923  read_field(stream_view, count); // read length of vector
924  std::vector<value_type> tmp_vector;
925  tmp_vector.reserve(count);
926 
927  while (count > 0)
928  {
929  value_type tmp{};
930  read_field(stream_view, tmp);
931  tmp_vector.push_back(std::move(tmp));
932  --count;
933  }
934  variant = std::move(tmp_vector);
935 }
936 
954 template <typename stream_view_type>
955 inline void format_bam::read_field(stream_view_type && stream_view, sam_tag_dictionary & target)
956 {
957  /* Every BA< tag has the format "[TAG][TYPE_ID][VALUE]", where TAG is a two letter
958  name tag which is converted to a unique integer identifier and TYPE_ID is one character in [A,i,Z,H,B,f]
959  describing the type for the upcoming VALUES. If TYPE_ID=='B' it signals an array of
960  VALUE's and the inner value type is identified by the next character, one of [cCsSiIf], followed
961  by the length (int32_t) of the array, followed by the values.
962  */
963  auto it = std::ranges::begin(stream_view);
964  uint16_t tag = static_cast<uint16_t>(*it) << 8;
965  ++it; // skip char read before
966 
967  tag += static_cast<uint16_t>(*it);
968  ++it; // skip char read before
969 
970  char type_id = *it;
971  ++it; // skip char read before
972 
973  switch (type_id)
974  {
975  case 'A' : // char
976  {
977  target[tag] = *it;
978  ++it; // skip char that has been read
979  break;
980  }
981  // all integer sizes are possible
982  case 'c' : // int8_t
983  {
984  int8_t tmp;
985  read_field(stream_view, tmp);
986  target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
987  break;
988  }
989  case 'C' : // uint8_t
990  {
991  uint8_t tmp;
992  read_field(stream_view, tmp);
993  target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
994  break;
995  }
996  case 's' : // int16_t
997  {
998  int16_t tmp;
999  read_field(stream_view, tmp);
1000  target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1001  break;
1002  }
1003  case 'S' : // uint16_t
1004  {
1005  uint16_t tmp;
1006  read_field(stream_view, tmp);
1007  target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1008  break;
1009  }
1010  case 'i' : // int32_t
1011  {
1012  int32_t tmp;
1013  read_field(stream_view, tmp);
1014  target[tag] = std::move(tmp); // readable sam format only allows int32_t
1015  break;
1016  }
1017  case 'I' : // uint32_t
1018  {
1019  uint32_t tmp;
1020  read_field(stream_view, tmp);
1021  target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1022  break;
1023  }
1024  case 'f' : // float
1025  {
1026  float tmp;
1027  read_field(stream_view, tmp);
1028  target[tag] = tmp;
1029  break;
1030  }
1031  case 'Z' : // string
1032  {
1033  string_buffer.clear();
1034  while (!is_char<'\0'>(*it))
1035  {
1036  string_buffer.push_back(*it);
1037  ++it;
1038  }
1039  ++it; // skip \0
1040  target[tag] = string_buffer;
1041  break;
1042  }
1043  case 'H' :
1044  {
1045  // TODO
1046  throw format_error{"'H' not implemented yet."};
1047  break;
1048  }
1049  case 'B' : // Array. Value type depends on second char [cCsSiIf]
1050  {
1051  char array_value_type_id = *it;
1052  ++it; // skip char read before
1053 
1054  switch (array_value_type_id)
1055  {
1056  case 'c' : // int8_t
1057  read_sam_dict_vector(target[tag], stream_view, int8_t{});
1058  break;
1059  case 'C' : // uint8_t
1060  read_sam_dict_vector(target[tag], stream_view, uint8_t{});
1061  break;
1062  case 's' : // int16_t
1063  read_sam_dict_vector(target[tag], stream_view, int16_t{});
1064  break;
1065  case 'S' : // uint16_t
1066  read_sam_dict_vector(target[tag], stream_view, uint16_t{});
1067  break;
1068  case 'i' : // int32_t
1069  read_sam_dict_vector(target[tag], stream_view, int32_t{});
1070  break;
1071  case 'I' : // uint32_t
1072  read_sam_dict_vector(target[tag], stream_view, uint32_t{});
1073  break;
1074  case 'f' : // float
1075  read_sam_dict_vector(target[tag], stream_view, float{});
1076  break;
1077  default:
1078  throw format_error{detail::to_string("The first character in the numerical id of a SAM tag ",
1079  "must be one of [cCsSiIf] but '", array_value_type_id, "' was given.")};
1080  }
1081  break;
1082  }
1083  default:
1084  throw format_error{detail::to_string("The second character in the numerical id of a "
1085  "SAM tag must be one of [A,i,Z,H,B,f] but '", type_id, "' was given.")};
1086  }
1087 }
1088 
1103 template <typename cigar_input_type>
1104 inline auto format_bam::parse_binary_cigar(cigar_input_type && cigar_input, uint16_t n_cigar_op) const
1105 {
1106  std::vector<cigar> operations{};
1107  char operation{'\0'};
1108  uint32_t count{};
1109  int32_t ref_length{}, seq_length{};
1110  uint32_t operation_and_count{}; // in BAM operation and count values are stored within one 32 bit integer
1111  constexpr char const * cigar_mapping = "MIDNSHP=X*******";
1112  constexpr uint32_t cigar_mask = 0x0f; // 0000000000001111
1113 
1114  if (n_cigar_op == 0) // [[unlikely]]
1115  return std::tuple{operations, ref_length, seq_length};
1116 
1117  // parse the rest of the cigar
1118  // -------------------------------------------------------------------------------------------------------------
1119  while (n_cigar_op > 0) // until stream is not empty
1120  {
1121  std::ranges::copy_n(std::ranges::begin(cigar_input),
1122  sizeof(operation_and_count),
1123  reinterpret_cast<char*>(&operation_and_count));
1124  operation = cigar_mapping[operation_and_count & cigar_mask];
1125  count = operation_and_count >> 4;
1126 
1127  update_alignment_lengths(ref_length, seq_length, operation, count);
1128  operations.emplace_back(count, cigar::operation{}.assign_char(operation));
1129  --n_cigar_op;
1130  }
1131 
1132  return std::tuple{operations, ref_length, seq_length};
1133 }
1134 
1139 {
1140  std::string result{};
1141 
1142  auto stream_variant_fn = [&result] (auto && arg) // helper to print an std::variant
1143  {
1144  // T is either char, int32_t, float, std::string, or a std::vector<some int>
1145  using T = std::remove_cvref_t<decltype(arg)>;
1146 
1147  if constexpr (std::same_as<T, int32_t>)
1148  {
1149  // always choose the smallest possible representation [cCsSiI]
1150  size_t const absolute_arg = std::abs(arg);
1151  auto n = std::countr_zero(std::bit_ceil(absolute_arg + 1u) >> 1u) / 8u;
1152  bool const negative = arg < 0;
1153  n = n * n + 2 * negative; // for switch case order
1154 
1155  switch (n)
1156  {
1157  case 0:
1158  {
1159  result[result.size() - 1] = 'C';
1160  result.append(reinterpret_cast<char const *>(&arg), 1);
1161  break;
1162  }
1163  case 1:
1164  {
1165  result[result.size() - 1] = 'S';
1166  result.append(reinterpret_cast<char const *>(&arg), 2);
1167  break;
1168  }
1169  case 2:
1170  {
1171  result[result.size() - 1] = 'c';
1172  int8_t tmp = static_cast<int8_t>(arg);
1173  result.append(reinterpret_cast<char const *>(&tmp), 1);
1174  break;
1175  }
1176  case 3:
1177  {
1178  result[result.size() - 1] = 's';
1179  int16_t tmp = static_cast<int16_t>(arg);
1180  result.append(reinterpret_cast<char const *>(&tmp), 2);
1181  break;
1182  }
1183  default:
1184  {
1185  result.append(reinterpret_cast<char const *>(&arg), 4); // always i
1186  break;
1187  }
1188  }
1189  }
1190  else if constexpr (std::same_as<T, std::string>)
1191  {
1192  result.append(reinterpret_cast<char const *>(arg.data()), arg.size() + 1/*+ null character*/);
1193  }
1194  else if constexpr (!std::ranges::range<T>) // char, float
1195  {
1196  result.append(reinterpret_cast<char const *>(&arg), sizeof(arg));
1197  }
1198  else // std::vector of some arithmetic_type type
1199  {
1200  int32_t sz{static_cast<int32_t>(arg.size())};
1201  result.append(reinterpret_cast<char *>(&sz), 4);
1202  result.append(reinterpret_cast<char const *>(arg.data()),
1203  arg.size() * sizeof(std::ranges::range_value_t<T>));
1204  }
1205  };
1206 
1207  for (auto & [tag, variant] : tag_dict)
1208  {
1209  result.push_back(static_cast<char>(tag / 256));
1210  result.push_back(static_cast<char>(tag % 256));
1211 
1212  result.push_back(detail::sam_tag_type_char[variant.index()]);
1213 
1214  if (!is_char<'\0'>(detail::sam_tag_type_char_extra[variant.index()]))
1215  result.push_back(detail::sam_tag_type_char_extra[variant.index()]);
1216 
1217  std::visit(stream_variant_fn, variant);
1218  }
1219 
1220  return result;
1221 }
1222 
1223 } // namespace seqan3
Provides seqan3::alignment_file_input_format and auxiliary classes.
Provides seqan3::alignment_file_input_options.
Provides seqan3::alignment_file_output_format and auxiliary classes.
Provides seqan3::alignment_file_output_options.
Provides seqan3::detail::convert_through_char_representation.
Provides the C++20 <bit> header if it is not already available.
Stores the header information of alignment files.
Definition: header.hpp:33
std::vector< std::tuple< int32_t, std::string > > ref_id_info
The reference information. (used by the SAM/BAM format)
Definition: header.hpp:155
std::unordered_map< key_type, int32_t, std::hash< key_type >, detail::view_equality_fn > ref_dict
The mapping of reference id to position in the ref_ids() range and the ref_id_info range.
Definition: header.hpp:158
ref_ids_type & ref_ids()
The range of reference ids.
Definition: header.hpp:119
constexpr derived_type & assign_char(char_type const c) noexcept
Assign from a character, implicitly converts invalid characters.
Definition: alphabet_base.hpp:158
constexpr derived_type & assign_rank(rank_type const c) noexcept
Assign from a numeric value.
Definition: alphabet_base.hpp:185
The seqan3::cigar semialphabet pairs a counter with a seqan3::cigar::operation letter.
Definition: cigar.hpp:59
Functionally the same as std::ostreambuf_iterator, but offers writing a range more efficiently.
Definition: fast_ostreambuf_iterator.hpp:39
The alignment base format.
Definition: format_sam_base.hpp:63
std::tuple< std::vector< cigar >, int32_t, int32_t > parse_cigar(cigar_input_type &&cigar_input) const
Parses a cigar string into a vector of operation-count pairs (e.g. (M, 3)).
Definition: format_sam_base.hpp:260
void read_header(stream_view_type &&stream_view, alignment_file_header< ref_ids_type > &hdr, ref_seqs_type &)
Reads the SAM header.
Definition: format_sam_base.hpp:457
void write_header(stream_t &stream, alignment_file_output_options const &options, alignment_file_header< ref_ids_type > &header)
Writes the SAM header.
Definition: format_sam_base.hpp:651
static void update_alignment_lengths(int32_t &ref_length, int32_t &seq_length, char const cigar_operation, uint32_t const cigar_count)
Updates the sequence lengths by cigar_count depending on the cigar operation op.
Definition: format_sam_base.hpp:195
void transfer_soft_clipping_to(std::vector< cigar > const &cigar_vector, int32_t &sc_begin, int32_t &sc_end) const
Transfer soft clipping information from the cigar_vector to sc_begin and sc_end.
Definition: format_sam_base.hpp:215
bool header_was_written
A variable that tracks whether the content of header has been written or not.
Definition: format_sam_base.hpp:83
void construct_alignment(align_type &align, std::vector< cigar > &cigar_vector, [[maybe_unused]] int32_t rid, [[maybe_unused]] ref_seqs_type &ref_seqs, [[maybe_unused]] int32_t ref_start, size_t ref_length)
Construct the field::alignment depending on the given information.
Definition: format_sam_base.hpp:300
The actual implementation of seqan3::cigar::operation for documentation purpose-only.
Definition: cigar_operation.hpp:48
The BAM format.
Definition: format_bam.hpp:63
format_bam()=default
Defaulted.
void read_field(stream_view_type &&stream_view, std::optional< optional_value_type > &target)
Delegate parsing of std::optional types to parsing of the inner value type.
Definition: format_bam.hpp:248
void write_alignment_record([[maybe_unused]] stream_type &stream, [[maybe_unused]] alignment_file_output_options const &options, [[maybe_unused]] header_type &&header, [[maybe_unused]] seq_type &&seq, [[maybe_unused]] qual_type &&qual, [[maybe_unused]] id_type &&id, [[maybe_unused]] int32_t const offset, [[maybe_unused]] ref_seq_type &&ref_seq, [[maybe_unused]] ref_id_type &&ref_id, [[maybe_unused]] std::optional< int32_t > ref_offset, [[maybe_unused]] align_type &&align, [[maybe_unused]] cigar_type &&cigar_vector, [[maybe_unused]] sam_flag const flag, [[maybe_unused]] uint8_t const mapq, [[maybe_unused]] mate_type &&mate, [[maybe_unused]] tag_dict_type &&tag_dict, [[maybe_unused]] double e_value, [[maybe_unused]] double bit_score)
Write the given fields to the specified stream.
Definition: format_bam.hpp:603
format_bam & operator=(format_bam &&)=default
Defaulted.
format_bam(format_bam &&)=default
Defaulted.
~format_bam()=default
void read_field(stream_view_type &&stream_view, float &target)
Reads a float field from binary stream by directly reinterpreting the bits.
Definition: format_bam.hpp:232
format_bam & operator=(format_bam const &)=default
Defaulted.
format_bam(format_bam const &)=default
Defaulted.
auto parse_binary_cigar(cigar_input_type &&cigar_input, uint16_t n_cigar_op) const
Parses a cigar string into a vector of operation-count pairs (e.g. (M, 3)).
Definition: format_bam.hpp:1104
static std::string get_tag_dict_str(sam_tag_dictionary const &tag_dict)
Writes the optional fields of the seqan3::sam_tag_dictionary.
Definition: format_bam.hpp:1138
std::string string_buffer
Local buffer to read into while avoiding reallocation.
Definition: format_bam.hpp:158
void read_alignment_record(stream_type &stream, alignment_file_input_options< seq_legal_alph_type > const &options, ref_seqs_type &ref_seqs, alignment_file_header< ref_ids_type > &header, seq_type &seq, qual_type &qual, id_type &id, offset_type &offset, ref_seq_type &ref_seq, ref_id_type &ref_id, ref_offset_type &ref_offset, align_type &align, cigar_type &cigar_vector, flag_type &flag, mapq_type &mapq, mate_type &mate, tag_dict_type &tag_dict, e_value_type &e_value, bit_score_type &bit_score)
Read from the specified stream and back-insert into the given field buffers.
Definition: format_bam.hpp:289
static uint16_t reg2bin(int32_t beg, int32_t end) noexcept
Computes the bin number for a given region [beg, end), copied from the official SAM specifications.
Definition: format_bam.hpp:201
void read_field(stream_view_type &&stream_view, number_type &target)
Reads a arithmetic field from binary stream by directly reinterpreting the bits.
Definition: format_bam.hpp:221
void read_sam_dict_vector(seqan3::detail::sam_tag_variant &variant, stream_view_type &&stream_view, value_type const &value)
Reads a list of values separated by comma as it is the case for SAM tag arrays.
Definition: format_bam.hpp:918
bool header_was_read
A variable that tracks whether the content of header has been read or not.
Definition: format_bam.hpp:155
static constexpr std::array< uint8_t, 256 > char_to_sam_rank
Converts a cigar op character to the rank according to the official BAM specifications.
Definition: format_bam.hpp:179
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition: format_bam.hpp:79
The alphabet of a gap character '-'.
Definition: gap.hpp:37
A 16 letter DNA alphabet, containing all IUPAC symbols minus the gap and plus an equality sign ('=').
Definition: sam_dna16.hpp:46
The SAM tag dictionary class that stores all optional SAM fields.
Definition: sam_tag_dictionary.hpp:326
T clear(T... args)
The Concepts library.
Provides various transformation traits used by the range module.
T data(T... args)
Auxiliary for pretty printing of exception messages.
Provides seqan3::debug_stream and related types.
Provides type traits for working with templates.
Provides concepts for core language types and relations that don't have concepts in C++20 (yet).
Provides seqan3::detail::fast_ostreambuf_iterator.
T find(T... args)
Provides the seqan3::format_sam_base that can be inherited from.
sam_flag
An enum flag that describes the properties of an aligned read (given as a SAM record).
Definition: misc.hpp:73
std::vector< cigar > get_cigar_vector(alignment_type &&alignment, uint32_t const query_start_pos=0, uint32_t const query_end_pos=0, bool const extended_cigar=false)
Creates a cigar string (SAM format) given a seqan3::detail::pairwise_alignment represented by two seq...
Definition: detail.hpp:138
std::string get_cigar_string(std::vector< cigar > const &cigar_vector)
Transforms a vector of cigar elements into a string representation.
Definition: detail.hpp:200
constexpr auto to_rank
Return the rank representation of a (semi-)alphabet object.
Definition: concept.hpp:146
constexpr T bit_ceil(T x) noexcept
Calculates the smallest integral power of two that is not smaller than x.
Definition: bit:133
constexpr int countr_zero(T x) noexcept
Returns the number of consecutive 0 bits in the value of x, starting from the least significant bit (...
Definition: bit:199
@ flag
The alignment flag (bit information), uint16_t value.
@ ref_offset
Sequence (seqan3::field::ref_seq) relative start position (0-based), unsigned value.
@ mapq
The mapping quality of the seqan3::field::seq alignment, usually a Phred-scaled score.
@ offset
Sequence (seqan3::field::seq) relative start position (0-based), unsigned value.
@ mate
The mate pair information given as a std::tuple of reference name, offset and template length.
@ ref_id
The identifier of the (reference) sequence that seqan3::field::seq was aligned to.
@ seq
The "sequence", usually a range of nucleotides or amino acids.
@ qual
The qualities, usually in Phred score notation.
constexpr void consume(rng_t &&rng)
Iterate over a range (consumes single-pass input ranges).
Definition: misc.hpp:28
decltype(detail::transform< trait_t >(list_t{})) transform
Apply a transformation trait to every type in the list and return a seqan3::type_list of the results.
Definition: traits.hpp:434
constexpr ptrdiff_t count
Count the occurrences of a type in a pack.
Definition: traits.hpp:168
constexpr size_t size
The size of a type pack.
Definition: traits.hpp:150
constexpr auto slice
A view adaptor that returns a half-open interval on the underlying range.
Definition: slice.hpp:141
constexpr auto istreambuf
A view factory that returns a view over the stream buffer of an input stream.
Definition: istreambuf.hpp:114
constexpr auto take_exactly_or_throw
A view adaptor that returns the first size elements from the underlying range and also exposes size i...
Definition: take_exactly.hpp:91
constexpr auto repeat_n
A view factory that repeats a given value n times.
Definition: repeat_n.hpp:94
auto const move
A view that turns lvalue-references into rvalue-references.
Definition: move.hpp:70
Provides the seqan3::alignment_file_header class.
Provides seqan3::detail::ignore_output_iterator for writing to null stream.
T index(T... args)
The generic alphabet concept that covers most data types used in ranges.
Resolves to std::ranges::implicitly_convertible_to<type1, type2>(). <dl class="no-api">This entity i...
A more refined container concept than seqan3::container.
Whether a type behaves like a tuple.
Auxiliary functions for the alignment IO.
Provides helper data structures for the seqan3::alignment_file_output.
Provides various utility functions.
Provides seqan3::views::istreambuf.
T min(T... args)
constexpr char sam_tag_type_char_extra[12]
Each types SAM tag type extra char id. Index corresponds to the seqan3::detail::sam_tag_variant types...
Definition: sam_tag_dictionary.hpp:38
constexpr char sam_tag_type_char[12]
Each SAM tag type char identifier. Index corresponds to the seqan3::detail::sam_tag_variant types.
Definition: sam_tag_dictionary.hpp:36
std::string to_string(value_type &&...values)
Streams all parameters via the seqan3::debug_stream and returns a concatenated string.
Definition: to_string.hpp:29
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
T push_back(T... args)
Provides various utility functions.
Adaptations of concepts from the Ranges TS.
T reserve(T... args)
T resize(T... args)
Provides seqan3::sam_dna16.
Provides the seqan3::sam_tag_dictionary class and auxiliaries.
T size(T... args)
Provides seqan3::views::slice.
T str(T... args)
The options type defines various option members that influence the behaviour of all or some formats.
Definition: input_options.hpp:24
The options type defines various option members that influence the behavior of all or some formats.
Definition: output_options.hpp:23
Stores all fixed length variables which can be read/written directly by reinterpreting the binary str...
Definition: format_bam.hpp:162
uint32_t bin
The bin number.
Definition: format_bam.hpp:168
sam_flag flag
The flag value (uint16_t enum).
Definition: format_bam.hpp:170
uint32_t n_cigar_op
The number of cigar operations of the alignment.
Definition: format_bam.hpp:169
int32_t next_refID
The reference id of the mate.
Definition: format_bam.hpp:172
int32_t l_seq
The number of bases of the read sequence.
Definition: format_bam.hpp:171
int32_t refID
The reference id the read was mapped to.
Definition: format_bam.hpp:164
int32_t pos
The begin position of the alignment.
Definition: format_bam.hpp:165
int32_t block_size
The size in bytes of the whole BAM record.
Definition: format_bam.hpp:163
uint32_t l_read_name
The length of the read name including the \0 character.
Definition: format_bam.hpp:166
int32_t tlen
The template length of the read and its mate.
Definition: format_bam.hpp:174
uint32_t mapq
The mapping quality.
Definition: format_bam.hpp:167
int32_t next_pos
The begin position of the mate alignment.
Definition: format_bam.hpp:173
Thrown if information given to output format didn't match expectations.
Definition: exception.hpp:88
Exposes the value_type of another type.
Definition: pre.hpp:58
T swap(T... args)
Provides seqan3::views::take_exactly and seqan3::views::take_exactly_or_throw.
Provides seqan3::views::take_until and seqan3::views::take_until_or_throw.
T tie(T... args)
T tuple_size_v
Provides character predicates for tokenisation.
Provides seqan3::tuple_like.
T visit(T... args)