LocARNA-2.0.0
multiple_alignment.hh
1 #ifndef LOCARNA_MULTIPLE_ALIGNMENT_HH
2 #define LOCARNA_MULTIPLE_ALIGNMENT_HH
3 
4 #ifdef HAVE_CONFIG_H
5 #include <config.h>
6 #endif
7 
8 #include <iosfwd>
9 #include <string>
10 #include <vector>
11 #include <map>
12 
13 #include "aux.hh"
14 #include "string1.hh"
15 #include "scoring_fwd.hh"
16 #include "sequence_annotation.hh"
17 
18 #include <assert.h>
19 
20 #include <iostream>
21 
22 namespace LocARNA {
23 
24  class Alignment;
25  class AlignmentEdges;
26  template <class T, size_t N>
27  class Alphabet;
28  class BasePairs;
29  class Scoring;
30  class Sequence;
31 
66  public:
70  enum class FormatType {
71  STOCKHOLM,
72  PP,
73  CLUSTAL,
74  FASTA
75  };
77  static const std::vector<FormatType> FormatTypes;
78 
81  enum class AnnoType {
86  structure,
91  anchors
92  };
94  static const std::vector<AnnoType> AnnoTypes;
95 
98  static size_t
100  return annotation_tags.size();
101  }
102 
111  class SeqEntry {
112  public:
113 
114  typedef std::pair<pos_type, pos_type>
116 
124  SeqEntry(const std::string &name, const std::string &seq)
125  : name_(name), description_(""), seq_((string1)seq) {}
126 
134  SeqEntry(const std::string &name, const string1 &seq)
135  : name_(name), description_(""), seq_(seq) {}
136 
144  SeqEntry(const std::string &name,
145  const std::string &description,
146  const std::string &seq)
147  : name_(name), description_(description), seq_((string1)seq) {}
148 
157  SeqEntry(const std::string &name,
158  const std::string &description,
159  const string1 &seq)
160  : name_(name), description_(description), seq_(seq) {}
161 
162  // access
163 
165  const std::string &
166  name() const {
167  return name_;
168  }
169 
171  const std::string &
172  description() const {
173  return description_;
174  }
175 
177  const string1 &
178  seq() const {
179  return seq_;
180  }
181 
183  size_type
184  length_wogaps() const;
185 
186  //****************************************
187  // projections
188 
196  pos_type
197  pos_to_col(pos_type pos) const;
198 
210  pos_pair_t
211  col_to_pos(pos_type col) const;
212 
217  void
219  seq_.reverse();
220  }
221 
226  void
227  push_back(char c) {
228  seq_.push_back(c);
229  }
230 
232  void
233  set_seq(const string1 &seq) {
234  seq_ = seq;
235  }
236 
237  private:
238  std::string name_;
239  std::string description_;
240  string1 seq_; //<! alignment string of the sequence
241 
242  }; //end class SeqEntry
243 
250  class AliColumn {
251  public:
252  using value_type = char;
253 
260  AliColumn(const MultipleAlignment &ma, size_type col_index)
261  : ma_(ma), col_index_(col_index) {
262  assert(1 <= col_index);
263  assert(col_index <= ma.length());
264  }
265 
273  const char &operator[](size_type row_index) const {
274  return ma_.seqentry(row_index).seq()[col_index_];
275  }
276 
281  size_type
282  size() const {
283  return ma_.num_of_rows();
284  }
285 
293  bool
294  operator==(const AliColumn &ac) const {
295  bool ret = this->size() == ac.size();
296  for (size_type i = 0; ret && i < size(); i++) {
297  ret = (this->ma_.seqentry(i).seq()[this->col_index_] ==
298  ac.ma_.seqentry(i).seq()[ac.col_index_]);
299  }
300  return ret;
301  }
302 
310  bool
311  operator!=(const AliColumn &ac) const {
312  return !(*this == ac);
313  }
314 
315  // make (forward) iterable
316 
321  public:
322 
324  operator ++() {
325  ++row_index_;
326  return *this;
327  }
328 
329  const char &
330  operator *() {
331  return col_[row_index_];
332  }
333 
334  bool
335  operator != (const const_iterator &it) {
336  return row_index_ != it.row_index_ || col_!=it.col_;
337  }
338 
339  private:
340  friend class AliColumn;
341 
342  const_iterator(const AliColumn &col, size_type row_index)
343  : col_(col), row_index_(row_index) {
344  }
345 
346  const AliColumn &col_;
347  size_type row_index_;
348  };
349 
356 
360  auto
361  begin() const {
362  return const_iterator(*this,0);
363  }
364 
368  auto
369  end() const {
370  return const_iterator(*this,this->size());
371  }
372 
373  private:
374  const MultipleAlignment &ma_;
375  size_type col_index_;
376  }; // end class AliColumn
377 
378  public:
380  typedef std::vector<SeqEntry>::const_iterator const_iterator;
382  typedef std::vector<SeqEntry>::iterator iterator;
383 
386 
388  MultipleAlignment(const MultipleAlignment &ma) = default;
389 
392 
395  operator =(const MultipleAlignment &ma) = default;
398  operator =(MultipleAlignment &&ma) = default;
399 
403  virtual ~MultipleAlignment();
404 
405 
414  MultipleAlignment(const std::string &file,
416 
424  MultipleAlignment(std::istream &in,
426 
432  MultipleAlignment(const std::string &name, const std::string &sequence);
433 
445  MultipleAlignment(const std::string &nameA,
446  const std::string &nameB,
447  const std::string &alistringA,
448  const std::string &alistringB);
449 
463  MultipleAlignment(const Alignment &alignment,
464  bool only_local = false,
465  bool special_gap_symbols = false);
466 
478  MultipleAlignment(const AlignmentEdges &edges,
479  const Sequence &seqA,
480  const Sequence &seqB);
481 
489  void
491 
496  size_type
497  num_of_rows() const {
498  return alig_.size();
499  }
500 
509  bool
510  empty() const {
511  return alig_.empty();
512  }
513 
520  const SequenceAnnotation &
521  annotation(const AnnoType &annotype) const;
522 
530  void
531  set_annotation(const AnnoType &annotype,
533  annotations_[annotype] = annotation;
534  }
535 
541  bool
542  has_annotation(const AnnoType &annotype) const {
543  return annotations_.find(annotype) != annotations_.end();
544  }
545 
550  bool
551  is_proper() const;
552 
560  pos_type
561  length() const {
562  return alig_.empty() ? 0 : alig_[0].seq().length();
563  }
564 
565  using value_type = SeqEntry;
566 
567 
573  begin() const {
574  return alig_.begin();
575  }
576 
582  end() const {
583  return alig_.end();
584  }
585 
591  bool
592  contains(const std::string &name) const;
593 
594  /* index access saves time over access by sequence name */
595 
603  size_type
604  index(const std::string &name) const {
605  str2idx_map_t::const_iterator it = name2idx_.find(name);
606  assert(it != name2idx_.end());
607  return it->second;
608  }
609 
617  const SeqEntry &
619  return alig_[index];
620  }
621 
628  const SeqEntry &
629  seqentry(const std::string &name) const {
630  return alig_[index(name)];
631  }
632 
643  size_type
644  deviation(const MultipleAlignment &ma) const;
645 
661  double
662  sps(const MultipleAlignment &ma, bool compalign = true) const;
663 
679  double
681 
695  double
696  avg_deviation_score(const MultipleAlignment &ma) const;
697 
706  std::string
707  consensus_sequence() const;
708 
716  AliColumn
717  column(size_type col_index) const {
718  return AliColumn(*this, col_index);
719  }
720 
728  void
729  append(const SeqEntry &seqentry);
730 
741  void
742  prepend(const SeqEntry &seqentry);
743 
749  void
750  operator+=(const AliColumn &c);
751 
757  void
758  operator+=(char c);
759 
763  void
764  reverse();
765 
766  // ------------------------------------------------------------
767  // output
768 
783  std::ostream &
784  write(std::ostream &out,
785  FormatType format =
787 
802  std::ostream &
803  write(std::ostream &out,
804  size_t width,
805  FormatType format =
807 
820  std::ostream &
821  write_name_sequence_line(std::ostream &out,
822  const std::string &name,
823  const std::string &sequence,
824  size_t namewidth) const;
825 
839  std::ostream &
840  write(std::ostream &out,
841  size_type start,
842  size_type end,
843  FormatType format =
845 
856  template <size_t N>
857  bool
858  checkAlphabet(const Alphabet<char, N> &alphabet) const;
859 
864  void
865  write_debug(std::ostream &out = std::cout) const;
866 
867  protected:
877  void
878  init(const AlignmentEdges &edges,
879  const Sequence &seqA,
880  const Sequence &seqB,
881  bool special_gap_symbols);
882 
883  private:
889  class annotation_tags_t : public std::map< FormatType, std::map< AnnoType, std::string> > {
890  //@brief constructor: initialize annotation tags table
891  public:
892  annotation_tags_t();
893  };
894 
896  typedef std::map<std::string, size_type> str2idx_map_t;
897 
899  typedef std::map<AnnoType, SequenceAnnotation> annotation_map_t;
900 
901  //************************************************************
902  // attributes of MultipleAlignment
903 
905  static annotation_tags_t annotation_tags;
906 
908  std::vector<SeqEntry> alig_;
909 
911  annotation_map_t annotations_;
912 
917  str2idx_map_t name2idx_;
918 
919  // end attributes
920  //************************************************************
921 
922 
933  static size_type
934  deviation2(const string1 &a1,
935  const string1 &a2,
936  const string1 &ref1,
937  const string1 &ref2);
938 
953  static double
954  pairwise_match_score(const SeqEntry &a1,
955  const SeqEntry &a2,
956  const SeqEntry &ref1,
957  const SeqEntry &ref2,
958  bool score_common_gaps);
959 
970  static std::vector<int>
971  match_vector(const string1 &s, const string1 &t);
972 
983  static std::vector<int>
984  match_vector2(const string1 &s, const string1 &t);
985 
994  static size_t
995  count_matches(const SeqEntry &a1, const SeqEntry &a2);
996 
1009  static size_t
1010  count_exclusive_matches(const SeqEntry &a1,
1011  const SeqEntry &a2,
1012  const SeqEntry &ref1,
1013  const SeqEntry &ref2);
1014 
1032  static double
1033  pairwise_deviation_score(const SeqEntry &a1,
1034  const SeqEntry &a2,
1035  const SeqEntry &ref1,
1036  const SeqEntry &ref2);
1037 
1039  void
1040  create_name2idx_map();
1041 
1050  void
1051  read_clustallike(std::istream &in, FormatType format);
1052 
1059  void
1060  read_stockholm(std::istream &in);
1061 
1074  void
1075  read_clustalw(std::istream &in);
1076 
1096  void
1097  read_fasta(std::istream &in);
1098  };
1099 
1106  std::ostream &
1107  operator<<(std::ostream &out, const MultipleAlignment &ma);
1108 
1109 
1110  template <size_t N>
1111  bool
1113  return std::all_of(alig_.begin(), alig_.end(),
1114  [&a](const auto &row) {
1115  return std::all_of(row.seq().begin(), row.seq().end(),
1116  [&a](const auto &c) {
1117  return a.in(c);
1118  });
1119  });
1120  }
1121 
1122 
1123 } // end namespace
1124 
1125 #endif // LOCARNA_MULTIPLE_ALIGNMENT_HH
Definition: alignment.hh:73
Represents a structure-annotated sequence alignment.
Definition: alignment.hh:83
Specifies an alphabet of static size.
Definition: alphabet.hh:21
const iterator
Definition: multiple_alignment.hh:320
read only proxy class representing a column of the alignment
Definition: multiple_alignment.hh:250
bool operator!=(const AliColumn &ac) const
Test inequality.
Definition: multiple_alignment.hh:311
bool operator==(const AliColumn &ac) const
Test equality.
Definition: multiple_alignment.hh:294
auto end() const
end iterator (always const)
Definition: multiple_alignment.hh:369
const char & operator[](size_type row_index) const
element access
Definition: multiple_alignment.hh:273
auto begin() const
begin iterator (always const)
Definition: multiple_alignment.hh:361
size_type size() const
Size / Number of rows.
Definition: multiple_alignment.hh:282
AliColumn(const MultipleAlignment &ma, size_type col_index)
Construct from multiple alignment column.
Definition: multiple_alignment.hh:260
A row in a multiple alignment.
Definition: multiple_alignment.hh:111
pos_pair_t col_to_pos(pos_type col) const
Definition: multiple_alignment.cc:540
std::pair< pos_type, pos_type > pos_pair_t
pair of positions
Definition: multiple_alignment.hh:115
SeqEntry(const std::string &name, const std::string &description, const string1 &seq)
Construct from strings name, description and 1-based string seq.
Definition: multiple_alignment.hh:157
void push_back(char c)
append character to sequence
Definition: multiple_alignment.hh:227
const std::string & name() const
(read-only) access to name
Definition: multiple_alignment.hh:166
void set_seq(const string1 &seq)
write access to seq
Definition: multiple_alignment.hh:233
pos_type pos_to_col(pos_type pos) const
map sequence position -> alignment column.
Definition: multiple_alignment.cc:520
SeqEntry(const std::string &name, const std::string &seq)
Construct from strings name and seq.
Definition: multiple_alignment.hh:124
void reverse()
reverse sequence
Definition: multiple_alignment.hh:218
size_type length_wogaps() const
length without gaps
Definition: multiple_alignment.cc:509
SeqEntry(const std::string &name, const string1 &seq)
Construct from strings name and 1-based string seq.
Definition: multiple_alignment.hh:134
SeqEntry(const std::string &name, const std::string &description, const std::string &seq)
Construct from strings name, description and seq.
Definition: multiple_alignment.hh:144
const std::string & description() const
(read-only) access to description
Definition: multiple_alignment.hh:172
const string1 & seq() const
(read-only) access to seq
Definition: multiple_alignment.hh:178
Represents a multiple alignment.
Definition: multiple_alignment.hh:65
void write_debug(std::ostream &out=std::cout) const
Print contents of object to stream.
Definition: multiple_alignment.cc:957
void operator+=(const AliColumn &c)
Append a column.
Definition: multiple_alignment.cc:1117
size_type index(const std::string &name) const
Access index by name.
Definition: multiple_alignment.hh:604
MultipleAlignment(const MultipleAlignment &ma)=default
Copy construct.
void set_annotation(const AnnoType &annotype, const SequenceAnnotation &annotation)
Write access to annotation.
Definition: multiple_alignment.hh:531
size_type num_of_rows() const
Number of rows of multiple aligment.
Definition: multiple_alignment.hh:497
size_type deviation(const MultipleAlignment &ma) const
Deviation of a multiple alignment from a reference alignment.
Definition: multiple_alignment.cc:668
void append(const SeqEntry &seqentry)
Append sequence entry.
Definition: multiple_alignment.cc:1102
void prepend(const SeqEntry &seqentry)
Prepend sequence entry.
Definition: multiple_alignment.cc:1109
FormatType
file format type for multiple alignments
Definition: multiple_alignment.hh:70
@ CLUSTAL
(extended) clustal file format
bool empty() const
Emptiness check.
Definition: multiple_alignment.hh:510
AliColumn column(size_type col_index) const
Access alignment column.
Definition: multiple_alignment.hh:717
std::vector< SeqEntry >::const_iterator const_iterator
const iterator of sequence entries
Definition: multiple_alignment.hh:380
double avg_deviation_score(const MultipleAlignment &ma) const
Average deviation score.
Definition: multiple_alignment.cc:733
const_iterator end() const
End for read-only traversal of name/sequence pairs.
Definition: multiple_alignment.hh:582
std::vector< SeqEntry >::iterator iterator
iterator of sequence entries
Definition: multiple_alignment.hh:382
bool contains(const std::string &name) const
Test whether name exists.
Definition: multiple_alignment.cc:613
std::ostream & write(std::ostream &out, FormatType format=MultipleAlignment::FormatType::CLUSTAL) const
Write alignment to stream.
Definition: multiple_alignment.cc:1088
static size_t num_of_annotypes()
number of annotation types
Definition: multiple_alignment.hh:99
std::string consensus_sequence() const
Consensus sequence of multiple alignment.
Definition: multiple_alignment.cc:964
AnnoType
type of sequence annotation. enumerates legal annotation types
Definition: multiple_alignment.hh:81
@ anchors
anchor annotation (anchor constraints)
@ consensus_structure
consensus structure annotation (consensus structure)
bool has_annotation(const AnnoType &annotype) const
Definition: multiple_alignment.hh:542
const SeqEntry & seqentry(size_type index) const
Access name/sequence pair by index.
Definition: multiple_alignment.hh:618
bool checkAlphabet(const Alphabet< char, N > &alphabet) const
check character constraints
Definition: multiple_alignment.hh:1112
double sps(const MultipleAlignment &ma, bool compalign=true) const
Sum-of-pairs score between a multiple alignment and a reference alignment.
Definition: multiple_alignment.cc:690
void init(const AlignmentEdges &edges, const Sequence &seqA, const Sequence &seqB, bool special_gap_symbols)
Initialize from alignment edges and sequences.
Definition: multiple_alignment.cc:186
MultipleAlignment(MultipleAlignment &&ma)=default
Move construct.
pos_type length() const
Length of multiple aligment.
Definition: multiple_alignment.hh:561
bool is_proper() const
Test whether alignment is proper.
Definition: multiple_alignment.cc:597
const SequenceAnnotation & annotation(const AnnoType &annotype) const
Read access of annotation by prefix.
Definition: multiple_alignment.cc:587
static const std::vector< AnnoType > AnnoTypes
collection of the format types
Definition: multiple_alignment.hh:94
virtual ~MultipleAlignment()
virtual destructor
Definition: multiple_alignment.cc:250
std::ostream & write_name_sequence_line(std::ostream &out, const std::string &name, const std::string &sequence, size_t namewidth) const
Write formatted line of name and sequence.
Definition: multiple_alignment.cc:995
void reverse()
reverse the multiple alignment
Definition: multiple_alignment.cc:1095
MultipleAlignment()
Construct empty.
Definition: multiple_alignment.cc:64
MultipleAlignment & operator=(const MultipleAlignment &ma)=default
Copy assignment.
static const std::vector< FormatType > FormatTypes
collection of the format types
Definition: multiple_alignment.hh:77
void normalize_rna_symbols()
normalize rna symbols
Definition: multiple_alignment.cc:577
const SeqEntry & seqentry(const std::string &name) const
Access name/sequence pair by name.
Definition: multiple_alignment.hh:629
const_iterator begin() const
Begin for read-only traversal of name/sequence pairs.
Definition: multiple_alignment.hh:573
double cmfinder_realignment_score(const MultipleAlignment &ma) const
Cmfinder realignment score of a multiple alignment to a reference alignment.
Definition: multiple_alignment.cc:812
Annotation of a sequence.
Definition: sequence_annotation.hh:24
"Sequence View" of multiple alignment as array of column vectors
Definition: sequence.hh:17
A simple 1-based string.
Definition: string1.hh:21
void push_back(char c)
push back character
Definition: string1.hh:126
void reverse()
reverse string
Definition: string1.hh:116
Definition: aligner.cc:15
size_type pos_type
type of a sequence position
Definition: aux.hh:126
std::ostream & operator<<(std::ostream &out, const AlignerRestriction &r)
Definition: aligner_restriction.hh:135
size_t size_type
general size type
Definition: aux.hh:120