LocARNA-2.0.0
sparsification_mapper.hh
1 #ifndef SPARSIFICATION_MAPPER_HH
2 #define SPARSIFICATION_MAPPER_HH
3 
4 #ifdef HAVE_CONFIG_H
5 #include <config.h>
6 #endif
7 
8 #include <iostream>
9 #include <limits>
10 
11 #include "aux.hh"
12 #include "basepairs.hh"
13 #include "ext_rna_data.hh"
14 
15 namespace LocARNA {
16 
17  // print a pair
18  template <class T1, class T2>
19  std::ostream &
20  operator<<(std::ostream &out, const std::pair<T1, T2> &pair) {
21  return out << "(" << pair.first << "," << pair.second << ") ";
22  }
23 
31  public:
32  typedef BasePairs__Arc Arc;
33  typedef size_t ArcIdx;
34  typedef std::vector<ArcIdx> ArcIdxVec;
35  typedef pos_type matidx_t;
36  typedef pos_type seq_pos_t;
37 
38  typedef size_t index_t;
39 
42  struct info_for_pos {
44  bool
48 
50  void
51  reset() {
52  this->seq_pos = 0;
53  this->valid_arcs.clear();
54  this->unpaired = false;
55  }
56  };
57 
58  typedef std::vector<info_for_pos> InfoForPosVec;
64 
65  private:
66  const BasePairs &bps;
67  const ExtRnaData &rnadata;
68  const double prob_unpaired_in_loop_threshold;
71  const double prob_basepair_in_loop_threshold;
73  size_type max_info_vec_size;
77  std::vector<InfoForPosVec> info_valid_seq_pos_vecs;
78 
82  std::vector<std::vector<matidx_t> > valid_mat_pos_vecs_before_eq;
83 
87  std::vector<std::vector<ArcIdxVec> > left_adj_vec;
88 
91  void
92  compute_mapping_idx_arcs();
93 
96  void
97  compute_mapping_idx_left_ends();
98 
103  void
104  iterate_left_adj_list(pos_type cur_left_end,
105  pos_type cur_pos,
106  const Arc *inner_arc,
107  info_for_pos &struct_pos);
108 
109  void
110  valid_pos_external(pos_type cur_pos,
111  const Arc *inner_arc,
112  info_for_pos &struct_pos);
113 
114  public:
130  const ExtRnaData &rnadata_,
131  double prob_unpaired_in_loop_threshold_,
132  double prob_basepair_in_loop_threshold_,
133  bool index_left_ends)
134  : bps(bps_),
135  rnadata(rnadata_),
136  prob_unpaired_in_loop_threshold(prob_unpaired_in_loop_threshold_),
137  prob_basepair_in_loop_threshold(prob_basepair_in_loop_threshold_),
138  max_info_vec_size(0) {
139  if (index_left_ends) {
140  compute_mapping_idx_left_ends();
141  } else {
142  compute_mapping_idx_arcs();
143  }
144  }
145 
149  size_type
151  return max_info_vec_size;
152  }
153 
160  const InfoForPosVec &
162  return info_valid_seq_pos_vecs.at(idx);
163  }
164 
171  const ArcIdxVec &
173  return info_valid_seq_pos_vecs.at(idx).at(pos).valid_arcs;
174  }
175 
187  matidx_t
189  index_t index,
190  seq_pos_t pos,
191  index_t left_end = std::numeric_limits<index_t>::max()) const {
192  if (left_end == std::numeric_limits<index_t>::max())
193  left_end = index;
194  assert(pos >= left_end); // tocheck
195  return valid_mat_pos_vecs_before_eq.at(index).at(pos - left_end);
196  }
197 
207  inline matidx_t
209  index_t index,
210  seq_pos_t pos,
211  index_t left_end = std::numeric_limits<index_t>::max()) const {
212  // if (left_end == std::numeric_limits<index_t>::max()) assert (pos
213  // > index);
214  return first_valid_mat_pos_before_eq(index, pos - 1, left_end);
215  }
216 
224  inline seq_pos_t
226  assert(pos < number_of_valid_mat_pos(idx));
227  return (
228  info_valid_seq_pos_vecs.at(idx).at(pos).seq_pos); //+arc.left();
229  }
230 
236  size_type
238  return info_valid_seq_pos_vecs.at(idx).size();
239  }
240 
248  bool
249  pos_unpaired(index_t idx, matidx_t pos) const {
250  return info_valid_seq_pos_vecs.at(idx).at(pos).unpaired;
251  }
252 
265  bool
266  matching_without_gap_possible(const Arc &arc, seq_pos_t pos) const {
267  const pos_type &mat_pos =
268  first_valid_mat_pos_before(arc.idx(), pos, arc.left());
269  return get_pos_in_seq_new(arc.idx(), mat_pos) == pos - 1;
270  }
271 
279  const ArcIdxVec &
280  valid_arcs_left_adj(const Arc &arc, seq_pos_t pos) const {
281  return left_adj_vec.at(arc.idx()).at(pos - arc.left());
282  }
283 
286 
298  matidx_t
300  seq_pos_t min_col,
301  index_t left_end = std::numeric_limits<index_t>::max()) const {
302  if (left_end == std::numeric_limits<index_t>::max())
303  left_end = index;
304 
305  size_t num_pos = number_of_valid_mat_pos(index);
306  seq_pos_t last_mapped_seq_pos =
307  get_pos_in_seq_new(index, num_pos - 1);
308 
309  // if the position min_col is smaller than or equal to the left end
310  // (first mapped position), return 0
311  if (min_col <= left_end)
312  return 0;
313 
314  // if the position min_col is greater than the last mapped sequence
315  // position, the position does not exists
316  // return the number of valid positions
317  if (min_col > last_mapped_seq_pos)
318  return num_pos;
319 
320  // the matrix position after the first valid position before the
321  // position min_col (first matrix position that
322  // stores a sequence position that is greater than or equal to
323  // min_col-1)
324  matidx_t idx_geq =
325  first_valid_mat_pos_before_eq(index, min_col - 1, left_end) + 1;
326 
327  assert(get_pos_in_seq_new(index, idx_geq) >= min_col &&
328  !(get_pos_in_seq_new(index, idx_geq - 1) >= min_col));
329 
330  return idx_geq;
331  }
332 
343  matidx_t
345  index_t index,
346  seq_pos_t max_col,
347  index_t left_end = std::numeric_limits<index_t>::max()) const {
348  if (left_end == std::numeric_limits<index_t>::max())
349  left_end = index;
350 
351  size_t num_pos = number_of_valid_mat_pos(index);
352  seq_pos_t last_mapped_seq_pos =
353  get_pos_in_seq_new(index, num_pos - 1);
354 
355  // if the position max_col is smaller than the left_end (first
356  // mapped position), return 0
357  if (max_col < left_end)
358  return 0;
359 
360  // if the position max_col is greater than or equal to the last
361  // mapped sequence position,
362  // return the number of positions
363  if (max_col >= last_mapped_seq_pos)
364  return num_pos;
365 
366  // the matrix position after the first matrix position before or
367  // equal the sequence position max_col
368  // (last matrix position that stores a sequence position that is
369  // lower than or equal max_col)
371  first_valid_mat_pos_before_eq(index, max_col, left_end) + 1;
372 
373  assert(get_pos_in_seq_new(index, idx_after_leq - 1) <= max_col &&
374  !(get_pos_in_seq_new(index, idx_after_leq) <= max_col));
375 
376  return idx_after_leq;
377  }
378 
379  private:
390  bool
391  is_valid_arc(const Arc &inner_arc, const Arc &arc) const {
392  assert(inner_arc.left() > arc.left() &&
393  inner_arc.right() < arc.right());
394  return rnadata.arc_in_loop_prob(inner_arc.left(), inner_arc.right(),
395  arc.left(), arc.right()) >=
396  prob_basepair_in_loop_threshold;
397  }
398 
407  bool
408  is_valid_arc_external(const Arc &inner_arc) const {
409  return rnadata.arc_external_prob(inner_arc.left(),
410  inner_arc.right()) >=
411  prob_basepair_in_loop_threshold;
412  }
413 
414  public:
425  bool
426  is_valid_pos(const Arc &arc, seq_pos_t pos) const {
427  assert(arc.left() < pos && pos < arc.right());
428  return rnadata.unpaired_in_loop_prob(pos, arc.left(),
429  arc.right()) >=
430  prob_unpaired_in_loop_threshold; // todo: additional variable
431  // for external case
432  }
433 
440  bool
442  return rnadata.unpaired_external_prob(pos) >=
443  prob_unpaired_in_loop_threshold; // todo: additional variable
444  // for external case
445  }
446  };
447 
454  template <class T>
455  std::ostream &
456  operator<<(std::ostream &out, const std::vector<T> &vec) {
457  for (typename std::vector<T>::const_iterator it = vec.begin();
458  it != vec.end(); ++it) {
459  out << *it << " ";
460  }
461  return out;
462  }
463 
471  std::ostream &
472  operator<<(
473  std::ostream &out,
474  const std::vector<SparsificationMapper::InfoForPosVec> &pos_vecs_);
475 
483  std::ostream &
484  operator<<(std::ostream &out,
485  const SparsificationMapper::InfoForPosVec &pos_vec_);
486 
487 } // end namespace
488 
489 #endif // SPARSIFICATION_MAPPER_HH
Represents a base pair.
Definition: basepairs.hh:39
size_t right() const
Definition: basepairs.hh:77
size_t idx() const
Definition: basepairs.hh:87
size_t left() const
Definition: basepairs.hh:67
Describes sequence and structure ensemble of an RNA.
Definition: basepairs.hh:108
represent sparsified data of RNA ensemble extended by in loop probabilities
Definition: ext_rna_data.hh:37
double arc_external_prob(pos_type i, pos_type j) const
Get base pair in loop probability.
Definition: rna_data.cc:755
double unpaired_in_loop_prob(pos_type k, pos_type p, pos_type q) const
Get base pair in loop probability.
Definition: rna_data.cc:767
double unpaired_external_prob(pos_type k) const
Get base pair in loop probability.
Definition: rna_data.cc:776
double arc_in_loop_prob(pos_type i, pos_type j, pos_type p, pos_type q) const
Get base pair in loop probability.
Definition: rna_data.cc:745
Represents the mapping for sparsification.
Definition: sparsification_mapper.hh:30
seq_pos_t get_pos_in_seq_new(index_t idx, matidx_t pos) const
Definition: sparsification_mapper.hh:225
size_type number_of_valid_mat_pos(index_t idx) const
Definition: sparsification_mapper.hh:237
std::vector< ArcIdx > ArcIdxVec
vector of arc indices
Definition: sparsification_mapper.hh:34
SparsificationMapper(const BasePairs &bps_, const ExtRnaData &rnadata_, double prob_unpaired_in_loop_threshold_, double prob_basepair_in_loop_threshold_, bool index_left_ends)
Definition: sparsification_mapper.hh:129
const ArcIdxVec & valid_arcs_left_adj(const Arc &arc, seq_pos_t pos) const
Definition: sparsification_mapper.hh:280
matidx_t first_valid_mat_pos_before(index_t index, seq_pos_t pos, index_t left_end=std::numeric_limits< index_t >::max()) const
Definition: sparsification_mapper.hh:208
bool matching_without_gap_possible(const Arc &arc, seq_pos_t pos) const
Definition: sparsification_mapper.hh:266
virtual ~SparsificationMapper()
class destructor
Definition: sparsification_mapper.hh:285
size_t ArcIdx
type of arc index
Definition: sparsification_mapper.hh:33
pos_type seq_pos_t
type for a sequence position
Definition: sparsification_mapper.hh:36
pos_type matidx_t
type for a matrix position
Definition: sparsification_mapper.hh:35
bool is_valid_pos_external(seq_pos_t pos) const
Definition: sparsification_mapper.hh:441
matidx_t idx_after_leq(index_t index, seq_pos_t max_col, index_t left_end=std::numeric_limits< index_t >::max()) const
Definition: sparsification_mapper.hh:344
const ArcIdxVec & valid_arcs_right_adj(index_t idx, matidx_t pos) const
Definition: sparsification_mapper.hh:172
bool is_valid_pos(const Arc &arc, seq_pos_t pos) const
Definition: sparsification_mapper.hh:426
const InfoForPosVec & valid_seq_positions(index_t idx) const
Definition: sparsification_mapper.hh:161
size_t index_t
type for an index
Definition: sparsification_mapper.hh:38
std::vector< info_for_pos > InfoForPosVec
Definition: sparsification_mapper.hh:58
matidx_t idx_geq(index_t index, seq_pos_t min_col, index_t left_end=std::numeric_limits< index_t >::max()) const
Definition: sparsification_mapper.hh:299
BasePairs__Arc Arc
type of arc
Definition: sparsification_mapper.hh:32
matidx_t first_valid_mat_pos_before_eq(index_t index, seq_pos_t pos, index_t left_end=std::numeric_limits< index_t >::max()) const
Definition: sparsification_mapper.hh:188
bool pos_unpaired(index_t idx, matidx_t pos) const
Definition: sparsification_mapper.hh:249
size_type get_max_info_vec_size() const
Definition: sparsification_mapper.hh:150
Definition: aligner.cc:15
size_type pos_type
type of a sequence position
Definition: aux.hh:126
std::ostream & operator<<(std::ostream &out, const AlignerRestriction &r)
Definition: aligner_restriction.hh:135
size_t size_type
general size type
Definition: aux.hh:120
Definition: sparsification_mapper.hh:42
bool unpaired
if true, the sequence position can occur unpaired
Definition: sparsification_mapper.hh:45
ArcIdxVec valid_arcs
Definition: sparsification_mapper.hh:46
seq_pos_t seq_pos
the sequence position
Definition: sparsification_mapper.hh:43
void reset()
resets the content of the struct
Definition: sparsification_mapper.hh:51