LocARNA-2.0.0
scoring.hh
1 #ifndef LOCARNA_SCORING_HH
2 #define LOCARNA_SCORING_HH
3 
4 #ifdef HAVE_CONFIG_H
5 #include <config.h>
6 #endif
7 
8 #include <cmath>
9 #include <vector>
10 
11 #include "aux.hh"
12 
13 #include "scoring_fwd.hh"
14 #include "matrix.hh"
15 #include "sequence.hh"
16 #include "arc_matches.hh"
17 #include "named_arguments.hh"
18 
19 namespace LocARNA {
20 
21  //#define MEA_SCORING_OLD
22 
23  class RibosumFreq;
24  class Ribofit;
25  class Scoring;
26  class Sequence;
27  class BasePairs;
28  class BasePairs__Arc;
29  class ArcMatch;
30  class MatchProbs;
31  class RnaData;
32 
34  typedef std::vector<infty_score_t> ScoreVector;
35 
38 
40  typedef Matrix<double> ProbMatrix;
41 
51  class ScoringParams {
52  public:
60 
63 
66 
69 
72 
75  DEFINE_NAMED_ARG_DEFAULT_FEATURE(indel_opening_loop, score_t, -800);
76 
82  DEFINE_NAMED_ARG_DEFAULT_FEATURE(ribosum, const RibosumFreq *, nullptr);
83 
90  DEFINE_NAMED_ARG_DEFAULT_FEATURE(ribofit, const Ribofit *, nullptr);
91 
94 
100 
106 
109 
110  DEFINE_NAMED_ARG_FEATURE(exp_probA, double);
111 
112  DEFINE_NAMED_ARG_FEATURE(exp_probB, double);
113 
114  DEFINE_NAMED_ARG_DEFAULT_FEATURE(temperature_alipf, double, 150);
115 
117  DEFINE_NAMED_ARG_DEFAULT_FEATURE(stacking, bool, false);
118 
120  DEFINE_NAMED_ARG_DEFAULT_FEATURE(new_stacking, bool, false);
121 
123  DEFINE_NAMED_ARG_DEFAULT_FEATURE(mea_scoring, bool, false);
124 
127 
130 
133 
141  DEFINE_NAMED_ARG_DEFAULT_FEATURE(probability_scale, score_t, 10000);
142 
143  using valid_args = std::tuple<
144  match,
145  mismatch,
146  indel,
147  indel_loop,
148  indel_opening,
149  indel_opening_loop,
150  ribosum,
151  ribofit,
152  unpaired_penalty,
153  struct_weight,
154  tau_factor,
155  exclusion,
156  exp_probA,
157  exp_probB,
158  temperature_alipf,
159  stacking,
160  new_stacking,
161  mea_scoring,
162  mea_alpha,
163  mea_beta,
164  mea_gamma,
165  probability_scale
166  >;
167 
168  /*
169  The mea score is composed in the following way:
170 
171  params->probability_scale_ *
172  (
173  sum_basematchs (i,j)
174  P(i~j)
175  + mea_alpha/100 * (P(i unstructured) + P(j unstructured))
176  +
177  sum_arcmatchs (a,b)
178  mea_beta/100 * (P(a)+P(b)) * P(al~bl) * P(ar~br) *
179  ribosum_arcmatch(a,b)
180  )
181  */
182 
183  public:
190  template <typename... Args>
191  ScoringParams(Args... argpack) {
192  static_assert( type_subset_of<
193  std::tuple<Args...>,
194  valid_args>::value,
195  "Invalid type in named arguments pack." );
196 
197  auto args = std::make_tuple(argpack...);
198 
199  //mandatory
200  exp_probA_ = get_named_arg<exp_probA>(args);
201  exp_probB_ = get_named_arg<exp_probB>(args);
202 
203  //optional
204  match_ = get_named_arg_opt<match>(args);
205  mismatch_ = get_named_arg_opt<mismatch>(args);
206  indel_ = get_named_arg_opt<indel>(args);
207  indel_loop_ = get_named_arg_opt<indel_loop>(args);
208  indel_opening_ = get_named_arg_opt<indel_opening>(args);
209  indel_opening_loop_ = get_named_arg_opt<indel_opening_loop>(args);
210  ribosum_ = get_named_arg_opt<ribosum>(args);
211  ribofit_ = get_named_arg_opt<ribofit>(args);
212  unpaired_penalty_ = get_named_arg_opt<unpaired_penalty>(args);
213  struct_weight_ = get_named_arg_opt<struct_weight>(args);
214  tau_factor_ = get_named_arg_opt<tau_factor>(args);
215  exclusion_ = get_named_arg_opt<exclusion>(args);
216  temperature_alipf_ = get_named_arg_opt<temperature_alipf>(args);
217  stacking_ = get_named_arg_opt<stacking>(args);
218  new_stacking_ = get_named_arg_opt<new_stacking>(args);
219  mea_scoring_ = get_named_arg_opt<mea_scoring>(args);
220  mea_alpha_ = get_named_arg_opt<mea_alpha>(args);
221  mea_beta_ = get_named_arg_opt<mea_beta>(args);
222  mea_gamma_ = get_named_arg_opt<mea_gamma>(args);
223  probability_scale_ = get_named_arg_opt<probability_scale>(args);
224  }
225  };
226 
271  class Scoring {
272  public:
273  typedef BasePairs__Arc Arc;
274 
275  protected:
277 
279 
281 
284  const Sequence &seqA;
285  const Sequence &seqB;
286 
292 
293  public:
309  Scoring(const Sequence &seqA,
310  const Sequence &seqB,
311  const RnaData &rna_dataA,
312  const RnaData &rna_dataB,
313  const ArcMatches &arc_matches,
314  const MatchProbs *match_probs,
315  const ScoringParams &params);
316 
330  void
332 
340  void
342 
348  score_t
349  lambda() const {
350  return lambda_;
351  }
352 
353  protected:
354  // ------------------------------
355  // tables for precomputed score contributions
356  //
359 
360  std::vector<score_t> gapcost_tabA;
361  std::vector<score_t> gapcost_tabB;
362 
363  std::vector<score_t> weightsA; //<! weights of base pairs in A
364  std::vector<score_t> weightsB; //<! weights of base pairs in B
365 
366  std::vector<score_t> stack_weightsA; //<! weights of stacked base
367  //<! pairs in A
368  std::vector<score_t> stack_weightsB; //<! weights of stacked base
369  //<! pairs in B
370 
372 
373  void
374  precompute_sequence_identities();
375 
384  score_t
385  round2score(double d) const {
386  return (score_t)((d < 0) ? (d - 0.5) : (d + 0.5));
387  }
388 
398  score_t
399  sigma_(int i, int j) const;
400 
406  void
408 
410  void
412 
414  void
416 
426  void
427  precompute_weights(const RnaData &rna_data,
428  const BasePairs &bps,
429  double exp_prob,
430  std::vector<score_t> &weights,
431  std::vector<score_t> &stack_weights);
432 
442  score_t
443  probToWeight(double p, double prob_exp) const;
444 
449  double
450  ribosum_arcmatch_prob(const Arc &arcA, const Arc &arcB) const;
451 
465  score_t
466  riboX_arcmatch_score(const Arc &arcA, const Arc &arcB) const;
467 
469  void
470  subtract(std::vector<score_t> &v, score_t x) const;
471 
473  void
474  subtract(Matrix<score_t> &m, score_t x) const;
475 
476  public:
477  // ------------------------------------------------------------
478  // SCORE CONTRIBUTIONS
479 
488  score_t
490  return sigma_tab(i, j);
491  }
492 
510  score_t
511  arcmatch(const ArcMatch &am, bool stacked = false) const;
512 
530  score_t
531  arcmatch(const BasePairs__Arc &arcA,
532  const BasePairs__Arc &arcB,
533  bool stacked = false) const;
534 
537  const ArcMatches *
538  arc_matches() const {
539  return arc_matches_;
540  }
541 
551  template <bool gapAorB>
552  score_t
553  arcDel(const BasePairs__Arc &arc, bool stacked = false) const;
554 
562  score_t
563  arcmatch_stacked(const ArcMatch &am) const {
564  return arcmatch(am, true);
565  }
566 
576  template <bool gapInA>
577  inline
578  score_t
579  gapX(size_type alignedToGap) const;
580 
588  score_t
589  gapA(size_type posA) const {
590  assert(1 <= posA && posA <= seqA.length());
591 
592  return gapcost_tabA[posA];
593  }
594 
602  score_t
603  gapB(size_type posB) const {
604  assert(1 <= posB && posB <= seqB.length());
605 
606  return gapcost_tabB[posB];
607  }
608 
610  score_t
611  exclusion() const {
612  return params->exclusion_;
613  }
614 
616  score_t
617  indel_opening() const {
618  return params->indel_opening_;
619  }
620 
622  score_t
623  loop_indel_score(const score_t score) const {
624  return round2score(score * params->indel_loop_ / params->indel_);
625  }
627  score_t
629  return params->indel_opening_loop_;
630  }
631 
632  //
633  // ------------------------------------------------------------
634 
644  double
645  prob_exp(size_type len) const;
646 
652  bool
653  stacking() const {
654  return params->stacking_ || params->new_stacking_;
655  }
656 
664  bool
665  is_stackable_arcA(const Arc &a) const;
666 
674  bool
675  is_stackable_arcB(const Arc &a) const;
676 
684  bool
685  is_stackable_am(const ArcMatch &am) const;
686 
687  }; // end class Scoring
688 
689  template <bool isA>
690  score_t
691  Scoring::arcDel(const Arc &arcX,
692  bool stacked) const { // TODO Important Scoring scheme for
693  // aligning an arc to a gap is not
694  // defined and implemented!
695 
696  if (arc_matches_
697  ->explicit_scores()) { // will not take stacking into account!!!
698  std::cerr
699  << "ERROR sparse explicit scores is not supported!"
700  << std::endl; // TODO: Supporting explicit scores for arcgap
701  assert(!arc_matches_->explicit_scores());
702  }
703 
704  if (!params->mea_scoring_) {
705  return
706  // base pair weights
708  gapX<isA>(arcX.left()) +
709  gapX<isA>(arcX.right())) + // score of aligining base-pair
710  // ends wo gap, such that it is
711  // multiply by gamma_loop ratio
712  (stacked ? (isA ? stack_weightsA[arcX.idx()]
713  : stack_weightsB[arcX.idx()])
714  : (isA ? weightsA[arcX.idx()] : weightsB[arcX.idx()]));
715  } else {
716  std::cerr << "ERROR sparse mea_scoring is not supported!"
717  << std::endl; // TODO: Supporting mea_scoring for arcgap
718  assert(!params->mea_scoring_);
719  return 0;
720  }
721  }
722 
723  template <>
724  inline
725  score_t
726  Scoring::gapX<true>(size_type alignedToGap) const {
727  return gapA(alignedToGap);
728  }
729 
730  template <>
731  inline
732  score_t
733  Scoring::gapX<false>(size_type alignedToGap) const {
734  return gapB(alignedToGap);
735  }
736 
737 
744  template<typename T>
745  class PFScoring : public Scoring {
746  public:
747  using pf_score_t = T;
748 
750  using PFScoreVector = std::vector<pf_score_t>;
751 
754 
755 
769  PFScoring(const Sequence &seqA,
770  const Sequence &seqB,
771  const RnaData &rna_dataA,
772  const RnaData &rna_dataB,
773  const ArcMatches &arc_matches,
774  const MatchProbs *match_probs,
775  const ScoringParams &params);
776 
785  pf_score_t
787  return exp_sigma_tab(i, j);
788  }
789 
797  pf_score_t
798  exp_arcmatch(const ArcMatch &am) const {
799  return boltzmann_weight(arcmatch(am));
800  }
801 
809  pf_score_t
810  exp_gapA(size_type posA) const {
811  assert(1 <= posA && posA <= seqA.length());
812  return exp_gapcost_tabA[posA];
813  }
814 
822  pf_score_t
823  exp_gapB(size_type posB) const {
824  assert(1 <= posB && posB <= seqB.length());
825 
826  return exp_gapcost_tabB[posB];
827  }
828 
830  pf_score_t
832  return exp_indel_opening_score;
833  }
834 
836  pf_score_t
838  return exp_indel_opening_loop_score;
839  }
840 
841 
842  protected:
844  void
846 
847 
853  void
855 
856 
857  pf_score_t
858  boltzmann_weight(score_t s) const {
859  return std::exp(s / (pf_score_t)params->temperature_alipf_);
860  }
861 
862 
863  private:
864  // ------------------------------
865  // tables for precomputed exp score contributions for partition function
866  //
867  Matrix<pf_score_t>
868  exp_sigma_tab;
869  pf_score_t exp_indel_opening_score;
871  pf_score_t exp_indel_opening_loop_score;
874  std::vector<pf_score_t>
875  exp_gapcost_tabA;
876  std::vector<pf_score_t>
877  exp_gapcost_tabB;
878 
879  };
880 
881  template <typename T>
883  const Sequence &seqB,
884  const RnaData &rna_dataA,
885  const RnaData &rna_dataB,
886  const ArcMatches &arc_matches,
887  const MatchProbs *match_probs,
888  const ScoringParams &params)
889  : Scoring(seqA,seqB,rna_dataA,rna_dataB,
890  arc_matches,match_probs,params) {
891 
892  exp_indel_opening_score = boltzmann_weight(params.indel_opening_);
893  exp_indel_opening_loop_score =
894  boltzmann_weight(params.indel_opening_loop_);
897  }
898 
899  template <typename T>
900  void
902  size_type lenA = seqA.length();
903  size_type lenB = seqB.length();
904 
905  exp_sigma_tab.resize(lenA + 1, lenB + 1);
906 
907  for (size_type i = 1; i <= lenA; ++i) {
908  for (size_type j = 1; j <= lenB; ++j) {
909  exp_sigma_tab(i, j) = boltzmann_weight(sigma_tab(i, j));
910  }
911  }
912  }
913 
914  template <typename T>
915  void
917  size_type lenA = seqA.length();
918  size_type lenB = seqB.length();
919 
920  // resize and create tables
921  exp_gapcost_tabA.resize(lenA + 1);
922  exp_gapcost_tabB.resize(lenB + 1);
923 
924  for (size_type i = 1; i < lenA + 1; i++) {
925  exp_gapcost_tabA[i] = boltzmann_weight(gapcost_tabA[i]);
926  }
927 
928  for (size_type i = 1; i < lenB + 1; i++) {
929  exp_gapcost_tabB[i] = boltzmann_weight(gapcost_tabB[i]);
930  }
931  }
932 
933 } // end namespace LocARNA
934 
935 #endif // LOCARNA_SCORING_HH
Represents a match of two base pairs (arc match)
Definition: arc_matches.hh:35
Maintains the relevant arc matches and their scores.
Definition: arc_matches.hh:116
bool explicit_scores() const
Definition: arc_matches.hh:362
Represents a base pair.
Definition: basepairs.hh:39
size_t right() const
Definition: basepairs.hh:77
size_t idx() const
Definition: basepairs.hh:87
size_t left() const
Definition: basepairs.hh:67
Describes sequence and structure ensemble of an RNA.
Definition: basepairs.hh:108
Provide match probabilities.
Definition: edge_probs.hh:91
pos_type length() const
Length of multiple aligment.
Definition: multiple_alignment.hh:561
Definition: scoring.hh:745
std::vector< pf_score_t > PFScoreVector
Vector of partition functions.
Definition: scoring.hh:750
pf_score_t exp_indel_opening_loop() const
exp of cost to begin a new indel in loops
Definition: scoring.hh:837
pf_score_t exp_basematch(size_type i, size_type j) const
Boltzmann weight of score of a base match (without structure)
Definition: scoring.hh:786
pf_score_t exp_arcmatch(const ArcMatch &am) const
Boltzmann weight of score of arc match.
Definition: scoring.hh:798
void precompute_exp_gapcost()
Precompute the tables for Boltzmann weights of gapcost.
Definition: scoring.hh:916
pf_score_t exp_gapA(size_type posA) const
Boltzmann weight of score of deletion.
Definition: scoring.hh:810
PFScoring(const Sequence &seqA, const Sequence &seqB, const RnaData &rna_dataA, const RnaData &rna_dataB, const ArcMatches &arc_matches, const MatchProbs *match_probs, const ScoringParams &params)
construct scoring object
Definition: scoring.hh:882
pf_score_t exp_indel_opening() const
exp of cost to begin a new indel
Definition: scoring.hh:831
pf_score_t exp_gapB(size_type posB) const
Boltzmann weight of score of insertion.
Definition: scoring.hh:823
void precompute_exp_sigma()
Precompute all Boltzmann weights of base similarities.
Definition: scoring.hh:901
Family of Ribofit matrices.
Definition: ribofit.hh:25
Represents ribosum similarity matrices including raw frequencies.
Definition: ribosum.hh:190
represent sparsified data of RNA ensemble
Definition: rna_data.hh:44
Parameters for scoring.
Definition: scoring.hh:51
ScoringParams(Args... argpack)
Definition: scoring.hh:191
DEFINE_NAMED_ARG_DEFAULT_FEATURE(stacking, bool, false)
turn on/off stacking terms
DEFINE_NAMED_ARG_DEFAULT_FEATURE(indel_loop, score_t, -200)
cost per indel for loops (for linear or affine gap cost).
DEFINE_NAMED_ARG_DEFAULT_FEATURE(indel_opening_loop, score_t, -800)
DEFINE_NAMED_ARG_DEFAULT_FEATURE(indel_opening, score_t, -750)
cost per gap (for affine gap-cost). Use affine gap cost if non-zero.
DEFINE_NAMED_ARG_DEFAULT_FEATURE(exclusion, score_t, 0)
cost of one exclusion.
DEFINE_NAMED_ARG_DEFAULT_FEATURE(mea_gamma, score_t, 100)
weight for mea contribution "consensus"
DEFINE_NAMED_ARG_DEFAULT_FEATURE(indel, score_t, -150)
cost per indel (for linear or affine gap cost).
DEFINE_NAMED_ARG_DEFAULT_FEATURE(match, score_t, 50)
DEFINE_NAMED_ARG_DEFAULT_FEATURE(ribofit, const Ribofit *, nullptr)
DEFINE_NAMED_ARG_DEFAULT_FEATURE(mea_beta, score_t, 200)
weight for mea contribution "structure"
DEFINE_NAMED_ARG_DEFAULT_FEATURE(probability_scale, score_t, 10000)
DEFINE_NAMED_ARG_DEFAULT_FEATURE(unpaired_penalty, score_t, 0)
penalty/cost for unpaired bases matched/mismatched/gapped
DEFINE_NAMED_ARG_DEFAULT_FEATURE(struct_weight, score_t, 200)
DEFINE_NAMED_ARG_DEFAULT_FEATURE(new_stacking, bool, false)
turn on/off new stacking terms
DEFINE_NAMED_ARG_DEFAULT_FEATURE(mea_scoring, bool, false)
turn on/off mea scoring
DEFINE_NAMED_ARG_DEFAULT_FEATURE(mea_alpha, score_t, 0)
weight for mea contribution "unstructured"
DEFINE_NAMED_ARG_DEFAULT_FEATURE(ribosum, const RibosumFreq *, nullptr)
DEFINE_NAMED_ARG_DEFAULT_FEATURE(tau_factor, score_t, 50)
DEFINE_NAMED_ARG_DEFAULT_FEATURE(mismatch, score_t, 0)
constant cost of a base mismatch
Provides methods for the scoring of alignments.
Definition: scoring.hh:271
score_t gapB(size_type posB) const
Definition: scoring.hh:603
score_t lambda_
Definition: scoring.hh:291
double ribosum_arcmatch_prob(const Arc &arcA, const Arc &arcB) const
Definition: scoring.cc:318
const RnaData & rna_dataA
rna data for RNA A
Definition: scoring.hh:282
score_t indel_opening_loop() const
cost to begin a new indel
Definition: scoring.hh:628
Scoring(const Sequence &seqA, const Sequence &seqB, const RnaData &rna_dataA, const RnaData &rna_dataB, const ArcMatches &arc_matches, const MatchProbs *match_probs, const ScoringParams &params)
construct scoring object
Definition: scoring.cc:28
const ScoringParams * params
a collection of parameters for scoring
Definition: scoring.hh:276
bool is_stackable_am(const ArcMatch &am) const
Is arc match stackable.
Definition: scoring.cc:567
const MatchProbs * match_probs
base match probabilities
Definition: scoring.hh:280
void precompute_sigma()
Precompute all base similarities.
Definition: scoring.cc:112
score_t arcmatch_stacked(const ArcMatch &am) const
Score of stacked arc match.
Definition: scoring.hh:563
const Sequence & seqA
sequence A
Definition: scoring.hh:284
bool stacking() const
Query stacking flag.
Definition: scoring.hh:653
BasePairs__Arc Arc
arc
Definition: scoring.hh:273
score_t gapA(size_type posA) const
Definition: scoring.hh:589
void subtract(std::vector< score_t > &v, score_t x) const
subtract from each element of a score_t vector v a value x
Definition: scoring.cc:55
score_t sigma_(int i, int j) const
Compute base similarity.
Definition: scoring.cc:141
void precompute_gapcost()
Precompute the tables for gapcost.
Definition: scoring.cc:272
score_t arcmatch(const ArcMatch &am, bool stacked=false) const
Score of arc match, support explicit arc match scores.
Definition: scoring.cc:541
score_t riboX_arcmatch_score(const Arc &arcA, const Arc &arcB) const
ribofit or ribosum arcmatch score contribution
Definition: scoring.cc:369
double prob_exp(size_type len) const
Expected base pair probability.
Matrix< score_t > sigma_tab
precomputed table of base match similarities
Definition: scoring.hh:358
score_t lambda() const
Get factor lambda for normalized alignment.
Definition: scoring.hh:349
score_t arcDel(const BasePairs__Arc &arc, bool stacked=false) const
Very basic interface, score of aligning a basepair to gap.
Definition: scoring.hh:691
std::vector< score_t > gapcost_tabB
table for gapcost in B
Definition: scoring.hh:361
bool is_stackable_arcB(const Arc &a) const
Is arc of B stackable.
Definition: scoring.cc:562
score_t loop_indel_score(const score_t score) const
multiply an score by the ratio of indel_loop/indel
Definition: scoring.hh:623
const Sequence & seqB
sequence B
Definition: scoring.hh:285
void precompute_weights()
Precompute weights/stacked weights for all arcs in A and B.
Definition: scoring.cc:240
score_t exclusion() const
cost of an exclusion
Definition: scoring.hh:611
score_t gapX(size_type alignedToGap) const
const RnaData & rna_dataB
rna data for RNA B
Definition: scoring.hh:283
const ArcMatches * arc_matches_
arc matches
Definition: scoring.hh:278
score_t round2score(double d) const
Round a double to score_t.
Definition: scoring.hh:385
bool is_stackable_arcA(const Arc &a) const
Is arc of A stackable.
Definition: scoring.cc:557
void modify_by_parameter(score_t lambda)
modify scoring by a parameter lambda.
Definition: scoring.cc:77
score_t indel_opening() const
cost to begin a new indel
Definition: scoring.hh:617
std::vector< score_t > gapcost_tabA
table for gapcost in A
Definition: scoring.hh:360
Matrix< size_t > identity
sequence identities in percent
Definition: scoring.hh:371
void apply_unpaired_penalty()
Definition: scoring.cc:65
score_t probToWeight(double p, double prob_exp) const
convert probability to weight for scoring
Definition: scoring.cc:251
const ArcMatches * arc_matches() const
Definition: scoring.hh:538
score_t basematch(size_type i, size_type j) const
Score of a match of bases (without structure)
Definition: scoring.hh:489
"Sequence View" of multiple alignment as array of column vectors
Definition: sequence.hh:17
Definition: aligner.cc:15
std::vector< infty_score_t > ScoreVector
matrix of scores supporting infinity
Definition: scoring.hh:31
Matrix< double > ProbMatrix
matrix for storing probabilities
Definition: aligner_p.hh:27
Matrix< infty_score_t > ScoreMatrix
matrix of scores supporting infinity
Definition: scoring.hh:37
size_t size_type
general size type
Definition: aux.hh:120
long int score_t
type of the locarna score as defined by the class Scoring
Definition: scoring_fwd.hh:13
Definition: named_arguments.hh:56