/* Copyright by Matthias Hoechsmann (C) 2002-2004 ===================================== You may use, copy and distribute this file freely as long as you - do not change the file, - leave this copyright notice in the file, - do not make any profit with the distribution of this file - give credit where credit is due You are not allowed to copy or distribute this file otherwise The commercial usage and distribution of this file is prohibited Please report bugs and suggestions to */ #ifndef WIN32 #include "config.h" #endif #include #ifdef HAVE_LIBG2 #include #include #endif #include #include #include #include #include #ifdef HAVE_LIBXMLPLUSPLUS #include #endif #include "matrix.h" #include "rna_profile_alignment.h" #include "rnafuncs.h" #include "utils.h" #ifdef HAVE_LIBRNA // This features require the ViennaRNA library #include "part_func.h" #include "fold_vars.h" extern "C"{ #include "fold.h" } #endif #include "ppforest.t.cpp" ostream& operator <<(ostream &s,RNA_Alphabet_Profile p) { s.precision(2); s << p.p[ALPHA_PRO_BASE_A] << "\\n"; s.precision(2); s << p.p[ALPHA_PRO_BASE_C] << "\\n"; s.precision(2); s << p.p[ALPHA_PRO_BASE_G] << "\\n"; s.precision(2); s << p.p[ALPHA_PRO_BASE_U] << "\\n"; s.precision(2); s << p.p[ALPHA_PRO_BASEPAIR] << "\\n"; s.precision(2); s << p.p[ALPHA_PRO_GAP]; s.precision(2); s << p.p[ALPHA_PRO_BASE]; // for(Uint i=0;i(RNAFuncs::treeSize(viennaStr)), m_name(name), m_numStructures(1) { buildForest(baseStr,viennaStr); addStrName(name); }; #ifdef HAVE_LIBRNA // This features require the ViennaRNA library RNAProfileAlignment::RNAProfileAlignment(const string &baseStr, const string &name, const string &constraint, double t) : PPForestAli(2*baseStr.length()), m_name(name), m_numStructures(1) { char *viennaStr=NULL; // calculate partition function for the sequence do_backtrack=1; init_pf_fold(baseStr.length()); //if(constraint.length()>0) //pf_fold((char*)baseStr.c_str(),(char*)constraint.c_str()); // expicit conversion to non-const value, but pf_fold does not alter baseStr //else pf_fold((char*)baseStr.c_str(),NULL); // expicit conversion to non-const value, but pf_fold does not alter baseStr viennaStr=new char[baseStr.length()+1]; dangles=2; fold((char*)baseStr.c_str(),viennaStr); setSize(RNAFuncs::treeSize(viennaStr)); buildForest(baseStr,viennaStr,true); free_pf_arrays(); delete[] viennaStr; // hasSequence=true; addStrName(name); } #endif RNAProfileAlignment::RNAProfileAlignment(const string &filename) { // read ppforest from file size_type size; ifstream s; char *colStr; s.open(filename.c_str()); // first of all save the size s.read((char*)&size,sizeof(size_type)); s.read((char*)&m_numStructures,sizeof(Uint)); colStr=new char[m_numStructures+1]; colStr[m_numStructures]=0; // terminate string // initialize structures // ::PPForestBase(m_size); // m_lb=new L[ppf.size()]; initialize(size); // save the arrays for(Uint i=0;i(&d),sizeof(double)); l.p[r]=d; } s.read(colStr,sizeof(char)*m_numStructures); l.columnStr=colStr; m_lb[i]=l; } s.read(reinterpret_cast(m_rb),sizeof(size_type)*size); s.read(reinterpret_cast(m_noc),sizeof(size_type)*size); // s.read((char*)m_sumUpCSF,sizeof(size_type)*size); // s.read((char*)m_rmb,sizeof(size_type)*size); for(Uint r=0;r(ppf) { m_numStructures=ppf.m_numStructuresX+ppf.m_numStructuresY; }*/ /* ****************************************** */ /* Private functions */ /* ****************************************** */ void RNAProfileAlignment::makeRepLabel(size_type node, RNA_Alphabet_Profile a, RNA_Alphabet_Profile b) { double p,q; double m; p=m_numStructuresX; // weight number of structures in profile q=m_numStructuresY; m=p+q; p/=m; q/=m; // profile for(int i=0;i 0) // if(baseStr.length() != viennaStrLen) // throw RNAForestExceptionInput(RNAForestExceptionInput::Error_BaseStringAndViennaStringIncompatible); nodeStack=new Uint[maxDepth+1]; numChildrenStack=new Uint[maxDepth+1]; baseposStack=new Uint[maxDepth+1]; memset(nodeStack,0,sizeof(Uint)*maxDepth+1); memset(numChildrenStack,0,sizeof(Uint)*maxDepth+1); memset(baseposStack,0,sizeof(Uint)*maxDepth+1); // fill PPForest structure stackPtr=0; node=0; for(Uint i=0;i &pairprob, double t,size_type i, size_type j) const { size_type h; double bestPairScore; // list leftPairList; //list rightPairList; //Uint bestLeftIndex,bestRightIndex; //Uint lastLeftIndex,lastRightIndex; // QWATCH(i); // QWATCH(j); if(j==0) return; if(isPair(i)) { // TRACE(DBG_GET_PROFILE_STRUCTURE,"Profile_RNA_Alignment_Forest::getStructureAlignmentFromCSF","basepair"); // WATCH(DBG_GET_PROFILE_STRUCTURE,"Profile_RNA_Alignment_Forest::getStructureAlignmentFromCSF",m_lb[i].p[ALPHA_BASEPAIR]); bestPairScore=bestPairs(i); // backtrack best pairs if(bestPairScore == bestPairs(i+1) + bestPairs(getRightmostBrotherIndex(i+1)) || m_lb[i].p[ALPHA_PRO_BASEPAIR] < t) { // i pairs not // cout << "unpaired" << endl; getStructureAlignmentFromCSF(s,pairprob,t,i+1,noc(i)); // cout << "back to:" << endl; // QWATCH(i); // QWATCH(j); } else { // cout << "paired" << endl; // i pairs s += '('; pairprob.push_back(m_lb[i].p[ALPHA_PRO_BASEPAIR]); // left path - righthand best pairs h=i+1; while(h < size() && isPair(h)) { // cout << "left" << endl; // QWATCH(h); assert((int)noc(h)-1>=0); getStructureAlignmentFromCSF(s,pairprob,t,rb(h+1),noc(h)-1); h=h+1; } assert((int)noc(i)-2>=0); getStructureAlignmentFromCSF(s,pairprob,t,rb(i+1),noc(i)-2); // cout << "back to:" << endl; // QWATCH(i); // QWATCH(j); // right path - lefthand best pairs h=getRightmostBrotherIndex(i+1); while(h < size() && isPair(h)) { // cout << "right" << endl; // QWATCH(h); assert((int)noc(h)-1>=0); getStructureAlignmentFromCSF(s,pairprob,t,h+1,noc(h)-1); //h=h+1; h=getRightmostBrotherIndex(h+1); } s += ')'; pairprob.push_back(m_lb[i].p[ALPHA_PRO_BASEPAIR]); } } else { s+= '.'; pairprob.push_back(m_lb[i].p[ALPHA_PRO_BASE]); } // right forest getStructureAlignmentFromCSF(s,pairprob,t,rb(i),j-1); } double RNAProfileAlignment::bestPairs(size_type node) const { size_type i=node; double d1,d2; WATCH(DBG_GET_PROFILE_STRUCTURE,"RNAProfileForest::getStructureAlignment",node); if(isBase(node)) return 0; else { // node pairs not d1=bestPairs(node+1) + bestPairs(getRightmostBrotherIndex(node+1)); // node pairs d2=label(node).p[ALPHA_PRO_BASEPAIR]; // left path - righthand best pairs i=node+1; while(i < size() && isPair(i)) { d2+=bestPairs(getRightmostBrotherIndex(i+1)); i=i+1; } // right path - lefthand best pairs i=getRightmostBrotherIndex(node+1); while(isPair(i) && i < size()) { d2+=bestPairs(i+1); i=getRightmostBrotherIndex(i+1); } return max(d1,d2); } } #ifdef HAVE_LIBG2 // This features require the g2 library void RNAProfileAlignment::drawBaseCircles(int device_id,const BaseProbs &bp,double center_x,double center_y) const { const double box_size=6.0; const double max_radius=3.0; double xpos,ypos; int color; // double dashes=0.5; // draw the base probabilities as circles on the edges of the square and the gap probability // as the center square // upper left corner = a xpos=center_x-box_size/2.0; ypos=center_y+box_size/2.0; color=g2_ink(device_id,1,0,0); g2_pen(device_id,color); g2_filled_circle(device_id,xpos,ypos,max_radius*bp.a); // upper right corner = c xpos=center_x+box_size/2.0; ypos=center_y+box_size/2.0; color=g2_ink(device_id,0,1,0); g2_pen(device_id,color); g2_filled_circle(device_id,xpos,ypos,max_radius*bp.c); // lower right corner = g xpos=center_x+box_size/2.0; ypos=center_y-box_size/2.0; color=g2_ink(device_id,0,0,1); g2_pen(device_id,color); g2_filled_circle(device_id,xpos,ypos,max_radius*bp.g); // lower right corner = u xpos=center_x-box_size/2.0; ypos=center_y-box_size/2.0; color=g2_ink(device_id,1,0,1); g2_pen(device_id,color); g2_filled_circle(device_id,xpos,ypos,max_radius*bp.u); // gap in the center color=g2_ink(device_id,0,0,0); g2_pen(device_id,color); g2_filled_circle(device_id,center_x,center_y,max_radius*bp.gap); // draw rectangle for orientation //g2_set_dash(device_id,1,&dashes); color=g2_ink(device_id,0.75,0.75,0.75); g2_pen(device_id,color); g2_rectangle(device_id,center_x-box_size/2,center_y-box_size/2,center_x+box_size/2,center_y+box_size/2); } #endif inline double RNAProfileAlignment::getMlBaseFreq(const BaseProbs &bp) const { return max(max(max(bp.a,bp.c),bp.g),bp.u); } char RNAProfileAlignment::getMlBase(const BaseProbs &bp) const { double p = getMlBaseFreq(bp); if(bp.a==p) return ALPHA_BASE_A; if(bp.c==p) return ALPHA_BASE_C; if(bp.g==p) return ALPHA_BASE_G; if(bp.u==p) return ALPHA_BASE_U; assert(false); // you should never get so far ... return 'x'; } void RNAProfileAlignment::getSeqAli(string &seq,Uint row, Uint i, Uint j) const { if(i==0 && j==getMaxLength(0)) seq=""; if(j==0) return; // basepair=internal node if(isPair(i)) { getSeqAli(seq,row,i+1,noc(i)); getSeqAli(seq,row,rb(i),j-1); } else { // base=leaf seq+=m_lb[i].columnStr[row]; getSeqAli(seq,row,rb(i),j-1); } } void RNAProfileAlignment::getStructAli(string &s,Uint row) const { s=""; map pairs; makePairTable(pairs, row); // iterate through leaves nodes and use information of pairs for(Uint i=0;i &pairs, Uint row) const { pair *baseIndex; // left and right base RNA_Alphabet c; assert(row[m_size]; // initialize pairs, all gaps for(int i=size()-1;i>=0;i--) { baseIndex[i].first=-1; baseIndex[i].second=-1; } for(int i=size()-1;i>=0;i--) { c=m_lb[i].columnStr[row]; if(isLeave(i)) { if(c!=ALPHA_GAP) { baseIndex[i].first=i; baseIndex[i].second=i; } } else { // internal node // leftmost and rightmost base bool lmBaseFound=false; for(size_type r=0,h=i+1;r &pairprob, deque &baseprobs, double minFreq) const { deque::iterator itPP; // deque::iterator itBP; string::iterator it; for(itPP=pairprob.begin(),it=structure.begin();itPP!=pairprob.end();itPP++,it++) { if(*itPPbase &baseprob) const { BaseProbs bp; // generate base strings for(size_type i=0;i &pairprob) const { WATCH(DBG_GET_PROFILE_STRUCTURE,"Profile_RNA_Alignment_Forest::getStructureAlignment",size()); s=""; getStructureAlignmentFromCSF(s,pairprob,t,0,getMaxLength(0)); } #ifdef HAVE_LIBG2 // This features require the g2 library void RNAProfileAlignment::squigglePlot(const string &filename, SquigglePlotOptions &options) const { const double base_fontsize=8; const Uint num_grey_colors=100; const double min_grey_color=1.0; string seq,structure; string base,structname; float *X,*Y,min_X=0,max_X=0,min_Y=0,max_Y=0; Uint i; short *pair_table; int id_PS,id; int ps_grey_colors[num_grey_colors]; int ps_color_red; int ps_color_black; double xpos,ypos; deque pairprob; deque baseprobs; getStructureAlignment(options.minPairProb,structure,pairprob); getSequenceAlignment(baseprobs); // filterConsensus(structure,pairprob,baseprobs,0.5); //assert(baseprobs.size() == structure.size()); if(baseprobs.size() != structure.size()) cerr << "Error in resolving consensus structure!" << endl; X = new float[structure.size()]; Y = new float[structure.size()]; pair_table = make_pair_table(structure.c_str()); i = naview_xy_coordinates(pair_table, X, Y); if(i!=structure.size()) cerr << "strange things happening in squigglePlot ..." << endl; // calculate image dimesions for(i=0;ii+1) { // pairs in both structures if(pairprob[i]==1) g2_pen(id,ps_color_red); else g2_pen(id,ps_grey_colors[(int)floor(pairprob[i]*num_grey_colors-1)]); g2_line(id,X[i],Y[i],X[pair_table[i+1]-1],Y[pair_table[i+1]-1]); } } g2_flush(id); g2_close(id); free(pair_table); DELETE(X); DELETE(Y); } #endif void RNAProfileAlignment::printSeqAli() const { Uint i,l; deque seqs; string seq,info; // get alignment rows for(i=0;i > RNAProfileAlignment::getSeqAli() const { Uint i; deque > seqAli; pair p; string seq; // get alignment rows^M for(i=0;i strs; string str,info; // get alignment rows for(i=0;i RNAProfileAlignment::getStrAli() const { Uint i; deque strs; string str; // get alignment rows^M for(i=0;i baseprobs; getSequenceAlignment(baseprobs); // build sequence deque::const_iterator it; for(it=baseprobs.begin();it!=baseprobs.end();it++) { seq += getMlBase(*it); } return seq; } string RNAProfileAlignment::getConsStr(double minPairProb) const { string str; deque pairprob; getStructureAlignment(minPairProb,str,pairprob); return str; } deque RNAProfileAlignment::getBaseProb() const { deque baseprobs; deque bestBaseprob; getSequenceAlignment(baseprobs); // build sequence deque::const_iterator it; for(it=baseprobs.begin();it!=baseprobs.end();it++) { bestBaseprob.push_back(getMlBaseFreq(*it)); } return bestBaseprob; } //deque,double> > RNAProfileAlignment::getPairProb(double minPairProb) void RNAProfileAlignment::getPairProb(double &minPairProb, deque,double> > &pairprobs) { //deque,double> > structureProbs; deque pairprob; string str; getStructureAlignment(minPairProb,str,pairprob); // store the position of open brackets in an integer deque deque openBrackets; // iterator for structureProbs pair tmpPair; pair,double> tmpPair2; // iterate through structure str for(int i=0;i<=str.length();i++) { if(str[i]=='(') { openBrackets.push_back(i); } else if(str[i]==')') { // insert the int value of the corresponding opening bracket into structureProbs // insert position of the actual closing bracket // insert probability according to the actual bases tmpPair = pair (openBrackets.back(),i); tmpPair2 = pair,double> (tmpPair,pairprob[i]); pairprobs.push_back(tmpPair2); // delete position of the last opening bracket openBrackets.pop_back(); } } } void RNAProfileAlignment::printFastaAli(bool noStructure) const { string str,seq; // get alignment rows for(Uint i=0;i" << m_strNames[i] << endl; cout << seq << endl; if(!noStructure) cout << str << endl; } } void RNAProfileAlignment::printConsensus(double minPairProb) const { const int mountain_high=10; int j; string seq,structure; deque baseprobs; deque pairprob; getSequenceAlignment(baseprobs); getStructureAlignment(minPairProb,structure,pairprob); // build sequence deque::const_iterator it; for(it=baseprobs.begin();it!=baseprobs.end();it++) { seq += getMlBase(*it); } // assert(seq.size() == structure.size()); if(seq.size() != structure.size()) cerr << "Error in resolving consensus structure!" << endl; for(Uint i=0;i=1-(1/(double)mountain_high)*(double)j) cout << "*"; else cout << " "; } } cout << endl; } cout << setw(25) << " " << RNAFuncs::UpperCase(seq.substr(i,55)) << endl; cout << setw(25) << " " << structure.substr(i,55) << endl; // show structure frequency mountain for(j=0;j=(1/(double)mountain_high)*(double)j) cout << "*"; else cout << " "; } } cout << endl; } cout << endl; } } void RNAProfileAlignment::addStrNames(const deque& strNames) { deque::const_iterator it; for(it=strNames.begin();it!=strNames.end();it++) m_strNames.push_back(*it); } void RNAProfileAlignment::save(const string &filename) { ofstream s(filename.c_str()); // save the pforest to stream in binary format // first of all save the size s.write(reinterpret_cast(&m_size),sizeof(size_type)); s.write(reinterpret_cast(&m_numStructures),sizeof(Uint)); // save the arrays for(Uint i=0;i(&m_lb[i].p[r]),sizeof(double)); s.write(reinterpret_cast(m_lb[i].columnStr.c_str()),sizeof(char)*m_numStructures); } // for(Uint i=0;i(&m_lb[i]),sizeof(label_type)*m_size); // } // s.write(reinterpret_cast(m_lb),sizeof(label_type)*m_size); s.write(reinterpret_cast(m_rb),sizeof(size_type)*m_size); s.write(reinterpret_cast(m_noc),sizeof(size_type)*m_size); for(Uint r=0;r(ss.str().c_str()),sizeof(char)*20); } // s.write(reinterpret_cast(m_sumUpCSF),sizeof(size_type)*m_size); // s.write(reinterpret_cast(m_rmb),sizeof(size_type)*m_size); // s.write(m_name; //Uint m_numStructures; //Uint m_numStructuresX; //Uint m_numStructuresY; //deque m_strNames; //bool hasSequence; }