[Bioperl-guts-l] [14995] bioperl-live/trunk: [bug 2663]

Christopher John Fields cjfields at dev.open-bio.org
Sun Nov 16 01:20:00 EST 2008


Revision: 14995
Author:   cjfields
Date:     2008-11-16 01:20:00 -0500 (Sun, 16 Nov 2008)

Log Message:
-----------
[bug 2663]
* some fixes for seq_ind().
* 7 other SearchIO.t tests failing (all FASTA related), will look into

Modified Paths:
--------------
    bioperl-live/trunk/Bio/Search/HSP/GenericHSP.pm
    bioperl-live/trunk/t/SearchIO.t

Modified: bioperl-live/trunk/Bio/Search/HSP/GenericHSP.pm
===================================================================
--- bioperl-live/trunk/Bio/Search/HSP/GenericHSP.pm	2008-11-16 06:18:02 UTC (rev 14994)
+++ bioperl-live/trunk/Bio/Search/HSP/GenericHSP.pm	2008-11-16 06:20:00 UTC (rev 14995)
@@ -715,7 +715,6 @@
     return $self->{RANK};
 }
 
-
 =head2 seq_inds
 
  Title   : seq_inds
@@ -734,6 +733,7 @@
            :             'nomatch'   - mismatched residue or gap positions
            :             'mismatch'  - mismatched residue positions (no gaps)
            :             'gap'       - gap positions only
+           :             'frameshift'- frameshift positions only
            :             'conserved-not-identical' - conserved positions w/o 
            :                            identical residues
            :             The name can be shortened to 'id' or 'cons' unless
@@ -796,6 +796,8 @@
        $class = 'mismatch';
    } elsif( $t eq 'g' ) {
        $class = 'gap';
+   } elsif( $t eq 'f' ) {
+       $class = 'frameshift';    
    } else {
        $self->warn("unknown sequence class $class using 'identical'");
        $class = 'identical';
@@ -817,17 +819,17 @@
         # repeat the position based on the number of gaps inserted
         @ary = map { my @tmp;
                      # position holds number of gaps inserted
-                     for my $g (1..$self->{"${class}Res$seqType"}->{$_}) {
+                     for my $g (1..$self->{seqinds}{"${class}Res$seqType"}->{$_}) {
                         push @tmp, $_ ;
                      }
                      @tmp}
-              sort { $a <=> $b } keys %{ $self->{"${class}Res$seqType"}};
+              sort { $a <=> $b } keys %{ $self->{seqinds}{"${class}Res$seqType"}};
    } elsif( $class eq '_conservedall' ) {
        @ary = sort { $a <=> $b }
-       keys %{ $self->{"_conservedRes$seqType"}},
-       keys %{ $self->{"_identicalRes$seqType"}},
+       keys %{ $self->{seqinds}{"_conservedRes$seqType"}},
+       keys %{ $self->{seqinds}{"_identicalRes$seqType"}},
    }  else {
-       @ary = sort { $a <=> $b } keys %{ $self->{"${class}Res$seqType"}};
+       @ary = sort { $a <=> $b } keys %{ $self->{seqinds}{"${class}Res$seqType"}};
    }
    require Bio::Search::BlastUtils if $collapse;
 
@@ -854,8 +856,9 @@
 
 sub ambiguous_seq_inds {
     my $self = shift;
-    return $self->{'_warnRes'} if exists $self->{'_warnRes'};
-    return $self->{'_warnRes'} = '';
+    $self->_calculate_seq_positions();    
+    return $self->{seqinds}{'_warnRes'} if exists $self->{seqinds}{'_warnRes'};
+    return $self->{seqinds}{'_warnRes'} = '';
 }
 
 =head2 Inherited from L<Bio::SeqFeature::SimilarityPair>
@@ -996,111 +999,148 @@
     my ($self, at args) = @_;
     return unless ( $self->{'_sequenceschanged'} );
     $self->{'_sequenceschanged'} = 0;
-    my ($mchar, $schar, $qchar);
     my ($seqString, $qseq,$sseq) = ( $self->homology_string(),
                                      $self->query_string(),
                                      $self->hit_string() );
-
-    # Using hashes to avoid saving duplicate residue numbers.
-    my %identicalList_query = ();
-    my %identicalList_sbjct = ();
-    my %conservedList_query = ();
-    my %conservedList_sbjct = ();
-
-    my %gapList_query = ();
-    my %gapList_sbjct = ();
-    my %nomatchList_query = ();
-    my %nomatchList_sbjct = ();
-    my %mismatchList_query = ();
-    my %mismatchList_sbjct = ();
-    
+    my ($mlen, $qlen, $slen) = (CORE::length($seqString), CORE::length($qseq), CORE::length($sseq));
     my $qdir = $self->query->strand || 1;
     my $sdir = $self->hit->strand || 1;
-    my $resCount_query = ($qdir >=0) ? $self->query->end : $self->query->start;
-    my $resCount_sbjct = ($sdir >=0) ? $self->hit->end : $self->hit->start;
-
+    my ($resCount_query, $endpoint_query) = ($qdir <=0) ? ($self->query->end, $self->query->start)
+        : ($self->query->start, $self->query->end);
+    my ($resCount_sbjct, $endpoint_sbjct) = ($sdir <=0) ? ($self->hit->end, $self->hit->start)
+        : ($self->hit->start, $self->hit->end);
+    
     my $prog = $self->algorithm;
+    
     if( $prog  =~ /FAST|SSEARCH|SMITH-WATERMAN/i ) {
+    
+        # we infer the end of the regional sequence where the first and last
+        # non spaces are in the homology string
+        # then we use the HSP->length to tell us how far to read
+        # to cut off the end of the sequence
+        
+        my ($start, $rest) = (0,0);
+        if( $seqString =~ /^(\s+)?(.*?)\s*$/ ) {
+            ($start, $rest) = ($1 ? CORE::length($1) : 0, CORE::length($2));
+        }
+    
+        $seqString = substr($seqString, $start, $rest);
+        $qseq = substr($qseq, $start, $rest);
+        $sseq = substr($sseq, $start, $rest);
+
+        # commented out 10/29/08
+        # removing frameshift symbols doesn't take into account the following:
+        # 1) does not remove the same point in the homology string (get
+        # positional errors)
+        # 2) adjustments to the overall position in the string due to the
+        # frameshift must be taken into consideration (get balancing errors)
+        #
+        # Frameshifts will be handled directly in the main loop. 
+        # --chris
+        
         # fasta reports some extra 'regional' sequence information
         # we need to clear out first
         # this seemed a bit insane to me at first, but it appears to
         # work --jason
-
-        # we infer the end of the regional sequence where the first
-        # non space is in the homology string
-        # then we use the HSP->length to tell us how far to read
-        # to cut off the end of the sequence
-
-        # one possible problem is the sequence which
-
-        my ($start) = (0);
-        if( $seqString =~ /^(\s+)/ ) {
-            $start = CORE::length($1);
-        }
-
-        $seqString = substr($seqString, $start,$self->length('total'));
-        $qseq = substr($qseq, $start,$self->length('total'));
-        $sseq = substr($sseq, $start,$self->length('total'));
-
-        $qseq =~ s![\\\/]!!g;
-        $sseq =~ s![\\\/]!!g;
+        
+        #$qseq =~ s![\\\/]!!g;
+        #$sseq =~ s![\\\/]!!g;
     }
 
     my ($warn, $sbjct_offset, $query_offset) = ('',1,1);
-    if($prog =~ /^(PSI)?T(BLAST|FAST)N/oi ) {
-    $sbjct_offset = 3;
-    $warn = 'subject';
+    if($prog =~ /^(?:PSI)?T(BLAST|FAST)(N|X|Y)/oi ) {
+        $sbjct_offset = 3;
+        if ($1 eq 'BLAST' && $2 eq 'X') { #TBLASTX
+            $query_offset = 3;
+            $warn = 'query/subject';
+        } else {
+            $warn = 'subject';
+        }
     } elsif($prog =~ /^(BLAST|FAST)(X|Y|XY)/oi  ) {
-    $query_offset = 3;
-    $warn = 'query';
-    } elsif($prog =~ /^T(BLAST|FAST)(X|Y|XY)/oi ) {
-    $query_offset = $sbjct_offset = 3;
-    $warn = 'query/subject';
+        $query_offset = 3;
+        $warn = 'query';
     }
-    my @qrange = (0..$query_offset-1); # 0, or 0..2
-    my @srange = (0..$sbjct_offset-1); # 0, or 0..2
-    while( $mchar = chop($seqString) ) {
-        ($qchar, $schar) = (chop($qseq), chop($sseq));
-        if( $mchar eq '+' || $mchar eq '.') {
-            $conservedList_query{ $resCount_query - ($_ * $qdir) } = 1 for @qrange;
-            $conservedList_sbjct{ $resCount_sbjct - ($_ * $sdir) } = 1 for @srange;
-        } elsif( $mchar eq ':' || $mchar ne ' ' ) {
-            $identicalList_query{ $resCount_query - ($_ * $qdir) } = 1 for @qrange;
-            $identicalList_sbjct{ $resCount_sbjct - ($_ * $sdir) } = 1 for @srange;
-        } elsif( $mchar eq ' ') {
-            $nomatchList_query{ $resCount_query - ($_ * $qdir) } = 1 for @qrange;
-            $nomatchList_sbjct{ $resCount_sbjct - ($_ * $sdir) } = 1 for @srange;
-            # mismatch; only count if the symbol matched to isn't a gap
-            if ($schar ne $self->{GAP_SYMBOL}) {
-                $mismatchList_query{ $resCount_query - ($_ * $qdir) } = 1 for @qrange;
+    my ($qfs, $sfs) = (0,0);
+    CHAR_LOOP:
+    for my $pos (0..CORE::length($seqString)-1) {
+        my @qrange = (0 - $qfs)..($query_offset - 1);
+        my @srange = (0 - $sfs)..($sbjct_offset - 1);
+        #$self->debug("QRange:@qrange SRange:@srange\n") if ($qfs || $sfs);
+        ($qfs, $sfs) = (0,0);
+        my ($mchar, $qchar, $schar) = (
+            unpack("x$pos A1",$seqString) || ' ',
+            $pos < CORE::length($qseq) ? unpack("x$pos A1",$qseq) : '-',
+            $pos < CORE::length($sseq) ? unpack("x$pos A1",$sseq) : '-'
+            );
+        my $matchtype = '';
+        my ($qgap, $sgap) = (0,0);
+        if( $mchar eq '+' || $mchar eq '.') {    # conserved
+            $self->{seqinds}{_conservedRes_query}{ $resCount_query + ($_ * $qdir) } = 1 for @qrange;
+            $self->{seqinds}{_conservedRes_sbjct}{ $resCount_sbjct + ($_ * $sdir) } = 1 for @srange;
+            $matchtype = 'conserved';
+        } elsif( $mchar eq ':' || $mchar ne ' ' ) { # identical
+            $self->{seqinds}{_identicalRes_query}{ $resCount_query + ($_ * $qdir) } = 1 for @qrange;
+            $self->{seqinds}{_identicalRes_sbjct}{ $resCount_sbjct + ($_ * $sdir) } = 1 for @srange;
+            $matchtype = 'identical';
+        } elsif( $mchar eq ' ' ) {  # possible mismatch/nomatch/frameshift
+            $qfs = $qchar eq '/'  ?  -1 : # base inserted to match frame
+                   $qchar eq '\\' ?   1 : # base deleted to match frame
+                   0;
+            $sfs = $schar eq '/'  ?  -1 :
+                   $schar eq '\\' ?   1 :
+                   0;
+            # may need to rework logic depending on how we want to treat gaps
+            # across from insert/delete markers. Right now they aren't included.
+            # --cjfields
+            if ($qfs) {
+                # gaps across from frameshifts are not counted
+                $self->{seqinds}{_frameshiftRes_query}{ $resCount_query - ($query_offset * $qdir)} = $qfs;
+                $matchtype = "$qfs frameshift-query";
+                $sgap = $qgap = 1;
             }
-            if ($qchar ne $self->{GAP_SYMBOL}) {

@@ Diff output truncated at 10000 characters. @@



More information about the Bioperl-guts-l mailing list