[Bioperl-guts-l] [16899] bioperl-live/trunk: get_tiled_alns and tests

Mark Allen Jensen maj at dev.open-bio.org
Thu Mar 4 09:36:06 EST 2010


Revision: 16899
Author:   maj
Date:     2010-03-04 09:36:06 -0500 (Thu, 04 Mar 2010)
Log Message:
-----------
get_tiled_alns and tests

Modified Paths:
--------------
    bioperl-live/trunk/Bio/Search/Tiling/MapTiling.pm
    bioperl-live/trunk/t/SearchIO/Tiling.t

Modified: bioperl-live/trunk/Bio/Search/Tiling/MapTiling.pm
===================================================================
--- bioperl-live/trunk/Bio/Search/Tiling/MapTiling.pm	2010-03-04 13:09:27 UTC (rev 16898)
+++ bioperl-live/trunk/Bio/Search/Tiling/MapTiling.pm	2010-03-04 14:36:06 UTC (rev 16899)
@@ -80,6 +80,41 @@
 object with the L</contexts> method. If contexts don't apply for the
 given report, this returns C<('all')>. 
 
+=head1 TILED ALIGNMENTS
+
+The experimental method L<ALIGNMENTS/get_tiled_alns> will use a tiling
+to concatenate tiled hsps into a series of L<Bio::SimpleAlign>
+objects:
+
+ @alns = $tiling->get_tiled_alns($type, $context);
+
+Each alignment contains two sequences with ids 'query' and 'subject',
+and consists of a concatenation of tiling HSPs which overlap or are
+directly adjacent. The alignment are returned in C<$type> sequence
+order. When HSPs overlap, the alignment sequence is taken from the HSP
+which comes first in the coverage map array.
+
+The sequences in each alignment contain features (even though they are
+L<Bio::LocatableSeq objects) which map the original query/subject
+coordinates to the new alignment sequence coordinates. You can
+determine the original BLAST fragments this way:
+
+ $aln = ($tiling->get_tiled_alns)[0];
+ $qseq = $aln->get_seq_by_id('query');
+ $hseq = $aln->get_seq_by_id('subject');
+ foreach my $feat ($qseq->get_SeqFeatures) {
+    $org_start = ($feat->get_tag_values('query_start'))[0];
+    $org_end = ($feat->get_tag_values('query_end'))[0];
+    # original fragment as represented in the tiled alignment:
+    $org_fragment = $feat->seq;
+ }
+ foreach my $feat ($hseq->get_SeqFeatures) {
+    $org_start = ($feat->get_tag_values('subject_start'))[0];
+    $org_end = ($feat->get_tag_values('subject_end'))[0];
+    # original fragment as represented in the tiled alignment:
+    $org_fragment = $feat->seq;
+ }
+
 =head1 DESIGN NOTE
 
 The major calculations are made just-in-time, and then memoized. So,
@@ -143,6 +178,7 @@
 use Bio::Root::Root;
 use Bio::Search::Tiling::TilingI;
 use Bio::Search::Tiling::MapTileUtils;
+use Bio::LocatableSeq;
 
 # use base qw(Bio::Root::Root Bio::Search::Tiling::TilingI);
 use base qw(Bio::Root::Root Bio::Search::Tiling::TilingI);
@@ -310,12 +346,16 @@
 	my (@qfeats, @hfeats);
 	my $query_string = '';
 	my $hit_string = '';
+	my ($qlen,$hlen) = (0,0);
+	my ($qinc, $hinc, $qstart, $hstart);
 	for my $intvl (@$grp) {
 	    # following just chooses the first available hsp containing the
 	    # interval -- this is arbitrary, could be smarter.
 	    my $aln = ( containing_hsps($intvl, @tiling) )[0]->get_aln;
 	    my $qseq = $aln->get_seq_by_pos(1);
 	    my $hseq = $aln->get_seq_by_pos(2);
+	    $qstart ||= $qseq->start;
+	    $hstart ||= $hseq->start;
 	    # calculate the slice boundaries
 	    my ($beg, $end);
 	    for ($type) {
@@ -333,35 +373,39 @@
 	    $aln = $aln->slice($beg, $end);
 	    $qseq = $aln->get_seq_by_pos(1);
 	    $hseq = $aln->get_seq_by_pos(2);
+	    $qinc = $qseq->length - $qseq->num_gaps($Bio::LocatableSeq::GAP_SYMBOLS);
+	    $hinc = $hseq->length - $hseq->num_gaps($Bio::LocatableSeq::GAP_SYMBOLS);
+
 	    push @qfeats, Bio::SeqFeature::Generic->new(
-		-start => CORE::length($query_string)+1,
-		-end => CORE::length($query_string)+CORE::length($qseq->seq),
+		-start => $qlen+1,
+		-end => $qlen+$qinc,
 		-strand => $qseq->strand,
 		-primary => 'query',
 		-source_tag => 'BLAST',
 		-display_name => 'query coordinates',
 		-tag => { query_id => $qseq->id,
 			  query_desc => $qseq->desc,
-			  query_start => $qseq->start,
-			  query_end => $qseq->end}
+			  query_start => $qstart + (($qseq->strand && $qseq->strand < 0) ? -1 : 1)*$qlen,
+			  query_end => $qstart + (($qseq->strand && $qseq->strand < 0) ? -1 : 1)*($qlen+$qinc-1),
+		}
 		);
 	    push @hfeats, Bio::SeqFeature::Generic->new(
-		-start => CORE::length($hit_string)+1,
-		-end => CORE::length($hit_string)+CORE::length($hseq->seq),
+		-start => $hlen+1,
+		-end => $hlen+$hinc,
 		-strand => $hseq->strand,
 		-primary => 'subject/hit',
 		-source_tag => 'BLAST',
 		-display_name => 'subject/hit coordinates',
 		-tag => { subject_id => $hseq->id,
 			  subject_desc => $hseq->desc,
-			  subject_start => $hseq->start,
-			  subject_end => $hseq->end
+			  subject_start => $hstart + (($hseq->strand && $hseq->strand < 0) ? -1 : 1)*$hlen,
+			  subject_end => $hstart + (($hseq->strand && $hseq->strand < 0) ? -1 : 1)*($hlen+$hinc-1)
 		}
 		);
 	    $query_string .= $qseq->seq;
 	    $hit_string .= $hseq->seq;
-	    # want to add features that describe the original
-	    # hsp coordinates.
+	    $qlen += $qinc;
+	    $hlen += $hinc;
 	}
 	# create the LocatableSeqs and add the features to each
 	# then add the seqs to the new aln

Modified: bioperl-live/trunk/t/SearchIO/Tiling.t
===================================================================
--- bioperl-live/trunk/t/SearchIO/Tiling.t	2010-03-04 13:09:27 UTC (rev 16898)
+++ bioperl-live/trunk/t/SearchIO/Tiling.t	2010-03-04 14:36:06 UTC (rev 16899)
@@ -10,7 +10,7 @@
     use Bio::Root::Test;
     $EXHAUSTIVE = $ENV{BIOPERL_TILING_EXHAUSTIVE_TESTS};
     $VERBOSE    = $ENV{BIOPERL_TILING_VERBOSE_TESTS};
-    test_begin(-tests => ($EXHAUSTIVE ? 6475 : 1097) );
+    test_begin(-tests => ($EXHAUSTIVE ? 6519 : 1141) );
 }
 
 use_ok('Bio::Search::Tiling::MapTiling');
@@ -20,6 +20,7 @@
 use_ok('File::Spec');
 
 my ($blio, $result, $hit, $tiling, $hsp);
+
 my @normal_formats = qw( blast  wublast
                          blastn wublastn
                          blastp wublastp
@@ -353,8 +354,51 @@
 }
 is_deeply( [$tiling->range('subject', 'all')], $expected_ranges{'all'}, "bug2942: subject all : range correct" );
 
+# test get_tiled_alns
 
+$blio = Bio::SearchIO->new( -file=>test_input_file( 'dcr1_sp.WUBLASTP' ) );
+$result = $blio->next_result;
+while ($hit = $result->next_hit) {
+    last if $hit->name =~ /ASPTN/;
+}
 
+$tiling = Bio::Search::Tiling::MapTiling->new($hit);
+
+ok my @alns = $tiling->get_tiled_alns, "get_tiled_alns";
+is scalar @alns, 6, "got all alns";
+
+for my $aln ( @alns ) {
+    my (@aint, @qint, @sint);
+    my $qs = $aln->get_seq_by_id('query');
+    my $ss = $aln->get_seq_by_id('subject');
+    ok my @qfeats = $qs->get_SeqFeatures;
+    foreach (@qfeats) {
+	push @aint, [$_->start, $_->end];
+	push @qint, [($_->get_tag_values('query_start'))[0],
+		     ($_->get_tag_values('query_end'))[0] ];
+    }
+    is( eval(join('+', map {$$_[1]-$$_[0]+1} @aint)),
+	eval(join('+', map {$$_[1]-$$_[0]+1} @qint)), 
+	"aln and qfeat lengths correspond" );
+    is( $qs->length - $qs->num_gaps('-'), eval(join('+', map {$$_[1]-$$_[0]+1} @qint)), "q length correct");
+    ok my @hfeats = $ss->get_SeqFeatures;
+    @aint = ();
+    ok ( @qfeats == @hfeats, "features on q and s correspond");
+    foreach (@hfeats) {
+	push @aint, [$_->start, $_->end];
+	push @sint, [($_->get_tag_values('subject_start'))[0],
+		     ($_->get_tag_values('subject_end'))[0] ];
+    }
+    is( eval(join('+', map {$$_[1]-$$_[0]+1} @aint)),
+	eval(join('+', map {$$_[1]-$$_[0]+1} @sint)), 
+	"aln and hfeat lengths correspond" );
+    is( $ss->length - $ss->num_gaps('-'), eval(join('+', map {$$_[1]-$$_[0]+1} @sint)), "s length correct");
+
+}
+1;
+
+
+
 package Bio::Search::Tiling::MapTiling;
 
 sub cmp_frac {



More information about the Bioperl-guts-l mailing list