[Bioperl-guts-l] [16899] bioperl-live/trunk: get_tiled_alns and tests
Mark Allen Jensen
maj at dev.open-bio.org
Thu Mar 4 09:36:06 EST 2010
Revision: 16899
Author: maj
Date: 2010-03-04 09:36:06 -0500 (Thu, 04 Mar 2010)
Log Message:
-----------
get_tiled_alns and tests
Modified Paths:
--------------
bioperl-live/trunk/Bio/Search/Tiling/MapTiling.pm
bioperl-live/trunk/t/SearchIO/Tiling.t
Modified: bioperl-live/trunk/Bio/Search/Tiling/MapTiling.pm
===================================================================
--- bioperl-live/trunk/Bio/Search/Tiling/MapTiling.pm 2010-03-04 13:09:27 UTC (rev 16898)
+++ bioperl-live/trunk/Bio/Search/Tiling/MapTiling.pm 2010-03-04 14:36:06 UTC (rev 16899)
@@ -80,6 +80,41 @@
object with the L</contexts> method. If contexts don't apply for the
given report, this returns C<('all')>.
+=head1 TILED ALIGNMENTS
+
+The experimental method L<ALIGNMENTS/get_tiled_alns> will use a tiling
+to concatenate tiled hsps into a series of L<Bio::SimpleAlign>
+objects:
+
+ @alns = $tiling->get_tiled_alns($type, $context);
+
+Each alignment contains two sequences with ids 'query' and 'subject',
+and consists of a concatenation of tiling HSPs which overlap or are
+directly adjacent. The alignment are returned in C<$type> sequence
+order. When HSPs overlap, the alignment sequence is taken from the HSP
+which comes first in the coverage map array.
+
+The sequences in each alignment contain features (even though they are
+L<Bio::LocatableSeq objects) which map the original query/subject
+coordinates to the new alignment sequence coordinates. You can
+determine the original BLAST fragments this way:
+
+ $aln = ($tiling->get_tiled_alns)[0];
+ $qseq = $aln->get_seq_by_id('query');
+ $hseq = $aln->get_seq_by_id('subject');
+ foreach my $feat ($qseq->get_SeqFeatures) {
+ $org_start = ($feat->get_tag_values('query_start'))[0];
+ $org_end = ($feat->get_tag_values('query_end'))[0];
+ # original fragment as represented in the tiled alignment:
+ $org_fragment = $feat->seq;
+ }
+ foreach my $feat ($hseq->get_SeqFeatures) {
+ $org_start = ($feat->get_tag_values('subject_start'))[0];
+ $org_end = ($feat->get_tag_values('subject_end'))[0];
+ # original fragment as represented in the tiled alignment:
+ $org_fragment = $feat->seq;
+ }
+
=head1 DESIGN NOTE
The major calculations are made just-in-time, and then memoized. So,
@@ -143,6 +178,7 @@
use Bio::Root::Root;
use Bio::Search::Tiling::TilingI;
use Bio::Search::Tiling::MapTileUtils;
+use Bio::LocatableSeq;
# use base qw(Bio::Root::Root Bio::Search::Tiling::TilingI);
use base qw(Bio::Root::Root Bio::Search::Tiling::TilingI);
@@ -310,12 +346,16 @@
my (@qfeats, @hfeats);
my $query_string = '';
my $hit_string = '';
+ my ($qlen,$hlen) = (0,0);
+ my ($qinc, $hinc, $qstart, $hstart);
for my $intvl (@$grp) {
# following just chooses the first available hsp containing the
# interval -- this is arbitrary, could be smarter.
my $aln = ( containing_hsps($intvl, @tiling) )[0]->get_aln;
my $qseq = $aln->get_seq_by_pos(1);
my $hseq = $aln->get_seq_by_pos(2);
+ $qstart ||= $qseq->start;
+ $hstart ||= $hseq->start;
# calculate the slice boundaries
my ($beg, $end);
for ($type) {
@@ -333,35 +373,39 @@
$aln = $aln->slice($beg, $end);
$qseq = $aln->get_seq_by_pos(1);
$hseq = $aln->get_seq_by_pos(2);
+ $qinc = $qseq->length - $qseq->num_gaps($Bio::LocatableSeq::GAP_SYMBOLS);
+ $hinc = $hseq->length - $hseq->num_gaps($Bio::LocatableSeq::GAP_SYMBOLS);
+
push @qfeats, Bio::SeqFeature::Generic->new(
- -start => CORE::length($query_string)+1,
- -end => CORE::length($query_string)+CORE::length($qseq->seq),
+ -start => $qlen+1,
+ -end => $qlen+$qinc,
-strand => $qseq->strand,
-primary => 'query',
-source_tag => 'BLAST',
-display_name => 'query coordinates',
-tag => { query_id => $qseq->id,
query_desc => $qseq->desc,
- query_start => $qseq->start,
- query_end => $qseq->end}
+ query_start => $qstart + (($qseq->strand && $qseq->strand < 0) ? -1 : 1)*$qlen,
+ query_end => $qstart + (($qseq->strand && $qseq->strand < 0) ? -1 : 1)*($qlen+$qinc-1),
+ }
);
push @hfeats, Bio::SeqFeature::Generic->new(
- -start => CORE::length($hit_string)+1,
- -end => CORE::length($hit_string)+CORE::length($hseq->seq),
+ -start => $hlen+1,
+ -end => $hlen+$hinc,
-strand => $hseq->strand,
-primary => 'subject/hit',
-source_tag => 'BLAST',
-display_name => 'subject/hit coordinates',
-tag => { subject_id => $hseq->id,
subject_desc => $hseq->desc,
- subject_start => $hseq->start,
- subject_end => $hseq->end
+ subject_start => $hstart + (($hseq->strand && $hseq->strand < 0) ? -1 : 1)*$hlen,
+ subject_end => $hstart + (($hseq->strand && $hseq->strand < 0) ? -1 : 1)*($hlen+$hinc-1)
}
);
$query_string .= $qseq->seq;
$hit_string .= $hseq->seq;
- # want to add features that describe the original
- # hsp coordinates.
+ $qlen += $qinc;
+ $hlen += $hinc;
}
# create the LocatableSeqs and add the features to each
# then add the seqs to the new aln
Modified: bioperl-live/trunk/t/SearchIO/Tiling.t
===================================================================
--- bioperl-live/trunk/t/SearchIO/Tiling.t 2010-03-04 13:09:27 UTC (rev 16898)
+++ bioperl-live/trunk/t/SearchIO/Tiling.t 2010-03-04 14:36:06 UTC (rev 16899)
@@ -10,7 +10,7 @@
use Bio::Root::Test;
$EXHAUSTIVE = $ENV{BIOPERL_TILING_EXHAUSTIVE_TESTS};
$VERBOSE = $ENV{BIOPERL_TILING_VERBOSE_TESTS};
- test_begin(-tests => ($EXHAUSTIVE ? 6475 : 1097) );
+ test_begin(-tests => ($EXHAUSTIVE ? 6519 : 1141) );
}
use_ok('Bio::Search::Tiling::MapTiling');
@@ -20,6 +20,7 @@
use_ok('File::Spec');
my ($blio, $result, $hit, $tiling, $hsp);
+
my @normal_formats = qw( blast wublast
blastn wublastn
blastp wublastp
@@ -353,8 +354,51 @@
}
is_deeply( [$tiling->range('subject', 'all')], $expected_ranges{'all'}, "bug2942: subject all : range correct" );
+# test get_tiled_alns
+$blio = Bio::SearchIO->new( -file=>test_input_file( 'dcr1_sp.WUBLASTP' ) );
+$result = $blio->next_result;
+while ($hit = $result->next_hit) {
+ last if $hit->name =~ /ASPTN/;
+}
+$tiling = Bio::Search::Tiling::MapTiling->new($hit);
+
+ok my @alns = $tiling->get_tiled_alns, "get_tiled_alns";
+is scalar @alns, 6, "got all alns";
+
+for my $aln ( @alns ) {
+ my (@aint, @qint, @sint);
+ my $qs = $aln->get_seq_by_id('query');
+ my $ss = $aln->get_seq_by_id('subject');
+ ok my @qfeats = $qs->get_SeqFeatures;
+ foreach (@qfeats) {
+ push @aint, [$_->start, $_->end];
+ push @qint, [($_->get_tag_values('query_start'))[0],
+ ($_->get_tag_values('query_end'))[0] ];
+ }
+ is( eval(join('+', map {$$_[1]-$$_[0]+1} @aint)),
+ eval(join('+', map {$$_[1]-$$_[0]+1} @qint)),
+ "aln and qfeat lengths correspond" );
+ is( $qs->length - $qs->num_gaps('-'), eval(join('+', map {$$_[1]-$$_[0]+1} @qint)), "q length correct");
+ ok my @hfeats = $ss->get_SeqFeatures;
+ @aint = ();
+ ok ( @qfeats == @hfeats, "features on q and s correspond");
+ foreach (@hfeats) {
+ push @aint, [$_->start, $_->end];
+ push @sint, [($_->get_tag_values('subject_start'))[0],
+ ($_->get_tag_values('subject_end'))[0] ];
+ }
+ is( eval(join('+', map {$$_[1]-$$_[0]+1} @aint)),
+ eval(join('+', map {$$_[1]-$$_[0]+1} @sint)),
+ "aln and hfeat lengths correspond" );
+ is( $ss->length - $ss->num_gaps('-'), eval(join('+', map {$$_[1]-$$_[0]+1} @sint)), "s length correct");
+
+}
+1;
+
+
+
package Bio::Search::Tiling::MapTiling;
sub cmp_frac {
More information about the Bioperl-guts-l
mailing list