[Bioperl-l] Should coords be adjusted after removing alignment columns?

Sendu Bala bix at sendu.me.uk
Tue Aug 14 10:57:29 EDT 2007


I'm looking at what looks like a pretty major bug in Bio::SimpleAlign, 
but before I commit the fix I wanted to check my sanity/understanding.

My understanding is that an alignment may be built from just sub-parts 
of a number of sequences. So you give each sequence in the alignment a 
start and stop so you can later map back the aligned region to the 
original sequence. So, for example, the following should all pass:

diff -r1.56 SimpleAlign.t
459a460,540
 >
 >
 > # is _remove_col really working correctly?
 > my $a = Bio::LocatableSeq->new(-id => 'a', -seq => 
'atcgatcgatcgatcg', -start => 5, -end => 20);
 > my $b = Bio::LocatableSeq->new(-id => 'b', -seq => 
'-tcgatc-atcgatcg', -start => 30, -end => 43);
 > my $c = Bio::LocatableSeq->new(-id => 'c', -seq => 
'atcgatcgatc-atc-', -start => 50, -end => 63);
 > my $d = Bio::LocatableSeq->new(-id => 'd', -seq => 
'--cgatcgatcgat--', -start => 80, -end => 91);
 > my $e = Bio::LocatableSeq->new(-id => 'e', -seq => 
'-t-gatcgatcga-c-', -start => 100, -end => 111);
 > $aln = Bio::SimpleAlign->new();
 > $aln->add_seq($a);
 > $aln->add_seq($b);
 > $aln->add_seq($c);
 >
 > my $gapless = $aln->remove_gaps();
 > foreach my $seq ($gapless->each_seq) {
 >       if ($seq->id eq 'a') {
 >               is $seq->start, 6;
 >               is $seq->end, 19;
 >               is $seq->seq, 'tcgatcatcatc';
 >       }
 >       elsif ($seq->id eq 'b') {
 >               is $seq->start, 30;
 >               is $seq->end, 42;
 >               is $seq->seq, 'tcgatcatcatc';
 >       }
 >       elsif ($seq->id eq 'c') {
 >               is $seq->start, 51;
 >               is $seq->end, 63;
 >               is $seq->seq, 'tcgatcatcatc';
 >       }
 > }
 >
 > $aln->add_seq($d);
 > $aln->add_seq($e);
 > $gapless = $aln->remove_gaps();
 > foreach my $seq ($gapless->each_seq) {
 >       if ($seq->id eq 'a') {
 >               is $seq->start, 8;
 >               is $seq->end, 17;
 >               is $seq->seq, 'gatcatca';
 >       }
 >       elsif ($seq->id eq 'b') {
 >               is $seq->start, 32;
 >               is $seq->end, 40;
 >               is $seq->seq, 'gatcatca';
 >       }
 >       elsif ($seq->id eq 'c') {
 >               is $seq->start, 53;
 >               is $seq->end, 61;
 >               is $seq->seq, 'gatcatca';
 >       }
 >       elsif ($seq->id eq 'd') {
 >               is $seq->start, 81;
 >               is $seq->end, 90;
 >               is $seq->seq, 'gatcatca';
 >       }
 >       elsif ($seq->id eq 'e') {
 >               is $seq->start, 101;
 >               is $seq->end, 110;
 >               is $seq->seq, 'gatcatca';
 >       }
 > }
 >
 > my $f = Bio::LocatableSeq->new(-id => 'f', -seq => 
'a-cgatcgatcgat-g', -start => 30, -end => 43);
 > $aln = Bio::SimpleAlign->new();
 > $aln->add_seq($a);
 > $aln->add_seq($f);
 >
 > $gapless = $aln->remove_gaps();
 > foreach my $seq ($gapless->each_seq) {
 >       if ($seq->id eq 'a') {
 >               is $seq->start, 5;
 >               is $seq->end, 20;
 >               is $seq->seq, 'acgatcgatcgatg';
 >       }
 >       elsif ($seq->id eq 'f') {
 >               is $seq->start, 30;
 >               is $seq->end, 43;
 >               is $seq->seq, 'acgatcgatcgatg';
 >       }
 > }


But they don't. Once you remove certain columns the start and stop of 
the sequences in the alignment are no longer correct coordinates for the 
sub-sequence in the original sequence.

I propose the following patch to resolve this issue:

diff -r1.136 SimpleAlign.pm
1116c1116,1118
<
---
 >
 >     my $gap = $self->gap_char;
 >
1129,1137c1131,1147
<             my $spliced;
<             $spliced .= $start > 0 ? substr($sequence,0,$start) : '';
<             $spliced .= substr($sequence,$end+1,$seq->length-$end+1);
<             $sequence = $spliced;
<             if ($start == 1) {
<               $new_seq->start($end);
<             }
<             else {
<               $new_seq->start( $seq->start);
---
 >             my $orig = $sequence;
 >             my $head =  $start > 0 ? substr($sequence, 0, $start) : '';
 >             my $tail = ($end + 1) >= length($sequence) ? '' : 
substr($sequence, $end + 1);
 >             $sequence = $head.$tail;
 >             # start
 >             unless (defined $new_seq->start) {
 >                 if ($start == 0) {
 >                     my $start_adjust = () = substr($orig, 0, $end + 
1) =~ /$gap/g;
 >                     $new_seq->start($seq->start + $end + 1 - 
$start_adjust);
 >                 }
 >                 else {
 >                     my $start_adjust = $orig =~ /$gap+/;
 >                     if ($start_adjust) {
 >                         $start_adjust = $+[0] - 1 < $start;
 >                     }
 >                     $new_seq->start($seq->start + $start_adjust);
 >                 }
1140,1141c1150,1152
<             if($end >= $seq->end){
<              $new_seq->end( $start);
---
 >             if (($end + 1) >= length($orig)) {
 >                 my $end_adjust = () = substr($orig, $start) =~ /$gap/g;
 >                 $new_seq->end($seq->end - (length($orig) - $start) + 
$end_adjust);
1144c1155
<              $new_seq->end($seq->end);
---
 >                 $new_seq->end($seq->end);
1148c1159
<                 push @new, $new_seq;
---
 >               push @new, $new_seq;
1207,1209c1218,1234
<       # sort the positions to remove columns at the end 1st
<       @$positions = sort { $b->[0] <=> $a->[0] } @$positions;
<       $aln = $self->_remove_col($aln,$positions);
---
 >       # sort the positions
 >       @$positions = sort { $a->[0] <=> $b->[0] } @$positions;
 >
 >     my @remove;
 >     my $length = 0;
 >     foreach my $pos (@{$positions}) {
 >         my ($start, $end) = @{$pos};
 >
 >         #have to offset the start and end for subsequent removes
 >         $start-=$length;
 >         $end  -=$length;
 >         $length += ($end-$start+1);
 >         push @remove, [$start,$end];
 >     }
 >
 >     #remove the segments
 >     $aln = $#remove >= 0 ? $self->_remove_col($aln,\@remove) : $self;


This breaks 2 tests in SimpleAlign.t, but as far as I can tell, those 
tests expect the wrong answer. Changed to expect the correct answer, 
SimpleAlign.t and all other tests in the test suite pass.

diff -r1.56 SimpleAlign.t
214,215c214,215
<       "P84139/1-33              NEGEHQIKLDELFEKLLRARLIFKNKDVLRRC\n".
<       "P814153/1-33             NEGMHQIKLDVLFEKLLRARLIFKNKDVLRRC\n".
---
 >       "P84139/2-33              NEGEHQIKLDELFEKLLRARLIFKNKDVLRRC\n".
 >       "P814153/2-33             NEGMHQIKLDVLFEKLLRARLIFKNKDVLRRC\n".
229c229
<       "gb|443893|124775/1-32    -RFRIKVPPAVEGARPALLIFKSRPELGC\n",
---
 >       "gb|443893|124775/2-32    -RFRIKVPPAVEGARPALLIFKSRPELGC\n",


Can someone triple-check my thinking and report back please?

Cheers,
Sendu.


More information about the Bioperl-l mailing list