[Bioperl-l] bioperl count sam reads

Mark Aquino maquino at knome.com
Wed May 2 21:39:54 EDT 2012


Hi all,

I'm a little stumped as to how to successfully count the depths of all reads at a specific locus in a sam/bam file.  I know I can do this with GATK DepthOfCoverage but I wanted to do some more customized things with my script yet I haven't figured out how to get the right base.  I was a bit surprised there wasn't (or that it's not well documented) a method to get the individuals read's base at a specific position while getting the $refbase is quite easy.  (I'm betting such a method exists and is just not documented well)

At any rate, gaps in the alignment are the cause for my problems, so if anyone knows a simpler way to do call the bases correctly, or a clever algorithm to deal with this issue, it would be much appreciated.  Here's what I have for code and it works except in cases where there are multiple gaps in the reference sequence, e.g. the alignment below should be T-T here not C-C but is shifted due to the second gap.  



#!/progs/bin/perl
use strict;
use warnings;
use Bio::DB::Sam;
use Bio::DB::Bam::AlignWrapper;
use Pod::Usage;
use Getopt::Long;
use Bio::DB::Bam::Pileup;
use Term::ANSIColor;

my $sam = Bio::DB::Sam->new(-bam  =>$BAM,
        -fasta=> $FASTA);
getBases($chr, $pos, $pos);


sub getBases {
    my $print = 1;
    my ($chr, $start_query, $end_query) = @_;
    my @alignments = $sam->get_features_by_location(-seq_id => $chr,
            -start  => $start_query,
            -end    => $end_query);
    my $refbase;
    my ($a_count, $t_count, $g_count, $c_count, $n_count, $del_count, $ins_count) = (0, 0, 0, 0, 0, 0, 0);
    for my $a (@alignments) {

        my $start  = $a->start;
        my $end    = $a->end;

        my $query_start = $a->query->start;     
        my $query_end   = $a->query->end;
        my $ref_dna   = $a->dna;        # reference sequence bases
        my ($ref, $matches, $query) = $a->padded_alignment;
        my $offset = 0;
        if ($ref =~ /^([-]+)[ATCG]+/){
            $offset = length($1);
        }
        #print "$offset\n";
        $refbase = $sam->segment($chr,$start_query,$start_query)->dna;        
        printAlignment($ref, $matches, $query, $start_query, $start, $offset);
        my $base = substr($query, $start_query-$start+$offset, 1);
        if (!$base){
            next;
        }
        $a_count++ if ($base eq "A");
        $t_count++ if ($base eq "T");
        $c_count++ if ($base eq "C");
        $g_count++ if ($base eq "G");
        $n_count++ if ($base eq "N");
        $del_count++ if ($base eq "-");
        my @scores    = $a->qscore;     # per-base quality scores
        my $match_qual= $a->qual;       # quality of the match
    }
    my $total_depth = $a_count + $t_count + $c_count + $g_count + $n_count + $del_count;
    if ($print == 1){
     #   print "$start_query\tref base: $ref_base\n"; 
        print "$chr:$start_query($refbase)\t";
        print "A:$a_count\t";
        print "T:$t_count\t";
        print "C:$c_count\t";
        print "G:$g_count\t";
        print "N:$n_count\t";
        print "D:$del_count\t";
        print "Total:$total_depth\n";
    }
    return ($a_count, $t_count, $c_count, $g_count, $n_count, $del_count, $ins_count);
}
sub printAlignment{
    my ($ref, $matches, $query, $start_query, $start, $offset) = @_;
    print substr($ref, 0, $start_query-$start+$offset);
    print (color("red"), substr($ref, $start_query-$start+$offset, 1), color("reset"));
    print substr($ref, $start_query-$start+$offset+1),"\n";
    print substr($matches, 0, $start_query-$start+$offset);
    print (color("red"), substr($matches, $start_query-$start+$offset, 1), color("reset"));
    print substr($matches, $start_query-$start+$offset+1),"\n";
    print substr($query, 0, $start_query-$start+$offset);
    print (color("red"), substr($query, $start_query-$start+$offset, 1), color("reset"));
    print substr($query, $start_query-$start+$offset+1),"\n";
}

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.open-bio.org/pipermail/bioperl-l/attachments/20120502/69afd7f9/attachment-0002.html>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: Untitled.tiff
Type: image/tiff
Size: 6222 bytes
Desc: not available
URL: <http://lists.open-bio.org/pipermail/bioperl-l/attachments/20120502/69afd7f9/attachment-0002.tiff>


More information about the Bioperl-l mailing list