[Bioperl-l] genbank parsing of multiple 'function' tags within primary tag

galeb abu-ali abualiga2 at gmail.com
Thu Sep 8 10:44:39 EDT 2011


Hi,

I'm parsing a genbank file with Bio::SeqIO and am stuck on instances of
multiple tags within a primary tag.  E.g., when there are several 'function'
tag-values within a 'CDS' primary tag, I don't know how to link those
'function' tag-values to a particular 'locus_tag'.  As parsed values are
returned as a list, I tried creating an array of hashes, where the hash-key
is 'locus_tag' and hash-values are multiple 'function' tags, but am failing
miserably.  Pasted below is what I managed so far.  At your convenience,
please advise.

thanks!

galeb

#!/usr/local/bin/perl
# parse_gbk.pl
# gsa 09042011
# script to parse out features from gbk
#
http://www.bioperl.org/wiki/HOWTO:Feature-Annotation#Customizing_Sequence_Object_Construction

use strict; use warnings;
use Bio::SeqIO;

my @loci;
my @seqs;
my @directions;
my @start_coords;
my @end_coords;
my @genes;
my @products;
my @notes;
my @functions;
my %functions;

my $gb_file = shift;
my $seqio_obj = Bio::SeqIO->new(-file => $gb_file );
my $seq_obj = $seqio_obj->next_seq;

for my $feat_obj ( $seq_obj->get_SeqFeatures ) {
    if ( $feat_obj->primary_tag eq ( 'gene' ) ) {
                if ($feat_obj->has_tag( 'locus_tag' ) ) {
                push ( @seqs, $feat_obj->seq->seq ); #collect sequences
                    for my $val ( $feat_obj->get_tag_values( 'locus_tag' ) )
{
                            push ( @loci, $val ); # locus_tags
            }
        }
                if ( $feat_obj->has_tag( 'gene' ) ) {
                           for my $val ( $feat_obj->get_tag_values( 'gene' )
) {
                            push ( @genes, $val ); # gene names
                        }
                }
                else {
                    push ( @genes, "" ); # if gene names are absent, leave
empty
        }
        if ( $feat_obj->location->isa( 'Bio::Location::Simple' ) ) { # gene
coordinates
            for my $location ( $feat_obj->location ) {
                push ( @start_coords, $location->start );
                push ( @end_coords, $location->end );
                if ( $location->strand == -1 ) {
                    push ( @directions, "reverse" );
                }
                else {
                    push ( @directions, "forward" );
                }
            }
        }
    }
    # gene products, notes, functions
    if ( $feat_obj->primary_tag eq ( 'CDS' ) || $feat_obj->primary_tag eq (
'misc_feature' ) || $feat_obj->primary_tag eq ( 'ncRNA' ) ||
$feat_obj->primary_tag eq ( 'rRNA' ) || $feat_obj->primary_tag eq ( 'tRNA' )
|| $feat_obj->primary_tag eq ( 'misc_RNA' ) ) {
        if ( $feat_obj->has_tag( 'product' ) ) {
            for my $product ( $feat_obj->get_tag_values( 'product' ) ) {
                push ( @products, $product );
            }
        }
        else {
            push ( @products, "" );
        }
        if ( $feat_obj->has_tag( 'note' ) ) {
            for my $note ( $feat_obj->get_tag_values( 'note' ) ) {
                push ( @notes, $note );
            }
        }
        else {
            push ( @notes, "" );
        }
        if ( $feat_obj->has_tag( 'function' ) ) {
            for my $function ( $feat_obj->get_tag_values( 'function' ) ) {
                push ( @functions, $function );
            }
        }
        else {
            push ( @functions, "" );
        }

    }
}

print
"locus\tgene_name\tstart_nt\tend_nt\tlength_nt\tdirection\tproduct\tnote\tfunction\tsequence_nt\n";
# header

for ( my $elem = 0; $elem < scalar @loci; ++$elem ) {
    print $loci[$elem], "\t",$genes[$elem], "\t", $start_coords[$elem],
"\t", $end_coords[$elem], "\t", length( $seqs[$elem] ), "\t",
$directions[$elem], "\t", $products[$elem], "\t", $notes[$elem], "\t",
$functions[$elem], "\t", $seqs[$elem], "\n";
}


More information about the Bioperl-l mailing list