[Bioperl-l] bioperl and kegg(out of memory problem )

Ambrose aharry2001 at yahoo.com
Mon Apr 2 09:56:33 EDT 2007



Hello ALL,

I have the code below,which parses my kegg files.A host of the files are parsed and the information is inserted into my databases but unfortunate after the program runs for some hours it stops showing the message out of memory.I assume that this happens because the bioperl object is too big.Please just check the code below

best regards Ambrose


#!/usr/local/ActivePerl/bin/perl
#
#

use strict;
use Bio::SeqIO;
use Bio::FASTASequence;
use DBI;
use Benchmark  qw(:all) ;

my($ko,$prosite,$ncbigi,$ncbigeneid,$pfam,$uniprot,$ecn1,$pathway_id1,$pathway_name1,$ec_num);
my(%dblink_KO,%dblink_Pfam,%dblink_PROSITE,%dblink_NCBIGI,%dblink_NCBIGENEID,%dblink_UniProt);
my(%pathway_name,%pathway_id,%ecnumbers,%crc64,%ntseq,%aaseq);
my( @kg_id);
my $db="gbdb";
my $host="localhost";
my $userid="root";
my $passwd="ubuntu";
my $connectionInfo="dbi:mysql:$db;"."mysql_socket=/var/run/mysqld/mysqld.sock";
my ($t1,$t2);
my $dbh = DBI->connect($connectionInfo,$userid,$passwd);
my $time_used;
 
 
 
 eval { $dbh->do("DROP TABLE kegginfo") };
 print "Dropping kegginfo failed: $@\n" if $@;
 $dbh->do("CREATE TABLE kegginfo (kg_id BIGINT NOT NULL AUTO_INCREMENT,
                                   up_id INT UNSIGNED REFERENCES uniprotinfo(up_id),
                                                                  filename VARCHAR(50),
                                                    kegg_id VARCHAR(50),
                                   keggaccn VARCHAR(50),
                                                                  description VARCHAR(250),
                                   ec_numbers VARCHAR(250),
                                              pathway_id VARCHAR(250),
                                              pathway_name VARCHAR(250),
                                              crc64 VARCHAR(50),
                                   ko_id VARCHAR(50),
                                   pfam_id VARCHAR(50),
                                   ncbigi_id VARCHAR(50),
                                   ncbigeneid_id VARCHAR(50),
                                   uniprot_id VARCHAR(50),
                                   prosite_id VARCHAR(50),
                                   PRIMARY KEY (kg_id)
                                 )");
                                 

eval { $dbh->do("DROP TABLE keggntsequence") };
print "Dropping keggntsequence failed: $@\n" if $@;
$dbh->do("CREATE TABLE keggntsequence (kg_id BIGINT(15) UNSIGNED REFERENCES uniprotinfo(kg_id),
                                                    keggaccn VARCHAR(50),
                                  nucleotidesequence text
                                 )");

eval { $dbh->do("DROP TABLE keggaasequence") };
print "Dropping keggaasequence failed: $@\n" if $@;
$dbh->do("CREATE TABLE keggaasequence (kg_id BIGINT(15) UNSIGNED REFERENCES uniprotinfo(kg_id),
                                                    keggaccn VARCHAR(50),
                                                    crc64 VARCHAR(50),
                                  aminoacidsequence text
                                 )");
eval { $dbh->do("DROP TABLE timestable") };
print "Dropping timestable failed: $@\n" if $@;
$dbh->do("CREATE TABLE timestable (aut_id BIGINT(15) UNSIGNED NOT NULL AUTO_INCREMENT,
                                   genome VARCHAR(100),
                                    totaltime_seconds int(100),
                                                                  PRIMARY KEY(aut_id))");



open (LIST, "genomes.list") || die "Cannot open input kegg genomes file genomes.list\n $! \n";
$t1=new Benchmark;
my @genomelist = ();
while (my $line=<LIST>) {
    #ignore comment lines
    if ($line !~ /^#/) {
        chomp $line;
                
        push (@genomelist, $line); #store the filename
    }
}

close LIST;
my $count=0;
foreach my $genomefile (@genomelist) {

    #in case the user fails to remove some strange files from
    #the genomes.list file.. check for the KEGG format
    my $check=checkKeggFormat($genomefile);
    if ($check==0) {
        #if file is not kegg, start with the next one...
        print "ERROR: $genomefile doesn't look like a KEGG file to me! \n";
        #<stdin>;
        next;
    }
#print $genomefile,"\n";
    my $stream = Bio::SeqIO->new(-file => $genomefile, -format => 'KEGG');

    while ( my $seq = $stream->next_seq() ) {

        my $primary_id = $seq->primary_id;
        my $display_id = $seq->display_id; #name
        my $keggaccn   = $seq->accession; #accn
        my @description = $seq->annotation->get_Annotations('description');
        
        my @dblinks     = $seq->annotation->get_Annotations('dblink');
        my @orthologs   = $seq->annotation->get_Annotations('ortholog');
        my @orthologs   = grep {$_->database eq 'KO'} $seq->annotation->get_Annotations('dblink');
        my @class       = $seq->annotation->get_Annotations('pathway');
         $ntseq{$keggaccn} = $seq->seq;
         $aaseq{$keggaccn} = $seq->translate->seq; 
         $aaseq{$keggaccn} =~s /\*$//;
                 my $fasta = ">".$count."\n".$aaseq{$keggaccn};
         my $newseq = Bio::FASTASequence->new($fasta);
         $crc64{$keggaccn}=$newseq->getCrc64();
#print $keggaccn,"crc64:$crc64{$keggaccn}\n";
        
        $count++;
        if ($keggaccn eq "") { print "PRIMARY KEY NOT FOUND no keggaccn\n";
        next;}    

        if(@dblinks)
        {
                my @dblink_KO=();
                my @dblink_Pfam=();
                my @dblink_PROSITE=();
                my @dblink_NCBIGI=();
                my @dblink_NCBIGENEID=();
                my @dblink_UniProt=();
        
                foreach my $ele (@dblinks) {
                    if ($ele =~ /^KO:/){
                        $ele=~s/KO://;
                        push (@dblink_KO,$ele);
                        $dblink_KO{$keggaccn}=$ele;
                        next;
                    }
                        #parse Pfam: dblink
                    if ($ele =~ /^Pfam:/){
                        $ele=~s/Pfam://;
                        push (@dblink_Pfam,$ele);
                        $dblink_Pfam{$keggaccn}=$ele;
                        next;
                    }
                        #parse PROSITE: dblink
                    if ($ele =~ /^PROSITE:/){
                        $ele=~s/PROSITE://;
                        push (@dblink_PROSITE,$ele);
                        $dblink_PROSITE{$keggaccn}=$ele;
                        next;
                    }
                        #parse NCBI-GI: dblink
                    if ($ele =~ /^NCBI-GI:/){
                        $ele=~s/NCBI-GI://;
                        push (@dblink_NCBIGI,$ele);
                        $dblink_NCBIGI{$keggaccn}=$ele;
                        next;
                    }
                        #parse NCBI-GeneID: dblink
                    if ($ele =~ /^NCBI-GeneID:/){
                        $ele=~s/NCBI-GeneID://;
                        push (@dblink_NCBIGENEID,$ele);
                        $dblink_NCBIGENEID{$keggaccn}=$ele;
                        next;
                        }
                        #parse UniProt: dblink
                    if ($ele =~ /^UniProt:/){
                        $ele=~s/UniProt://;
                        push (@dblink_UniProt,$ele);
                        $dblink_UniProt{$keggaccn}=$ele;
                        next;
                    }
            
                }#end foreach     #finished parsing all dblinks    
        }#end if @dblinks
        if(@class)
        {
            foreach my $pathway (@class) {
    
                $pathway=~s/^\s+|\s+$//;
                my @arr = split (/\s+/,$pathway);
                my $pathway_id = $arr[0];
                shift @arr;
                my $pathway_name = join(" ", at arr);
                $pathway_name{$keggaccn}=$pathway_name;
                $pathway_id{$keggaccn}=$pathway_id;
                #print $pathway_id{$keggaccn},"\t",$pathway_name{$keggaccn},"\n";
                                    
            }
            
        }
        
        my @ecnumbers=();
        @ecnumbers = extractECnumbers(@description);
        if(@ecnumbers)
        {
                if (@ecnumbers!=0) 
                {
                    foreach my $ecn (@ecnumbers) 
                    {
                       $ecnumbers{$keggaccn}=$ecn;
                    }#end foreach
                }
                else {
                    #print "ECnumbers:\n";
                     }
        }
        
        
#         print $keggaccn,"\t",$dblink_UniProt{$keggaccn},"\t",$dblink_NCBIGENEID{$keggaccn},
#                 "\t",$dblink_NCBIGI{$keggaccn},"\t","ec:$ecnumbers{$keggaccn}","\t",
#                  "p1:$pathway_id{$keggaccn}","\t","p2:$pathway_name{$keggaccn}","\n";
#         
                $dbh->do("INSERT INTO kegginfo VALUES (?,?, ?, ?, ?, ?,?,?,?,?,?,?,?,?,?,?)",
         undef,"NULL","NULL",$genomefile,$display_id,$keggaccn, at description,$ecnumbers{$keggaccn},
                  $pathway_id{$keggaccn},$pathway_name{$keggaccn},$crc64{$keggaccn},$dblink_KO{$keggaccn},
                 $dblink_Pfam{$keggaccn},$dblink_NCBIGI{$keggaccn},$dblink_NCBIGENEID{$keggaccn},
                 $dblink_UniProt{$keggaccn},$dblink_PROSITE{$keggaccn});
         

        $dbh->do("INSERT INTO keggaasequence VALUES (?,?,?,?)",
            undef,"",$keggaccn,$crc64{$keggaccn},$aaseq{$keggaccn});
                        

        $dbh->do("INSERT INTO keggntsequence VALUES (?,?,?)",
            undef,"",$keggaccn,$ntseq{$keggaccn});
                
               
    }
     $t2=new Benchmark;
    $time_used=timeThis($t1,$t2,"Finished parsing file $genomefile");
    $dbh->do("INSERT INTO timestable VALUES (?,?,?)",
    undef,"NULL",$genomefile,$time_used);
 
}


$dbh->do("CREATE INDEX keggIindex ON kegginfo (kg_id,keggaccn)");
print "Index created on kegginfo\n";

$dbh->do("CREATE INDEX keggaasequence1 ON keggaasequence (kg_id,keggaccn)");
print "Index created on keggaasequence\n";

$dbh->do("CREATE INDEX keggntsequence1 ON keggntsequence (kg_id,keggaccn)");
print "Index created on keggntsequence\n";


print"Updating the tables................\n";

    
$dbh->do("update kegginfo,keggaasequence set keggaasequence.kg_id=kegginfo.kg_id 
         where 
                kegginfo.keggaccn=keggaasequence.keggaccn");
        print " keggaasequence kg_id\n";

$dbh->do("update kegginfo,keggntsequence set keggntsequence.kg_id=kegginfo.kg_id 
         where 
                kegginfo.keggaccn=keggntsequence.keggaccn");
        print " keggaasequence kg_id\n";



sub extractECnumbers ($) {
    #sample description lines
     #riboflavin kinase / FMN adenylyltransferase [EC:2.7.1.26 2.7.7.2]
    #ATP synthase F0 subunit c [EC:3.6.3.14]

    my @desc=shift;
    my $description = join ("", at desc);
    my @ecnumbers=();
    #print "parsing ec for $description..\n";
    #check if EC number exists
    if ($description=~/\[EC:/) {
        
        my @array = split (/\[EC:/,$description);
        $array[1]=~s/]//g;
        shift @array; #remove the annotation , only EC numbers remain
        foreach my $ele (@array) {
            $ele=~s/^\s+|\s+$//g;
            $ele= "EC:".$ele;
            push (@ecnumbers,$ele);
        }    
        return @ecnumbers;
    }
    else {
        #return an empty value
        return ;

    }

}







sub checkKeggFormat ($) {
=head2

checkKeggFormat

make sure that the file is a valid KEGG file
function checks the first two lines,
1st must start with ENTRY
2nd must start with DEFINITION

returns 0 or 1

=cut
    my $genomefile=shift;

    open (TEST,$genomefile) || die "Cannot open file $genomefile for reading \n";
    my $testline=<TEST>;
#print "$testline\n";
    if ($testline=~/^ENTRY/) {
        #continue
        #$testline=<TEST>;#double check
        #if ($testline=~/^NAME/) {
            #this looks like a valid kegg file
            return 1;
        #}
        #else {
        #    close TEST;
        #    return 0;
        #}
    }
    else {
        close TEST;    
        return 0;
    }

}

sub timeThis ($$$) 
{
    my ($start,$end,$message) = @_;
    my $td = timediff($end, $start);
    my $t = timestr($td);    
        print "$message : ",$t,"\n";
        my @array = split (/\s+/,$t);
#20 wallclock secs (14.23 usr +  0.84 sys = 15.07 CPU)
        return $array[0]; #return the no. of seconds.
}

   

  
---------------------------------
Looking for earth-friendly autos? 
 Browse Top Cars by "Green Rating" at Yahoo! Autos' Green Center.  


More information about the Bioperl-l mailing list