[Bioperl-guts-l] bioperl-live/Bio/Tools Glimmer.pm,1.6,1.7

Christopher John Fields cjfields at dev.open-bio.org
Thu Feb 15 14:36:13 EST 2007


Update of /home/repository/bioperl/bioperl-live/Bio/Tools
In directory dev.open-bio.org:/tmp/cvs-serv390/Bio/Tools

Modified Files:
	Glimmer.pm 
Log Message:
Bug 2206 - updates to allow all Glimmer formats to be parsed

Index: Glimmer.pm
===================================================================
RCS file: /home/repository/bioperl/bioperl-live/Bio/Tools/Glimmer.pm,v
retrieving revision 1.6
retrieving revision 1.7
diff -C2 -d -r1.6 -r1.7
*** Glimmer.pm	26 Sep 2006 22:03:17 -0000	1.6
--- Glimmer.pm	15 Feb 2007 19:36:11 -0000	1.7
***************
*** 13,17 ****
  =head1 NAME
  
! Bio::Tools::Glimmer - parser for GlimmerM/GlimmerHMM eukaryotic gene predictions
  
  =head1 SYNOPSIS
--- 13,18 ----
  =head1 NAME
  
! Bio::Tools::Glimmer - parser for Glimmer 2.X/3.X prokaryotic and 
! GlimmerM/GlimmerHMM eukaryotic gene predictions
  
  =head1 SYNOPSIS
***************
*** 19,25 ****
--- 20,31 ----
     use Bio::Tools::Glimmer;
  
+    # file
     my $parser = new Bio::Tools::Glimmer(-file => $file);
     # filehandle:
     $parser = Bio::Tools::Glimmer->new( -fh  => \*INPUT );
+    # provide a sequence identifier (Glimmer 2.X)
+    my $parser = Bio::Tools::Glimmer->new(-file => $file, -seqname => seqname);
+    # force format (override automatic detection)
+    my $parser = Bio::Tools::Glimmer->new(-file => $file, -format => 'GlimmerM');
  
     # parse the results
***************
*** 46,54 ****
  =head1 DESCRIPTION
  
! This is a module for parsing GlimmerM and GlimmerHMM predictions 
  It will create gene objects from the prediction report which can 
  be attached to a sequence using Bioperl objects, or output as GFF 
  suitable for loading into Bio::DB::GFF for use with Gbrowse.
  
  GlimmerM is open source and available at 
  L<http://www.tigr.org/software/glimmerm/>.
--- 52,63 ----
  =head1 DESCRIPTION
  
! This is a module for parsing Glimmer, GlimmerM and GlimmerHMM predictions.  
  It will create gene objects from the prediction report which can 
  be attached to a sequence using Bioperl objects, or output as GFF 
  suitable for loading into Bio::DB::GFF for use with Gbrowse.
  
+ Glimmer is open source and available at
+ L<http://www.cbcb.umd.edu/software/glimmer/>.
+ 
  GlimmerM is open source and available at 
  L<http://www.tigr.org/software/glimmerm/>.
***************
*** 57,65 ****
  L<http://www.cbcb.umd.edu/software/GlimmerHMM/>.
  
! =head1 BUGS
  
- This module does B<not> parse Glimmer2 or Glimmer3 bacterial gene
- prediction files. Details on their output formats can be found at
- L<http://www.cbcb.umd.edu/software/glimmer/>.
  
  =head1 FEEDBACK
--- 66,76 ----
  L<http://www.cbcb.umd.edu/software/GlimmerHMM/>.
  
! Note that Glimmer 2.X will only process the first
! sequence in a fasta file, and the prediction report does not contain any
! sort of sequence identifier
! 
! Note that Glimmer 3.X produces two output files.  This module only parses
! the .predict file.
  
  
  =head1 FEEDBACK
***************
*** 128,132 ****
   Function: Builds a new Bio::Tools::Glimmer object 
   Returns : an instance of Bio::Tools::Glimmer
!  Args    :
  
  
--- 139,143 ----
   Function: Builds a new Bio::Tools::Glimmer object 
   Returns : an instance of Bio::Tools::Glimmer
!  Args    : format ('Glimmer', 'GlimmerM', 'GlimmerHMM'), seqname
  
  
***************
*** 137,140 ****
--- 148,166 ----
  
    my $self = $class->SUPER::new(@args);
+ 
+   my ($format, $seqname) = $self->_rearrange([qw(FORMAT SEQNAME)], @args);
+ 
+   # override automagic format detection
+   if (defined($format) &&
+       (($format eq 'Glimmer')  ||
+        ($format eq 'GlimmerM') ||
+        ($format eq 'GlimmerHMM'))
+   ) {
+       $self->_format($format);
+   }
+   
+   # hardwire seq_id when creating gene and exon objects (Glimmer 2.X)
+   $self->_seqname($seqname) if defined($seqname);
+   
    return $self;
  }
***************
*** 209,213 ****
      # if the prediction section hasn't been parsed yet, we do this now
      $self->_parse_predictions() unless $self->_predictions_parsed();
! 
      # get next gene structure
      $gene = $self->_prediction();
--- 235,239 ----
      # if the prediction section hasn't been parsed yet, we do this now
      $self->_parse_predictions() unless $self->_predictions_parsed();
!     
      # get next gene structure
      $gene = $self->_prediction();
***************
*** 227,230 ****
--- 253,318 ----
  
  sub _parse_predictions {
+ 
+     my ($self) = @_;
+ 
+     
+     my %method = (
+                   'Glimmer'    => '_parse_prokaryotic',
+                   'GlimmerM'   => '_parse_eukaryotic',
+                   'GlimmerHMM' => '_parse_eukaryotic',
+                   '_DEFAULT_'  => '_parse_eukaryotic',
+               );
+     
+     my $format = $self->_format();
+     
+     if (!$format) {
+         
+         while (my $line = $self->_readline()) {
+ 
+             if ( $line =~ /^Glimmer\S*\s+\(Version\s*\S+\)/ ) {
+                 $format = 'GlimmerM';
+                 $self->_pushback($line);
+                 last;
+             }
+             elsif ( $line =~ /^Glimmer\S*$/ ) {
+                 $format = 'GlimmerHMM';
+                 $self->_pushback($line);
+                 last;
+             }
+             elsif ($line =~ /^Putative Genes:$/) {
+                 $format = 'Glimmer';
+                 $self->_pushback($line);
+                 last;
+             }
+             elsif ($line =~ /^>(\S+)/) {
+                 $format = 'Glimmer';
+                 $self->_pushback($line);
+                 last;
+             }
+             
+         }
+         
+     }
+ 
+     my $method =
+         (exists($method{$format})) ? $method{$format} : $method{'_DEFAULT_'};
+ 
+     return $self->$method();
+     
+ }
+ 
+ 
+ =head2 _parse_eukaryotic
+ 
+  Title   : _parse_eukaryotic()
+  Usage   : $obj->_parse_eukaryotic()
+  Function: Parses the prediction section. Automatically called by
+            next_prediction() if not yet done.
+  Example :
+  Returns : 
+ 
+ =cut
+ 
+ sub _parse_eukaryotic {
      my ($self) = @_;
  
***************
*** 235,239 ****
  	    $source = "$1_$2";
  	    next;
! 	} elsif( /^(Glimmer\S*)$/ ) { # GlimmerHMM has no version
  	    $source = $1;
  	    next;
--- 323,327 ----
  	    $source = "$1_$2";
  	    next;
! 	} elsif( /^(GlimmerHMM\S*)$/ ) { # GlimmerHMM has no version
  	    $source = $1;
  	    next;
***************
*** 284,287 ****
--- 372,462 ----
  }
  
+ =head2 _parse_prokaryotic
+ 
+  Title   : _parse_prokaryotic()
+  Usage   : $obj->_parse_prokaryotic()
+  Function: Parses the prediction section. Automatically called by
+            next_prediction() if not yet done.
+  Example :
+  Returns : 
+ 
+ =cut
+ 
+ sub _parse_prokaryotic {
+     my ($self) = @_;
+ 
+     # default value, possibly overriden later
+     my $source = 'Glimmer';
+ 
+     # Glimmer 2.X does not provide a sequence identifer
+     # in the prediction report
+     my $seqname = $self->_seqname();
+     
+     while(defined($_ = $self->_readline())) {
+         # Glimmer 3.X does provide a sequence identifier -
+         # beware whitespace at the end (comes through from
+         # (the fasta file)
+         if ($_ =~ /^Putative Genes:$/) {
+             $source = 'Glimmer_2.X';
+             next;
+         }
+         # Glimmer 3.X sequence identifier
+         elsif ($_ =~ /^>(\S+)/) {
+             $seqname = $1;
+             $source = 'Glimmer_3.X';
+             next;
+         }        
+         elsif (
+                # Glimmer 2.X prediction 
+                (/^\s+(\d+)\s+      # gene num
+                 (\d+)\s+(\d+)\s+   # start, end
+                 \[([\+\-])\d{1}\s+ # strand
+                 /ox ) ||
+                # Glimmer 3.X prediction
+                (/\w+(\d+)\s+       # orf (numeric portion)
+                 (\d+)\s+(\d+)\s+   # start, end
+                 ([\+\-])\d{1}\s+   # strand
+                /ox)) {
+ 	    my ($genenum,$start,$end,$strand) = 
+ 		( $1,$2,$3,$4 );
+ 
+             # Glimmer 2.X predictions do not include
+             # the stop codon - this might extend the
+             # prediction off either end of the sequence
+             if ($source eq 'Glimmer_2.X') {
+                 if ($strand eq '-') {
+                     $end -= 3;
+                 }
+                 else {
+                     $end += 3;
+                 }
+             }
+             
+             my $exon = new Bio::Tools::Prediction::Exon
+                 ('-seq_id'     => $seqname,
+                  '-start'      => $start,
+                  '-end'        => $end,
+                  '-strand'     => $strand eq '-' ? '-1' : '1',
+                  '-source_tag' => $source,
+                  '-primary_tag'=> 'exon',
+                  '-tag'         => { 'Group' => "GenePrediction_$genenum"},
+              );
+             
+             my $gene = Bio::Tools::Prediction::Gene->new
+                 (
+                  '-seq_id'      => $seqname,
+                  '-primary_tag' => "gene",
+                  '-source_tag'  => $source,
+                  '-tag'         => { 'Group' => "GenePrediction_$genenum"},
+              );
+             
+             $gene->add_exon($exon);
+             $self->_add_prediction($gene) 
+ 	}
+     }
+     
+     $self->_predictions_parsed(1);
+ }
+ 
  =head2 _prediction
  
***************
*** 340,343 ****
--- 515,555 ----
  }
  
+ =head2 _seqname
+ 
+  Title   : _seqname
+  Usage   : $obj->_seqname($seqname)
+  Function: internal (for Glimmer 2.X)
+  Example :
+  Returns : String
+ 
+ =cut
+ 
+ sub _seqname {
+     my ($self, $val) = @_;
+ 
+     $self->{'_seqname'} = $val if $val;
+     if(! exists($self->{'_seqname'})) {
+ 	$self->{'_seqname'} = 'unknown';
+     }
+     return $self->{'_seqname'};
+ }
+ 
+ =head2 _format
+ 
+  Title   : _format
+  Usage   : $obj->_format($format)
+  Function: internal
+  Example :
+  Returns : String
+ 
+ =cut
+ 
+ sub _format {
+     my ($self, $val) = @_;
+ 
+     $self->{'_format'} = $val if $val;
+ 
+     return $self->{'_format'};
+ }
  
  1;



More information about the Bioperl-guts-l mailing list