[Bioperl-guts-l] bioperl-live/Bio/SeqIO embl.pm,1.92,1.93

Senduran Balasubramaniam sendu at dev.open-bio.org
Thu Aug 24 09:36:51 EDT 2006


Update of /home/repository/bioperl/bioperl-live/Bio/SeqIO
In directory dev.open-bio.org:/tmp/cvs-serv24141/Bio/SeqIO

Modified Files:
	embl.pm 
Log Message:
improved species handling; now more likely that output of input OS, OC lines matches input

Index: embl.pm
===================================================================
RCS file: /home/repository/bioperl/bioperl-live/Bio/SeqIO/embl.pm,v
retrieving revision 1.92
retrieving revision 1.93
diff -C2 -d -r1.92 -r1.93
*** embl.pm	12 Aug 2006 11:00:03 -0000	1.92
--- embl.pm	24 Aug 2006 13:36:49 -0000	1.93
***************
*** 652,669 ****
  
  		if ($seq->can('species') && (my $spec = $seq->species)) {
! 			my($species, @class) = $spec->classification();
! 			my $genus = $class[0];
! 			my $OS = "$genus $species";
! 			if (my $ssp = $spec->sub_species) {
! 				$OS .= " $ssp";
! 			}
! 			if (my $common = $spec->common_name) {
! 				$OS .= " ($common)";
! 			}
  			$self->_print("OS   $OS\n") || return;
  			my $OC = join('; ', reverse(@class)) .'.';
! 			$self->_write_line_EMBL_regex("OC   ","OC   ",$OC,'; |$',80) || return; #'
  			if ($spec->organelle) {
! 				$self->_write_line_EMBL_regex("OG   ","OG   ",$spec->organelle,'; |$',80) || return; #'
  			}
  			$self->_print("XX\n") || return;
--- 652,670 ----
  
  		if ($seq->can('species') && (my $spec = $seq->species)) {
! 			my @class = $spec->classification();
!             shift @class; # get rid of species name. Some embl files include
!                           # the species name in the OC lines, but this seems
!                           # more like an error than something we need to
!                           # emulate
! 			my $OS = $spec->scientific_name;
!             if ($spec->common_name) {
!                 $OS .= ' ('.$spec->common_name.')';
!             }
!             print "OS line is 'OS   $OS'\n";
  			$self->_print("OS   $OS\n") || return;
  			my $OC = join('; ', reverse(@class)) .'.';
! 			$self->_write_line_EMBL_regex("OC   ","OC   ",$OC,'; |$',80) || return;
  			if ($spec->organelle) {
! 				$self->_write_line_EMBL_regex("OG   ","OG   ",$spec->organelle,'; |$',80) || return;
  			}
  			$self->_print("XX\n") || return;
***************
*** 972,1029 ****
  
      $_ = $$buffer;
!     my( $sub_species, $species, $genus, $common, @class, $ns_name );
      while (defined( $_ ||= $self->_readline )) {
! 
!         if (/^OS\s+((\S+)(?:\s+([^\(]\S*))?(?:\s+([^\(]\S*))?(?:\s+\((.*)\))?)/) {
!             $ns_name = $1;
!             $genus   = $2;
! 	    $species = $3 || 'sp.';
! 	    $sub_species = $4 if $4;
!             $common      = $5 if $5;
          }
!         elsif (s/^OC\s+//) {
! 	    # only split on ';' or '.' so that 
! 	    # classification that is 2 words will 
! 	    # still get matched
! 	    # use map to remove trailing/leading spaces
! 	    chomp;
!             push(@class,  map { s/^\s+//; s/\s+$//; $_; } split /[;\.]+/);
          }
- 	elsif (/^OG\s+(.*)/) {
- 	    $org = $1;
- 	}
          else {
              last;
          }
! 
          $_ = undef; # Empty $_ to trigger read of next line
      }
  
      $$buffer = $_;
! 
      # Don't make a species object if it is "Unknown" or "None"
!     return if $genus =~ /^(Unknown|None)$/i;
! 
!     # Bio::Species array needs array in Species -> Kingdom direction
!     if ($class[0] eq 'Viruses') {
!         push( @class, $ns_name );
      }
!     elsif ($class[$#class] eq $genus) {
!         push( @class, $species );
      }
!     elsif ($class[$#class] eq "$genus $species") {
!         # no nothing    
!     } else {
!         push( @class, $genus, $species );
      }
      @class = reverse @class;
  
      my $make = Bio::Species->new();
!     $make->classification( \@class, "FORCE" );  # no name validation please
!     $make->common_name( $common      ) if $common;
      unless ($class[-1] eq 'Viruses') {
!         $make->sub_species( $sub_species ) if $sub_species;
      }
!     $make->organelle  ( $org         ) if $org;
      return $make;
  }
--- 973,1048 ----
  
      $_ = $$buffer;
!     my( $sub_species, $species, $genus, $common, $sci_name, $class_lines );
      while (defined( $_ ||= $self->_readline )) {
!         if (/^OS\s+(.+)/) {
!             $sci_name = $1;
          }
!         elsif (s/^OC\s+(.+)$//) {
!             $class_lines .= $1;
!         }
!         elsif (/^OG\s+(.*)/) {
!             $org = $1;
          }
          else {
              last;
          }
!         
          $_ = undef; # Empty $_ to trigger read of next line
      }
  
      $$buffer = $_;
!     
!     # Convert data in classification lines into classification array.
!     # only split on ';' or '.' so that classification that is 2 or more words
!     # will still get matched, use map() to remove trailing/leading/intervening
!     # spaces
!     my @class = map { s/^\s+//; s/\s+$//; s/\s{2,}/ /g; $_; } split /[;\.]+/, $class_lines;
!     
!     # do we have a genus?
!     my $possible_genus = $class[-1];
!     if ($sci_name =~ /^$possible_genus/) {
!         $genus = $possible_genus;
!         ($species) = $sci_name =~ /^$genus\s+(.+)/;
!     }
!     else {
!         $species = $sci_name;
!     }
!     
      # Don't make a species object if it is "Unknown" or "None"
!     if ($genus) {
!         return if $genus =~ /^(Unknown|None)$/i;
      }
!     
!     # is this organism of rank species or is it lower?
!     # (doesn't catch everything, but at least the guess isn't dangerous)
!     if ($species =~ /subsp\.|var\./) {
!         ($species, $sub_species) = $species =~ /(.+)\s+((?:subsp\.|var\.).+)/;
      }
!     
!     # sometimes things have common name in brackets, like
!     # Schizosaccharomyces pombe (fission yeast), so get rid of the common
!     # name bit. Probably dangerous if real scientific species name ends in
!     # bracketed bit.
!     unless ($class[-1] eq 'Viruses') {
!         ($species, $common) = $species =~ /^(.+)\s+\((.+)\)$/;
!         $sci_name =~ s/\s+\(.+\)$// if $common;
!     }
!      
!     # Bio::Species array needs array in Species -> Kingdom direction
!     unless ($class[-1] eq $sci_name) {
!         push(@class, $sci_name);
      }
      @class = reverse @class;
  
      my $make = Bio::Species->new();
!     $make->scientific_name($sci_name);
!     $make->classification(@class);
      unless ($class[-1] eq 'Viruses') {
!         $make->genus($genus) if $genus;
!         $make->species($species) if $species;
!         $make->sub_species($sub_species) if $sub_species;
!         $make->common_name($common) if $common;
      }
!     $make->organelle($org) if $org;
      return $make;
  }



More information about the Bioperl-guts-l mailing list