[Bioperl-guts-l] bioperl-live/Bio/SeqIO genbank.pm,1.163,1.164

Christopher John Fields cjfields at dev.open-bio.org
Tue Jun 5 19:46:14 EDT 2007


Update of /home/repository/bioperl/bioperl-live/Bio/SeqIO
In directory dev.open-bio.org:/tmp/cvs-serv19478/Bio/SeqIO

Modified Files:
	genbank.pm 
Log Message:
Partial fix for bug 2305

Index: genbank.pm
===================================================================
RCS file: /home/repository/bioperl/bioperl-live/Bio/SeqIO/genbank.pm,v
retrieving revision 1.163
retrieving revision 1.164
diff -C2 -d -r1.163 -r1.164
*** genbank.pm	30 Jan 2007 15:17:04 -0000	1.163
--- genbank.pm	5 Jun 2007 23:46:12 -0000	1.164
***************
*** 154,157 ****
--- 154,158 ----
  James Wasmuth, james.wasmuth at ed.ac.uk
  Brian Osborne, bosborne at alum.mit.edu
+ Chris Fields, cjfields at uiuc dot edu
  
  =head1 APPENDIX
***************
*** 212,217 ****
      if( ! defined $self->sequence_factory ) {
              $self->sequence_factory(new Bio::Seq::SeqFactory
!                                                                             (-verbose => $self->verbose(),
!                                                                              -type => 'Bio::Seq::RichSeq'));
      }
  }
--- 213,218 ----
      if( ! defined $self->sequence_factory ) {
              $self->sequence_factory(new Bio::Seq::SeqFactory
!                             (-verbose => $self->verbose(),
!                              -type => 'Bio::Seq::RichSeq'));
      }
  }
***************
*** 255,259 ****
  
  	my @tokens = split(' ', $1);
! 
  	# this is important to have the id for display in e.g. FTHelper,
  	# otherwise you won't know which entry caused an error
--- 256,263 ----
  
  	my @tokens = split(' ', $1);
!     
!     # there should be at least six tokens in the LOCUS line; if not we may be in trouble...
!     $self->warn('Missing tokens in the LOCUS line; output may be malformed') if @tokens < 6;
!     
  	# this is important to have the id for display in e.g. FTHelper,
  	# otherwise you won't know which entry caused an error
***************
*** 269,278 ****
  	    my $circ = shift(@tokens);
  	    if ($circ eq 'circular') {
! 		$params{'-is_circular'} = 1;
! 		$params{'-division'} = shift(@tokens);
  	    } else {
! 				# 'linear' or 'circular' may actually be omitted altogether
! 		$params{'-division'} =
! 		    (CORE::length($circ) == 3 ) ? $circ : shift(@tokens);
  	    }
  	} else {
--- 273,282 ----
  	    my $circ = shift(@tokens);
  	    if ($circ eq 'circular') {
!             $params{'-is_circular'} = 1;
!             $params{'-division'} = shift(@tokens);
  	    } else {
! 			# 'linear' or 'circular' may actually be omitted altogether
!             $params{'-division'} =
!                 (CORE::length($circ) == 3 ) ? $circ : shift(@tokens);
  	    }
  	} else {
***************
*** 290,311 ****
  	if($date =~ s/\s*((\d{1,2})-(\w{3})-(\d{2,4})).*/$1/) {
  	    if( length($date) < 11 ) {
! 		# improperly formatted date
! 		# But we'll be nice and fix it for them
! 		my ($d,$m,$y) = ($2,$3,$4);
! 		if( length($d) == 1 ) {
! 		    $d = "0$d";
! 		}
! 		# guess the century here
! 		if( length($y) == 2 ) {
! 		    if( $y > 60 ) { # arbitrarily guess that '60' means 1960
! 			$y = "19$y";
! 		    } else {
! 			$y = "20$y";
! 		    }
! 		    $self->warn("Date was malformed, guessing the century for $date to be $y\n");
! 		}
! 		$params{'-dates'} = [join('-',$d,$m,$y)];
! 	    } else {
! 		$params{'-dates'} = [$date];
  	    }
  	}
--- 294,315 ----
  	if($date =~ s/\s*((\d{1,2})-(\w{3})-(\d{2,4})).*/$1/) {
  	    if( length($date) < 11 ) {
!             # improperly formatted date
!             # But we'll be nice and fix it for them
!             my ($d,$m,$y) = ($2,$3,$4);
!             if( length($d) == 1 ) {
!                 $d = "0$d";
!             }
!             # guess the century here
!             if( length($y) == 2 ) {
!                 if( $y > 60 ) { # arbitrarily guess that '60' means 1960
!                     $y = "19$y";
!                 } else {
!                     $y = "20$y";
!                 }
!                 $self->warn("Date was malformed, guessing the century for $date to be $y\n");
!             }
!             $params{'-dates'} = [join('-',$d,$m,$y)];
!         } else {
!             $params{'-dates'} = [$date];
  	    }
  	}
***************
*** 354,358 ****
  	    }
  	    # Version number
! 	    elsif( /^VERSION\s+(.+)$/ ) {
  		my ($acc,$gi) = split(' ',$1);
  		if($acc =~ /^\w+\.(\d+)/) {
--- 358,362 ----
  	    }
  	    # Version number
! 	    elsif( /^VERSION\s+(\S.+)$/ ) {
  		my ($acc,$gi) = split(' ',$1);
  		if($acc =~ /^\w+\.(\d+)/) {
***************
*** 365,369 ****
  	    }
  	    # Keywords
! 	    elsif( /^KEYWORDS\s+(.*)/ ) {
  		my @kw = split(/\s*\;\s*/,$1);
  		while( defined($_ = $self->_readline) ) {
--- 369,373 ----
  	    }
  	    # Keywords
! 	    elsif( /^KEYWORDS\s+(\S.*)/ ) {
  		my @kw = split(/\s*\;\s*/,$1);
  		while( defined($_ = $self->_readline) ) {
***************
*** 379,383 ****
  	    }
  	    # Organism name and phylogenetic information
! 	    elsif (/^SOURCE/) {
  		if($builder->want_slot('species')) {
  		    $species = $self->_read_GenBank_Species(\$buffer);
--- 383,387 ----
  	    }
  	    # Organism name and phylogenetic information
! 	    elsif (/^SOURCE\s+\S/) {
  		if($builder->want_slot('species')) {
  		    $species = $self->_read_GenBank_Species(\$buffer);
***************
*** 391,395 ****
  	    }
  	    # References
! 	    elsif (/^REFERENCE/) {
  		if($annotation) {
  		    my @refs = $self->_read_GenBank_References(\$buffer);
--- 395,399 ----
  	    }
  	    # References
! 	    elsif (/^REFERENCE\s+\S/) {
  		if($annotation) {
  		    my @refs = $self->_read_GenBank_References(\$buffer);
***************
*** 405,409 ****
  	    }
  	    # Comments
! 	    elsif (/^COMMENT\s+(.*)/) {
  		if($annotation) {
  		    my $comment = $1;
--- 409,413 ----
  	    }
  	    # Comments
! 	    elsif (/^COMMENT\s+(\S.*)/) {
  		if($annotation) {
  		    my $comment = $1;
***************
*** 426,430 ****
  	    }
  	    # Corresponding Genbank nucleotide id, Genpept only
! 	    elsif( /^DBSOURCE\s+(.+)/ ) {
  		if ($annotation) {
  		    my $dbsource = $1;
--- 430,434 ----
  	    }
  	    # Corresponding Genbank nucleotide id, Genpept only
! 	    elsif( /^DBSOURCE\s+(\S.+)/ ) {
  		if ($annotation) {
  		    my $dbsource = $1;
***************
*** 1059,1066 ****
  	}
      }
- 
  }
  
- 
  =head2 _read_GenBank_References
  
--- 1063,1068 ----
***************
*** 1280,1291 ****
          # escaped '>', so a simple-minded approach suffices)
          $line =~ s{<[^>]+>}{}g;
!         if ($line =~ m{^(?:\s{0,2})(\w+)\s+(.+)$}ox) {
!             ($tag, $data) = ($1, $2);
          } else {
              ($data = $line) =~ s{^\s+}{};
              chomp $data;
              $tag = 'CLASSIFICATION' if ($tag ne 'CLASSIFICATION' && $tag eq 'ORGANISM' &&  $line =~ m{[;\.]+});
          }
-         last if ($tag && !exists $source{$tag});
          (exists $ann->{$tag}) ? ($ann->{$tag} .= ' '.$data) : ($ann->{$tag} .= $data);
          $line = undef;        
--- 1282,1294 ----
          # escaped '>', so a simple-minded approach suffices)
          $line =~ s{<[^>]+>}{}g;
!         if ($line =~ m{^(?:\s{0,2})(\w+)\s+(.+)?$}ox) {
!             ($tag, $data) = ($1, $2 || '');
!             last if ($tag && !exists $source{$tag});            
          } else {
+             return unless $tag;
              ($data = $line) =~ s{^\s+}{};
              chomp $data;
              $tag = 'CLASSIFICATION' if ($tag ne 'CLASSIFICATION' && $tag eq 'ORGANISM' &&  $line =~ m{[;\.]+});
          }
          (exists $ann->{$tag}) ? ($ann->{$tag} .= ' '.$data) : ($ann->{$tag} .= $data);
          $line = undef;        
***************
*** 1294,1298 ****
      ($sl, $class_lines, $sci_name) = ($ann->{SOURCE}, $ann->{CLASSIFICATION}, $ann->{ORGANISM});
      
!     $$buffer = $line;   
  
      # parse out organelle, common name, abbreviated name if present;
--- 1297,1303 ----
      ($sl, $class_lines, $sci_name) = ($ann->{SOURCE}, $ann->{CLASSIFICATION}, $ann->{ORGANISM});
      
!     $$buffer = $line;
! 
!     $sci_name || return;
  
      # parse out organelle, common name, abbreviated name if present;
***************
*** 1310,1315 ****
      }
  
-     $sci_name || return;
- 
      # Convert data in classification lines into classification array.
      # only split on ';' or '.' so that classification that is 2 or more words will
--- 1315,1318 ----



More information about the Bioperl-guts-l mailing list