[Bioperl-guts-l] bioperl-live/Bio/SeqIO genbank.pm,1.119,1.120

Brian Osborne bosborne at pub.open-bio.org
Thu Aug 25 21:51:31 EDT 2005


Update of /home/repository/bioperl/bioperl-live/Bio/SeqIO
In directory pub.open-bio.org:/tmp/cvs-serv1742/Bio/SeqIO

Modified Files:
	genbank.pm 
Log Message:
Fix genbank.pm so it treats multi-line SOURCE sections correctly, this defect broke both common_name() and classification(). Unfortunately the fix is an unattractive hack - I believe that re-writing genbank.pm to accomodate N lines per section would be a substantial amount of work.


Index: genbank.pm
===================================================================
RCS file: /home/repository/bioperl/bioperl-live/Bio/SeqIO/genbank.pm,v
retrieving revision 1.119
retrieving revision 1.120
diff -C2 -d -r1.119 -r1.120
*** genbank.pm	16 Apr 2005 16:21:28 -0000	1.119
--- genbank.pm	26 Aug 2005 01:51:28 -0000	1.120
***************
*** 18,22 ****
  
  It is probably best not to use this object directly, but
! rather go through the SeqIO handler system. Go:
  
      $stream = Bio::SeqIO->new(-file => $filename,
--- 18,22 ----
  
  It is probably best not to use this object directly, but
! rather go through the SeqIO handler:
  
      $stream = Bio::SeqIO->new(-file => $filename,
***************
*** 33,38 ****
  file databases.
  
! There is alot of flexibility here about how to dump things which I need
! to document fully.
  
  =head2 Optional functions
--- 33,38 ----
  file databases.
  
! There is some flexibility here about how to write GenBank output
! that is not fully documented.
  
  =head2 Optional functions
***************
*** 77,83 ****
  
  Data parsed in Bio::SeqIO::genbank is stored in a variety of data
! fields in the sequence object that is returned.  More information in
! the HOWTOs about exactly what each field means and where it goes.
! Here is a partial list of fields.
  
  Items listed as RichSeq or Seq or PrimarySeq and then NAME() tell you
--- 77,82 ----
  
  Data parsed in Bio::SeqIO::genbank is stored in a variety of data
! fields in the sequence object that is returned. Here is a partial list
! of fields.
  
  Items listed as RichSeq or Seq or PrimarySeq and then NAME() tell you
***************
*** 116,119 ****
--- 115,121 ----
   Sequence             PrimarySeq seq()
  
+ There is more information in the Feature-Annotation HOWTO about each 
+ field and how it is mapped to the Sequence object.
+ 
  =head1 FEEDBACK
  
***************
*** 132,137 ****
  
  Report bugs to the Bioperl bug tracking system to help us keep track
! the bugs and their resolution.
! Bug reports can be submitted via email or the web:
  
    http://bugzilla.bioperl.org/
--- 134,138 ----
  
  Report bugs to the Bioperl bug tracking system to help us keep track
! the bugs and their resolution. Bug reports can be submitted via the web:
  
    http://bugzilla.bioperl.org/
***************
*** 1180,1192 ****
             formats, and varietas in plants
   Example : ORGANISM  unknown marine gamma proteobacterium NOR5
!            $genus = undef;
             $species = unknown marine gamma proteobacterium NOR5
  
             ORGANISM  Drosophila sp. 'white tip scutellum'
!            $genus = Drosophila; $species = sp.;
!            $subspecies = 'white tip scutellum'
  
             ORGANISM  Ajellomyces capsulatus var. farciminosus
!            $genus = Ajellomyces; $species = capsulatus var.;
             $subspecies = farciminosus
  
--- 1181,1195 ----
             formats, and varietas in plants
   Example : ORGANISM  unknown marine gamma proteobacterium NOR5
!            $genus = undef
             $species = unknown marine gamma proteobacterium NOR5
  
             ORGANISM  Drosophila sp. 'white tip scutellum'
!            $genus = Drosophila
!            $species = sp.
!            $subspecies = white tip scutellum
  
             ORGANISM  Ajellomyces capsulatus var. farciminosus
!            $genus = Ajellomyces
!            $species = capsulatus var.
             $subspecies = farciminosus
  
***************
*** 1197,1307 ****
  
  sub _read_GenBank_Species {
!     my( $self,$buffer) = @_;
!     my @organell_names = ("chloroplast", "mitochondr"); 
!     # only those carrying DNA, apart from the nucleus
! 	
! 	my @unkn_names=("other", 'unknown organism', 'not specified', 'not shown', 'Unspecified', 'Unknown', 'None', 'unclassified', 'unidentified organism', 'not supplied');
! 	#dictionary of synonyms for taxid 32644
! 	my @unkn_genus=('unknown','unclassified','uncultured','unidentified');
! 	#all above can be part of valid species name
!     
  	$_ = $$buffer;
!     
!     my( $sub_species, $species, $genus, $common, $organelle, @class, $ns_name );
!     # upon first entering the loop, we must not read a new line -- the SOURCE
!     # line is already in the buffer (HL 05/10/2000)
!     while (defined($_) || defined($_ = $self->_readline())) {
! 	# de-HTMLify (links that may be encountered here don't contain
! 	# escaped '>', so a simple-minded approach suffices)
!         s/<[^>]+>//g;
! 	if (/^SOURCE\s+(.*)/o) {
! 	    # FIXME this is probably mostly wrong (e.g., it yields things like
! 	    # Homo sapiens adult placenta cDNA to mRNA
! 	    # which is certainly not what you want)
! 	    $common = $1;
! 	    $common =~ s/\.$//; # remove trailing dot
! 	} elsif (/^\s{2}ORGANISM/o) {
! 	    my @spflds = split(' ', $_);
!             ($ns_name) = $_ =~ /\w+\s+(.*)/o;
! 	    shift(@spflds); # ORGANISM
! 	   #does the next term start with uppercase?
! 		#yes: valid genus; no then unconventional
! 		#e.g. leaf litter basidiomycete sp. Collb2-39
! 		if ($spflds[0]=~m/^[A-Z]/)	{
! 			$genus=shift(@spflds);
! 		} else { undef $genus; }
! 		#populate species tag
! 		if (@spflds)	{
! 			#my $size=scalar @spflds;
! 			while (my $fld = shift @spflds)	{
! 				$species .= "$fld ";
! 				#does it have subspecies or varietas?
! 				last if ($fld =~ m/(sp\.|var\.)/);
  			}
! 			chop $species;	#last space
! 			$sub_species = join ' ', at spflds if(@spflds);
! 		}
! 		else { $species = 'sp.'; }
! 		#does ORGANISM start with any words which make its genus undefined?
! 		#these are in @unkn_genus	
! 		#this in case species starts with uppercase so isn't caught above. 
! 		#alter common name if required
! 		my $unconv=0;	#is it unconventional species name?
! 		foreach (@unkn_genus)	{
! 			if ($genus && $genus=~m/$_/i)	{
! 				$species = $genus." ".$species; undef $genus;
! 				$unconv=1;
! 				last;
  			}
! 			elsif ($species=~m/$_/i)	{
! 				$unconv=1;
! 				last;
  			}
  		}
! 		if (!$unconv && !$sub_species && $species =~s/^(\w+)\s(\w+)$/$1/)	{
! 			#need to extract subspecies from conventional ORGANISM format.  
! 			#Will the 'word' in a two element species name
! 			#e.g. $species = 'thummi thummi' => $species='thummi' & $sub_species='thummi' 	
! 				$sub_species = $2;
  		}
! 		
! 	} elsif (/^\s+(.+)/o) {
! 	    # only split on ';' or '.' so that 
! 	    # classification that is 2 words will 
! 	    # still get matched
! 	    # use map to remove trailing/leading spaces
!             push(@class, map { s/^\s+//; s/\s+$//; $_; } split /[;\.]+/, $1);
!         } else {
!             last;
!         }
!         
!         $_ = undef; # Empty $_ to trigger read of next line
!     }
!      $$buffer = $_;
!     
!     # Don't make a species object if it's empty or "Unknown" or "None"
!     #return unless $genus and  $genus !~ /^(Unknown|None)$/oi;
!     	 # Don't make a species object if it belongs to taxid 32644
! 	 my $unkn = grep { $_ =~ /^\Q$common\E$/; } @unkn_names;
! 	 return unless ($species||$genus) and $unkn==0;
  	# Bio::Species array needs array in Species -> Kingdom direction
!     if ($class[0] eq 'Viruses') {
!         push( @class, $ns_name );
!     }
!     elsif ($genus && $class[$#class] eq $genus) {
!         push( @class, $species );
!     } else {
!         push( @class, $genus, $species );
!     }
!     @class = reverse @class;
!     
!     my $make = Bio::Species->new();
!     $make->classification( \@class, "FORCE" ); # no name validation please
!     $make->common_name( $common      ) if $common;
!     unless ($class[-1] eq 'Viruses') {
!         $make->sub_species( $sub_species ) if $sub_species;
!     }
!     $make->organelle($organelle) if $organelle;
!     return $make;
  }
  
--- 1200,1322 ----
  
  sub _read_GenBank_Species {
! 	my( $self,$buffer) = @_;
! 	my @organell_names = ("chloroplast", "mitochondr"); 
! 	# only those carrying DNA, apart from the nucleus
! 
! 	my @unkn_names = ("other", 'unknown organism', 'not specified', 'not shown',
! 							'Unspecified', 'Unknown', 'None', 'unclassified',
! 							'unidentified organism', 'not supplied');
! 	# dictionary of synonyms for taxid 32644
! 	my @unkn_genus = ('unknown','unclassified','uncultured','unidentified');
! 	# all above can be part of valid species name
! 
  	$_ = $$buffer;
! 
! 	my( $sub_species, $species, $genus, $common, $organelle, @class,
! 		 $ns_name, $source_flag );
! 	# upon first entering the loop, we must not read a new line -- the SOURCE
! 	# line is already in the buffer (HL 05/10/2000)
! 	while (defined($_) || defined($_ = $self->_readline())) {
! 		# de-HTMLify (links that may be encountered here don't contain
! 		# escaped '>', so a simple-minded approach suffices)
! 		s/<[^>]+>//g;
! 		if (/^SOURCE\s+(.*)/o) {
! 			# FIXME this is probably mostly wrong (e.g., it yields things like
! 			# Homo sapiens adult placenta cDNA to mRNA
! 			# which is certainly not what you want)
! 			$common = $1;
! 			$common =~ s/\.$//; # remove trailing dot
! 			$source_flag = 1; ###
! 		} elsif (/^\s{2}ORGANISM/o) {
! 			$source_flag = 0; ###
! 			my @spflds = split(' ', $_);
! 			($ns_name) = $_ =~ /\w+\s+(.*)/o;
! 			shift(@spflds); # ORGANISM
! 			# does the next term start with uppercase?
! 			# yes: valid genus; no then unconventional
! 			# e.g. leaf litter basidiomycete sp. Collb2-39
! 			if ($spflds[0] =~ m/^[A-Z]/)	{
! 				$genus = shift(@spflds);
! 			} else { undef $genus; }
! 			# populate species tag
! 			if (@spflds)	{
! 				# my $size=scalar @spflds;
! 				while (my $fld = shift @spflds)	{
! 					$species .= "$fld ";
! 					# does it have subspecies or varietas?
! 					last if ($fld =~ m/(sp\.|var\.)/);
! 				}
! 				chop $species;	# last space
! 				$sub_species = join ' ', at spflds if(@spflds);
  			}
! 			else { $species = 'sp.'; }
! 			# does ORGANISM start with any words which make its genus undefined?
! 			# these are in @unkn_genus	
! 			# this in case species starts with uppercase so isn't caught above. 
! 			# alter common name if required
! 			my $unconv = 0; # is it unconventional species name?
! 			foreach (@unkn_genus)	{
! 				if ($genus && $genus =~ m/$_/i)	{
! 					$species = $genus ." ". $species;
! 					undef $genus;
! 					$unconv = 1;
! 					last;
! 				}
! 				elsif ($species =~ m/$_/i)	{
! 					$unconv=1;
! 					last;
! 				}
  			}
! 			if (!$unconv && !$sub_species && $species =~ s/^(\w+)\s(\w+)$/$1/)	{
! 				# need to extract subspecies from conventional ORGANISM format.  
! 				# Will the 'word' in a two element species name
! 				# e.g. $species = 'thummi thummi' => $species='thummi' & 
! 				# $sub_species='thummi'
! 				$sub_species = $2;
  			}
  		}
! 		elsif ($source_flag) {
! 			$common .= $_;
! 			$common =~ s/\n//g;
! 			$common =~ s/\s+/ /g;
! 			$source_flag = 0;
! 		} elsif (/^\s+(.+)/o) {
! 			# only split on ';' or '.' so that 
! 			# classification that is 2 words will 
! 			# still get matched
! 			# use map to remove trailing/leading spaces
! 			push(@class, map { s/^\s+//; s/\s+$//; $_; } split /[;\.]+/, $1);
! 		} else {
! 			last;
  		}
! 
! 		$_ = undef; # Empty $_ to trigger read of next line
! 	}
! 	$$buffer = $_;
! 
! 	# Don't make a species object if it's empty or "Unknown" or "None"
! 	# return unless $genus and  $genus !~ /^(Unknown|None)$/oi;
! 	# Don't make a species object if it belongs to taxid 32644
! 	my $unkn = grep { $_ =~ /^\Q$common\E$/; } @unkn_names;
! 	return unless ($species || $genus) and $unkn == 0;
  	# Bio::Species array needs array in Species -> Kingdom direction
! 	if ($class[0] eq 'Viruses') {
! 		push( @class, $ns_name );
! 	}
! 	elsif ($genus && $class[$#class] eq $genus) {
! 		push( @class, $species );
! 	} else {
! 		push( @class, $genus, $species );
! 	}
! 	@class = reverse @class;
! 
! 	my $make = Bio::Species->new();
! 	$make->classification( \@class, "FORCE" ); # no name validation please
! 	$make->common_name( $common ) if $common;
! 	unless ($class[-1] eq 'Viruses') {
! 		$make->sub_species( $sub_species ) if $sub_species;
! 	}
! 	$make->organelle($organelle) if $organelle;
! 	return $make;
  }
  



More information about the Bioperl-guts-l mailing list