[Bioperl-guts-l] bioperl-live/Bio/SeqIO swiss.pm, 1.92, 1.93 embl.pm, 1.101, 1.102 genbank.pm, 1.144, 1.145
Senduran Balasubramaniam
sendu at dev.open-bio.org
Tue Sep 5 09:43:23 EDT 2006
Update of /home/repository/bioperl/bioperl-live/Bio/SeqIO
In directory dev.open-bio.org:/tmp/cvs-serv6996/Bio/SeqIO
Modified Files:
swiss.pm embl.pm genbank.pm
Log Message:
improved genus capturing, new-style organism parsing for swiss.pm
Index: swiss.pm
===================================================================
RCS file: /home/repository/bioperl/bioperl-live/Bio/SeqIO/swiss.pm,v
retrieving revision 1.92
retrieving revision 1.93
diff -C2 -d -r1.92 -r1.93
*** swiss.pm 4 Jul 2006 22:23:23 -0000 1.92
--- swiss.pm 5 Sep 2006 13:43:21 -0000 1.93
***************
*** 480,509 ****
# Organism lines
if ($seq->can('species') && (my $spec = $seq->species)) {
! my($species, @class) = $spec->classification();
! my $genus = $class[0];
! my $OS = "$genus $species";
! if ($class[-1] eq 'Viruses') {
! $OS = $species;
! $OS .= " ". $spec->sub_species if $spec->sub_species;
! } else {
! if ($class[$#class] =~ /viruses/i) {
! # different OS / OC syntax for viruses LP 09/16/2000
! shift @class;
! }
! if (my $ssp = $spec->sub_species) {
! $OS .= " $ssp";
! }
! foreach (($spec->variant, $spec->common_name)) {
! $OS .= " ($_)" if $_;
! }
! }
$self->_print( "OS $OS.\n");
my $OC = join('; ', reverse(@class)) .'.';
$self->_write_line_swissprot_regex("OC ","OC ",$OC,"\; \|\$",80);
if ($spec->organelle) {
! $self->_write_line_swissprot_regex("OG ","OG ",$spec->organelle,"\; \|\$",80);
}
if ($spec->ncbi_taxid) {
! $self->_print("OX NCBI_TaxID=".$spec->ncbi_taxid.";\n");
}
}
--- 480,503 ----
# Organism lines
if ($seq->can('species') && (my $spec = $seq->species)) {
! my @class = $spec->classification();
! shift(@class);
! my $species = $spec->species;
! my $genus = $spec->genus;
! my $OS = $spec->scientific_name;
! if ($class[-1] =~ /viruses/i) {
! $OS = $species;
! $OS .= " ". $spec->sub_species if $spec->sub_species;
! }
! foreach (($spec->variant, $spec->common_name)) {
! $OS .= " ($_)" if $_;
! }
$self->_print( "OS $OS.\n");
my $OC = join('; ', reverse(@class)) .'.';
$self->_write_line_swissprot_regex("OC ","OC ",$OC,"\; \|\$",80);
if ($spec->organelle) {
! $self->_write_line_swissprot_regex("OG ","OG ",$spec->organelle,"\; \|\$",80);
}
if ($spec->ncbi_taxid) {
! $self->_print("OX NCBI_TaxID=".$spec->ncbi_taxid.";\n");
}
}
***************
*** 948,1057 ****
local $_ = $line;
! my( $subspecies, $species, $genus, $common, $variant, $ncbi_taxid, $ns_name );
! my (@class, at spflds);
! my ($binomial, $descr);
my $osline = "";
-
while ( defined $_ ) {
! last unless /^O[SCGX]/;
! # believe it or not, but OS may come multiple times -- at this time
! # we can't capture multiple species
! if(/^OS\s+(\S.+)/ && (! defined($binomial))) {
! $osline .= " " if $osline;
! $osline .= $1;
! if($osline =~ s/(,|, and|\.)$//) {
! ($binomial, $descr) = $osline =~ /(\S[^\(]+)(.*)/;
! ($ns_name) = $binomial;
! $ns_name =~ s/\s+$//; #####
!
! #binomial could contain more than a three words.
! @spflds = split(' ',$binomial);
!
! #if first term a conventional uppercase genus?
! unless ( (grep { /^\Q$spflds[0]/i } @Unknown_genus) ||
! ($spflds[0] =~ m/^[^A-Z]/) ) {
! $genus = shift @spflds;
! } else { undef $genus; }
!
! if (@spflds) {
! while (my $fld = shift @spflds) {
! $species.="$fld ";
! last if ($fld =~ m/(sp\.|var\.)/);
! }
! chop $species; #last space
! $subspecies = join ' ', at spflds if(@spflds);
! }
! else { $species = 'sp.'; }
! while($descr =~ /\(([^\)]+)\)/g) {
! my $item = $1;
! # strain etc may not necessarily come first (yes, swissprot
! # is messy)
! if((! defined($variant)) &&
! (($item =~ /(^|[^\(\w])([Ss]train|isolate|serogroup|serotype|subtype|clone)\b/) ||
! ($item =~ /^(biovar|pv\.|type\s+)/))) {
! $variant = $item;
! } elsif($item =~ s/^subsp\.\s+//) {
! if(! $subspecies) {
! $subspecies = $item;
! } elsif(! $variant) {
! $variant = $item;
! }
! } elsif(! defined($common)) {
! # we're only interested in the first common name
! $common = $item;
! if((index($common, '(') >= 0) &&
! (index($common, ')') < 0)) {
! $common .= ')';
! }
! }
! }
! }
! } elsif (s/^OC\s+//) {
! push(@class, split /[\;\.]\s*/);
! if($class[0] =~ /viruses/i) {
! # viruses have different OS/OC syntax
! my @virusnames = split(/\s+/, $binomial);
! $species = (@virusnames > 1) ? pop(@virusnames) : '';
! $genus = join(" ", @virusnames);
! $subspecies = $descr;
! }
}
! elsif (/^OG\s+(.*)/) {
! $org = $1;
! }
! elsif (/^OX\s+(.*)/ && (! defined($ncbi_taxid))) {
my $taxstring = $1;
! # we only keep the first one and ignore all others
if ($taxstring =~ /NCBI_TaxID=([\w\d]+)/) {
$ncbi_taxid = $1;
! } else {
$self->throw("$taxstring doesn't look like NCBI_TaxID");
}
! }
! $_ = $self->_readline;
}
$self->_pushback($_); # pushback the last line because we need it
#if the organism belongs to taxid 32644 then no Bio::Species object.
! return if grep { /^\Q$binomial$/ } @Unknown_names;
! if (@class) {
! if ($class[0] eq 'Viruses') {
! push( @class, $ns_name );
! } elsif (defined($genus) && ($class[-1] eq $genus)) {
! push( @class, $species );
! } else {
! push( @class, $genus, $species );
}
}
@class = reverse @class;
my $taxon = Bio::Species->new();
! $taxon->classification( \@class, "FORCE" ); # no name validation please
! $taxon->common_name( $common ) if $common;
! $taxon->sub_species( $subspecies ) if $subspecies;
! $taxon->organelle ( $org ) if $org;
! $taxon->ncbi_taxid ( $ncbi_taxid ) if $ncbi_taxid;
! $taxon->variant($variant) if $variant;
!
# done
return $taxon;
--- 942,1056 ----
local $_ = $line;
! my( $sub_species, $species, $genus, $common, $variant, $ncbi_taxid, $sci_name, $class_lines, $descr );
my $osline = "";
while ( defined $_ ) {
! last unless /^O[SCGX]/;
! # believe it or not, but OS may come multiple times -- at this time
! # we can't capture multiple species
! if(/^OS\s+(\S.+)/ && (! defined($sci_name))) {
! $osline .= " " if $osline;
! $osline .= $1;
! if($osline =~ s/(,|, and|\.)$//) {
! ($sci_name, $descr) = $osline =~ /(\S[^\(]+)(.*)/;
! $sci_name =~ s/\s+$//;
!
! while($descr =~ /\(([^\)]+)\)/g) {
! my $item = $1;
! # strain etc may not necessarily come first (yes, swissprot
! # is messy)
! if((! defined($variant)) &&
! (($item =~ /(^|[^\(\w])([Ss]train|isolate|serogroup|serotype|subtype|clone)\b/) ||
! ($item =~ /^(biovar|pv\.|type\s+)/))) {
! $variant = $item;
! }
! elsif($item =~ s/^subsp\.\s+//) {
! if(! $sub_species) {
! $sub_species = $item;
! }
! elsif(! $variant) {
! $variant = $item;
! }
! }
! elsif(! defined($common)) {
! # we're only interested in the first common name
! $common = $item;
! if((index($common, '(') >= 0) &&
! (index($common, ')') < 0)) {
! $common .= ')';
! }
! }
! }
! }
}
! elsif (s/^OC\s+(\S.+)$//) {
! $class_lines .= $1;
! }
! elsif (/^OG\s+(.*)/) {
! $org = $1;
! }
! elsif (/^OX\s+(.*)/ && (! defined($ncbi_taxid))) {
my $taxstring = $1;
! # we only keep the first one and ignore all others
if ($taxstring =~ /NCBI_TaxID=([\w\d]+)/) {
$ncbi_taxid = $1;
! }
! else {
$self->throw("$taxstring doesn't look like NCBI_TaxID");
}
! }
! $_ = $self->_readline;
}
$self->_pushback($_); # pushback the last line because we need it
#if the organism belongs to taxid 32644 then no Bio::Species object.
! return if grep { /^\Q$sci_name$/ } @Unknown_names;
!
! # Convert data in classification lines into classification array.
! # only split on ';' or '.' so that classification that is 2 or more words
! # will still get matched, use map() to remove trailing/leading/intervening
! # spaces
! my @class = map { s/^\s+//; s/\s+$//; s/\s{2,}/ /g; $_; } split /[;\.]+/, $class_lines;
!
! if ($class[0] =~ /viruses/i) {
! # viruses have different OS/OC syntax
! my @virusnames = split(/\s+/, $sci_name);
! $species = (@virusnames > 1) ? pop(@virusnames) : '';
! $genus = join(" ", @virusnames);
! $sub_species = $descr;
! }
! else {
! # do we have a genus?
! my $possible_genus = $class[-1];
! $possible_genus .= "|$class[-2]" if $class[-2];
! if ($sci_name =~ /^($possible_genus)/) {
! $genus = $1;
! ($species) = $sci_name =~ /^$genus\s+(.+)/;
! }
! else {
! $species = $sci_name;
! }
!
! # is this organism of rank species or is it lower?
! # (doesn't catch everything, but at least the guess isn't dangerous)
! if ($species =~ /subsp\.|var\./) {
! ($species, $sub_species) = $species =~ /(.+)\s+((?:subsp\.|var\.).+)/;
}
}
+
+ # Bio::Species array needs array in Species -> Kingdom direction
+ unless ($class[-1] eq $sci_name) {
+ push(@class, $sci_name);
+ }
@class = reverse @class;
my $taxon = Bio::Species->new();
! $taxon->scientific_name($sci_name);
! $taxon->classification(@class);
! $taxon->common_name($common) if $common;
! $taxon->sub_species($sub_species) if $sub_species;
! $taxon->organelle($org) if $org;
! $taxon->ncbi_taxid($ncbi_taxid) if $ncbi_taxid;
! $taxon->variant($variant) if $variant;
!
# done
return $taxon;
Index: embl.pm
===================================================================
RCS file: /home/repository/bioperl/bioperl-live/Bio/SeqIO/embl.pm,v
retrieving revision 1.101
retrieving revision 1.102
diff -C2 -d -r1.101 -r1.102
*** embl.pm 2 Sep 2006 13:00:18 -0000 1.101
--- embl.pm 5 Sep 2006 13:43:21 -0000 1.102
***************
*** 1046,1051 ****
# do we have a genus?
my $possible_genus = $class[-1];
! if ($sci_name =~ /^$possible_genus/) {
! $genus = $possible_genus;
($species) = $sci_name =~ /^$genus\s+(.+)/;
}
--- 1046,1052 ----
# do we have a genus?
my $possible_genus = $class[-1];
! $possible_genus .= "|$class[-2]" if $class[-2];
! if ($sci_name =~ /^($possible_genus)/) {
! $genus = $1;
($species) = $sci_name =~ /^$genus\s+(.+)/;
}
Index: genbank.pm
===================================================================
RCS file: /home/repository/bioperl/bioperl-live/Bio/SeqIO/genbank.pm,v
retrieving revision 1.144
retrieving revision 1.145
diff -C2 -d -r1.144 -r1.145
*** genbank.pm 29 Aug 2006 18:20:12 -0000 1.144
--- genbank.pm 5 Sep 2006 13:43:21 -0000 1.145
***************
*** 1306,1311 ****
# do we have a genus?
my $possible_genus = $class[-1];
! if ($sci_name =~ /^$possible_genus/) {
! $genus = $possible_genus;
($species) = $sci_name =~ /^$genus\s+(.+)/;
}
--- 1306,1312 ----
# do we have a genus?
my $possible_genus = $class[-1];
! $possible_genus .= "|$class[-2]" if $class[-2];
! if ($sci_name =~ /^($possible_genus)/) {
! $genus = $1;
($species) = $sci_name =~ /^$genus\s+(.+)/;
}
More information about the Bioperl-guts-l
mailing list