[Bioperl-guts-l] [14506] bioperl-live/trunk/Bio/SeqIO/swiss.pm: Rewrote RX line parsing and writing so that DOI values are read and printed out
Heikki Lehvaslaiho
heikki at dev.open-bio.org
Wed Feb 13 03:08:41 EST 2008
Revision: 14506
Author: heikki
Date: 2008-02-13 03:08:41 -0500 (Wed, 13 Feb 2008)
Log Message:
-----------
Rewrote RX line parsing and writing so that DOI values are read and printed out
Modified Paths:
--------------
bioperl-live/trunk/Bio/SeqIO/swiss.pm
Modified: bioperl-live/trunk/Bio/SeqIO/swiss.pm
===================================================================
--- bioperl-live/trunk/Bio/SeqIO/swiss.pm 2008-02-13 06:51:26 UTC (rev 14505)
+++ bioperl-live/trunk/Bio/SeqIO/swiss.pm 2008-02-13 08:08:41 UTC (rev 14506)
@@ -537,25 +537,29 @@
# j.gilbert and h.lapp agreed that the rp line in swissprot seems
# more like a comment than a parseable value, so print it as is
if ($ref->rp) {
- $self->_write_line_swissprot_regex("RP ","RP ",$ref->rp,
- "\\s\+\|\$",80);
+ $self->_write_line_swissprot_regex("RP ","RP ",$ref->rp,
+ "\\s\+\|\$",80);
}
if ($ref->comment) {
$self->_write_line_swissprot_regex("RC ","RC ",$ref->comment,
"\\s\+\|\$",80);
}
- if ($ref->medline) {
+ if ($ref->medline or $ref->pubmed or $ref->doi) {
+ use Data::Dumper; print Dumper $ref; # Heikki
# new RX format in swissprot LP 09/17/00
- if ($ref->pubmed) {
+ # RX line can now have a DOI, Heikki 13 Feb 2008
+
+ my $line;
+ $line .= "MEDLINE=". $ref->medline. '; ' if $ref->medline;
+ $line .= "PubMed=". $ref->pubmed. '; ' if $ref->pubmed;
+ $line .= "DOI=". $ref->doi. '; ' if $ref->doi;
+ chop $line;
+
$self->_write_line_swissprot_regex("RX ","RX ",
- "MEDLINE=".$ref->medline.
- "; PubMed=".$ref->pubmed.";",
- "\\s\+\|\$",80);
- } else {
- $self->_write_line_swissprot_regex("RX MEDLINE; ","RX MEDLINE; ",
- $ref->medline.".","\\s\+\|\$",80);
+ $line,
+ "\\s\+\|\$",80);
+
}
- }
my $author = $ref->authors .';' if($ref->authors);
my $title = $ref->title .';' if( $ref->title);
my $rg = $ref->rg . ';' if $ref->rg;
@@ -888,60 +892,62 @@
sub _read_swissprot_References{
my ($self,$line) = @_;
- my ($b1, $b2, $rp, $rg, $title, $loc, $au, $med, $com, $pubmed);
+ my ($b1, $b2, $rp, $rg, $title, $loc, $au, $med, $com, $pubmed, $doi);
my @refs;
local $_ = $line;
- while( defined $_ ) {
- if( /^[^R]/ || /^RN/ ) {
- if( $rp ) {
- $rg =~ s/;\s*$//g if defined($rg);
+ while ( defined $_ ) {
+ if ( /^[^R]/ || /^RN/ ) {
+ if ( $rp ) {
+ $rg =~ s/;\s*$//g if defined($rg);
if (defined($au)) {
$au =~ s/;\s*$//;
} else {
$au = $rg;
}
$title =~ s/;\s*$//g if defined($title);
- push @refs, Bio::Annotation::Reference->new
- (-title => $title,
- -start => $b1,
- -end => $b2,
- -authors => $au,
- -location=> $loc,
- -medline => $med,
- -pubmed => $pubmed,
- -comment => $com,
- -rp => $rp,
- -rg => $rg,
- -tagname => 'reference',
- );
+ push @refs, Bio::Annotation::Reference->new
+ (-title => $title,
+ -start => $b1,
+ -end => $b2,
+ -authors => $au,
+ -location=> $loc,
+ -medline => $med,
+ -pubmed => $pubmed,
+ -doi => $doi,
+ -comment => $com,
+ -rp => $rp,
+ -rg => $rg,
+ -tagname => 'reference',
+ );
# reset state for the next reference
- $rp = '';
- }
+ $rp = '';
+ }
if (index($_,'R') != 0) {
$self->_pushback($_); # want this line to go back on the list
- last; # may be the safest exit point HL 05/11/2000
+ last; # may be the safest exit point HL 05/11/2000
}
# don't forget to reset the state for the next reference
- $b1 = $b2 = $rg = $med = $com = $pubmed = undef;
+ $b1 = $b2 = $rg = $med = $com = $pubmed = $doi = undef;
$title = $loc = $au = undef;
} elsif ( /^RP\s{3}(.+? OF (\d+)-(\d+).*)/) {
- $rp .= $1;
- $b1 = $2;
- $b2 = $3;
+ $rp .= $1;
+ $b1 = $2;
+ $b2 = $3;
} elsif ( /^RP\s{3}(.*)/) {
- if($rp) { $rp .= " ".$1 }
- else { $rp = $1 }
- } elsif( /^RX\s{3}MEDLINE;\s+(\d+)(?!<;)/ ) {
- $med = $1;
- } elsif( /^RX\s{3}MEDLINE=(\d+);\s+PubMed=(\d+);/ ) {
- $med = $1;
- $pubmed= $2;
- } elsif( /^RX\s{3}PubMed=(\d+);/ ) { # can start with pubmed only
- $pubmed = $1;
- } elsif( /^RA\s{3}(.*)/ ) {
- $au .= $au ? " $1" : $1;
- } elsif( /^RG\s{3}(.*)/ ) {
- $rg .= $rg ? " $1" : $1;
+ if ($rp) {
+ $rp .= " ".$1;
+ } else {
+ $rp = $1;
+ }
+ } elsif (/^RX\s{3}(.*)/) { # each reference can have only one RX line
+ my $line = $1;
+ $med = $1 if $line =~ /MEDLINE=(\d+);/;
+ $pubmed = $1 if $line =~ /PubMed=(\d+);/;
+ $doi = $1 if $line =~ /DOI=([^;]+);/;
+ } elsif ( /^RA\s{3}(.*)/ ) {
+ $au .= $au ? " $1" : $1;
+ } elsif ( /^RG\s{3}(.*)/ ) {
+ $rg .= $rg ? " $1" : $1;
} elsif ( /^RT\s{3}(.*)/ ) {
if ($title) {
my $tline = $1;
@@ -950,12 +956,10 @@
$title = $1;
}
} elsif (/^RL\s{3}(.*)/ ) {
- $loc .= $loc ? " $1" : $1;
+ $loc .= $loc ? " $1" : $1;
} elsif ( /^RC\s{3}(.*)/ ) {
- $com .= $com ? " $1" : $1;
- }
- #/^CC/ && last;
- #/^SQ/ && last; # there may be sequences without CC lines! HL 05/11/2000
+ $com .= $com ? " $1" : $1;
+ }
$_ = $self->_readline;
}
return \@refs;
More information about the Bioperl-guts-l mailing list