[Bioperl-guts-l] bioperl-live/Bio/SeqIO gbdriver.pm, 1.2, 1.3 genbank.pm, 1.164, 1.165
Christopher John Fields
cjfields at dev.open-bio.org
Thu Jun 7 13:49:23 EDT 2007
Update of /home/repository/bioperl/bioperl-live/Bio/SeqIO
In directory dev.open-bio.org:/tmp/cvs-serv24509
Modified Files:
gbdriver.pm genbank.pm
Log Message:
Some more fixes for bug 2305
Index: genbank.pm
===================================================================
RCS file: /home/repository/bioperl/bioperl-live/Bio/SeqIO/genbank.pm,v
retrieving revision 1.164
retrieving revision 1.165
diff -C2 -d -r1.164 -r1.165
*** genbank.pm 5 Jun 2007 23:46:12 -0000 1.164
--- genbank.pm 7 Jun 2007 17:49:21 -0000 1.165
***************
*** 204,207 ****
--- 204,213 ----
swissprot GenBank GenPept REFSEQ embl PDB);
+ our %VALID_ALPHABET = (
+ 'bp' => 'dna',
+ 'aa' => 'protein',
+ 'rc' => '' # rc = release candidate; file has no sequences
+ );
+
sub _initialize {
my($self, at args) = @_;
***************
*** 257,263 ****
my @tokens = split(' ', $1);
- # there should be at least six tokens in the LOCUS line; if not we may be in trouble...
- $self->warn('Missing tokens in the LOCUS line; output may be malformed') if @tokens < 6;
-
# this is important to have the id for display in e.g. FTHelper,
# otherwise you won't know which entry caused an error
--- 263,266 ----
***************
*** 265,271 ****
$params{'-display_id'} = $display_id;
# may still be useful if we don't want the seq
! $params{'-length'} = shift(@tokens);
# the alphabet of the entry
! $params{'-alphabet'} = (lc(shift @tokens) eq 'bp') ? 'dna' : 'protein';
# for aa there is usually no 'molecule' (mRNA etc)
if (($params{'-alphabet'} eq 'dna') || (@tokens > 2)) {
--- 268,287 ----
$params{'-display_id'} = $display_id;
# may still be useful if we don't want the seq
! my $seqlength = shift(@tokens);
! if (exists $VALID_ALPHABET{$seqlength}) {
! # moved one token too far. No locus name?
! $self->warn("Bad LOCUS name? Changing [$params{'-display_id'}] to 'unknown' and length to $display_id");
! $params{'-display_id'} = 'unknown';
! $params{'-length'} = $display_id;
! # add token back...
! unshift @tokens, $seqlength;
! } else {
! $params{'-length'} = $seqlength;
! }
# the alphabet of the entry
! # shouldn't assign alphabet unless one is specifically designated (such as for rc files)
! my $alphabet = lc(shift @tokens);
! $params{'-alphabet'} = (exists $VALID_ALPHABET{$alphabet}) ? $VALID_ALPHABET{$alphabet} :
! $self->warn("Unknown alphabet: $alphabet");
# for aa there is usually no 'molecule' (mRNA etc)
if (($params{'-alphabet'} eq 'dna') || (@tokens > 2)) {
***************
*** 1073,1077 ****
Args :
-
=cut
--- 1089,1092 ----
Index: gbdriver.pm
===================================================================
RCS file: /home/repository/bioperl/bioperl-live/Bio/SeqIO/gbdriver.pm,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** gbdriver.pm 27 Mar 2007 12:56:57 -0000 1.2
--- gbdriver.pm 7 Jun 2007 17:49:21 -0000 1.3
***************
*** 207,211 ****
sub next_seq {
! my $self = shift;
local($/) = "\n";
my ($ann, $data, $annkey);
--- 207,211 ----
sub next_seq {
! my $self = shift;
local($/) = "\n";
my ($ann, $data, $annkey);
***************
*** 253,261 ****
chomp $seqdata->{DATA};
# postprocessing for some data
- #if (exists $POSTPROCESS_DATA{ $seqdata->{NAME} }) {
- # my $mt = "_process_".lc $seqdata->{NAME};
- # $self->$mt($seqdata);
- #}
-
if ($seqdata->{NAME} eq 'FEATURES') {
$self->_process_features($seqdata)
--- 253,256 ----
***************
*** 300,303 ****
--- 295,370 ----
}
+ sub next_chunk {
+ my $self = shift;
+ local($/) = "\n";
+ my ($ann, $data, $annkey);
+ my $endrec = my $seenfeat = 0;
+ my $seqdata;
+ my $seenlocus;
+ my $hobj = $self->seqhandler;
+ PARSER:
+ while (defined(my $line = $self->_readline)) {
+ next if $line =~ m{^\s*$};
+ # have to catch this at the top of the loop, then exit SEQ loop on //
+ # The reason? The regex match for ann/feat keys also matches some lines
+ # in the sequence; no easy way around it since some feature keys may
+ # start with a number as well
+ if ($ann && $ann eq 'ORIGIN') {
+ SEQ:
+ while (defined($line)) {
+ last SEQ if index($line,'//') == 0;
+ $seqdata->{DATA} .= uc $line;
+ $line = $self->_readline;
+ }
+ $seqdata->{DATA} =~ tr{0-9 \n}{}d;
+ }
+ $endrec = 1 if (index($line,'//')==0);
+
+ if ($line =~ m{^(\s{0,5})(\w+)\s+(.*)$}ox || $endrec) {
+ ($ann, $data) = ($2, $3);
+ unless ($seenlocus) {
+ $self->throw("No LOCUS found. Not GenBank in my book!")
+ if ($ann ne 'LOCUS');
+ $seenlocus = 1;
+ }
+ # use the spacer to determine the annotation type
+ my $len = length($1 || '');
+
+ $annkey = ($len == 0 || $len > 4) ? 'DATA' : $ann;
+
+ # Push off the previously cached data to the handler
+ # whenever a new primary annotation or seqfeature is found
+ # Note use of $endrec for catching end of record
+ if (($annkey eq 'DATA') && $seqdata) {
+ chomp $seqdata->{DATA};
+ # postprocessing for some data
+ if ($seqdata->{NAME} eq 'FEATURES') {
+ $self->_process_features($seqdata)
+ }
+ # using handler methods in the Handler object, more centralized
+ $hobj->data_handler($seqdata);
+ # bail here on //
+ last PARSER if $endrec;
+ # reset for next round
+ $seqdata = undef;
+ }
+
+ $seqdata->{NAME} = ($len == 0) ? $ann : # primary ann
+ ($len > 4 ) ? 'FEATURES': # sf feature key
+ $seqdata->{NAME}; # all rest are sec. ann
+ if ($seqdata->{NAME} eq 'FEATURES') {
+ $seqdata->{FEATURE_KEY} = $ann;
+ }
+ # throw back to top if seq is found to avoid regex
+ next PARSER if $ann eq 'ORIGIN';
+ } else {
+ ($data = $line) =~ s{^\s+}{};
+ chomp $data;
+ }
+ my $delim = ($seqdata && $seqdata->{NAME} eq 'FEATURES') ? "\n" : ' ';
+ $seqdata->{$annkey} .= ($seqdata->{$annkey}) ? $delim.$data : $data;
+ }
+ }
+
=head2 write_seq
More information about the Bioperl-guts-l
mailing list