[Bioperl-guts-l] bioperl-live/Bio/SeqIO gbdriver.pm, 1.2, 1.3 genbank.pm, 1.164, 1.165

Christopher John Fields cjfields at dev.open-bio.org
Thu Jun 7 13:49:23 EDT 2007


Update of /home/repository/bioperl/bioperl-live/Bio/SeqIO
In directory dev.open-bio.org:/tmp/cvs-serv24509

Modified Files:
	gbdriver.pm genbank.pm 
Log Message:
Some more fixes for bug 2305

Index: genbank.pm
===================================================================
RCS file: /home/repository/bioperl/bioperl-live/Bio/SeqIO/genbank.pm,v
retrieving revision 1.164
retrieving revision 1.165
diff -C2 -d -r1.164 -r1.165
*** genbank.pm	5 Jun 2007 23:46:12 -0000	1.164
--- genbank.pm	7 Jun 2007 17:49:21 -0000	1.165
***************
*** 204,207 ****
--- 204,213 ----
      swissprot    GenBank    GenPept    REFSEQ    embl    PDB);
  
+ our %VALID_ALPHABET = (
+     'bp' => 'dna',
+     'aa' => 'protein',
+     'rc' => '' # rc = release candidate; file has no sequences
+ );
+ 
  sub _initialize {
      my($self, at args) = @_;
***************
*** 257,263 ****
  	my @tokens = split(' ', $1);
      
-     # there should be at least six tokens in the LOCUS line; if not we may be in trouble...
-     $self->warn('Missing tokens in the LOCUS line; output may be malformed') if @tokens < 6;
-     
  	# this is important to have the id for display in e.g. FTHelper,
  	# otherwise you won't know which entry caused an error
--- 263,266 ----
***************
*** 265,271 ****
  	$params{'-display_id'} = $display_id;
  	# may still be useful if we don't want the seq
! 	$params{'-length'} = shift(@tokens);
  	# the alphabet of the entry
! 	$params{'-alphabet'} = (lc(shift @tokens) eq 'bp') ? 'dna' : 'protein';
  	# for aa there is usually no 'molecule' (mRNA etc)
  	if (($params{'-alphabet'} eq 'dna') || (@tokens > 2)) {
--- 268,287 ----
  	$params{'-display_id'} = $display_id;
  	# may still be useful if we don't want the seq
!     my $seqlength = shift(@tokens);
!     if (exists $VALID_ALPHABET{$seqlength}) {
!         # moved one token too far.  No locus name?
!         $self->warn("Bad LOCUS name?  Changing [$params{'-display_id'}] to 'unknown' and length to $display_id");
!         $params{'-display_id'} = 'unknown';
!         $params{'-length'} = $display_id;
!         # add token back...
!         unshift @tokens, $seqlength;
!     } else {
!     	$params{'-length'} = $seqlength;
!     }
  	# the alphabet of the entry
!     # shouldn't assign alphabet unless one is specifically designated (such as for rc files)
!     my $alphabet = lc(shift @tokens);
! 	$params{'-alphabet'} = (exists $VALID_ALPHABET{$alphabet}) ? $VALID_ALPHABET{$alphabet} :
!                            $self->warn("Unknown alphabet: $alphabet");
  	# for aa there is usually no 'molecule' (mRNA etc)
  	if (($params{'-alphabet'} eq 'dna') || (@tokens > 2)) {
***************
*** 1073,1077 ****
   Args    :
  
- 
  =cut
  
--- 1089,1092 ----

Index: gbdriver.pm
===================================================================
RCS file: /home/repository/bioperl/bioperl-live/Bio/SeqIO/gbdriver.pm,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** gbdriver.pm	27 Mar 2007 12:56:57 -0000	1.2
--- gbdriver.pm	7 Jun 2007 17:49:21 -0000	1.3
***************
*** 207,211 ****
  
  sub next_seq {
!     my $self = shift;    
      local($/) = "\n";
      my ($ann, $data, $annkey);
--- 207,211 ----
  
  sub next_seq {
!     my $self = shift;
      local($/) = "\n";
      my ($ann, $data, $annkey);
***************
*** 253,261 ****
                  chomp $seqdata->{DATA};
                  # postprocessing for some data
-                 #if (exists $POSTPROCESS_DATA{ $seqdata->{NAME} }) {
-                 #    my $mt = "_process_".lc $seqdata->{NAME};
-                 #    $self->$mt($seqdata);
-                 #}
-                 
                  if ($seqdata->{NAME} eq 'FEATURES') {
                      $self->_process_features($seqdata)
--- 253,256 ----
***************
*** 300,303 ****
--- 295,370 ----
  }
  
+ sub next_chunk {
+     my $self = shift;
+     local($/) = "\n";
+     my ($ann, $data, $annkey);
+     my $endrec = my $seenfeat = 0;
+     my $seqdata;
+     my $seenlocus;
+     my $hobj = $self->seqhandler;
+     PARSER:
+     while (defined(my $line = $self->_readline)) {
+         next if $line =~ m{^\s*$};
+         # have to catch this at the top of the loop, then exit SEQ loop on //
+         # The reason? The regex match for ann/feat keys also matches some lines
+         # in the sequence; no easy way around it since some feature keys may
+         # start with a number as well
+         if ($ann && $ann eq 'ORIGIN') {
+             SEQ:
+             while (defined($line)) {
+                 last SEQ if index($line,'//') == 0;
+                 $seqdata->{DATA} .= uc $line;
+                 $line = $self->_readline;
+             }
+             $seqdata->{DATA} =~ tr{0-9 \n}{}d;
+         }
+         $endrec = 1 if (index($line,'//')==0);
+ 
+         if ($line =~ m{^(\s{0,5})(\w+)\s+(.*)$}ox || $endrec) {
+             ($ann, $data) = ($2, $3);
+             unless ($seenlocus) {
+                 $self->throw("No LOCUS found.  Not GenBank in my book!")
+                     if ($ann ne 'LOCUS');
+                 $seenlocus = 1;
+             }
+             # use the spacer to determine the annotation type
+             my $len = length($1 || '');
+             
+             $annkey  = ($len == 0 || $len > 4)   ? 'DATA'  : $ann;
+             
+             # Push off the previously cached data to the handler
+             # whenever a new primary annotation or seqfeature is found
+             # Note use of $endrec for catching end of record
+             if (($annkey eq 'DATA') && $seqdata) {
+                 chomp $seqdata->{DATA};
+                 # postprocessing for some data
+                 if ($seqdata->{NAME} eq 'FEATURES') {
+                     $self->_process_features($seqdata)
+                 }
+                 # using handler methods in the Handler object, more centralized
+                 $hobj->data_handler($seqdata);
+                 # bail here on //
+                 last PARSER if $endrec;
+                 # reset for next round
+                 $seqdata = undef;
+             }
+ 
+             $seqdata->{NAME} =  ($len == 0) ? $ann :   # primary ann
+                                 ($len > 4 ) ? 'FEATURES': # sf feature key
+                                 $seqdata->{NAME};      # all rest are sec. ann
+             if ($seqdata->{NAME} eq 'FEATURES') {
+                 $seqdata->{FEATURE_KEY} = $ann;
+             }
+             # throw back to top if seq is found to avoid regex
+             next PARSER if $ann eq 'ORIGIN';
+         } else {
+             ($data = $line) =~ s{^\s+}{};
+             chomp $data;
+         }
+         my $delim = ($seqdata && $seqdata->{NAME} eq 'FEATURES') ? "\n" : ' ';
+         $seqdata->{$annkey} .= ($seqdata->{$annkey}) ? $delim.$data : $data;
+     }
+ }
+ 
  =head2 write_seq
  



More information about the Bioperl-guts-l mailing list