[Bioperl-guts-l] bioperl-live/scripts/Bio-DB-GFF genbank2gff3.PLS, 1.5, 1.6

Sheldon Mckay smckay at pub.open-bio.org
Thu Mar 24 08:03:27 EST 2005


Update of /home/repository/bioperl/bioperl-live/scripts/Bio-DB-GFF
In directory pub.open-bio.org:/tmp/cvs-serv20214/scripts/Bio-DB-GFF

Modified Files:
	genbank2gff3.PLS 
Log Message:
The script was silently broken by a change to the default behaviour of
the Bio::FeatureHolderI::get_all_SeqFeatures method.

I also:
1) made some changes to how shared exons are handled in response to
the updated GFF3 spec.
2) dealt with labelling problem in gene containment hierarchies derived from
third-party (non-refseq) annotations.

I think more testing is required.



Index: genbank2gff3.PLS
===================================================================
RCS file: /home/repository/bioperl/bioperl-live/scripts/Bio-DB-GFF/genbank2gff3.PLS,v
retrieving revision 1.5
retrieving revision 1.6
diff -C2 -d -r1.5 -r1.6
*** genbank2gff3.PLS	17 Jan 2005 19:48:57 -0000	1.5
--- genbank2gff3.PLS	24 Mar 2005 13:03:25 -0000	1.6
***************
*** 84,88 ****
--- 84,90 ----
  use strict;
  
+ use lib "$ENV{HOME}/bioperl-live";
  use Pod::Usage;
+ use Bio::Root::RootI;
  use Bio::SeqIO;
  use Bio::SeqFeature::Tools::Unflattener;
***************
*** 93,97 ****
  
  use vars qw/$split @filter $zip $outdir $help $ethresh
!             $file @files $dir $summary $nolump/;
  
  $| = 1;
--- 95,100 ----
  
  use vars qw/$split @filter $zip $outdir $help $ethresh
!             $file @files $dir $summary $nolump
!             $gene_id $rna_id $tnum %method %id %seen/;
  
  $| = 1;
***************
*** 159,164 ****
  $outdir .= '/' unless $outdir =~ m|/$|;
  
- my ($gene_id, $rna_id, $tnum, %method, %id);
- 
  for my $file ( @files ) {
      chomp $file;
--- 162,165 ----
***************
*** 184,192 ****
  
      my $in = Bio::SeqIO->new(-fh => \*FH, -format => 'GenBank');
  
      while ( my $seq = $in->next_seq ) {
  	my $seq_name = $seq->accession;
  	my $end = $seq->length;
!         
          # arrange disposition of GFF output
          $outfile = $lump || $outdir . $seq->accession . ".gff";
--- 185,195 ----
  
      my $in = Bio::SeqIO->new(-fh => \*FH, -format => 'GenBank');
+     my $gffio = Bio::Tools::GFF->new( -noparse => 1, -gff_version => 3 );
  
      while ( my $seq = $in->next_seq ) {
  	my $seq_name = $seq->accession;
  	my $end = $seq->length;
! 	my @to_print;
! 
          # arrange disposition of GFF output
          $outfile = $lump || $outdir . $seq->accession . ".gff";
***************
*** 212,237 ****
          unflatten_seq($seq);
  
- 	my $gffio = Bio::Tools::GFF->new( -noparse => 1, -gff_version => 3 );
- 
          # construct a GFF header
          print $out &gff_header($seq_name, $end);
  
! 	for my $feature ($seq->get_all_SeqFeatures) {
  	    $feature->source_tag('GenBank');
!             my $method = $feature->primary_tag;
! 
  	    # unflattened gene parts will have this tag
!             if ( $feature->has_tag('gene') ) {
!                 my $tempstr = &gene_features($feature,$gffio);
!                 print $out "$tempstr\n" if $tempstr;
!             }
  	    # otherwise handle as generic feats with IDHandler labels 
!             else {
!                 my $tempstr = &generic_features($feature,$gffio,$seq_name);
!                 print $out "$tempstr\n" if $tempstr;
!             }
  	}
  
! 	$gffio->close;
  
  	# deal with the corresponding DNA
--- 215,245 ----
          unflatten_seq($seq);
  
          # construct a GFF header
          print $out &gff_header($seq_name, $end);
  
! 	# Note that we use our own get_all_SeqFeatures function 
!         # to rescue cloned exons
! 	for my $feature ( get_all_SeqFeatures($seq) ) {
! 	    
  	    $feature->source_tag('GenBank');
! 	    my $method = $feature->primary_tag;
! 	    
  	    # unflattened gene parts will have this tag
! 	    if ( $feature->has_tag('gene') ) {
! 		my $unique = gene_features($feature);
! 		push @to_print, $feature if $unique;
! 	    }
! 	    
  	    # otherwise handle as generic feats with IDHandler labels 
! 	    else {
! 		my $gff = generic_features($feature,$gffio,$seq_name);
! 		print $out "$gff\n" if $gff;
! 	    }
  	}
  
!         for my $printme ( @to_print ) {
! 	    my $gff = $gffio->gff_string($printme);
!             print $out "$gff\n";
!         }
  
  	# deal with the corresponding DNA
***************
*** 280,284 ****
  
  sub gene_features {
!     my ($f, $io) = @_;
      local $_ = $f->primary_tag;
      $method{$_}++;
--- 288,292 ----
  
  sub gene_features {
!     my $f = shift;
      local $_ = $f->primary_tag;
      $method{$_}++;
***************
*** 298,302 ****
  	$f->add_tag_value( Parent => $gene_id );
      }
!     elsif ( /exon/ || /CDS/  ) {
  	return 0 unless $rna_id;
  	$f->add_tag_value( Parent => $rna_id );
--- 306,310 ----
  	$f->add_tag_value( Parent => $gene_id );
      }
!     elsif ( /exon/ || /CDS/ ) {
  	return 0 unless $rna_id;
  	$f->add_tag_value( Parent => $rna_id );
***************
*** 307,311 ****
      }
      
!     $io->gff_string($f) ;
  }
  
--- 315,322 ----
      }
      
!     # now we can skip cloned exons
!     return 0 if /exon/ && ++$seen{$f} > 1;
! 
!     return 1;
  }
  
***************
*** 324,328 ****
      }
  
!     $io->gff_string($f) ;
  }
  
--- 335,339 ----
      }
  
!     $io->gff_string($f);
  }
  
***************
*** 361,368 ****
      # map feature types to the sequence ontology
      $tm->map_types_to_SO( -seq => $seq );
!     
      1;
  }
  
  sub filter {
      my $seq = shift;
--- 372,404 ----
      # map feature types to the sequence ontology
      $tm->map_types_to_SO( -seq => $seq );
! 
!     # recursively add gene tags if required
!     # to rescue unlabelled containment hierarchies
!     for my $f ($seq->get_SeqFeatures) {
! 	if ($f->has_tag('gene')) {
! 	    ($gene_id) = $f->get_tag_values('gene'); 
! 	    label_geneparts($f);
! 	}
!     }
! 
      1;
  }
  
+ sub label_geneparts {
+     my $g = shift;
+ 
+     for my $child ( $g->get_SeqFeatures ) {
+ 	add_gene($child) unless $child->has_tag('gene');
+ 	for my $grandchild ( $child->get_SeqFeatures ) {
+ 	    add_gene($grandchild) unless $grandchild->has_tag('gene');
+ 	}
+     }
+ }
+ 
+ sub add_gene {
+     my $f = shift;
+     $f->add_tag_value( gene => $gene_id );
+ }
+ 
  sub filter {
      my $seq = shift;
***************
*** 377,378 ****
--- 413,452 ----
      $seq->add_SeqFeature(@feats) if @feats;
  }
+ 
+ 
+ # The default behaviour of Bio::FeatureHolderI:get_all_SeqFeatures
+ # changed to filter out cloned features.  We have to implement the old
+ # method.  These two subroutines were adapted from the v1.4 Bio::FeatureHolderI
+ sub get_all_SeqFeatures  {
+     my $seq = shift;
+     my @flatarr;
+ 
+     foreach my $feat ( $seq->get_SeqFeatures ){
+         push(@flatarr,$feat);
+         _add_flattened_SeqFeatures(\@flatarr,$feat);
+     }
+     return @flatarr;
+ }
+ 
+ sub _add_flattened_SeqFeatures  {
+     my ($arrayref,$feat) = @_;
+     my @subs = ();
+ 
+     if ($feat->isa("Bio::FeatureHolderI")) {
+ 	@subs = $feat->get_SeqFeatures;
+     } 
+     elsif ($feat->isa("Bio::SeqFeatureI")) {
+ 	@subs = $feat->sub_SeqFeature;
+     }
+     else {
+ 	warn ref($feat)." is neither a FeatureHolderI nor a SeqFeatureI. ".
+ 	    "Don't know how to flatten.";
+     }
+ 
+     for my $sub (@subs) {
+ 	push(@$arrayref,$sub);
+ 	_add_flattened_SeqFeatures($arrayref,$sub);
+     }
+ 
+ }
+ 



More information about the Bioperl-guts-l mailing list