[Bioperl-guts-l] bioperl-live/scripts/Bio-DB-GFF genbank2gff3.PLS,
1.5, 1.6
Sheldon Mckay
smckay at pub.open-bio.org
Thu Mar 24 08:03:27 EST 2005
Update of /home/repository/bioperl/bioperl-live/scripts/Bio-DB-GFF
In directory pub.open-bio.org:/tmp/cvs-serv20214/scripts/Bio-DB-GFF
Modified Files:
genbank2gff3.PLS
Log Message:
The script was silently broken by a change to the default behaviour of
the Bio::FeatureHolderI::get_all_SeqFeatures method.
I also:
1) made some changes to how shared exons are handled in response to
the updated GFF3 spec.
2) dealt with labelling problem in gene containment hierarchies derived from
third-party (non-refseq) annotations.
I think more testing is required.
Index: genbank2gff3.PLS
===================================================================
RCS file: /home/repository/bioperl/bioperl-live/scripts/Bio-DB-GFF/genbank2gff3.PLS,v
retrieving revision 1.5
retrieving revision 1.6
diff -C2 -d -r1.5 -r1.6
*** genbank2gff3.PLS 17 Jan 2005 19:48:57 -0000 1.5
--- genbank2gff3.PLS 24 Mar 2005 13:03:25 -0000 1.6
***************
*** 84,88 ****
--- 84,90 ----
use strict;
+ use lib "$ENV{HOME}/bioperl-live";
use Pod::Usage;
+ use Bio::Root::RootI;
use Bio::SeqIO;
use Bio::SeqFeature::Tools::Unflattener;
***************
*** 93,97 ****
use vars qw/$split @filter $zip $outdir $help $ethresh
! $file @files $dir $summary $nolump/;
$| = 1;
--- 95,100 ----
use vars qw/$split @filter $zip $outdir $help $ethresh
! $file @files $dir $summary $nolump
! $gene_id $rna_id $tnum %method %id %seen/;
$| = 1;
***************
*** 159,164 ****
$outdir .= '/' unless $outdir =~ m|/$|;
- my ($gene_id, $rna_id, $tnum, %method, %id);
-
for my $file ( @files ) {
chomp $file;
--- 162,165 ----
***************
*** 184,192 ****
my $in = Bio::SeqIO->new(-fh => \*FH, -format => 'GenBank');
while ( my $seq = $in->next_seq ) {
my $seq_name = $seq->accession;
my $end = $seq->length;
!
# arrange disposition of GFF output
$outfile = $lump || $outdir . $seq->accession . ".gff";
--- 185,195 ----
my $in = Bio::SeqIO->new(-fh => \*FH, -format => 'GenBank');
+ my $gffio = Bio::Tools::GFF->new( -noparse => 1, -gff_version => 3 );
while ( my $seq = $in->next_seq ) {
my $seq_name = $seq->accession;
my $end = $seq->length;
! my @to_print;
!
# arrange disposition of GFF output
$outfile = $lump || $outdir . $seq->accession . ".gff";
***************
*** 212,237 ****
unflatten_seq($seq);
- my $gffio = Bio::Tools::GFF->new( -noparse => 1, -gff_version => 3 );
-
# construct a GFF header
print $out &gff_header($seq_name, $end);
! for my $feature ($seq->get_all_SeqFeatures) {
$feature->source_tag('GenBank');
! my $method = $feature->primary_tag;
!
# unflattened gene parts will have this tag
! if ( $feature->has_tag('gene') ) {
! my $tempstr = &gene_features($feature,$gffio);
! print $out "$tempstr\n" if $tempstr;
! }
# otherwise handle as generic feats with IDHandler labels
! else {
! my $tempstr = &generic_features($feature,$gffio,$seq_name);
! print $out "$tempstr\n" if $tempstr;
! }
}
! $gffio->close;
# deal with the corresponding DNA
--- 215,245 ----
unflatten_seq($seq);
# construct a GFF header
print $out &gff_header($seq_name, $end);
! # Note that we use our own get_all_SeqFeatures function
! # to rescue cloned exons
! for my $feature ( get_all_SeqFeatures($seq) ) {
!
$feature->source_tag('GenBank');
! my $method = $feature->primary_tag;
!
# unflattened gene parts will have this tag
! if ( $feature->has_tag('gene') ) {
! my $unique = gene_features($feature);
! push @to_print, $feature if $unique;
! }
!
# otherwise handle as generic feats with IDHandler labels
! else {
! my $gff = generic_features($feature,$gffio,$seq_name);
! print $out "$gff\n" if $gff;
! }
}
! for my $printme ( @to_print ) {
! my $gff = $gffio->gff_string($printme);
! print $out "$gff\n";
! }
# deal with the corresponding DNA
***************
*** 280,284 ****
sub gene_features {
! my ($f, $io) = @_;
local $_ = $f->primary_tag;
$method{$_}++;
--- 288,292 ----
sub gene_features {
! my $f = shift;
local $_ = $f->primary_tag;
$method{$_}++;
***************
*** 298,302 ****
$f->add_tag_value( Parent => $gene_id );
}
! elsif ( /exon/ || /CDS/ ) {
return 0 unless $rna_id;
$f->add_tag_value( Parent => $rna_id );
--- 306,310 ----
$f->add_tag_value( Parent => $gene_id );
}
! elsif ( /exon/ || /CDS/ ) {
return 0 unless $rna_id;
$f->add_tag_value( Parent => $rna_id );
***************
*** 307,311 ****
}
! $io->gff_string($f) ;
}
--- 315,322 ----
}
! # now we can skip cloned exons
! return 0 if /exon/ && ++$seen{$f} > 1;
!
! return 1;
}
***************
*** 324,328 ****
}
! $io->gff_string($f) ;
}
--- 335,339 ----
}
! $io->gff_string($f);
}
***************
*** 361,368 ****
# map feature types to the sequence ontology
$tm->map_types_to_SO( -seq => $seq );
!
1;
}
sub filter {
my $seq = shift;
--- 372,404 ----
# map feature types to the sequence ontology
$tm->map_types_to_SO( -seq => $seq );
!
! # recursively add gene tags if required
! # to rescue unlabelled containment hierarchies
! for my $f ($seq->get_SeqFeatures) {
! if ($f->has_tag('gene')) {
! ($gene_id) = $f->get_tag_values('gene');
! label_geneparts($f);
! }
! }
!
1;
}
+ sub label_geneparts {
+ my $g = shift;
+
+ for my $child ( $g->get_SeqFeatures ) {
+ add_gene($child) unless $child->has_tag('gene');
+ for my $grandchild ( $child->get_SeqFeatures ) {
+ add_gene($grandchild) unless $grandchild->has_tag('gene');
+ }
+ }
+ }
+
+ sub add_gene {
+ my $f = shift;
+ $f->add_tag_value( gene => $gene_id );
+ }
+
sub filter {
my $seq = shift;
***************
*** 377,378 ****
--- 413,452 ----
$seq->add_SeqFeature(@feats) if @feats;
}
+
+
+ # The default behaviour of Bio::FeatureHolderI:get_all_SeqFeatures
+ # changed to filter out cloned features. We have to implement the old
+ # method. These two subroutines were adapted from the v1.4 Bio::FeatureHolderI
+ sub get_all_SeqFeatures {
+ my $seq = shift;
+ my @flatarr;
+
+ foreach my $feat ( $seq->get_SeqFeatures ){
+ push(@flatarr,$feat);
+ _add_flattened_SeqFeatures(\@flatarr,$feat);
+ }
+ return @flatarr;
+ }
+
+ sub _add_flattened_SeqFeatures {
+ my ($arrayref,$feat) = @_;
+ my @subs = ();
+
+ if ($feat->isa("Bio::FeatureHolderI")) {
+ @subs = $feat->get_SeqFeatures;
+ }
+ elsif ($feat->isa("Bio::SeqFeatureI")) {
+ @subs = $feat->sub_SeqFeature;
+ }
+ else {
+ warn ref($feat)." is neither a FeatureHolderI nor a SeqFeatureI. ".
+ "Don't know how to flatten.";
+ }
+
+ for my $sub (@subs) {
+ push(@$arrayref,$sub);
+ _add_flattened_SeqFeatures($arrayref,$sub);
+ }
+
+ }
+
More information about the Bioperl-guts-l
mailing list