[Bioperl-guts-l] bioperl commit
Jason Stajich
jason at pub.open-bio.org
Fri Jan 30 17:12:10 EST 2004
jason
Fri Jan 30 17:12:10 EST 2004
Update of /home/repository/bioperl/bioperl-live/Bio/DB/Taxonomy
In directory pub.open-bio.org:/tmp/cvs-serv8077/Bio/DB/Taxonomy
Modified Files:
flatfile.pm
Log Message:
add API for supporting retrieval of child nodes
bioperl-live/Bio/DB/Taxonomy flatfile.pm,1.3,1.4
===================================================================
RCS file: /home/repository/bioperl/bioperl-live/Bio/DB/Taxonomy/flatfile.pm,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -r1.3 -r1.4
--- /home/repository/bioperl/bioperl-live/Bio/DB/Taxonomy/flatfile.pm 2003/06/02 18:06:13 1.3
+++ /home/repository/bioperl/bioperl-live/Bio/DB/Taxonomy/flatfile.pm 2004/01/30 22:12:10 1.4
@@ -74,7 +74,7 @@
package Bio::DB::Taxonomy::flatfile;
use vars qw(@ISA $DEFAULT_INDEX_DIR $DEFAULT_NODE_INDEX
$DEFAULT_NAME2ID_INDEX $DEFAULT_ID2NAME_INDEX
- $NCBI_TAXONOMY_HOSTNAME
+ $NCBI_TAXONOMY_HOSTNAME $DEFAULT_PARENT_INDEX
$NCBI_TAXONOMY_FILE @DIVISIONS);
use strict;
use Bio::DB::Taxonomy;
@@ -88,9 +88,12 @@
$DEFAULT_NODE_INDEX = 'nodes';
$DEFAULT_NAME2ID_INDEX = 'names2id';
$DEFAULT_ID2NAME_INDEX = 'id2names';
+$DEFAULT_PARENT_INDEX = 'parents';
$NCBI_TAXONOMY_HOSTNAME = 'ftp.ncbi.nih.gov';
$NCBI_TAXONOMY_FILE = '/pub/taxonomy/taxdump.tar.gz';
+$DB_BTREE->{'flags'} = R_DUP; # allow duplicate values in DB_File BTREEs
+
@DIVISIONS = ([qw(BCT Bacteria)],
[qw(INV Invertebrates)],
[qw(MAM Mammals)],
@@ -163,7 +166,7 @@
if( @_ > 1 ) {
($taxonid,$name) = $self->_rearrange([qw(TAXONID
- NAME)], at _);
+ NAME)], at _);
if( $name ) {
($taxonid) = $self->get_taxonid($name);
}
@@ -173,9 +176,12 @@
my $orig_taxonid = $taxonid;
my (@fields,$node,$taxonnode);
my $first = 1;
+ my @classification;
while( defined ($node = $self->{'_nodes'}->[$taxonid]) ) {
+
my ($taxid,$parent,$rank,$code,$divid) = split(SEPARATOR,$node);
my ($taxon_name) = $self->{'_id2name'}->[$taxid];
+
push @fields, $taxon_name if ($rank && $rank ne 'no rank') ;
if( $first ) {
$taxonnode = new Bio::Taxonomy::Node(-dbh => $self,
@@ -186,16 +192,10 @@
-division => $DIVISIONS[$divid]->[0]);
$first = 0;
}
-
last if $parent == 1 || ! $parent || ! $taxid;
$taxonid = $parent;
}
-
- my $speciesnode = new Bio::Species(-ncbi_taxid => $orig_taxonid,
-# -common_name => $item{'CommonName'},
-# -division => $item{'Division'});
- -classification => [@fields],
- );
+ $taxonnode->classification(@fields);
return $taxonnode;
}
@@ -221,6 +221,36 @@
return 0;
}
+
+=head2 get_Children_Taxids
+
+ Title : get_Children_Taxids
+ Usage : my @childrenids = $db->get_Children_Taxids
+ Function: Get the children of a node in the taxonomy
+ Returns : Array of Ids
+ Args : L<Bio::Taxonomy::Node> or a taxon_id
+
+
+=cut
+
+
+sub get_Children_Taxids{
+ my ($self,$node) = @_;
+ my $id;
+ if( ref($node) ) {
+ if( $node->can('object_id') ) {
+ $id = $node->object_id;
+ } elsif( $node->can('ncbi_taxid') ) {
+ $id = $node->ncbi_taxid;
+ } else {
+ $self->warn("Don't know how to extract a taxon id from the object of type ".ref($node)."\n");
+ return undef;
+ }
+ } else { $id = $node }
+ my @vals = $self->{'_parentbtree'}->get_dup($id);
+ return @vals;
+}
+
=head2 Helper methods
=cut
@@ -234,42 +264,52 @@
my $nodeindex = "$dir/$DEFAULT_NODE_INDEX";
my $name2idindex = "$dir/$DEFAULT_NAME2ID_INDEX";
my $id2nameindex = "$dir/$DEFAULT_ID2NAME_INDEX";
+ my $parent2childindex = "$dir/$DEFAULT_PARENT_INDEX";
$self->{'_nodes'} = [];
$self->{'_id2name'} = [];
$self->{'_name2id'} = {};
-
+ $self->{'_parent2children'} = {};
+
if( ! -e $nodeindex || $force ) {
+ my (%parent2children, at nodes);
open(NODES,$nodesfile) ||
$self->throw("Cannot open node file '$nodesfile' for reading");
unlink $nodeindex;
- tie ( @{$self->{'_nodes'}}, 'DB_File', $nodeindex, O_RDWR|O_CREAT,
- 0644, $DB_RECNO) ||
- $self->throw("Cannot open file '$nodeindex': $!");
+ unlink $parent2childindex;
+ my $nh = tie ( @nodes, 'DB_File', $nodeindex, O_RDWR|O_CREAT,
+ 0644, $DB_RECNO) ||
+ $self->throw("Cannot open file '$nodeindex': $!");
+ my $btree = tie( %parent2children,
+ 'DB_File', $parent2childindex,
+ O_RDWR|O_CREAT, 0644, $DB_BTREE);
while(<NODES>) {
chomp;
my ($taxid,$parent,$rank,$code,$divid) = split(/\t\|\t/,$_);
# keep this stringified
- $self->{'_nodes'}->[$taxid] = join(SEPARATOR,
- ($taxid,$parent,$rank,
- $code,$divid));
+ $nodes[$taxid] = join(SEPARATOR,
+ ($taxid,$parent,$rank,
+ $code,$divid));
+ $btree->put($parent,$taxid);
}
close(NODES);
- undef $self->{'_nodes'};
- untie( @{$self->{'_nodes'}} );
+ untie @nodes ;
+ untie %parent2children;
}
- if( ! -e $name2idindex || ! -e $id2nameindex || $force ) {
+ if( (! -e $name2idindex || -z $name2idindex)
+ || (! -e $id2nameindex || -z $id2nameindex)
+ || $force ) {
open(NAMES,$namesfile) ||
$self->throw("Cannot open names file '$namesfile' for reading");
unlink $name2idindex;
unlink $id2nameindex;
-
- tie (@{$self->{'_id2name'}}, 'DB_File', $id2nameindex,
+ my (@id2name,%name2id);
+ my $idh = tie (@id2name, 'DB_File', $id2nameindex,
O_RDWR|O_CREAT, 0644, $DB_RECNO) ||
$self->throw("Cannot open file '$id2nameindex': $!");
- tie ( %{$self->{'_name2id'}}, 'DB_File', $name2idindex, O_RDWR|O_CREAT,
+ my $nameh = tie ( %name2id, 'DB_File', $name2idindex, O_RDWR|O_CREAT,
0644, $DB_HASH) ||
$self->throw("Cannot open file '$name2idindex': $!");
@@ -279,19 +319,17 @@
$class =~ s/\s+\|\s*$//;
$uniquename = $name unless $uniquename;
my $idx = lc($name);
- $self->{'_name2id'}->{$idx} = join(SEPARATOR,
- ($taxid, $name,$uniquename,
- $class));
+ $nameh->put($idx, join(SEPARATOR,
+ ($taxid, $name,$uniquename,
+ $class)));
if( $class && $class eq 'scientific name' ) {
# only store the id2name lookup when it is the "proper" name
- $self->{'_id2name'}->[$taxid] = $uniquename;
+ $id2name[$taxid] = $uniquename;
}
}
close(NAMES);
- undef $self->{'_id2name'};
- undef $self->{'_name2id'};
- untie( %{$self->{'_name2id'}} );
- untie( @{$self->{'_id2name'}} );
+ untie( %name2id);
+ untie( @id2name);
}
}
@@ -316,20 +354,24 @@
my $nodeindex = "$dir/$DEFAULT_NODE_INDEX";
my $name2idindex = "$dir/$DEFAULT_NAME2ID_INDEX";
my $id2nameindex = "$dir/$DEFAULT_ID2NAME_INDEX";
+ my $parent2childindex = "$dir/$DEFAULT_PARENT_INDEX";
+
if( ! -e $nodeindex ||
! -e $name2idindex ||
! -e $id2nameindex ) {
$self->warn("Index files have not been created");
return 0;
}
- tie ( @{$self->{'_nodes'}}, 'DB_File',
- $nodeindex, O_RDONLY,0644, $DB_RECNO)
+ tie ( @{$self->{'_nodes'}}, 'DB_File', $nodeindex, O_RDWR,undef, $DB_RECNO)
|| $self->throw("$! $nodeindex");
- tie (@{$self->{'_id2name'}}, 'DB_File', $id2nameindex,O_RDONLY, 0644,
+ tie (@{$self->{'_id2name'}}, 'DB_File', $id2nameindex,O_RDWR, undef,
$DB_RECNO) || $self->throw("$! $id2nameindex");
- tie ( %{$self->{'_name2id'}}, 'DB_File', $name2idindex, O_RDONLY,0644,
+ tie ( %{$self->{'_name2id'}}, 'DB_File', $name2idindex, O_RDWR,undef,
$DB_HASH) || $self->throw("$! $name2idindex");
+ $self->{'_parentbtree'} = tie( %{$self->{'_parent2children'}},
+ 'DB_File', $parent2childindex,
+ O_RDWR, 0644, $DB_BTREE);
$self->{'_initialized'} = 1;
}
More information about the Bioperl-guts-l
mailing list