[Bioperl-guts-l] bioperl-live/Bio/DB/SeqFeature/Store berkeleydb.pm, NONE, 1.1 GFF3Loader.pm, 1.11, 1.12 memory.pm, 1.1, 1.2
Lincoln Stein
lstein at dev.open-bio.org
Tue Jun 20 17:26:34 EDT 2006
Update of /home/repository/bioperl/bioperl-live/Bio/DB/SeqFeature/Store
In directory dev.open-bio.org:/tmp/cvs-serv3563/Bio/DB/SeqFeature/Store
Modified Files:
GFF3Loader.pm memory.pm
Added Files:
berkeleydb.pm
Log Message:
implementation of Bio::DB::SeqFeature::Store in BerkeleyDB databases; syntax checked but not tested in any other way
Index: memory.pm
===================================================================
RCS file: /home/repository/bioperl/bioperl-live/Bio/DB/SeqFeature/Store/memory.pm,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -d -r1.1 -r1.2
*** memory.pm 19 Jun 2006 04:21:37 -0000 1.1
--- memory.pm 20 Jun 2006 21:26:32 -0000 1.2
***************
*** 272,276 ****
my @search_terms = ref($attributes->{$att_name}) && ref($attributes->{$att_name}) eq 'ARRAY'
? @{$attributes->{$att_name}} : $attributes->{$att_name};
- my $glob_match;
my @regexp_terms;
my @terms;
--- 272,275 ----
Index: GFF3Loader.pm
===================================================================
RCS file: /home/repository/bioperl/bioperl-live/Bio/DB/SeqFeature/Store/GFF3Loader.pm,v
retrieving revision 1.11
retrieving revision 1.12
diff -C2 -d -r1.11 -r1.12
*** GFF3Loader.pm 19 Jun 2006 04:21:37 -0000 1.11
--- GFF3Loader.pm 20 Jun 2006 21:26:32 -0000 1.12
***************
*** 620,624 ****
else {
! $self->tmp_store->store($f)
}
--- 620,624 ----
else {
! $self->tmp_store->store_noindex($f)
}
--- NEW FILE: berkeleydb.pm ---
package Bio::DB::SeqFeature::Store::berkeleydb;
# $Id: berkeleydb.pm,v 1.1 2006/06/20 21:26:32 lstein Exp $
use strict;
use base 'Bio::DB::SeqFeature::Store';
use Bio::DB::GFF::Util::Rearrange 'rearrange';
use DB_File;
use Fcntl qw(O_RDWR O_CREAT);
use File::Temp 'tempdir';
use File::Path 'rmtree','mkpath';
use constant BINSIZE => 10_000;
# this will eventually be renamed bdb.pm, but right now I don't want to break what's already written.
###
# object initialization
#
sub init {
my $self = shift;
my ($directory,
$autoindex,
$is_temporary,
$write,
$create,
) = rearrange([['DSN','DB'],
[qw(DIR AUTOINDEX)],
['TMP','TEMP','TEMPORARY'],
[qw(WRITE WRITABLE)],
'CREATE',
], at _);
$directory ||= $autoindex;
$directory ||= $is_temporary ? File::Spec->tmpdir : '.';
$directory = tempdir(__PACKAGE__.'_XXXXXX',TMPDIR=>1,CLEANUP=>1,DIR=>$directory) if $is_temporary;
-d $directory or $self->throw("Invalid directory $directory");
$create++ if $is_temporary;
$write ||= $create;
$write && -w $directory or $self->throw("Can't write into the directory $directory");
$self->default_settings;
$self->directory($directory);
$self->temporary($is_temporary);
$self->_autoindex($autoindex) if $autoindex;
$self->_open_databases($write,$create);
return $self;
}
sub can_store_parentage { 1 }
sub _autoindex {
my $self = shift;
my $autoindex_dir = shift;
# do something here
}
sub _open_databases {
my $self = shift;
my ($write,$create) = @_;
my $directory = $self->directory;
unless (-d $directory) { # directory does not exist
$create or $self->throw("Directory $directory does not exist and you did not specify the -create flag");
mkpath($directory) or $self->throw("Couldn't create database directory $directory: $!");
}
my $flags = O_RDONLY;
$flags |= O_RDWR if $write;
$flags |= O_CREAT if $create;
# Create the main database; this is a DB_HASH implementation
my %h;
tie (%h,'DB_File',$self->_features_path,$flags,0666,$DB_HASH)
or $self->throw("Couldn't tie: ".$self->_features_path . $!);
if ($create) {
%h = ();
$h{'.next_id'} = 1;
}
$self->db(\%h);
# Create the index databases; these are DB_BTREE implementations with duplicates allowed.
local $DB_BTREE->{flags} = R_DUP;
$DB_BTREE->{compare} = sub { lc($_[0]) cmp lc($_[1]) };
for my $idx ($self->_index_files) {
my $path = $self->_qualify($idx);
my %db;
tie(%db,'DB_File',$path,$flags,0666,$DB_BTREE)
or $self->throw("Couldn't tie $path: $!");
$self->index_db($idx=>\%db);
}
# Create the parentage database
my %p;
tie (%p,'DB_File',$self->_parentage_path,$flags,0666,$DB_BTREE)
or $self->throw("Couldn't tie: ".$self->_parentage_path . $!);
if ($create) {
%p = ();
}
$self->parentage_db(\%p);
if (-e $self->_fasta_path) {
my $dna_db = Bio::DB::Fasta->new($self->_fasta_path) or $self->throw("Can't reindex sequence file: $@");
$self->dna_db($dna_db);
}
my $mode = $write ? "+>>"
: $create ? "+>"
: "<";
open (F,$mode,$self->_notes_path) or $self->throw($self->_notes_path.": $!");
$self->notes_db(\*F);
}
sub _close_databases {
my $self = shift;
$self->db(undef);
$self->dna_db(undef);
$self->index_db($_=>undef) foreach $self->_index_files;
}
sub _store {
my $self = shift;
my $indexed = shift;
my $db = $self->db;
my $count = 0;
for my $obj (@_) {
my $primary_id = $obj->primary_id;
$self->_delete_indexes($primary_id) if $indexed && $primary_id;
$primary_id = $db->{'.next_id'}++ unless defined $primary_id;
$db->{$primary_id} = $self->freeze($obj);
$obj->primary_id($primary_id);
$self->_update_indexes($obj) if $indexed;
$count++;
}
$count;
}
sub _fetch {
my $self = shift;
my $id = shift;
my $db = $self->db;
my $obj = $self->thaw($db->{$id},$id);
$obj;
}
sub _add_SeqFeature {
my $self = shift;
my $parent = shift;
my @children = @_;
my $parent_id = (ref $parent ? $parent->primary_id : $parent)
or $self->throw("$parent should have a primary_id");
my $p = $self->parentage_db;
for my $child (@children) {
my $child_id = ref $child ? $child->primary_id : $child;
defined $child_id or die "no primary ID known for $child";
$p->{$parent_id}={$child_id};
}
}
sub _fetch_SeqFeatures {
my $self = shift;
my $parent = shift;
my @types = @_;
my $parent_id = $parent->primary_id or $self->throw("$parent should have a primary_id");
my $index = $self->parentage_db;
my $db = tied %$index;
my @children_ids = $db->get_dup($parent_id);
my @children = map {$self->fetch($_)} @children_ids;
if (@types) {
my $regexp = join '|',map {quotemeta($_)} $self->find_types(@types);
return grep {($_->primary_tag.':'.$_->source_tag) =~ /^$regexp$/i} @children;
} else {
return @children;
}
}
sub _update_indexes {
my $self = shift;
my $obj = shift;
defined (my $id = $obj->primary_id) or return;
$self->_update_name_index($obj,$id);
$self->_update_type_index($obj,$id);
$self->_update_location_index($obj,$id);
$self->_update_attribute_index($obj,$id);
$self->_update_note_index($obj,$id);
}
sub _update_name_index {
my $self = shift;
my ($obj,$id,$delete) = @_;
my $db = $self->index_db('names') or $self->throw("Couldn't find 'names' index file");
my ($names,$aliases) = $self->feature_names($obj);
# little stinky - needs minor refactoring
foreach (@$names) {
my $key = lc $_;
$self->update_or_delete($delete,$db,$key,$id);
}
foreach (@$aliases) {
my $key = lc($_)."_2"; # the _2 indicates a secondary (alias) ID
$self->update_or_delete($delete,$db,$key,$id);
}
}
sub _update_type_index {
my $self = shift;
my ($obj,$id,$delete) = @_;
my $db = $self->index_db('types')
or $self->throw("Couldn't find 'types' index file");
my $primary_tag = $obj->primary_tag;
my $source_tag = $obj->source_tag || '';
return unless defined $primary_tag;
$primary_tag .= ":$source_tag";
my $key = lc $primary_tag;
$self->update_or_delete($delete,$db,$key,$id);
}
# Note: this indexing scheme is space-inefficient because it stores the
# denormalized sequence ID followed by the bin in XXXXXX zero-leading
# format. It should be replaced with a binary numeric encoding and the
# BTREE {compare} attribute changed accordingly.
sub _update_location_index {
my $self = shift;
my ($obj,$id,$delete) = @_;
my $db = $self->index_db('locations')
or $self->throw("Couldn't find 'locations' index file");
my $seq_id = $obj->seq_id || '';
my $start = $obj->start || '';
my $end = $obj->end || '';
my $strand = $obj->strand;
my $bin_min = sprintf("%06d",int $start/BINSIZE);
my $bin_max = sprintf("%06d",int $end/BINSIZE);
for (my $bin = $bin_min; $bin <= $bin_max; $bin++ ) {
my $key = "\L$seq_id\E$bin";
$self->update_or_delete($delete,$db,$key,pack("i4",$id,$start,$end,$strand));
}
}
sub _update_attribute_index {
my $self = shift;
my ($obj,$id,$delete) = @_;
my $db = $self->index_db('attributes')
or $self->throw("Couldn't find 'attributes' index file");
for my $tag ($obj->all_tags) {
for my $value ($obj->each_tag_value($tag)) {
my $key = "\L${tag}:${value}\E";
$self->update_or_delete($delete,$db,$key,$id);
}
}
}
sub _update_note_index {
my $self = shift;
my ($obj,$id,$delete) = @_;
return if $delete; # we don't know how to do this
my $fh = $self->notes_db;
my @notes = $obj->get_tag_values('Note') if $obj->has_tag('Note');
print $fh $_,"\t",pack("u*",$id) or $self->throw("An error occurred while updating note index: $!")
foreach @notes;
}
sub update_or_delete {
my $self = shift;
my ($delete,$db,$key,$id) = @_;
if ($delete) {
tied(%$db)->del_dup($key,$id);
} else {
$db->{$key} = $id;
}
}
# these methods return pointers to....
# the database that stores serialized objects
sub db {
my $self = shift;
my $d = $self->setting('db');
$self->setting(db=>shift) if @_;
$d;
}
sub parentage_db {
my $self = shift;
my $d = $self->setting('parentage_db');
$self->setting(parentage_db=>shift) if @_;
$d;
}
# the Bio::DB::Fasta object
sub dna_db {
my $self = shift;
my $d = $self->setting('dna_db');
$self->setting(dna_db=>shift) if @_;
$d;
}
# the specialized notes database
sub notes_db {
my $self = shift;
my $d = $self->setting('notes_db');
$self->setting(notes_db=>shift) if @_;
$d;
}
# The indicated index berkeley db
sub index_db {
my $self = shift;
my $index_name = shift;
my $d = $self->setting($index_name);
$self->setting($index_name=>shift) if @_;
$d;
}
# return names of all the indexes
sub _index_files {
return qw(names types locations attributes);
}
# the directory in which we store our indexes
sub directory {
my $self = shift;
my $d = $self->setting('directory');
$self->setting(directory=>shift) if @_;
$d;
}
# flag indicating that we are a temporary database
sub temporary {
my $self = shift;
my $d = $self->setting('temporary');
$self->setting(temporary=>shift) if @_;
$d;
}
# file name utilities...
sub _qualify {
my $self = shift;
my $file = shift;
return $self->directory .'/' . $file;
}
sub _features_path {
shift->_qualify('features.bdb');
}
sub _parentage_path {
shift->_qualify('parentage.bdb');
}
sub _type_path {
shift->_qualify('types.idx');
}
sub _location_path {
shift->_qualify('locations.idx');
}
sub _attribute_path {
shift->_qualify('attributes.idx');
}
sub _notes_path {
shift->_qualify('notes.idx');
}
sub _fasta_path {
shift->_qualify('sequence.fa');
}
###########################################
# searching
###########################################
sub _features {
my $self = shift;
my ($seq_id,$start,$end,$strand,
$name,$class,$allow_aliases,
$types,
$attributes,
$range_type,
$iterator
) = rearrange([['SEQID','SEQ_ID','REF'],'START',['STOP','END'],'STRAND',
'NAME','CLASS','ALIASES',
['TYPES','TYPE','PRIMARY_TAG'],
['ATTRIBUTES','ATTRIBUTE'],
'RANGE_TYPE',
'ITERATOR',
], at _);
my (@from, at where, at args, at group);
$range_type ||= 'overlaps';
my @result;
unless (defined $name or defined $seq_id or defined $types or defined $attributes) {
@result = keys %{$self->{_index}{ids}};
}
my %found = ();
if (defined($name)) {
# hacky backward compatibility workaround
$name = "$class:$name" if defined $class && length $class > 0;
$self->filter_by_name($name,$allow_aliases,\%found);
}
if (defined $seq_id) {
$self->filter_by_location($seq_id,$start,$end,$strand,$range_type,\%found);
}
if (defined $types) {
$self->filter_by_type($types,\%found);
}
if (defined $attributes) {
$self->filter_by_attribute($attributes,\%found);
}
push @result,keys %found;
return $iterator ? Bio::DB::SeqFeature::Store::berkeleydb::Iterator->new($self,\@result)
: map {$self->fetch($_)} @result;
}
sub filter_by_name {
my $self = shift;
my ($name,$allow_aliases,$filter) = @_;
my $index = $self->index_db('names');
my $db = tied(%$index);
my ($stem,$regexp) = $self->glob_match($name);
$stem ||= $name;
$regexp ||= $name;
$regexp ||= "(?:_2)?" if $allow_aliases;
my $key = $stem;
my $value;
my @results;
for (my $status = $db->seq($key,$value,R_CURSOR);
$status == 0 && $key =~ /^$regexp$/io;
$status = $db->seq($key,$value,R_NEXT)) {
push @results,$value;
}
$self->update_filter($filter,\@results);
}
sub filter_by_type {
my $self = shift;
my ($types,$filter) = @_;
my @types = ref $types eq 'ARRAY' ? @$types : $types;
my $index = $self->index_db('types');
my $db = tied(%$index);
my @results;
for my $type (@types) {
my ($primary_tag,$source_tag);
if (ref $type && $type->isa('Bio::DB::GFF::Typename')) {
$primary_tag = $type->method;
$source_tag = $type->source;
} else {
($primary_tag,$source_tag) = split ':',$type,2;
}
my $match = defined $source_tag ? "^$primary_tag:$source_tag\$" : "^$primary_tag:";
$source_tag ||= '';
my $key = lc "$primary_tag:$source_tag";
my $value;
for (my $status = $db->seq($key,$value,R_CURSOR);
$status == 0 && $key =~ /$match/i;
$status = $db->seq($key,$value,R_NEXT)) {
push @results,$value;
}
}
$self->update_filter($filter,\@results);
}
sub filter_by_location {
my $self = shift;
my ($seq_id,$start,$end,$strand,$range_type,$filter) = @_;
$strand ||= 0;
my $index = $self->index_db('locations');
my $db = tied(%$index);
my $binstart = defined $start ? sprintf("%06d",int $start/BINSIZE) : '';
my $binend = defined $end ? sprintf("%06d",int $end/BINSIZE) : 'z'; # beyond a number
my %seenit;
my @results;
if ($range_type eq 'overlaps' or $range_type eq 'contains') {
my $key = "\L$seq_id\E$binstart";
my $keystop = "\L$seq_id\E$binend";
my $value;
for (my $status = $db->seq($key,$value,R_CURSOR);
$status == 0 && $key le $keystop;
$status = $db->seq($key,$value,R_NEXT)) {
my ($id,$fstart,$fend,$fstrand) = unpack("i4",$value);
next if $seenit{$id}++;
next if $strand && $fstrand != $strand;
if ($range_type eq 'overlaps') {
next unless $fend >= $start && $fstart <= $end;
}
elsif ($range_type eq 'contains') {
next unless $fstart >= $start && $fend <= $end;
}
push @results,$id;
}
}
# for contained in, we look for features originating and terminating outside the specified range
# this is incredibly inefficient, but fortunately the query is rare (?)
elsif ($range_type eq 'contained_in') {
my $key = "\L$seq_id";
my $keystop = "\L$seq_id\E$binstart";
my $value;
# do the left part of the range
for (my $status = $db->seq($key,$value,R_CURSOR);
$status == 0 && $key le $keystop;
$status = $db->seq($key,$value,R_NEXT)) {
my ($id,$fstart,$fend,$fstrand) = unpack("i4",$value);
next if $seenit{$id}++;
next if $strand && $fstrand != $strand;
next unless $fstart <= $start && $fend >= $end;
push @results,$id;
}
# do the right part of the range
$key = "\L$seq_id\E$binend";
for (my $status = $db->seq($key,$value,R_CURSOR);
$status == 0;
$status = $db->seq($key,$value,R_NEXT)) {
my ($id,$fstart,$fend,$fstrand) = unpack("i4",$value);
next if $seenit{$id}++;
next if $strand && $fstrand != $strand;
next unless $fstart <= $start && $fend >= $end;
push @results,$id;
}
}
$self->update_filter($filter,\@results);
}
sub filter_by_attribute {
my $self = shift;
my ($attributes,$filter) = @_;
my $index = $self->index_db('attributes');
my $db = tied(%$index);
for my $att_name (keys %$attributes) {
my @result;
my @search_terms = ref($attributes->{$att_name}) && ref($attributes->{$att_name}) eq 'ARRAY'
? @{$attributes->{$att_name}} : $attributes->{$att_name};
for my $v (@search_terms) {
my ($stem,$regexp) = $self->glob_match($v);
$stem ||= $v;
$regexp ||= $v;
my $key = "\L${att_name}:${stem}\E";
my $value;
for (my $status = $db->seq($key,$value,R_CURSOR);
$status == 0 && $key =~ /^$att_name:$regexp$/i;
$status = $db->seq($key,$value,R_NEXT)) {
push @result,$value;
}
}
$self->update_filter($filter,\@result);
}
}
sub search_notes {
my $self = shift;
my ($search_string,$limit) = @_;
$search_string =~ tr/*?//d;
my @results;
my @words = map {quotemeta($_)} $search_string =~ /(\w+)/g;
my $search = join '|', at words;
my (%found,$found);
my $note_index = $self->notes_db;
seek($note_index,0,0); # back to start
while (<$note_index>) {
next unless /$search/;
chomp;
my ($note,$uu) = split "\t";
$found{unpack("u*",$uu)}++;
last if $limit && ++$found >= $limit;
}
my (@features, @matches);
for my $idx (keys %found) {
my $feature = $self->fetch($idx) or next;
my @values = $feature->get_tag_values('Note') if $feature->has_tag('Note');
my $value = "@values";
my $hits;
$hits++ while $value =~ /($search)/ig; # count the number of times we were hit
push @matches,$hits;
push @features,$feature;
}
for (my $i=0; $i<@matches; $i++) {
my $feature = $features[$i];
my $matches = $matches[$i];
my $relevance = 10 * $matches;
my $note;
$note = join ' ',$feature->get_tag_values('Note') if $feature->has_tag('Note');
push @results,[$feature->display_name,$note,$relevance];
}
return @results;
}
sub glob_match {
my $self = shift;
my $term = shift;
return unless $term =~ /([^*?]*)(?:^|[^\\])?[*?]/;
my $stem = $1;
$term =~ s/(^|[^\\])([+\[\]^{}\$|\(\).])/$1\\$2/g;
$term =~ s/(^|[^\\])\*/$1.*/g;
$term =~ s/(^|[^\\])\?/$1./g;
return ($stem,$term);
}
sub update_filter {
my $self = shift;
my ($filter,$results) = @_;
if (%$filter) {
my @filtered = grep {$filter->{$_}} @$results;
%$filter = map {$_=>1} @filtered;
} else {
%$filter = map {$_=>1} @$results;
}
}
sub DESTROY {
my $self = shift;
my $db = $self->db;
warn "CLEANING UP";
untie %$db;
rmtree($self->directory,0,1) if $self->temporary;
}
package Bio::DB::SeqFeature::Store::berkeleydb::Iterator;
sub new {
my $class = shift;
my $store = shift;
my $ids = shift;
return bless {store => $store,
ids => $ids},ref($class) || $class;
}
sub next_seq {
my $self = shift;
my $store = $self->{store} or return;
my $id = shift @{$self->{ids}};
defined $id or return;
return $store->fetch($id);
}
1;
More information about the Bioperl-guts-l
mailing list