[Bioperl-guts-l] bioperl-live/Bio/DB/SeqFeature/Store berkeleydb.pm, NONE, 1.1 GFF3Loader.pm, 1.11, 1.12 memory.pm, 1.1, 1.2

Lincoln Stein lstein at dev.open-bio.org
Tue Jun 20 17:26:34 EDT 2006


Update of /home/repository/bioperl/bioperl-live/Bio/DB/SeqFeature/Store
In directory dev.open-bio.org:/tmp/cvs-serv3563/Bio/DB/SeqFeature/Store

Modified Files:
	GFF3Loader.pm memory.pm 
Added Files:
	berkeleydb.pm 
Log Message:
implementation of Bio::DB::SeqFeature::Store in BerkeleyDB databases; syntax checked but not tested in any other way

Index: memory.pm
===================================================================
RCS file: /home/repository/bioperl/bioperl-live/Bio/DB/SeqFeature/Store/memory.pm,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -d -r1.1 -r1.2
*** memory.pm	19 Jun 2006 04:21:37 -0000	1.1
--- memory.pm	20 Jun 2006 21:26:32 -0000	1.2
***************
*** 272,276 ****
      my @search_terms = ref($attributes->{$att_name}) && ref($attributes->{$att_name}) eq 'ARRAY'
                             ? @{$attributes->{$att_name}} : $attributes->{$att_name};
-     my $glob_match;
      my @regexp_terms;
      my @terms;
--- 272,275 ----

Index: GFF3Loader.pm
===================================================================
RCS file: /home/repository/bioperl/bioperl-live/Bio/DB/SeqFeature/Store/GFF3Loader.pm,v
retrieving revision 1.11
retrieving revision 1.12
diff -C2 -d -r1.11 -r1.12
*** GFF3Loader.pm	19 Jun 2006 04:21:37 -0000	1.11
--- GFF3Loader.pm	20 Jun 2006 21:26:32 -0000	1.12
***************
*** 620,624 ****
  
    else {
!     $self->tmp_store->store($f)
    }
  	
--- 620,624 ----
  
    else {
!     $self->tmp_store->store_noindex($f)
    }
  	

--- NEW FILE: berkeleydb.pm ---
package Bio::DB::SeqFeature::Store::berkeleydb;

# $Id: berkeleydb.pm,v 1.1 2006/06/20 21:26:32 lstein Exp $
use strict;
use base 'Bio::DB::SeqFeature::Store';
use Bio::DB::GFF::Util::Rearrange 'rearrange';
use DB_File;
use Fcntl qw(O_RDWR O_CREAT);
use File::Temp 'tempdir';
use File::Path 'rmtree','mkpath';
use constant BINSIZE => 10_000;

# this will eventually be renamed bdb.pm, but right now I don't want to break what's already written.

###
# object initialization
#
sub init {
  my $self          = shift;
  my ($directory,
      $autoindex,
      $is_temporary,
      $write,
      $create,
     ) = rearrange([['DSN','DB'],
		   [qw(DIR AUTOINDEX)],
		   ['TMP','TEMP','TEMPORARY'],
		   [qw(WRITE WRITABLE)],
		   'CREATE',
		  ], at _);
  $directory ||= $autoindex;
  $directory ||= $is_temporary ? File::Spec->tmpdir : '.';
  $directory = tempdir(__PACKAGE__.'_XXXXXX',TMPDIR=>1,CLEANUP=>1,DIR=>$directory) if $is_temporary;
  -d $directory or $self->throw("Invalid directory $directory");

  $create++ if $is_temporary;
  $write ||= $create;
  $write && -w $directory or $self->throw("Can't write into the directory $directory");


  $self->default_settings;
  $self->directory($directory);
  $self->temporary($is_temporary);

  $self->_autoindex($autoindex) if $autoindex;
  $self->_open_databases($write,$create);
  return $self;
}

sub can_store_parentage { 1 }

sub _autoindex {
  my $self = shift;
  my $autoindex_dir = shift;
  # do something here
}

sub _open_databases {
  my $self = shift;
  my ($write,$create) = @_;

  my $directory  = $self->directory;
  unless (-d $directory) {  # directory does not exist
    $create or $self->throw("Directory $directory does not exist and you did not specify the -create flag");
    mkpath($directory) or $self->throw("Couldn't create database directory $directory: $!");
  }

  my $flags = O_RDONLY;
  $flags   |= O_RDWR  if $write;
  $flags   |= O_CREAT if $create;

  # Create the main database; this is a DB_HASH implementation
  my %h;
  tie (%h,'DB_File',$self->_features_path,$flags,0666,$DB_HASH) 
    or $self->throw("Couldn't tie: ".$self->_features_path . $!);
  if ($create) {
    %h = ();
    $h{'.next_id'} = 1;
  }
  $self->db(\%h);


  # Create the index databases; these are DB_BTREE implementations with duplicates allowed.
  local $DB_BTREE->{flags} = R_DUP;
  $DB_BTREE->{compare}     = sub { lc($_[0]) cmp lc($_[1]) };
  for my $idx ($self->_index_files) {
    my $path = $self->_qualify($idx);
    my %db;
    tie(%db,'DB_File',$path,$flags,0666,$DB_BTREE)
      or $self->throw("Couldn't tie $path: $!");
    $self->index_db($idx=>\%db);
  }

  # Create the parentage database
  my %p;
  tie (%p,'DB_File',$self->_parentage_path,$flags,0666,$DB_BTREE)
    or $self->throw("Couldn't tie: ".$self->_parentage_path . $!);
  if ($create) {
    %p = ();
  }
  $self->parentage_db(\%p);

  if (-e $self->_fasta_path) {
    my $dna_db = Bio::DB::Fasta->new($self->_fasta_path) or $self->throw("Can't reindex sequence file: $@");
    $self->dna_db($dna_db);
  }

  my $mode =  $write  ? "+>>"
            : $create ? "+>"
            : "<";

  open (F,$mode,$self->_notes_path) or $self->throw($self->_notes_path.": $!");
  $self->notes_db(\*F);
}

sub _close_databases {
  my $self = shift;
  $self->db(undef);
  $self->dna_db(undef);
  $self->index_db($_=>undef) foreach $self->_index_files;
}


sub _store {
  my $self    = shift;
  my $indexed = shift;
  my $db   = $self->db;
  my $count = 0;
  for my $obj (@_) {
    my $primary_id = $obj->primary_id;
    $self->_delete_indexes($primary_id) if $indexed && $primary_id;
    $primary_id    = $db->{'.next_id'}++ unless defined $primary_id;
    $db->{$primary_id} = $self->freeze($obj);
    $obj->primary_id($primary_id);
    $self->_update_indexes($obj)        if $indexed;
    $count++;
  }
  $count;
}

sub _fetch {
  my $self = shift;
  my $id   = shift;
  my $db = $self->db;
  my $obj = $self->thaw($db->{$id},$id);
  $obj;
}

sub _add_SeqFeature {
  my $self = shift;
  my $parent   = shift;
  my @children = @_;
  my $parent_id = (ref $parent ? $parent->primary_id : $parent)
    or $self->throw("$parent should have a primary_id");
  my $p = $self->parentage_db;
  for my $child (@children) {
    my $child_id = ref $child ? $child->primary_id : $child;
    defined $child_id or die "no primary ID known for $child";
    $p->{$parent_id}={$child_id};
  }
}

sub _fetch_SeqFeatures {
  my $self   = shift;
  my $parent = shift;
  my @types  = @_;
  my $parent_id = $parent->primary_id or $self->throw("$parent should have a primary_id");
  my $index     = $self->parentage_db;
  my $db        = tied %$index;

  my @children_ids  = $db->get_dup($parent_id);
  my @children      = map {$self->fetch($_)} @children_ids;

  if (@types) {
    my $regexp = join '|',map {quotemeta($_)} $self->find_types(@types);
    return grep {($_->primary_tag.':'.$_->source_tag) =~ /^$regexp$/i} @children;
  } else {
    return @children;
  }
}

sub _update_indexes {
  my $self = shift;
  my $obj  = shift;
  defined (my $id   = $obj->primary_id) or return;
  $self->_update_name_index($obj,$id);
  $self->_update_type_index($obj,$id);
  $self->_update_location_index($obj,$id);
  $self->_update_attribute_index($obj,$id);
  $self->_update_note_index($obj,$id);
}

sub _update_name_index {
  my $self = shift;
  my ($obj,$id,$delete) = @_;
  my $db = $self->index_db('names') or $self->throw("Couldn't find 'names' index file");
  my ($names,$aliases) = $self->feature_names($obj);

  # little stinky - needs minor refactoring
  foreach (@$names) {
    my $key = lc $_;
    $self->update_or_delete($delete,$db,$key,$id);
  }

  foreach (@$aliases) {
    my $key = lc($_)."_2"; # the _2 indicates a secondary (alias) ID
    $self->update_or_delete($delete,$db,$key,$id);
  }

}

sub _update_type_index {
  my $self = shift;
  my ($obj,$id,$delete) = @_;
  my $db = $self->index_db('types')
    or $self->throw("Couldn't find 'types' index file");
  my $primary_tag = $obj->primary_tag;
  my $source_tag  = $obj->source_tag || '';
  return unless defined $primary_tag;

  $primary_tag    .= ":$source_tag";
  my $key          = lc $primary_tag;
  $self->update_or_delete($delete,$db,$key,$id);
}

# Note: this indexing scheme is space-inefficient because it stores the
# denormalized sequence ID followed by the bin in XXXXXX zero-leading
# format. It should be replaced with a binary numeric encoding and the
# BTREE {compare} attribute changed accordingly.
sub _update_location_index {
  my $self = shift;
  my ($obj,$id,$delete) = @_;
  my $db = $self->index_db('locations')
    or $self->throw("Couldn't find 'locations' index file");

  my $seq_id      = $obj->seq_id || '';
  my $start       = $obj->start  || '';
  my $end         = $obj->end    || '';
  my $strand      = $obj->strand;
  my $bin_min     = sprintf("%06d",int $start/BINSIZE);
  my $bin_max     = sprintf("%06d",int $end/BINSIZE);

  for (my $bin = $bin_min; $bin <= $bin_max; $bin++ ) {
    my $key = "\L$seq_id\E$bin";
    $self->update_or_delete($delete,$db,$key,pack("i4",$id,$start,$end,$strand));
  }
}

sub _update_attribute_index {
  my $self      = shift;
  my ($obj,$id,$delete) = @_;
  my $db = $self->index_db('attributes')
    or $self->throw("Couldn't find 'attributes' index file");

  for my $tag ($obj->all_tags) {
    for my $value ($obj->each_tag_value($tag)) {
      my $key = "\L${tag}:${value}\E";
      $self->update_or_delete($delete,$db,$key,$id);
    }
  }
}

sub _update_note_index {
  my $self = shift;
  my ($obj,$id,$delete) = @_;
  return if $delete; # we don't know how to do this

  my $fh = $self->notes_db;
  my @notes = $obj->get_tag_values('Note') if $obj->has_tag('Note');


  print $fh $_,"\t",pack("u*",$id) or $self->throw("An error occurred while updating note index: $!")
    foreach @notes;
}

sub update_or_delete {
  my $self = shift;
  my ($delete,$db,$key,$id) = @_;
  if ($delete) {
    tied(%$db)->del_dup($key,$id);
  } else {
    $db->{$key} = $id;
  }
}

# these methods return pointers to....
# the database that stores serialized objects
sub db {
  my $self = shift;
  my $d = $self->setting('db');
  $self->setting(db=>shift) if @_;
  $d;
}

sub parentage_db {
  my $self = shift;
  my $d = $self->setting('parentage_db');
  $self->setting(parentage_db=>shift) if @_;
  $d;
}

# the Bio::DB::Fasta object
sub dna_db {
  my $self = shift;
  my $d = $self->setting('dna_db');
  $self->setting(dna_db=>shift) if @_;
  $d;
}

# the specialized notes database
sub notes_db {
  my $self = shift;
  my $d = $self->setting('notes_db');
  $self->setting(notes_db=>shift) if @_;
  $d;
}

# The indicated index berkeley db
sub index_db {
  my $self = shift;
  my $index_name = shift;
  my $d = $self->setting($index_name);
  $self->setting($index_name=>shift) if @_;
  $d;
}


# return names of all the indexes
sub _index_files {
  return qw(names types locations attributes);
}

# the directory in which we store our indexes
sub directory {
  my $self = shift;
  my $d = $self->setting('directory');
  $self->setting(directory=>shift) if @_;
  $d;
}

# flag indicating that we are a temporary database
sub temporary {
  my $self = shift;
  my $d = $self->setting('temporary');
  $self->setting(temporary=>shift) if @_;
  $d;
}

# file name utilities...
sub _qualify {
  my $self = shift;
  my $file = shift;
  return $self->directory .'/' . $file;
}

sub _features_path {
  shift->_qualify('features.bdb');
}

sub _parentage_path {
  shift->_qualify('parentage.bdb');
}

sub _type_path {
  shift->_qualify('types.idx');
}

sub _location_path {
  shift->_qualify('locations.idx');
}

sub _attribute_path {
  shift->_qualify('attributes.idx');
}

sub _notes_path {
  shift->_qualify('notes.idx');
}

sub _fasta_path {
  shift->_qualify('sequence.fa');
}

###########################################
# searching
###########################################

sub _features {
  my $self = shift;
  my ($seq_id,$start,$end,$strand,
      $name,$class,$allow_aliases,
      $types,
      $attributes,
      $range_type,
      $iterator
     ) = rearrange([['SEQID','SEQ_ID','REF'],'START',['STOP','END'],'STRAND',
		    'NAME','CLASS','ALIASES',
		    ['TYPES','TYPE','PRIMARY_TAG'],
		    ['ATTRIBUTES','ATTRIBUTE'],
		    'RANGE_TYPE',
		    'ITERATOR',
		   ], at _);

  my (@from, at where, at args, at group);
  $range_type ||= 'overlaps';

  my @result;
  unless (defined $name or defined $seq_id or defined $types or defined $attributes) {
    @result = keys %{$self->{_index}{ids}};
  }

  my %found = ();

  if (defined($name)) {
    # hacky backward compatibility workaround
    $name     = "$class:$name" if defined $class && length $class > 0;
    $self->filter_by_name($name,$allow_aliases,\%found);
  }

  if (defined $seq_id) {
    $self->filter_by_location($seq_id,$start,$end,$strand,$range_type,\%found);
  }

  if (defined $types) {
    $self->filter_by_type($types,\%found);
  }

  if (defined $attributes) {
    $self->filter_by_attribute($attributes,\%found);
  }

  push @result,keys %found;
  return $iterator ? Bio::DB::SeqFeature::Store::berkeleydb::Iterator->new($self,\@result)
                   : map {$self->fetch($_)} @result;
}

sub filter_by_name {
  my $self = shift;
  my ($name,$allow_aliases,$filter) = @_;

  my $index = $self->index_db('names');
  my $db    = tied(%$index);

  my ($stem,$regexp) = $self->glob_match($name);
  $stem   ||= $name;
  $regexp ||= $name;
  $regexp ||= "(?:_2)?" if $allow_aliases;

  my $key   = $stem;
  my $value;
  my @results;
  for (my $status = $db->seq($key,$value,R_CURSOR);
       $status == 0 && $key =~ /^$regexp$/io;
       $status = $db->seq($key,$value,R_NEXT)) {
    push @results,$value;
  }
  $self->update_filter($filter,\@results);
}

sub filter_by_type {
  my $self = shift;
  my ($types,$filter) = @_;
  my @types = ref $types eq 'ARRAY' ?  @$types : $types;

  my $index = $self->index_db('types');
  my $db    = tied(%$index);

  my @results;

  for my $type (@types) {
    my ($primary_tag,$source_tag);
    if (ref $type && $type->isa('Bio::DB::GFF::Typename')) {
      $primary_tag = $type->method;
      $source_tag  = $type->source;
    } else {
      ($primary_tag,$source_tag) = split ':',$type,2;
    }
    my $match = defined $source_tag ? "^$primary_tag:$source_tag\$" : "^$primary_tag:";
    $source_tag ||= '';
    my $key   = lc "$primary_tag:$source_tag";
    my $value;

    for (my $status = $db->seq($key,$value,R_CURSOR);
 	 $status == 0 && $key =~ /$match/i;
	 $status = $db->seq($key,$value,R_NEXT)) {
      push @results,$value;
    }

  }
  $self->update_filter($filter,\@results);
}

sub filter_by_location {
  my $self = shift;
  my ($seq_id,$start,$end,$strand,$range_type,$filter) = @_;
  $strand ||= 0;

  my $index    = $self->index_db('locations');
  my $db       = tied(%$index);

  my $binstart = defined $start ? sprintf("%06d",int $start/BINSIZE) : '';
  my $binend   = defined $end   ? sprintf("%06d",int $end/BINSIZE)   : 'z';  # beyond a number

  my %seenit;
  my @results;

  if ($range_type eq 'overlaps' or $range_type eq 'contains') {
    my $key     = "\L$seq_id\E$binstart";
    my $keystop = "\L$seq_id\E$binend";
    my $value;
    for (my $status = $db->seq($key,$value,R_CURSOR);
	 $status == 0 && $key le $keystop;
	 $status = $db->seq($key,$value,R_NEXT)) {
      my ($id,$fstart,$fend,$fstrand) = unpack("i4",$value);
      next if $seenit{$id}++;
      next if $strand && $fstrand != $strand;
      if ($range_type eq 'overlaps') {
	next unless $fend >= $start && $fstart <= $end;
      }
      elsif ($range_type eq 'contains') {
	next unless $fstart >= $start && $fend <= $end;
      }
      push @results,$id;
    }
  }

  # for contained in, we look for features originating and terminating outside the specified range
  # this is incredibly inefficient, but fortunately the query is rare (?)
  elsif ($range_type eq 'contained_in') {
    my $key     = "\L$seq_id";
    my $keystop = "\L$seq_id\E$binstart";
    my $value;

    # do the left part of the range
    for (my $status = $db->seq($key,$value,R_CURSOR);
	 $status == 0 && $key le $keystop;
	 $status = $db->seq($key,$value,R_NEXT)) {
      my ($id,$fstart,$fend,$fstrand) = unpack("i4",$value);
      next if $seenit{$id}++;
      next if $strand && $fstrand != $strand;
      next unless $fstart <= $start && $fend >= $end;
      push @results,$id;
    }

    # do the right part of the range
    $key = "\L$seq_id\E$binend";
    for (my $status = $db->seq($key,$value,R_CURSOR);
	 $status == 0;
	 $status = $db->seq($key,$value,R_NEXT)) {
      my ($id,$fstart,$fend,$fstrand) = unpack("i4",$value);
      next if $seenit{$id}++;
      next if $strand && $fstrand != $strand;
      next unless $fstart <= $start && $fend >= $end;
      push @results,$id;
    }

  }

  $self->update_filter($filter,\@results);
}

sub filter_by_attribute {
  my $self = shift;
  my ($attributes,$filter) = @_;

  my $index = $self->index_db('attributes');
  my $db    = tied(%$index);

   for my $att_name (keys %$attributes) {
     my @result;
     my @search_terms = ref($attributes->{$att_name}) && ref($attributes->{$att_name}) eq 'ARRAY'
                           ? @{$attributes->{$att_name}} : $attributes->{$att_name};

     for my $v (@search_terms) {
       my ($stem,$regexp) = $self->glob_match($v);
       $stem   ||= $v;
       $regexp ||= $v;
       my $key = "\L${att_name}:${stem}\E";
       my $value;
       for (my $status = $db->seq($key,$value,R_CURSOR);
	    $status == 0 && $key =~ /^$att_name:$regexp$/i;
	    $status = $db->seq($key,$value,R_NEXT)) {
	 push @result,$value;
       }
     }
     $self->update_filter($filter,\@result);
   }
}


sub search_notes {
  my $self = shift;
  my ($search_string,$limit) = @_;

  $search_string =~ tr/*?//d;

  my @results;

  my @words = map {quotemeta($_)} $search_string =~ /(\w+)/g;
  my $search = join '|', at words;

  my (%found,$found);
  my $note_index = $self->notes_db;
  seek($note_index,0,0);  # back to start
  while (<$note_index>) {
    next unless /$search/;
    chomp;
    my ($note,$uu) = split "\t";
    $found{unpack("u*",$uu)}++;
    last if $limit && ++$found >= $limit;
  }

  my (@features, @matches);
  for my $idx (keys %found) {
    my $feature    = $self->fetch($idx) or next;
    my @values     = $feature->get_tag_values('Note') if $feature->has_tag('Note');
    my $value      = "@values";

    my $hits;
    $hits++ while $value =~ /($search)/ig;  # count the number of times we were hit
    push @matches,$hits;
    push @features,$feature;
  }

  for (my $i=0; $i<@matches; $i++)  {
    my $feature = $features[$i];
    my $matches = $matches[$i];

    my $relevance = 10 * $matches;
    my $note;
    $note   = join ' ',$feature->get_tag_values('Note') if $feature->has_tag('Note');
    push @results,[$feature->display_name,$note,$relevance];
  }

  return @results;
}

sub glob_match {
  my $self = shift;
  my $term = shift;
  return unless $term =~ /([^*?]*)(?:^|[^\\])?[*?]/;
  my $stem = $1;
  $term =~ s/(^|[^\\])([+\[\]^{}\$|\(\).])/$1\\$2/g;
  $term =~ s/(^|[^\\])\*/$1.*/g;
  $term =~ s/(^|[^\\])\?/$1./g;
  return ($stem,$term);
}


sub update_filter {
  my $self = shift;
  my ($filter,$results) = @_;

  if (%$filter) {
    my @filtered = grep {$filter->{$_}} @$results;
    %$filter     = map {$_=>1} @filtered;
  } else {
    %$filter     = map {$_=>1} @$results;
  }

}


sub DESTROY {
  my $self = shift;
  my $db   = $self->db;
  warn "CLEANING UP";
  untie %$db;
  rmtree($self->directory,0,1) if $self->temporary;
}

package Bio::DB::SeqFeature::Store::berkeleydb::Iterator;

sub new {
  my $class = shift;
  my $store = shift;
  my $ids   = shift;
  return bless {store => $store,
		ids   => $ids},ref($class) || $class;
}

sub next_seq {
  my $self  = shift;
  my $store = $self->{store} or return;
  my $id    = shift @{$self->{ids}};
  defined $id or return;
  return $store->fetch($id);
}

1;



More information about the Bioperl-guts-l mailing list